
Commit b11659c: style update

mengxr committed Mar 27, 2014
1 parent 78c4671
Showing 9 changed files with 57 additions and 35 deletions.
4 changes: 2 additions & 2 deletions mllib/src/main/scala/org/apache/spark/mllib/MLContext.scala
@@ -26,11 +26,11 @@ import org.apache.spark.rdd.RDD
class MLContext(self: SparkContext) {
/**
* Reads labeled data in the LIBSVM format into an RDD[LabeledPoint].
-   * The LIBSVM format is a text-based format used by LIBSVM (http://www.csie.ntu.edu.tw/~cjlin/libsvm/).
+   * The LIBSVM format is a text-based format used by LIBSVM and LIBLINEAR.
* Each line represents a labeled sparse feature vector using the following format:
* {{{label index1:value1 index2:value2 ...}}}
* where the indices are one-based and in ascending order.
-   * This method parses each line into a [[org.apache.spark.mllib.regression.LabeledPoint]] instance,
+   * This method parses each line into a [[org.apache.spark.mllib.regression.LabeledPoint]],
* where the feature indices are converted to zero-based.
*
* @param path file or directory path in any Hadoop-supported file system URI
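For reference, a line in this format looks like `1.0 1:0.5 3:2.4`: label 1.0, with nonzero values at one-based feature indices 1 and 3. A minimal sketch of the per-line conversion the doc comment describes, assuming the feature count is known up front (the helper name and structure are illustrative, not the actual loader):

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

// Illustrative only: parse a single LIBSVM-format line into a LabeledPoint.
def parseLibSVMLine(line: String, numFeatures: Int): LabeledPoint = {
  val items = line.trim.split(' ')
  val label = items.head.toDouble
  val (indices, values) = items.tail.toSeq.map { item =>
    val Array(index, value) = item.split(':')
    // LIBSVM indices are one-based; the resulting vector is zero-based.
    (index.toInt - 1, value.toDouble)
  }.unzip
  LabeledPoint(label, Vectors.sparse(numFeatures, indices.toArray, values.toArray))
}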
@@ -37,17 +37,17 @@ abstract class Gradient extends Serializable {
def compute(data: Vector, label: Double, weights: Vector): (Vector, Double)

/**
-   * Compute the gradient and loss given the features of a single data point, add the gradient to a provided vector to
-   * avoid creating new objects, and return loss.
+   * Compute the gradient and loss given the features of a single data point,
+   * add the gradient to a provided vector to avoid creating new objects, and return loss.
*
* @param data features for one data point
* @param label label for this data point
* @param weights weights/coefficients corresponding to features
-   * @param gradientAddTo gradient will be added to this vector
+   * @param cumGradient the computed gradient will be added to this vector
*
-   * @return (gradient: Vector, loss: Double)
+   * @return loss
*/
-  def compute(data: Vector, label: Double, weights: Vector, gradientAddTo: Vector): Double
+  def compute(data: Vector, label: Double, weights: Vector, cumGradient: Vector): Double
}
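The `cumGradient` overload is the in-place variant: an optimizer can allocate a single accumulator and have every example's gradient added into it, instead of materializing a fresh gradient vector per data point. A minimal sketch of such a caller, where `gradient`, `weights`, `numFeatures`, and the local `batch` of (features, label) pairs are all hypothetical stand-ins (the real optimizer aggregates over an RDD):

// Illustrative only: accumulate per-example gradients into one shared vector.
val cumGradient = Vectors.dense(new Array[Double](numFeatures))
var totalLoss = 0.0
batch.foreach { case (features, label) =>
  // compute() adds this example's gradient into cumGradient and returns its loss.
  totalLoss += gradient.compute(features, label, weights, cumGradient)
}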

/**
@@ -71,13 +71,17 @@ class LogisticGradient extends Gradient {
(Vectors.fromBreeze(gradient), loss)
}

-  override def compute(data: Vector, label: Double, weights: Vector, gradientAddTo: Vector): Double = {
+  override def compute(
+      data: Vector,
+      label: Double,
+      weights: Vector,
+      cumGradient: Vector): Double = {
val brzData = data.toBreeze
val brzWeights = weights.toBreeze
val margin: Double = -1.0 * brzWeights.dot(brzData)
val gradientMultiplier = (1.0 / (1.0 + math.exp(margin))) - label

-    brzAxpy(gradientMultiplier, brzData, gradientAddTo.toBreeze)
+    brzAxpy(gradientMultiplier, brzData, cumGradient.toBreeze)

if (label > 0) {
math.log(1 + math.exp(margin))
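For context, with margin $m = -w^\top x$ and label $y \in \{0, 1\}$, the `gradientMultiplier` above is $\frac{1}{1 + e^{m}} - y$, the derivative of the logistic loss with respect to $w$ up to the factor $x$ applied by `brzAxpy`. The returned loss (the $y = 0$ branch is collapsed in this view, but its standard form follows) is:

$$L(w; x, y) = \begin{cases} \log(1 + e^{m}) & \text{if } y = 1 \\ \log(1 + e^{m}) - m & \text{if } y = 0 \end{cases}$$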
@@ -104,12 +108,16 @@ class LeastSquaresGradient extends Gradient {
(Vectors.fromBreeze(gradient), loss)
}

-  override def compute(data: Vector, label: Double, weights: Vector, gradientAddTo: Vector): Double = {
+  override def compute(
+      data: Vector,
+      label: Double,
+      weights: Vector,
+      cumGradient: Vector): Double = {
val brzData = data.toBreeze
val brzWeights = weights.toBreeze
val diff = brzWeights.dot(brzData) - label

-    brzAxpy(2.0 * diff, brzData, gradientAddTo.toBreeze)
+    brzAxpy(2.0 * diff, brzData, cumGradient.toBreeze)

diff * diff
}
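The least-squares case is plain squared error: with $d = w^\top x - y$, the loss is $d^2$ and its gradient $\nabla_w\, d^2 = 2\,d\,x$ is exactly the `2.0 * diff` scale applied to `brzData` by `brzAxpy` above.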
@@ -137,7 +145,11 @@ class HingeGradient extends Gradient {
}
}

-  override def compute(data: Vector, label: Double, weights: Vector, gradientAddTo: Vector): Double = {
+  override def compute(
+      data: Vector,
+      label: Double,
+      weights: Vector,
+      cumGradient: Vector): Double = {
val brzData = data.toBreeze
val brzWeights = weights.toBreeze
val dotProduct = brzWeights.dot(brzData)
@@ -147,7 +159,7 @@ class HingeGradient extends Gradient {
val labelScaled = 2 * label - 1.0

if (1.0 > labelScaled * dotProduct) {
-    brzAxpy(-labelScaled, brzData, gradientAddTo.toBreeze)
+    brzAxpy(-labelScaled, brzData, cumGradient.toBreeze)
1.0 - labelScaled * dotProduct
} else {
0.0
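Here the $\{0, 1\}$ labels are rescaled to $\{-1, +1\}$ by `2 * label - 1.0`, and the branch implements the hinge loss used by linear SVMs: $L(w; x, y') = \max(0,\; 1 - y'\, w^\top x)$, with subgradient $-y'\, x$ when $y'\, w^\top x < 1$ and zero otherwise (the closing lines of the method are collapsed in this view).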
@@ -17,13 +17,13 @@

package org.apache.spark.mllib.regression

+import breeze.linalg.{Vector => BV}

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.optimization._
import org.apache.spark.mllib.util.MLUtils
-import org.apache.spark.mllib.linalg.{Vector, Vectors}

-import breeze.linalg.{Vector => BV, DenseVector => BDV}

/**
* Regression model trained using Lasso.
@@ -142,7 +142,8 @@ object LassoWithSGD {
regParam: Double,
miniBatchFraction: Double,
initialWeights: Vector): LassoModel = {
-    new LassoWithSGD(stepSize, numIterations, regParam, miniBatchFraction).run(input, initialWeights)
+    new LassoWithSGD(stepSize, numIterations, regParam, miniBatchFraction)
+      .run(input, initialWeights)
}

/**
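As a usage sketch, this overload would be invoked as below. The leading parameters are collapsed in this view; the order shown (input, numIterations, stepSize, then the three visible parameters) is an assumption based on the surrounding code, and `input` and `numFeatures` are hypothetical:

// Illustrative call; input is an RDD[LabeledPoint] built elsewhere.
val model: LassoModel = LassoWithSGD.train(
  input,
  100,   // numIterations
  1.0,   // stepSize
  0.1,   // regParam
  1.0,   // miniBatchFraction
  Vectors.dense(new Array[Double](numFeatures)))  // initialWeights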
@@ -19,9 +19,9 @@ package org.apache.spark.mllib.regression

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.optimization._
import org.apache.spark.mllib.util.MLUtils
-import org.apache.spark.mllib.linalg.{Vector, Vectors}

/**
* Regression model trained using LinearRegression.
@@ -17,14 +17,14 @@

package org.apache.spark.mllib.regression

+import breeze.linalg.{Vector => BV}

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.optimization._
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.linalg.{Vectors, Vector}

-import breeze.linalg.{Vector => BV, DenseVector => BDV}

/**
* Regression model trained using RidgeRegression.
*
@@ -72,9 +72,9 @@ class RidgeRegressionWithSGD private (
// We don't want to penalize the intercept in RidgeRegression, so set this to false.
super.setIntercept(false)

-  var yMean = 0.0
-  var xColMean: BV[Double] = _
-  var xColSd: BV[Double] = _
+  private var yMean = 0.0
+  private var xColMean: BV[Double] = _
+  private var xColSd: BV[Double] = _

/**
* Construct a RidgeRegression object with default parameters
@@ -214,8 +214,8 @@ object RidgeRegressionWithSGD {

def main(args: Array[String]) {
if (args.length != 5) {
println("Usage: RidgeRegression <master> <input_dir> <step_size> <regularization_parameter>" +
" <niters>")
println("Usage: RidgeRegression <master> <input_dir> <step_size> " +
"<regularization_parameter> <niters>")
System.exit(1)
}
val sc = new SparkContext(args(0), "RidgeRegression")
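For example, a hypothetical invocation matching this usage string would pass five arguments, e.g. `local[2] /data/ridge 1.0 0.1 100`: master, input directory, step size, regularization parameter, and number of iterations.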
@@ -82,7 +82,10 @@ object MLUtils {
* xColMean - Row vector with mean for every column (or feature) of the input data
* xColSd - Row vector standard deviation for every column (or feature) of the input data.
*/
-  def computeStats(data: RDD[LabeledPoint], numFeatures: Int, numExamples: Long): (Double, Vector, Vector) = {
+  def computeStats(
+      data: RDD[LabeledPoint],
+      numFeatures: Int,
+      numExamples: Long): (Double, Vector, Vector) = {
val brzData = data.map { case LabeledPoint(label, features) =>
(label, features.toBreeze)
}
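The method body is collapsed above, but the doc comment pins down the contract: return the label mean plus the per-column mean and standard deviation of the features. A hedged, purely local sketch of that arithmetic (the actual method computes these over the RDD with breeze vectors, and its variance normalization may differ):

// Illustrative only: (yMean, xColMean, xColSd) over an in-memory dataset.
def localStats(
    points: Seq[(Double, Array[Double])],
    numFeatures: Int): (Double, Array[Double], Array[Double]) = {
  val n = points.size.toDouble
  val yMean = points.map(_._1).sum / n
  val colMean = Array.tabulate(numFeatures) { j => points.map(_._2(j)).sum / n }
  val colSd = Array.tabulate(numFeatures) { j =>
    val meanOfSquares = points.map(p => p._2(j) * p._2(j)).sum / n
    math.sqrt(meanOfSquares - colMean(j) * colMean(j))  // population std dev
  }
  (yMean, colMean, colSd)
}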
11 changes: 7 additions & 4 deletions mllib/src/test/scala/org/apache/spark/mllib/MLContextSuite.scala
@@ -17,13 +17,16 @@

package org.apache.spark.mllib

-import org.apache.spark.mllib.MLContext._
-import org.apache.spark.mllib.util.LocalSparkContext
-import org.scalatest.FunSuite
-import com.google.common.io.Files
import java.io.File

+import org.scalatest.FunSuite

+import com.google.common.base.Charsets
+import com.google.common.io.Files

+import org.apache.spark.mllib.MLContext._
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.LocalSparkContext

class MLContextSuite extends FunSuite with LocalSparkContext {
test("libSVMFile") {
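The test body is collapsed above. Under the new imports, a round-trip test for `libSVMFile` might look roughly like the sketch below; the file contents, the assertions, and the assumption that the method needs only a path are all illustrative, not the actual test:

test("libSVMFile (illustrative sketch)") {
  // Two points in LIBSVM format; feature indices are one-based in the file.
  val lines = "1 1:1.0 3:2.0\n0 2:3.0\n"
  val tempDir = Files.createTempDir()
  val file = new File(tempDir, "part-00000")
  Files.write(lines, file, Charsets.US_ASCII)

  // Assumes the implicit conversion from MLContext._ adds libSVMFile to sc.
  val points = sc.libSVMFile(tempDir.toURI.toString).collect()

  assert(points.length === 2)
  assert(points(0).label === 1.0)
  // One-based file indices become zero-based vector indices: 1 -> 0, 3 -> 2.
  assert(points(0).features.toArray.toSeq === Seq(1.0, 0.0, 2.0))
}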
@@ -21,9 +21,9 @@ import scala.util.Random

import org.scalatest.FunSuite

+import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.LocalSparkContext
-import org.apache.spark.mllib.linalg.Vectors

object NaiveBayesSuite {

@@ -19,8 +19,8 @@ package org.apache.spark.mllib.regression

import org.scalatest.FunSuite

-import org.apache.spark.mllib.util.{LinearDataGenerator, LocalSparkContext}
import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.{LinearDataGenerator, LocalSparkContext}

class LinearRegressionSuite extends FunSuite with LocalSparkContext {

@@ -88,7 +88,8 @@ class LinearRegressionSuite extends FunSuite with LocalSparkContext {

// Test if we can correctly learn Y = 10*X1 + 10*X10000
test("sparse linear regression without intercept") {
-    val denseRDD = sc.parallelize(LinearDataGenerator.generateLinearInput(0.0, Array(10.0, 10.0), 100, 42), 2)
+    val denseRDD = sc.parallelize(
+      LinearDataGenerator.generateLinearInput(0.0, Array(10.0, 10.0), 100, 42), 2)
val sparseRDD = denseRDD.map { case LabeledPoint(label, v) =>
val sv = Vectors.sparse(10000, Seq((0, v(0)), (9999, v(1))))
LabeledPoint(label, sv)
@@ -113,9 +114,11 @@ class LinearRegressionSuite extends FunSuite with LocalSparkContext {
val sparseValidationRDD = sc.parallelize(sparseValidationData, 2)

// Test prediction on RDD.
-    validatePrediction(model.predict(sparseValidationRDD.map(_.features)).collect(), sparseValidationData)
+    validatePrediction(
+      model.predict(sparseValidationRDD.map(_.features)).collect(), sparseValidationData)

// Test prediction on Array.
-    validatePrediction(sparseValidationData.map(row => model.predict(row.features)), sparseValidationData)
+    validatePrediction(
+      sparseValidationData.map(row => model.predict(row.features)), sparseValidationData)
}
}
