[SPARK-21681][ML] fix bug of MLOR do not work correctly when featureStd contains zero #18896
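When a feature column is constant, its standard deviation is zero. `LogisticAggregator` computes margins by dividing each feature value by `localFeaturesStd(index)`, so a zero standard deviation turns that term into Infinity (or NaN), which corrupts every margin of the multinomial model. The diff below guards the margin loop so that features with a zero standard deviation (or a zero value) are skipped, and adds regression tests: `LogisticRegressionSuite` gains a multinomial dataset whose second feature has zero variance and checks the fitted coefficients against glmnet, while `LogisticAggregatorSuite` compares aggregators built on data with a constant feature against aggregators built on the same data with that feature removed, so a constant feature contributes nothing to the gradient. A standalone numeric sketch of the failure mode follows the first hunk below.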

@@ -270,11 +270,13 @@ private[ml] class LogisticAggregator(
 
     val margins = new Array[Double](numClasses)
     features.foreachActive { (index, value) =>
-      val stdValue = value / localFeaturesStd(index)
-      var j = 0
-      while (j < numClasses) {
-        margins(j) += localCoefficients(index * numClasses + j) * stdValue
-        j += 1
+      if (localFeaturesStd(index) != 0.0 && value != 0.0) {
+        val stdValue = value / localFeaturesStd(index)
+        var j = 0
+        while (j < numClasses) {
+          margins(j) += localCoefficients(index * numClasses + j) * stdValue
+          j += 1
+        }
       }
     }
     var i = 0
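To make the failure mode concrete, here is a minimal, self-contained sketch (plain Scala, not part of the patch; the coefficient and feature values are made up for illustration) of what the unguarded division does and what the guard above changes:

```scala
// Standalone illustration: why dividing by a zero feature standard deviation
// corrupts the margin, and how the guard skips the constant feature instead.
object ZeroStdMarginSketch extends App {
  val localFeaturesStd = Array(1.2, 0.0) // second feature is constant -> std 0.0
  val coefficient = 0.5                  // illustrative coefficient for that feature
  val value = 3.0                        // a non-zero raw feature value

  // Unguarded (old) computation: 3.0 / 0.0 == Double.PositiveInfinity, and once an
  // infinite term is added, the whole margin becomes Infinity (or NaN).
  val unguardedMargin = coefficient * (value / localFeaturesStd(1))
  println(unguardedMargin) // Infinity

  // Guarded (patched) computation: skip the term when the std or the value is 0.0,
  // so the constant feature simply contributes nothing to the margin.
  var margin = 0.0
  if (localFeaturesStd(1) != 0.0 && value != 0.0) {
    margin += coefficient * (value / localFeaturesStd(1))
  }
  println(margin) // 0.0
}
```

Skipping the term matches the expectation encoded in the tests below: a constant feature should not affect margins or gradients.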
@@ -46,6 +46,7 @@ class LogisticRegressionSuite
   @transient var smallMultinomialDataset: Dataset[_] = _
   @transient var binaryDataset: Dataset[_] = _
   @transient var multinomialDataset: Dataset[_] = _
+  @transient var multinomialDatasetWithZeroVar: Dataset[_] = _
   private val eps: Double = 1e-5
 
   override def beforeAll(): Unit = {
@@ -99,6 +100,23 @@ class LogisticRegressionSuite
       df.cache()
       df
     }
+
+    multinomialDatasetWithZeroVar = {
+      val nPoints = 100
+      val coefficients = Array(
+        -0.57997, 0.912083, -0.371077,
+        -0.16624, -0.84355, -0.048509)
+
+      val xMean = Array(5.843, 3.0)
+      val xVariance = Array(0.6856, 0.0)
+
+      val testData = generateMultinomialLogisticInput(
+        coefficients, xMean, xVariance, addIntercept = true, nPoints, seed)
+
+      val df = sc.parallelize(testData, 4).toDF().withColumn("weight", lit(1.0))
+      df.cache()
+      df
+    }
   }
 
   /**
@@ -112,6 +130,11 @@ class LogisticRegressionSuite
     multinomialDataset.rdd.map { case Row(label: Double, features: Vector, weight: Double) =>
       label + "," + weight + "," + features.toArray.mkString(",")
     }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/multinomialDataset")
+    multinomialDatasetWithZeroVar.rdd.map {
+      case Row(label: Double, features: Vector, weight: Double) =>
+        label + "," + weight + "," + features.toArray.mkString(",")
+    }.repartition(1)
+      .saveAsTextFile("target/tmp/LogisticRegressionSuite/multinomialDatasetWithZeroVar")
   }
 
   test("params") {
@@ -1392,6 +1415,61 @@ class LogisticRegressionSuite
     assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
   }
 
+  test("multinomial logistic regression with zero variance (SPARK-21681)") {
+    val sqlContext = multinomialDatasetWithZeroVar.sqlContext
+    import sqlContext.implicits._
+    val mlr = new LogisticRegression().setFamily("multinomial").setFitIntercept(true)
+      .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true).setWeightCol("weight")
+
+    val model = mlr.fit(multinomialDatasetWithZeroVar)
+
+    /*
+      Use the following R code to load the data and train the model using glmnet package.
+
+      library("glmnet")
+      data <- read.csv("path", header=FALSE)
+      label = as.factor(data$V1)
+      w = data$V2
+      features = as.matrix(data.frame(data$V3, data$V4))
+      coefficients = coef(glmnet(features, label, weights=w, family="multinomial",
+        alpha = 0, lambda = 0))
+      coefficients
+      $`0`
+      3 x 1 sparse Matrix of class "dgCMatrix"
+                       s0
+               0.2658824
+      data.V3  0.1881871
+      data.V4  .
+
+      $`1`
+      3 x 1 sparse Matrix of class "dgCMatrix"
+                        s0
+               0.53604701
+      data.V3 -0.02412645
+      data.V4  .
+
+      $`2`
+      3 x 1 sparse Matrix of class "dgCMatrix"
+                       s0
+              -0.8019294
+      data.V3 -0.1640607
+      data.V4  .
+     */
+
+    val coefficientsR = new DenseMatrix(3, 2, Array(
+      0.1881871, 0.0,
+      -0.02412645, 0.0,
+      -0.1640607, 0.0), isTransposed = true)
+    val interceptsR = Vectors.dense(0.2658824, 0.53604701, -0.8019294)
+
+    model.coefficientMatrix.colIter.foreach(v => assert(v.toArray.sum ~== 0.0 absTol eps))
+
+    assert(model.coefficientMatrix ~== coefficientsR relTol 0.05)
+    assert(model.coefficientMatrix.toArray.sum ~== 0.0 absTol eps)
+    assert(model.interceptVector ~== interceptsR relTol 0.05)
+    assert(model.interceptVector.toArray.sum ~== 0.0 absTol eps)
+  }
+
   test("multinomial logistic regression with intercept without regularization with bound") {
     // Bound constrained optimization with bound on one side.
     val lowerBoundsOnCoefficients = Matrices.dense(3, 4, Array.fill(12)(1.0))
@@ -28,6 +28,7 @@ class LogisticAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   @transient var instances: Array[Instance] = _
   @transient var instancesConstantFeature: Array[Instance] = _
+  @transient var instancesConstantFeatureFiltered: Array[Instance] = _
 
   override def beforeAll(): Unit = {
     super.beforeAll()
@@ -41,6 +42,11 @@ class LogisticAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
       Instance(1.0, 0.5, Vectors.dense(1.0, 1.0)),
       Instance(2.0, 0.3, Vectors.dense(1.0, 0.5))
     )
+    instancesConstantFeatureFiltered = Array(
+      Instance(0.0, 0.1, Vectors.dense(2.0)),
+      Instance(1.0, 0.5, Vectors.dense(1.0)),
+      Instance(2.0, 0.3, Vectors.dense(0.5))
+    )
   }
 
   /** Get summary statistics for some data and create a new LogisticAggregator. */
@@ -233,21 +239,44 @@ class LogisticAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
     val binaryInstances = instancesConstantFeature.map { instance =>
       if (instance.label <= 1.0) instance else Instance(0.0, instance.weight, instance.features)
     }
+    val binaryInstancesFiltered = instancesConstantFeatureFiltered.map { instance =>
+      if (instance.label <= 1.0) instance else Instance(0.0, instance.weight, instance.features)
+    }
     val coefArray = Array(1.0, 2.0, -2.0, 3.0, 0.0, -1.0)
+    val coefArrayFiltered = Array(3.0, 0.0, -1.0)
     val interceptArray = Array(4.0, 2.0, -3.0)
     val aggConstantFeature = getNewAggregator(instancesConstantFeature,
       Vectors.dense(coefArray ++ interceptArray), fitIntercept = true, isMultinomial = true)
-    instances.foreach(aggConstantFeature.add)
+    val aggConstantFeatureFiltered = getNewAggregator(instancesConstantFeatureFiltered,
+      Vectors.dense(coefArrayFiltered ++ interceptArray), fitIntercept = true, isMultinomial = true)
+
+    instancesConstantFeature.foreach(aggConstantFeature.add)
+    instancesConstantFeatureFiltered.foreach(aggConstantFeatureFiltered.add)
+
     // constant features should not affect gradient
-    assert(aggConstantFeature.gradient(0) === 0.0)
+    def validateGradient(grad: Vector, gradFiltered: Vector, numCoefficientSets: Int): Unit = {
+      for (i <- 0 until numCoefficientSets) {
+        assert(grad(i) === 0.0)
+        assert(grad(numCoefficientSets + i) == gradFiltered(i))
+      }
+    }
+
+    validateGradient(aggConstantFeature.gradient, aggConstantFeatureFiltered.gradient, 3)
 
     val binaryCoefArray = Array(1.0, 2.0)
+    val binaryCoefArrayFiltered = Array(2.0)
     val intercept = 1.0
     val aggConstantFeatureBinary = getNewAggregator(binaryInstances,
       Vectors.dense(binaryCoefArray ++ Array(intercept)), fitIntercept = true,
       isMultinomial = false)
-    instances.foreach(aggConstantFeatureBinary.add)
+    val aggConstantFeatureBinaryFiltered = getNewAggregator(binaryInstancesFiltered,
+      Vectors.dense(binaryCoefArrayFiltered ++ Array(intercept)), fitIntercept = true,
+      isMultinomial = false)
+    binaryInstances.foreach(aggConstantFeatureBinary.add)
+    binaryInstancesFiltered.foreach(aggConstantFeatureBinaryFiltered.add)
+
     // constant features should not affect gradient
-    assert(aggConstantFeatureBinary.gradient(0) === 0.0)
+    validateGradient(aggConstantFeatureBinary.gradient,
+      aggConstantFeatureBinaryFiltered.gradient, 1)
   }
 }
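For completeness, a hypothetical end-to-end check in the spirit of the new suite test: fit a multinomial model on a tiny dataset whose second feature is constant and inspect the fitted matrix. This is only a sketch; the four-row dataset, the app name, and the local master are illustrative and are not part of the patch. It can be run against a local Spark build (or pasted into spark-shell, where `getOrCreate()` reuses the existing session).

```scala
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[2]")
  .appName("zero-variance-check")
  .getOrCreate()

// Tiny illustrative dataset: the second feature is constant (zero variance).
val training = spark.createDataFrame(Seq(
  (0.0, Vectors.dense(1.1, 3.0)),
  (1.0, Vectors.dense(0.4, 3.0)),
  (2.0, Vectors.dense(2.5, 3.0)),
  (1.0, Vectors.dense(0.7, 3.0))
)).toDF("label", "features")

val model = new LogisticRegression()
  .setFamily("multinomial")
  .setStandardization(true)
  .fit(training)

// With the fix, the column for the constant feature stays at 0.0 and no entry is NaN.
println(model.coefficientMatrix)
println(model.interceptVector)

spark.stop()
```

The dedicated suite test above performs the same kind of check on a larger generated dataset, comparing the fitted coefficients and intercepts against reference values from R's glmnet.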