Skip to content

Commit

Permalink
[SPARK-32907][ML] adaptively blockify instances - revert blockify gmm
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
Revert the blockification of GMM.

### Why are the changes needed?
WeichenXu123 and I think we should use memory size, rather than the number of rows, to blockify instances; consequently, any buffer whose size is large and determined by the number of rows should be discarded.
In GMM, we found that the pre-allocated memory may be too large and should be discarded:
```
transient private lazy val auxiliaryPDFMat = DenseMatrix.zeros(blockSize, numFeatures)
```
We had an offline discussion and concluded that it is better to revert the blockification of GMM.

### Does this PR introduce _any_ user-facing change?
Yes: `blockSize`, which was added in the master branch, will be removed.

### How was this patch tested?
Existing test suites.

Closes #29782 from zhengruifeng/unblockify_gmm.

Authored-by: zhengruifeng <ruifengz@foxmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
  • Loading branch information
zhengruifeng committed Sep 23, 2020
1 parent 21b7479 commit 432afac
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 294 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class MultivariateGaussian @Since("2.0.0") (
*/
@transient private lazy val tuple = {
val (rootSigmaInv, u) = calculateCovarianceConstants
val rootSigmaInvMat = Matrices.fromBreeze(rootSigmaInv).toDense
val rootSigmaInvMat = Matrices.fromBreeze(rootSigmaInv)
val rootSigmaInvMulMu = rootSigmaInvMat.multiply(mean)
(rootSigmaInvMat, u, rootSigmaInvMulMu)
}
Expand All @@ -81,36 +81,6 @@ class MultivariateGaussian @Since("2.0.0") (
u - 0.5 * BLAS.dot(v, v)
}

/**
 * Batch form of the density function: evaluates the density for the points stored in `X`
 * and returns the results as a single vector.
 *
 * A zero-filled `X.numRows` x `X.numCols` work matrix is allocated on every call and handed
 * to the two-argument overload, which does the actual computation.
 *
 * @param X matrix holding the points to evaluate
 * @return the vector produced by the two-argument overload
 */
private[ml] def pdf(X: Matrix): DenseVector =
  pdf(X, DenseMatrix.zeros(X.numRows, X.numCols))

/**
 * Batch form of the density function that reuses a caller-supplied work matrix.
 *
 * `mat` is overwritten by this call: it is used as the output of the `BLAS.gemm` call below
 * and then read back row by row. It must not be transposed, because the loop addresses its
 * backing array directly with a fixed stride.
 *
 * @param X   matrix holding the points to evaluate
 * @param mat scratch matrix; its contents are destroyed
 * @return a vector with `mat.numRows` entries, one density value per row
 */
private[ml] def pdf(X: Matrix, mat: DenseMatrix): DenseVector = {
// The strided reads of mat.values in the loop below depend on a non-transposed layout.
require(!mat.isTransposed)

// With alpha = 1.0 and beta = 0.0 this presumably stores X * rootSigmaInvMat^T into mat,
// discarding whatever mat held before -- NOTE(review): confirm against BLAS.gemm's contract.
BLAS.gemm(1.0, X, rootSigmaInvMat.transpose, 0.0, mat)
val m = mat.numRows
val n = mat.numCols

// Entry i is the product of row i of mat with rootSigmaInvMulMu. The same vector is then
// reused as the output buffer and overwritten in the loop.
val pdfVec = mat.multiply(rootSigmaInvMulMu)

val blas = BLAS.getBLAS(n)
// Dot product of rootSigmaInvMulMu with itself; it does not depend on the row, so it is
// computed once outside the loop.
val squared1 = blas.ddot(n, rootSigmaInvMulMu.values, 1, rootSigmaInvMulMu.values, 1)

// Copy the field into a local before the loop.
val localU = u
var i = 0
while (i < m) {
// n elements starting at offset i with stride m: presumably row i of mat in a column-major
// layout, dotted with itself -- NOTE(review): confirm the ddot offset/stride argument order.
val squared2 = blas.ddot(n, mat.values, i, m, mat.values, i, m)
val dot = pdfVec(i)
// squared1 + squared2 - 2 * dot is the expansion of the squared Euclidean distance between
// row i of mat and rootSigmaInvMulMu.
val squaredSum = squared1 + squared2 - dot - dot
// Same form as the per-vector log-density (u - 0.5 * squared norm), exponentiated.
pdfVec.values(i) = math.exp(localU - 0.5 * squaredSum)
i += 1
}

pdfVec
}

/**
* Calculate distribution dependent components used for the density function:
* pdf(x) = (2*pi)^(-k/2)^ * det(sigma)^(-1/2)^ * exp((-1/2) * (x-mu).t * inv(sigma) * (x-mu))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ class MultivariateGaussianSuite extends SparkMLFunSuite {
test("univariate") {
val x1 = Vectors.dense(0.0)
val x2 = Vectors.dense(1.5)
val mat = Matrices.fromVectors(Seq(x1, x2))

val mu = Vectors.dense(0.0)
val sigma1 = Matrices.dense(1, 1, Array(1.0))
Expand All @@ -36,21 +35,18 @@ class MultivariateGaussianSuite extends SparkMLFunSuite {
assert(dist1.logpdf(x2) ~== -2.0439385332046727 absTol 1E-5)
assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5)
assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5)
assert(dist1.pdf(mat) ~== Vectors.dense(0.39894, 0.12952) absTol 1E-5)

val sigma2 = Matrices.dense(1, 1, Array(4.0))
val dist2 = new MultivariateGaussian(mu, sigma2)
assert(dist2.logpdf(x1) ~== -1.612085713764618 absTol 1E-5)
assert(dist2.logpdf(x2) ~== -1.893335713764618 absTol 1E-5)
assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5)
assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5)
assert(dist2.pdf(mat) ~== Vectors.dense(0.19947, 0.15057) absTol 1E-5)
}

test("multivariate") {
val x1 = Vectors.dense(0.0, 0.0)
val x2 = Vectors.dense(1.0, 1.0)
val mat = Matrices.fromVectors(Seq(x1, x2))

val mu = Vectors.dense(0.0, 0.0)
val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
Expand All @@ -59,33 +55,28 @@ class MultivariateGaussianSuite extends SparkMLFunSuite {
assert(dist1.logpdf(x2) ~== -2.8378770664093453 absTol 1E-5)
assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5)
assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5)
assert(dist1.pdf(mat) ~== Vectors.dense(0.15915, 0.05855) absTol 1E-5)

val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0))
val dist2 = new MultivariateGaussian(mu, sigma2)
assert(dist2.logpdf(x1) ~== -2.810832140937002 absTol 1E-5)
assert(dist2.logpdf(x2) ~== -3.3822607123655732 absTol 1E-5)
assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5)
assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5)
assert(dist2.pdf(mat) ~== Vectors.dense(0.060155, 0.033971) absTol 1E-5)
}

// Checks pdf when the covariance matrix is singular.
test("multivariate degenerate") {
val x1 = Vectors.dense(0.0, 0.0)
val x2 = Vectors.dense(1.0, 1.0)
// Both points packed into one matrix, used for the matrix overload of pdf below.
val mat = Matrices.fromVectors(Seq(x1, x2))

val mu = Vectors.dense(0.0, 0.0)
// 2x2 matrix of all ones: its determinant is 1*1 - 1*1 = 0, hence "degenerate".
val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0))
val dist = new MultivariateGaussian(mu, sigma)
assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5)
assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5)
// The matrix overload is expected to give the same two values as the per-vector calls above.
assert(dist.pdf(mat) ~== Vectors.dense(0.11254, 0.068259) absTol 1E-5)
}

test("SPARK-11302") {
val x = Vectors.dense(629, 640, 1.7188, 618.19)
val mat = Matrices.fromVectors(Seq(x))
val mu = Vectors.dense(
1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697)
val sigma = Matrices.dense(4, 4, Array(
Expand All @@ -96,6 +87,5 @@ class MultivariateGaussianSuite extends SparkMLFunSuite {
val dist = new MultivariateGaussian(mu, sigma)
// Agrees with R's dmvnorm: 7.154782e-05
assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9)
assert(dist.pdf(mat) ~== Vectors.dense(7.154782224045512E-5) absTol 1E-5)
}
}
Loading

0 comments on commit 432afac

Please sign in to comment.