Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-30699][ML][PYSPARK] GMM blockify input vectors #27473

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ private[spark] object BLAS extends Serializable {
}

/**
* Adds alpha * x * x.t to a matrix in-place. This is the same as BLAS's ?SPR.
* Adds alpha * v * v.t to a matrix in-place. This is the same as BLAS's ?SPR.
*
* @param U the upper triangular part of the matrix packed in an array (column major)
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class MultivariateGaussian @Since("2.0.0") (
*/
@transient private lazy val tuple = {
val (rootSigmaInv, u) = calculateCovarianceConstants
val rootSigmaInvMat = Matrices.fromBreeze(rootSigmaInv)
val rootSigmaInvMat = Matrices.fromBreeze(rootSigmaInv).toDense
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See comment in #29782

val rootSigmaInvMulMu = rootSigmaInvMat.multiply(mean)
(rootSigmaInvMat, u, rootSigmaInvMulMu)
}
Expand All @@ -81,6 +81,36 @@ class MultivariateGaussian @Since("2.0.0") (
u - 0.5 * BLAS.dot(v, v)
}

private[ml] def pdf(X: Matrix): DenseVector = {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See comment in #29782

val mat = DenseMatrix.zeros(X.numRows, X.numCols)
pdf(X, mat)
}

/**
 * Evaluates the density for every row of `X` in one pass, using `mat` as the
 * caller-supplied work buffer handed to `BLAS.gemm` as its output argument.
 *
 * For each row `i`, the result holds
 * `exp(u - 0.5 * (|rootSigmaInvMulMu|^2 + |mat_i|^2 - 2 * (mat_i dot rootSigmaInvMulMu)))`,
 * where `mat_i` is row `i` of the buffer after the `gemm` call on `X` and
 * `rootSigmaInvMat.transpose`.
 *
 * @param X   input matrix; one density value is produced per row of the buffer
 * @param mat dense, non-transposed buffer passed to `gemm`; its contents are
 *            presumably overwritten (beta is 0.0) - shape requirements are left
 *            to `BLAS.gemm`
 * @return a dense vector with `mat.numRows` entries, one density per row
 */
private[ml] def pdf(X: Matrix, mat: DenseMatrix): DenseVector = {
  // The strided row access below indexes mat.values directly, so a transposed
  // buffer is rejected up front.
  require(!mat.isTransposed)

  BLAS.gemm(1.0, X, rootSigmaInvMat.transpose, 0.0, mat)
  val rowCount = mat.numRows
  val colCount = mat.numCols

  // Cross terms: one dot product of each buffer row with rootSigmaInvMulMu.
  val densities = mat.multiply(rootSigmaInvMulMu)

  val blasImpl = BLAS.getBLAS(colCount)
  val shiftValues = rootSigmaInvMulMu.values
  val shiftNormSq = blasImpl.ddot(colCount, shiftValues, 1, shiftValues, 1)

  val logNormalizer = u
  val buffer = mat.values
  val out = densities.values
  var row = 0
  while (row < rowCount) {
    // Offset `row` with stride `rowCount` visits the `colCount` entries of one
    // row of the (non-transposed) buffer.
    val rowNormSq = blasImpl.ddot(colCount, buffer, row, rowCount, buffer, row, rowCount)
    val cross = out(row)
    val squaredSum = shiftNormSq + rowNormSq - cross - cross
    out(row) = math.exp(logNormalizer - 0.5 * squaredSum)
    row += 1
  }

  densities
}

/**
* Calculate distribution dependent components used for the density function:
* pdf(x) = (2*pi)^(-k/2)^ * det(sigma)^(-1/2)^ * exp((-1/2) * (x-mu).t * inv(sigma) * (x-mu))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class MultivariateGaussianSuite extends SparkMLFunSuite {
test("univariate") {
val x1 = Vectors.dense(0.0)
val x2 = Vectors.dense(1.5)
val mat = Matrices.fromVectors(Seq(x1, x2))

val mu = Vectors.dense(0.0)
val sigma1 = Matrices.dense(1, 1, Array(1.0))
Expand All @@ -35,18 +36,21 @@ class MultivariateGaussianSuite extends SparkMLFunSuite {
assert(dist1.logpdf(x2) ~== -2.0439385332046727 absTol 1E-5)
assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5)
assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5)
assert(dist1.pdf(mat) ~== Vectors.dense(0.39894, 0.12952) absTol 1E-5)

val sigma2 = Matrices.dense(1, 1, Array(4.0))
val dist2 = new MultivariateGaussian(mu, sigma2)
assert(dist2.logpdf(x1) ~== -1.612085713764618 absTol 1E-5)
assert(dist2.logpdf(x2) ~== -1.893335713764618 absTol 1E-5)
assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5)
assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5)
assert(dist2.pdf(mat) ~== Vectors.dense(0.19947, 0.15057) absTol 1E-5)
}

test("multivariate") {
val x1 = Vectors.dense(0.0, 0.0)
val x2 = Vectors.dense(1.0, 1.0)
val mat = Matrices.fromVectors(Seq(x1, x2))

val mu = Vectors.dense(0.0, 0.0)
val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
Expand All @@ -55,28 +59,33 @@ class MultivariateGaussianSuite extends SparkMLFunSuite {
assert(dist1.logpdf(x2) ~== -2.8378770664093453 absTol 1E-5)
assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5)
assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5)
assert(dist1.pdf(mat) ~== Vectors.dense(0.15915, 0.05855) absTol 1E-5)

val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0))
val dist2 = new MultivariateGaussian(mu, sigma2)
assert(dist2.logpdf(x1) ~== -2.810832140937002 absTol 1E-5)
assert(dist2.logpdf(x2) ~== -3.3822607123655732 absTol 1E-5)
assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5)
assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5)
assert(dist2.pdf(mat) ~== Vectors.dense(0.060155, 0.033971) absTol 1E-5)
}

test("multivariate degenerate") {
  // Covariance whose four entries are all 1.0, paired with a zero mean.
  val zeroMean = Vectors.dense(0.0, 0.0)
  val allOnesCov = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0))
  val gaussian = new MultivariateGaussian(zeroMean, allOnesCov)

  // Two query points, also stacked into a matrix for the batched pdf overload.
  val origin = Vectors.dense(0.0, 0.0)
  val onesPoint = Vectors.dense(1.0, 1.0)
  val stacked = Matrices.fromVectors(Seq(origin, onesPoint))

  assert(gaussian.pdf(origin) ~== 0.11254 absTol 1E-5)
  assert(gaussian.pdf(onesPoint) ~== 0.068259 absTol 1E-5)
  // The matrix overload must agree with the per-vector results above.
  assert(gaussian.pdf(stacked) ~== Vectors.dense(0.11254, 0.068259) absTol 1E-5)
}

test("SPARK-11302") {
val x = Vectors.dense(629, 640, 1.7188, 618.19)
val mat = Matrices.fromVectors(Seq(x))
val mu = Vectors.dense(
1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697)
val sigma = Matrices.dense(4, 4, Array(
Expand All @@ -87,5 +96,6 @@ class MultivariateGaussianSuite extends SparkMLFunSuite {
val dist = new MultivariateGaussian(mu, sigma)
// Agrees with R's dmvnorm: 7.154782e-05
assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9)
assert(dist.pdf(mat) ~== Vectors.dense(7.154782224045512E-5) absTol 1E-5)
}
}
Loading