Skip to content

Commit

Permalink
Fix bug in countApproxDistinct() when the RDD has more than one partition
Browse files Browse the repository at this point in the history
  • Loading branch information
davies committed Aug 6, 2014
1 parent e537b33 commit bf757ce
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 6 deletions.
2 changes: 1 addition & 1 deletion core/src/main/scala/org/apache/spark/rdd/RDD.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1004,7 +1004,7 @@ abstract class RDD[T: ClassTag](
},
(h1: HyperLogLogPlus, h2: HyperLogLogPlus) => {
h1.addAll(h2)
- h2
+ h1
}).cardinality()
}

Expand Down
10 changes: 5 additions & 5 deletions core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,11 @@ class RDDSuite extends FunSuite with SharedSparkContext {

// Relative error of the estimate `est` with respect to the expected count `size`.
def error(est: Long, size: Long) = {
  val absoluteDiff = math.abs(est - size)
  // Converting the divisor to Double forces floating-point division.
  absoluteDiff / size.toDouble
}

- val size = 100
- val uniformDistro = for (i <- 1 to 100000) yield i % size
- val simpleRdd = sc.makeRDD(uniformDistro)
- assert(error(simpleRdd.countApproxDistinct(4, 0), size) < 0.4)
- assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.1)
+ val size = 1000
+ val uniformDistro = for (i <- 1 to 5000) yield i % size
+ val simpleRdd = sc.makeRDD(uniformDistro, 10)
+ assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.2)
+ assert(error(simpleRdd.countApproxDistinct(12, 0), size) < 0.1)
}

test("SparkContext.union") {
Expand Down

0 comments on commit bf757ce

Please sign in to comment.