Skip to content

Commit

Permalink
calculate hash in Python
Browse files Browse the repository at this point in the history
  • Loading branch information
davies committed Aug 28, 2014
1 parent 4cba98f commit ded624f
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 6 deletions.
11 changes: 7 additions & 4 deletions python/pyspark/rdd.py
Original file line number Diff line number Diff line change
Expand Up @@ -2008,9 +2008,6 @@ def countApproxDistinct(self, relativeSD=0.05):
of The Art Cardinality Estimation Algorithm", available
<a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
This supports all types of objects that are supported by
Pyrolite, which covers nearly all builtin types.
@param relativeSD Relative accuracy. Smaller values create
counters that require more space.
It must be greater than 0.000017.
Expand All @@ -2026,7 +2023,13 @@ def countApproxDistinct(self, relativeSD=0.05):
raise ValueError("relativeSD should be greater than 0.000017")
if relativeSD > 0.37:
raise ValueError("relativeSD should be smaller than 0.37")
return self._to_java_object_rdd().countApproxDistinct(relativeSD)
hashRDD = self.map(lambda x: portable_hash(x) % sys.maxint)
c = hashRDD._to_java_object_rdd().countApproxDistinct(relativeSD)
# range of hash is [0, sys.maxint), since it is taken modulo sys.maxint
if c > sys.maxint / 30:
# correction for hash collision in Python
c = -sys.maxint * log(1 - float(c) / sys.maxint)
return int(c)


class PipelinedRDD(RDD):
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,13 +409,13 @@ def test_count_approx_distinct(self):
self.assertTrue(950 < rdd.countApproxDistinct(0.04) < 1050)
self.assertTrue(950 < rdd.map(float).countApproxDistinct(0.04) < 1050)
self.assertTrue(950 < rdd.map(str).countApproxDistinct(0.04) < 1050)
self.assertTrue(950 < rdd.map(lambda x: set([x, -x])).countApproxDistinct(0.04) < 1050)
self.assertTrue(950 < rdd.map(lambda x: (x, -x)).countApproxDistinct(0.04) < 1050)

rdd = self.sc.parallelize([i % 20 for i in range(1000)], 7)
self.assertTrue(18 < rdd.countApproxDistinct() < 22)
self.assertTrue(18 < rdd.map(float).countApproxDistinct() < 22)
self.assertTrue(18 < rdd.map(str).countApproxDistinct() < 22)
self.assertTrue(18 < rdd.map(lambda x: set([x, -x])).countApproxDistinct() < 22)
self.assertTrue(18 < rdd.map(lambda x: (x, -x)).countApproxDistinct() < 22)

self.assertRaises(ValueError, lambda: rdd.countApproxDistinct(0.00000001))
self.assertRaises(ValueError, lambda: rdd.countApproxDistinct(0.5))
Expand Down

0 comments on commit ded624f

Please sign in to comment.