Skip to content

Commit

Permalink
add key to top()
Browse files Browse the repository at this point in the history
  • Loading branch information
davies committed Aug 23, 2014
1 parent ad7e374 commit ccbaf25
Showing 1 changed file with 6 additions and 10 deletions.
16 changes: 6 additions & 10 deletions python/pyspark/rdd.py
Original file line number Diff line number Diff line change
Expand Up @@ -938,7 +938,7 @@ def mergeMaps(m1, m2):
return m1
return self.mapPartitions(countPartition).reduce(mergeMaps)

- def top(self, num):
+ def top(self, num, key=None):
"""
Get the top N elements from a RDD.
Expand All @@ -947,20 +947,16 @@ def top(self, num):
[12]
>>> sc.parallelize([2, 3, 4, 5, 6], 2).top(2)
[6, 5]
+ >>> sc.parallelize([10, 4, 2, 12, 3]).top(3, key=str)
+ [4, 3, 2]
"""
def topIterator(iterator):
- q = []
- for k in iterator:
- if len(q) < num:
- heapq.heappush(q, k)
- else:
- heapq.heappushpop(q, k)
- yield q
+ yield heapq.nlargest(num, iterator, key=key)

def merge(a, b):
- return next(topIterator(a + b))
+ return heapq.nlargest(num, a + b, key=key)

- return sorted(self.mapPartitions(topIterator).reduce(merge), reverse=True)
+ return self.mapPartitions(topIterator).reduce(merge)

def takeOrdered(self, num, key=None):
"""
Expand Down

0 comments on commit ccbaf25

Please sign in to comment.