From 78bf997f13c6f08129671a9d6a3484620d5b37a2 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 13 Nov 2014 13:08:10 -0800 Subject: [PATCH] fix tests, do not use numpy in randomSplit, no performance gain --- python/pyspark/rdd.py | 6 +++--- python/pyspark/rddsampler.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 0e8920281e842..50535d2711708 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -325,11 +325,11 @@ def randomSplit(self, weights, seed=None): :return: split RDDs in a list >>> rdd = sc.parallelize(range(5), 1) - >>> rdd1, rdd2 = rdd.randomSplit([2, 3], 101) + >>> rdd1, rdd2 = rdd.randomSplit([2, 3], 17) >>> rdd1.collect() - [2, 3] + [1, 3] >>> rdd2.collect() - [0, 1, 4] + [0, 2, 4] """ s = float(sum(weights)) cweights = [0.0] diff --git a/python/pyspark/rddsampler.py b/python/pyspark/rddsampler.py index 4365640040116..558dcfd12d46f 100644 --- a/python/pyspark/rddsampler.py +++ b/python/pyspark/rddsampler.py @@ -119,6 +119,7 @@ class RDDRangeSampler(RDDSamplerBase): def __init__(self, lowerBound, upperBound, seed=None): RDDSamplerBase.__init__(self, False, seed) + self._use_numpy = False # no performance gain from numpy self._lowerBound = lowerBound self._upperBound = upperBound