Commit f451d65
fixes from comments in PR
BryanCutler committed Sep 19, 2017
1 parent 69112a5 commit f451d65
Showing 3 changed files with 8 additions and 6 deletions.
python/pyspark/serializers.py (3 additions & 3 deletions)

@@ -231,7 +231,7 @@ def dumps(self, series):
         series = [series]
         series = [(s, None) if not isinstance(s, (list, tuple)) else s for s in series]
         arrs = [pa.Array.from_pandas(s[0], type=s[1], mask=s[0].isnull()) for s in series]
-        batch = pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in range(len(arrs))])
+        batch = pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in xrange(len(arrs))])
         return super(ArrowPandasSerializer, self).dumps(batch)

     def loads(self, obj):
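[Note] Why range becomes xrange: on Python 2, range() materializes a full list while xrange() is lazy. pyspark.serializers keeps this portable with a module-level compat alias; a minimal sketch of that shim, assuming it sits near the top of the module (the exact form in serializers.py may differ):

    import sys

    # Python 3 removed xrange, and its range is already lazy, so aliasing
    # the name lets the Python-2-friendly xrange(...) calls run unchanged.
    if sys.version >= '3':
        xrange = range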
@@ -241,9 +241,9 @@ def loads(self, obj):
"""
import pyarrow as pa
reader = pa.RecordBatchFileReader(pa.BufferReader(obj))
batches = [reader.get_batch(i) for i in range(reader.num_record_batches)]
batches = [reader.get_batch(i) for i in xrange(reader.num_record_batches)]
# NOTE: a 0-parameter pandas_udf will produce an empty batch that can have num_rows set
num_rows = sum([batch.num_rows for batch in batches])
num_rows = sum((batch.num_rows for batch in batches))
table = pa.Table.from_batches(batches)
return [c.to_pandas() for c in table.itercolumns()] + [{"length": num_rows}]
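[Note] The sum() change swaps a list comprehension for a generator expression, so the batch row counts are consumed lazily instead of being collected into a throwaway list first. A small self-contained illustration, with made-up batch sizes:

    batch_row_counts = [10, 0, 25]  # hypothetical num_rows per batch

    total_via_list = sum([n for n in batch_row_counts])  # builds a temporary list
    total_via_gen = sum(n for n in batch_row_counts)     # no intermediate list
    assert total_via_list == total_via_gen == 35

When the generator is the sole argument, the extra parentheses in sum((...)) are optional.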


python/pyspark/sql/tests.py (1 addition & 1 deletion)

@@ -3231,7 +3231,7 @@ def test_vectorized_udf_null_string(self):
     def test_vectorized_udf_zero_parameter(self):
         from pyspark.sql.functions import pandas_udf
         import pandas as pd
-        df = self.spark.range(100000)
+        df = self.spark.range(10)
         f0 = pandas_udf(lambda **kwargs: pd.Series(1).repeat(kwargs['length']), LongType())
         res = df.select(f0())
         self.assertEquals(df.select(lit(1)).collect(), res.collect())
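[Note] Shrinking range(100000) to range(10) keeps the test's coverage (a zero-parameter pandas_udf must still emit one row per input row) while making it much cheaper to run. A pure-pandas simulation of what the assertion checks, a sketch assuming the worker-side contract in this PR where a 0-parameter UDF receives only a kwargs dict carrying the batch length:

    import pandas as pd

    # Stand-in for the zero-parameter pandas_udf above: given only the
    # batch length, it must return a Series of exactly that length.
    f0 = lambda **kwargs: pd.Series(1).repeat(kwargs['length'])

    result = f0(length=10)
    assert len(result) == 10    # one value per input row
    assert (result == 1).all()  # matches df.select(lit(1))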

python/pyspark/worker.py (4 additions & 2 deletions)

@@ -74,14 +74,16 @@ def wrap_udf(f, return_type):


 def wrap_pandas_udf(f, return_type):
+    arrow_return_type = toArrowType(return_type)
+
     def verify_result_length(*a):
         kwargs = a[-1]
         result = f(*a[:-1], **kwargs)
         if len(result) != kwargs["length"]:
             raise RuntimeError("Result vector from pandas_udf was not the required length: "
-                               "expected %d, got %d\nUse input vector length or kwarg['length']"
+                               "expected %d, got %d\nUse input vector length or kwargs['length']"
                                % (kwargs["length"], len(result)))
-        return result, toArrowType(return_type)
+        return result, arrow_return_type
     return lambda *a: verify_result_length(*a)
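[Note] The worker.py change hoists toArrowType(return_type) out of the per-call closure: the Arrow type depends only on the declared return type, so it can be computed once when the UDF is wrapped rather than on every batch. A self-contained sketch of the pattern, where to_arrow_type is a toy stand-in and not the real toArrowType:

    def to_arrow_type(return_type):
        # toy mapping standing in for pyspark's toArrowType
        return {"long": "int64"}.get(return_type, "unknown")

    def wrap(f, return_type):
        arrow_return_type = to_arrow_type(return_type)  # computed once, at wrap time

        def call(*args):
            return f(*args), arrow_return_type          # reused on every call
        return call

    add_one = wrap(lambda x: x + 1, "long")
    print(add_one(41))  # (42, 'int64')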

