Skip to content

Commit

Permalink
use fillna before casting with astype
Browse files Browse the repository at this point in the history
  • Loading branch information
BryanCutler committed Sep 21, 2017
1 parent 44a20f6 commit 53926cc
Showing 1 changed file with 10 additions and 1 deletion.
11 changes: 10 additions & 1 deletion python/pyspark/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,16 @@ def dumps(self, series):
(len(series) == 2 and isinstance(series[1], pa.DataType)):
series = [series]
series = ((s, None) if not isinstance(s, (list, tuple)) else s for s in series)
arrs = [pa.Array.from_pandas(s[0], type=s[1], mask=s[0].isnull()) for s in series]

# If a nullable integer series has been promoted to floating point with NaNs, it must be cast
# back to the requested integer type; NaNs are filled with 0 first since they cannot be cast.
# NOTE: this is not necessary with Arrow >= 0.7
def cast_series(s, t):
    """Return ``s`` converted to the pandas dtype that corresponds to ``t``.

    :param s: pandas Series to convert.
    :param t: type object exposing ``to_pandas_dtype()`` (an Arrow data type
        in the surrounding code), or None when no type was requested.
    :return: ``s`` itself when ``t`` is None or the dtype already matches;
        otherwise a Series with nulls filled with 0 and cast to the target
        dtype.

    The 0 written into null slots is only a placeholder so that the cast
    succeeds; the caller is expected to pass a null mask computed from the
    original series so those slots are still treated as null.
    """
    if t is None:
        return s

    # Resolve the target dtype once; it is needed for both the comparison
    # and the cast.
    target_dtype = t.to_pandas_dtype()
    if s.dtype == target_dtype:
        return s

    # NaN cannot be represented in an integer dtype, so fill before casting.
    return s.fillna(0).astype(target_dtype, copy=False)

arrs = [pa.Array.from_pandas(cast_series(s, t), mask=s.isnull(), type=t) for s, t in series]
batch = pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in xrange(len(arrs))])
return super(ArrowPandasSerializer, self).dumps(batch)

Expand Down

0 comments on commit 53926cc

Please sign in to comment.