You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Calling `describe` on a Dask-cuDF dataframe currently appears to fail because of the data structure we're passing to `as_column`: a NumPy object array whose elements are single-element CuPy arrays. Since the array has object dtype, pyarrow has to guess the type of each element, and it raises an error because it does not recognize CuPy arrays.
import cudf
import dask_cudf
ddf = dask_cudf.from_cudf(cudf.datasets.randomdata(10), 2)
ddf.describe().compute()
---------------------------------------------------------------------------
ArrowInvalid Traceback (most recent call last)
<ipython-input-3-e1d42e6272d7> in <module>
3
4 ddf = dask_cudf.from_cudf(cudf.datasets.randomdata(10), 2)
----> 5 ddf.describe().compute()
/raid/nicholasb/miniconda3/envs/rapids-20200529-cuda102-1005/lib/python3.7/site-packages/dask/base.py in compute(self, **kwargs)
164 dask.base.compute
165 """
--> 166 (result,) = compute(self, traverse=False, **kwargs)
167 return result
168
/raid/nicholasb/miniconda3/envs/rapids-20200529-cuda102-1005/lib/python3.7/site-packages/dask/base.py in compute(*args, **kwargs)
442 postcomputes.append(x.__dask_postcompute__())
443
--> 444 results = schedule(dsk, keys, **kwargs)
445 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
446
/raid/nicholasb/miniconda3/envs/rapids-20200529-cuda102-1005/lib/python3.7/site-packages/dask/local.py in get_sync(dsk, keys, **kwargs)
525 """
526 kwargs.pop("num_workers", None) # if num_workers present, remove it
--> 527 return get_async(apply_sync, 1, dsk, keys, **kwargs)
528
529
/raid/nicholasb/miniconda3/envs/rapids-20200529-cuda102-1005/lib/python3.7/site-packages/dask/local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
492
493 while state["ready"] and len(state["running"]) < num_workers:
--> 494 fire_task()
495
496 succeeded = True
/raid/nicholasb/miniconda3/envs/rapids-20200529-cuda102-1005/lib/python3.7/site-packages/dask/local.py in fire_task()
464 pack_exception,
465 ),
--> 466 callback=queue.put,
467 )
468
/raid/nicholasb/miniconda3/envs/rapids-20200529-cuda102-1005/lib/python3.7/site-packages/dask/local.py in apply_sync(func, args, kwds, callback)
514 def apply_sync(func, args=(), kwds={}, callback=None):
515 """ A naive synchronous version of apply_async """
--> 516 res = func(*args, **kwds)
517 if callback is not None:
518 callback(res)
/raid/nicholasb/miniconda3/envs/rapids-20200529-cuda102-1005/lib/python3.7/site-packages/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
225 failed = False
226 except BaseException as e:
--> 227 result = pack_exception(e, dumps)
228 failed = True
229 return key, result, failed
/raid/nicholasb/miniconda3/envs/rapids-20200529-cuda102-1005/lib/python3.7/site-packages/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
220 try:
221 task, data = loads(task_info)
--> 222 result = _execute_task(task, data)
223 id = get_id()
224 result = dumps((result, id))
/raid/nicholasb/miniconda3/envs/rapids-20200529-cuda102-1005/lib/python3.7/site-packages/dask/core.py in _execute_task(arg, cache, dsk)
119 # temporaries by their reference count and can execute certain
120 # operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122 elif not ishashable(arg):
123 return arg
/raid/nicholasb/miniconda3/envs/rapids-20200529-cuda102-1005/lib/python3.7/site-packages/cudf/core/series.py in __init__(self, data, index, dtype, name, nan_as_null)
168
169 if not isinstance(data, column.ColumnBase):
--> 170 data = column.as_column(data, nan_as_null=nan_as_null, dtype=dtype)
171
172 if index is not None and not isinstance(index, Index):
/raid/nicholasb/miniconda3/envs/rapids-20200529-cuda102-1005/lib/python3.7/site-packages/cudf/core/column/column.py in as_column(arbitrary, nan_as_null, dtype, length)
1488 elif arb_dtype.kind in ("O", "U"):
1489 data = as_column(
-> 1490 pa.Array.from_pandas(arbitrary), dtype=arbitrary.dtype
1491 )
1492 # There is no cast operation available for pa.Array from int to
/raid/nicholasb/miniconda3/envs/rapids-20200529-cuda102-1005/lib/python3.7/site-packages/pyarrow/array.pxi in pyarrow.lib.Array.from_pandas()
/raid/nicholasb/miniconda3/envs/rapids-20200529-cuda102-1005/lib/python3.7/site-packages/pyarrow/array.pxi in pyarrow.lib.array()
/raid/nicholasb/miniconda3/envs/rapids-20200529-cuda102-1005/lib/python3.7/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
/raid/nicholasb/miniconda3/envs/rapids-20200529-cuda102-1005/lib/python3.7/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
ArrowInvalid: Could not convert 989.0 with type cupy.core.core.ndarray: did not recognize Python value type when inferring an Arrow data type
From a quick debug:
ipdb> up
> /tmp/nicholasb/dev/nersc/lsst/notebooks/pyarrow/array.pxi(661)pyarrow.lib.Array.from_pandas()
ipdb> up
> /raid/nicholasb/miniconda3/envs/rapids-20200529-cuda102-1005/lib/python3.7/site-packages/cudf/core/column/column.py(1490)as_column()
1488 elif arb_dtype.kind in ("O", "U"):
1489 data = as_column(
-> 1490 pa.Array.from_pandas(arbitrary), dtype=arbitrary.dtype
1491 )
1492 # There is no cast operation available for pa.Array from int to
ipdb> arbitrary
array([array(979.), array(992.), array(1031.)], dtype=object)
ipdb> arbitrary.dtype
dtype('O')
ipdb> type(arbitrary)
<class 'numpy.ndarray'>
ipdb> type(arbitrary[0])
<class 'cupy.core.core.ndarray'>
Calling `describe` on a Dask-cuDF dataframe currently appears to fail due to the nature of the data structure we're passing to `as_column`, which forces pyarrow to guess what type the object is (causing an error). We're passing a numpy array of single element cupy arrays. From a quick debug:
cuDF commit: e21a0e1
The text was updated successfully, but these errors were encountered: