From 5a8982089c95958b6bb08be93f5f4eb6d28e1846 Mon Sep 17 00:00:00 2001
From: Ian Thomas
Date: Fri, 23 Jun 2023 19:10:51 +0100
Subject: [PATCH] Ensure categorical column order is the same across dask partitions (#1239)

---
 datashader/core.py            |  2 +-
 datashader/tests/test_dask.py | 33 +++++++++++++++++++++++++++++++++
 datashader/utils.py           |  2 +-
 3 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/datashader/core.py b/datashader/core.py
index f0914413a..38cf45800 100644
--- a/datashader/core.py
+++ b/datashader/core.py
@@ -1289,7 +1289,7 @@ def _bypixel_sanitise(source, glyph, agg):
                 source[glyph.geometry].array._sindex = sindex
         dshape = dshape_from_pandas(source)
     elif isinstance(source, dd.DataFrame):
-        dshape = dshape_from_dask(source)
+        dshape, source = dshape_from_dask(source)
     elif isinstance(source, Dataset):
         # Multi-dimensional Dataset
         dshape = dshape_from_xarray_dataset(source)
diff --git a/datashader/tests/test_dask.py b/datashader/tests/test_dask.py
index f3b021528..b53f3886c 100644
--- a/datashader/tests/test_dask.py
+++ b/datashader/tests/test_dask.py
@@ -2128,3 +2128,36 @@ def test_dataframe_dtypes(ddf, npartitions):
     ddf = ddf.repartition(npartitions)
     assert ddf.npartitions == npartitions
     ds.Canvas(2, 2).points(ddf, 'x', 'y', ds.count())
+
+
+@pytest.mark.parametrize('on_gpu', [False, True])
+def test_dask_categorical_counts(on_gpu):
+    # Issue 1202
+    if on_gpu and not test_gpu:
+        pytest.skip('gpu tests not enabled')
+
+    df = pd.DataFrame(
+        data=dict(
+            x = [0, 1, 2, 0, 1, 2, 1, 1, 1, 1, 1, 1],
+            y = [0]*12,
+            cat = ['a', 'b', 'c', 'a', 'b', 'c', 'b', 'b', 'b', 'b', 'b', 'c'],
+        )
+    )
+    ddf = dd.from_pandas(df, npartitions=2)
+    assert ddf.npartitions == 2
+    ddf.cat = ddf.cat.astype('category')
+
+    # Categorical counts at the dataframe level to confirm test is reasonable.
+    cat_totals = ddf.cat.value_counts().compute()
+    assert cat_totals['a'] == 2
+    assert cat_totals['b'] == 7
+    assert cat_totals['c'] == 3
+
+    canvas = ds.Canvas(3, 1, x_range=(0, 2), y_range=(-1, 1))
+    agg = canvas.points(ddf, 'x', 'y', ds.by("cat", ds.count()))
+    assert all(agg.cat == ['a', 'b', 'c'])
+
+    # Prior to fix, this gives [7, 3, 2]
+    sum_cat = agg.sum(dim=['x', 'y'])
+    assert all(sum_cat.cat == ['a', 'b', 'c'])
+    assert all(sum_cat.values == [2, 7, 3])
diff --git a/datashader/utils.py b/datashader/utils.py
index dd0f4cbc1..09944137e 100644
--- a/datashader/utils.py
+++ b/datashader/utils.py
@@ -456,7 +456,7 @@ def dshape_from_dask(df):
     # for dask-cudf DataFrames with multiple partitions
     return datashape.var * datashape.Record([
         (k, dshape_from_pandas_helper(df[k].get_partition(0))) for k in df.columns
-    ])
+    ]), df
 
 
 def dshape_from_xarray_dataset(xr_ds):