From 64785659573e9b8fbbc86c094839224153fa23a0 Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Fri, 3 Dec 2021 11:29:47 -0800 Subject: [PATCH 1/3] FIX-#3736: Don't use a 2-d array to set a categorical column. Signed-off-by: mvashishtha --- .../pandas/partitioning/partition_manager.py | 20 ++++++++++--- modin/pandas/indexing.py | 29 ++++++++++++++++++- modin/pandas/test/dataframe/test_indexing.py | 9 ++++++ modin/pandas/test/test_series.py | 9 ++++++ 4 files changed, 62 insertions(+), 5 deletions(-) diff --git a/modin/core/dataframe/pandas/partitioning/partition_manager.py b/modin/core/dataframe/pandas/partitioning/partition_manager.py index 8aa4fc69600..92089d6f8b5 100644 --- a/modin/core/dataframe/pandas/partitioning/partition_manager.py +++ b/modin/core/dataframe/pandas/partitioning/partition_manager.py @@ -1201,11 +1201,23 @@ def compute_part_size(indexer, remote_part, part_idx, axis): col_internal_idx, remote_part, col_idx, axis=1 ) + # We want to eventually make item_to_distribute an np.ndarray, + # but that doesn't work for setting a subset of a categorical + # column, as per https://github.com/modin-project/modin/issues/3736. + # In that case, `item` is not an ndarray but instead some + # categorical variable, which we we don't need to distribute + # at all. Note that np.ndarray is not hashable, so it can't + # be a categorical variable. + # TODO(https://github.com/pandas-dev/pandas/issues/44703): Delete + # this special case once the pandas bug is fixed. if item_to_distribute is not None: - item = item_to_distribute[ - row_position_counter : row_position_counter + row_offset, - col_position_counter : col_position_counter + col_offset, - ] + if isinstance(item_to_distribute, np.ndarray): + item = item_to_distribute[ + row_position_counter : row_position_counter + row_offset, + col_position_counter : col_position_counter + col_offset, + ] + else: + item = item_to_distribute item = {"item": item} else: item = {} diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index d1db847213e..02d57b622ad 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -385,7 +385,34 @@ def __setitem__(self, row_lookup, col_lookup, item, axis=None): else: new_col_len = len(col_lookup) to_shape = new_row_len, new_col_len - item = self._broadcast_item(row_lookup, col_lookup, item, to_shape) + # If we are asssigning to a single categorical column, we won't be + # able to set the column to a 2-d array due to a bug in Pandas. + # Until that bug is fixed, don't convert `item` to a 2-d array + # in that case. + # TODO(https://github.com/pandas-dev/pandas/issues/44703): Delete + # this special case once the pandas bug is fixed. + assigning_to_single_category_column = new_col_len == 1 and ( + # Case 1: df = pd.DataFrame({"status": ["a", "b", "c"]}, + # dtype="category") + # Then type(df.dtypes) is pandas.core.series.Series + ( + type(self.df.dtypes) == pandas.core.series.Series + and self.df.dtypes[col_lookup][0].name == "category" + ) + # Case 2: df = pd.Series( ["a", "b", "c"], dtype="category") + # Then df.dtypes == CategoricalDtype(categories=['a', 'b', 'c'], + # ordered=False) + # and type(df.dtypes) is pandas.core.dtypes.dtypes.CategoricalDtype. + # Case 3: df = pd.Series([0, 1, 2], index=['a', 'b', 'c']) + # Then df.dtypes == dtype('int64') and type(df.dtypes) is + # numpy.dtype + or ( + hasattr(type(self.df.dtypes), "name") + and self.df.dtypes.name == "category" + ) + ) + if not assigning_to_single_category_column: + item = self._broadcast_item(row_lookup, col_lookup, item, to_shape) self._write_items(row_lookup, col_lookup, item) def _broadcast_item(self, row_lookup, col_lookup, item, to_shape): diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py index e7998c9cd84..818d1980e23 100644 --- a/modin/pandas/test/dataframe/test_indexing.py +++ b/modin/pandas/test/dataframe/test_indexing.py @@ -372,6 +372,15 @@ def test_loc(data): modin_df.loc["NO_EXIST"] +# This tests the bug from https://github.com/modin-project/modin/issues/3736 +def test_loc_setting_single_categorical_column(): + modin_df = pd.DataFrame({"status": ["a", "b", "c"]}, dtype="category") + pandas_df = pandas.DataFrame({"status": ["a", "b", "c"]}, dtype="category") + modin_df.loc[1:3, "status"] = "a" + pandas_df.loc[1:3, "status"] = "a" + df_equals(modin_df, pandas_df) + + def test_loc_multi_index(): modin_df = pd.read_csv( "modin/pandas/test/data/blah.csv", header=[0, 1, 2, 3], index_col=0 diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 887e4c6a8d1..3c03ca243b3 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -2105,6 +2105,15 @@ def test_loc(data): df_equals(modin_result, pandas_result) +# This tests the bug from https://github.com/modin-project/modin/issues/3736 +def test_loc_setting_categorical_series(): + modin_series = pd.Series(["a", "b", "c"], dtype="category") + pandas_series = pandas.Series(["a", "b", "c"], dtype="category") + modin_series.loc[1:3] = "a" + pandas_series.loc[1:3] = "a" + df_equals(modin_series, pandas_series) + + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_lt(data): modin_series, pandas_series = create_test_series(data) From ce99a39c8ef224611f9dc31315d0c03e6e5c1437 Mon Sep 17 00:00:00 2001 From: Mahesh Vashishtha Date: Mon, 6 Dec 2021 10:49:55 -0800 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: Vasily Litvinov --- modin/pandas/indexing.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index 02d57b622ad..a53a0a4b38d 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -396,7 +396,7 @@ def __setitem__(self, row_lookup, col_lookup, item, axis=None): # dtype="category") # Then type(df.dtypes) is pandas.core.series.Series ( - type(self.df.dtypes) == pandas.core.series.Series + isinstance(self.df.dtypes, pandas.core.series.Series) and self.df.dtypes[col_lookup][0].name == "category" ) # Case 2: df = pd.Series( ["a", "b", "c"], dtype="category") @@ -407,8 +407,7 @@ def __setitem__(self, row_lookup, col_lookup, item, axis=None): # Then df.dtypes == dtype('int64') and type(df.dtypes) is # numpy.dtype or ( - hasattr(type(self.df.dtypes), "name") - and self.df.dtypes.name == "category" + getattr(self.df.dtypes, "name", "") == "category" ) ) if not assigning_to_single_category_column: From 05a4a79c030ce6fb74a580b43645e009ab4cd861 Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Mon, 6 Dec 2021 10:52:06 -0800 Subject: [PATCH 3/3] Fix a format error with black. Signed-off-by: mvashishtha --- .../experimental/xgboost/test/test_dmatrix.py | 70 +++++++------------ modin/pandas/indexing.py | 4 +- 2 files changed, 28 insertions(+), 46 deletions(-) diff --git a/modin/experimental/xgboost/test/test_dmatrix.py b/modin/experimental/xgboost/test/test_dmatrix.py index c9498131ef4..b28f01a2a81 100644 --- a/modin/experimental/xgboost/test/test_dmatrix.py +++ b/modin/experimental/xgboost/test/test_dmatrix.py @@ -13,7 +13,9 @@ import numpy as np import pytest +import pandas from sklearn.metrics import accuracy_score +from sklearn.datasets import load_breast_cancer import xgboost as xgb import modin.pandas as pd @@ -74,67 +76,49 @@ def test_dmatrix_feature_names_and_feature_types(data, feature_names, feature_ty check_dmatrix(data, feature_names=feature_names, feature_types=feature_types) -@pytest.mark.parametrize( - "feature_names", - [ - ["Feature1", "Feature2", "Feature3", "Feature4", "Feature5"], - [u"??1", u"??2", u"??3", u"??4", u"??5"], - ], -) -def test_feature_names(feature_names): - data = np.random.randn(100, 5) - label = np.array([0, 1] * 50) +def test_feature_names(): + dataset = load_breast_cancer() + X = dataset.data + y = dataset.target + feature_names = [f"feat{i}" for i in range(X.shape[1])] check_dmatrix( - data, - label, + X, + y, feature_names=feature_names, ) - dm = xgb.DMatrix(data, label=label, feature_names=feature_names) - md_dm = mxgb.DMatrix( - pd.DataFrame(data), label=pd.Series(label), feature_names=feature_names + dmatrix = xgb.DMatrix(X, label=y, feature_names=feature_names) + md_dmatrix = mxgb.DMatrix( + pd.DataFrame(X), label=pd.Series(y), feature_names=feature_names ) params = { - "objective": "multi:softprob", + "objective": "binary:logistic", "eval_metric": "mlogloss", - "eta": 0.3, - "num_class": 3, } - bst = xgb.train(params, dm, num_boost_round=10) - md_bst = mxgb.train(params, md_dm, num_boost_round=10) + booster = xgb.train(params, dmatrix, num_boost_round=10) + md_booster = mxgb.train(params, md_dmatrix, num_boost_round=10) - predictions = bst.predict(dm) - modin_predictions = md_bst.predict(md_dm) + predictions = booster.predict(dmatrix) + modin_predictions = md_booster.predict(md_dmatrix) - preds = np.asarray([np.argmax(line) for line in predictions]) - md_preds = np.asarray([np.argmax(line) for line in modin_predictions.to_numpy()]) + preds = pandas.DataFrame(predictions).apply(np.round, axis=0) + modin_preds = modin_predictions.apply(np.round, axis=0) - val = accuracy_score(label, preds) - md_val = accuracy_score(label, md_preds) + accuracy = accuracy_score(y, preds) + md_accuracy = accuracy_score(y, modin_preds) - np.testing.assert_allclose(val, md_val, atol=0.02, rtol=0.01) + np.testing.assert_allclose(accuracy, md_accuracy, atol=0.005, rtol=0.002) - dummy = np.random.randn(5, 5) - dm = xgb.DMatrix(dummy, feature_names=feature_names) - md_dm = mxgb.DMatrix(pd.DataFrame(dummy), feature_names=feature_names) - predictions = bst.predict(dm) - modin_predictions = md_bst.predict(md_dm) - - preds = np.asarray([np.argmax(line) for line in predictions]) - md_preds = np.asarray([np.argmax(line) for line in modin_predictions.to_numpy()]) - - assert preds.all() == md_preds.all() - - # different feature names must raises error - dm = xgb.DMatrix(dummy, feature_names=list("abcde")) - md_dm = mxgb.DMatrix(pd.DataFrame(dummy), feature_names=list("abcde")) + # Different feature_names (default) must raise error in this case + dm = xgb.DMatrix(X) + md_dm = mxgb.DMatrix(pd.DataFrame(X)) with pytest.raises(ValueError): - bst.predict(dm) + booster.predict(dm) with pytest.raises(ValueError): - md_bst.predict(md_dm) + repr(md_booster.predict(md_dm)) def test_feature_weights(): diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index a53a0a4b38d..218675f291d 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -406,9 +406,7 @@ def __setitem__(self, row_lookup, col_lookup, item, axis=None): # Case 3: df = pd.Series([0, 1, 2], index=['a', 'b', 'c']) # Then df.dtypes == dtype('int64') and type(df.dtypes) is # numpy.dtype - or ( - getattr(self.df.dtypes, "name", "") == "category" - ) + or (getattr(self.df.dtypes, "name", "") == "category") ) if not assigning_to_single_category_column: item = self._broadcast_item(row_lookup, col_lookup, item, to_shape)