modin-project · devin-petersohn · Dec 7, 2021 · Dec 3, 2021 · Dec 6, 2021 · Dec 6, 2021
@@ -1201,11 +1201,23 @@ def compute_part_size(indexer, remote_part, part_idx, axis):
                     col_internal_idx, remote_part, col_idx, axis=1
                 )
 
+                # We want to eventually make item_to_distribute an np.ndarray,
+                # but that doesn't work for setting a subset of a categorical
+                # column, as per https://github.com/modin-project/modin/issues/3736.
+                # In that case, `item` is not an ndarray but instead some
+                # categorical variable, which we we don't need to distribute
+                # at all. Note that np.ndarray is not hashable, so it can't
+                # be a categorical variable.
+                # TODO(https://github.com/pandas-dev/pandas/issues/44703): Delete
+                # this special case once the pandas bug is fixed.
                 if item_to_distribute is not None:
-                    item = item_to_distribute[
-                        row_position_counter : row_position_counter + row_offset,
-                        col_position_counter : col_position_counter + col_offset,
-                    ]
+                    if isinstance(item_to_distribute, np.ndarray):
+                        item = item_to_distribute[
+                            row_position_counter : row_position_counter + row_offset,
+                            col_position_counter : col_position_counter + col_offset,
+                        ]
+                    else:
+                        item = item_to_distribute
                     item = {"item": item}
                 else:
                     item = {}

@@ -13,7 +13,9 @@
 
 import numpy as np
 import pytest
+import pandas
 from sklearn.metrics import accuracy_score
+from sklearn.datasets import load_breast_cancer
 import xgboost as xgb
 
 import modin.pandas as pd
@@ -74,67 +76,49 @@ def test_dmatrix_feature_names_and_feature_types(data, feature_names, feature_ty
     check_dmatrix(data, feature_names=feature_names, feature_types=feature_types)
 
 
-@pytest.mark.parametrize(
-    "feature_names",
-    [
-        ["Feature1", "Feature2", "Feature3", "Feature4", "Feature5"],
-        [u"??1", u"??2", u"??3", u"??4", u"??5"],
-    ],
-)
-def test_feature_names(feature_names):
-    data = np.random.randn(100, 5)
-    label = np.array([0, 1] * 50)
+def test_feature_names():
+    dataset = load_breast_cancer()
+    X = dataset.data
+    y = dataset.target
+    feature_names = [f"feat{i}" for i in range(X.shape[1])]
 
     check_dmatrix(
-        data,
-        label,
+        X,
+        y,
         feature_names=feature_names,
     )
 
-    dm = xgb.DMatrix(data, label=label, feature_names=feature_names)
-    md_dm = mxgb.DMatrix(
-        pd.DataFrame(data), label=pd.Series(label), feature_names=feature_names
+    dmatrix = xgb.DMatrix(X, label=y, feature_names=feature_names)
+    md_dmatrix = mxgb.DMatrix(
+        pd.DataFrame(X), label=pd.Series(y), feature_names=feature_names
     )
 
     params = {
-        "objective": "multi:softprob",
+        "objective": "binary:logistic",
         "eval_metric": "mlogloss",
-        "eta": 0.3,
-        "num_class": 3,
     }
 
-    bst = xgb.train(params, dm, num_boost_round=10)
-    md_bst = mxgb.train(params, md_dm, num_boost_round=10)
+    booster = xgb.train(params, dmatrix, num_boost_round=10)
+    md_booster = mxgb.train(params, md_dmatrix, num_boost_round=10)
 
-    predictions = bst.predict(dm)
-    modin_predictions = md_bst.predict(md_dm)
+    predictions = booster.predict(dmatrix)
+    modin_predictions = md_booster.predict(md_dmatrix)
 
-    preds = np.asarray([np.argmax(line) for line in predictions])
-    md_preds = np.asarray([np.argmax(line) for line in modin_predictions.to_numpy()])
+    preds = pandas.DataFrame(predictions).apply(np.round, axis=0)
+    modin_preds = modin_predictions.apply(np.round, axis=0)
 
-    val = accuracy_score(label, preds)
-    md_val = accuracy_score(label, md_preds)
+    accuracy = accuracy_score(y, preds)
+    md_accuracy = accuracy_score(y, modin_preds)
 
-    np.testing.assert_allclose(val, md_val, atol=0.02, rtol=0.01)
+    np.testing.assert_allclose(accuracy, md_accuracy, atol=0.005, rtol=0.002)
 
-    dummy = np.random.randn(5, 5)
-    dm = xgb.DMatrix(dummy, feature_names=feature_names)
-    md_dm = mxgb.DMatrix(pd.DataFrame(dummy), feature_names=feature_names)
-    predictions = bst.predict(dm)
-    modin_predictions = md_bst.predict(md_dm)
-
-    preds = np.asarray([np.argmax(line) for line in predictions])
-    md_preds = np.asarray([np.argmax(line) for line in modin_predictions.to_numpy()])
-
-    assert preds.all() == md_preds.all()
-
-    # different feature names must raises error
-    dm = xgb.DMatrix(dummy, feature_names=list("abcde"))
-    md_dm = mxgb.DMatrix(pd.DataFrame(dummy), feature_names=list("abcde"))
+    # Different feature_names (default) must raise error in this case
+    dm = xgb.DMatrix(X)
+    md_dm = mxgb.DMatrix(pd.DataFrame(X))
     with pytest.raises(ValueError):
-        bst.predict(dm)
+        booster.predict(dm)
     with pytest.raises(ValueError):
-        md_bst.predict(md_dm)
+        repr(md_booster.predict(md_dm))
 
 
 def test_feature_weights():

@@ -385,7 +385,31 @@ def __setitem__(self, row_lookup, col_lookup, item, axis=None):
             else:
                 new_col_len = len(col_lookup)
             to_shape = new_row_len, new_col_len
-            item = self._broadcast_item(row_lookup, col_lookup, item, to_shape)
+            # If we are asssigning to a single categorical column, we won't be
+            # able to set the column to a 2-d array due to a bug in Pandas.
+            # Until that bug is fixed, don't convert `item` to a 2-d array
+            # in that case.
+            # TODO(https://github.com/pandas-dev/pandas/issues/44703): Delete
+            # this special case once the pandas bug is fixed.
+            assigning_to_single_category_column = new_col_len == 1 and (
+                # Case 1: df = pd.DataFrame({"status": ["a", "b", "c"]},
+                #                           dtype="category")
+                # Then type(df.dtypes) is pandas.core.series.Series
+                (
+                    isinstance(self.df.dtypes, pandas.core.series.Series)
+                    and self.df.dtypes[col_lookup][0].name == "category"
+                )
+                # Case 2: df = pd.Series( ["a", "b", "c"],  dtype="category")
+                # Then df.dtypes ==  CategoricalDtype(categories=['a', 'b', 'c'],
+                #                                     ordered=False)
+                # and type(df.dtypes) is pandas.core.dtypes.dtypes.CategoricalDtype.
+                # Case 3:  df = pd.Series([0, 1, 2], index=['a', 'b', 'c'])
+                # Then df.dtypes == dtype('int64') and type(df.dtypes) is
+                # numpy.dtype
+                or (getattr(self.df.dtypes, "name", "") == "category")
+            )
+            if not assigning_to_single_category_column:
+                item = self._broadcast_item(row_lookup, col_lookup, item, to_shape)
             self._write_items(row_lookup, col_lookup, item)
 
     def _broadcast_item(self, row_lookup, col_lookup, item, to_shape):

@@ -372,6 +372,15 @@ def test_loc(data):
         modin_df.loc["NO_EXIST"]
 
 
+# This tests the bug from https://github.com/modin-project/modin/issues/3736
+def test_loc_setting_single_categorical_column():
+    modin_df = pd.DataFrame({"status": ["a", "b", "c"]}, dtype="category")
+    pandas_df = pandas.DataFrame({"status": ["a", "b", "c"]}, dtype="category")
+    modin_df.loc[1:3, "status"] = "a"
+    pandas_df.loc[1:3, "status"] = "a"
+    df_equals(modin_df, pandas_df)
+
+
 def test_loc_multi_index():
     modin_df = pd.read_csv(
         "modin/pandas/test/data/blah.csv", header=[0, 1, 2, 3], index_col=0

@@ -2105,6 +2105,15 @@ def test_loc(data):
     df_equals(modin_result, pandas_result)
 
 
+# This tests the bug from https://github.com/modin-project/modin/issues/3736
+def test_loc_setting_categorical_series():
+    modin_series = pd.Series(["a", "b", "c"], dtype="category")
+    pandas_series = pandas.Series(["a", "b", "c"], dtype="category")
+    modin_series.loc[1:3] = "a"
+    pandas_series.loc[1:3] = "a"
+    df_equals(modin_series, pandas_series)
+
+
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_lt(data):
     modin_series, pandas_series = create_test_series(data)