From 64785659573e9b8fbbc86c094839224153fa23a0 Mon Sep 17 00:00:00 2001
From: mvashishtha <mahesh@ponder.io>
Date: Fri, 3 Dec 2021 11:29:47 -0800
Subject: [PATCH 1/3] FIX-#3736: Don't use a 2-d array to set a categorical
 column.

Signed-off-by: mvashishtha <mahesh@ponder.io>
---
 .../pandas/partitioning/partition_manager.py  | 20 ++++++++++---
 modin/pandas/indexing.py                      | 29 ++++++++++++++++++-
 modin/pandas/test/dataframe/test_indexing.py  |  9 ++++++
 modin/pandas/test/test_series.py              |  9 ++++++
 4 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/modin/core/dataframe/pandas/partitioning/partition_manager.py b/modin/core/dataframe/pandas/partitioning/partition_manager.py
index 8aa4fc69600..92089d6f8b5 100644
--- a/modin/core/dataframe/pandas/partitioning/partition_manager.py
+++ b/modin/core/dataframe/pandas/partitioning/partition_manager.py
@@ -1201,11 +1201,23 @@ def compute_part_size(indexer, remote_part, part_idx, axis):
                     col_internal_idx, remote_part, col_idx, axis=1
                 )
 
+                # We want to eventually make item_to_distribute an np.ndarray,
+                # but that doesn't work for setting a subset of a categorical
+                # column, as per https://github.com/modin-project/modin/issues/3736.
+                # In that case, `item` is not an ndarray but instead some
+                # categorical variable, which we we don't need to distribute
+                # at all. Note that np.ndarray is not hashable, so it can't
+                # be a categorical variable.
+                # TODO(https://github.com/pandas-dev/pandas/issues/44703): Delete
+                # this special case once the pandas bug is fixed.
                 if item_to_distribute is not None:
-                    item = item_to_distribute[
-                        row_position_counter : row_position_counter + row_offset,
-                        col_position_counter : col_position_counter + col_offset,
-                    ]
+                    if isinstance(item_to_distribute, np.ndarray):
+                        item = item_to_distribute[
+                            row_position_counter : row_position_counter + row_offset,
+                            col_position_counter : col_position_counter + col_offset,
+                        ]
+                    else:
+                        item = item_to_distribute
                     item = {"item": item}
                 else:
                     item = {}
diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py
index d1db847213e..02d57b622ad 100644
--- a/modin/pandas/indexing.py
+++ b/modin/pandas/indexing.py
@@ -385,7 +385,34 @@ def __setitem__(self, row_lookup, col_lookup, item, axis=None):
             else:
                 new_col_len = len(col_lookup)
             to_shape = new_row_len, new_col_len
-            item = self._broadcast_item(row_lookup, col_lookup, item, to_shape)
+            # If we are asssigning to a single categorical column, we won't be
+            # able to set the column to a 2-d array due to a bug in Pandas.
+            # Until that bug is fixed, don't convert `item` to a 2-d array
+            # in that case.
+            # TODO(https://github.com/pandas-dev/pandas/issues/44703): Delete
+            # this special case once the pandas bug is fixed.
+            assigning_to_single_category_column = new_col_len == 1 and (
+                # Case 1: df = pd.DataFrame({"status": ["a", "b", "c"]},
+                #                           dtype="category")
+                # Then type(df.dtypes) is pandas.core.series.Series
+                (
+                    type(self.df.dtypes) == pandas.core.series.Series
+                    and self.df.dtypes[col_lookup][0].name == "category"
+                )
+                # Case 2: df = pd.Series( ["a", "b", "c"],  dtype="category")
+                # Then df.dtypes ==  CategoricalDtype(categories=['a', 'b', 'c'],
+                #                                     ordered=False)
+                # and type(df.dtypes) is pandas.core.dtypes.dtypes.CategoricalDtype.
+                # Case 3:  df = pd.Series([0, 1, 2], index=['a', 'b', 'c'])
+                # Then df.dtypes == dtype('int64') and type(df.dtypes) is
+                # numpy.dtype
+                or (
+                    hasattr(type(self.df.dtypes), "name")
+                    and self.df.dtypes.name == "category"
+                )
+            )
+            if not assigning_to_single_category_column:
+                item = self._broadcast_item(row_lookup, col_lookup, item, to_shape)
             self._write_items(row_lookup, col_lookup, item)
 
     def _broadcast_item(self, row_lookup, col_lookup, item, to_shape):
diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py
index e7998c9cd84..818d1980e23 100644
--- a/modin/pandas/test/dataframe/test_indexing.py
+++ b/modin/pandas/test/dataframe/test_indexing.py
@@ -372,6 +372,15 @@ def test_loc(data):
         modin_df.loc["NO_EXIST"]
 
 
+# This tests the bug from https://github.com/modin-project/modin/issues/3736
+def test_loc_setting_single_categorical_column():
+    modin_df = pd.DataFrame({"status": ["a", "b", "c"]}, dtype="category")
+    pandas_df = pandas.DataFrame({"status": ["a", "b", "c"]}, dtype="category")
+    modin_df.loc[1:3, "status"] = "a"
+    pandas_df.loc[1:3, "status"] = "a"
+    df_equals(modin_df, pandas_df)
+
+
 def test_loc_multi_index():
     modin_df = pd.read_csv(
         "modin/pandas/test/data/blah.csv", header=[0, 1, 2, 3], index_col=0
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
index 887e4c6a8d1..3c03ca243b3 100644
--- a/modin/pandas/test/test_series.py
+++ b/modin/pandas/test/test_series.py
@@ -2105,6 +2105,15 @@ def test_loc(data):
     df_equals(modin_result, pandas_result)
 
 
+# This tests the bug from https://github.com/modin-project/modin/issues/3736
+def test_loc_setting_categorical_series():
+    modin_series = pd.Series(["a", "b", "c"], dtype="category")
+    pandas_series = pandas.Series(["a", "b", "c"], dtype="category")
+    modin_series.loc[1:3] = "a"
+    pandas_series.loc[1:3] = "a"
+    df_equals(modin_series, pandas_series)
+
+
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_lt(data):
     modin_series, pandas_series = create_test_series(data)

From ce99a39c8ef224611f9dc31315d0c03e6e5c1437 Mon Sep 17 00:00:00 2001
From: Mahesh Vashishtha <mvashishtha@users.noreply.github.com>
Date: Mon, 6 Dec 2021 10:49:55 -0800
Subject: [PATCH 2/3] Apply suggestions from code review

Co-authored-by: Vasily Litvinov <vasilij.n.litvinov@intel.com>
---
 modin/pandas/indexing.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py
index 02d57b622ad..a53a0a4b38d 100644
--- a/modin/pandas/indexing.py
+++ b/modin/pandas/indexing.py
@@ -396,7 +396,7 @@ def __setitem__(self, row_lookup, col_lookup, item, axis=None):
                 #                           dtype="category")
                 # Then type(df.dtypes) is pandas.core.series.Series
                 (
-                    type(self.df.dtypes) == pandas.core.series.Series
+                    isinstance(self.df.dtypes, pandas.core.series.Series)
                     and self.df.dtypes[col_lookup][0].name == "category"
                 )
                 # Case 2: df = pd.Series( ["a", "b", "c"],  dtype="category")
@@ -407,8 +407,7 @@ def __setitem__(self, row_lookup, col_lookup, item, axis=None):
                 # Then df.dtypes == dtype('int64') and type(df.dtypes) is
                 # numpy.dtype
                 or (
-                    hasattr(type(self.df.dtypes), "name")
-                    and self.df.dtypes.name == "category"
+                    getattr(self.df.dtypes, "name", "") == "category"
                 )
             )
             if not assigning_to_single_category_column:

From 05a4a79c030ce6fb74a580b43645e009ab4cd861 Mon Sep 17 00:00:00 2001
From: mvashishtha <mahesh@ponder.io>
Date: Mon, 6 Dec 2021 10:52:06 -0800
Subject: [PATCH 3/3] Fix a format error with black.

Signed-off-by: mvashishtha <mahesh@ponder.io>
---
 .../experimental/xgboost/test/test_dmatrix.py | 70 +++++++------------
 modin/pandas/indexing.py                      |  4 +-
 2 files changed, 28 insertions(+), 46 deletions(-)

diff --git a/modin/experimental/xgboost/test/test_dmatrix.py b/modin/experimental/xgboost/test/test_dmatrix.py
index c9498131ef4..b28f01a2a81 100644
--- a/modin/experimental/xgboost/test/test_dmatrix.py
+++ b/modin/experimental/xgboost/test/test_dmatrix.py
@@ -13,7 +13,9 @@
 
 import numpy as np
 import pytest
+import pandas
 from sklearn.metrics import accuracy_score
+from sklearn.datasets import load_breast_cancer
 import xgboost as xgb
 
 import modin.pandas as pd
@@ -74,67 +76,49 @@ def test_dmatrix_feature_names_and_feature_types(data, feature_names, feature_ty
     check_dmatrix(data, feature_names=feature_names, feature_types=feature_types)
 
 
-@pytest.mark.parametrize(
-    "feature_names",
-    [
-        ["Feature1", "Feature2", "Feature3", "Feature4", "Feature5"],
-        [u"??1", u"??2", u"??3", u"??4", u"??5"],
-    ],
-)
-def test_feature_names(feature_names):
-    data = np.random.randn(100, 5)
-    label = np.array([0, 1] * 50)
+def test_feature_names():
+    dataset = load_breast_cancer()
+    X = dataset.data
+    y = dataset.target
+    feature_names = [f"feat{i}" for i in range(X.shape[1])]
 
     check_dmatrix(
-        data,
-        label,
+        X,
+        y,
         feature_names=feature_names,
     )
 
-    dm = xgb.DMatrix(data, label=label, feature_names=feature_names)
-    md_dm = mxgb.DMatrix(
-        pd.DataFrame(data), label=pd.Series(label), feature_names=feature_names
+    dmatrix = xgb.DMatrix(X, label=y, feature_names=feature_names)
+    md_dmatrix = mxgb.DMatrix(
+        pd.DataFrame(X), label=pd.Series(y), feature_names=feature_names
     )
 
     params = {
-        "objective": "multi:softprob",
+        "objective": "binary:logistic",
         "eval_metric": "mlogloss",
-        "eta": 0.3,
-        "num_class": 3,
     }
 
-    bst = xgb.train(params, dm, num_boost_round=10)
-    md_bst = mxgb.train(params, md_dm, num_boost_round=10)
+    booster = xgb.train(params, dmatrix, num_boost_round=10)
+    md_booster = mxgb.train(params, md_dmatrix, num_boost_round=10)
 
-    predictions = bst.predict(dm)
-    modin_predictions = md_bst.predict(md_dm)
+    predictions = booster.predict(dmatrix)
+    modin_predictions = md_booster.predict(md_dmatrix)
 
-    preds = np.asarray([np.argmax(line) for line in predictions])
-    md_preds = np.asarray([np.argmax(line) for line in modin_predictions.to_numpy()])
+    preds = pandas.DataFrame(predictions).apply(np.round, axis=0)
+    modin_preds = modin_predictions.apply(np.round, axis=0)
 
-    val = accuracy_score(label, preds)
-    md_val = accuracy_score(label, md_preds)
+    accuracy = accuracy_score(y, preds)
+    md_accuracy = accuracy_score(y, modin_preds)
 
-    np.testing.assert_allclose(val, md_val, atol=0.02, rtol=0.01)
+    np.testing.assert_allclose(accuracy, md_accuracy, atol=0.005, rtol=0.002)
 
-    dummy = np.random.randn(5, 5)
-    dm = xgb.DMatrix(dummy, feature_names=feature_names)
-    md_dm = mxgb.DMatrix(pd.DataFrame(dummy), feature_names=feature_names)
-    predictions = bst.predict(dm)
-    modin_predictions = md_bst.predict(md_dm)
-
-    preds = np.asarray([np.argmax(line) for line in predictions])
-    md_preds = np.asarray([np.argmax(line) for line in modin_predictions.to_numpy()])
-
-    assert preds.all() == md_preds.all()
-
-    # different feature names must raises error
-    dm = xgb.DMatrix(dummy, feature_names=list("abcde"))
-    md_dm = mxgb.DMatrix(pd.DataFrame(dummy), feature_names=list("abcde"))
+    # Different feature_names (default) must raise error in this case
+    dm = xgb.DMatrix(X)
+    md_dm = mxgb.DMatrix(pd.DataFrame(X))
     with pytest.raises(ValueError):
-        bst.predict(dm)
+        booster.predict(dm)
     with pytest.raises(ValueError):
-        md_bst.predict(md_dm)
+        repr(md_booster.predict(md_dm))
 
 
 def test_feature_weights():
diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py
index a53a0a4b38d..218675f291d 100644
--- a/modin/pandas/indexing.py
+++ b/modin/pandas/indexing.py
@@ -406,9 +406,7 @@ def __setitem__(self, row_lookup, col_lookup, item, axis=None):
                 # Case 3:  df = pd.Series([0, 1, 2], index=['a', 'b', 'c'])
                 # Then df.dtypes == dtype('int64') and type(df.dtypes) is
                 # numpy.dtype
-                or (
-                    getattr(self.df.dtypes, "name", "") == "category"
-                )
+                or (getattr(self.df.dtypes, "name", "") == "category")
             )
             if not assigning_to_single_category_column:
                 item = self._broadcast_item(row_lookup, col_lookup, item, to_shape)