Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX-#3736: Don't use a 2-d array to set a categorical column. #3777

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions modin/core/dataframe/pandas/partitioning/partition_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -1201,11 +1201,23 @@ def compute_part_size(indexer, remote_part, part_idx, axis):
col_internal_idx, remote_part, col_idx, axis=1
)

# We want to eventually make item_to_distribute an np.ndarray,
# but that doesn't work for setting a subset of a categorical
# column, as per https://github.com/modin-project/modin/issues/3736.
# In that case, `item` is not an ndarray but instead some
# categorical variable, which we we don't need to distribute
# at all. Note that np.ndarray is not hashable, so it can't
# be a categorical variable.
# TODO(https://github.com/pandas-dev/pandas/issues/44703): Delete
# this special case once the pandas bug is fixed.
if item_to_distribute is not None:
item = item_to_distribute[
row_position_counter : row_position_counter + row_offset,
col_position_counter : col_position_counter + col_offset,
]
if isinstance(item_to_distribute, np.ndarray):
item = item_to_distribute[
row_position_counter : row_position_counter + row_offset,
col_position_counter : col_position_counter + col_offset,
]
else:
item = item_to_distribute
item = {"item": item}
else:
item = {}
Expand Down
70 changes: 27 additions & 43 deletions modin/experimental/xgboost/test/test_dmatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@

import numpy as np
import pytest
import pandas
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
import xgboost as xgb

import modin.pandas as pd
Expand Down Expand Up @@ -74,67 +76,49 @@ def test_dmatrix_feature_names_and_feature_types(data, feature_names, feature_ty
check_dmatrix(data, feature_names=feature_names, feature_types=feature_types)


@pytest.mark.parametrize(
"feature_names",
[
["Feature1", "Feature2", "Feature3", "Feature4", "Feature5"],
[u"??1", u"??2", u"??3", u"??4", u"??5"],
],
)
def test_feature_names(feature_names):
data = np.random.randn(100, 5)
label = np.array([0, 1] * 50)
def test_feature_names():
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mvashishtha, I wonder why this PR contains changes in this file, though it shouldn't, because the changes were preliminary merged into master as part of another PR.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I don't understand. Git blame correctly points to #3770 for these lines.

dataset = load_breast_cancer()
X = dataset.data
y = dataset.target
feature_names = [f"feat{i}" for i in range(X.shape[1])]

check_dmatrix(
data,
label,
X,
y,
feature_names=feature_names,
)

dm = xgb.DMatrix(data, label=label, feature_names=feature_names)
md_dm = mxgb.DMatrix(
pd.DataFrame(data), label=pd.Series(label), feature_names=feature_names
dmatrix = xgb.DMatrix(X, label=y, feature_names=feature_names)
md_dmatrix = mxgb.DMatrix(
pd.DataFrame(X), label=pd.Series(y), feature_names=feature_names
)

params = {
"objective": "multi:softprob",
"objective": "binary:logistic",
"eval_metric": "mlogloss",
"eta": 0.3,
"num_class": 3,
}

bst = xgb.train(params, dm, num_boost_round=10)
md_bst = mxgb.train(params, md_dm, num_boost_round=10)
booster = xgb.train(params, dmatrix, num_boost_round=10)
md_booster = mxgb.train(params, md_dmatrix, num_boost_round=10)

predictions = bst.predict(dm)
modin_predictions = md_bst.predict(md_dm)
predictions = booster.predict(dmatrix)
modin_predictions = md_booster.predict(md_dmatrix)

preds = np.asarray([np.argmax(line) for line in predictions])
md_preds = np.asarray([np.argmax(line) for line in modin_predictions.to_numpy()])
preds = pandas.DataFrame(predictions).apply(np.round, axis=0)
modin_preds = modin_predictions.apply(np.round, axis=0)

val = accuracy_score(label, preds)
md_val = accuracy_score(label, md_preds)
accuracy = accuracy_score(y, preds)
md_accuracy = accuracy_score(y, modin_preds)

np.testing.assert_allclose(val, md_val, atol=0.02, rtol=0.01)
np.testing.assert_allclose(accuracy, md_accuracy, atol=0.005, rtol=0.002)

dummy = np.random.randn(5, 5)
dm = xgb.DMatrix(dummy, feature_names=feature_names)
md_dm = mxgb.DMatrix(pd.DataFrame(dummy), feature_names=feature_names)
predictions = bst.predict(dm)
modin_predictions = md_bst.predict(md_dm)

preds = np.asarray([np.argmax(line) for line in predictions])
md_preds = np.asarray([np.argmax(line) for line in modin_predictions.to_numpy()])

assert preds.all() == md_preds.all()

# different feature names must raises error
dm = xgb.DMatrix(dummy, feature_names=list("abcde"))
md_dm = mxgb.DMatrix(pd.DataFrame(dummy), feature_names=list("abcde"))
# Different feature_names (default) must raise error in this case
dm = xgb.DMatrix(X)
md_dm = mxgb.DMatrix(pd.DataFrame(X))
with pytest.raises(ValueError):
bst.predict(dm)
booster.predict(dm)
with pytest.raises(ValueError):
md_bst.predict(md_dm)
repr(md_booster.predict(md_dm))


def test_feature_weights():
Expand Down
26 changes: 25 additions & 1 deletion modin/pandas/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,31 @@ def __setitem__(self, row_lookup, col_lookup, item, axis=None):
else:
new_col_len = len(col_lookup)
to_shape = new_row_len, new_col_len
item = self._broadcast_item(row_lookup, col_lookup, item, to_shape)
# If we are asssigning to a single categorical column, we won't be
# able to set the column to a 2-d array due to a bug in Pandas.
# Until that bug is fixed, don't convert `item` to a 2-d array
# in that case.
# TODO(https://github.com/pandas-dev/pandas/issues/44703): Delete
# this special case once the pandas bug is fixed.
assigning_to_single_category_column = new_col_len == 1 and (
# Case 1: df = pd.DataFrame({"status": ["a", "b", "c"]},
# dtype="category")
# Then type(df.dtypes) is pandas.core.series.Series
(
isinstance(self.df.dtypes, pandas.core.series.Series)
and self.df.dtypes[col_lookup][0].name == "category"
)
# Case 2: df = pd.Series( ["a", "b", "c"], dtype="category")
# Then df.dtypes == CategoricalDtype(categories=['a', 'b', 'c'],
# ordered=False)
# and type(df.dtypes) is pandas.core.dtypes.dtypes.CategoricalDtype.
# Case 3: df = pd.Series([0, 1, 2], index=['a', 'b', 'c'])
# Then df.dtypes == dtype('int64') and type(df.dtypes) is
# numpy.dtype
or (getattr(self.df.dtypes, "name", "") == "category")
)
if not assigning_to_single_category_column:
item = self._broadcast_item(row_lookup, col_lookup, item, to_shape)
self._write_items(row_lookup, col_lookup, item)

def _broadcast_item(self, row_lookup, col_lookup, item, to_shape):
Expand Down
9 changes: 9 additions & 0 deletions modin/pandas/test/dataframe/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,15 @@ def test_loc(data):
modin_df.loc["NO_EXIST"]


# This tests the bug from https://github.com/modin-project/modin/issues/3736
def test_loc_setting_single_categorical_column():
modin_df = pd.DataFrame({"status": ["a", "b", "c"]}, dtype="category")
pandas_df = pandas.DataFrame({"status": ["a", "b", "c"]}, dtype="category")
modin_df.loc[1:3, "status"] = "a"
pandas_df.loc[1:3, "status"] = "a"
df_equals(modin_df, pandas_df)


def test_loc_multi_index():
modin_df = pd.read_csv(
"modin/pandas/test/data/blah.csv", header=[0, 1, 2, 3], index_col=0
Expand Down
9 changes: 9 additions & 0 deletions modin/pandas/test/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2105,6 +2105,15 @@ def test_loc(data):
df_equals(modin_result, pandas_result)


# This tests the bug from https://github.com/modin-project/modin/issues/3736
def test_loc_setting_categorical_series():
modin_series = pd.Series(["a", "b", "c"], dtype="category")
pandas_series = pandas.Series(["a", "b", "c"], dtype="category")
modin_series.loc[1:3] = "a"
pandas_series.loc[1:3] = "a"
df_equals(modin_series, pandas_series)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_lt(data):
modin_series, pandas_series = create_test_series(data)
Expand Down