Initial draft: from_dummies #41902
@@ -1053,6 +1053,168 @@ def get_empty_frame(data) -> DataFrame:
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)


def from_dummies(
    data,
    to_series: bool = False,
    variables: None | str | list[str] | dict[str, str] = None,
    prefix_sep: str | list[str] | dict[str, str] = "_",
    dummy_na: bool = False,
    columns: None | list[str] = None,
    dropped_first: None | str | list[str] | dict[str, str] = None,
) -> Series | DataFrame:
Review comment: let's just always return a DataFrame, much simpler

Reply: Good idea, and in line with this perspective as the …
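A caller-side note on the suggestion above: if from_dummies always returns a DataFrame, a user who wants a Series can still collapse a single-column result themselves. A minimal sketch (variable names are illustrative, not from the PR):

decoded = from_dummies(dummies)      # always a DataFrame under the suggestion
as_series = decoded.squeeze(axis=1)  # Series when exactly one column remains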
    """
    soon
    """
    from pandas.core.reshape.concat import concat

    if to_series:
        return _from_dummies_1d(data, dummy_na, dropped_first)
    data_to_decode: DataFrame
    if columns is None:
        # index data with a list of all columns that are dummies
        cat_columns = []
        non_cat_columns = []
        for col in data.columns:
            if any(ps in col for ps in prefix_sep):
                cat_columns.append(col)
            else:
                non_cat_columns.append(col)
        data_to_decode = data[cat_columns]
        non_cat_data = data[non_cat_columns]
    elif not is_list_like(columns):
        raise TypeError("Input must be a list-like for parameter 'columns'")
    else:
        data_to_decode = data[columns]
        non_cat_data = data[[col for col in data.columns if col not in columns]]
    # get separator for each prefix and lists to slice data for each prefix

Review comment: umm this is very complicated. what are you actually trying to do here?

Reply: I want to get all columns that correspond to a specific prefix such that I can extract the values for each block. I do this here to avoid deep nesting (and checking whether or not a column belongs to a prefix) later on, when the value for each entry is determined.
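To make the reply concrete, a minimal sketch of the prefix-to-columns mapping being built (toy column names, assumed for illustration only):

columns = ["col1_a", "col1_b", "col2_x"]
variables_slice: dict[str, list[str]] = {}
for col in columns:
    prefix = col.split("_")[0]  # everything before the separator
    variables_slice.setdefault(prefix, []).append(col)
# variables_slice == {"col1": ["col1_a", "col1_b"], "col2": ["col2_x"]}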
    if isinstance(prefix_sep, dict):
        variables_slice = {prefix: [] for prefix in prefix_sep}
        for col in data_to_decode.columns:
            for prefix in prefix_sep:
                if prefix in col:
                    variables_slice[prefix].append(col)
    else:
        sep_for_prefix = {}
        variables_slice = {}
Review comment: could remove

Reply: Awesome advice, thank you very much :)
        for col in data_to_decode.columns:
            ps = [ps for ps in prefix_sep if ps in col][0]
            prefix = col.split(ps)[0]
            if prefix not in sep_for_prefix:
                sep_for_prefix[prefix] = ps
            if prefix not in variables_slice:
                variables_slice[prefix] = [col]
            else:
                variables_slice[prefix].append(col)
        prefix_sep = sep_for_prefix
    # validate number of passed arguments
    def check_len(item, name) -> None:
        if not len(item) == len(variables_slice):
            len_msg = (
                f"Length of '{name}' ({len(item)}) did not match the "
                "length of the columns being encoded "
                f"({len(variables_slice)})."
            )
            raise ValueError(len_msg)
    # obtain prefix to category mapping
    variables: dict[str, str]
    if isinstance(variables, dict):
        check_len(variables, "variables")
        variables = variables
    elif is_list_like(variables):
        check_len(variables, "variables")
        variables = dict(zip(variables_slice, variables))
    elif isinstance(variables, str):
        variables = dict(
            zip(
                variables_slice,
                (f"{variables}{i}" for i in range(len(variables_slice))),
            )
        )
    else:
        variables = dict(zip(variables_slice, variables_slice))
    if dropped_first:
        if isinstance(dropped_first, dict):
            check_len(dropped_first, "dropped_first")
        elif is_list_like(dropped_first):
            check_len(dropped_first, "dropped_first")
            dropped_first = dict(zip(variables_slice, dropped_first))
        else:
            dropped_first = dict(
                zip(variables_slice, [dropped_first] * len(variables_slice))
            )
    cat_data = {var: [] for _, var in variables.items()}
    for index, row in data.iterrows():

Review comment: Iterating over rows in Python will be too slow - can you have a look at how the (now closed) PR did it?

Reply: Removed the row iteration. At the moment this resulted in a problem with NaN values in the output DF which I am currently looking into. I can mirror the method of the old PR if its method is more efficient (or if it provides an easy solution for the NaN issue).
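One hedged sketch of the vectorized direction the review points at, decoding a single prefix slice without a Python-level row loop (assumes a 0/1 block; idxmax picks the column holding the 1, and all-zero rows are masked to NaN afterwards — this is not the PR's code):

import pandas as pd

def decode_slice(block: pd.DataFrame, sep: str) -> pd.Series:
    # Label of the max column per row, then keep the part after the separator.
    categories = block.idxmax(axis=1).str.split(sep, n=1).str[1]
    # Rows with no dummy set (row sum 0) get NaN instead of idxmax's arbitrary pick.
    return categories.mask(block.sum(axis=1) == 0)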
        for prefix, prefix_slice in variables_slice.items():
            slice_sum = row[prefix_slice].sum()
            if slice_sum > 1:
                raise ValueError(
                    f"Dummy DataFrame contains multi-assignment(s) for prefix: "
                    f"'{prefix}' in row {index}."
                )
            elif slice_sum == 0:
                if dropped_first:
                    category = dropped_first[prefix]
                elif not dummy_na:
                    category = np.nan
                else:
                    raise ValueError(
                        f"Dummy DataFrame contains no assignment for prefix: "
                        f"'{prefix}' in row {index}."
                    )
            else:
                cat_index = row[prefix_slice].argmax()
                category = prefix_slice[cat_index].split(prefix_sep[prefix])[1]
                if dummy_na and category == "NaN":
                    category = np.nan

            cat_data[variables[prefix]].append(category)
    if columns:
        return DataFrame(cat_data)
    else:
        return concat([non_cat_data, DataFrame(cat_data)], axis=1)


def _from_dummies_1d(
    data,
    dummy_na: bool = False,
    dropped_first: None | str = None,
) -> Series:
    """
    soon
    """
    if dropped_first and not isinstance(dropped_first, str):
        raise ValueError("Only one dropped first value possible in 1D dummy DataFrame.")
    cat_data = []
    for index, row in data.iterrows():
        row_sum = row.sum()
        if row_sum > 1:
            raise ValueError(
                f"Dummy DataFrame contains multi-assignment in row {index}."
            )

Review comment: Couldn't you check this much earlier with a row sum after the conversion to boolean, e.g., if (data_to_decode.sum(1) > 1).any()?

Reply: Hmm, that only works if there are no prefixes/multiple variables as each prefix slice has to be checked individually and …
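The per-slice variant the reply alludes to could look roughly like this (a sketch reusing names from the draft above; not part of the PR):

for prefix, prefix_slice in variables_slice.items():
    if (data_to_decode[prefix_slice].sum(axis=1) > 1).any():
        raise ValueError(f"Multi-assignment within prefix '{prefix}'.")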
        elif row_sum == 0:
            if dropped_first:
                category = dropped_first
            elif not dummy_na:
                category = np.nan
            else:
                raise ValueError(
                    f"Dummy DataFrame contains no assignment in row {index}."
                )
        else:
            category = data.columns[row.argmax()]
            if dummy_na and category == "NaN":
                category = np.nan
        cat_data.append(category)
    return Series(cat_data)


def _reorder_for_extension_array_stack(
    arr: ExtensionArray, n_rows: int, n_columns: int
) -> ExtensionArray:
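For orientation, a hedged end-to-end sketch of the round trip this draft targets. get_dummies is released pandas; the from_dummies call is the draft API from this PR and is left commented out since it is not in any release at this point:

import pandas as pd

df = pd.DataFrame({"animal": ["cat", "dog", "cat"]})
dummies = pd.get_dummies(df)  # columns: animal_cat, animal_dog
# Proposed inverse from this draft (default prefix_sep="_"):
# roundtrip = from_dummies(dummies)  # expected: a single "animal" column again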
Review comment: we should consider moving get_dummies / from_dummies to a separate file (in /reshape), could be a precursor PR.

Reply: I like that idea to improve clarity. What would be an elegant and obvious name for a collection of "reshape operations that change the data representation" - maybe transform? Or would we rather collect more categorical/dummy specific operations instead? For me the first option seems more intuitive; I will think about a name - /reshape/transform.py could cause confusion with the .transform method.

Review comment: one_hot_encoding.py?

Review comment: or if it's supposed to be a dummy operations file: dummy_coding.py?