Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEP: Enforce deprecation of mangle_dupe_cols and convert_float in read_excel #49089

Merged
merged 11 commits into from
Oct 21, 2022
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ Removal of prior version deprecations/changes
- Remove arguments ``names`` and ``dtype`` from :meth:`Index.copy` and ``levels`` and ``codes`` from :meth:`MultiIndex.copy` (:issue:`35853`, :issue:`36685`)
- Removed argument ``try_cast`` from :meth:`DataFrame.mask`, :meth:`DataFrame.where`, :meth:`Series.mask` and :meth:`Series.where` (:issue:`38836`)
- Disallow passing non-round floats to :class:`Timestamp` with ``unit="M"`` or ``unit="Y"`` (:issue:`47266`)
- Remove keywords ``convert_float`` and ``mangle_dupe_cols`` from :func:`read_excel` (:issue:`41176`)
- Disallow passing non-keyword arguments to :func:`read_excel` except ``io`` and ``sheet_name`` (:issue:`34418`)
- Removed the ``numeric_only`` keyword from :meth:`Categorical.min` and :meth:`Categorical.max` in favor of ``skipna`` (:issue:`48821`)
- Removed :func:`is_extension_type` in favor of :func:`is_extension_array_dtype` (:issue:`29457`)
- Remove :meth:`DataFrameGroupBy.pad` and :meth:`DataFrameGroupBy.backfill` (:issue:`45076`)
Expand Down
52 changes: 5 additions & 47 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,6 @@
from pandas.errors import EmptyDataError
from pandas.util._decorators import (
Appender,
deprecate_kwarg,
deprecate_nonkeyword_arguments,
doc,
)
from pandas.util._exceptions import find_stack_level
Expand Down Expand Up @@ -269,23 +267,6 @@
comment string and the end of the current line is ignored.
skipfooter : int, default 0
Rows at the end to skip (0-indexed).
convert_float : bool, default True
Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
data will be read in as floats: Excel stores all numbers as floats
internally.

.. deprecated:: 1.3.0
convert_float will be removed in a future version

mangle_dupe_cols : bool, default True
Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
'X'...'X'. Passing in False will cause data to be overwritten if there
are duplicate names in the columns.

.. deprecated:: 1.5.0
Not implemented, and a new argument to specify the pattern for the
names of duplicated columns will be added instead

{storage_options}

.. versionadded:: 1.2.0
Expand Down Expand Up @@ -365,6 +346,7 @@ def read_excel(
io,
# sheet name is str or int -> DataFrame
sheet_name: str | int = ...,
*,
WillAyd marked this conversation as resolved.
Show resolved Hide resolved
header: int | Sequence[int] | None = ...,
names: list[str] | None = ...,
index_col: int | Sequence[int] | None = ...,
Expand Down Expand Up @@ -392,8 +374,6 @@ def read_excel(
decimal: str = ...,
comment: str | None = ...,
skipfooter: int = ...,
convert_float: bool | None = ...,
mangle_dupe_cols: bool = ...,
storage_options: StorageOptions = ...,
) -> DataFrame:
...
Expand All @@ -404,6 +384,7 @@ def read_excel(
io,
# sheet name is list or None -> dict[IntStrT, DataFrame]
sheet_name: list[IntStrT] | None,
*,
header: int | Sequence[int] | None = ...,
names: list[str] | None = ...,
index_col: int | Sequence[int] | None = ...,
Expand Down Expand Up @@ -431,20 +412,17 @@ def read_excel(
decimal: str = ...,
comment: str | None = ...,
skipfooter: int = ...,
convert_float: bool | None = ...,
mangle_dupe_cols: bool = ...,
storage_options: StorageOptions = ...,
) -> dict[IntStrT, DataFrame]:
...


@doc(storage_options=_shared_docs["storage_options"])
@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None)
@deprecate_nonkeyword_arguments(allowed_args=["io", "sheet_name"], version="2.0")
@Appender(_read_excel_doc)
def read_excel(
io,
sheet_name: str | int | list[IntStrT] | None = 0,
*,
header: int | Sequence[int] | None = 0,
names: list[str] | None = None,
index_col: int | Sequence[int] | None = None,
Expand Down Expand Up @@ -472,8 +450,6 @@ def read_excel(
decimal: str = ".",
comment: str | None = None,
skipfooter: int = 0,
convert_float: bool | None = None,
mangle_dupe_cols: bool = True,
storage_options: StorageOptions = None,
) -> DataFrame | dict[IntStrT, DataFrame]:

Expand Down Expand Up @@ -511,8 +487,6 @@ def read_excel(
decimal=decimal,
comment=comment,
skipfooter=skipfooter,
convert_float=convert_float,
mangle_dupe_cols=mangle_dupe_cols,
)
finally:
# make sure to close opened file handles
Expand Down Expand Up @@ -588,7 +562,7 @@ def get_sheet_by_index(self, index: int):
pass

@abc.abstractmethod
def get_sheet_data(self, sheet, convert_float: bool, rows: int | None = None):
def get_sheet_data(self, sheet, rows: int | None = None):
pass

def raise_if_bad_sheet_by_index(self, index: int) -> None:
Expand Down Expand Up @@ -716,20 +690,9 @@ def parse(
decimal: str = ".",
comment: str | None = None,
skipfooter: int = 0,
convert_float: bool | None = None,
mangle_dupe_cols: bool = True,
**kwds,
):

if convert_float is None:
convert_float = True
else:
warnings.warn(
"convert_float is deprecated and will be removed in a future version.",
FutureWarning,
stacklevel=find_stack_level(),
)

validate_header_arg(header)
validate_integer("nrows", nrows)

Expand Down Expand Up @@ -763,7 +726,7 @@ def parse(
sheet = self.get_sheet_by_index(asheetname)

file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
data = self.get_sheet_data(sheet, convert_float, file_rows_needed)
data = self.get_sheet_data(sheet, file_rows_needed)
if hasattr(sheet, "close"):
# pyxlsb opens two TemporaryFiles
sheet.close()
Expand Down Expand Up @@ -885,7 +848,6 @@ def parse(
comment=comment,
skipfooter=skipfooter,
usecols=usecols,
mangle_dupe_cols=mangle_dupe_cols,
**kwds,
)

Expand Down Expand Up @@ -1718,8 +1680,6 @@ def parse(
thousands: str | None = None,
comment: str | None = None,
skipfooter: int = 0,
convert_float: bool | None = None,
mangle_dupe_cols: bool = True,
**kwds,
) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]:
"""
Expand Down Expand Up @@ -1751,8 +1711,6 @@ def parse(
thousands=thousands,
comment=comment,
skipfooter=skipfooter,
convert_float=convert_float,
mangle_dupe_cols=mangle_dupe_cols,
**kwds,
)

Expand Down
13 changes: 6 additions & 7 deletions pandas/io/excel/_odfreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def get_sheet_by_name(self, name: str):
raise ValueError(f"sheet {name} not found")

def get_sheet_data(
self, sheet, convert_float: bool, file_rows_needed: int | None = None
self, sheet, file_rows_needed: int | None = None
) -> list[list[Scalar | NaTType]]:
"""
Parse an ODF Table into a list of lists
Expand Down Expand Up @@ -122,7 +122,7 @@ def get_sheet_data(

for sheet_cell in sheet_cells:
if sheet_cell.qname == table_cell_name:
value = self._get_cell_value(sheet_cell, convert_float)
value = self._get_cell_value(sheet_cell)
else:
value = self.empty_value

Expand Down Expand Up @@ -183,7 +183,7 @@ def _is_empty_row(self, row) -> bool:

return True

def _get_cell_value(self, cell, convert_float: bool) -> Scalar | NaTType:
def _get_cell_value(self, cell) -> Scalar | NaTType:
from odf.namespaces import OFFICENS

if str(cell) == "#N/A":
Expand All @@ -199,10 +199,9 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar | NaTType:
elif cell_type == "float":
# GH5394
cell_value = float(cell.attributes.get((OFFICENS, "value")))
if convert_float:
val = int(cell_value)
if val == cell_value:
return val
val = int(cell_value)
if val == cell_value:
return val
return cell_value
elif cell_type == "percentage":
cell_value = cell.attributes.get((OFFICENS, "value"))
Expand Down
17 changes: 7 additions & 10 deletions pandas/io/excel/_openpyxl.py
Original file line number Diff line number Diff line change
Expand Up @@ -581,7 +581,7 @@ def get_sheet_by_index(self, index: int):
self.raise_if_bad_sheet_by_index(index)
return self.book.worksheets[index]

def _convert_cell(self, cell, convert_float: bool) -> Scalar:
def _convert_cell(self, cell) -> Scalar:

from openpyxl.cell.cell import (
TYPE_ERROR,
Expand All @@ -593,18 +593,15 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
elif cell.data_type == TYPE_ERROR:
return np.nan
elif cell.data_type == TYPE_NUMERIC:
# GH5394, GH46988
if convert_float:
val = int(cell.value)
if val == cell.value:
return val
else:
return float(cell.value)
val = int(cell.value)
if val == cell.value:
return val
return float(cell.value)

return cell.value

def get_sheet_data(
self, sheet, convert_float: bool, file_rows_needed: int | None = None
self, sheet, file_rows_needed: int | None = None
) -> list[list[Scalar]]:

if self.book.read_only:
Expand All @@ -613,7 +610,7 @@ def get_sheet_data(
data: list[list[Scalar]] = []
last_row_with_data = -1
for row_number, row in enumerate(sheet.rows):
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
converted_row = [self._convert_cell(cell) for cell in row]
while converted_row and converted_row[-1] == "":
# trim trailing empty elements
converted_row.pop()
Expand Down
7 changes: 3 additions & 4 deletions pandas/io/excel/_pyxlsb.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,12 @@ def get_sheet_by_index(self, index: int):
# There's a fix for this in the source, but the pypi package doesn't have it
return self.book.get_sheet(index + 1)

def _convert_cell(self, cell, convert_float: bool) -> Scalar:
def _convert_cell(self, cell) -> Scalar:
# TODO: there is no way to distinguish between floats and datetimes in pyxlsb
# This means that there is no way to read datetime types from an xlsb file yet
if cell.v is None:
return "" # Prevents non-named columns from not showing up as Unnamed: i
if isinstance(cell.v, float) and convert_float:
if isinstance(cell.v, float):
val = int(cell.v)
if val == cell.v:
return val
Expand All @@ -82,7 +82,6 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
def get_sheet_data(
self,
sheet,
convert_float: bool,
file_rows_needed: int | None = None,
) -> list[list[Scalar]]:
data: list[list[Scalar]] = []
Expand All @@ -91,7 +90,7 @@ def get_sheet_data(
# not returned. The cells are namedtuples of row, col, value (r, c, v).
for row in sheet.rows(sparse=True):
row_number = row[0].r
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
converted_row = [self._convert_cell(cell) for cell in row]
while converted_row and converted_row[-1] == "":
# trim trailing empty elements
converted_row.pop()
Expand Down
4 changes: 2 additions & 2 deletions pandas/io/excel/_xlrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def get_sheet_by_index(self, index):
return self.book.sheet_by_index(index)

def get_sheet_data(
self, sheet, convert_float: bool, file_rows_needed: int | None = None
self, sheet, file_rows_needed: int | None = None
) -> list[list[Scalar]]:
from xlrd import (
XL_CELL_BOOLEAN,
Expand Down Expand Up @@ -104,7 +104,7 @@ def _parse_cell(cell_contents, cell_typ):
cell_contents = np.nan
elif cell_typ == XL_CELL_BOOLEAN:
cell_contents = bool(cell_contents)
elif convert_float and cell_typ == XL_CELL_NUMBER:
elif cell_typ == XL_CELL_NUMBER:
# GH5394 - Excel 'numbers' are always floats
# it's a minimal perf hit and less surprising
val = int(cell_contents)
Expand Down
42 changes: 4 additions & 38 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,6 @@ def test_reader_special_dtypes(self, request, read_ext):
"FloatCol": [1.25, 2.25, 1.83, 1.92, 0.0000000005],
"BoolCol": [True, False, True, True, False],
"StrCol": [1, 2, 3, 4, 5],
# GH5394 - this is why convert_float isn't vectorized
"Str2Col": ["a", 3, "c", "d", "e"],
"DateCol": [
datetime(2013, 10, 30),
Expand All @@ -424,19 +423,11 @@ def test_reader_special_dtypes(self, request, read_ext):

# if not coercing number, then int comes in as float
float_expected = expected.copy()
float_expected["IntCol"] = float_expected["IntCol"].astype(float)
float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0
with tm.assert_produces_warning(
FutureWarning,
match="convert_float is deprecated",
raise_on_extra_warnings=False,
):
# raise_on_extra_warnings because xlrd raises a PendingDeprecationWarning
# on database job Linux_py37_IO (ci/deps/actions-37-db.yaml)
# See GH#41176
actual = pd.read_excel(
basename + read_ext, sheet_name="Sheet1", convert_float=False
)
# raise_on_extra_warnings because xlrd raises a PendingDeprecationWarning
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tangential to this PR but assuming we ultimately will be removing xlrd as well

#35029

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep but did not want to mix too much stuff

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like the raise_on_extra_warnings comment isn't needed anymore

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removed

# on database job Linux_py37_IO (ci/deps/actions-37-db.yaml)
# See GH#41176
actual = pd.read_excel(basename + read_ext, sheet_name="Sheet1")
tm.assert_frame_equal(actual, float_expected)

# check setting Index (assuming xls and xlsx are the same here)
Expand All @@ -447,31 +438,12 @@ def test_reader_special_dtypes(self, request, read_ext):
exp = expected.set_index(name)
tm.assert_frame_equal(actual, exp)

# convert_float and converters should be different but both accepted
expected["StrCol"] = expected["StrCol"].apply(str)
actual = pd.read_excel(
basename + read_ext, sheet_name="Sheet1", converters={"StrCol": str}
)
tm.assert_frame_equal(actual, expected)

no_convert_float = float_expected.copy()
no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str)
with tm.assert_produces_warning(
FutureWarning,
match="convert_float is deprecated",
raise_on_extra_warnings=False,
):
# raise_on_extra_warnings because xlrd raises a PendingDeprecationWarning
# on database job Linux_py37_IO (ci/deps/actions-37-db.yaml)
# See GH#41176
actual = pd.read_excel(
basename + read_ext,
sheet_name="Sheet1",
convert_float=False,
converters={"StrCol": str},
)
tm.assert_frame_equal(actual, no_convert_float)

# GH8212 - support for converters and missing values
def test_reader_converters(self, read_ext):

Expand Down Expand Up @@ -1275,12 +1247,6 @@ def test_read_excel_squeeze(self, read_ext):
expected = Series([1, 2, 3], name="a")
tm.assert_series_equal(actual, expected)

def test_deprecated_kwargs(self, read_ext):
with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you turn this test into checking that non-keyword arguments here raise?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice idea - wondering if there's a comprehensive way we should be testing for that (maybe part of test_api) as we do this with more functions

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice idea - wondering if there's a comprehensive way we should be testing for that (maybe part of test_api) as we do this with more functions

Yeah it would be cool to have a test that nails down signature definitions and even alignment between DataFrame/Series methods

pd.read_excel("test1" + read_ext, "Sheet1", 0)

pd.read_excel("test1" + read_ext)

def test_no_header_with_list_index_col(self, read_ext):
# GH 31783
file_name = "testmultiindex" + read_ext
Expand Down
Loading