diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index e281e250d608e..26efc383f4314 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -180,6 +180,8 @@ Removal of prior version deprecations/changes - Removed argument ``try_cast`` from :meth:`DataFrame.mask`, :meth:`DataFrame.where`, :meth:`Series.mask` and :meth:`Series.where` (:issue:`38836`) - Removed argument ``is_copy`` from :meth:`DataFrame.take` and :meth:`Series.take` (:issue:`30615`) - Disallow passing non-round floats to :class:`Timestamp` with ``unit="M"`` or ``unit="Y"`` (:issue:`47266`) +- Remove keywords ``convert_float`` and ``mangle_dupe_cols`` from :func:`read_excel` (:issue:`41176`) +- Disallow passing non-keyword arguments to :func:`read_excel` except ``io`` and ``sheet_name`` (:issue:`34418`) - Removed :meth:`.Rolling.validate`, :meth:`.Expanding.validate`, and :meth:`.ExponentialMovingWindow.validate` (:issue:`43665`) - Removed :attr:`Rolling.win_type` returning ``"freq"`` (:issue:`38963`) - Removed :attr:`Rolling.is_datetimelike` (:issue:`38963`) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index f555e7c5f5d95..994887f487473 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -42,8 +42,6 @@ from pandas.errors import EmptyDataError from pandas.util._decorators import ( Appender, - deprecate_kwarg, - deprecate_nonkeyword_arguments, doc, ) from pandas.util._exceptions import find_stack_level @@ -269,23 +267,6 @@ comment string and the end of the current line is ignored. skipfooter : int, default 0 Rows at the end to skip (0-indexed). -convert_float : bool, default True - Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric - data will be read in as floats: Excel stores all numbers as floats - internally. - - .. deprecated:: 1.3.0 - convert_float will be removed in a future version - -mangle_dupe_cols : bool, default True - Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than - 'X'...'X'. Passing in False will cause data to be overwritten if there - are duplicate names in the columns. - - .. deprecated:: 1.5.0 - Not implemented, and a new argument to specify the pattern for the - names of duplicated columns will be added instead - {storage_options} .. versionadded:: 1.2.0 @@ -365,6 +346,7 @@ def read_excel( io, # sheet name is str or int -> DataFrame sheet_name: str | int = ..., + *, header: int | Sequence[int] | None = ..., names: list[str] | None = ..., index_col: int | Sequence[int] | None = ..., @@ -392,8 +374,6 @@ def read_excel( decimal: str = ..., comment: str | None = ..., skipfooter: int = ..., - convert_float: bool | None = ..., - mangle_dupe_cols: bool = ..., storage_options: StorageOptions = ..., ) -> DataFrame: ... @@ -404,6 +384,7 @@ def read_excel( io, # sheet name is list or None -> dict[IntStrT, DataFrame] sheet_name: list[IntStrT] | None, + *, header: int | Sequence[int] | None = ..., names: list[str] | None = ..., index_col: int | Sequence[int] | None = ..., @@ -431,20 +412,17 @@ def read_excel( decimal: str = ..., comment: str | None = ..., skipfooter: int = ..., - convert_float: bool | None = ..., - mangle_dupe_cols: bool = ..., storage_options: StorageOptions = ..., ) -> dict[IntStrT, DataFrame]: ... @doc(storage_options=_shared_docs["storage_options"]) -@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None) -@deprecate_nonkeyword_arguments(allowed_args=["io", "sheet_name"], version="2.0") @Appender(_read_excel_doc) def read_excel( io, sheet_name: str | int | list[IntStrT] | None = 0, + *, header: int | Sequence[int] | None = 0, names: list[str] | None = None, index_col: int | Sequence[int] | None = None, @@ -472,8 +450,6 @@ def read_excel( decimal: str = ".", comment: str | None = None, skipfooter: int = 0, - convert_float: bool | None = None, - mangle_dupe_cols: bool = True, storage_options: StorageOptions = None, ) -> DataFrame | dict[IntStrT, DataFrame]: @@ -511,8 +487,6 @@ def read_excel( decimal=decimal, comment=comment, skipfooter=skipfooter, - convert_float=convert_float, - mangle_dupe_cols=mangle_dupe_cols, ) finally: # make sure to close opened file handles @@ -588,7 +562,7 @@ def get_sheet_by_index(self, index: int): pass @abc.abstractmethod - def get_sheet_data(self, sheet, convert_float: bool, rows: int | None = None): + def get_sheet_data(self, sheet, rows: int | None = None): pass def raise_if_bad_sheet_by_index(self, index: int) -> None: @@ -716,20 +690,9 @@ def parse( decimal: str = ".", comment: str | None = None, skipfooter: int = 0, - convert_float: bool | None = None, - mangle_dupe_cols: bool = True, **kwds, ): - if convert_float is None: - convert_float = True - else: - warnings.warn( - "convert_float is deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - validate_header_arg(header) validate_integer("nrows", nrows) @@ -763,7 +726,7 @@ def parse( sheet = self.get_sheet_by_index(asheetname) file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows) - data = self.get_sheet_data(sheet, convert_float, file_rows_needed) + data = self.get_sheet_data(sheet, file_rows_needed) if hasattr(sheet, "close"): # pyxlsb opens two TemporaryFiles sheet.close() @@ -885,7 +848,6 @@ def parse( comment=comment, skipfooter=skipfooter, usecols=usecols, - mangle_dupe_cols=mangle_dupe_cols, **kwds, ) @@ -1718,8 +1680,6 @@ def parse( thousands: str | None = None, comment: str | None = None, skipfooter: int = 0, - convert_float: bool | None = None, - mangle_dupe_cols: bool = True, **kwds, ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]: """ @@ -1751,8 +1711,6 @@ def parse( thousands=thousands, comment=comment, skipfooter=skipfooter, - convert_float=convert_float, - mangle_dupe_cols=mangle_dupe_cols, **kwds, ) diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 075590f3535fe..8d2434e96ca61 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -90,7 +90,7 @@ def get_sheet_by_name(self, name: str): raise ValueError(f"sheet {name} not found") def get_sheet_data( - self, sheet, convert_float: bool, file_rows_needed: int | None = None + self, sheet, file_rows_needed: int | None = None ) -> list[list[Scalar | NaTType]]: """ Parse an ODF Table into a list of lists @@ -122,7 +122,7 @@ def get_sheet_data( for sheet_cell in sheet_cells: if sheet_cell.qname == table_cell_name: - value = self._get_cell_value(sheet_cell, convert_float) + value = self._get_cell_value(sheet_cell) else: value = self.empty_value @@ -183,7 +183,7 @@ def _is_empty_row(self, row) -> bool: return True - def _get_cell_value(self, cell, convert_float: bool) -> Scalar | NaTType: + def _get_cell_value(self, cell) -> Scalar | NaTType: from odf.namespaces import OFFICENS if str(cell) == "#N/A": @@ -199,10 +199,9 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar | NaTType: elif cell_type == "float": # GH5394 cell_value = float(cell.attributes.get((OFFICENS, "value"))) - if convert_float: - val = int(cell_value) - if val == cell_value: - return val + val = int(cell_value) + if val == cell_value: + return val return cell_value elif cell_type == "percentage": cell_value = cell.attributes.get((OFFICENS, "value")) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 6fde319b3a81e..5572116ca29fe 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -581,7 +581,7 @@ def get_sheet_by_index(self, index: int): self.raise_if_bad_sheet_by_index(index) return self.book.worksheets[index] - def _convert_cell(self, cell, convert_float: bool) -> Scalar: + def _convert_cell(self, cell) -> Scalar: from openpyxl.cell.cell import ( TYPE_ERROR, @@ -593,18 +593,15 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: elif cell.data_type == TYPE_ERROR: return np.nan elif cell.data_type == TYPE_NUMERIC: - # GH5394, GH46988 - if convert_float: - val = int(cell.value) - if val == cell.value: - return val - else: - return float(cell.value) + val = int(cell.value) + if val == cell.value: + return val + return float(cell.value) return cell.value def get_sheet_data( - self, sheet, convert_float: bool, file_rows_needed: int | None = None + self, sheet, file_rows_needed: int | None = None ) -> list[list[Scalar]]: if self.book.read_only: @@ -613,7 +610,7 @@ def get_sheet_data( data: list[list[Scalar]] = [] last_row_with_data = -1 for row_number, row in enumerate(sheet.rows): - converted_row = [self._convert_cell(cell, convert_float) for cell in row] + converted_row = [self._convert_cell(cell) for cell in row] while converted_row and converted_row[-1] == "": # trim trailing empty elements converted_row.pop() diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index 5d40ccdf2f8f3..634baee63137e 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -65,12 +65,12 @@ def get_sheet_by_index(self, index: int): # There's a fix for this in the source, but the pypi package doesn't have it return self.book.get_sheet(index + 1) - def _convert_cell(self, cell, convert_float: bool) -> Scalar: + def _convert_cell(self, cell) -> Scalar: # TODO: there is no way to distinguish between floats and datetimes in pyxlsb # This means that there is no way to read datetime types from an xlsb file yet if cell.v is None: return "" # Prevents non-named columns from not showing up as Unnamed: i - if isinstance(cell.v, float) and convert_float: + if isinstance(cell.v, float): val = int(cell.v) if val == cell.v: return val @@ -82,7 +82,6 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: def get_sheet_data( self, sheet, - convert_float: bool, file_rows_needed: int | None = None, ) -> list[list[Scalar]]: data: list[list[Scalar]] = [] @@ -91,7 +90,7 @@ def get_sheet_data( # not returned. The cells are namedtuples of row, col, value (r, c, v). for row in sheet.rows(sparse=True): row_number = row[0].r - converted_row = [self._convert_cell(cell, convert_float) for cell in row] + converted_row = [self._convert_cell(cell) for cell in row] while converted_row and converted_row[-1] == "": # trim trailing empty elements converted_row.pop() diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 0bf3ac6134cf6..171705dee6e59 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -62,7 +62,7 @@ def get_sheet_by_index(self, index): return self.book.sheet_by_index(index) def get_sheet_data( - self, sheet, convert_float: bool, file_rows_needed: int | None = None + self, sheet, file_rows_needed: int | None = None ) -> list[list[Scalar]]: from xlrd import ( XL_CELL_BOOLEAN, @@ -104,7 +104,7 @@ def _parse_cell(cell_contents, cell_typ): cell_contents = np.nan elif cell_typ == XL_CELL_BOOLEAN: cell_contents = bool(cell_contents) - elif convert_float and cell_typ == XL_CELL_NUMBER: + elif cell_typ == XL_CELL_NUMBER: # GH5394 - Excel 'numbers' are always floats # it's a minimal perf hit and less surprising val = int(cell_contents) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 362e8146fd291..16fbf54bbe394 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -405,7 +405,6 @@ def test_reader_special_dtypes(self, request, read_ext): "FloatCol": [1.25, 2.25, 1.83, 1.92, 0.0000000005], "BoolCol": [True, False, True, True, False], "StrCol": [1, 2, 3, 4, 5], - # GH5394 - this is why convert_float isn't vectorized "Str2Col": ["a", 3, "c", "d", "e"], "DateCol": [ datetime(2013, 10, 30), @@ -424,19 +423,8 @@ def test_reader_special_dtypes(self, request, read_ext): # if not coercing number, then int comes in as float float_expected = expected.copy() - float_expected["IntCol"] = float_expected["IntCol"].astype(float) float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 - with tm.assert_produces_warning( - FutureWarning, - match="convert_float is deprecated", - raise_on_extra_warnings=False, - ): - # raise_on_extra_warnings because xlrd raises a PendingDeprecationWarning - # on database job Linux_py37_IO (ci/deps/actions-37-db.yaml) - # See GH#41176 - actual = pd.read_excel( - basename + read_ext, sheet_name="Sheet1", convert_float=False - ) + actual = pd.read_excel(basename + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, float_expected) # check setting Index (assuming xls and xlsx are the same here) @@ -447,31 +435,12 @@ def test_reader_special_dtypes(self, request, read_ext): exp = expected.set_index(name) tm.assert_frame_equal(actual, exp) - # convert_float and converters should be different but both accepted expected["StrCol"] = expected["StrCol"].apply(str) actual = pd.read_excel( basename + read_ext, sheet_name="Sheet1", converters={"StrCol": str} ) tm.assert_frame_equal(actual, expected) - no_convert_float = float_expected.copy() - no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) - with tm.assert_produces_warning( - FutureWarning, - match="convert_float is deprecated", - raise_on_extra_warnings=False, - ): - # raise_on_extra_warnings because xlrd raises a PendingDeprecationWarning - # on database job Linux_py37_IO (ci/deps/actions-37-db.yaml) - # See GH#41176 - actual = pd.read_excel( - basename + read_ext, - sheet_name="Sheet1", - convert_float=False, - converters={"StrCol": str}, - ) - tm.assert_frame_equal(actual, no_convert_float) - # GH8212 - support for converters and missing values def test_reader_converters(self, read_ext): @@ -1276,11 +1245,9 @@ def test_read_excel_squeeze(self, read_ext): tm.assert_series_equal(actual, expected) def test_deprecated_kwargs(self, read_ext): - with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False): + with pytest.raises(TypeError, match="but 3 positional arguments"): pd.read_excel("test1" + read_ext, "Sheet1", 0) - pd.read_excel("test1" + read_ext) - def test_no_header_with_list_index_col(self, read_ext): # GH 31783 file_name = "testmultiindex" + read_ext diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index d4b74ddbd66e0..897d6969ea6ae 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -471,18 +471,6 @@ def test_int_types(self, np_type, path): recons2 = pd.read_excel(path, sheet_name="test1", index_col=0) tm.assert_frame_equal(int_frame, recons2) - # Test with convert_float=False comes back as float. - float_frame = df.astype(float) - float_frame.columns = float_frame.columns.astype(float) - float_frame.index = float_frame.index.astype(float) - with tm.assert_produces_warning( - FutureWarning, match="convert_float is deprecated" - ): - recons = pd.read_excel( - path, sheet_name="test1", convert_float=False, index_col=0 - ) - tm.assert_frame_equal(recons, float_frame) - @pytest.mark.parametrize("np_type", [np.float16, np.float32, np.float64]) def test_float_types(self, np_type, path): # Test np.float values read come back as float. @@ -972,15 +960,6 @@ def test_duplicated_columns(self, path): result = pd.read_excel(path, sheet_name="test1", index_col=0) tm.assert_frame_equal(result, expected) - # Explicitly, we pass in the parameter. - with tm.assert_produces_warning( - FutureWarning, match="the 'mangle_dupe_cols' keyword is deprecated" - ): - result = pd.read_excel( - path, sheet_name="test1", index_col=0, mangle_dupe_cols=True - ) - tm.assert_frame_equal(result, expected) - # see gh-11007, gh-10970 df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A", "B"]) df.to_excel(path, "test1") @@ -998,15 +977,6 @@ def test_duplicated_columns(self, path): expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) tm.assert_frame_equal(result, expected) - msg = "Setting mangle_dupe_cols=False is not supported yet" - with tm.assert_produces_warning( - FutureWarning, match="the 'mangle_dupe_cols' keyword is deprecated" - ): - with pytest.raises(ValueError, match=msg): - pd.read_excel( - path, sheet_name="test1", header=None, mangle_dupe_cols=False - ) - def test_swapped_columns(self, path): # Test for issue #5427. write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) @@ -1212,21 +1182,15 @@ def test_merged_cell_custom_objects(self, path): (pd.Period("2018"), pd.Period("2018Q2")), ] ) - expected = DataFrame(np.ones((2, 2)), columns=mi) + expected = DataFrame(np.ones((2, 2), dtype="int64"), columns=mi) expected.to_excel(path) - with tm.assert_produces_warning( - FutureWarning, match="convert_float is deprecated" - ): - result = pd.read_excel( - path, header=[0, 1], index_col=0, convert_float=False - ) + result = pd.read_excel(path, header=[0, 1], index_col=0) # need to convert PeriodIndexes to standard Indexes for assert equal expected.columns = expected.columns.set_levels( [[str(i) for i in mi.levels[0]], [str(i) for i in mi.levels[1]]], level=[0, 1], ) - expected.index = expected.index.astype(np.float64) - tm.assert_frame_equal(expected, result) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", [None, object]) def test_raise_when_saving_timezones(self, dtype, tz_aware_fixture, path):