From 00ab4cf9638567547e8affb12c45ba42f22140ed Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Fri, 10 Jan 2025 13:50:17 +0400 Subject: [PATCH] docs(python): Add links to `read_excel` "engine_options" and "read_options" docstring --- py-polars/polars/_typing.py | 2 +- py-polars/polars/io/spreadsheet/functions.py | 26 ++++----- py-polars/tests/unit/io/test_spreadsheet.py | 58 ++++++++------------ 3 files changed, 36 insertions(+), 50 deletions(-) diff --git a/py-polars/polars/_typing.py b/py-polars/polars/_typing.py index 88a0882e5679..6103ea60e245 100644 --- a/py-polars/polars/_typing.py +++ b/py-polars/polars/_typing.py @@ -235,7 +235,7 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: BufferInfo: TypeAlias = tuple[int, int, int] # type alias for supported spreadsheet engines -ExcelSpreadsheetEngine: TypeAlias = Literal["xlsx2csv", "openpyxl", "calamine"] +ExcelSpreadsheetEngine: TypeAlias = Literal["calamine", "openpyxl", "xlsx2csv"] class SeriesBuffers(TypedDict): diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index 8af5d9069643..36ef5edd2555 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -255,14 +255,12 @@ def read_excel( """ Read Excel spreadsheet data into a DataFrame. + .. versionadded:: 1.20 + Support loading data from named table objects with `table_name` parameter. .. versionadded:: 1.18 Support loading data from a list (or glob pattern) of multiple workbooks. .. versionchanged:: 1.0 Default engine is now "calamine" (was "xlsx2csv"). - .. versionadded:: 0.20.6 - Added "calamine" fastexcel engine for Excel Workbooks (.xlsx, .xlsb, .xls). - .. versionadded:: 0.19.3 - Added "openpyxl" engine, and added `schema_overrides` parameter. 
Parameters ---------- @@ -283,34 +281,32 @@ def read_excel( the workbook, so additionally specifying a sheet id or name is optional; if one of those parameters *is* specified, an error will be raised if the named table is not found in that particular sheet. - engine : {'calamine', 'xlsx2csv', 'openpyxl'} + engine : {'calamine', 'openpyxl', 'xlsx2csv'} Library used to parse the spreadsheet file; defaults to "calamine". * "calamine": this engine can be used for reading all major types of Excel Workbook (`.xlsx`, `.xlsb`, `.xls`) and is *dramatically* faster than the other options, using the `fastexcel` module to bind the Calamine parser. + * "openpyxl": this engine is significantly slower than both `calamine` and + `xlsx2csv` but can provide a useful fallback if you are otherwise unable + to read data from your workbook. * "xlsx2csv": converts the data to an in-memory CSV before using the native - polars `read_csv` method to parse the result. You can pass `engine_options` - and `read_options` to refine the conversion. - * "openpyxl": this engine is significantly slower than `xlsx2csv` but supports - additional automatic type inference; potentially useful if you are otherwise - unable to parse your sheet with the `xlsx2csv` engine in conjunction with the - `schema_overrides` parameter. + polars `read_csv` method to parse the result. engine_options Additional options passed to the underlying engine's primary parsing constructor (given below), if supported: * "calamine": n/a (can only provide `read_options`) - * "xlsx2csv": `Xlsx2csv` - * "openpyxl": `load_workbook` + * "openpyxl": `load_workbook <https://openpyxl.readthedocs.io/en/stable/api/openpyxl.reader.excel.html#openpyxl.reader.excel.load_workbook>`_ + * "xlsx2csv": `Xlsx2csv <https://github.com/dilshod/xlsx2csv>`_ read_options Options passed to the underlying engine method that reads the sheet data. Where supported, this allows for additional control over parsing. 
The specific read methods associated with each engine are: - * "calamine": `ExcelReader.load_sheet_by_name` - * "xlsx2csv": `pl.read_csv` + * "calamine": `load_sheet_by_name <https://fastexcel.toucantoco.dev/fastexcel.html#ExcelReader.load_sheet_by_name>`_ * "openpyxl": n/a (can only provide `engine_options`) + * "xlsx2csv": see :meth:`read_csv` has_header Indicate if the first row of the table data is a header or not. If False, column names will be autogenerated in the following format: `column_x`, with diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index 4d7217c3f7cc..417eb894634e 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ b/py-polars/tests/unit/io/test_spreadsheet.py @@ -283,7 +283,7 @@ def test_read_excel_all_sheets( @pytest.mark.parametrize( "engine", - ["xlsx2csv", "calamine", "openpyxl"], + ["calamine", "openpyxl", "xlsx2csv"], ) def test_read_excel_basic_datatypes(engine: ExcelSpreadsheetEngine) -> None: df = pl.DataFrame( @@ -471,7 +471,7 @@ def test_read_mixed_dtype_columns( ) -@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"]) +@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"]) def test_write_excel_bytes(engine: ExcelSpreadsheetEngine) -> None: df = pl.DataFrame({"colx": [1.5, -2, 0], "coly": ["a", None, "c"]}) @@ -634,7 +634,7 @@ def test_unsupported_binary_workbook(path_xlsb: Path) -> None: pl.read_excel(path_xlsb, engine="openpyxl") -@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"]) +@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"]) def test_read_excel_all_sheets_with_sheet_name(path_xlsx: Path, engine: str) -> None: with pytest.raises( ValueError, @@ -793,7 +793,7 @@ def test_excel_round_trip(write_params: dict[str, Any]) -> None: assert_frame_equal(df, xldf) -@pytest.mark.parametrize("engine", ["xlsx2csv", "calamine"]) +@pytest.mark.parametrize("engine", ["calamine", "xlsx2csv"]) def test_excel_write_column_and_row_totals(engine: ExcelSpreadsheetEngine) -> None: df = 
pl.DataFrame( { @@ -828,7 +828,7 @@ def test_excel_write_column_and_row_totals(engine: ExcelSpreadsheetEngine) -> No assert xldf.row(-1) == (None, 0.0, 0.0, 0, 0, None, 0.0, 0) -@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"]) +@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"]) def test_excel_write_compound_types(engine: ExcelSpreadsheetEngine) -> None: df = pl.DataFrame( {"x": [[1, 2], [3, 4], [5, 6]], "y": ["a", "b", "c"], "z": [9, 8, 7]} @@ -925,7 +925,7 @@ def test_excel_write_to_file_object( assert_frame_equal(df, pl.read_excel(src, engine=engine)) -@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"]) +@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"]) def test_excel_read_no_headers(engine: ExcelSpreadsheetEngine) -> None: df = pl.DataFrame( {"colx": [1, 2, 3], "coly": ["aaa", "bbb", "ccc"], "colz": [0.5, 0.0, -1.0]} @@ -938,7 +938,7 @@ def test_excel_read_no_headers(engine: ExcelSpreadsheetEngine) -> None: assert_frame_equal(df, expected) -@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"]) +@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"]) def test_excel_write_sparklines(engine: ExcelSpreadsheetEngine) -> None: from xlsxwriter import Workbook @@ -1217,7 +1217,7 @@ def test_excel_mixed_calamine_float_data(io_files_path: Path) -> None: ) -@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"]) +@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"]) @pytest.mark.may_fail_auto_streaming # read->scan_csv dispatch, _read_spreadsheet_xlsx2csv needs to be changed not to call `_reorder_columns` on the df def test_excel_type_inference_with_nulls(engine: ExcelSpreadsheetEngine) -> None: df = pl.DataFrame( @@ -1255,36 +1255,26 @@ def test_excel_type_inference_with_nulls(engine: ExcelSpreadsheetEngine) -> None assert_frame_equal(df.select(reversed_cols), read_df) -def 
test_drop_empty_rows(path_empty_rows_excel: Path) -> None: - df1 = pl.read_excel(source=path_empty_rows_excel, engine="xlsx2csv") +@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"]) +def test_drop_empty_rows( + path_empty_rows_excel: Path, engine: ExcelSpreadsheetEngine +) -> None: + df1 = pl.read_excel( + source=path_empty_rows_excel, + engine=engine, + ) # check default assert df1.shape == (8, 4) + df2 = pl.read_excel( - source=path_empty_rows_excel, engine="xlsx2csv", drop_empty_rows=True + source=path_empty_rows_excel, + engine=engine, + drop_empty_rows=True, ) assert df2.shape == (8, 4) + df3 = pl.read_excel( - source=path_empty_rows_excel, engine="xlsx2csv", drop_empty_rows=False + source=path_empty_rows_excel, + engine=engine, + drop_empty_rows=False, ) assert df3.shape == (10, 4) - - df4 = pl.read_excel(source=path_empty_rows_excel, engine="openpyxl") - assert df4.shape == (8, 4) - df5 = pl.read_excel( - source=path_empty_rows_excel, engine="openpyxl", drop_empty_rows=True - ) - assert df5.shape == (8, 4) - df6 = pl.read_excel( - source=path_empty_rows_excel, engine="openpyxl", drop_empty_rows=False - ) - assert df6.shape == (10, 4) - - df7 = pl.read_excel(source=path_empty_rows_excel, engine="calamine") - assert df7.shape == (8, 4) - df8 = pl.read_excel( - source=path_empty_rows_excel, engine="calamine", drop_empty_rows=True - ) - assert df8.shape == (8, 4) - df9 = pl.read_excel( - source=path_empty_rows_excel, engine="calamine", drop_empty_rows=False - ) - assert df9.shape == (10, 4)