Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handling kwargs to be passed to pandas.ExcelFile #877

Merged
2 changes: 1 addition & 1 deletion .github/workflows/nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ jobs:
#------------------------------------------------
- name: Install dependencies
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: poetry install --no-interaction --with dev,docs,optional_io_formats,optional_plotting,tutorials,wbdata --no-root
run: poetry install --no-interaction --with calamine,dev,docs,optional_io_formats,optional_plotting,tutorials,wbdata --no-root

#------------------------
# install root project
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pytest-legacy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ jobs:
#------------------------------------------------
- name: Install dependencies
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: poetry install --no-interaction --with dev,optional_io_formats,optional_plotting,tutorials --no-root
run: poetry install --no-interaction --with calamine,dev,optional_io_formats,optional_plotting,tutorials --no-root

#------------------------
# install root project
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ jobs:
# install your project
#------------------------
- name: Install library
run: poetry install --no-interaction --with dev,optional_io_formats,optional_plotting,tutorials,wbdata
run: poetry install --no-interaction --with calamine,dev,optional_io_formats,optional_plotting,tutorials,wbdata

# run tests without Matplotlib & Codecov tests on earlier Python versions
- name: Test with pytest
Expand Down
2 changes: 1 addition & 1 deletion RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Next release

- [#xxx](https://github.com/IAMconsortium/pyam/pull/xxx) Description of the PR
- [#877](https://github.com/IAMconsortium/pyam/pull/877) Support `engine` and other `pd.ExcelFile` keywords.

# Release v2.2.4

Expand Down
112 changes: 111 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
ILLEGAL_COLS,
META_IDX,
format_data,
get_excel_file_with_kwargs,
is_list_like,
make_index,
merge_exclude,
Expand Down Expand Up @@ -197,7 +198,7 @@ def _init(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs): # noqa: C

# if initializing from xlsx, try to load `meta` table from file
if meta_sheet and isinstance(data, Path) and data.suffix in [".xlsx", ".xls"]:
excel_file = pd.ExcelFile(data)
excel_file, kwargs = get_excel_file_with_kwargs(data, **kwargs)
if meta_sheet in excel_file.sheet_names:
self.load_meta(excel_file, sheet_name=meta_sheet, ignore_conflict=True)

Expand Down
28 changes: 27 additions & 1 deletion pyam/utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import importlib.metadata
import itertools
import logging
import re
import string
import warnings
from pathlib import Path

import dateutil
import numpy as np
import packaging.version
import pandas as pd
from pandas.api.types import is_list_like

Expand Down Expand Up @@ -94,13 +97,36 @@ def write_sheet(writer, name, df, index=False):
writer.sheets[name].set_column(i, i, width) # assumes xlsxwriter as engine


def get_excel_file_with_kwargs(path, **kwargs):
    """Return a `pandas.ExcelFile` and a dict of unused kwargs.

    When reading an Excel file, this function finds keyword arguments that
    should be passed to `pandas.ExcelFile`, and returns a `pandas.ExcelFile`
    instance along with the remaining keyword arguments (which presumably
    will be used for other purposes by the calling function).

    Parameters
    ----------
    path : path-like or file-like
        Passed unchanged as the first argument to `pandas.ExcelFile`.
    **kwargs
        Arbitrary keyword arguments. The keys `engine`, `storage_options`
        and `engine_kwargs` are forwarded to `pandas.ExcelFile`; all other
        items are handed back to the caller.

    Returns
    -------
    tuple of (pandas.ExcelFile, dict)
        The opened Excel file and the keyword arguments that were not
        consumed. The file is not closed here; the caller owns it.

    Warns
    -----
    UserWarning
        If `engine_kwargs` is given and the installed pandas is older
        than 2.2.0.
    """
    EXCEL_FILE_KWS = ("engine", "storage_options", "engine_kwargs")

    # `kwargs` is already a fresh dict owned by this call, so it can be split
    # into two new dicts without affecting the caller's mapping.
    excel_file_kwargs = {k: kwargs[k] for k in EXCEL_FILE_KWS if k in kwargs}
    remaining_kwargs = {
        k: v for k, v in kwargs.items() if k not in excel_file_kwargs
    }

    # Only warn when `engine_kwargs` is actually in use; otherwise every Excel
    # read on an older pandas would emit an irrelevant warning.
    if "engine_kwargs" in excel_file_kwargs and (
        packaging.version.parse(importlib.metadata.version("pandas"))
        < packaging.version.parse("2.2.0")
    ):
        warnings.warn(
            "pandas < 2.2.0 has inconsistent support for `engine_kwargs`. "
            "Using it is likely to result in an exception.",
            stacklevel=2,
        )

    return pd.ExcelFile(path, **excel_file_kwargs), remaining_kwargs


def read_pandas(path, sheet_name=["data*", "Data*"], *args, **kwargs):
"""Read a file and return a pandas.DataFrame"""

if isinstance(path, Path) and path.suffix == ".csv":
return pd.read_csv(path, *args, **kwargs)

with pd.ExcelFile(path) as xl:
xlfile, kwargs = get_excel_file_with_kwargs(path, **kwargs)
with xlfile as xl:
# reading multiple sheets
sheet_names = pd.Series(xl.sheet_names)
if len(sheet_names) > 1:
Expand Down
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,12 @@ optional = true
[tool.poetry.group.unfccc.dependencies]
unfccc_di_api = ">=3.0.1"

[tool.poetry.group.calamine]
optional = true

[tool.poetry.group.calamine.dependencies]
python-calamine = ">=0.2.3"

[build-system]
build-backend = "poetry_dynamic_versioning.backend"
requires = ["poetry-core>=1.2.0", "poetry-dynamic-versioning"]
Expand Down
Binary file added tests/data/test_df.xlsx
Binary file not shown.
49 changes: 49 additions & 0 deletions tests/test_io.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import importlib.metadata
from pathlib import Path

import numpy as np
import packaging
import pandas as pd
import pytest

Expand All @@ -17,6 +19,12 @@
except ModuleNotFoundError: # pragma: no cover
has_xlrd = False

try:
import python_calamine # noqa: F401
has_calamine = True
except ModuleNotFoundError: # pragma: no cover
has_calamine = False


FILTER_ARGS = dict(scenario="scen_a")

Expand Down Expand Up @@ -118,6 +126,47 @@ def test_read_xls(test_df_year):
assert_iamframe_equal(test_df_year, import_df)


@pytest.mark.skipif(
    packaging.version.parse(importlib.metadata.version("pandas"))
    < packaging.version.parse("2.2.0"),
    reason="pandas < 2.2.0 has inconsistent support for `engine_kwargs`",
)
def test_read_xlsx_kwargs(test_df_year):
    """Check that Excel-related kwargs given to `IamDataFrame` are honoured.

    Keyword arguments to `IamDataFrame.__init__` should reach either
    `pd.read_excel` or `pd.ExcelFile` when the source is an Excel file.
    """
    excel_path = TEST_DATA_DIR / "test_df.xlsx"

    # `engine_kwargs` has no real effect here; it is included only because a
    # crash would indicate that it is not being routed to `pd.ExcelFile`.
    read_options = dict(
        sheet_name="custom data sheet name",
        nrows=2,
        engine="openpyxl",
        engine_kwargs={"data_only": False},
    )
    observed = IamDataFrame(excel_path, **read_options)

    expected = test_df_year.filter(scenario="scen_a")
    assert_iamframe_equal(expected, observed)


@pytest.mark.skipif(not has_calamine, reason="Package 'python_calamine' not installed.")
@pytest.mark.skipif(
    packaging.version.parse(importlib.metadata.version("pandas"))
    < packaging.version.parse("2.2.0"),
    reason="`engine='calamine'` requires pandas >= 2.2.0",
)
def test_read_xlsx_calamine(test_df_year):
    """Check that an xlsx file is read correctly with the calamine engine.

    Also verifies that other Excel kwargs such as `sheet_name` are still
    handled correctly when a non-default engine is selected.
    """
    import_df = IamDataFrame(
        TEST_DATA_DIR / "test_df.xlsx",
        engine="calamine",
        sheet_name="custom data sheet name",
    )
    assert_iamframe_equal(import_df, test_df_year)


def test_init_df_with_na_unit(test_pd_df, tmpdir):
# missing values in the unit column are replaced by an empty string
test_pd_df.loc[1, "unit"] = np.nan
Expand Down