IAMconsortium · danielhuppmann · Aug 28, 2024 · Aug 26, 2024 · Aug 26, 2024 · Aug 26, 2024
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
@@ -68,7 +68,7 @@ jobs:
     #------------------------------------------------
     - name: Install dependencies
       if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
-      run: poetry install --no-interaction --with dev,docs,optional_io_formats,optional_plotting,tutorials,wbdata --no-root
+      run: poetry install --no-interaction --with calamine,dev,docs,optional_io_formats,optional_plotting,tutorials,wbdata --no-root
 
     #------------------------
     #  install root project

diff --git a/.github/workflows/pytest-legacy.yml b/.github/workflows/pytest-legacy.yml
@@ -61,7 +61,7 @@ jobs:
       #------------------------------------------------
       - name: Install dependencies
         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
-        run: poetry install --no-interaction --with dev,optional_io_formats,optional_plotting,tutorials --no-root
+        run: poetry install --no-interaction --with calamine,dev,optional_io_formats,optional_plotting,tutorials --no-root
 
       #------------------------
       #  install root project

diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
@@ -64,7 +64,7 @@ jobs:
       #  install your project
       #------------------------
       - name: Install library
-        run: poetry install --no-interaction --with dev,optional_io_formats,optional_plotting,tutorials,wbdata
+        run: poetry install --no-interaction --with calamine,dev,optional_io_formats,optional_plotting,tutorials,wbdata
 
       # run tests without Matplotlib & CodeCode tests on earlier Python versions
       - name: Test with pytest

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
@@ -1,6 +1,6 @@
 # Next release
 
-- [#xxx](https://github.com/IAMconsortium/pyam/pull/xxx) Description of the PR
+- [#877](https://github.com/IAMconsortium/pyam/pull/877) Support `engine` and other `pd.ExcelFile` keywords.
 
 # Release v2.2.4
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyam/core.py b/pyam/core.py
@@ -59,6 +59,7 @@
     ILLEGAL_COLS,
     META_IDX,
     format_data,
+    get_excel_file_with_kwargs,
     is_list_like,
     make_index,
     merge_exclude,
@@ -197,7 +198,7 @@ def _init(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs):  # noqa: C
 
         # if initializing from xlsx, try to load `meta` table from file
         if meta_sheet and isinstance(data, Path) and data.suffix in [".xlsx", ".xls"]:
-            excel_file = pd.ExcelFile(data)
+            excel_file, kwargs = get_excel_file_with_kwargs(data, **kwargs)
             if meta_sheet in excel_file.sheet_names:
                 self.load_meta(excel_file, sheet_name=meta_sheet, ignore_conflict=True)
 

diff --git a/pyam/utils.py b/pyam/utils.py
@@ -1,11 +1,14 @@
+import importlib.metadata
 import itertools
 import logging
 import re
 import string
+import warnings
 from pathlib import Path
 
 import dateutil
 import numpy as np
+import packaging.version
 import pandas as pd
 from pandas.api.types import is_list_like
 
@@ -94,13 +97,36 @@ def write_sheet(writer, name, df, index=False):
         writer.sheets[name].set_column(i, i, width)  # assumes xlsxwriter as engine
 
 
+def get_excel_file_with_kwargs(path, **kwargs):
+    """Return a `pandas.ExcelFile` and a dict of unused kwargs.
+
+    The keyword arguments `engine`, `storage_options` and `engine_kwargs` are
+    removed from `kwargs` and passed to `pandas.ExcelFile` to open `path`; all
+    other keyword arguments are returned for use by the calling function.
+    """
+    EXCEL_FILE_KWS = ("engine", "storage_options", "engine_kwargs")
+    kwargs = kwargs.copy()
+    excel_file_kwargs = {k: kwargs.pop(k) for k in EXCEL_FILE_KWS if k in kwargs}
+    # warn only if `engine_kwargs` is actually used, so that reading an Excel
+    # file without it stays silent on pandas < 2.2.0
+    if "engine_kwargs" in excel_file_kwargs and packaging.version.parse(
+        importlib.metadata.version("pandas")
+    ) < packaging.version.parse("2.2.0"):
+        warnings.warn(
+            "pandas < 2.2.0 has inconsistent support for `engine_kwargs`. "
+            "Using it is likely to result in an exception."
+        )
+    return pd.ExcelFile(path, **excel_file_kwargs), kwargs
+
+
 def read_pandas(path, sheet_name=["data*", "Data*"], *args, **kwargs):
     """Read a file and return a pandas.DataFrame"""
 
     if isinstance(path, Path) and path.suffix == ".csv":
         return pd.read_csv(path, *args, **kwargs)
 
-    with pd.ExcelFile(path) as xl:
+    xlfile, kwargs = get_excel_file_with_kwargs(path, **kwargs)
+    with xlfile as xl:
         # reading multiple sheets
         sheet_names = pd.Series(xl.sheet_names)
         if len(sheet_names) > 1:

diff --git a/pyproject.toml b/pyproject.toml
@@ -116,6 +116,12 @@ optional = true
 [tool.poetry.group.unfccc.dependencies]
 unfccc_di_api = ">=3.0.1"
 
+[tool.poetry.group.calamine]
+optional = true
+
+[tool.poetry.group.calamine.dependencies]
+python-calamine = ">=0.2.3"
+
 [build-system]
 build-backend = "poetry_dynamic_versioning.backend"
 requires = ["poetry-core>=1.2.0", "poetry-dynamic-versioning"]

diff --git a/tests/data/test_df.xlsx b/tests/data/test_df.xlsx
diff --git a/tests/test_io.py b/tests/test_io.py
@@ -1,6 +1,8 @@
+import importlib.metadata
 from pathlib import Path
 
 import numpy as np
+import packaging
 import pandas as pd
 import pytest
 
@@ -17,6 +19,12 @@
 except ModuleNotFoundError:  # pragma: no cover
     has_xlrd = False
 
+# `python_calamine` is an optional dependency, record whether it can be imported
+# so that tests using the calamine engine can be skipped if it is not installed
+try:
+    import python_calamine  # noqa: F401
+    has_calamine = True
+except ModuleNotFoundError:  # pragma: no cover
+    has_calamine = False
+
 
 FILTER_ARGS = dict(scenario="scen_a")
 
@@ -118,6 +126,47 @@ def test_read_xls(test_df_year):
     assert_iamframe_equal(test_df_year, import_df)
 
 
+@pytest.mark.skipif(
+    packaging.version.parse(importlib.metadata.version("pandas"))
+    < packaging.version.parse("2.2.0"),
+    reason="pandas < 2.2.0 has inconsistent support for `engine_kwargs`",
+)
+def test_read_xlsx_kwargs(test_df_year):
+    """Check that Excel-specific kwargs of `IamDataFrame` are handled on import.
+
+    The value given for `engine_kwargs` is not expected to change the imported
+    data; it is included so that the test fails if the argument is not passed
+    on to `pd.ExcelFile` in a way that the reader accepts.
+    """
+    excel_kwargs = dict(
+        sheet_name="custom data sheet name",
+        nrows=2,
+        engine="openpyxl",
+        engine_kwargs={"data_only": False},
+    )
+    obs = IamDataFrame(TEST_DATA_DIR / "test_df.xlsx", **excel_kwargs)
+    exp = test_df_year.filter(scenario="scen_a")
+
+    assert_iamframe_equal(exp, obs)
+
+
+@pytest.mark.skipif(not has_calamine, reason="Package 'python_calamine' not installed.")
+@pytest.mark.skipif(
+    packaging.version.parse(importlib.metadata.version("pandas"))
+    < packaging.version.parse("2.2.0"),
+    reason="`engine='calamine'` requires pandas >= 2.2.0",
+)
+def test_read_xlsx_calamine(test_df_year):
+    # Test that an xlsx file is read correctly when using the calamine engine,
+    # and that excel kwargs such as `sheet_name` are still handled correctly
+    import_df = IamDataFrame(
+        TEST_DATA_DIR / "test_df.xlsx",
+        engine="calamine",
+        sheet_name="custom data sheet name",
+    )
+    assert_iamframe_equal(test_df_year, import_df)
+
+
 def test_init_df_with_na_unit(test_pd_df, tmpdir):
     # missing values in the unit column are replaced by an empty string
     test_pd_df.loc[1, "unit"] = np.nan