Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance of pyam.concat() #510

Merged
merged 11 commits into from
Mar 22, 2021
1 change: 1 addition & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ The following persons contributed to the development of the |pyam| framework:
- Maik Budzinski `@mabudz <https://github.com/mabudz>`_
- Jarmo Kikstra `@jkikstra <https://github.com/jkikstra>`_
- Michael Pimmer `@fonfon <https://github.com/fonfon>`_
- Patrick Jürgens `@pjuergens <https://github.com/pjuergens>`_
1 change: 1 addition & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ starting with `data` will be parsed for timeseries data.

## Individual updates

- [#510](https://github.com/IAMconsortium/pyam/pull/510) Improve performance of `pyam.concat()`
- [#508](https://github.com/IAMconsortium/pyam/pull/508) Bugfix for non-empty but invisible header and no rows in 'meta' sheet
- [#502](https://github.com/IAMconsortium/pyam/pull/502) Switch to Black code style
- [#499](https://github.com/IAMconsortium/pyam/pull/499) Implement `order` feature in line plot
Expand Down
63 changes: 48 additions & 15 deletions pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,7 @@ def append(
Raises
------
ValueError
If time domain or other timeseries data index dimension don't match
If time domain or other timeseries data index dimension don't match.
"""
if not isinstance(other, IamDataFrame):
other = IamDataFrame(other, **kwargs)
Expand Down Expand Up @@ -2306,28 +2306,61 @@ def compare(
return ret[[right_label, left_label]]


def concat(dfs, ignore_meta_conflict=False, **kwargs):
    """Concatenate a series of IamDataFrame-like objects

    Parameters
    ----------
    dfs : iterable of IamDataFrames
        A non-string iterable (e.g., list, tuple or generator) of
        :class:`IamDataFrame` instances or objects that can be cast to one.
    ignore_meta_conflict : bool, default False
        If False, raise an error if any meta columns present in `dfs` are not identical.
        If True, values in earlier elements of `dfs` take precedence.
    kwargs
        Passed to :class:`IamDataFrame(other, **kwargs) <IamDataFrame>`
        for any item of `dfs` which isn't already an IamDataFrame.

    Returns
    -------
    IamDataFrame

    Raises
    ------
    TypeError
        If `dfs` is not an iterable or if it is a single :class:`pandas.DataFrame`.
    ValueError
        If `dfs` is empty, or if time domain or other timeseries data index
        dimension don't match.
    """
    if not islistable(dfs) or isinstance(dfs, pd.DataFrame):
        raise TypeError(
            f"First argument must be an iterable, "
            f"you passed an object of type '{dfs.__class__.__name__}'!"
        )

    # materialize once so that non-indexable iterables (e.g., generators) work
    # and an empty input can be reported explicitly instead of an IndexError
    dfs = list(dfs)
    if not dfs:
        raise ValueError("No objects to concatenate")

    # cast first element in list to IamDataFrame (if necessary)
    first = dfs[0]
    if not isinstance(first, IamDataFrame):
        first = IamDataFrame(first, **kwargs)
    ret_data, ret_meta = [first._data], first.meta
    index, time_col = first._data.index.names, first.time_col

    for df in dfs[1:]:
        # skip merging meta if element is a pd.DataFrame
        _meta_merge = not isinstance(df, pd.DataFrame)
        if not isinstance(df, IamDataFrame):
            df = IamDataFrame(df, **kwargs)

        if df.time_col != time_col:
            raise ValueError("Items have incompatible time format ('year' vs. 'time')!")

        if df._data.index.names != index:
            raise ValueError(
                "Items have incompatible timeseries data index dimensions!"
            )

        ret_data.append(df._data)
        if _meta_merge:
            ret_meta = merge_meta(ret_meta, df.meta, ignore_meta_conflict)

    # return as new IamDataFrame, this will verify integrity as part of `__init__()`
    return IamDataFrame(pd.concat(ret_data, verify_integrity=False), meta=ret_meta)


def read_datapackage(path, data="data", meta="meta"):
Expand Down
5 changes: 3 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,9 @@ def test_df_year():
# minimal IamDataFrame for specifically testing 'time'-column features
@pytest.fixture(scope="function")
def test_df_time():
    """Yield an IamDataFrame built from TEST_DF with datetime column labels"""
    # replace the two year columns by the datetime test values and build the
    # IamDataFrame exactly once (no intermediate, discarded instance)
    df = IamDataFrame(
        data=TEST_DF.rename({2005: TEST_DTS[0], 2010: TEST_DTS[1]}, axis="columns")
    )
    # attach each of the test meta columns
    for i in META_COLS:
        df.set_meta(META_DF[i])
    yield df
Expand Down
33 changes: 29 additions & 4 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1027,12 +1027,37 @@ def test_pd_join_by_meta_nonmatching_index(test_df):
pd.testing.assert_frame_equal(obs.sort_index(level=1), exp)


def test_concat_fails_iter():
    """Check that calling concat with an integer raises a TypeError"""
    with pytest.raises(TypeError):
        concat(1)
def test_concat_fails_iterable(test_pd_df):
    """Check that concat raises a TypeError naming the type of an invalid argument"""
    template = (
        "First argument must be an iterable, you passed an object of type '{}'!"
    )

    # map the expected type name in the error message to the offending argument
    invalid_args = {"int": 1, "str": "foo", "DataFrame": test_pd_df}

    for type_name, arg in invalid_args.items():
        expected = template.format(type_name)
        with pytest.raises(TypeError, match=expected):
            concat(arg)

def test_concat_fails_notdf():
    """Check that calling concat with a plain string raises a TypeError"""
    with pytest.raises(TypeError):
        concat("foo")

def test_concat_single_item(test_df):
    """Check that concatenating a one-element list yields an equal IamDataFrame"""
    assert_iamframe_equal(concat([test_df]), test_df)


def test_concat_incompatible_time(test_df_year, test_df_time):
    """Check that concat raises a ValueError when mixing 'year' and 'time' items"""
    # the message contains parentheses and a dot, so escape it for regex matching
    expected = re.escape("Items have incompatible time format ('year' vs. 'time')!")
    mixed = [test_df_year, test_df_time]
    with pytest.raises(ValueError, match=expected):
        concat(mixed)


def test_concat_incompatible_cols(test_pd_df):
    """Check that calling concat with incompatible index dimensions raises"""
    df1 = IamDataFrame(test_pd_df)
    # add an extra column on a copy so that the fixture itself is not modified
    df2 = IamDataFrame(test_pd_df.assign(extra_col="foo"))

    match = "Items have incompatible timeseries data index dimensions!"
    with pytest.raises(ValueError, match=match):
        concat([df1, df2])


def test_concat(test_df):
Expand Down