Skip to content

Commit

Permalink
Improve performance of pyam.concat() (#510)
Browse files Browse the repository at this point in the history
Co-authored-by: pjuergens <74722312+pjuergens@users.noreply.github.com>
  • Loading branch information
danielhuppmann and pjuergens authored Mar 22, 2021
1 parent ebceba1 commit 32b06f7
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 21 deletions.
1 change: 1 addition & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ The following persons contributed to the development of the |pyam| framework:
- Maik Budzinski `@mabudz <https://github.com/mabudz>`_
- Jarmo Kikstra `@jkikstra <https://github.com/jkikstra>`_
- Michael Pimmer `@fonfon <https://github.com/fonfon>`_
- Patrick Jürgens `@pjuergens <https://github.com/pjuergens>`_
1 change: 1 addition & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ starting with `data` will be parsed for timeseries data.

## Individual updates

- [#510](https://github.com/IAMconsortium/pyam/pull/510) Improve performance of `pyam.concat()`
- [#508](https://github.com/IAMconsortium/pyam/pull/508) Bugfix for non-empty but invisible header and no rows in 'meta' sheet
- [#502](https://github.com/IAMconsortium/pyam/pull/502) Switch to Black code style
- [#499](https://github.com/IAMconsortium/pyam/pull/499) Implement `order` feature in line plot
Expand Down
63 changes: 48 additions & 15 deletions pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,7 @@ def append(
Raises
------
ValueError
If time domain or other timeseries data index dimension don't match
If time domain or other timeseries data index dimension don't match.
"""
if not isinstance(other, IamDataFrame):
other = IamDataFrame(other, **kwargs)
Expand Down Expand Up @@ -2306,28 +2306,61 @@ def compare(
return ret[[right_label, left_label]]


def concat(dfs):
def concat(dfs, ignore_meta_conflict=False, **kwargs):
"""Concatenate a series of IamDataFrame-like objects
Parameters
----------
dfs : list of IamDataFrames
a list of :class:`IamDataFrame` instances
A list of :class:`IamDataFrame` instances
ignore_meta_conflict : bool, default False
If False, raise an error if any meta columns present in `dfs` are not identical.
If True, values in earlier elements of `dfs` take precedence.
kwargs
Passed to :class:`IamDataFrame(other, **kwargs) <IamDataFrame>`
for any item of `dfs` which isn't already an IamDataFrame.
Returns
-------
IamDataFrame
Raises
------
TypeError
If `dfs` is not a list-like iterable (e.g., a string, a scalar or a `pandas.DataFrame`).
ValueError
If the time domain or other timeseries data index dimensions don't match.
"""
if isstr(dfs) or not hasattr(dfs, "__iter__"):
msg = "Argument must be a non-string iterable (e.g., list or tuple)"
raise TypeError(msg)
if not islistable(dfs) or isinstance(dfs, pd.DataFrame):
raise TypeError(
f"First argument must be an iterable, "
f"you passed an object of type '{dfs.__class__.__name__}'!"
)

_df = None
for df in dfs:
df = df if isinstance(df, IamDataFrame) else IamDataFrame(df)
if _df is None:
_df = df.copy()
else:
_df.append(df, inplace=True, verify_integrity=False)
# cast first element in list to IamDataFrame (if necessary)
df = dfs[0] if isinstance(dfs[0], IamDataFrame) else IamDataFrame(dfs[0], **kwargs)
ret_data, ret_meta = [df._data], df.meta
index, time_col = df._data.index.names, df.time_col

for df in dfs[1:]:
# skip merging meta if element is a pd.DataFrame
_meta_merge = False if isinstance(df, pd.DataFrame) else True
df = IamDataFrame(df, **kwargs) if not isinstance(df, IamDataFrame) else df

if df.time_col != time_col:
raise ValueError("Items have incompatible time format ('year' vs. 'time')!")

if df._data.index.names != index:
raise ValueError(
"Items have incompatible timeseries data index dimensions!"
)

ret_data.append(df._data)
if _meta_merge:
ret_meta = merge_meta(ret_meta, df.meta, ignore_meta_conflict)

verify_index_integrity(_df._data)
return _df
# return as new IamDataFrame, this will verify integrity as part of `__init__()`
return IamDataFrame(pd.concat(ret_data, verify_integrity=False), meta=ret_meta)


def read_datapackage(path, data="data", meta="meta"):
Expand Down
5 changes: 3 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,9 @@ def test_df_year():
# minimal IamDataFrame for specifically testing 'time'-column features
@pytest.fixture(scope="function")
def test_df_time():
tdf = TEST_DF.rename({2005: TEST_DTS[0], 2010: TEST_DTS[1]}, axis="columns")
df = IamDataFrame(data=tdf)
df = IamDataFrame(
data=TEST_DF.rename({2005: TEST_DTS[0], 2010: TEST_DTS[1]}, axis="columns")
)
for i in META_COLS:
df.set_meta(META_DF[i])
yield df
Expand Down
33 changes: 29 additions & 4 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1027,12 +1027,37 @@ def test_pd_join_by_meta_nonmatching_index(test_df):
pd.testing.assert_frame_equal(obs.sort_index(level=1), exp)


def test_concat_fails_iter():
pytest.raises(TypeError, concat, 1)
def test_concat_fails_iterable(test_pd_df):
"""Check that calling concat with a non-iterable raises"""
msg = "First argument must be an iterable, you passed an object of type '{}'!"

for dfs, type_ in [(1, "int"), ("foo", "str"), (test_pd_df, "DataFrame")]:
with pytest.raises(TypeError, match=msg.format(type_)):
concat(dfs)

def test_concat_fails_notdf():
pytest.raises(TypeError, concat, "foo")

def test_concat_single_item(test_df):
    """Concatenating a one-element list must reproduce that element"""
    assert_iamframe_equal(concat([test_df]), test_df)


def test_concat_incompatible_time(test_df_year, test_df_time):
    """A 'year'-based and a 'time'-based item cannot be concatenated"""
    # the message contains parentheses and a dot, so escape it before using it as a regex
    expected = "Items have incompatible time format ('year' vs. 'time')!"
    with pytest.raises(ValueError, match=re.escape(expected)):
        concat([test_df_year, test_df_time])


def test_concat_incompatible_cols(test_pd_df):
    """Check that calling concat with mismatching index dimensions raises"""
    df1 = IamDataFrame(test_pd_df)
    # `assign` returns a new frame instead of mutating the fixture in place;
    # presumably the extra column is read as an additional index dimension
    df2 = IamDataFrame(test_pd_df.assign(extra_col="foo"))

    match = "Items have incompatible timeseries data index dimensions!"
    with pytest.raises(ValueError, match=match):
        concat([df1, df2])


def test_concat(test_df):
Expand Down

0 comments on commit 32b06f7

Please sign in to comment.