Improve performance of pyam.concat() (#510)

Co-authored-by: pjuergens <74722312+pjuergens@users.noreply.github.com>
IAMconsortium · Mar 22, 2021 · 32b06f7 · 32b06f7
1 parent ebceba1
commit 32b06f7
Show file tree

Hide file tree

Showing 5 changed files with 82 additions and 21 deletions.
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -17,3 +17,4 @@ The following persons contributed to the development of the |pyam| framework:
 - Maik Budzinski `@mabudz <https://github.com/mabudz>`_
 - Jarmo Kikstra `@jkikstra <https://github.com/jkikstra>`_
 - Michael Pimmer `@fonfon <https://github.com/fonfon>`_
+- Patrick Jürgens `@pjuergens <https://github.com/pjuergens>`_
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
@@ -8,6 +8,7 @@ starting with `data` will be parsed for timeseries data.
 
 ## Individual updates
 
+- [#510](https://github.com/IAMconsortium/pyam/pull/510) Improve performance of `pyam.concat()`
 - [#508](https://github.com/IAMconsortium/pyam/pull/508) Bugfix for non-empty but invisible header and no rows in 'meta' sheet
 - [#502](https://github.com/IAMconsortium/pyam/pull/502) Switch to Black code style
 - [#499](https://github.com/IAMconsortium/pyam/pull/499) Implement `order` feature in line plot 

diff --git a/pyam/core.py b/pyam/core.py
@@ -500,7 +500,7 @@ def append(
         Raises
         ------
         ValueError
-            If time domain or other timeseries data index dimension don't match
+            If time domain or other timeseries data index dimensions don't match.
         """
         if not isinstance(other, IamDataFrame):
             other = IamDataFrame(other, **kwargs)
@@ -2306,28 +2306,61 @@ def compare(
     return ret[[right_label, left_label]]
 
 
-def concat(dfs):
+def concat(dfs, ignore_meta_conflict=False, **kwargs):
     """Concatenate a series of IamDataFrame-like objects
 
     Parameters
     ----------
     dfs : list of IamDataFrames
-        a list of :class:`IamDataFrame` instances
+        A list of :class:`IamDataFrame` instances
+    ignore_meta_conflict : bool, default False
+        If False, raise an error if any meta columns present in `dfs` are not identical.
+        If True, values in earlier elements of `dfs` take precedence.
+    kwargs
+        Passed to :class:`IamDataFrame(other, **kwargs) <IamDataFrame>`
+        for any item of `dfs` which isn't already an IamDataFrame.
+
+    Returns
+    -------
+    IamDataFrame
+
+    Raises
+    ------
+    TypeError
+        If `dfs` is not a list.
+    ValueError
+        If time domain or other timeseries data index dimensions don't match.
     """
-    if isstr(dfs) or not hasattr(dfs, "__iter__"):
-        msg = "Argument must be a non-string iterable (e.g., list or tuple)"
-        raise TypeError(msg)
+    if not islistable(dfs) or isinstance(dfs, pd.DataFrame):
+        raise TypeError(
+            f"First argument must be an iterable, "
+            f"you passed an object of type '{dfs.__class__.__name__}'!"
+        )
 
-    _df = None
-    for df in dfs:
-        df = df if isinstance(df, IamDataFrame) else IamDataFrame(df)
-        if _df is None:
-            _df = df.copy()
-        else:
-            _df.append(df, inplace=True, verify_integrity=False)
+    # cast first element in list to IamDataFrame (if necessary)
+    df = dfs[0] if isinstance(dfs[0], IamDataFrame) else IamDataFrame(dfs[0], **kwargs)
+    ret_data, ret_meta = [df._data], df.meta
+    index, time_col = df._data.index.names, df.time_col
+
+    for df in dfs[1:]:
+        # skip merging meta if element is a pd.DataFrame
+        _meta_merge = False if isinstance(df, pd.DataFrame) else True
+        df = IamDataFrame(df, **kwargs) if not isinstance(df, IamDataFrame) else df
+
+        if df.time_col != time_col:
+            raise ValueError("Items have incompatible time format ('year' vs. 'time')!")
+
+        if df._data.index.names != index:
+            raise ValueError(
+                "Items have incompatible timeseries data index dimensions!"
+            )
+
+        ret_data.append(df._data)
+        if _meta_merge:
+            ret_meta = merge_meta(ret_meta, df.meta, ignore_meta_conflict)
 
-    verify_index_integrity(_df._data)
-    return _df
+    # return as new IamDataFrame, this will verify integrity as part of `__init__()`
+    return IamDataFrame(pd.concat(ret_data, verify_integrity=False), meta=ret_meta)
 
 
 def read_datapackage(path, data="data", meta="meta"):

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -148,8 +148,9 @@ def test_df_year():
 # minimal IamDataFrame for specifically testing 'time'-column features
 @pytest.fixture(scope="function")
 def test_df_time():
-    tdf = TEST_DF.rename({2005: TEST_DTS[0], 2010: TEST_DTS[1]}, axis="columns")
-    df = IamDataFrame(data=tdf)
+    df = IamDataFrame(
+        data=TEST_DF.rename({2005: TEST_DTS[0], 2010: TEST_DTS[1]}, axis="columns")
+    )
     for i in META_COLS:
         df.set_meta(META_DF[i])
     yield df

diff --git a/tests/test_core.py b/tests/test_core.py
@@ -1027,12 +1027,37 @@ def test_pd_join_by_meta_nonmatching_index(test_df):
     pd.testing.assert_frame_equal(obs.sort_index(level=1), exp)
 
 
-def test_concat_fails_iter():
-    pytest.raises(TypeError, concat, 1)
+def test_concat_fails_iterable(test_pd_df):
+    """Check that calling concat with a non-iterable raises"""
+    msg = "First argument must be an iterable, you passed an object of type '{}'!"
 
+    for dfs, type_ in [(1, "int"), ("foo", "str"), (test_pd_df, "DataFrame")]:
+        with pytest.raises(TypeError, match=msg.format(type_)):
+            concat(dfs)
 
-def test_concat_fails_notdf():
-    pytest.raises(TypeError, concat, "foo")
+
+def test_concat_single_item(test_df):
+    """Check that calling concat on a single-item list returns identical object"""
+    obs = concat([test_df])
+    assert_iamframe_equal(obs, test_df)
+
+
+def test_concat_incompatible_time(test_df_year, test_df_time):
+    """Check that calling concat with incompatible time formats raises"""
+    match = re.escape("Items have incompatible time format ('year' vs. 'time')!")
+    with pytest.raises(ValueError, match=match):
+        concat([test_df_year, test_df_time])
+
+
+def test_concat_incompatible_cols(test_pd_df):
+    """Check that calling concat with incompatible index dimensions raises"""
+    df1 = IamDataFrame(test_pd_df)
+    test_pd_df["extra_col"] = "foo"
+    df2 = IamDataFrame(test_pd_df)
+
+    match = "Items have incompatible timeseries data index dimensions!"
+    with pytest.raises(ValueError, match=match):
+        concat([df1, df2])
 
 
 def test_concat(test_df):