API: warning to raise KeyError in the future if not all elements of a…

… list are selected via .loc closes #15747
pandas-dev · Aug 23, 2017 · 8509dc6 · 8509dc6
1 parent 66ec5f3
commit 8509dc6
Show file tree

Hide file tree

Showing 14 changed files with 295 additions and 56 deletions.
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
@@ -667,6 +667,72 @@ For getting *multiple* indexers, using ``.get_indexer``
   dfd.iloc[[0, 2], dfd.columns.get_indexer(['A', 'B'])]
 
 
+.. _indexing.deprecate_loc_reindex_listlike:
+
+Using loc with missing keys in a list is Deprecated
+---------------------------------------------------
+
+.. warning::
+
+   Starting in 0.21.0, using ``.loc`` with a list-like containing missing keys is deprecated, in favor of ``.reindex``.
+
+In prior versions, using ``.loc[list-of-keys]`` would work as long as *at least 1* of the keys was found (otherwise it
+would raise a ``KeyError``). This behavior is deprecated and will show a warning message pointing to this section. The
+recommended alternative is to use ``.reindex()``.
+
+For example.
+
+.. ipython:: python
+
+   s = Series([1, 2, 3])
+   s
+
+Selection with all keys found is unchanged.
+
+.. ipython:: python
+
+   s.loc[[1, 2]]
+
+Previous Behavior
+
+.. code-block:: ipython
+
+
+   In [4]: s.loc[[1, 2, 3]]
+   Out[4]:
+   1    2.0
+   2    3.0
+   3    NaN
+   dtype: float64
+
+
+Current Behavior
+
+.. code-block:: ipython
+
+   In [4]: s.loc[[1, 2, 3]]
+   /Users/jreback/miniconda3/envs/pandas/bin/ipython:1: FutureWarning: passing list-likes to .loc with any non-matching elements will raise
+   KeyError in the future, you can use .reindex() as an alternative
+     #!/Users/jreback/miniconda3/envs/pandas/bin/python
+   Out[4]:
+   1    2.0
+   2    3.0
+   3    NaN
+   dtype: float64
+
+The idiomatic way to select potentially not-found elements is via ``.reindex()``.
+
+.. ipython:: python
+
+  s.reindex([1, 2, 3])
+
+
+Alternatively, if you want to select only *valid* keys, the following is idiomatic; furthermore this is more efficient, and is guaranteed to preserve the dtype of the selection.
+
+.. ipython:: python
+
+   keys = [1, 2, 3]
+   s.loc[s.index.intersection(keys)]
+
+
 .. _indexing.basics.partial_setting:
 
 Selecting Random Samples

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -158,6 +158,63 @@ We have updated our minimum supported versions of dependencies (:issue:`15206`,
    | Bottleneck   | 1.0.0           |          |
    +--------------+-----------------+----------+
 
+.. _whatsnew_0210.api_breaking.loc:
+
+.loc with a list-like containing missing keys is Deprecated
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Previously, selecting at least 1 valid key with a list-like indexer would succeed and return ``NaN`` for non-found elements.
+This is exactly the function of ``.reindex()``. This will now show a ``FutureWarning`` message; in the future this will raise ``KeyError`` (:issue:`15747`).
+See the :ref:`deprecation docs <indexing.deprecate_loc_reindex_listlike>`.
+
+
+.. ipython:: python
+
+   s = Series([1, 2, 3])
+   s
+
+Selection with all keys found is unchanged.
+
+.. ipython:: python
+
+   s.loc[[1, 2]]
+
+Previous Behavior
+
+.. code-block:: ipython
+
+
+   In [4]: s.loc[[1, 2, 3]]
+   Out[4]:
+   1    2.0
+   2    3.0
+   3    NaN
+   dtype: float64
+
+
+Current Behavior
+
+.. code-block:: ipython
+
+   In [4]: s.loc[[1, 2, 3]]
+   /Users/jreback/miniconda3/envs/pandas/bin/ipython:1: FutureWarning:
+   Passing list-likes to .loc with any non-matching elements will raise
+   KeyError in the future, you can use .reindex() as an alternative.
+
+   See the documentation here:
+   http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
+
+   Out[4]:
+   1    2.0
+   2    3.0
+   3    NaN
+   dtype: float64
+
+The idiomatic way to select potentially not-found elements is via ``.reindex()``.
+
+.. ipython:: python
+
+  s.reindex([1, 2, 3])
+
+
 .. _whatsnew_0210.api_breaking.pandas_eval:
 
 Improved error handling during item assignment in pd.eval

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -1417,12 +1417,35 @@ def _has_valid_type(self, key, axis):
             if isinstance(key, tuple) and isinstance(ax, MultiIndex):
                 return True
 
-            # TODO: don't check the entire key unless necessary
-            if (not is_iterator(key) and len(key) and
-                    np.all(ax.get_indexer_for(key) < 0)):
+            if not is_iterator(key) and len(key):
 
-                raise KeyError("None of [%s] are in the [%s]" %
-                               (key, self.obj._get_axis_name(axis)))
+                # True indicates missing values
+                missing = ax.get_indexer_for(key) < 0
+
+                if np.any(missing):
+                    if len(key) == 1 or np.all(missing):
+                        raise KeyError("None of [%s] are in the [%s]" %
+                                       (key, self.obj._get_axis_name(axis)))
+
+                    else:
+
+                        # we skip the warning on Categorical/Interval
+                        # as this check is actually done (check for
+                        # non-missing values), but a bit later in the
+                        # code, so we want to avoid warning & then
+                        # just raising
+                        _missing_key_warning = textwrap.dedent("""
+                        Passing list-likes to .loc with any non-matching elements will raise
+                        KeyError in the future, you can use .reindex() as an alternative.
+
+                        See the documentation here:
+                        http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike""")  # noqa
+
+                        if not (ax.is_categorical() or ax.is_interval()):
+                            warnings.warn(_missing_key_warning,
+                                          FutureWarning, stacklevel=5)
+
+                return True
 
             return True
 

diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py
@@ -353,7 +353,15 @@ def __init__(self, df, na_rep='', float_format=None, cols=None,
             self.styler = None
         self.df = df
         if cols is not None:
-            self.df = df.loc[:, cols]
+
+            # all missing, raise
+            if not len(Index(cols) & df.columns):
+                raise KeyError
+
+            # 1 missing is ok
+            # TODO(jreback): this should raise
+            # on *any* missing columns
+            self.df = df.reindex(columns=cols)
         self.columns = self.df.columns
         self.float_format = float_format
         self.index = index

diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
@@ -111,7 +111,8 @@ def test_loc_listlike(self):
         assert_frame_equal(result, expected, check_index_type=True)
 
         # not all labels in the categories
-        pytest.raises(KeyError, lambda: self.df2.loc[['a', 'd']])
+        with pytest.raises(KeyError):
+            self.df2.loc[['a', 'd']]
 
     def test_loc_listlike_dtypes(self):
         # GH 11586

diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py
@@ -223,7 +223,7 @@ def test_series_partial_set_datetime(self):
                 Timestamp('2011-01-03')]
         exp = Series([np.nan, 0.2, np.nan],
                      index=pd.DatetimeIndex(keys, name='idx'), name='s')
-        tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True)
+        tm.assert_series_equal(ser.reindex(keys), exp, check_index_type=True)
 
     def test_series_partial_set_period(self):
         # GH 11497
@@ -248,5 +248,5 @@ def test_series_partial_set_period(self):
                 pd.Period('2011-01-03', freq='D')]
         exp = Series([np.nan, 0.2, np.nan],
                      index=pd.PeriodIndex(keys, name='idx'), name='s')
-        result = ser.loc[keys]
+        result = ser.reindex(keys)
         tm.assert_series_equal(result, exp)
diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
@@ -617,7 +617,8 @@ def test_iloc_non_unique_indexing(self):
         expected = DataFrame(new_list)
         expected = pd.concat([expected, DataFrame(index=idx[idx > sidx.max()])
                               ])
-        result = df2.loc[idx]
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = df2.loc[idx]
         tm.assert_frame_equal(result, expected, check_index_type=False)
 
     def test_iloc_empty_list_indexer_is_ok(self):

diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
@@ -176,7 +176,8 @@ def test_dups_fancy_indexing(self):
              'test1': [7., 6, np.nan],
              'other': ['d', 'c', np.nan]}, index=rows)
 
-        result = df.loc[rows]
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = df.loc[rows]
         tm.assert_frame_equal(result, expected)
 
         # see GH5553, make sure we use the right indexer
@@ -186,7 +187,8 @@ def test_dups_fancy_indexing(self):
                               'other': [np.nan, np.nan, np.nan,
                                         'd', 'c', np.nan]},
                              index=rows)
-        result = df.loc[rows]
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = df.loc[rows]
         tm.assert_frame_equal(result, expected)
 
         # inconsistent returns for unique/duplicate indices when values are
@@ -203,20 +205,23 @@ def test_dups_fancy_indexing(self):
 
         # GH 4619; duplicate indexer with missing label
         df = DataFrame({"A": [0, 1, 2]})
-        result = df.loc[[0, 8, 0]]
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = df.loc[[0, 8, 0]]
         expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
         tm.assert_frame_equal(result, expected, check_index_type=False)
 
         df = DataFrame({"A": list('abc')})
-        result = df.loc[[0, 8, 0]]
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = df.loc[[0, 8, 0]]
         expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
         tm.assert_frame_equal(result, expected, check_index_type=False)
 
         # non unique with non unique selector
         df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
         expected = DataFrame(
             {'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E'])
-        result = df.loc[['A', 'A', 'E']]
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = df.loc[['A', 'A', 'E']]
         tm.assert_frame_equal(result, expected)
 
         # GH 5835
@@ -227,7 +232,8 @@ def test_dups_fancy_indexing(self):
         expected = pd.concat(
             [df.loc[:, ['A', 'B']], DataFrame(np.nan, columns=['C'],
                                               index=df.index)], axis=1)
-        result = df.loc[:, ['A', 'B', 'C']]
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = df.loc[:, ['A', 'B', 'C']]
         tm.assert_frame_equal(result, expected)
 
         # GH 6504, multi-axis indexing

diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
@@ -152,12 +152,15 @@ def test_loc_getitem_label_list(self):
                           [Timestamp('20130102'), Timestamp('20130103')],
                           typs=['ts'], axes=0)
 
+    def test_loc_getitem_label_list_with_missing(self):
         self.check_result('list lbl', 'loc', [0, 1, 2], 'indexer', [0, 1, 2],
                           typs=['empty'], fails=KeyError)
-        self.check_result('list lbl', 'loc', [0, 2, 3], 'ix', [0, 2, 3],
-                          typs=['ints', 'uints'], axes=0, fails=KeyError)
-        self.check_result('list lbl', 'loc', [3, 6, 7], 'ix', [3, 6, 7],
-                          typs=['ints', 'uints'], axes=1, fails=KeyError)
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            self.check_result('list lbl', 'loc', [0, 2, 3], 'ix', [0, 2, 3],
+                              typs=['ints', 'uints'], axes=0, fails=KeyError)
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            self.check_result('list lbl', 'loc', [3, 6, 7], 'ix', [3, 6, 7],
+                              typs=['ints', 'uints'], axes=1, fails=KeyError)
         self.check_result('list lbl', 'loc', [4, 8, 10], 'ix', [4, 8, 10],
                           typs=['ints', 'uints'], axes=2, fails=KeyError)
 
@@ -249,7 +252,7 @@ def test_loc_to_fail(self):
         pytest.raises(KeyError, lambda: s.loc[['4']])
 
         s.loc[-1] = 3
-        result = s.loc[[-1, -2]]
+        result = s.reindex([-1, -2])
         expected = Series([3, np.nan], index=[-1, -2])
         tm.assert_series_equal(result, expected)
 
@@ -277,6 +280,23 @@ def f():
 
         pytest.raises(KeyError, f)
 
+    def test_loc_getitem_list_with_fail(self):
+        # 15747
+        # should KeyError if *any* missing labels
+
+        s = Series([1, 2, 3])
+
+        s.loc[[2]]
+
+        with pytest.raises(KeyError):
+            s.loc[[3]]
+
+        # a non-match and a match
+        with tm.assert_produces_warning(FutureWarning):
+            expected = s.loc[[2, 3]]
+        result = s.reindex([2, 3])
+        tm.assert_series_equal(result, expected)
+
     def test_loc_getitem_label_slice(self):
 
         # label slices (with ints)