From 7a3f81a34507a38e4a69fbf8d80f2ca95fc610dc Mon Sep 17 00:00:00 2001 From: Aaron Critchley Date: Sun, 3 Dec 2017 15:26:50 +0000 Subject: [PATCH] ENH: Better error message if usecols doesn't match columns (#17310) --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/io/parsers.py | 42 ++++++++++++++++++++++++++++--- pandas/tests/io/parser/usecols.py | 21 ++++++++++------ 3 files changed, 53 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 09b504cac5ed4..af580403aa4b2 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -76,6 +76,7 @@ Other Enhancements - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) - :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`) +- Improved wording of ``ValueError`` raised in :func:`read_csv` when the ``usecols`` argument cannot match all columns. (:issue:`17301`) .. _whatsnew_0220.api_breaking: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index fe50b551ea948..83b1d8ec1a070 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1141,6 +1141,38 @@ def _evaluate_usecols(usecols, names): return usecols +def _validate_usecols_names(usecols, names): + """ + Validates that all usecols are present in a given + list of names. If not, raise a ValueError that + shows what usecols are missing. + + Parameters + ---------- + usecols : iterable of usecols + The columns to validate are present in names. + names : iterable of names + The column names to check against. + + Returns + ------- + usecols : iterable of usecols + The `usecols` parameter if the validation succeeds. + + Raises + ------ + ValueError : Columns were missing. Error message will list them. + """ + missing = [c for c in usecols if c not in names] + if len(missing) > 0: + raise ValueError( + "Usecols do not match columns, " + "columns expected but not found: {missing}".format(missing=missing) + ) + + return usecols + + def _validate_skipfooter_arg(skipfooter): """ Validate the 'skipfooter' parameter. @@ -1753,14 +1785,14 @@ def __init__(self, src, **kwds): # GH 14671 if (self.usecols_dtype == 'string' and not set(usecols).issubset(self.orig_names)): - raise ValueError("Usecols do not match names.") + _validate_usecols_names(usecols, self.orig_names) if len(self.names) > len(usecols): self.names = [n for i, n in enumerate(self.names) if (i in usecols or n in usecols)] if len(self.names) < len(usecols): - raise ValueError("Usecols do not match names.") + _validate_usecols_names(usecols, self.names) self._set_noconvert_columns() @@ -2532,9 +2564,13 @@ def _handle_usecols(self, columns, usecols_key): raise ValueError("If using multiple headers, usecols must " "be integers.") col_indices = [] + for col in self.usecols: if isinstance(col, string_types): - col_indices.append(usecols_key.index(col)) + try: + col_indices.append(usecols_key.index(col)) + except ValueError: + _validate_usecols_names(self.usecols, usecols_key) else: col_indices.append(col) else: diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py index f582e5037ca07..0fa53e6288bda 100644 --- a/pandas/tests/io/parser/usecols.py +++ b/pandas/tests/io/parser/usecols.py @@ -480,10 +480,10 @@ def test_raise_on_usecols_names_mismatch(self): # GH 14671 data = 'a,b,c,d\n1,2,3,4\n5,6,7,8' - if self.engine == 'c': - msg = 'Usecols do not match names' - else: - msg = 'is not in list' + msg = ( + "Usecols do not match columns, " + "columns expected but not found: {missing}" + ) usecols = ['a', 'b', 'c', 'd'] df = self.read_csv(StringIO(data), usecols=usecols) @@ -492,11 +492,16 @@ def test_raise_on_usecols_names_mismatch(self): tm.assert_frame_equal(df, expected) usecols = ['a', 'b', 'c', 'f'] - with tm.assert_raises_regex(ValueError, msg): + with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): self.read_csv(StringIO(data), usecols=usecols) usecols = ['a', 'b', 'f'] - with tm.assert_raises_regex(ValueError, msg): + with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): + self.read_csv(StringIO(data), usecols=usecols) + + usecols = ['a', 'b', 'f', 'g'] + with tm.assert_raises_regex( + ValueError, msg.format(missing="\[('f', 'g'|'g', 'f')\]")): self.read_csv(StringIO(data), usecols=usecols) names = ['A', 'B', 'C', 'D'] @@ -520,9 +525,9 @@ def test_raise_on_usecols_names_mismatch(self): # tm.assert_frame_equal(df, expected) usecols = ['A', 'B', 'C', 'f'] - with tm.assert_raises_regex(ValueError, msg): + with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): self.read_csv(StringIO(data), header=0, names=names, usecols=usecols) usecols = ['A', 'B', 'f'] - with tm.assert_raises_regex(ValueError, msg): + with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): self.read_csv(StringIO(data), names=names, usecols=usecols)