Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Don't warn if default conflicts with dialect #23775

Merged
merged 1 commit into from
Nov 19, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1384,6 +1384,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
- Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`)
- Bug in :func:`read_csv()` in which incorrect error messages were being raised when ``skipfooter`` was passed in along with ``nrows``, ``iterator``, or ``chunksize`` (:issue:`23711`)
- Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`)
- Bug in :meth:`read_csv()` in which unnecessary warnings were being raised when the dialect's values conflicted with the default arguments (:issue:`23761`)
- Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`)
- Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`)
- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`20480`)
Expand Down
47 changes: 37 additions & 10 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,24 @@ def parser_f(filepath_or_buffer,
if sep is False:
sep = default_sep

# gh-23761
#
# When a dialect is passed, it overrides any of the overlapping
# parameters passed in directly. We don't want to warn if the
# default parameters were passed in (since it probably means
# that the user didn't pass them in explicitly in the first place).
#
# "delimiter" is the annoying corner case because we alias it to
# "sep" before doing comparison to the dialect values later on.
# Thus, we need a flag to indicate that we need to "override"
# the comparison to dialect values by checking if default values
# for BOTH "delimiter" and "sep" were provided.
if dialect is not None:
sep_override = delimiter is None and sep == default_sep
kwds = dict(sep_override=sep_override)
else:
kwds = dict()

# Alias sep -> delimiter.
if delimiter is None:
delimiter = sep
Expand All @@ -647,7 +665,7 @@ def parser_f(filepath_or_buffer,
engine = 'c'
engine_specified = False

kwds = dict(delimiter=delimiter,
kwds.update(delimiter=delimiter,
engine=engine,
dialect=dialect,
compression=compression,
Expand Down Expand Up @@ -769,18 +787,27 @@ def __init__(self, f, engine=None, **kwds):
except AttributeError:
raise ValueError("Invalid dialect '{dialect}' provided"
.format(dialect=kwds['dialect']))
provided = kwds.get(param, _parser_defaults[param])
parser_default = _parser_defaults[param]
provided = kwds.get(param, parser_default)

# Messages for conflicting values between the dialect instance
# and the actual parameters provided.
# Messages for conflicting values between the dialect
# instance and the actual parameters provided.
conflict_msgs = []

if dialect_val != provided:
conflict_msgs.append((
"Conflicting values for '{param}': '{val}' was "
"provided, but the dialect specifies '{diaval}'. "
"Using the dialect-specified value.".format(
param=param, val=provided, diaval=dialect_val)))
# Don't warn if the default parameter was passed in,
# even if it conflicts with the dialect (gh-23761).
if provided != parser_default and provided != dialect_val:
msg = ("Conflicting values for '{param}': '{val}' was "
"provided, but the dialect specifies '{diaval}'. "
"Using the dialect-specified value.".format(
param=param, val=provided, diaval=dialect_val))

# Annoying corner case for not warning about
# conflicts between dialect and delimiter parameter.
gfyoung marked this conversation as resolved.
Show resolved Hide resolved
# Refer to the outer "_read_" function for more info.
if not (param == "delimiter" and
kwds.pop("sep_override", False)):
conflict_msgs.append(msg)

if conflict_msgs:
warnings.warn('\n\n'.join(conflict_msgs), ParserWarning,
Expand Down
84 changes: 64 additions & 20 deletions pandas/tests/io/parser/test_dialect.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@
import pandas.util.testing as tm


@pytest.fixture
def custom_dialect():
dialect_name = "weird"
dialect_kwargs = dict(doublequote=False, escapechar="~", delimiter=":",
skipinitialspace=False, quotechar="~", quoting=3)
return dialect_name, dialect_kwargs


def test_dialect(all_parsers):
parser = all_parsers
data = """\
Expand All @@ -26,10 +34,7 @@ def test_dialect(all_parsers):

dia = csv.excel()
dia.quoting = csv.QUOTE_NONE

# Conflicting dialect quoting.
with tm.assert_produces_warning(ParserWarning):
df = parser.read_csv(StringIO(data), dialect=dia)
df = parser.read_csv(StringIO(data), dialect=dia)

data = """\
label1,label2,label3
Expand All @@ -53,14 +58,10 @@ def test_dialect_str(all_parsers):
"fruit": ["apple", "pear"],
"vegetable": ["broccoli", "tomato"]
})
csv.register_dialect(dialect_name, delimiter=":")

# Conflicting dialect delimiter.
with tm.assert_produces_warning(ParserWarning):
with tm.with_csv_dialect(dialect_name, delimiter=":"):
df = parser.read_csv(StringIO(data), dialect=dialect_name)

tm.assert_frame_equal(df, exp)
csv.unregister_dialect(dialect_name)
tm.assert_frame_equal(df, exp)


def test_invalid_dialect(all_parsers):
Expand All @@ -75,17 +76,60 @@ class InvalidDialect(object):
parser.read_csv(StringIO(data), dialect=InvalidDialect)


@pytest.mark.parametrize("delimiter", [",", "."])
def test_dialect_conflict(all_parsers, delimiter):
data = "a,b\n1,2"
dialect = "excel"
@pytest.mark.parametrize("arg", [None, "doublequote", "escapechar",
"skipinitialspace", "quotechar", "quoting"])
@pytest.mark.parametrize("value", ["dialect", "default", "other"])
def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect,
arg, value):
# see gh-23761.
dialect_name, dialect_kwargs = custom_dialect
parser = all_parsers

expected = DataFrame({"a": [1], "b": [2]})
data = "a:b\n1:2"

warning_klass = None
kwds = dict()

# arg=None tests when we pass in the dialect without any other arguments.
if arg is not None:
if "value" == "dialect": # No conflict --> no warning.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

presumably this was mean to be if value == "dialect"? fortunately pylint has caught this 😄 https://github.com/pandas-dev/pandas/pull/49502/files

kwds[arg] = dialect_kwargs[arg]
elif "value" == "default": # Default --> no warning.
from pandas.io.parsers import _parser_defaults
kwds[arg] = _parser_defaults[arg]
else: # Non-default + conflict with dialect --> warning.
warning_klass = ParserWarning
kwds[arg] = "blah"

with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
with tm.assert_produces_warning(warning_klass):
result = parser.read_csv(StringIO(data),
dialect=dialect_name, **kwds)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs,warning_klass", [
(dict(sep=","), None), # sep is default --> sep_override=True
(dict(sep="."), ParserWarning), # sep isn't default --> sep_override=False
(dict(delimiter=":"), None), # No conflict
(dict(delimiter=None), None), # Default arguments --> sep_override=True
(dict(delimiter=","), ParserWarning), # Conflict
(dict(delimiter="."), ParserWarning), # Conflict
], ids=["sep-override-true", "sep-override-false",
"delimiter-no-conflict", "delimiter-default-arg",
"delimiter-conflict", "delimiter-conflict2"])
def test_dialect_conflict_delimiter(all_parsers, custom_dialect,
kwargs, warning_klass):
# see gh-23761.
dialect_name, dialect_kwargs = custom_dialect
parser = all_parsers

expected = DataFrame({"a": [1], "b": [2]})
warning_klass = None if delimiter == "," else ParserWarning
data = "a:b\n1:2"

with tm.assert_produces_warning(warning_klass):
result = parser.read_csv(StringIO(data),
delimiter=delimiter,
dialect=dialect)
tm.assert_frame_equal(result, expected)
with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
with tm.assert_produces_warning(warning_klass):
result = parser.read_csv(StringIO(data),
dialect=dialect_name, **kwargs)
tm.assert_frame_equal(result, expected)
31 changes: 31 additions & 0 deletions pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2835,6 +2835,37 @@ def __exit__(self, exc_type, exc_value, traceback):
np.random.set_state(self.start_state)


@contextmanager
def with_csv_dialect(name, **kwargs):
"""
Context manager to temporarily register a CSV dialect for parsing CSV.

Parameters
----------
name : str
The name of the dialect.
kwargs : mapping
The parameters for the dialect.

Raises
------
ValueError : the name of the dialect conflicts with a builtin one.

See Also
--------
csv : Python's CSV library.
"""
import csv
_BUILTIN_DIALECTS = {"excel", "excel-tab", "unix"}

if name in _BUILTIN_DIALECTS:
raise ValueError("Cannot override builtin dialect.")

csv.register_dialect(name, **kwargs)
yield
csv.unregister_dialect(name)


@contextmanager
def use_numexpr(use, min_elements=None):
from pandas.core.computation import expressions as expr
Expand Down