diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 4e1e9ce0174087..c1e02bd8eafc4e 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,3 +1,27 @@ +Checklist for the pandas documentation sprint (ignore this if you are doing +an unrelated PR): + +- [ ] PR title is "DOC: update the docstring" +- [ ] The validation script passes: `scripts/validate_docstrings.py ` +- [ ] The PEP8 style check passes: `git diff upstream/master -u -- "*.py" | flake8 --diff` +- [ ] The html version looks good: `python doc/make.py --single ` +- [ ] It has been proofread on language by another sprint participant + +Please include the output of the validation script below between the "```" ticks: + +``` +# paste output of "scripts/validate_docstrings.py " here +# between the "```" (remove this comment, but keep the "```") + +``` + +If the validation script still gives errors, but you think there is a good reason +to deviate in this case (and there are certainly such cases), please state this +explicitly. + + +Checklist for other PRs (remove this part if you are doing a PR for the pandas documentation sprint): + - [ ] closes #xxxx - [ ] tests added / passed - [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff` diff --git a/.gitignore b/.gitignore index 0d4e8c6fb75a60..00dac6e336c37e 100644 --- a/.gitignore +++ b/.gitignore @@ -88,8 +88,9 @@ scikits *.c *.cpp -# Performance Testing # -####################### +# Unit / Performance Testing # +############################## +.pytest_cache/ asv_bench/env/ asv_bench/html/ asv_bench/results/ diff --git a/README.md b/README.md index 4b9c9505e320ae..86cf95508a5d9e 100644 --- a/README.md +++ b/README.md @@ -216,13 +216,16 @@ Further, general questions and discussions can also take place on the [pydata ma ## Discussion and Development Most development discussion is taking place on github in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions. -## Contributing to pandas +## Contributing to pandas [![Open Source Helpers](https://www.codetriage.com/pandas-dev/pandas/badges/users.svg)](https://www.codetriage.com/pandas-dev/pandas) + All contributions, bug reports, bug fixes, documentation improvements, enhancements and ideas are welcome. A detailed overview on how to contribute can be found in the **[contributing guide.](https://pandas.pydata.org/pandas-docs/stable/contributing.html)** If you are simply looking to start working with the pandas codebase, navigate to the [GitHub “issues” tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [Difficulty Novice](https://github.com/pandas-dev/pandas/issues?q=is%3Aopen+is%3Aissue+label%3A%22Difficulty+Novice%22) where you could start out. +You can also triage issues which may include reproducing bug reports, or asking for vital information such as version numbers or reproduction instructions. If you would like to start triaging issues, one easy way to get started is to [subscribe to pandas on CodeTriage](https://www.codetriage.com/pandas-dev/pandas). 
+ Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’...you can do something about it! Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas). diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index c347442784d415..77773220719573 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -11,6 +11,16 @@ from .pandas_vb_common import setup # noqa +method_blacklist = { + 'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean', + 'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min', + 'var', 'mad', 'describe', 'std'}, + 'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew', + 'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe', + 'std'} +} + + class ApplyDictReturn(object): goal_time = 0.2 @@ -83,45 +93,6 @@ def time_series_groups(self, data, key): self.ser.groupby(self.ser).groups -class FirstLast(object): - - goal_time = 0.2 - - param_names = ['dtype'] - params = ['float32', 'float64', 'datetime', 'object'] - - def setup(self, dtype): - N = 10**5 - # with datetimes (GH7555) - if dtype == 'datetime': - self.df = DataFrame({'values': date_range('1/1/2011', - periods=N, - freq='s'), - 'key': range(N)}) - elif dtype == 'object': - self.df = DataFrame({'values': ['foo'] * N, - 'key': range(N)}) - else: - labels = np.arange(N / 10).repeat(10) - data = Series(np.random.randn(len(labels)), dtype=dtype) - data[::3] = np.nan - data[1::3] = np.nan - labels = labels.take(np.random.permutation(len(labels))) - self.df = DataFrame({'values': data, 'key': labels}) - - def time_groupby_first(self, dtype): - self.df.groupby('key').first() - - def time_groupby_last(self, dtype): - self.df.groupby('key').last() - - def time_groupby_nth_all(self, dtype): - self.df.groupby('key').nth(0, dropna='all') - - def time_groupby_nth_none(self, dtype): - self.df.groupby('key').nth(0) - - class GroupManyLabels(object): goal_time = 0.2 @@ -142,38 +113,40 @@ class Nth(object): goal_time = 0.2 - def setup_cache(self): - df = DataFrame(np.random.randint(1, 100, (10000, 2))) - df.iloc[1, 1] = np.nan - return df - - def time_frame_nth_any(self, df): - df.groupby(0).nth(0, dropna='any') - - def time_frame_nth(self, df): - df.groupby(0).nth(0) + param_names = ['dtype'] + params = ['float32', 'float64', 'datetime', 'object'] - def time_series_nth_any(self, df): - df[1].groupby(df[0]).nth(0, dropna='any') + def setup(self, dtype): + N = 10**5 + # with datetimes (GH7555) + if dtype == 'datetime': + values = date_range('1/1/2011', periods=N, freq='s') + elif dtype == 'object': + values = ['foo'] * N + else: + values = np.arange(N).astype(dtype) - def time_series_nth(self, df): - df[1].groupby(df[0]).nth(0) + key = np.arange(N) + self.df = DataFrame({'key': key, 'values': values}) + self.df.iloc[1, 1] = np.nan # insert missing data + def time_frame_nth_any(self, dtype): + self.df.groupby('key').nth(0, dropna='any') -class NthObject(object): + def time_groupby_nth_all(self, dtype): + self.df.groupby('key').nth(0, dropna='all') - goal_time = 0.2 + def time_frame_nth(self, dtype): + self.df.groupby('key').nth(0) - def setup_cache(self): - df = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g']) - df['obj'] = ['a'] * 5000 + ['b'] * 5000 - return df + def time_series_nth_any(self, dtype): + 
self.df['values'].groupby(self.df['key']).nth(0, dropna='any') - def time_nth(self, df): - df.groupby('g').nth(5) + def time_series_nth_all(self, dtype): + self.df['values'].groupby(self.df['key']).nth(0, dropna='all') - def time_nth_last(self, df): - df.groupby('g').last() + def time_series_nth(self, dtype): + self.df['values'].groupby(self.df['key']).nth(0) class DateAttributes(object): @@ -235,7 +208,7 @@ def time_multi_count(self, df): df.groupby(['key1', 'key2']).count() -class CountInt(object): +class CountMultiInt(object): goal_time = 0.2 @@ -247,10 +220,10 @@ def setup_cache(self): 'ints2': np.random.randint(0, 1000, size=n)}) return df - def time_int_count(self, df): + def time_multi_int_count(self, df): df.groupby(['key1', 'key2']).count() - def time_int_nunique(self, df): + def time_multi_int_nunique(self, df): df.groupby(['key1', 'key2']).nunique() @@ -258,7 +231,7 @@ class AggFunctions(object): goal_time = 0.2 - def setup_cache(self): + def setup_cache(): N = 10**5 fac1 = np.array(['A', 'B', 'C'], dtype='O') fac2 = np.array(['one', 'two'], dtype='O') @@ -353,9 +326,6 @@ def setup(self): def time_multi_size(self): self.df.groupby(['key1', 'key2']).size() - def time_dt_size(self): - self.df.groupby(['dates']).size() - def time_dt_timegrouper_size(self): with warnings.catch_warnings(record=True): self.df.groupby(TimeGrouper(key='dates', freq='M')).size() @@ -368,30 +338,51 @@ class GroupByMethods(object): goal_time = 0.2 - param_names = ['dtype', 'method'] - params = [['int', 'float'], + param_names = ['dtype', 'method', 'application'] + params = [['int', 'float', 'object', 'datetime'], ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head', 'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique', 'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew', - 'std', 'sum', 'tail', 'unique', 'value_counts', 'var']] + 'std', 'sum', 'tail', 'unique', 'value_counts', 'var'], + ['direct', 'transformation']] - def setup(self, dtype, method): + def setup(self, dtype, method, application): + if method in method_blacklist.get(dtype, {}): + raise NotImplementedError # skip benchmark ngroups = 1000 size = ngroups * 2 rng = np.arange(ngroups) values = rng.take(np.random.randint(0, ngroups, size=size)) if dtype == 'int': key = np.random.randint(0, size, size=size) - else: + elif dtype == 'float': key = np.concatenate([np.random.random(ngroups) * 0.1, np.random.random(ngroups) * 10.0]) + elif dtype == 'object': + key = ['foo'] * size + elif dtype == 'datetime': + key = date_range('1/1/2011', periods=size, freq='s') df = DataFrame({'values': values, 'key': key}) - self.df_groupby_method = getattr(df.groupby('key')['values'], method) - def time_method(self, dtype, method): - self.df_groupby_method() + if application == 'transformation': + if method == 'describe': + raise NotImplementedError + + self.as_group_method = lambda: df.groupby( + 'key')['values'].transform(method) + self.as_field_method = lambda: df.groupby( + 'values')['key'].transform(method) + else: + self.as_group_method = getattr(df.groupby('key')['values'], method) + self.as_field_method = getattr(df.groupby('values')['key'], method) + + def time_dtype_as_group(self, dtype, method, application): + self.as_group_method() + + def time_dtype_as_field(self, dtype, method, application): + self.as_field_method() class Float32(object): diff --git a/ci/environment-dev.yaml b/ci/environment-dev.yaml index c72abd0c19516f..1337fc54e9aac8 100644 --- a/ci/environment-dev.yaml
+++ b/ci/environment-dev.yaml @@ -5,6 +5,7 @@ channels: dependencies: - Cython - NumPy + - flake8 - moto - pytest>=3.1 - python-dateutil>=2.5.0 diff --git a/ci/requirements-3.6_DOC.run b/ci/requirements-3.6_DOC.run index 084f38ce17eb27..fa9cab32c0ac21 100644 --- a/ci/requirements-3.6_DOC.run +++ b/ci/requirements-3.6_DOC.run @@ -5,7 +5,7 @@ sphinx nbconvert nbformat notebook -matplotlib +matplotlib=2.1* seaborn scipy lxml diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt index 82f8de277c57bd..fcbe0da5de305d 100644 --- a/ci/requirements_dev.txt +++ b/ci/requirements_dev.txt @@ -2,9 +2,10 @@ # Do not modify directly Cython NumPy +flake8 moto pytest>=3.1 python-dateutil>=2.5.0 pytz setuptools>=3.3 -sphinx +sphinx \ No newline at end of file diff --git a/doc/make.py b/doc/make.py index e3cb29aa3e0867..4967f30453fd18 100755 --- a/doc/make.py +++ b/doc/make.py @@ -11,12 +11,14 @@ $ python make.py html $ python make.py latex """ +import importlib import sys import os import shutil -import subprocess +# import subprocess import argparse from contextlib import contextmanager +import webbrowser import jinja2 @@ -26,28 +28,6 @@ BUILD_DIRS = ['doctrees', 'html', 'latex', 'plots', '_static', '_templates'] -def _generate_index(include_api, single_doc=None): - """Create index.rst file with the specified sections. - - Parameters - ---------- - include_api : bool - Whether API documentation will be built. - single_doc : str or None - If provided, this single documentation page will be generated. - """ - if single_doc is not None: - single_doc = os.path.splitext(os.path.basename(single_doc))[0] - include_api = False - - with open(os.path.join(SOURCE_PATH, 'index.rst.template')) as f: - t = jinja2.Template(f.read()) - - with open(os.path.join(SOURCE_PATH, 'index.rst'), 'w') as f: - f.write(t.render(include_api=include_api, - single_doc=single_doc)) - - @contextmanager def _maybe_exclude_notebooks(): """Skip building the notebooks if pandoc is not installed. @@ -58,6 +38,7 @@ def _maybe_exclude_notebooks(): 1. nbconvert isn't installed, or 2. nbconvert is installed, but pandoc isn't """ + # TODO move to exclude_pattern base = os.path.dirname(__file__) notebooks = [os.path.join(base, 'source', nb) for nb in ['style.ipynb']] @@ -96,8 +77,112 @@ class DocBuilder: All public methods of this class can be called as parameters of the script. 
""" - def __init__(self, num_jobs=1): + def __init__(self, num_jobs=1, include_api=True, single_doc=None, + verbosity=0): self.num_jobs = num_jobs + self.include_api = include_api + self.verbosity = verbosity + self.single_doc = None + self.single_doc_type = None + if single_doc is not None: + self._process_single_doc(single_doc) + self.exclude_patterns = self._exclude_patterns + + self._generate_index() + if self.single_doc_type == 'docstring': + self._run_os('sphinx-autogen', '-o', + 'source/generated_single', 'source/index.rst') + + @property + def _exclude_patterns(self): + """Docs source files that will be excluded from building.""" + # TODO move maybe_exclude_notebooks here + if self.single_doc is not None: + rst_files = [f for f in os.listdir(SOURCE_PATH) + if ((f.endswith('.rst') or f.endswith('.ipynb')) + and (f != 'index.rst') + and (f != '{0}.rst'.format(self.single_doc)))] + if self.single_doc_type != 'api': + rst_files += ['generated/*.rst'] + elif not self.include_api: + rst_files = ['api.rst', 'generated/*.rst'] + else: + rst_files = ['generated_single/*.rst'] + + exclude_patterns = ','.join( + '{!r}'.format(i) for i in ['**.ipynb_checkpoints'] + rst_files) + + return exclude_patterns + + def _process_single_doc(self, single_doc): + """Extract self.single_doc (base name) and self.single_doc_type from + passed single_doc kwarg. + + """ + self.include_api = False + + if single_doc == 'api.rst' or single_doc == 'api': + self.single_doc_type = 'api' + self.single_doc = 'api' + elif os.path.exists(os.path.join(SOURCE_PATH, single_doc)): + self.single_doc_type = 'rst' + self.single_doc = os.path.splitext(os.path.basename(single_doc))[0] + elif os.path.exists( + os.path.join(SOURCE_PATH, '{}.rst'.format(single_doc))): + self.single_doc_type = 'rst' + self.single_doc = single_doc + elif single_doc is not None: + try: + obj = pandas # noqa: F821 + for name in single_doc.split('.'): + obj = getattr(obj, name) + except AttributeError: + raise ValueError('Single document not understood, it should ' + 'be a file in doc/source/*.rst (e.g. ' + '"contributing.rst" or a pandas function or ' + 'method (e.g. "pandas.DataFrame.head")') + else: + self.single_doc_type = 'docstring' + if single_doc.startswith('pandas.'): + self.single_doc = single_doc[len('pandas.'):] + else: + self.single_doc = single_doc + + def _copy_generated_docstring(self): + """Copy existing generated (from api.rst) docstring page because + this is more correct in certain cases (where a custom autodoc + template is used). 
+ + """ + fname = os.path.join(SOURCE_PATH, 'generated', + 'pandas.{}.rst'.format(self.single_doc)) + temp_dir = os.path.join(SOURCE_PATH, 'generated_single') + + try: + os.makedirs(temp_dir) + except OSError: + pass + + if os.path.exists(fname): + try: + # copying to make sure sphinx always thinks it is new + # and needs to be re-generated (to pick source code changes) + shutil.copy(fname, temp_dir) + except: # noqa + pass + + def _generate_index(self): + """Create index.rst file with the specified sections.""" + if self.single_doc_type == 'docstring': + self._copy_generated_docstring() + + with open(os.path.join(SOURCE_PATH, 'index.rst.template')) as f: + t = jinja2.Template(f.read()) + + with open(os.path.join(SOURCE_PATH, 'index.rst'), 'w') as f: + f.write(t.render(include_api=self.include_api, + single_doc=self.single_doc, + single_doc_type=self.single_doc_type)) @staticmethod def _create_build_structure(): @@ -121,7 +206,10 @@ def _run_os(*args): -------- >>> DocBuilder()._run_os('python', '--version') """ - subprocess.check_call(args, stderr=subprocess.STDOUT) + # TODO check_call should be more safe, but it fails with + # exclude patterns, needs investigation + # subprocess.check_call(args, stderr=subprocess.STDOUT) + os.system(' '.join(args)) def _sphinx_build(self, kind): """Call sphinx to build documentation. @@ -142,11 +230,23 @@ def _sphinx_build(self, kind): self._run_os('sphinx-build', '-j{}'.format(self.num_jobs), '-b{}'.format(kind), - '-d{}'.format(os.path.join(BUILD_PATH, - 'doctrees')), + '-{}'.format( + 'v' * self.verbosity) if self.verbosity else '', + '-d{}'.format(os.path.join(BUILD_PATH, 'doctrees')), + '-Dexclude_patterns={}'.format(self.exclude_patterns), SOURCE_PATH, os.path.join(BUILD_PATH, kind)) + def _open_browser(self): + base_url = os.path.join('file://', DOC_PATH, 'build', 'html') + if self.single_doc_type == 'docstring': + url = os.path.join( + base_url, + 'generated_single', 'pandas.{}.html'.format(self.single_doc)) + else: + url = os.path.join(base_url, '{}.html'.format(self.single_doc)) + webbrowser.open(url, new=2) + def html(self): """Build HTML documentation.""" self._create_build_structure() @@ -156,6 +256,11 @@ def html(self): if os.path.exists(zip_fname): os.remove(zip_fname) + if self.single_doc is not None: + self._open_browser() + shutil.rmtree(os.path.join(SOURCE_PATH, 'generated_single'), + ignore_errors=True) + def latex(self, force=False): """Build PDF documentation.""" self._create_build_structure() @@ -222,21 +327,32 @@ def main(): metavar='FILENAME', type=str, default=None, - help=('filename of section to compile, ' - 'e.g. "indexing"')) + help=('filename of section or method name to ' + 'compile, e.g. "indexing", "DataFrame.join"')) argparser.add_argument('--python-path', type=str, - default=os.path.join(DOC_PATH, '..'), + default=os.path.dirname(DOC_PATH), help='path') + argparser.add_argument('-v', action='count', dest='verbosity', default=0, + help=('increase verbosity (can be repeated), ' + 'passed to the sphinx build command')) args = argparser.parse_args() if args.command not in cmds: raise ValueError('Unknown command {}. Available options: {}'.format( args.command, ', '.join(cmds))) + # Below we update both os.environ and sys.path. The former is used by + # external libraries (namely Sphinx) to compile this module and resolve + # the import of `python_path` correctly. 
The latter is used to resolve + # the import within the module, injecting it into the global namespace os.environ['PYTHONPATH'] = args.python_path - _generate_index(not args.no_api, args.single) - getattr(DocBuilder(args.num_jobs), args.command)() + sys.path.append(args.python_path) + globals()['pandas'] = importlib.import_module('pandas') + + builder = DocBuilder(args.num_jobs, not args.no_api, args.single, + args.verbosity) + getattr(builder, args.command)() if __name__ == '__main__': diff --git a/doc/source/api.rst b/doc/source/api.rst index 0e47499a03f3a5..dba7f6526f22a3 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -551,7 +551,6 @@ These can be accessed like ``Series.dt.``. Series.dt.weekofyear Series.dt.dayofweek Series.dt.weekday - Series.dt.weekday_name Series.dt.dayofyear Series.dt.quarter Series.dt.is_month_start @@ -581,6 +580,8 @@ These can be accessed like ``Series.dt.``. Series.dt.round Series.dt.floor Series.dt.ceil + Series.dt.month_name + Series.dt.day_name **Timedelta Properties** @@ -1723,7 +1724,6 @@ Time/Date Components DatetimeIndex.week DatetimeIndex.dayofweek DatetimeIndex.weekday - DatetimeIndex.weekday_name DatetimeIndex.quarter DatetimeIndex.tz DatetimeIndex.freq @@ -1759,6 +1759,8 @@ Time-specific operations DatetimeIndex.round DatetimeIndex.floor DatetimeIndex.ceil + DatetimeIndex.month_name + DatetimeIndex.day_name Conversion ~~~~~~~~~~ @@ -1940,7 +1942,6 @@ Properties Timestamp.tzinfo Timestamp.value Timestamp.week - Timestamp.weekday_name Timestamp.weekofyear Timestamp.year @@ -1954,6 +1955,7 @@ Methods Timestamp.combine Timestamp.ctime Timestamp.date + Timestamp.day_name Timestamp.dst Timestamp.floor Timestamp.freq @@ -1963,6 +1965,7 @@ Methods Timestamp.isocalendar Timestamp.isoformat Timestamp.isoweekday + Timestamp.month_name Timestamp.normalize Timestamp.now Timestamp.replace @@ -2179,8 +2182,12 @@ Computations / Descriptive Stats .. autosummary:: :toctree: generated/ + GroupBy.all + GroupBy.any + GroupBy.bfill GroupBy.count GroupBy.cumcount + GroupBy.ffill GroupBy.first GroupBy.head GroupBy.last @@ -2192,6 +2199,7 @@ Computations / Descriptive Stats GroupBy.nth GroupBy.ohlc GroupBy.prod + GroupBy.rank GroupBy.size GroupBy.sem GroupBy.std diff --git a/doc/source/basics.rst b/doc/source/basics.rst index e1b36a6acad708..8d09f1fc04c1fb 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -746,7 +746,7 @@ What if the function you wish to apply takes its data as, say, the second argume In this case, provide ``pipe`` with a tuple of ``(callable, data_keyword)``. ``.pipe`` will route the ``DataFrame`` to the argument specified in the tuple. -For example, we can fit a regression using statsmodels. Their API expects a formula first and a ``DataFrame`` as the second argument, ``data``. We pass in the function, keyword pair ``(sm.poisson, 'data')`` to ``pipe``: +For example, we can fit a regression using statsmodels. Their API expects a formula first and a ``DataFrame`` as the second argument, ``data``. We pass in the function, keyword pair ``(sm.ols, 'data')`` to ``pipe``: .. ipython:: python @@ -756,7 +756,7 @@ For example, we can fit a regression using statsmodels. 
Their API expects a form (bb.query('h > 0') .assign(ln_h = lambda df: np.log(df.h)) - .pipe((sm.poisson, 'data'), 'hr ~ ln_h + year + g + C(lg)') + .pipe((sm.ols, 'data'), 'hr ~ ln_h + year + g + C(lg)') .fit() .summary() ) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index efcc04d6883348..e4ce7ebd01dacb 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -46,9 +46,14 @@ The categorical data type is useful in the following cases: See also the :ref:`API docs on categoricals`. +.. _categorical.objectcreation: + Object Creation --------------- +Series Creation +~~~~~~~~~~~~~~~ + Categorical ``Series`` or columns in a ``DataFrame`` can be created in several ways: By specifying ``dtype="category"`` when constructing a ``Series``: @@ -77,7 +82,7 @@ discrete bins. See the :ref:`example on tiling ` in the docs df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) df.head(10) -By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to a `DataFrame`. +By passing a :class:`pandas.Categorical` object to a ``Series`` or assigning it to a ``DataFrame``. .. ipython:: python @@ -89,6 +94,55 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to df["B"] = raw_cat df +Categorical data has a specific ``category`` :ref:`dtype `: + +.. ipython:: python + + df.dtypes + +DataFrame Creation +~~~~~~~~~~~~~~~~~~ + +Similar to the previous section where a single column was converted to categorical, all columns in a +``DataFrame`` can be batch converted to categorical either during or after construction. + +This can be done during construction by specifying ``dtype="category"`` in the ``DataFrame`` constructor: + +.. ipython:: python + + df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}, dtype="category") + df.dtypes + +Note that the categories present in each column differ; the conversion is done column by column, so +only labels present in a given column are categories: + +.. ipython:: python + + df['A'] + df['B'] + + +.. versionadded:: 0.23.0 + +Analogously, all columns in an existing ``DataFrame`` can be batch converted using :meth:`DataFrame.astype`: + +.. ipython:: python + + df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}) + df_cat = df.astype('category') + df_cat.dtypes + +This conversion is likewise done column by column: + +.. ipython:: python + + df_cat['A'] + df_cat['B'] + + +Controlling Behavior +~~~~~~~~~~~~~~~~~~~~ + In the examples above where we passed ``dtype='category'``, we used the default behavior: @@ -108,21 +162,36 @@ of :class:`~pandas.api.types.CategoricalDtype`. s_cat = s.astype(cat_type) s_cat -Categorical data has a specific ``category`` :ref:`dtype `: +Similarly, a ``CategoricalDtype`` can be used with a ``DataFrame`` to ensure that categories +are consistent among all columns. .. ipython:: python - df.dtypes + df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}) + cat_type = CategoricalDtype(categories=list('abcd'), + ordered=True) + df_cat = df.astype(cat_type) + df_cat['A'] + df_cat['B'] .. note:: - In contrast to R's `factor` function, categorical data is not converting input values to - strings and categories will end up the same data type as the original values. + To perform table-wise conversion, where all labels in the entire ``DataFrame`` are used as + categories for each column, the ``categories`` parameter can be determined programmatically by + ``categories = pd.unique(df.values.ravel())``. -.. 
note:: +If you already have ``codes`` and ``categories``, you can use the +:func:`~pandas.Categorical.from_codes` constructor to save the factorize step +during normal constructor mode: - In contrast to R's `factor` function, there is currently no way to assign/change labels at - creation time. Use `categories` to change the categories after creation time. +.. ipython:: python + + splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) + s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) + + +Regaining Original Data +~~~~~~~~~~~~~~~~~~~~~~~ To get back to the original ``Series`` or NumPy array, use ``Series.astype(original_dtype)`` or ``np.asarray(categorical)``: @@ -136,14 +205,15 @@ To get back to the original ``Series`` or NumPy array, use s2.astype(str) np.asarray(s2) -If you already have `codes` and `categories`, you can use the -:func:`~pandas.Categorical.from_codes` constructor to save the factorize step -during normal constructor mode: +.. note:: -.. ipython:: python + In contrast to R's `factor` function, categorical data is not converting input values to + strings; categories will end up the same data type as the original values. - splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) - s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) +.. note:: + + In contrast to R's `factor` function, there is currently no way to assign/change labels at + creation time. Use `categories` to change the categories after creation time. .. _categorical.categoricaldtype: diff --git a/doc/source/comparison_with_sas.rst b/doc/source/comparison_with_sas.rst index 214667119f7e03..0354ad473544bb 100644 --- a/doc/source/comparison_with_sas.rst +++ b/doc/source/comparison_with_sas.rst @@ -25,7 +25,7 @@ As is customary, we import pandas and NumPy as follows: This is often used in interactive work (e.g. `Jupyter notebook `_ or terminal) - the equivalent in SAS would be: - .. code-block:: none + .. code-block:: sas proc print data=df(obs=5); run; @@ -65,7 +65,7 @@ in the ``DATA`` step. Every ``DataFrame`` and ``Series`` has an ``Index`` - which are labels on the *rows* of the data. SAS does not have an exactly analogous concept. A data set's -row are essentially unlabeled, other than an implicit integer index that can be +rows are essentially unlabeled, other than an implicit integer index that can be accessed during the ``DATA`` step (``_N_``). In pandas, if no index is specified, an integer index is also used by default @@ -87,7 +87,7 @@ A SAS data set can be built from specified values by placing the data after a ``datalines`` statement and specifying the column names. -.. code-block:: none +.. code-block:: sas data df; input x y; @@ -121,7 +121,7 @@ will be used in many of the following examples. SAS provides ``PROC IMPORT`` to read csv data into a data set. -.. code-block:: none +.. code-block:: sas proc import datafile='tips.csv' dbms=csv out=tips replace; getnames=yes; @@ -156,7 +156,7 @@ Exporting Data The inverse of ``PROC IMPORT`` in SAS is ``PROC EXPORT`` -.. code-block:: none +.. code-block:: sas proc export data=tips outfile='tips2.csv' dbms=csv; run; @@ -178,7 +178,7 @@ Operations on Columns In the ``DATA`` step, arbitrary math expressions can be used on new or existing columns. -.. code-block:: none +.. code-block:: sas data tips; set tips; @@ -207,7 +207,7 @@ Filtering Filtering in SAS is done with an ``if`` or ``where`` statement, on one or more columns. -.. code-block:: none +.. 
code-block:: sas data tips; set tips; @@ -233,7 +233,7 @@ If/Then Logic In SAS, if/then logic can be used to create new columns. -.. code-block:: none +.. code-block:: sas data tips; set tips; @@ -262,7 +262,7 @@ Date Functionality SAS provides a variety of functions to do operations on date/datetime columns. -.. code-block:: none +.. code-block:: sas data tips; set tips; @@ -307,7 +307,7 @@ Selection of Columns SAS provides keywords in the ``DATA`` step to select, drop, and rename columns. -.. code-block:: none +.. code-block:: sas data tips; set tips; @@ -343,7 +343,7 @@ Sorting by Values Sorting in SAS is accomplished via ``PROC SORT`` -.. code-block:: none +.. code-block:: sas proc sort data=tips; by sex total_bill; @@ -369,7 +369,7 @@ SAS determines the length of a character string with the and `LENGTHC `__ functions. ``LENGTHN`` excludes trailing blanks and ``LENGTHC`` includes trailing blanks. -.. code-block:: none +.. code-block:: sas data _null_; set tips; @@ -395,7 +395,7 @@ SAS determines the position of a character in a string with the ``FINDW`` takes the string defined by the first argument and searches for the first position of the substring you supply as the second argument. -.. code-block:: none +.. code-block:: sas data _null_; set tips; @@ -419,7 +419,7 @@ Substring SAS extracts a substring from a string based on its position with the `SUBSTR `__ function. -.. code-block:: none +.. code-block:: sas data _null_; set tips; @@ -442,7 +442,7 @@ The SAS `SCAN `__ functions change the case of the argument. -.. code-block:: none +.. code-block:: sas data firstlast; input String $60.; @@ -516,7 +516,7 @@ types of joins are accomplished using the ``in=`` dummy variables to track whether a match was found in one or both input frames. -.. code-block:: none +.. code-block:: sas proc sort data=df1; by key; @@ -572,7 +572,7 @@ operations, and is ignored by default for aggregations. One difference is that missing data cannot be compared to its sentinel value. For example, in SAS you could do this to filter missing values. -.. code-block:: none +.. code-block:: sas data outer_join_nulls; set outer_join; @@ -615,7 +615,7 @@ SAS's PROC SUMMARY can be used to group by one or more key variables and compute aggregations on numeric columns. -.. code-block:: none +.. code-block:: sas proc summary data=tips nway; class sex smoker; @@ -640,7 +640,7 @@ In SAS, if the group aggregations need to be used with the original frame, it must be merged back together. For example, to subtract the mean for each observation by smoker group. -.. code-block:: none +.. code-block:: sas proc summary data=tips missing nway; class smoker; @@ -679,7 +679,7 @@ replicate most other by group processing from SAS. For example, this ``DATA`` step reads the data by sex/smoker group and filters to the first entry for each. -.. code-block:: none +.. code-block:: sas proc sort data=tips; by sex smoker; @@ -719,7 +719,7 @@ Data Interop pandas provides a :func:`read_sas` method that can read SAS data saved in the XPORT or SAS7BDAT binary format. -.. code-block:: none +.. 
code-block:: sas libname xportout xport 'transport-file.xpt'; data xportout.tips; diff --git a/doc/source/conf.py b/doc/source/conf.py index b5fbf096f26263..46249af8a5a562 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -18,7 +18,6 @@ import importlib import warnings -from pandas.compat import u, PY3 try: raw_input # Python 2 @@ -64,6 +63,7 @@ 'ipython_sphinxext.ipython_console_highlighting', # lowercase didn't work 'IPython.sphinxext.ipython_console_highlighting', + 'matplotlib.sphinxext.plot_directive', 'sphinx.ext.intersphinx', 'sphinx.ext.coverage', 'sphinx.ext.mathjax', @@ -86,37 +86,13 @@ if any(re.match("\s*api\s*", l) for l in index_rst_lines): autosummary_generate = True -files_to_delete = [] -for f in os.listdir(os.path.dirname(__file__)): - if (not f.endswith(('.ipynb', '.rst')) or - f.startswith('.') or os.path.basename(f) == 'index.rst'): - continue - - _file_basename = os.path.splitext(f)[0] - _regex_to_match = "\s*{}\s*$".format(_file_basename) - if not any(re.match(_regex_to_match, line) for line in index_rst_lines): - files_to_delete.append(f) - -if files_to_delete: - print("I'm about to DELETE the following:\n{}\n".format( - list(sorted(files_to_delete)))) - sys.stdout.write("WARNING: I'd like to delete those " - "to speed up processing (yes/no)? ") - if PY3: - answer = input() - else: - answer = raw_input() - - if answer.lower().strip() in ('y', 'yes'): - for f in files_to_delete: - f = os.path.join(os.path.join(os.path.dirname(__file__), f)) - f = os.path.abspath(f) - try: - print("Deleting {}".format(f)) - os.unlink(f) - except: - print("Error deleting {}".format(f)) - pass +# matplotlib plot directive +plot_include_source = True +plot_formats = [("png", 90)] +plot_html_show_formats = False +plot_html_show_source_link = False +plot_pre_code = """import numpy as np +import pandas as pd""" # Add any paths that contain templates here, relative to this directory. templates_path = ['../_templates'] @@ -131,8 +107,8 @@ master_doc = 'index' # General information about the project. -project = u('pandas') -copyright = u('2008-2014, the pandas development team') +project = u'pandas' +copyright = u'2008-2014, the pandas development team' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -343,8 +319,8 @@ # file, target name, title, author, documentclass [howto/manual]). latex_documents = [ ('index', 'pandas.tex', - u('pandas: powerful Python data analysis toolkit'), - u('Wes McKinney\n\& PyData Development Team'), 'manual'), + u'pandas: powerful Python data analysis toolkit', + u'Wes McKinney\n\& PyData Development Team', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -585,6 +561,45 @@ def remove_flags_docstring(app, what, name, obj, options, lines): del lines[:] +def process_class_docstrings(app, what, name, obj, options, lines): + """ + For those classes for which we use :: + + :template: autosummary/class_without_autosummary.rst + + the documented attributes/methods have to be listed in the class + docstring. However, if one of those lists is empty, we use 'None', + which then generates warnings in sphinx / ugly html output. + This "autodoc-process-docstring" event connector removes that part + from the processed docstring. + + """ + if what == "class": + joined = '\n'.join(lines) + + templates = [ + """.. rubric:: Attributes + +.. autosummary:: + :toctree: + + None +""", + """.. rubric:: Methods + +.. 
autosummary:: + :toctree: + + None +""" + ] + + for template in templates: + if template in joined: + joined = joined.replace(template, '') + lines[:] = joined.split('\n') + + suppress_warnings = [ # We "overwrite" autosummary with our PandasAutosummary, but # still want the regular autosummary setup to run. So we just @@ -595,6 +610,7 @@ def remove_flags_docstring(app, what, name, obj, options, lines): def setup(app): app.connect("autodoc-process-docstring", remove_flags_docstring) + app.connect("autodoc-process-docstring", process_class_docstrings) app.add_autodocumenter(AccessorDocumenter) app.add_autodocumenter(AccessorAttributeDocumenter) app.add_autodocumenter(AccessorMethodDocumenter) diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 258ab874cafcf4..a0c6f1332f3391 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -171,7 +171,7 @@ We'll now kick off a three-step process: # Create and activate the build environment conda env create -f ci/environment-dev.yaml conda activate pandas-dev - + # or with older versions of Anaconda: source activate pandas-dev @@ -262,8 +262,9 @@ after updating. Contributing to the documentation ================================= -If you're not the developer type, contributing to the documentation is still of -huge value. You don't even have to be an expert on *pandas* to do so! In fact, +Contributing to the documentation benefits everyone who uses *pandas*. +We encourage you to help us improve the documentation, and +you don't have to be an expert on *pandas* to do so! In fact, there are sections of the docs that are worse off after being written by experts. If something in the docs doesn't make sense to you, updating the relevant section after you figure it out is a great way to ensure it will help @@ -351,8 +352,10 @@ Some other important things to know about the docs: pandoc doc/source/contributing.rst -t markdown_github > CONTRIBUTING.md -The utility script ``scripts/api_rst_coverage.py`` can be used to compare -the list of methods documented in ``doc/source/api.rst`` (which is used to generate +The utility script ``scripts/validate_docstrings.py`` can be used to get a CSV +summary of the API documentation, and also to validate common errors in the docstring +of a specific class, function or method. The summary also compares the list of +methods documented in ``doc/source/api.rst`` (which is used to generate the `API Reference `_ page) and the actual public methods. This will identify methods documented in ``doc/source/api.rst`` that are not actually @@ -388,14 +391,11 @@ If you want to do a full clean build, do:: python make.py html You can tell ``make.py`` to compile only a single section of the docs, greatly -reducing the turn-around time for checking your changes. You will be prompted to -delete ``.rst`` files that aren't required. This is okay because the prior -versions of these files can be checked out from git. However, you must make sure -not to commit the file deletions to your Git repository! +reducing the turn-around time for checking your changes. :: - #omit autosummary and API section + # omit autosummary and API section python make.py clean python make.py --no-api @@ -404,10 +404,20 @@ not to commit the file deletions to your Git repository! python make.py clean python make.py --single indexing -For comparison, a full documentation build may take 10 minutes, a ``-no-api`` build -may take 3 minutes and a single section may take 15 seconds.
Subsequent builds, which -only process portions you have changed, will be faster. Open the following file in a web -browser to see the full documentation you just built:: + # compile the reference docs for a single function + python make.py clean + python make.py --single DataFrame.join + +For comparison, a full documentation build may take 15 minutes, but a single +section may take 15 seconds. Subsequent builds, which only process portions +you have changed, will be faster. + +You can also specify to use multiple cores to speed up the documentation build:: + + python make.py html --num-jobs 4 + +Open the following file in a web browser to see the full documentation you +just built:: pandas/docs/build/html/index.html diff --git a/doc/source/developer.rst b/doc/source/developer.rst index 0ef097da090f25..b8bb2b2fcbe2f4 100644 --- a/doc/source/developer.rst +++ b/doc/source/developer.rst @@ -140,46 +140,3 @@ As an example of fully-formed metadata: 'metadata': None} ], 'pandas_version': '0.20.0'} - -.. _developer.register-accessors: - -Registering Custom Accessors ----------------------------- - -Libraries can use the decorators -:func:`pandas.api.extensions.register_dataframe_accessor`, -:func:`pandas.api.extensions.register_series_accessor`, and -:func:`pandas.api.extensions.register_index_accessor`, to add additional "namespaces" to -pandas objects. All of these follow a similar convention: you decorate a class, providing the name of attribute to add. The -class's `__init__` method gets the object being decorated. For example: - -.. code-block:: python - - @pd.api.extensions.register_dataframe_accessor("geo") - class GeoAccessor(object): - def __init__(self, pandas_obj): - self._obj = pandas_obj - - @property - def center(self): - # return the geographic center point of this DataFarme - lon = self._obj.latitude - lat = self._obj.longitude - return (float(lon.mean()), float(lat.mean())) - - def plot(self): - # plot this array's data on a map, e.g., using Cartopy - pass - -Now users can access your methods using the `geo` namespace: - - >>> ds = pd.DataFrame({'longitude': np.linspace(0, 10), - ... 'latitude': np.linspace(0, 20)}) - >>> ds.geo.center - (5.0, 10.0) - >>> ds.geo.plot() - # plots data on a map - -This can be a convenient way to extend pandas objects without subclassing them. -If you write a custom accessor, make a pull request adding it to our -:ref:`ecosystem` page. diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 1ba00b8fb6f233..ca6cefac9e8425 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -81,9 +81,28 @@ index is passed, one will be created having values ``[0, ..., len(data) - 1]``. **From dict** -If ``data`` is a dict, if **index** is passed the values in data corresponding -to the labels in the index will be pulled out. Otherwise, an index will be -constructed from the sorted keys of the dict, if possible. +Series can be instantiated from dicts: + +.. ipython:: python + + d = {'b' : 1, 'a' : 0, 'c' : 2} + pd.Series(d) + +.. note:: + + When the data is a dict, and an index is not passed, the ``Series`` index + will be ordered by the dict's insertion order, if you're using Python + version >= 3.6 and Pandas version >= 0.23. + + If you're using Python < 3.6 or Pandas < 0.23, and an index is not passed, + the ``Series`` index will be the lexically ordered list of dict keys. 
+ +In the example above, if you were on a Python version lower than 3.6 or a +Pandas version lower than 0.23, the ``Series`` would be ordered by the lexical +order of the dict keys (i.e. ``['a', 'b', 'c']`` rather than ``['b', 'a', 'c']``). + +If an index is passed, the values in data corresponding to the labels in the +index will be pulled out. .. ipython:: python @@ -243,12 +262,22 @@ not matching up to the passed index. If axis labels are not passed, they will be constructed from the input data based on common sense rules. +.. note:: + + When the data is a dict, and ``columns`` is not specified, the ``DataFrame`` + columns will be ordered by the dict's insertion order, if you are using + Python version >= 3.6 and Pandas >= 0.23. + + If you are using Python < 3.6 or Pandas < 0.23, and ``columns`` is not + specified, the ``DataFrame`` columns will be the lexically ordered list of dict + keys. + From dict of Series or dicts ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The resulting **index** will be the **union** of the indexes of the various Series. If there are any nested dicts, these will first be converted to -Series. If no columns are passed, the columns will be the sorted list of dict +Series. If no columns are passed, the columns will be the ordered list of dict keys. .. ipython:: python diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index c770bf28516431..30cdb06b284877 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -262,3 +262,38 @@ Data validation Engarde is a lightweight library used to explicitly state your assumptions abour your datasets and check that they're *actually* true. + +.. _ecosystem.extensions: + +Extension Data Types +-------------------- + +Pandas provides an interface for defining +:ref:`extension types ` to extend NumPy's type +system. The following libraries implement that interface to provide types not +found in NumPy or pandas, which work well with pandas' data containers. + +`cyberpandas`_ +~~~~~~~~~~~~~~ + +Cyberpandas provides an extension type for storing arrays of IP Addresses. These +arrays can be stored inside pandas' Series and DataFrame. + +.. _ecosystem.accessors: + +Accessors +--------- + +A directory of projects providing +:ref:`extension accessors `. This is for users to +discover new accessors and for library authors to coordinate on the namespace. + +============== ========== ========================= +Library Accessor Classes +============== ========== ========================= +`cyberpandas`_ ``ip`` ``Series`` +`pdvega`_ ``vgplot`` ``Series``, ``DataFrame`` +============== ========== ========================= + +.. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest +.. _pdvega: https://jakevdp.github.io/pdvega/ diff --git a/doc/source/extending.rst b/doc/source/extending.rst new file mode 100644 index 00000000000000..25c4ba4a4a2a37 --- /dev/null +++ b/doc/source/extending.rst @@ -0,0 +1,269 @@ +.. _extending: + +**************** +Extending Pandas +**************** + +While pandas provides a rich set of methods, containers, and data types, your +needs may not be fully satisfied. Pandas offers a few options for extending +pandas. + +.. _extending.register-accessors: + +Registering Custom Accessors +---------------------------- + +Libraries can use the decorators +:func:`pandas.api.extensions.register_dataframe_accessor`, +:func:`pandas.api.extensions.register_series_accessor`, and +:func:`pandas.api.extensions.register_index_accessor`, to add additional +"namespaces" to pandas objects. 
All of these follow a similar convention: you +decorate a class, providing the name of attribute to add. The class's +``__init__`` method gets the object being decorated. For example: + +.. code-block:: python + + @pd.api.extensions.register_dataframe_accessor("geo") + class GeoAccessor(object): + def __init__(self, pandas_obj): + self._obj = pandas_obj + + @property + def center(self): + # return the geographic center point of this DataFrame + lat = self._obj.latitude + lon = self._obj.longitude + return (float(lon.mean()), float(lat.mean())) + + def plot(self): + # plot this array's data on a map, e.g., using Cartopy + pass + +Now users can access your methods using the ``geo`` namespace: + + >>> ds = pd.DataFrame({'longitude': np.linspace(0, 10), + ... 'latitude': np.linspace(0, 20)}) + >>> ds.geo.center + (5.0, 10.0) + >>> ds.geo.plot() + # plots data on a map + +This can be a convenient way to extend pandas objects without subclassing them. +If you write a custom accessor, make a pull request adding it to our +:ref:`ecosystem` page. + +.. _extending.extension-types: + +Extension Types +--------------- + +Pandas defines an interface for implementing data types and arrays that *extend* +NumPy's type system. Pandas itself uses the extension system for some types +that aren't built into NumPy (categorical, period, interval, datetime with +timezone). + +Libraries can define a custom array and data type. When pandas encounters these +objects, they will be handled properly (i.e. not converted to an ndarray of +objects). Many methods like :func:`pandas.isna` will dispatch to the extension +type's implementation. + +If you're building a library that implements the interface, please publicize it +on :ref:`ecosystem.extensions`. + +The interface consists of two classes. + +``ExtensionDtype`` +^^^^^^^^^^^^^^^^^^ + +An ``ExtensionDtype`` is similar to a ``numpy.dtype`` object. It describes the +data type. Implementors are responsible for a few unique items like the name. + +One particularly important item is the ``type`` property. This should be the +class that is the scalar type for your data. For example, if you were writing an +extension array for IP Address data, this might be ``ipaddress.IPv4Address``. + +See the `extension dtype source`_ for interface definition. + +``ExtensionArray`` +^^^^^^^^^^^^^^^^^^ + +This class provides all the array-like functionality. ExtensionArrays are +limited to 1 dimension. An ExtensionArray is linked to an ExtensionDtype via the +``dtype`` attribute. + +Pandas makes no restrictions on how an extension array is created via its +``__new__`` or ``__init__``, and puts no restrictions on how you store your +data. We do require that your array be convertible to a NumPy array, even if +this is relatively expensive (as it is for ``Categorical``). + +They may be backed by none, one, or many NumPy arrays. For example, +``pandas.Categorical`` is an extension array backed by two arrays, +one for codes and one for categories. An array of IPv6 addresses may +be backed by a NumPy structured array with two fields, one for the +lower 64 bits and one for the upper 64 bits. Or they may be backed +by some other storage type, like Python lists. + +See the `extension array source`_ for the interface definition. The docstrings +and comments contain guidance for properly implementing the interface. + +.. _extension dtype source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/base.py +.. 
_extension array source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/arrays/base.py + +.. _extending.subclassing-pandas: + +Subclassing pandas Data Structures +---------------------------------- + +.. warning:: There are some easier alternatives before considering subclassing ``pandas`` data structures. + + 1. Extensible method chains with :ref:`pipe ` + + 2. Use *composition*. See `here `_. + + 3. Extending by :ref:`registering an accessor ` + + 4. Extending by :ref:`extension type ` + +This section describes how to subclass ``pandas`` data structures to meet more specific needs. There are two points that need attention: + +1. Override constructor properties. +2. Define original properties. + +.. note:: + + You can find a nice example in the `geopandas `_ project. + +Override Constructor Properties +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Each data structure has several *constructor properties* for returning a new +data structure as the result of an operation. By overriding these properties, +you can retain subclasses through ``pandas`` data manipulations. + +There are 3 constructor properties to be defined: + +- ``_constructor``: Used when a manipulation result has the same dimensions as the original. +- ``_constructor_sliced``: Used when a manipulation result has one dimension lower than the original, such as slicing a single column from a ``DataFrame``. +- ``_constructor_expanddim``: Used when a manipulation result has one dimension higher than the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``. + +The following table shows how ``pandas`` data structures define constructor properties by default. + +=========================== ======================= ============= +Property Attributes ``Series`` ``DataFrame`` +=========================== ======================= ============= +``_constructor`` ``Series`` ``DataFrame`` +``_constructor_sliced`` ``NotImplementedError`` ``Series`` +``_constructor_expanddim`` ``DataFrame`` ``Panel`` +=========================== ======================= ============= + +The example below shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` overriding constructor properties. + +.. code-block:: python + + class SubclassedSeries(Series): + + @property + def _constructor(self): + return SubclassedSeries + + @property + def _constructor_expanddim(self): + return SubclassedDataFrame + + class SubclassedDataFrame(DataFrame): + + @property + def _constructor(self): + return SubclassedDataFrame + + @property + def _constructor_sliced(self): + return SubclassedSeries + +.. code-block:: python + + >>> s = SubclassedSeries([1, 2, 3]) + >>> type(s) + <class '__main__.SubclassedSeries'> + + >>> to_framed = s.to_frame() + >>> type(to_framed) + <class '__main__.SubclassedDataFrame'> + + >>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 + + >>> type(df) + <class '__main__.SubclassedDataFrame'> + + >>> sliced1 = df[['A', 'B']] + >>> sliced1 + A B + 0 1 4 + 1 2 5 + 2 3 6 + >>> type(sliced1) + <class '__main__.SubclassedDataFrame'> + + >>> sliced2 = df['A'] + >>> sliced2 + 0 1 + 1 2 + 2 3 + Name: A, dtype: int64 + >>> type(sliced2) + <class '__main__.SubclassedSeries'> + +Define Original Properties +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To let original data structures have additional properties, you should let ``pandas`` know what properties are added. ``pandas`` maps unknown properties to data names by overriding ``__getattribute__``. Defining original properties can be done in one of two ways: + +1. Define ``_internal_names`` and ``_internal_names_set`` for temporary properties which WILL NOT be passed to manipulation results. +2.
Define ``_metadata`` for normal properties which will be passed to manipulation results. + +Below is an example to define two original properties, "internal_cache" as a temporary property and "added_property" as a normal property + +.. code-block:: python + + class SubclassedDataFrame2(DataFrame): + + # temporary properties + _internal_names = pd.DataFrame._internal_names + ['internal_cache'] + _internal_names_set = set(_internal_names) + + # normal properties + _metadata = ['added_property'] + + @property + def _constructor(self): + return SubclassedDataFrame2 + +.. code-block:: python + + >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 + + >>> df.internal_cache = 'cached' + >>> df.added_property = 'property' + + >>> df.internal_cache + cached + >>> df.added_property + property + + # properties defined in _internal_names is reset after manipulation + >>> df[['A', 'B']].internal_cache + AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache' + + # properties defined in _metadata are retained + >>> df[['A', 'B']].added_property + property diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index eff1227e98994b..1ef88a524732f6 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -106,8 +106,13 @@ Some other notes See the package overview for more detail about what's in the library. +{% if single_doc_type == 'docstring' -%} +.. autosummary:: + :toctree: generated_single/ +{% else -%} .. toctree:: :maxdepth: 4 +{% endif %} {% if single_doc -%} {{ single_doc }} @@ -152,5 +157,6 @@ See the package overview for more detail about what's in the library. {% if not single_doc -%} developer internals + extending release {% endif -%} diff --git a/doc/source/install.rst b/doc/source/install.rst index 4ff63d59024b26..07f57dbd657091 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -12,7 +12,7 @@ cross platform distribution for data analysis and scientific computing. This is the recommended installation method for most users. Instructions for installing from source, -`PyPI `__, various Linux distributions, or a +`PyPI `__, `ActivePython `__, various Linux distributions, or a `development version `__ are also provided. Python version support @@ -25,8 +25,8 @@ Installing pandas .. _install.anaconda: -Installing pandas with Anaconda -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Installing with Anaconda +~~~~~~~~~~~~~~~~~~~~~~~~ Installing pandas and the rest of the `NumPy `__ and `SciPy `__ stack can be a little @@ -58,8 +58,8 @@ that folder). .. _install.miniconda: -Installing pandas with Miniconda -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Installing with Miniconda +~~~~~~~~~~~~~~~~~~~~~~~~~ The previous section outlined how to get pandas installed as part of the `Anaconda `__ distribution. @@ -134,6 +134,13 @@ pandas can be installed via pip from pip install pandas +Installing with ActivePython +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Installation instructions for +`ActivePython `__ can be found +`here `__. Versions +2.7 and 3.5 include pandas. Installing using your Linux distribution's package manager. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -164,7 +171,7 @@ Installing from source See the :ref:`contributing documentation ` for complete instructions on building from the git source tree. Further, see :ref:`creating a development environment ` if you wish to create a *pandas* development environment. 
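Once a source build is in place, a quick sanity check is to run the test suite described in the renamed section below. A minimal sketch, assuming ``pytest`` and the optional test dependencies are installed:

.. code-block:: python

    import pandas as pd

    # Runs pandas' bundled unit tests via pytest (slow and network
    # tests are skipped by default); extra pytest arguments can be
    # passed as a list of strings, e.g. pd.test(['-k', 'groupby'])
    pd.test()
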
Running the test suite -~~~~~~~~~~~~~~~~~~~~~~ +---------------------- pandas is equipped with an exhaustive set of unit tests, covering about 97% of the codebase as of this writing. To run it on your machine to verify that @@ -299,5 +306,5 @@ Optional Dependencies Without the optional dependencies, many useful features will not work. Hence, it is highly recommended that you install these. A packaged - distribution like `Anaconda `__, or `Enthought Canopy + distribution like `Anaconda `__, `ActivePython `__ (version 2.7 or 3.5), or `Enthought Canopy `__ may be worth considering. diff --git a/doc/source/internals.rst b/doc/source/internals.rst index 957f82fd9eba71..b120e3a98db7fc 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -15,7 +15,8 @@ Internals ********* -This section will provide a look into some of pandas internals. +This section will provide a look into some of pandas internals. It's primarily +intended for developers of pandas itself. Indexing -------- @@ -107,156 +108,9 @@ containers (``Index`` classes and ``Series``) we have the following convention: So, for example, ``Series[category]._values`` is a ``Categorical``, while ``Series[category]._ndarray_values`` is the underlying codes. - .. _ref-subclassing-pandas: Subclassing pandas Data Structures ---------------------------------- -.. warning:: There are some easier alternatives before considering subclassing ``pandas`` data structures. - - 1. Extensible method chains with :ref:`pipe ` - - 2. Use *composition*. See `here `_. - - 3. Extending by :ref:`registering an accessor ` - -This section describes how to subclass ``pandas`` data structures to meet more specific needs. There are 2 points which need attention: - -1. Override constructor properties. -2. Define original properties - -.. note:: You can find a nice example in `geopandas `_ project. - -Override Constructor Properties -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Each data structure has constructor properties to specifying data constructors. By overriding these properties, you can retain defined-classes through ``pandas`` data manipulations. - -There are 3 constructors to be defined: - -- ``_constructor``: Used when a manipulation result has the same dimesions as the original. -- ``_constructor_sliced``: Used when a manipulation result has one lower dimension(s) as the original, such as ``DataFrame`` single columns slicing. -- ``_constructor_expanddim``: Used when a manipulation result has one higher dimension as the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``. - -Following table shows how ``pandas`` data structures define constructor properties by default. - -=========================== ======================= =================== ======================= -Property Attributes ``Series`` ``DataFrame`` ``Panel`` -=========================== ======================= =================== ======================= -``_constructor`` ``Series`` ``DataFrame`` ``Panel`` -``_constructor_sliced`` ``NotImplementedError`` ``Series`` ``DataFrame`` -``_constructor_expanddim`` ``DataFrame`` ``Panel`` ``NotImplementedError`` -=========================== ======================= =================== ======================= - -Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` overriding constructor properties. - -.. 
code-block:: python - - class SubclassedSeries(Series): - - @property - def _constructor(self): - return SubclassedSeries - - @property - def _constructor_expanddim(self): - return SubclassedDataFrame - - class SubclassedDataFrame(DataFrame): - - @property - def _constructor(self): - return SubclassedDataFrame - - @property - def _constructor_sliced(self): - return SubclassedSeries - -.. code-block:: python - - >>> s = SubclassedSeries([1, 2, 3]) - >>> type(s) - - - >>> to_framed = s.to_frame() - >>> type(to_framed) - - - >>> df = SubclassedDataFrame({'A', [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) - >>> df - A B C - 0 1 4 7 - 1 2 5 8 - 2 3 6 9 - - >>> type(df) - - - >>> sliced1 = df[['A', 'B']] - >>> sliced1 - A B - 0 1 4 - 1 2 5 - 2 3 6 - >>> type(sliced1) - - - >>> sliced2 = df['A'] - >>> sliced2 - 0 1 - 1 2 - 2 3 - Name: A, dtype: int64 - >>> type(sliced2) - - -Define Original Properties -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -To let original data structures have additional properties, you should let ``pandas`` know what properties are added. ``pandas`` maps unknown properties to data names overriding ``__getattribute__``. Defining original properties can be done in one of 2 ways: - -1. Define ``_internal_names`` and ``_internal_names_set`` for temporary properties which WILL NOT be passed to manipulation results. -2. Define ``_metadata`` for normal properties which will be passed to manipulation results. - -Below is an example to define 2 original properties, "internal_cache" as a temporary property and "added_property" as a normal property - -.. code-block:: python - - class SubclassedDataFrame2(DataFrame): - - # temporary properties - _internal_names = pd.DataFrame._internal_names + ['internal_cache'] - _internal_names_set = set(_internal_names) - - # normal properties - _metadata = ['added_property'] - - @property - def _constructor(self): - return SubclassedDataFrame2 - -.. code-block:: python - - >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) - >>> df - A B C - 0 1 4 7 - 1 2 5 8 - 2 3 6 9 - - >>> df.internal_cache = 'cached' - >>> df.added_property = 'property' - - >>> df.internal_cache - cached - >>> df.added_property - property - - # properties defined in _internal_names is reset after manipulation - >>> df[['A', 'B']].internal_cache - AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache' - - # properties defined in _metadata are retained - >>> df[['A', 'B']].added_property - property +This section has been moved to :ref:`extending.subclassing-pandas`. diff --git a/doc/source/io.rst b/doc/source/io.rst index 0b9a610b50d7d5..93f5c5bea53b4b 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4711,6 +4711,12 @@ writes ``data`` to the database in batches of 1000 rows at a time: data.to_sql('data_chunked', engine, chunksize=1000) +.. note:: + + The function :func:`~pandas.DataFrame.to_sql` will perform a multivalue + insert if the engine dialect ``supports_multivalues_insert``. This will + greatly speed up the insert in some cases. + SQL data types ++++++++++++++ diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 4d9746eed0f0bb..cfd3f9e88e4eaf 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -152,7 +152,7 @@ functionality below. 
Set logic on the other axes ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -When gluing together multiple ``DataFrame``s, you have a choice of how to handle +When gluing together multiple DataFrames, you have a choice of how to handle the other axes (other than the one being concatenated). This can be done in the following three ways: @@ -636,7 +636,7 @@ key combination: Here is a more complicated example with multiple join keys. Only the keys appearing in ``left`` and ``right`` are present (the intersection), since -``how='inner'```by default. +``how='inner'`` by default. .. ipython:: python @@ -721,7 +721,7 @@ either the left or right tables, the values in the joined table will be labels=['left', 'right'], vertical=False); plt.close('all'); -Here is another example with duplicate join keys in ``DataFrame``s: +Here is another example with duplicate join keys in DataFrames: .. ipython:: python diff --git a/doc/source/options.rst b/doc/source/options.rst index cce16a5396377a..a82be4d84bf3fe 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -402,6 +402,10 @@ display.html.table_schema False Whether to publish a Table display.html.border 1 A ``border=value`` attribute is inserted in the ```` tag for the DataFrame HTML repr. +display.html.use_mathjax True When True, Jupyter notebook will process + table contents using MathJax, rendering + mathematical expressions enclosed by the + dollar symbol. io.excel.xls.writer xlwt The default Excel writer engine for 'xls' files. io.excel.xlsm.writer openpyxl The default Excel writer engine for diff --git a/doc/source/text.rst b/doc/source/text.rst index 1e620acb1f88a3..da8e40892716eb 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -118,8 +118,8 @@ i.e., from the end of the string to the beginning of the string: s2.str.rsplit('_', expand=True, n=1) -Methods like ``replace`` and ``findall`` take `regular expressions -`__, too: +``replace`` by default replaces `regular expressions +`__: .. ipython:: python @@ -146,12 +146,25 @@ following code will cause trouble because of the regular expression meaning of # We need to escape the special character (for >1 len patterns) dollars.str.replace(r'-\$', '-') +.. versionadded:: 0.23.0 + +If you do want literal replacement of a string (equivalent to +:meth:`str.replace`), you can set the optional ``regex`` parameter to +``False``, rather than escaping each character. In this case both ``pat`` +and ``repl`` must be strings: + +.. ipython:: python + + # These lines are equivalent + dollars.str.replace(r'-\$', '-') + dollars.str.replace('-$', '-', regex=False) + +.. versionadded:: 0.20.0 + The ``replace`` method can also take a callable as replacement. It is called on every ``pat`` using :func:`re.sub`. The callable should expect one positional argument (a regex object) and return a string. -.. versionadded:: 0.20.0 - .. ipython:: python # Reverse every lowercase alphabetic word @@ -164,12 +177,12 @@ positional argument (a regex object) and return a string. repl = lambda m: m.group('two').swapcase() pd.Series(['Foo Bar Baz', np.nan]).str.replace(pat, repl) +.. versionadded:: 0.20.0 + The ``replace`` method also accepts a compiled regular expression object from :func:`re.compile` as a pattern. All flags should be included in the compiled regular expression object. -.. versionadded:: 0.20.0 - .. ipython:: python import re @@ -186,6 +199,7 @@ regular expression object will raise a ``ValueError``. 
--------------------------------------------------------------------------- ValueError: case and flags cannot be set when pat is a compiled regex + Indexing with ``.str`` ---------------------- @@ -432,7 +446,7 @@ Method Summary :meth:`~Series.str.join`;Join strings in each element of the Series with passed separator :meth:`~Series.str.get_dummies`;Split strings on the delimiter returning DataFrame of dummy variables :meth:`~Series.str.contains`;Return boolean array if each string contains pattern/regex - :meth:`~Series.str.replace`;Replace occurrences of pattern/regex with some other string or the return value of a callable given the occurrence + :meth:`~Series.str.replace`;Replace occurrences of pattern/regex/string with some other string or the return value of a callable given the occurrence :meth:`~Series.str.repeat`;Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``) :meth:`~Series.str.pad`;"Add whitespace to left, right, or both sides of strings" :meth:`~Series.str.center`;Equivalent to ``str.center`` diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst index 0398e2892cef5d..85e455de7d246e 100644 --- a/doc/source/tutorials.rst +++ b/doc/source/tutorials.rst @@ -180,13 +180,13 @@ Video Tutorials - `Pandas From The Ground Up `_ (2015) (2:24) - `GitHub repo `_ + `GitHub repo `__ - `Introduction Into Pandas `_ (2016) (1:28) - `GitHub repo `_ + `GitHub repo `__ - `Pandas: .head() to .tail() `_ (2016) (1:26) - `GitHub repo `_ + `GitHub repo `__ Various Tutorials diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index ee93f06fbc9588..09a52ee527cb5f 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -870,7 +870,7 @@ Andrews Curves Andrews curves allow one to plot multivariate data as a large number of curves that are created using the attributes of samples as coefficients -for Fourier series, see the `Wikipedia entry`_ +for Fourier series, see the `Wikipedia entry `__ for more information. By coloring these curves differently for each class it is possible to visualize data clustering. Curves belonging to samples of the same class will usually be closer together and form larger structures. @@ -894,7 +894,7 @@ Parallel Coordinates ~~~~~~~~~~~~~~~~~~~~ Parallel coordinates is a plotting technique for plotting multivariate data, -see the `Wikipedia entry`_ +see the `Wikipedia entry `__ for an introduction. Parallel coordinates allows one to see clusters in data and to estimate other statistics visually. Using parallel coordinates points are represented as connected line segments. @@ -962,7 +962,7 @@ all time-lag separations. If time series is non-random then one or more of the autocorrelations will be significantly non-zero. The horizontal lines displayed in the plot correspond to 95% and 99% confidence bands. The dashed line is 99% confidence band. See the -`Wikipedia entry`_ for more about +`Wikipedia entry `__ for more about autocorrelation plots. .. ipython:: python @@ -1032,7 +1032,7 @@ unit interval). The point in the plane, where our sample settles to (where the forces acting on our sample are at an equilibrium) is where a dot representing our sample will be drawn. Depending on which class that sample belongs it will be colored differently. -See the R package `Radviz`_ +See the R package `Radviz `__ for more information. **Note**: The "Iris" dataset is available `here `__. 
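As a minimal sketch of the plotting helpers discussed above (assuming matplotlib is available and the "Iris" dataset has been saved locally as ``iris.csv`` with its usual ``'Name'`` class column; the filename is hypothetical):

.. code-block:: python

   import matplotlib.pyplot as plt
   import pandas as pd
   from pandas.plotting import andrews_curves, radviz

   df = pd.read_csv('iris.csv')  # hypothetical local copy of the Iris dataset

   # color each curve by the class of its sample
   andrews_curves(df, 'Name')

   # draw the RadViz projection on a fresh figure
   plt.figure()
   radviz(df, 'Name')
   plt.show()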
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index b1e8aa10457f8c..9e1dc391d7ace3 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -313,7 +313,7 @@ Other Enhancements - Add/delete ``str/dt/cat`` accessors dynamically from ``__dir__``. (:issue:`9910`) - Add ``normalize`` as a ``dt`` accessor method. (:issue:`10047`) -- ``DataFrame`` and ``Series`` now have ``_constructor_expanddim`` property as overridable constructor for one higher dimensionality data. This should be used only when it is really needed, see :ref:`here ` +- ``DataFrame`` and ``Series`` now have ``_constructor_expanddim`` property as overridable constructor for one higher dimensionality data. This should be used only when it is really needed, see :ref:`here ` - ``pd.lib.infer_dtype`` now returns ``'bytes'`` in Python 3 where appropriate. (:issue:`10032`) diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt index bfe44290e49d2c..91ec0c3038985d 100644 --- a/doc/source/whatsnew/v0.16.2.txt +++ b/doc/source/whatsnew/v0.16.2.txt @@ -63,10 +63,10 @@ of ``(function, keyword)`` indicating where the DataFrame should flow. For examp bb = pd.read_csv('data/baseball.csv', index_col='id') - # sm.poisson takes (formula, data) + # sm.ols takes (formula, data) (bb.query('h > 0') .assign(ln_h = lambda df: np.log(df.h)) - .pipe((sm.poisson, 'data'), 'hr ~ ln_h + year + g + C(lg)') + .pipe((sm.ols, 'data'), 'hr ~ ln_h + year + g + C(lg)') .fit() .summary() ) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 5330f7e7e998b8..f686a042c1a74f 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -3,7 +3,7 @@ v0.23.0 ------- -This is a major release from 0.21.1 and includes a number of API changes, +This is a major release from 0.22.0 and includes a number of API changes, deprecations, new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. @@ -117,7 +117,7 @@ resetting indexes. See the :ref:`Sorting by Indexes and Values # Sort by 'second' (index) and 'A' (column) df_multi.sort_values(by=['second', 'A']) -.. _whatsnew_0230.enhancements.timedelta_mod +.. _whatsnew_0230.enhancements.timedelta_mod: Timedelta mod method ^^^^^^^^^^^^^^^^^^^^ @@ -249,7 +249,7 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python using ``.assign()`` to update an existing column. Previously, callables referring to other variables being updated would get the "old" values - Previous Behaviour: + Previous Behavior: .. code-block:: ipython @@ -262,12 +262,43 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python 1 3 -2 2 4 -3 - New Behaviour: + New Behavior: .. ipython:: python df.assign(A=df.A+1, C= lambda df: df.A* -1) + +.. _whatsnew_0230.enhancements.astype_category: + +``DataFrame.astype`` performs column-wise conversion to ``Categorical`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`DataFrame.astype` can now perform column-wise conversion to ``Categorical`` by supplying the string ``'category'`` or +a :class:`~pandas.api.types.CategoricalDtype`. Previously, attempting this would raise a ``NotImplementedError``. See the +:ref:`categorical.objectcreation` section of the documentation for more details and examples. 
(:issue:`12860`, :issue:`18099`) + +Supplying the string ``'category'`` performs column-wise conversion, with only labels appearing in a given column set as categories: + +.. ipython:: python + + df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}) + df = df.astype('category') + df['A'].dtype + df['B'].dtype + + +Supplying a ``CategoricalDtype`` will make the categories in each column consistent with the supplied dtype: + +.. ipython:: python + + from pandas.api.types import CategoricalDtype + df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}) + cdt = CategoricalDtype(categories=list('abcd'), ordered=True) + df = df.astype(cdt) + df['A'].dtype + df['B'].dtype + .. _whatsnew_0230.enhancements.other: Other Enhancements @@ -306,7 +337,13 @@ Other Enhancements - Added :func:`SeriesGroupBy.is_monotonic_increasing` and :func:`SeriesGroupBy.is_monotonic_decreasing` (:issue:`17015`) - For subclassed ``DataFrames``, :func:`DataFrame.apply` will now preserve the ``Series`` subclass (if defined) when passing the data to the applied function (:issue:`19822`) - :func:`DataFrame.from_dict` now accepts a ``columns`` argument that can be used to specify the column names when ``orient='index'`` is used (:issue:`18529`) - +- Added option ``display.html.use_mathjax`` so `MathJax `_ can be disabled when rendering tables in ``Jupyter`` notebooks (:issue:`19856`, :issue:`19824`) +- :func:`DataFrame.replace` now supports the ``method`` parameter, which can be used to specify the replacement method when ``to_replace`` is a scalar, list or tuple and ``value`` is ``None`` (:issue:`19632`) +- :meth:`Timestamp.month_name`, :meth:`DatetimeIndex.month_name`, and :meth:`Series.dt.month_name` are now available (:issue:`12805`) +- :meth:`Timestamp.day_name` and :meth:`DatetimeIndex.day_name` are now available to return day names with a specified locale (:issue:`12806`) +- :meth:`DataFrame.to_sql` now performs a multivalue insert if the underlying connection supports it, rather than inserting row by row. + ``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`) +- :func:`read_html` now accepts a ``displayed_only`` keyword argument to control whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`) .. _whatsnew_0230.api_breaking: @@ -329,6 +366,57 @@ If installed, we now require: | openpyxl | 2.4.0 | | +-----------------+-----------------+----------+ +.. _whatsnew_0230.api_breaking.dict_insertion_order: + +Instantiation from dicts preserves dict insertion order for Python 3.6+ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Until Python 3.6, dicts in Python had no formally defined ordering. For Python +version 3.6 and later, dicts are ordered by insertion order, see +`PEP 468 `_. +Pandas will use the dict's insertion order when creating a ``Series`` or +``DataFrame`` from a dict, provided you are using Python version 3.6 or +higher. (:issue:`19884`) + +Previous Behavior (and current behavior if on Python < 3.6): + +.. code-block:: ipython + + In [1]: pd.Series({'Income': 2000, + ... 'Expenses': -1500, + ... 'Taxes': -200, + ... 'Net result': 300}) + Expenses -1500 + Income 2000 + Net result 300 + Taxes -200 + dtype: int64 + +Note the Series above is ordered alphabetically by the index values. + +New Behavior (for Python >= 3.6): + +.. 
ipython:: python + + pd.Series({'Income': 2000, + 'Expenses': -1500, + 'Taxes': -200, + 'Net result': 300}) + +Notice that the Series is now ordered by insertion order. This new behavior is +used for all relevant pandas types (``Series``, ``DataFrame``, ``SparseSeries`` +and ``SparseDataFrame``). + +If you wish to retain the old behavior while using Python >= 3.6, you can use +``.sort_index()``: + +.. ipython:: python + + pd.Series({'Income': 2000, + 'Expenses': -1500, + 'Taxes': -200, + 'Net result': 300}).sort_index() + .. _whatsnew_0230.api_breaking.deprecate_panel: Deprecate Panel @@ -587,6 +675,7 @@ Datetimelike API Changes - Subtraction of :class:`Series` with timezone-aware ``dtype='datetime64[ns]'`` with mis-matched timezones will raise ``TypeError`` instead of ``ValueError`` (:issue:`18817`) - :func:`pandas.merge` provides a more informative error message when trying to merge on timezone-aware and timezone-naive columns (:issue:`15800`) - For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with ``freq=None``, addition or subtraction of integer-dtyped array or ``Index`` will raise ``NullFrequencyError`` instead of ``TypeError`` (:issue:`19895`) +- :class:`Timestamp` constructor now accepts a `nanosecond` keyword or positional argument (:issue:`18898`) .. _whatsnew_0230.api.other: @@ -620,6 +709,8 @@ Other API Changes - Set operations (union, difference...) on :class:`IntervalIndex` with incompatible index types will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`19329`) - :class:`DateOffset` objects render more simply, e.g. ```` instead of ```` (:issue:`19403`) - ``Categorical.fillna`` now validates its ``value`` and ``method`` keyword arguments. It now raises when both or none are specified, matching the behavior of :meth:`Series.fillna` (:issue:`19682`) +- ``pd.to_datetime('today')`` now returns a datetime, consistent with ``pd.Timestamp('today')``; previously ``pd.to_datetime('today')`` returned a ``.normalized()`` datetime (:issue:`19935`) +- :func:`Series.str.replace` now takes an optional `regex` keyword which, when set to ``False``, uses literal string replacement rather than regex replacement (:issue:`16808`) .. _whatsnew_0230.deprecations: @@ -642,6 +733,9 @@ Deprecations - The ``broadcast`` parameter of ``.apply()`` is deprecated in favor of ``result_type='broadcast'`` (:issue:`18577`) - The ``reduce`` parameter of ``.apply()`` is deprecated in favor of ``result_type='reduce'`` (:issue:`18577`) - The ``order`` parameter of :func:`factorize` is deprecated and will be removed in a future release (:issue:`19727`) +- :attr:`Timestamp.weekday_name`, :attr:`DatetimeIndex.weekday_name`, and :attr:`Series.dt.weekday_name` are deprecated in favor of :meth:`Timestamp.day_name`, :meth:`DatetimeIndex.day_name`, and :meth:`Series.dt.day_name` (:issue:`12806`) + +- ``pandas.tseries.plotting.tsplot`` is deprecated. Use :func:`Series.plot` instead (:issue:`18627`) .. 
_whatsnew_0230.prior_deprecations: @@ -670,7 +764,7 @@ Removal of prior version deprecations/changes - The ``Panel4D`` and ``PanelND`` classes have been removed (:issue:`13776`) - The ``Panel`` class has dropped the ``to_long`` and ``toLong`` methods (:issue:`19077`) - The options ``display.line_with`` and ``display.height`` are removed in favor of ``display.width`` and ``display.max_rows`` respectively (:issue:`4391`, :issue:`19107`) -- The ``labels`` attribute of the ``Categorical`` class has been removed in favor of :attribute:`Categorical.codes` (:issue:`7768`) +- The ``labels`` attribute of the ``Categorical`` class has been removed in favor of :attr:`Categorical.codes` (:issue:`7768`) - The ``flavor`` parameter have been removed from func:`to_sql` method (:issue:`13611`) - The modules ``pandas.tools.hashing`` and ``pandas.util.hashing`` have been removed (:issue:`16223`) - The top-level functions ``pd.rolling_*``, ``pd.expanding_*`` and ``pd.ewm*`` have been removed (Deprecated since v0.18). @@ -697,9 +791,11 @@ Performance Improvements - Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`) - Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`) - Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`) -- Improved performance of :func:`DataFrameGroupBy.rank` (:issue:`15779`) +- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` (:issue:`15779`) - Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`) -- Improved performance of ``GroupBy.ffill`` and ``GroupBy.bfill`` (:issue:`11296`) +- Improved performance of :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` (:issue:`11296`) +- Improved performance of :func:`pandas.core.groupby.GroupBy.any` and :func:`pandas.core.groupby.GroupBy.all` (:issue:`15435`) +- Improved performance of :func:`pandas.core.groupby.GroupBy.pct_change` (:issue:`19165`) .. 
_whatsnew_0230.docs: @@ -799,6 +895,7 @@ Timezones - Bug in the :class:`DataFrame` constructor, where tz-aware Datetimeindex and a given column name will result in an empty ``DataFrame`` (:issue:`19157`) - Bug in :func:`Timestamp.tz_localize` where localizing a timestamp near the minimum or maximum valid values could overflow and return a timestamp with an incorrect nanosecond value (:issue:`12677`) - Bug when iterating over :class:`DatetimeIndex` that was localized with fixed timezone offset that rounded nanosecond precision to microseconds (:issue:`19603`) +- Bug in :func:`DataFrame.diff` that raised an ``IndexError`` with tz-aware values (:issue:`18578`) Offsets ^^^^^^^ @@ -812,6 +909,7 @@ Offsets Numeric ^^^^^^^ +- Bug in :meth:`DataFrame.rank` and :meth:`Series.rank` when ``method='dense'`` and ``pct=True`` in which percentile ranks were not being used with the number of distinct observations (:issue:`15630`) - Bug in :class:`Series` constructor with an int or float list where specifying ``dtype=str``, ``dtype='str'`` or ``dtype='U'`` failed to convert the data elements to strings (:issue:`16605`) - Bug in :class:`Index` multiplication and division methods where operating with a ``Series`` would return an ``Index`` object instead of a ``Series`` object (:issue:`19042`) - Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`) @@ -837,6 +935,7 @@ Indexing - Bug in :func:`IntervalIndex.symmetric_difference` where the symmetric difference with a non-``IntervalIndex`` did not raise (:issue:`18475`) - Bug in :class:`IntervalIndex` where set operations that returned an empty ``IntervalIndex`` had the wrong dtype (:issue:`19101`) - Bug in :meth:`DataFrame.drop_duplicates` where no ``KeyError`` is raised when passing in columns that don't exist on the ``DataFrame`` (issue:`19726`) +- Bug in ``Index`` subclasses constructors that ignore unexpected keyword arguments (:issue:`19348`) MultiIndex @@ -867,8 +966,9 @@ I/O - Bug in :func:`read_json` where large numeric values were causing an ``OverflowError`` (:issue:`18842`) - Bug in :func:`DataFrame.to_parquet` where an exception was raised if the write destination is S3 (:issue:`19134`) - :class:`Interval` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`) -- :class:`Timedelta` now supported in :func:`DataFrame.to_excel` for xls file type (:issue:`19242`, :issue:`9155`) +- :class:`Timedelta` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`, :issue:`9155`, :issue:`19900`) - Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. 
Now returns an empty dict (:issue:`19417`) +- Bug in :func:`read_pickle` when unpickling objects with :class:`TimedeltaIndex` or :class:`Float64Index` created with pandas prior to version 0.20 (:issue:`19939`) Plotting ^^^^^^^^ @@ -890,6 +990,7 @@ Groupby/Resample/Rolling - Bug in :func:`DataFrame.groupby` passing the `on=` kwarg, and subsequently using ``.apply()`` (:issue:`17813`) - Bug in :func:`DataFrame.resample().aggregate` not raising a ``KeyError`` when aggregating a non-existent column (:issue:`16766`, :issue:`19566`) - Fixed a performance regression for ``GroupBy.nth`` and ``GroupBy.last`` with some object columns (:issue:`19283`) +- Bug in :func:`DataFrameGroupBy.cumsum` and :func:`DataFrameGroupBy.cumprod` when ``skipna`` was passed (:issue:`19806`) Sparse ^^^^^^ @@ -920,6 +1021,8 @@ Reshaping - Bug in :func:`qcut` where datetime and timedelta data with ``NaT`` present raised a ``ValueError`` (:issue:`19768`) - Bug in :func:`DataFrame.iterrows`, which would infers strings not compliant to `ISO8601 `_ to datetimes (:issue:`19671`) - Bug in :class:`Series` constructor with ``Categorical`` where a ```ValueError`` is not raised when an index of different length is given (:issue:`19342`) +- Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`) +- Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`) Other ^^^^^ diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index 2f40bd4349a2e5..9348d7525c3072 100644 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -213,7 +213,10 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, sum_ranks = dups = 0 {{endif}} if pct: - return ranks / count + if tiebreak == TIEBREAK_DENSE: + return ranks / total_tie_count + else: + return ranks / count else: return ranks @@ -385,7 +388,10 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', ranks[i, argsorted[i, z]] = total_tie_count sum_ranks = dups = 0 if pct: - ranks[i, :] /= count + if tiebreak == TIEBREAK_DENSE: + ranks[i, :] /= total_tie_count + else: + ranks[i, :] /= count if axis == 0: return ranks.T else: diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e3d208a915225e..43afd1e0f5969e 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -139,7 +139,8 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, def group_cumprod_float64(float64_t[:, :] out, float64_t[:, :] values, int64_t[:] labels, - bint is_datetimelike): + bint is_datetimelike, + bint skipna=True): """ Only transforms on axis=0 """ @@ -163,6 +164,11 @@ def group_cumprod_float64(float64_t[:, :] out, if val == val: accum[lab, j] *= val out[i, j] = accum[lab, j] + else: + out[i, j] = NaN + if not skipna: + accum[lab, j] = NaN + break @cython.boundscheck(False) @@ -170,7 +176,8 @@ def group_cumprod_float64(float64_t[:, :] out, def group_cumsum(numeric[:, :] out, numeric[:, :] values, int64_t[:] labels, - is_datetimelike): + is_datetimelike, + bint skipna=True): """ Only transforms on axis=0 """ @@ -196,6 +203,11 @@ def group_cumsum(numeric[:, :] out, if val == val: accum[lab, j] += val out[i, j] = accum[lab, j] + else: + out[i, j] = NaN + if not skipna: + accum[lab, j] = NaN + break else: accum[lab, j] += val out[i, j] = accum[lab, j] @@ -310,5 +322,62 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, filled_vals = 0 
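# A user-level sketch (with a hypothetical frame) of the behavior these
# kernels back: the ``skipna`` flag added to group_cumsum/group_cumprod above
# surfaces through GroupBy.cumsum/GroupBy.cumprod, and group_any_all below
# backs GroupBy.any/GroupBy.all:
#
#     >>> import numpy as np
#     >>> import pandas as pd
#     >>> df = pd.DataFrame({'key': [0, 0, 0], 'val': [1.0, np.nan, 2.0]})
#     >>> df.groupby('key')['val'].cumsum(skipna=False)  # NaN now propagates
#     0    1.0
#     1    NaN
#     2    NaN
#     Name: val, dtype: float64
#     >>> df.groupby('key')['val'].all()  # NaN is skipped by default
#     key
#     0    True
#     Name: val, dtype: bool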
+@cython.boundscheck(False) +@cython.wraparound(False) +def group_any_all(ndarray[uint8_t] out, + ndarray[int64_t] labels, + ndarray[uint8_t] values, + ndarray[uint8_t] mask, + object val_test, + bint skipna): + """Aggregate boolean values to show the truthfulness of group elements + + Parameters + ---------- + out : array to which this method will write its results + labels : array containing unique label for each group, with its + ordering matching up to the corresponding record in `values` + values : array containing the truth value of each element + mask : array indicating whether a value is na or not + val_test : str {'any', 'all'} + String object dictating whether to use any or all truth testing + skipna : boolean + Flag to ignore nan values during truth testing + + Notes + ----- + This method modifies the `out` parameter rather than returning an object. + The values written will either be 0 or 1 (False or True, respectively). + """ + cdef: + Py_ssize_t i, N=len(labels) + int64_t lab + uint8_t flag_val + + if val_test == 'all': + # Because the 'all' value of an empty iterable in Python is True, we + # can start with an array full of ones and set to zero when a False + # value is encountered + flag_val = 0 + elif val_test == 'any': + # Because the 'any' value of an empty iterable in Python is False, we + # can start with an array full of zeros and set to one only if any + # value encountered is True + flag_val = 1 + else: + raise ValueError("'val_test' must be either 'any' or 'all'!") + + out.fill(1 - flag_val) + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0 or (skipna and mask[i]): + continue + + if values[i] == flag_val: + out[lab] = flag_val + + # generated from template include "groupby_helper.pxi"
diff --git a/pandas/_libs/lib.pxd b/pandas/_libs/lib.pxd deleted file mode 100644 index b06c071c358c11..00000000000000 --- a/pandas/_libs/lib.pxd +++ /dev/null @@ -1,3 +0,0 @@ -# prototypes for sharing - -cpdef bint is_period(val)
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e1d59f807a7fdb..30521760327b46 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -21,11 +21,9 @@ from cpython cimport (Py_INCREF, PyTuple_SET_ITEM, PyBytes_Check, PyUnicode_Check, PyTuple_New, + Py_EQ, PyObject_RichCompareBool) -cimport cpython - - from cpython.datetime cimport (PyDateTime_Check, PyDate_Check, PyTime_Check, PyDelta_Check, PyDateTime_IMPORT) @@ -105,6 +103,14 @@ def item_from_zerodim(object val): """ If the value is a zerodim array, return the item it contains. 
+ Parameters + ---------- + val : object + + Returns + ------- + result : object + Examples -------- >>> item_from_zerodim(1) @@ -117,7 +123,9 @@ def item_from_zerodim(object val): array([1]) """ - return util.unbox_if_zerodim(val) + if cnp.PyArray_IsZeroDim(val): + return cnp.PyArray_ToScalar(cnp.PyArray_DATA(val), val) + return val @cython.wraparound(False) @@ -405,72 +413,6 @@ def maybe_booleans_to_slice(ndarray[uint8_t] mask): return slice(start, end) -@cython.wraparound(False) -@cython.boundscheck(False) -def scalar_compare(ndarray[object] values, object val, object op): - cdef: - Py_ssize_t i, n = len(values) - ndarray[uint8_t, cast=True] result - bint isnull_val - int flag - object x - - if op is operator.lt: - flag = cpython.Py_LT - elif op is operator.le: - flag = cpython.Py_LE - elif op is operator.gt: - flag = cpython.Py_GT - elif op is operator.ge: - flag = cpython.Py_GE - elif op is operator.eq: - flag = cpython.Py_EQ - elif op is operator.ne: - flag = cpython.Py_NE - else: - raise ValueError('Unrecognized operator') - - result = np.empty(n, dtype=bool).view(np.uint8) - isnull_val = checknull(val) - - if flag == cpython.Py_NE: - for i in range(n): - x = values[i] - if checknull(x): - result[i] = True - elif isnull_val: - result[i] = True - else: - try: - result[i] = PyObject_RichCompareBool(x, val, flag) - except (TypeError): - result[i] = True - elif flag == cpython.Py_EQ: - for i in range(n): - x = values[i] - if checknull(x): - result[i] = False - elif isnull_val: - result[i] = False - else: - try: - result[i] = PyObject_RichCompareBool(x, val, flag) - except (TypeError): - result[i] = False - - else: - for i in range(n): - x = values[i] - if checknull(x): - result[i] = False - elif isnull_val: - result[i] = False - else: - result[i] = PyObject_RichCompareBool(x, val, flag) - - return result.view(bool) - - @cython.wraparound(False) @cython.boundscheck(False) cpdef bint array_equivalent_object(object[:] left, object[:] right): @@ -486,115 +428,12 @@ cpdef bint array_equivalent_object(object[:] left, object[:] right): # we are either not equal or both nan # I think None == None will be true here - if not (PyObject_RichCompareBool(x, y, cpython.Py_EQ) or + if not (PyObject_RichCompareBool(x, y, Py_EQ) or _checknull(x) and _checknull(y)): return False return True -@cython.wraparound(False) -@cython.boundscheck(False) -def vec_compare(ndarray[object] left, ndarray[object] right, object op): - cdef: - Py_ssize_t i, n = len(left) - ndarray[uint8_t, cast=True] result - int flag - - if n != len(right): - raise ValueError('Arrays were different lengths: %d vs %d' - % (n, len(right))) - - if op is operator.lt: - flag = cpython.Py_LT - elif op is operator.le: - flag = cpython.Py_LE - elif op is operator.gt: - flag = cpython.Py_GT - elif op is operator.ge: - flag = cpython.Py_GE - elif op is operator.eq: - flag = cpython.Py_EQ - elif op is operator.ne: - flag = cpython.Py_NE - else: - raise ValueError('Unrecognized operator') - - result = np.empty(n, dtype=bool).view(np.uint8) - - if flag == cpython.Py_NE: - for i in range(n): - x = left[i] - y = right[i] - - if checknull(x) or checknull(y): - result[i] = True - else: - result[i] = PyObject_RichCompareBool(x, y, flag) - else: - for i in range(n): - x = left[i] - y = right[i] - - if checknull(x) or checknull(y): - result[i] = False - else: - result[i] = PyObject_RichCompareBool(x, y, flag) - - return result.view(bool) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def scalar_binop(ndarray[object] values, object val, object 
op): - cdef: - Py_ssize_t i, n = len(values) - ndarray[object] result - object x - - result = np.empty(n, dtype=object) - if _checknull(val): - result.fill(val) - return result - - for i in range(n): - x = values[i] - if _checknull(x): - result[i] = x - else: - result[i] = op(x, val) - - return maybe_convert_bool(result) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def vec_binop(ndarray[object] left, ndarray[object] right, object op): - cdef: - Py_ssize_t i, n = len(left) - ndarray[object] result - - if n != len(right): - raise ValueError('Arrays were different lengths: %d vs %d' - % (n, len(right))) - - result = np.empty(n, dtype=object) - - for i in range(n): - x = left[i] - y = right[i] - try: - result[i] = op(x, y) - except TypeError: - if _checknull(x): - result[i] = x - elif _checknull(y): - result[i] = y - else: - raise - - return maybe_convert_bool(result) - - def astype_intsafe(ndarray[object] arr, new_dtype): cdef: Py_ssize_t i, n = len(arr) diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx new file mode 100644 index 00000000000000..148018ece20e2f --- /dev/null +++ b/pandas/_libs/ops.pyx @@ -0,0 +1,296 @@ +# -*- coding: utf-8 -*- +# cython: profile=False +import operator + +from cpython cimport (PyFloat_Check, PyBool_Check, + PyObject_RichCompareBool, + Py_EQ, Py_NE, Py_LT, Py_LE, Py_GT, Py_GE) + +cimport cython +from cython cimport Py_ssize_t + +import numpy as np +from numpy cimport ndarray, uint8_t + + +from util cimport UINT8_MAX, _checknull + +from missing cimport checknull + + +@cython.wraparound(False) +@cython.boundscheck(False) +def scalar_compare(ndarray[object] values, object val, object op): + """ + Compare each element of `values` array with the scalar `val`, with + the comparison operation described by `op`. + + Parameters + ---------- + values : ndarray[object] + val : object + op : {operator.eq, operator.ne, + operator.le, operator.lt, + operator.ge, operator.gt} + + Returns + ------- + result : ndarray[bool] + """ + cdef: + Py_ssize_t i, n = len(values) + ndarray[uint8_t, cast=True] result + bint isnull_val + int flag + object x + + if op is operator.lt: + flag = Py_LT + elif op is operator.le: + flag = Py_LE + elif op is operator.gt: + flag = Py_GT + elif op is operator.ge: + flag = Py_GE + elif op is operator.eq: + flag = Py_EQ + elif op is operator.ne: + flag = Py_NE + else: + raise ValueError('Unrecognized operator') + + result = np.empty(n, dtype=bool).view(np.uint8) + isnull_val = checknull(val) + + if flag == Py_NE: + for i in range(n): + x = values[i] + if checknull(x): + result[i] = True + elif isnull_val: + result[i] = True + else: + try: + result[i] = PyObject_RichCompareBool(x, val, flag) + except (TypeError): + result[i] = True + elif flag == Py_EQ: + for i in range(n): + x = values[i] + if checknull(x): + result[i] = False + elif isnull_val: + result[i] = False + else: + try: + result[i] = PyObject_RichCompareBool(x, val, flag) + except (TypeError): + result[i] = False + + else: + for i in range(n): + x = values[i] + if checknull(x): + result[i] = False + elif isnull_val: + result[i] = False + else: + result[i] = PyObject_RichCompareBool(x, val, flag) + + return result.view(bool) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def vec_compare(ndarray[object] left, ndarray[object] right, object op): + """ + Compare the elements of `left` with the elements of `right` pointwise, + with the comparison operation described by `op`. 
+ + Parameters + ---------- + left : ndarray[object] + right : ndarray[object] + op : {operator.eq, operator.ne, + operator.le, operator.lt, + operator.ge, operator.gt} + + Returns + ------- + result : ndarray[bool] + """ + cdef: + Py_ssize_t i, n = len(left) + ndarray[uint8_t, cast=True] result + int flag + + if n != len(right): + raise ValueError('Arrays were different lengths: %d vs %d' + % (n, len(right))) + + if op is operator.lt: + flag = Py_LT + elif op is operator.le: + flag = Py_LE + elif op is operator.gt: + flag = Py_GT + elif op is operator.ge: + flag = Py_GE + elif op is operator.eq: + flag = Py_EQ + elif op is operator.ne: + flag = Py_NE + else: + raise ValueError('Unrecognized operator') + + result = np.empty(n, dtype=bool).view(np.uint8) + + if flag == Py_NE: + for i in range(n): + x = left[i] + y = right[i] + + if checknull(x) or checknull(y): + result[i] = True + else: + result[i] = PyObject_RichCompareBool(x, y, flag) + else: + for i in range(n): + x = left[i] + y = right[i] + + if checknull(x) or checknull(y): + result[i] = False + else: + result[i] = PyObject_RichCompareBool(x, y, flag) + + return result.view(bool) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def scalar_binop(ndarray[object] values, object val, object op): + """ + Apply the given binary operator `op` between each element of the array + `values` and the scalar `val`. + + Parameters + ---------- + values : ndarray[object] + val : object + op : binary operator + + Returns + ------- + result : ndarray[object] + """ + cdef: + Py_ssize_t i, n = len(values) + ndarray[object] result + object x + + result = np.empty(n, dtype=object) + if _checknull(val): + result.fill(val) + return result + + for i in range(n): + x = values[i] + if _checknull(x): + result[i] = x + else: + result[i] = op(x, val) + + return maybe_convert_bool(result) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def vec_binop(ndarray[object] left, ndarray[object] right, object op): + """ + Apply the given binary operator `op` pointwise to the elements of + arrays `left` and `right`. 
+ + Parameters + ---------- + left : ndarray[object] + right : ndarray[object] + op : binary operator + + Returns + ------- + result : ndarray[object] + """ + cdef: + Py_ssize_t i, n = len(left) + ndarray[object] result + + if n != len(right): + raise ValueError('Arrays were different lengths: %d vs %d' + % (n, len(right))) + + result = np.empty(n, dtype=object) + + for i in range(n): + x = left[i] + y = right[i] + try: + result[i] = op(x, y) + except TypeError: + if _checknull(x): + result[i] = x + elif _checknull(y): + result[i] = y + else: + raise + + return maybe_convert_bool(result) + + +def maybe_convert_bool(ndarray[object] arr, + true_values=None, false_values=None): + cdef: + Py_ssize_t i, n + ndarray[uint8_t] result + object val + set true_vals, false_vals + int na_count = 0 + + n = len(arr) + result = np.empty(n, dtype=np.uint8) + + # the defaults + true_vals = set(('True', 'TRUE', 'true')) + false_vals = set(('False', 'FALSE', 'false')) + + if true_values is not None: + true_vals = true_vals | set(true_values) + + if false_values is not None: + false_vals = false_vals | set(false_values) + + for i from 0 <= i < n: + val = arr[i] + + if PyBool_Check(val): + if val is True: + result[i] = 1 + else: + result[i] = 0 + elif val in true_vals: + result[i] = 1 + elif val in false_vals: + result[i] = 0 + elif PyFloat_Check(val): + result[i] = UINT8_MAX + na_count += 1 + else: + return arr + + if na_count > 0: + mask = result == UINT8_MAX + arr = result.view(np.bool_).astype(object) + np.putmask(arr, mask, np.nan) + return arr + else: + return result.view(np.bool_) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 89d2de6de213ab..52ca3d1226f796 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1045,7 +1045,7 @@ cdef class TextReader: usecols = set() if callable(self.usecols): if self.usecols(name): - usecols = set([i]) + usecols = {i} else: usecols = self.usecols if self.has_usecols and not (i in usecols or diff --git a/pandas/_libs/properties.pyx b/pandas/_libs/properties.pyx index 4beb24f07c21cc..67f58851a9a702 100644 --- a/pandas/_libs/properties.pyx +++ b/pandas/_libs/properties.pyx @@ -6,31 +6,28 @@ from cpython cimport ( PyDict_Contains, PyDict_GetItem, PyDict_SetItem) -cdef class cache_readonly(object): +cdef class CachedProperty(object): cdef readonly: - object func, name, allow_setting + object func, name, __doc__ - def __init__(self, func=None, allow_setting=False): - if func is not None: - self.func = func - self.name = func.__name__ - self.allow_setting = allow_setting - - def __call__(self, func, doc=None): + def __init__(self, func): self.func = func self.name = func.__name__ - return self + self.__doc__ = getattr(func, '__doc__', None) def __get__(self, obj, typ): - # Get the cache or set a default one if needed + if obj is None: + # accessed on the class, not the instance + return self + # Get the cache or set a default one if needed cache = getattr(obj, '_cache', None) if cache is None: try: cache = obj._cache = {} except (AttributeError): - return + return self if PyDict_Contains(cache, self.name): # not necessary to Py_INCREF @@ -40,20 +37,9 @@ cdef class cache_readonly(object): PyDict_SetItem(cache, self.name, val) return val - def __set__(self, obj, value): - - if not self.allow_setting: - raise Exception("cannot set values for [%s]" % self.name) - # Get the cache or set a default one if needed - cache = getattr(obj, '_cache', None) - if cache is None: - try: - cache = obj._cache = {} - except (AttributeError): - return 
+cache_readonly = CachedProperty - PyDict_SetItem(cache, self.name, value) cdef class AxisProperty(object): cdef: diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 75bff34e4a3917..1fa07dbed6822a 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -752,7 +752,7 @@ cdef class IntegerFloatValidator(Validator): return issubclass(self.dtype.type, np.integer) -cpdef bint is_integer_float_array(ndarray values): +cdef bint is_integer_float_array(ndarray values): cdef: IntegerFloatValidator validator = IntegerFloatValidator( len(values), @@ -803,7 +803,7 @@ cdef class UnicodeValidator(Validator): return issubclass(self.dtype.type, np.unicode_) -cpdef bint is_unicode_array(ndarray values, bint skipna=False): +cdef bint is_unicode_array(ndarray values, bint skipna=False): cdef: UnicodeValidator validator = UnicodeValidator( len(values), @@ -822,7 +822,7 @@ cdef class BytesValidator(Validator): return issubclass(self.dtype.type, np.bytes_) -cpdef bint is_bytes_array(ndarray values, bint skipna=False): +cdef bint is_bytes_array(ndarray values, bint skipna=False): cdef: BytesValidator validator = BytesValidator( len(values), @@ -1090,7 +1090,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, cdef: int status, maybe_int Py_ssize_t i, n = values.size - Seen seen = Seen(coerce_numeric); + Seen seen = Seen(coerce_numeric) ndarray[float64_t] floats = np.empty(n, dtype='f8') ndarray[complex128_t] complexes = np.empty(n, dtype='c16') ndarray[int64_t] ints = np.empty(n, dtype='i8') @@ -1224,7 +1224,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, ndarray[uint8_t] bools ndarray[int64_t] idatetimes ndarray[int64_t] itimedeltas - Seen seen = Seen(); + Seen seen = Seen() object val, onan float64_t fval, fnan @@ -1405,55 +1405,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return objects -def maybe_convert_bool(ndarray[object] arr, - true_values=None, false_values=None): - cdef: - Py_ssize_t i, n - ndarray[uint8_t] result - object val - set true_vals, false_vals - int na_count = 0 - - n = len(arr) - result = np.empty(n, dtype=np.uint8) - - # the defaults - true_vals = set(('True', 'TRUE', 'true')) - false_vals = set(('False', 'FALSE', 'false')) - - if true_values is not None: - true_vals = true_vals | set(true_values) - - if false_values is not None: - false_vals = false_vals | set(false_values) - - for i from 0 <= i < n: - val = arr[i] - - if cpython.PyBool_Check(val): - if val is True: - result[i] = 1 - else: - result[i] = 0 - elif val in true_vals: - result[i] = 1 - elif val in false_vals: - result[i] = 0 - elif PyFloat_Check(val): - result[i] = UINT8_MAX - na_count += 1 - else: - return arr - - if na_count > 0: - mask = result == UINT8_MAX - arr = result.view(np.bool_).astype(object) - np.putmask(arr, mask, np.nan) - return arr - else: - return result.view(np.bool_) - - def map_infer_mask(ndarray arr, object f, ndarray[uint8_t] mask, bint convert=1): """ diff --git a/pandas/_libs/src/period_helper.c b/pandas/_libs/src/period_helper.c index 19a7282f38049e..cb6f0a220fafe1 100644 --- a/pandas/_libs/src/period_helper.c +++ b/pandas/_libs/src/period_helper.c @@ -58,18 +58,6 @@ npy_int64 unix_date_from_ymd(int year, int month, int day) { return unix_date; } -/* Sets the date part of the date_info struct - Assumes GREGORIAN_CALENDAR */ -static int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, - npy_int64 unix_date) { - pandas_datetimestruct dts; - - 
pandas_datetime_to_datetimestruct(unix_date, PANDAS_FR_D, &dts); - dinfo->year = dts.year; - dinfo->month = dts.month; - dinfo->day = dts.day; - return 0; -} /////////////////////////////////////////////// @@ -139,9 +127,9 @@ static npy_int64 DtoB_weekday(npy_int64 unix_date) { return floordiv(unix_date + 4, 7) * 5 + mod_compat(unix_date + 4, 7) - 4; } -static npy_int64 DtoB(struct date_info *dinfo, +static npy_int64 DtoB(pandas_datetimestruct *dts, int roll_back, npy_int64 unix_date) { - int day_of_week = dayofweek(dinfo->year, dinfo->month, dinfo->day); + int day_of_week = dayofweek(dts->year, dts->month, dts->day); if (roll_back == 1) { if (day_of_week > 4) { @@ -161,32 +149,32 @@ static npy_int64 DtoB(struct date_info *dinfo, //************ FROM DAILY *************** static npy_int64 asfreq_DTtoA(npy_int64 ordinal, asfreq_info *af_info) { - struct date_info dinfo; + pandas_datetimestruct dts; ordinal = downsample_daytime(ordinal, af_info); - dInfoCalc_SetFromAbsDate(&dinfo, ordinal); - if (dinfo.month > af_info->to_a_year_end) { - return (npy_int64)(dinfo.year + 1 - 1970); + pandas_datetime_to_datetimestruct(ordinal, PANDAS_FR_D, &dts); + if (dts.month > af_info->to_end) { + return (npy_int64)(dts.year + 1 - 1970); } else { - return (npy_int64)(dinfo.year - 1970); + return (npy_int64)(dts.year - 1970); } } static int DtoQ_yq(npy_int64 ordinal, asfreq_info *af_info, int *year) { - struct date_info dinfo; + pandas_datetimestruct dts; int quarter; - dInfoCalc_SetFromAbsDate(&dinfo, ordinal); - if (af_info->to_q_year_end != 12) { - dinfo.month -= af_info->to_q_year_end; - if (dinfo.month <= 0) { - dinfo.month += 12; + pandas_datetime_to_datetimestruct(ordinal, PANDAS_FR_D, &dts); + if (af_info->to_end != 12) { + dts.month -= af_info->to_end; + if (dts.month <= 0) { + dts.month += 12; } else { - dinfo.year += 1; + dts.year += 1; } } - *year = dinfo.year; - quarter = monthToQuarter(dinfo.month); + *year = dts.year; + quarter = monthToQuarter(dts.month); return quarter; } @@ -200,29 +188,28 @@ static npy_int64 asfreq_DTtoQ(npy_int64 ordinal, asfreq_info *af_info) { } static npy_int64 asfreq_DTtoM(npy_int64 ordinal, asfreq_info *af_info) { - struct date_info dinfo; + pandas_datetimestruct dts; ordinal = downsample_daytime(ordinal, af_info); - dInfoCalc_SetFromAbsDate(&dinfo, ordinal); - return (npy_int64)((dinfo.year - 1970) * 12 + dinfo.month - 1); + pandas_datetime_to_datetimestruct(ordinal, PANDAS_FR_D, &dts); + return (npy_int64)((dts.year - 1970) * 12 + dts.month - 1); } static npy_int64 asfreq_DTtoW(npy_int64 ordinal, asfreq_info *af_info) { ordinal = downsample_daytime(ordinal, af_info); - return floordiv(ordinal + 3 - af_info->to_week_end, 7) + 1; + return floordiv(ordinal + 3 - af_info->to_end, 7) + 1; } static npy_int64 asfreq_DTtoB(npy_int64 ordinal, asfreq_info *af_info) { - struct date_info dinfo; int roll_back; - - ordinal = downsample_daytime(ordinal, af_info); - dInfoCalc_SetFromAbsDate(&dinfo, ordinal); + pandas_datetimestruct dts; + npy_int64 unix_date = downsample_daytime(ordinal, af_info); + pandas_datetime_to_datetimestruct(unix_date, PANDAS_FR_D, &dts); // This usage defines roll_back the opposite way from the others roll_back = 1 - af_info->is_end; - return DtoB(&dinfo, roll_back, ordinal); + return DtoB(&dts, roll_back, unix_date); } //************ FROM BUSINESS *************** @@ -252,7 +239,7 @@ static npy_int64 asfreq_BtoW(npy_int64 ordinal, asfreq_info *af_info) { //************ FROM WEEKLY *************** static npy_int64 asfreq_WtoDT(npy_int64 ordinal, asfreq_info 
*af_info) { - ordinal = ordinal * 7 + af_info->from_week_end - 4 + + ordinal = ordinal * 7 + af_info->from_end - 4 + (7 - 1) * (af_info->is_end - 1); return upsample_daytime(ordinal, af_info); } @@ -274,12 +261,13 @@ static npy_int64 asfreq_WtoW(npy_int64 ordinal, asfreq_info *af_info) { } static npy_int64 asfreq_WtoB(npy_int64 ordinal, asfreq_info *af_info) { - struct date_info dinfo; + int roll_back; + pandas_datetimestruct dts; npy_int64 unix_date = asfreq_WtoDT(ordinal, af_info); - int roll_back = af_info->is_end; - dInfoCalc_SetFromAbsDate(&dinfo, unix_date); - return DtoB(&dinfo, roll_back, unix_date); + pandas_datetime_to_datetimestruct(unix_date, PANDAS_FR_D, &dts); + roll_back = af_info->is_end; + return DtoB(&dts, roll_back, unix_date); } //************ FROM MONTHLY *************** @@ -313,12 +301,13 @@ static npy_int64 asfreq_MtoW(npy_int64 ordinal, asfreq_info *af_info) { } static npy_int64 asfreq_MtoB(npy_int64 ordinal, asfreq_info *af_info) { - struct date_info dinfo; + int roll_back; + pandas_datetimestruct dts; npy_int64 unix_date = asfreq_MtoDT(ordinal, af_info); - int roll_back = af_info->is_end; - dInfoCalc_SetFromAbsDate(&dinfo, unix_date); - return DtoB(&dinfo, roll_back, unix_date); + pandas_datetime_to_datetimestruct(unix_date, PANDAS_FR_D, &dts); + roll_back = af_info->is_end; + return DtoB(&dts, roll_back, unix_date); } //************ FROM QUARTERLY *************** @@ -328,8 +317,8 @@ static void QtoD_ym(npy_int64 ordinal, int *year, int *month, *year = floordiv(ordinal, 4) + 1970; *month = mod_compat(ordinal, 4) * 3 + 1; - if (af_info->from_q_year_end != 12) { - *month += af_info->from_q_year_end; + if (af_info->from_end != 12) { + *month += af_info->from_end; if (*month > 12) { *month -= 12; } else { @@ -367,23 +356,24 @@ static npy_int64 asfreq_QtoW(npy_int64 ordinal, asfreq_info *af_info) { } static npy_int64 asfreq_QtoB(npy_int64 ordinal, asfreq_info *af_info) { - struct date_info dinfo; + int roll_back; + pandas_datetimestruct dts; npy_int64 unix_date = asfreq_QtoDT(ordinal, af_info); - int roll_back = af_info->is_end; - dInfoCalc_SetFromAbsDate(&dinfo, unix_date); - return DtoB(&dinfo, roll_back, unix_date); + pandas_datetime_to_datetimestruct(unix_date, PANDAS_FR_D, &dts); + roll_back = af_info->is_end; + return DtoB(&dts, roll_back, unix_date); } //************ FROM ANNUAL *************** -static void AtoD_ym(npy_int64 ordinal, int *year, int *month, +static void AtoD_ym(npy_int64 ordinal, npy_int64 *year, int *month, asfreq_info *af_info) { *year = ordinal + 1970; *month = 1; - if (af_info->from_a_year_end != 12) { - *month += af_info->from_a_year_end; + if (af_info->from_end != 12) { + *month += af_info->from_end; if (*month > 12) { // This case is never reached, but is kept for symmetry // with QtoD_ym @@ -395,8 +385,8 @@ static void AtoD_ym(npy_int64 ordinal, int *year, int *month, } static npy_int64 asfreq_AtoDT(npy_int64 ordinal, asfreq_info *af_info) { - npy_int64 unix_date; - int year, month; + npy_int64 unix_date, year; + int month; ordinal += af_info->is_end; AtoD_ym(ordinal, &year, &month, af_info); @@ -423,12 +413,13 @@ static npy_int64 asfreq_AtoW(npy_int64 ordinal, asfreq_info *af_info) { } static npy_int64 asfreq_AtoB(npy_int64 ordinal, asfreq_info *af_info) { - struct date_info dinfo; + int roll_back; + pandas_datetimestruct dts; npy_int64 unix_date = asfreq_AtoDT(ordinal, af_info); - int roll_back = af_info->is_end; - dInfoCalc_SetFromAbsDate(&dinfo, unix_date); - return DtoB(&dinfo, roll_back, unix_date); + 
pandas_datetime_to_datetimestruct(unix_date, PANDAS_FR_D, &dts); + roll_back = af_info->is_end; + return DtoB(&dts, roll_back, unix_date); } static npy_int64 nofunc(npy_int64 ordinal, asfreq_info *af_info) { diff --git a/pandas/_libs/src/period_helper.h b/pandas/_libs/src/period_helper.h index c6313924adddd0..8f538b261db9e2 100644 --- a/pandas/_libs/src/period_helper.h +++ b/pandas/_libs/src/period_helper.h @@ -81,27 +81,23 @@ typedef struct asfreq_info { // char relation == 'S' (for START) --> is_end = 0 // char relation == 'E' (for END) --> is_end = 1 - int from_week_end; // day the week ends on in the "from" frequency - int to_week_end; // day the week ends on in the "to" frequency - - int from_a_year_end; // month the year ends on in the "from" frequency - int to_a_year_end; // month the year ends on in the "to" frequency - - int from_q_year_end; // month the year ends on in the "from" frequency - int to_q_year_end; // month the year ends on in the "to" frequency + int from_end; + int to_end; + // weekly: + // from_end --> day the week ends on in the "from" frequency + // to_end --> day the week ends on in the "to" frequency + // + // annual: + // from_end --> month the year ends on in the "from" frequency + // to_end --> month the year ends on in the "to" frequency + // + // quarterly: + // from_end --> month the year ends on in the "from" frequency + // to_end --> month the year ends on in the "to" frequency npy_int64 intraday_conversion_factor; } asfreq_info; -typedef struct date_info { - double second; - int minute; - int hour; - int day; - int month; - int year; -} date_info; - typedef npy_int64 (*freq_conv_func)(npy_int64, asfreq_info *af_info); /* diff --git a/pandas/_libs/src/util.pxd b/pandas/_libs/src/util.pxd index cf23df1279f34b..5030b742849f83 100644 --- a/pandas/_libs/src/util.pxd +++ b/pandas/_libs/src/util.pxd @@ -164,22 +164,3 @@ cdef inline bint _checknan(object val): cdef inline bint is_period_object(object val): return getattr(val, '_typ', '_typ') == 'period' - - -cdef inline object unbox_if_zerodim(object arr): - """ - If arr is zerodim array, return a proper array scalar (e.g. np.int64). - Otherwise, return arr as is. 
- - Parameters - ---------- - arr : object - - Returns - ------- - result : object - """ - if cnp.PyArray_IsZeroDim(arr): - return cnp.PyArray_ToScalar(cnp.PyArray_DATA(arr), arr) - else: - return arr diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index fec7f21d6e6eba..17453d8af1297c 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -755,8 +755,7 @@ cdef inline bint _parse_today_now(str val, int64_t* iresult): iresult[0] = Timestamp.utcnow().value return True elif val == 'today': - # Note: this is *not* the same as Timestamp('today') - iresult[0] = Timestamp.now().normalize().value + iresult[0] = Timestamp.today().value return True return False diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index 9bd315b43ea9ea..0901d474d044c9 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -12,6 +12,8 @@ cimport numpy as cnp from numpy cimport int64_t, int32_t cnp.import_array() +from locale import LC_TIME +from strptime import LocaleTime # ---------------------------------------------------------------------- # Constants @@ -35,11 +37,18 @@ cdef int32_t* _month_offset = [ # Canonical location for other modules to find name constants MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] +# The first blank line is consistent with calendar.month_name in the calendar +# standard library +MONTHS_FULL = ['', 'January', 'February', 'March', 'April', 'May', 'June', + 'July', 'August', 'September', 'October', 'November', + 'December'] MONTH_NUMBERS = {name: num for num, name in enumerate(MONTHS)} MONTH_ALIASES = {(num + 1): name for num, name in enumerate(MONTHS)} MONTH_TO_CAL_NUM = {name: num + 1 for num, name in enumerate(MONTHS)} DAYS = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] +DAYS_FULL = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', + 'Saturday', 'Sunday'] int_to_weekday = {num: name for num, name in enumerate(DAYS)} weekday_to_int = {int_to_weekday[key]: key for key in int_to_weekday} @@ -199,3 +208,23 @@ cpdef int32_t get_day_of_year(int year, int month, int day) nogil: day_of_year = mo_off + day return day_of_year + + +cpdef get_locale_names(object name_type, object locale=None): + """Returns an array of localized day or month names + + Parameters + ---------- + name_type : string, attribute of LocaleTime() in which to return localized + names + locale : string + + Returns + ------- + list of locale names + + """ + from pandas.util.testing import set_locale + + with set_locale(locale, LC_TIME): + return getattr(LocaleTime(), name_type) diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 868c2641b34db4..8f887dc3af203d 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -16,7 +16,8 @@ cdef class _TSObject: cdef convert_to_tsobject(object ts, object tz, object unit, - bint dayfirst, bint yearfirst) + bint dayfirst, bint yearfirst, + int32_t nanos=*) cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, int32_t nanos=*) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index beaca1a8483c77..f4841e6abb7e88 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -252,7 +252,7 @@ cpdef int64_t pydt_to_i8(object pydt) except? 
-1: cdef convert_to_tsobject(object ts, object tz, object unit, - bint dayfirst, bint yearfirst): + bint dayfirst, bint yearfirst, int32_t nanos=0): """ Extract datetime and int64 from any of: - np.int64 (with unit providing a possible modifier) @@ -297,7 +297,7 @@ cdef convert_to_tsobject(object ts, object tz, object unit, obj.value = ts dt64_to_dtstruct(ts, &obj.dts) elif PyDateTime_Check(ts): - return convert_datetime_to_tsobject(ts, tz) + return convert_datetime_to_tsobject(ts, tz, nanos) elif PyDate_Check(ts): # Keep the converter same as PyDateTime's ts = datetime.combine(ts, datetime_time()) @@ -543,7 +543,6 @@ cdef inline void localize_tso(_TSObject obj, tzinfo tz): ndarray[int64_t] trans, deltas int64_t delta, local_val Py_ssize_t posn - datetime dt assert obj.tzinfo is None @@ -679,7 +678,6 @@ cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2): Py_ssize_t pos int64_t v, offset, utc_date pandas_datetimestruct dts - datetime dt # See GH#17734 We should always be converting either from UTC or to UTC assert (is_utc(tz1) or tz1 == 'UTC') or (is_utc(tz2) or tz2 == 'UTC') @@ -739,7 +737,6 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): ndarray[Py_ssize_t] posn int64_t v, offset, delta pandas_datetimestruct dts - datetime dt if len(vals) == 0: return np.array([], dtype=np.int64) @@ -844,7 +841,6 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, pandas_datetimestruct dts bint infer_dst = False, is_dst = False, fill = False bint is_coerce = errors == 'coerce', is_raise = errors == 'raise' - datetime dt # Vectorized version of DstTzInfo.localize @@ -1086,7 +1082,6 @@ cdef ndarray[int64_t] _normalize_local(ndarray[int64_t] stamps, object tz): ndarray[int64_t] result = np.empty(n, dtype=np.int64) ndarray[int64_t] trans, deltas, pos pandas_datetimestruct dts - datetime dt if is_utc(tz): with nogil: diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 7a4b9775bd56eb..ccf67e765e079d 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -13,7 +13,7 @@ cimport numpy as cnp from numpy cimport ndarray, int64_t, int32_t, int8_t cnp.import_array() - +from ccalendar import get_locale_names, MONTHS_FULL, DAYS_FULL from ccalendar cimport (get_days_in_month, is_leapyear, dayofweek, get_week_of_year, get_day_of_year) from np_datetime cimport (pandas_datetimestruct, pandas_timedeltastruct, @@ -85,26 +85,27 @@ def build_field_sarray(ndarray[int64_t] dtindex): @cython.wraparound(False) @cython.boundscheck(False) -def get_date_name_field(ndarray[int64_t] dtindex, object field): +def get_date_name_field(ndarray[int64_t] dtindex, object field, + object locale=None): """ Given a int64-based datetime index, return array of strings of date name based on requested field (e.g. 
weekday_name) """ cdef: Py_ssize_t i, count = 0 - ndarray[object] out + ndarray[object] out, names pandas_datetimestruct dts int dow - _dayname = np.array( - ['Monday', 'Tuesday', 'Wednesday', 'Thursday', - 'Friday', 'Saturday', 'Sunday'], - dtype=np.object_) - count = len(dtindex) out = np.empty(count, dtype=object) - if field == 'weekday_name': + if field == 'day_name' or field == 'weekday_name': + if locale is None: + names = np.array(DAYS_FULL, dtype=np.object_) + else: + names = np.array(get_locale_names('f_weekday', locale), + dtype=np.object_) for i in range(count): if dtindex[i] == NPY_NAT: out[i] = np.nan @@ -112,7 +113,21 @@ def get_date_name_field(ndarray[int64_t] dtindex, object field): dt64_to_dtstruct(dtindex[i], &dts) dow = dayofweek(dts.year, dts.month, dts.day) - out[i] = _dayname[dow] + out[i] = names[dow].capitalize() + return out + elif field == 'month_name': + if locale is None: + names = np.array(MONTHS_FULL, dtype=np.object_) + else: + names = np.array(get_locale_names('f_month', locale), + dtype=np.object_) + for i in range(count): + if dtindex[i] == NPY_NAT: + out[i] = np.nan + continue + + dt64_to_dtstruct(dtindex[i], &dts) + out[i] = names[dts.month].capitalize() return out raise ValueError("Field %s not supported" % field) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 9f4ef4e5150584..be76b55fa169bc 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- # cython: profile=False -import warnings from cpython cimport ( PyFloat_Check, PyComplex_Check, @@ -39,24 +38,19 @@ _nat_scalar_rules[Py_GE] = False # ---------------------------------------------------------------------- -def _make_nan_func(func_name, cls): +def _make_nan_func(func_name, doc): def f(*args, **kwargs): return np.nan f.__name__ = func_name - f.__doc__ = getattr(cls, func_name).__doc__ + f.__doc__ = doc return f -def _make_nat_func(func_name, cls): +def _make_nat_func(func_name, doc): def f(*args, **kwargs): return NaT - f.__name__ = func_name - if isinstance(cls, str): - # passed the literal docstring directly - f.__doc__ = cls - else: - f.__doc__ = getattr(cls, func_name).__doc__ + f.__doc__ = doc return f @@ -318,11 +312,40 @@ class NaTType(_NaT): # These are the ones that can get their docstrings from datetime. # nan methods - weekday = _make_nan_func('weekday', datetime) - isoweekday = _make_nan_func('isoweekday', datetime) + weekday = _make_nan_func('weekday', datetime.weekday.__doc__) + isoweekday = _make_nan_func('isoweekday', datetime.isoweekday.__doc__) + month_name = _make_nan_func('month_name', # noqa:E128 + """ + Return the month name of the Timestamp with specified locale. + + Parameters + ---------- + locale : string, default None (English locale) + locale determining the language in which to return the month name + + Returns + ------- + month_name : string + + .. versionadded:: 0.23.0 + """) + day_name = _make_nan_func('day_name', # noqa:E128 + """ + Return the day name of the Timestamp with specified locale. + Parameters + ---------- + locale : string, default None (English locale) + locale determining the language in which to return the day name + + Returns + ------- + day_name : string + + .. 
versionadded:: 0.23.0 + """) # _nat_methods - date = _make_nat_func('date', datetime) + date = _make_nat_func('date', datetime.date.__doc__) utctimetuple = _make_error_func('utctimetuple', datetime) timetz = _make_error_func('timetz', datetime) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 09aeff852a0f21..2ecd55ea881702 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -57,7 +57,7 @@ _DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0, cdef object _TIMEPAT = re.compile(r'^([01]?[0-9]|2[0-3]):([0-5][0-9])') -cdef set _not_datelike_strings = set(['a', 'A', 'm', 'M', 'p', 'P', 't', 'T']) +cdef set _not_datelike_strings = {'a', 'A', 'm', 'M', 'p', 'P', 't', 'T'} NAT_SENTINEL = object() # This allows us to reference NaT without having to import it @@ -651,7 +651,7 @@ def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse, break # Only consider it a valid guess if we have a year, month and day - if len(set(['year', 'month', 'day']) & found_attrs) != 3: + if len({'year', 'month', 'day'} & found_attrs) != 3: return None output_format = [] diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 9cf7e39791f2b5..89f38724cde1a9 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -79,14 +79,8 @@ cdef extern from "period_helper.h": int64_t intraday_conversion_factor int is_end - int from_week_end - int to_week_end - - int from_a_year_end - int to_a_year_end - - int from_q_year_end - int to_q_year_end + int to_end + int from_end ctypedef int64_t (*freq_conv_func)(int64_t, asfreq_info*) nogil @@ -147,23 +141,13 @@ cdef inline int get_freq_group_index(int freq) nogil: # specifically _dont_ use cdvision or else ordinals near -1 are assigned to # incorrect dates GH#19643 @cython.cdivision(False) -cdef int64_t get_period_ordinal(int year, int month, int day, - int hour, int minute, int second, - int microseconds, int picoseconds, - int freq) nogil: +cdef int64_t get_period_ordinal(pandas_datetimestruct *dts, int freq) nogil: """ Generate an ordinal in period space Parameters ---------- - year : int - month : int - day : int - hour : int - minute : int - second : int - microseconds : int - picoseconds : int + dts: pandas_datetimestruct* freq : int Returns @@ -182,52 +166,54 @@ cdef int64_t get_period_ordinal(int year, int month, int day, fmonth = freq - FR_ANN if fmonth == 0: fmonth = 12 - if month <= fmonth: - return year - 1970 + + mdiff = dts.month - fmonth + if mdiff <= 0: + return dts.year - 1970 else: - return year - 1970 + 1 + return dts.year - 1970 + 1 elif freq_group == FR_QTR: fmonth = freq - FR_QTR if fmonth == 0: fmonth = 12 - mdiff = month - fmonth + mdiff = dts.month - fmonth # TODO: Aren't the next two conditions equivalent to # unconditional incrementing? 
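+        # Editorial aside (reviewer comment, not upstream code): a worked
+        # instance of the quarterly branch below. With freq Q-DEC
+        # (fmonth = 12), dts.year = 1970 and dts.month = 5 give
+        # mdiff = 5 - 12 = -7; the first branch bumps it to 5, the second
+        # does not fire (5 < 12), so the ordinal is
+        # 0 * 4 + (5 - 1) // 3 = 1, i.e. 1970-05 falls in the second
+        # quarter (ordinal 1) counted from Q1 1970.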
if mdiff < 0: mdiff += 12 - if month >= fmonth: + if dts.month >= fmonth: mdiff += 12 - return (year - 1970) * 4 + (mdiff - 1) // 3 + return (dts.year - 1970) * 4 + (mdiff - 1) // 3 elif freq == FR_MTH: - return (year - 1970) * 12 + month - 1 + return (dts.year - 1970) * 12 + dts.month - 1 - unix_date = unix_date_from_ymd(year, month, day) + unix_date = pandas_datetimestruct_to_datetime(PANDAS_FR_D, dts) if freq >= FR_SEC: - seconds = unix_date * 86400 + hour * 3600 + minute * 60 + second + seconds = unix_date * 86400 + dts.hour * 3600 + dts.min * 60 + dts.sec if freq == FR_MS: - return seconds * 1000 + microseconds // 1000 + return seconds * 1000 + dts.us // 1000 elif freq == FR_US: - return seconds * 1000000 + microseconds + return seconds * 1000000 + dts.us elif freq == FR_NS: return (seconds * 1000000000 + - microseconds * 1000 + picoseconds // 1000) + dts.us * 1000 + dts.ps // 1000) else: return seconds elif freq == FR_MIN: - return unix_date * 1440 + hour * 60 + minute + return unix_date * 1440 + dts.hour * 60 + dts.min elif freq == FR_HR: - return unix_date * 24 + hour + return unix_date * 24 + dts.hour elif freq == FR_DAY: return unix_date @@ -374,34 +360,6 @@ cdef double get_abs_time(int freq, int64_t unix_date, int64_t ordinal) nogil: return result -cdef int64_t unix_date_from_ymd(int year, int month, int day) nogil: - """ - Find the unix_date (days elapsed since datetime(1970, 1, 1) - for the given year/month/day. - - Parameters - ---------- - year : int - month : int - day : int - - Returns - ------- - unix_date : int - days elapsed since datetime(1970, 1, 1) - """ - cdef: - pandas_datetimestruct dts - int64_t unix_date - - memset(&dts, 0, sizeof(pandas_datetimestruct)) - dts.year = year - dts.month = month - dts.day = day - unix_date = pandas_datetimestruct_to_datetime(PANDAS_FR_D, &dts) - return unix_date - - cdef int get_yq(int64_t ordinal, int freq, int *quarter, int *year): """ Find the year and quarter of a Period with the given ordinal and frequency @@ -434,6 +392,7 @@ cdef int get_yq(int64_t ordinal, int freq, int *quarter, int *year): else: qtr_freq = FR_QTR + assert (qtr_freq % 1000) <= 12 get_asfreq_info(FR_DAY, qtr_freq, True, &af_info) quarter[0] = DtoQ_yq(unix_date, &af_info, year) @@ -447,8 +406,8 @@ cdef int DtoQ_yq(int64_t unix_date, asfreq_info *af_info, int *year): date_info_from_days_and_time(&dts, unix_date, 0) - if af_info.to_q_year_end != 12: - dts.month -= af_info.to_q_year_end + if af_info.to_end != 12: + dts.month -= af_info.to_end if dts.month <= 0: dts.month += 12 else: @@ -490,9 +449,7 @@ def dt64arr_to_periodarr(ndarray[int64_t] dtarr, int freq, tz=None): out[i] = NPY_NAT continue dt64_to_dtstruct(dtarr[i], &dts) - out[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, - dts.us, dts.ps, freq) + out[i] = get_period_ordinal(&dts, freq) else: out = localize_dt64arr_to_period(dtarr, freq, tz) return out @@ -570,18 +527,18 @@ cdef void get_asfreq_info(int from_freq, int to_freq, get_freq_group_index(max_value(to_group, FR_DAY))) if from_group == FR_WK: - af_info.from_week_end = calc_week_end(from_freq, from_group) + af_info.from_end = calc_week_end(from_freq, from_group) elif from_group == FR_ANN: - af_info.from_a_year_end = calc_a_year_end(from_freq, from_group) + af_info.from_end = calc_a_year_end(from_freq, from_group) elif from_group == FR_QTR: - af_info.from_q_year_end = calc_a_year_end(from_freq, from_group) + af_info.from_end = calc_a_year_end(from_freq, from_group) if to_group == FR_WK: - af_info.to_week_end = 
calc_week_end(to_freq, to_group) + af_info.to_end = calc_week_end(to_freq, to_group) elif to_group == FR_ANN: - af_info.to_a_year_end = calc_a_year_end(to_freq, to_group) + af_info.to_end = calc_a_year_end(to_freq, to_group) elif to_group == FR_QTR: - af_info.to_q_year_end = calc_a_year_end(to_freq, to_group) + af_info.to_end = calc_a_year_end(to_freq, to_group) @cython.cdivision @@ -635,15 +592,43 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): return result -def period_ordinal(int y, int m, int d, int h, int min, - int s, int us, int ps, int freq): - return get_period_ordinal(y, m, d, h, min, s, us, ps, freq) +cpdef int64_t period_ordinal(int y, int m, int d, int h, int min, + int s, int us, int ps, int freq): + """ + Find the ordinal representation of the given datetime components at the + frequency `freq`. + + Parameters + ---------- + y : int + m : int + d : int + h : int + min : int + s : int + us : int + ps : int + + Returns + ------- + ordinal : int64_t + """ + cdef: + pandas_datetimestruct dts + dts.year = y + dts.month = m + dts.day = d + dts.hour = h + dts.min = min + dts.sec = s + dts.us = us + dts.ps = ps + return get_period_ordinal(&dts, freq) cpdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) nogil: cdef: pandas_datetimestruct dts - float subsecond_fraction if ordinal == NPY_NAT: return NPY_NAT @@ -770,19 +755,15 @@ cdef int pyear(int64_t ordinal, int freq): @cython.cdivision cdef int pqyear(int64_t ordinal, int freq): cdef: - int year, quarter, qtr_freq - qtr_freq = get_yq(ordinal, freq, &quarter, &year) - if (qtr_freq % 1000) > 12: - year -= 1 + int year, quarter + get_yq(ordinal, freq, &quarter, &year) return year cdef int pquarter(int64_t ordinal, int freq): cdef: - int year, quarter, qtr_freq - qtr_freq = get_yq(ordinal, freq, &quarter, &year) - if (qtr_freq % 1000) > 12: - year -= 1 + int year, quarter + get_yq(ordinal, freq, &quarter, &year) return quarter @@ -968,9 +949,7 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, result[i] = NPY_NAT continue dt64_to_dtstruct(stamps[i], &dts) - result[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, - dts.us, dts.ps, freq) + result[i] = get_period_ordinal(&dts, freq) elif is_tzlocal(tz): for i in range(n): @@ -979,9 +958,7 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, continue local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) dt64_to_dtstruct(local_val, &dts) - result[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, - dts.us, dts.ps, freq) + result[i] = get_period_ordinal(&dts, freq) else: # Adjust datetime64 timestamp, recompute datetimestruct trans, deltas, typ = get_dst_info(tz) @@ -998,18 +975,14 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, result[i] = NPY_NAT continue dt64_to_dtstruct(stamps[i] + deltas[0], &dts) - result[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, - dts.us, dts.ps, freq) + result[i] = get_period_ordinal(&dts, freq) else: for i in range(n): if stamps[i] == NPY_NAT: result[i] = NPY_NAT continue dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts) - result[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, - dts.us, dts.ps, freq) + result[i] = get_period_ordinal(&dts, freq) return result @@ -1595,9 +1568,9 @@ class Period(_Period): if ordinal is None: base, mult = get_freq_code(freq) - ordinal = get_period_ordinal(dt.year, dt.month, 
dt.day, - dt.hour, dt.minute, dt.second, - dt.microsecond, 0, base) + ordinal = period_ordinal(dt.year, dt.month, dt.day, + dt.hour, dt.minute, dt.second, + dt.microsecond, 0, base) return cls._from_ordinal(ordinal, freq) @@ -1608,8 +1581,8 @@ cdef int64_t _ordinal_from_fields(year, month, quarter, day, if quarter is not None: year, month = _quarter_to_myear(year, quarter, freq) - return get_period_ordinal(year, month, day, hour, - minute, second, 0, 0, base) + return period_ordinal(year, month, day, hour, + minute, second, 0, 0, base) def _quarter_to_myear(year, quarter, freq): diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index ed77916a1d8877..9818d53e386bd6 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -24,7 +24,7 @@ cimport ccalendar from conversion import tz_localize_to_utc, date_normalize from conversion cimport (tz_convert_single, _TSObject, convert_to_tsobject, convert_datetime_to_tsobject) -from fields import get_date_field, get_start_end_field +from fields import get_start_end_field, get_date_name_field from nattype import NaT from nattype cimport NPY_NAT from np_datetime import OutOfBoundsDatetime @@ -107,6 +107,7 @@ cdef class _Timestamp(datetime): cdef readonly: int64_t value, nanosecond object freq # frequency reference + list _date_attributes def __hash__(_Timestamp self): if self.nanosecond: @@ -351,6 +352,16 @@ cdef class _Timestamp(datetime): field, freqstr, month_kw) return out[0] + cpdef _get_date_name_field(self, object field, object locale): + cdef: + int64_t val + ndarray out + + val = self._maybe_convert_value_to_local() + out = get_date_name_field(np.array([val], dtype=np.int64), + field, locale=locale) + return out[0] + @property def _repr_base(self): return '{date} {time}'.format(date=self._date_repr, @@ -425,6 +436,8 @@ class Timestamp(_Timestamp): .. versionadded:: 0.19.0 hour, minute, second, microsecond : int, optional, default 0 .. versionadded:: 0.19.0 + nanosecond : int, optional, default 0 + .. versionadded:: 0.23.0 tzinfo : datetime.tzinfo, optional, default None .. versionadded:: 0.19.0 @@ -556,7 +569,7 @@ class Timestamp(_Timestamp): object freq=None, tz=None, unit=None, year=None, month=None, day=None, hour=None, minute=None, second=None, microsecond=None, - tzinfo=None): + nanosecond=None, tzinfo=None): # The parameter list folds together legacy parameter names (the first # four) and positional and keyword parameter names from pydatetime. # @@ -580,6 +593,9 @@ class Timestamp(_Timestamp): cdef _TSObject ts + _date_attributes = [year, month, day, hour, minute, second, + microsecond, nanosecond] + if tzinfo is not None: if not PyTZInfo_Check(tzinfo): # tzinfo must be a datetime.tzinfo object, GH#17690 @@ -588,7 +604,14 @@ class Timestamp(_Timestamp): elif tz is not None: raise ValueError('Can provide at most one of tz, tzinfo') - if ts_input is _no_input: + if is_string_object(ts_input): + # User passed a date string to parse. + # Check that the user didn't also pass a date attribute kwarg. + if any(arg is not None for arg in _date_attributes): + raise ValueError('Cannot pass a date attribute keyword ' + 'argument when passing a date string') + + elif ts_input is _no_input: # User passed keyword arguments. 
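+            # Reviewer note (added comment, not part of the upstream patch):
+            # in this keyword branch the pydatetime-style arguments are
+            # folded into a datetime() and the new `nanosecond` kwarg is
+            # forwarded separately below, so e.g.
+            #   Timestamp(year=2018, month=1, day=1, nanosecond=500)
+            # preserves the sub-microsecond component.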
if tz is None: # Handle the case where the user passes `tz` and not `tzinfo` @@ -596,20 +619,20 @@ class Timestamp(_Timestamp): return Timestamp(datetime(year, month, day, hour or 0, minute or 0, second or 0, microsecond or 0, tzinfo), - tz=tz) + nanosecond=nanosecond, tz=tz) elif is_integer_object(freq): # User passed positional arguments: # Timestamp(year, month, day[, hour[, minute[, second[, - # microsecond[, tzinfo]]]]]) + # microsecond[, nanosecond[, tzinfo]]]]]]) return Timestamp(datetime(ts_input, freq, tz, unit or 0, year or 0, month or 0, day or 0, - hour), tz=hour) + minute), nanosecond=hour, tz=minute) if tzinfo is not None: # User passed tzinfo instead of tz; avoid silently ignoring tz, tzinfo = tzinfo, None - ts = convert_to_tsobject(ts_input, tz, unit, 0, 0) + ts = convert_to_tsobject(ts_input, tz, unit, 0, 0, nanosecond or 0) if ts.value == NPY_NAT: return NaT @@ -701,12 +724,50 @@ class Timestamp(_Timestamp): def dayofweek(self): return self.weekday() + def day_name(self, locale=None): + """ + Return the day name of the Timestamp with specified locale. + + Parameters + ---------- + locale : string, default None (English locale) + locale determining the language in which to return the day name + + Returns + ------- + day_name : string + + .. versionadded:: 0.23.0 + """ + return self._get_date_name_field('day_name', locale) + + def month_name(self, locale=None): + """ + Return the month name of the Timestamp with specified locale. + + Parameters + ---------- + locale : string, default None (English locale) + locale determining the language in which to return the month name + + Returns + ------- + month_name : string + + .. versionadded:: 0.23.0 + """ + return self._get_date_name_field('month_name', locale) + @property def weekday_name(self): - cdef dict wdays = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', - 3: 'Thursday', 4: 'Friday', 5: 'Saturday', - 6: 'Sunday'} - return wdays[self.weekday()] + """ + .. deprecated:: 0.23.0 + Use ``Timestamp.day_name()`` instead + """ + warnings.warn("`weekday_name` is deprecated and will be removed in a " + "future version. 
Use `day_name` instead", + FutureWarning) + return self.day_name() @property def dayofyear(self): diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index f651fbbf563165..c1a9a9fc1ed131 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -38,9 +38,9 @@ def load_reduce(self): # try to re-encode the arguments if getattr(self, 'encoding', None) is not None: - args = tuple([arg.encode(self.encoding) - if isinstance(arg, string_types) - else arg for arg in args]) + args = tuple(arg.encode(self.encoding) + if isinstance(arg, string_types) + else arg for arg in args) try: stack[-1] = func(*args) return @@ -113,6 +113,12 @@ def load_reduce(self): # 19269, arrays moving ('pandas.core.categorical', 'Categorical'): ('pandas.core.arrays', 'Categorical'), + + # 19939, add timedeltaindex, float64index compat from 15998 move + ('pandas.tseries.tdi', 'TimedeltaIndex'): + ('pandas.core.indexes.timedeltas', 'TimedeltaIndex'), + ('pandas.indexes.numeric', 'Float64Index'): + ('pandas.core.indexes.numeric', 'Float64Index'), } diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 96bf628c8d7ff1..06c4068f86bfe3 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -191,9 +191,9 @@ def __init__(self, pandas_obj): @property def center(self): - # return the geographic center point of this DataFarme - lon = self._obj.latitude - lat = self._obj.longitude + # return the geographic center point of this DataFrame + lat = self._obj.latitude + lon = self._obj.longitude return (float(lon.mean()), float(lat.mean())) def plot(self): diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 9056f78ee02edb..8fb74e2e871743 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -191,7 +191,7 @@ def apply_broadcast(self, target): for i, col in enumerate(target.columns): res = self.f(target[col]) - ares = np. asarray(res).ndim + ares = np.asarray(res).ndim # must be a scalar or 1d if ares > 1: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index cec881394a021d..37074b563efbd0 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -18,6 +18,7 @@ class ExtensionArray(object): The interface includes the following abstract methods that must be implemented by subclasses: + * _constructor_from_sequence * __getitem__ * __len__ * dtype @@ -56,6 +57,25 @@ class ExtensionArray(object): # '_typ' is for pandas.core.dtypes.generic.ABCExtensionArray. # Don't override this. _typ = 'extension' + + # ------------------------------------------------------------------------ + # Constructors + # ------------------------------------------------------------------------ + @classmethod + def _constructor_from_sequence(cls, scalars): + """Construct a new ExtensionArray from a sequence of scalars. + + Parameters + ---------- + scalars : Sequence + Each element will be an instance of the scalar type for this + array, ``cls.dtype.type``. 
+ Returns + ------- + ExtensionArray + """ + raise AbstractMethodError(cls) + # ------------------------------------------------------------------------ # Must be a Sequence # ------------------------------------------------------------------------ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c6eeabf0148d0f..e23dc3b3e5b89f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -364,6 +364,10 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, self._dtype = self._dtype.update_dtype(dtype) self._codes = coerce_indexer_dtype(codes, dtype.categories) + @classmethod + def _constructor_from_sequence(cls, scalars): + return cls(scalars) + @property def categories(self): """The categories of this categorical. diff --git a/pandas/core/base.py b/pandas/core/base.py index 280b8849792e37..fd039480fc6f17 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1184,24 +1184,6 @@ def searchsorted(self, value, side='left', sorter=None): # needs coercion on the key (DatetimeIndex does already) return self.values.searchsorted(value, side=side, sorter=sorter) - _shared_docs['drop_duplicates'] = ( - """Return %(klass)s with duplicate values removed - - Parameters - ---------- - - keep : {'first', 'last', False}, default 'first' - - ``first`` : Drop duplicates except for the first occurrence. - - ``last`` : Drop duplicates except for the last occurrence. - - False : Drop all duplicates. - %(inplace)s - - Returns - ------- - deduplicated : %(klass)s - """) - - @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs) def drop_duplicates(self, keep='first', inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') if isinstance(self, ABCIndexClass): diff --git a/pandas/core/common.py b/pandas/core/common.py index c4fbcf28cbcae9..c4890dbd39ef1b 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -11,7 +11,7 @@ from pandas._libs import lib, tslib from pandas import compat -from pandas.compat import long, zip, iteritems +from pandas.compat import long, zip, iteritems, PY36, OrderedDict from pandas.core.config import get_option from pandas.core.dtypes.generic import ABCSeries, ABCIndex from pandas.core.dtypes.common import _NS_DTYPE @@ -186,6 +186,16 @@ def _try_sort(iterable): return listed +def _dict_keys_to_ordered_list(mapping): + # when pandas drops support for Python < 3.6, this function + # can be replaced by a simple list(mapping.keys()) + if PY36 or isinstance(mapping, OrderedDict): + keys = list(mapping.keys()) + else: + keys = _try_sort(mapping) + return keys + + def iterpairs(seq): """ Parameters diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index da42cdbf102333..0edbf892172a99 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -207,6 +207,12 @@ def use_numexpr_cb(key): (currently both are identical) """ +pc_html_use_mathjax_doc = """\ +: boolean + When True, Jupyter notebook will process table contents using MathJax, + rendering mathematical expressions enclosed by the dollar symbol. 
+ (default: True) +""" pc_width_doc = """ : int @@ -358,6 +364,8 @@ def table_schema_cb(key): validator=is_bool, cb=table_schema_cb) cf.register_option('html.border', 1, pc_html_border_doc, validator=is_int) + cf.register_option('html.use_mathjax', True, pc_html_use_mathjax_doc, + validator=is_bool) with cf.config_prefix('html'): cf.register_option('border', 1, pc_html_border_doc, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ae8fb48a61fceb..d430d442fae0f3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -116,8 +116,8 @@ - if `axis` is 1 or `'columns'` then `by` may contain column levels and/or index labels - .. versionchanged:: 0.23.0 - Allow specifying index or column level names.""", + .. versionchanged:: 0.23.0 + Allow specifying index or column level names.""", versionadded_to_excel='', optional_labels="""labels : array-like, optional New labels / index to conform the axis specified by 'axis' to.""", @@ -252,6 +252,11 @@ class DataFrame(NDFrame): ---------- data : numpy ndarray (structured or homogeneous), dict, or DataFrame Dict can contain Series, arrays, constants, or list-like objects + + .. versionchanged :: 0.23.0 + If data is a dict, argument order is maintained for Python 3.6 + and later. + index : Index or array-like Index to use for resulting frame. Will default to RangeIndex if no indexing information part of input data and no index provided @@ -460,9 +465,7 @@ def _init_dict(self, data, index, columns, dtype=None): arrays.append(v) else: - keys = list(data.keys()) - if not isinstance(data, OrderedDict): - keys = com._try_sort(keys) + keys = com._dict_keys_to_ordered_list(data) columns = data_names = Index(keys) arrays = [data[k] for k in keys] @@ -880,27 +883,66 @@ def dot(self, other): @classmethod def from_dict(cls, data, orient='columns', dtype=None, columns=None): """ - Construct DataFrame from dict of array-like or dicts + Construct DataFrame from dict of array-like or dicts. + + Creates DataFrame object from dictionary by columns or by index + allowing dtype specification. Parameters ---------- data : dict - {field : array-like} or {field : dict} + Of the form {field : array-like} or {field : dict}. orient : {'columns', 'index'}, default 'columns' The "orientation" of the data. If the keys of the passed dict should be the columns of the resulting DataFrame, pass 'columns' (default). Otherwise if the keys should be rows, pass 'index'. dtype : dtype, default None - Data type to force, otherwise infer - columns: list, default None - Column labels to use when orient='index'. Raises a ValueError - if used with orient='columns' + Data type to force, otherwise infer. + columns : list, default None + Column labels to use when ``orient='index'``. Raises a ValueError + if used with ``orient='columns'``. .. 
versionadded:: 0.23.0

         Returns
         -------
-        DataFrame
+        pandas.DataFrame
+
+        See Also
+        --------
+        DataFrame.from_records : DataFrame from ndarray (structured
+            dtype), list of tuples, dict, or DataFrame
+        DataFrame : DataFrame object creation using constructor
+
+        Examples
+        --------
+        By default the keys of the dict become the DataFrame columns:
+
+        >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
+        >>> pd.DataFrame.from_dict(data)
+           col_1 col_2
+        0      3     a
+        1      2     b
+        2      1     c
+        3      0     d
+
+        Specify ``orient='index'`` to create the DataFrame using dictionary
+        keys as rows:
+
+        >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
+        >>> pd.DataFrame.from_dict(data, orient='index')
+               0  1  2  3
+        row_1  3  2  1  0
+        row_2  a  b  c  d
+
+        When using the 'index' orientation, the column names can be
+        specified manually:
+
+        >>> pd.DataFrame.from_dict(data, orient='index',
+        ...                        columns=['A', 'B', 'C', 'D'])
+               A  B  C  D
+        row_1  3  2  1  0
+        row_2  a  b  c  d
         """
         index = None
         orient = orient.lower()
@@ -1206,20 +1248,68 @@ def from_records(cls, data, index=None, exclude=None, columns=None,

     def to_records(self, index=True, convert_datetime64=True):
         """
-        Convert DataFrame to record array. Index will be put in the
-        'index' field of the record array if requested
+        Convert DataFrame to a NumPy record array.
+
+        Index will be put in the 'index' field of the record array if
+        requested.

         Parameters
         ----------
         index : boolean, default True
-            Include index in resulting record array, stored in 'index' field
+            Include index in resulting record array, stored in 'index'
+            field.
         convert_datetime64 : boolean, default True
             Whether to convert the index to datetime.datetime if it is a
-            DatetimeIndex
+            DatetimeIndex.

         Returns
         -------
-        y : recarray
+        y : numpy.recarray
+
+        See Also
+        --------
+        DataFrame.from_records: convert structured or record ndarray
+            to DataFrame.
+        numpy.recarray: ndarray that allows field access using
+            attributes, analogous to typed columns in a
+            spreadsheet.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
+        ...                   index=['a', 'b'])
+        >>> df
+           A     B
+        a  1  0.50
+        b  2  0.75
+        >>> df.to_records()
+        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+                  dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
+
+        The index can be excluded from the record array:
+
+        >>> df.to_records(index=False)
+        rec.array([(1, 0.5 ), (2, 0.75)],
+                  dtype=[('A', '<i8'), ('B', '<f8')])
+
+        By default, timestamps are converted to `datetime.datetime`:
+
+        >>> df.index = pd.date_range('2018-01-01 09:00', periods=2, freq='min')
+        >>> df
+                             A     B
+        2018-01-01 09:00:00  1  0.50
+        2018-01-01 09:01:00  2  0.75
+        >>> df.to_records()
+        rec.array([(datetime.datetime(2018, 1, 1, 9, 0), 1, 0.5 ),
+                   (datetime.datetime(2018, 1, 1, 9, 1), 2, 0.75)],
+                  dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
+
+        The timestamp conversion can be disabled:
+
+        >>> df.to_records(convert_datetime64=False)
+        rec.array([('2018-01-01T09:00:00.000000000', 1, 0.5 ),
+                   ('2018-01-01T09:01:00.000000000', 2, 0.75)],
+                  dtype=[('index', '<M8[ns]'), ('A', '<i8'), ('B', '<f8')])
+        """

+        >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
+        ...                    'b': [1, 1, 2, 3, 5, 8],
+        ...                    'c': [1, 4, 9, 16, 25, 36]})
+        >>> df
+           a  b   c
+        0  1  1   1
+        1  2  1   4
+        2  3  2   9
+        3  4  3  16
+        4  5  5  25
+        5  6  8  36
+
+        >>> df.diff()
+             a    b     c
+        0  NaN  NaN   NaN
+        1  1.0  0.0   3.0
+        2  1.0  1.0   5.0
+        3  1.0  1.0   7.0
+        4  1.0  2.0   9.0
+        5  1.0  3.0  11.0
+
+        Difference with previous column
+
+        >>> df.diff(axis=1)
+            a    b     c
+        0 NaN  0.0   0.0
+        1 NaN -1.0   3.0
+        2 NaN -1.0   7.0
+        3 NaN -1.0  13.0
+        4 NaN  0.0  20.0
+        5 NaN  2.0  28.0
+
+        Difference with 3rd previous row
+
+        >>> df.diff(periods=3)
+             a    b     c
+        0  NaN  NaN   NaN
+        1  NaN  NaN   NaN
+        2  NaN  NaN   NaN
+        3  3.0  2.0  15.0
+        4  3.0  4.0  21.0
+        5  3.0  6.0  27.0
+
+        Difference with following row
+
+        >>> df.diff(periods=-1)
+             a    b     c
+        0 -1.0  0.0  -3.0
+        1 -1.0 -1.0  -5.0
+        2 -1.0 -1.0  -7.0
+        3 -1.0 -2.0  -9.0
+        4 -1.0 -3.0 -11.0
+        5  NaN  NaN   NaN
         """
         bm_axis = self._get_block_manager_axis(axis)
         new_data = self._data.diff(n=periods, axis=bm_axis)
@@ -5498,7 +5658,22 @@ def corr(self, method='pearson', min_periods=1):

     def cov(self, min_periods=None):
         """
-        Compute pairwise covariance of columns, excluding NA/null values
+        Compute pairwise covariance of columns, excluding NA/null values.
+
+        Compute the pairwise covariance among the series of a DataFrame.
+        The returned data frame is the `covariance matrix
+        <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
+        of the DataFrame.
+
+        Both NA and null values are automatically excluded from the
+        calculation. (See the note below about bias from missing values.)
+        A threshold can be set for the minimum number of
+        observations for each value created. Comparisons with observations
+        below this threshold will be returned as ``NaN``.
+
+        This method is generally used for the analysis of time series data to
+        understand the relationship between different measures
+        across time.

         Parameters
         ----------
@@ -5508,12 +5683,71 @@ def cov(self, min_periods=None):

         Returns
         -------
-        y : DataFrame
+        DataFrame
+            The covariance matrix of the series of the DataFrame.
+
+        See Also
+        --------
+        pandas.Series.cov : compute covariance with another Series
+        pandas.core.window.EWM.cov: exponential weighted sample covariance
+        pandas.core.window.Expanding.cov : expanding sample covariance
+        pandas.core.window.Rolling.cov : rolling sample covariance

         Notes
         -----
-        `y` contains the covariance matrix of the DataFrame's time series.
-        The covariance is normalized by N-1 (unbiased estimator).
+        Returns the covariance matrix of the DataFrame's time series.
+        The covariance is normalized by N-1.
+
+        For DataFrames that have Series that are missing data (assuming that
+        data is `missing at random
+        <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
+        the returned covariance matrix will be an unbiased estimate
+        of the variance and covariance between the member Series.
+
+        However, for many applications this estimate may not be acceptable
+        because the estimated covariance matrix is not guaranteed to be
+        positive semi-definite. This could lead to estimated correlations
+        having absolute values which are greater than one, and/or a
+        non-invertible covariance matrix. See `Estimation of covariance
+        matrices
+        <https://en.wikipedia.org/wiki/Estimation_of_covariance_matrices>`__
+        for more details.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
+        ...                   columns=['dogs', 'cats'])
+        >>> df.cov()
+              dogs      cats
+        dogs  0.666667 -1.000000
+        cats -1.000000  1.666667
+
+        >>> np.random.seed(42)
+        >>> df = pd.DataFrame(np.random.randn(1000, 5),
+        ...
columns=['a', 'b', 'c', 'd', 'e']) + >>> df.cov() + a b c d e + a 0.998438 -0.020161 0.059277 -0.008943 0.014144 + b -0.020161 1.059352 -0.008543 -0.024738 0.009826 + c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 + d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 + e 0.014144 0.009826 -0.000271 -0.013692 0.977795 + + **Minimum number of periods** + + This method also supports an optional ``min_periods`` keyword + that specifies the required minimum number of non-NA observations for + each column pair in order to have a valid result: + + >>> np.random.seed(42) + >>> df = pd.DataFrame(np.random.randn(20, 3), + ... columns=['a', 'b', 'c']) + >>> df.loc[df.index[:5], 'a'] = np.nan + >>> df.loc[df.index[5:10], 'b'] = np.nan + >>> df.cov(min_periods=12) + a b c + a 0.316741 NaN -0.150812 + b NaN 1.248003 0.191417 + c -0.150812 0.191417 0.895202 """ numeric_df = self._get_numeric_data() cols = numeric_df.columns diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e1ed6ae9c8a6c5..fc8aaa23d2f790 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -18,6 +18,7 @@ is_number, is_integer, is_bool, is_bool_dtype, + is_categorical_dtype, is_numeric_dtype, is_datetime64_dtype, is_timedelta64_dtype, @@ -211,9 +212,8 @@ def _dir_additions(self): """ add the string-like attributes from the info_axis. If info_axis is a MultiIndex, it's first level values are used. """ - additions = set( - [c for c in self._info_axis.unique(level=0)[:100] - if isinstance(c, string_types) and isidentifier(c)]) + additions = {c for c in self._info_axis.unique(level=0)[:100] + if isinstance(c, string_types) and isidentifier(c)} return super(NDFrame, self)._dir_additions().union(additions) @property @@ -3605,7 +3605,11 @@ def f(x): def head(self, n=5): """ - Return the first n rows. + Return the first `n` rows. + + This function returns the first `n` rows for the object based + on position. It is useful for quickly testing if your object + has the right type of data in it. Parameters ---------- @@ -3615,11 +3619,11 @@ def head(self, n=5): Returns ------- obj_head : type of caller - The first n rows of the caller object. + The first `n` rows of the caller object. See Also -------- - pandas.DataFrame.tail + pandas.DataFrame.tail: Returns the last `n` rows. Examples -------- @@ -3647,7 +3651,7 @@ def head(self, n=5): 3 lion 4 monkey - Viewing the first n lines (three in this case) + Viewing the first `n` lines (three in this case) >>> df.head(3) animal @@ -3660,7 +3664,11 @@ def head(self, n=5): def tail(self, n=5): """ - Return the last n rows. + Return the last `n` rows. + + This function returns last `n` rows from the object based on + position. It is useful for quickly verifying data, for example, + after sorting or appending rows. Parameters ---------- @@ -3669,12 +3677,12 @@ def tail(self, n=5): Returns ------- - obj_tail : type of caller - The last n rows of the caller object. + type of caller + The last `n` rows of the caller object. See Also -------- - pandas.DataFrame.head + pandas.DataFrame.head : The first `n` rows of the caller object. Examples -------- @@ -3702,7 +3710,7 @@ def tail(self, n=5): 7 whale 8 zebra - Viewing the last n lines (three in this case) + Viewing the last `n` lines (three in this case) >>> df.tail(3) animal @@ -4232,7 +4240,55 @@ def as_matrix(self, columns=None): @property def values(self): - """Numpy representation of NDFrame + """ + Return a Numpy representation of the DataFrame. 
+
+        Only the values in the DataFrame will be returned, the axes labels
+        will be removed.
+
+        Returns
+        -------
+        numpy.ndarray
+            The values of the DataFrame.
+
+        Examples
+        --------
+        A DataFrame where all columns are the same type (e.g., int64) results
+        in an array of the same type.
+
+        >>> df = pd.DataFrame({'age':    [ 3,  29],
+        ...                    'height': [94, 170],
+        ...                    'weight': [31, 115]})
+        >>> df
+           age  height  weight
+        0    3      94      31
+        1   29     170     115
+        >>> df.dtypes
+        age       int64
+        height    int64
+        weight    int64
+        dtype: object
+        >>> df.values
+        array([[  3,  94,  31],
+               [ 29, 170, 115]], dtype=int64)
+
+        A DataFrame with mixed type columns (e.g., str/object, int64,
+        float32) results in an ndarray of the broadest type that
+        accommodates these mixed types (e.g., object).
+
+        >>> df2 = pd.DataFrame([('parrot',   24.0, 'second'),
+        ...                     ('lion',     80.5, 1),
+        ...                     ('monkey', np.nan, None)],
+        ...                    columns=('name', 'max_speed', 'rank'))
+        >>> df2.dtypes
+        name          object
+        max_speed    float64
+        rank          object
+        dtype: object
+        >>> df2.values
+        array([['parrot', 24.0, 'second'],
+               ['lion', 80.5, 1],
+               ['monkey', nan, None]], dtype=object)

         Notes
         -----
@@ -4243,8 +4299,13 @@ def values(self):
         e.g. If the dtypes are float16 and float32, dtype will be upcast to
         float32.  If dtypes are int32 and uint8, dtype will be upcast to
-        int32. By numpy.find_common_type convention, mixing int64 and uint64
-        will result in a flot64 dtype.
+        int32. By :func:`numpy.find_common_type` convention, mixing int64
+        and uint64 will result in a float64 dtype.
+
+        See Also
+        --------
+        pandas.DataFrame.index : Retrieve the index labels
+        pandas.DataFrame.columns : Retrieve the column names
         """
         self._consolidate_inplace()
         return self._data.as_array(transpose=self._AXIS_REVERSED)
@@ -4429,19 +4490,27 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):
                 if col_name not in self:
                     raise KeyError('Only a column name can be used for the '
                                    'key in a dtype mappings argument.')
-            from pandas import concat
             results = []
             for col_name, col in self.iteritems():
                 if col_name in dtype:
                     results.append(col.astype(dtype[col_name], copy=copy))
                 else:
                     results.append(results.append(col.copy() if copy else col))
-            return concat(results, axis=1, copy=False)

-        # else, only a single dtype is given
-        new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-                                     **kwargs)
-        return self._constructor(new_data).__finalize__(self)
+        elif is_categorical_dtype(dtype) and self.ndim > 1:
+            # GH 18099: columnwise conversion to categorical
+            results = (self[col].astype(dtype, copy=copy) for col in self)
+
+        else:
+            # else, only a single dtype is given
+            new_data = self._data.astype(dtype=dtype, copy=copy,
+                                         errors=errors, **kwargs)
+            return self._constructor(new_data).__finalize__(self)
+
+        # GH 19920: retain column metadata after concat
+        result = pd.concat(results, axis=1, copy=False)
+        result.columns = self.columns
+        return result

     def copy(self, deep=True):
         """
@@ -4706,7 +4775,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False,
         if axis is None:
             axis = 0
         axis = self._get_axis_number(axis)
-        method = missing.clean_fill_method(method)
+
         from pandas import DataFrame

         if value is None:
@@ -4727,7 +4796,6 @@ def fillna(self, value=None, method=None, axis=None, inplace=False,

         # 3d
         elif self.ndim == 3:
-
             # fill in 2d chunks
             result = {col: s.fillna(method=method, value=value)
                       for col, s in self.iteritems()}
@@ -4737,7 +4805,6 @@ def fillna(self, value=None, method=None, axis=None, inplace=False,

         else:
             # 2d or less
-            method = missing.clean_fill_method(method)
             new_data = self._data.interpolate(method=method, axis=axis,
                                               limit=limit, inplace=inplace,
                                               coerce=True,
                                               downcast=downcast)
@@ -4880,7 +4947,10 @@ def bfill(self, axis=None, inplace=False, limit=None, downcast=None):
             ``to_replace`` must be ``None``.
         method : string, optional, {'pad', 'ffill', 'bfill'}
-            The method to use when for replacement, when ``to_replace`` is a
-            ``list``.
+            The method to use for replacement when ``to_replace`` is a
+            scalar, list or tuple and ``value`` is None.
+
+            .. versionchanged:: 0.23.0
+                Added to DataFrame

         See Also
         --------
@@ -5049,6 +5119,10 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
                     to_replace = [to_replace]

                 if isinstance(to_replace, (tuple, list)):
+                    if isinstance(self, pd.DataFrame):
+                        return self.apply(_single_replace,
+                                          args=(to_replace, method, inplace,
+                                                limit))
                     return _single_replace(self, to_replace, method, inplace,
                                            limit)

@@ -7510,11 +7584,10 @@ def _add_numeric_operations(cls):
         cls.any = _make_logical_function(
             cls, 'any', name, name2, axis_descr,
             'Return whether any element is True over requested axis',
-            nanops.nanany)
+            nanops.nanany, '', '')
         cls.all = _make_logical_function(
-            cls, 'all', name, name2, axis_descr,
-            'Return whether all elements are True over requested axis',
-            nanops.nanall)
+            cls, 'all', name, name2, axis_descr, _all_doc,
+            nanops.nanall, _all_examples, _all_see_also)

     @Substitution(outname='mad',
                   desc="Return the mean absolute deviation of the values "
@@ -7760,7 +7833,8 @@ def _doc_parms(cls):
     If the axis is a MultiIndex (hierarchical), count along a
     particular level, collapsing into a %(name1)s
 ddof : int, default 1
-    degrees of freedom
+    Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+    where N represents the number of elements.
 numeric_only : boolean, default None
     Include only float, int, boolean columns. If None, will attempt to use
     everything, then use only numeric data. Not implemented for Series.
@@ -7770,7 +7844,6 @@ def _doc_parms(cls):
 %(outname)s : %(name1)s or %(name2)s (if level specified)\n"""

 _bool_doc = """
-
 %(desc)s

 Parameters
@@ -7778,17 +7851,71 @@ def _doc_parms(cls):
 axis : %(axis_descr)s
 skipna : boolean, default True
     Exclude NA/null values. If an entire row/column is NA, the result
-    will be NA
+    will be NA.
 level : int or level name, default None
     If the axis is a MultiIndex (hierarchical), count along a
-    particular level, collapsing into a %(name1)s
+    particular level, collapsing into a %(name1)s.
 bool_only : boolean, default None
     Include only boolean columns. If None, will attempt to use everything,
     then use only boolean data. Not implemented for Series.
+**kwargs : any, default None
+    Additional keywords have no effect but might be accepted for
+    compatibility with numpy.

 Returns
 -------
-%(outname)s : %(name1)s or %(name2)s (if level specified)\n"""
+%(outname)s : %(name1)s or %(name2)s (if level specified)
+
+%(examples)s
+%(see_also)s"""
+
+_all_doc = """\
+Return whether all elements are True over series or dataframe axis.
+
+Returns True if all elements within a series or along a dataframe
+axis are non-zero, not-empty or not-False."""
+
+_all_examples = """\
+Examples
+--------
+Series
+
+>>> pd.Series([True, True]).all()
+True
+>>> pd.Series([True, False]).all()
+False
+
+DataFrames
+
+Create a DataFrame from a dictionary.
+
+>>> df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]})
+>>> df
+   col1   col2
+0  True   True
+1  True  False
+
+Default behaviour checks if column-wise values all return True.
+ +>>> df.all() +col1 True +col2 False +dtype: bool + +Adding axis=1 argument will check if row-wise values all return True. + +>>> df.all(axis=1) +0 True +1 False +dtype: bool +""" + +_all_see_also = """\ +See also +-------- +pandas.Series.all : Return True if all elements are True +pandas.DataFrame.any : Return True if one (or more) elements are True +""" _cnum_doc = """ @@ -7971,9 +8098,10 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): return set_function_name(cum_func, name, cls) -def _make_logical_function(cls, name, name1, name2, axis_descr, desc, f): +def _make_logical_function(cls, name, name1, name2, axis_descr, desc, f, + examples, see_also): @Substitution(outname=name, desc=desc, name1=name1, name2=name2, - axis_descr=axis_descr) + axis_descr=axis_descr, examples=examples, see_also=see_also) @Appender(_bool_doc) def logical_func(self, axis=None, bool_only=None, skipna=None, level=None, **kwargs): diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 00643614e8803f..285c5786b532b4 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1219,6 +1219,53 @@ class GroupBy(_GroupBy): """ _apply_whitelist = _common_apply_whitelist + def _bool_agg(self, val_test, skipna): + """Shared func to call any / all Cython GroupBy implementations""" + + def objs_to_bool(vals): + try: + vals = vals.astype(np.bool) + except ValueError: # for objects + vals = np.array([bool(x) for x in vals]) + + return vals.view(np.uint8) + + def result_to_bool(result): + return result.astype(np.bool, copy=False) + + return self._get_cythonized_result('group_any_all', self.grouper, + aggregate=True, + cython_dtype=np.uint8, + needs_values=True, + needs_mask=True, + pre_processing=objs_to_bool, + post_processing=result_to_bool, + val_test=val_test, skipna=skipna) + + @Substitution(name='groupby') + @Appender(_doc_template) + def any(self, skipna=True): + """Returns True if any value in the group is truthful, else False + + Parameters + ---------- + skipna : bool, default True + Flag to ignore nan values during truth testing + """ + return self._bool_agg('any', skipna) + + @Substitution(name='groupby') + @Appender(_doc_template) + def all(self, skipna=True): + """Returns True if all values in the group are truthful, else False + + Parameters + ---------- + skipna : bool, default True + Flag to ignore nan values during truth testing + """ + return self._bool_agg('all', skipna) + @Substitution(name='groupby') @Appender(_doc_template) def count(self): @@ -1485,6 +1532,8 @@ def _fill(self, direction, limit=None): return self._get_cythonized_result('group_fillna_indexer', self.grouper, needs_mask=True, + cython_dtype=np.int64, + result_is_index=True, direction=direction, limit=limit) @Substitution(name='groupby') @@ -1839,7 +1888,8 @@ def rank(self, method='average', ascending=True, na_option='keep', @Appender(_doc_template) def cumprod(self, axis=0, *args, **kwargs): """Cumulative product for each group""" - nv.validate_groupby_func('cumprod', args, kwargs, ['numeric_only']) + nv.validate_groupby_func('cumprod', args, kwargs, + ['numeric_only', 'skipna']) if axis != 0: return self.apply(lambda x: x.cumprod(axis=axis, **kwargs)) @@ -1849,7 +1899,8 @@ def cumprod(self, axis=0, *args, **kwargs): @Appender(_doc_template) def cumsum(self, axis=0, *args, **kwargs): """Cumulative sum for each group""" - nv.validate_groupby_func('cumsum', args, kwargs, ['numeric_only']) + nv.validate_groupby_func('cumsum', args, kwargs, + ['numeric_only', 'skipna']) if axis != 0: return 
self.apply(lambda x: x.cumsum(axis=axis, **kwargs))
@@ -1873,18 +1924,40 @@ def cummax(self, axis=0, **kwargs):

         return self._cython_transform('cummax', numeric_only=False)

-    def _get_cythonized_result(self, how, grouper, needs_mask=False,
-                               needs_ngroups=False, **kwargs):
+    def _get_cythonized_result(self, how, grouper, aggregate=False,
+                               cython_dtype=None, needs_values=False,
+                               needs_mask=False, needs_ngroups=False,
+                               result_is_index=False,
+                               pre_processing=None, post_processing=None,
+                               **kwargs):
         """Get result for Cythonized functions

         Parameters
         ----------
         how : str, Cythonized function name to be called
         grouper : Grouper object containing pertinent group info
+        aggregate : bool, default False
+            Whether the result should be aggregated to match the number of
+            groups
+        cython_dtype : default None
+            Type of the array that will be modified by the Cython call. If
+            `None`, the type will be inferred from the values of each slice
+        needs_values : bool, default False
+            Whether the values should be a part of the Cython call
+            signature
         needs_mask : bool, default False
-            Whether boolean mask needs to be part of the Cython call signature
+            Whether boolean mask needs to be part of the Cython call
+            signature
         needs_ngroups : bool, default False
-            Whether number of groups part of the Cython call signature
+            Whether number of groups is part of the Cython call signature
+        result_is_index : bool, default False
+            Whether the result of the Cython operation is an index of
+            values to be retrieved, instead of the actual values themselves
+        pre_processing : function, default None
+            Function to be applied to `values` prior to passing to Cython.
+            Raises if `needs_values` is False
+        post_processing : function, default None
+            Function to be applied to result of Cython function
         **kwargs : dict
             Extra arguments to be passed back to Cython funcs

@@ -1892,14 +1965,40 @@ def _get_cythonized_result(self, how, grouper, needs_mask=False,
         -------
         `Series` or `DataFrame` with filled values
         """
+        if result_is_index and aggregate:
+            raise ValueError("'result_is_index' and 'aggregate' cannot both "
+                             "be True!")
+        if post_processing:
+            if not callable(post_processing):
+                raise ValueError("'post_processing' must be a callable!")
+        if pre_processing:
+            if not callable(pre_processing):
+                raise ValueError("'pre_processing' must be a callable!")
+            if not needs_values:
+                raise ValueError("Cannot use 'pre_processing' without "
+                                 "specifying 'needs_values'!")

         labels, _, ngroups = grouper.group_info
         output = collections.OrderedDict()
         base_func = getattr(libgroupby, how)

         for name, obj in self._iterate_slices():
-            indexer = np.zeros_like(labels, dtype=np.int64)
-            func = partial(base_func, indexer, labels)
+            if aggregate:
+                result_sz = ngroups
+            else:
+                result_sz = len(obj.values)
+
+            if not cython_dtype:
+                cython_dtype = obj.values.dtype
+
+            result = np.zeros(result_sz, dtype=cython_dtype)
+            func = partial(base_func, result, labels)
+            if needs_values:
+                vals = obj.values
+                if pre_processing:
+                    vals = pre_processing(vals)
+                func = partial(func, vals)
+
             if needs_mask:
                 mask = isnull(obj.values).view(np.uint8)
                 func = partial(func, mask)
@@ -1908,9 +2007,19 @@ def _get_cythonized_result(self, how, grouper, needs_mask=False,
                 func = partial(func, ngroups)

             func(**kwargs)  # Call func to modify indexer values in place

-            output[name] = algorithms.take_nd(obj.values, indexer)
-
-        return self._wrap_transformed_output(output)
+            if result_is_index:
+                result = algorithms.take_nd(obj.values, result)
+
+            if post_processing:
+                result = post_processing(result)
+
+            output[name] = result
+
+        if aggregate:
+            return self._wrap_aggregated_output(output)
+        else:
+            return self._wrap_transformed_output(output)

     @Substitution(name='groupby')
     @Appender(_doc_template)
@@ -1930,9 +2039,28 @@ def shift(self, periods=1, freq=None, axis=0):
             return self.apply(lambda x: x.shift(periods, freq, axis))

         return self._get_cythonized_result('group_shift_indexer',
-                                           self.grouper, needs_ngroups=True,
+                                           self.grouper,
+                                           cython_dtype=np.int64,
+                                           needs_ngroups=True,
+                                           result_is_index=True,
                                            periods=periods)

+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
+                   axis=0):
+        """Calculate pct_change of each value to previous entry in group"""
+        if freq is not None or axis != 0:
+            return self.apply(lambda x: x.pct_change(periods=periods,
+                                                     fill_method=fill_method,
+                                                     limit=limit, freq=freq,
+                                                     axis=axis))
+
+        filled = getattr(self, fill_method)(limit=limit).drop(
+            self.grouper.names, axis=1)
+        shifted = filled.shift(periods=periods, freq=freq)
+
+        return (filled / shifted) - 1
+
     @Substitution(name='groupby')
     @Appender(_doc_template)
     def head(self, n=5):
@@ -3773,6 +3901,13 @@ def _apply_to_column_groupbys(self, func):
         """ return a pass thru """
         return func(self)

+    def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None):
+        """Calculate percent change of each value to previous entry in group"""
+        filled = getattr(self, fill_method)(limit=limit)
+        shifted = filled.shift(periods=periods, freq=freq)
+
+        return (filled / shifted) - 1
+

 class NDFrameGroupBy(GroupBy):

@@ -4646,7 +4781,7 @@ def _apply_to_column_groupbys(self, func):
                       keys=self._selected_obj.columns, axis=1)

     def _fill(self, direction, limit=None):
-        """Overriden method to join grouped columns in output"""
+        """Overridden method to join grouped columns in output"""
         res = super(DataFrameGroupBy, self)._fill(direction, limit=limit)
         output = collections.OrderedDict(
             (grp.name, grp.grouper) for grp in self.grouper.groupings)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 0813c12d573d5d..e82b641db98fdd 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -681,7 +681,47 @@ def _values(self):
         return self.values

     def get_values(self):
-        """ return the underlying data as an ndarray """
+        """
+        Return `Index` data as a `numpy.ndarray`.
+
+        Returns
+        -------
+        numpy.ndarray
+            A one-dimensional numpy array of the `Index` values.
+
+        See Also
+        --------
+        Index.values : The attribute that get_values wraps.
+
+        Examples
+        --------
+        Getting the `Index` values of a `DataFrame`:
+
+        >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+        ...                   index=['a', 'b', 'c'], columns=['A', 'B', 'C'])
+        >>> df
+           A  B  C
+        a  1  2  3
+        b  4  5  6
+        c  7  8  9
+        >>> df.index.get_values()
+        array(['a', 'b', 'c'], dtype=object)
+
+        Standalone `Index` values:
+
+        >>> idx = pd.Index(['1', '2', '3'])
+        >>> idx.get_values()
+        array(['1', '2', '3'], dtype=object)
+
+        `MultiIndex` arrays also have only one dimension:
+
+        >>> midx = pd.MultiIndex.from_arrays([[1, 2, 3], ['a', 'b', 'c']],
+        ...                                  names=('number', 'letter'))
+        >>> midx.get_values()
+        array([(1, 'a'), (2, 'b'), (3, 'c')], dtype=object)
+        >>> midx.get_values().ndim
+        1
+        """
         return self.values

     @Appender(IndexOpsMixin.memory_usage.__doc__)
@@ -696,12 +736,38 @@ def memory_usage(self, deep=False):
     @deprecate_kwarg(old_arg_name='n', new_arg_name='repeats')
     def repeat(self, repeats, *args, **kwargs):
         """
-        Repeat elements of an Index.
-        for more information about the `repeats` argument.
+        Repeat elements of an Index.
 
-        See also
+        Returns a new index where each element of the current index
+        is repeated consecutively a given number of times.
+
+        Parameters
+        ----------
+        repeats : int
+            The number of repetitions for each element.
+        **kwargs
+            Additional keywords have no effect but might be accepted for
+            compatibility with numpy.
+
+        Returns
+        -------
+        pandas.Index
+            Newly created Index with repeated elements.
+
+        See Also
+        --------
+        Series.repeat : Equivalent function for Series.
+        numpy.repeat : Underlying implementation.
+
+        Examples
         --------
-        numpy.ndarray.repeat
+        >>> idx = pd.Index([1, 2, 3])
+        >>> idx
+        Int64Index([1, 2, 3], dtype='int64')
+        >>> idx.repeat(2)
+        Int64Index([1, 1, 2, 2, 3, 3], dtype='int64')
+        >>> idx.repeat(3)
+        Int64Index([1, 1, 1, 2, 2, 2, 3, 3, 3], dtype='int64')
         """
         nv.validate_repeat(args, kwargs)
         return self._shallow_copy(self._values.repeat(repeats))
@@ -1122,7 +1188,26 @@ def to_frame(self, index=True):
 
         Returns
         -------
-        DataFrame : a DataFrame containing the original Index data.
+        DataFrame
+            DataFrame containing the original Index data.
+
+        Examples
+        --------
+        >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal')
+        >>> idx.to_frame()
+               animal
+        animal
+        Ant       Ant
+        Bear     Bear
+        Cow       Cow
+
+        By default, the original Index is reused. To enforce a new Index:
+
+        >>> idx.to_frame(index=False)
+          animal
+        0    Ant
+        1   Bear
+        2    Cow
         """
 
         from pandas import DataFrame
@@ -1401,7 +1486,7 @@ def _is_strictly_monotonic_decreasing(self):
     def is_lexsorted_for_tuple(self, tup):
         return True
 
-    @cache_readonly(allow_setting=True)
+    @cache_readonly
     def is_unique(self):
         """ return if the index has unique values """
         return self._engine.is_unique
@@ -1665,6 +1750,59 @@ def _invalid_indexer(self, form, key):
                                 kind=type(key)))
 
     def get_duplicates(self):
+        """
+        Extract duplicated index elements.
+
+        Returns a sorted list of index elements which appear more than once
+        in the index.
+
+        Returns
+        -------
+        array-like
+            List of duplicated index elements.
+
+        See Also
+        --------
+        Index.duplicated : Return boolean array denoting duplicates.
+        Index.drop_duplicates : Return Index with duplicates removed.
+
+        Examples
+        --------
+
+        Works on different types of Index.
+
+        >>> pd.Index([1, 2, 2, 3, 3, 3, 4]).get_duplicates()
+        [2, 3]
+        >>> pd.Index([1., 2., 2., 3., 3., 3., 4.]).get_duplicates()
+        [2.0, 3.0]
+        >>> pd.Index(['a', 'b', 'b', 'c', 'c', 'c', 'd']).get_duplicates()
+        ['b', 'c']
+        >>> dates = pd.to_datetime(['2018-01-01', '2018-01-02', '2018-01-03',
+        ...                         '2018-01-03', '2018-01-04', '2018-01-04'],
+        ...                        format='%Y-%m-%d')
+        >>> pd.Index(dates).get_duplicates()
+        DatetimeIndex(['2018-01-03', '2018-01-04'],
+                      dtype='datetime64[ns]', freq=None)
+
+        Sorts duplicated elements even when indexes are unordered.
+
+        >>> pd.Index([1, 2, 3, 2, 3, 4, 3]).get_duplicates()
+        [2, 3]
+
+        Returns an empty array-like structure when all elements are unique.
+
+        >>> pd.Index([1, 2, 3, 4]).get_duplicates()
+        []
+        >>> dates = pd.to_datetime(['2018-01-01', '2018-01-02', '2018-01-03'],
+        ...                        format='%Y-%m-%d')
+        >>> pd.Index(dates).get_duplicates()
+        DatetimeIndex([], dtype='datetime64[ns]', freq=None)
+
+        Notes
+        -----
+        In case of datetime-like indexes, the function is overridden where
+        the result is converted to DatetimeIndex.
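+
+        As a rough equivalent built only from public methods (an
+        illustrative sketch, not how this method is implemented
+        internally):
+
+        >>> idx = pd.Index([1, 2, 2, 3, 3, 3, 4])
+        >>> sorted(idx[idx.duplicated()].unique())
+        [2, 3]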
+        """
         from collections import defaultdict
         counter = defaultdict(lambda: 0)
         for k in self.values:
@@ -2212,12 +2350,59 @@ def sortlevel(self, level=None, ascending=True, sort_remaining=None):
 
     def shift(self, periods=1, freq=None):
         """
-        Shift Index containing datetime objects by input number of periods and
-        DateOffset
+        Shift index by desired number of time frequency increments.
+
+        This method is for shifting the values of datetime-like indexes
+        by a specified time increment a given number of times.
+
+        Parameters
+        ----------
+        periods : int, default 1
+            Number of periods (or increments) to shift by,
+            can be positive or negative.
+        freq : pandas.DateOffset, pandas.Timedelta or string, optional
+            Frequency increment to shift by.
+            If None, the index is shifted by its own `freq` attribute.
+            Offset aliases are valid strings, e.g., 'D', 'W', 'M', etc.
 
         Returns
         -------
-        shifted : Index
+        pandas.Index
+            Shifted index.
+
+        See Also
+        --------
+        Series.shift : Shift values of Series.
+
+        Examples
+        --------
+        Put the first 5 month starts of 2011 into an index.
+
+        >>> month_starts = pd.date_range('1/1/2011', periods=5, freq='MS')
+        >>> month_starts
+        DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01',
+                       '2011-05-01'],
+                      dtype='datetime64[ns]', freq='MS')
+
+        Shift the index by 10 days.
+
+        >>> month_starts.shift(10, freq='D')
+        DatetimeIndex(['2011-01-11', '2011-02-11', '2011-03-11', '2011-04-11',
+                       '2011-05-11'],
+                      dtype='datetime64[ns]', freq=None)
+
+        The default value of `freq` is the `freq` attribute of the index,
+        which is 'MS' (month start) in this example.
+
+        >>> month_starts.shift(10)
+        DatetimeIndex(['2011-11-01', '2011-12-01', '2012-01-01', '2012-02-01',
+                       '2012-03-01'],
+                      dtype='datetime64[ns]', freq='MS')
+
+        Notes
+        -----
+        This method is only implemented for datetime-like index classes,
+        i.e., DatetimeIndex, PeriodIndex and TimedeltaIndex.
         """
         raise NotImplementedError("Not supported for type %s" %
                                   type(self).__name__)
@@ -3930,8 +4115,52 @@ def unique(self, level=None):
         result = super(Index, self).unique()
         return self._shallow_copy(result)
 
-    @Appender(base._shared_docs['drop_duplicates'] % _index_doc_kwargs)
     def drop_duplicates(self, keep='first'):
+        """
+        Return Index with duplicate values removed.
+
+        Parameters
+        ----------
+        keep : {'first', 'last', ``False``}, default 'first'
+            - 'first' : Drop duplicates except for the first occurrence.
+            - 'last' : Drop duplicates except for the last occurrence.
+            - ``False`` : Drop all duplicates.
+
+        Returns
+        -------
+        deduplicated : Index
+
+        See Also
+        --------
+        Series.drop_duplicates : equivalent method on Series
+        DataFrame.drop_duplicates : equivalent method on DataFrame
+        Index.duplicated : related method on Index, indicating duplicate
+            Index values.
+
+        Examples
+        --------
+        Generate a pandas.Index with duplicate values.
+
+        >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'])
+
+        The `keep` parameter controls which duplicate values are removed.
+        The value 'first' keeps the first occurrence for each
+        set of duplicated entries. The default value of keep is 'first'.
+
+        >>> idx.drop_duplicates(keep='first')
+        Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object')
+
+        The value 'last' keeps the last occurrence for each set of duplicated
+        entries.
+
+        >>> idx.drop_duplicates(keep='last')
+        Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object')
+
+        The value ``False`` discards all sets of duplicated entries.
+
+        >>> idx.drop_duplicates(keep=False)
+        Index(['cow', 'beetle', 'hippo'], dtype='object')
+        """
         return super(Index, self).drop_duplicates(keep=keep)
 
     @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 218851b1713f2d..71d39ad812d207 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -76,7 +76,7 @@ class CategoricalIndex(Index, accessor.PandasDelegate):
     _attributes = ['name']
 
     def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
-                copy=False, name=None, fastpath=False, **kwargs):
+                copy=False, name=None, fastpath=False):
 
         if fastpath:
             return cls._simple_new(data, name=name, dtype=dtype)
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index 4c6effc65a4d36..1c41488973978a 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -37,7 +37,7 @@
     is_period_dtype,
     is_timedelta64_dtype)
 from pandas.core.dtypes.generic import (
-    ABCIndex, ABCSeries, ABCPeriodIndex, ABCIndexClass)
+    ABCIndex, ABCSeries, ABCDataFrame, ABCPeriodIndex, ABCIndexClass)
 from pandas.core.dtypes.missing import isna
 from pandas.core import common as com, algorithms, ops
 from pandas.core.algorithms import checked_add_with_arr
@@ -48,6 +48,7 @@
 from pandas.util._decorators import Appender, cache_readonly
 import pandas.core.dtypes.concat as _concat
 import pandas.tseries.frequencies as frequencies
+from pandas.tseries.offsets import Tick, DateOffset
 import pandas.core.indexes.base as ibase
 
 _index_doc_kwargs = dict(ibase._index_doc_kwargs)
@@ -81,21 +82,84 @@ class TimelikeOps(object):
 
     _round_doc = (
        """
-        %s the index to the specified freq
+        {op} the data to the specified `freq`.
 
        Parameters
        ----------
-        freq : freq string/object
+        freq : str or Offset
+            The frequency level to {op} the index to. Must be a fixed
+            frequency like 'S' (second) not 'ME' (month end). See
+            :ref:`frequency aliases <timeseries.offset_aliases>` for
+            a list of possible `freq` values.
 
        Returns
        -------
-        index of same type
+        DatetimeIndex, TimedeltaIndex, or Series
+            Index of the same type for a DatetimeIndex or TimedeltaIndex,
+            or a Series with the same index for a Series.
 
        Raises
        ------
-        ValueError if the freq cannot be converted
+        ValueError if the `freq` cannot be converted.
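+
+        A quick way to check whether a given frequency is fixed is to ask
+        the offset for its nanosecond length; non-fixed offsets raise.
+        This is an illustrative sketch only, and the exact error message
+        may vary by version:
+
+        >>> pd.tseries.frequencies.to_offset('M').nanos  # doctest: +SKIP
+        Traceback (most recent call last):
+            ...
+        ValueError: <MonthEnd> is a non-fixed frequency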
+ + Examples + -------- + **DatetimeIndex** + + >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') + >>> rng + DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', + '2018-01-01 12:01:00'], + dtype='datetime64[ns]', freq='T') + """) + + _round_example = ( + """>>> rng.round('H') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.round("H") + 0 2018-01-01 12:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 12:00:00 + dtype: datetime64[ns] """) + _floor_example = ( + """>>> rng.floor('H') + DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.floor("H") + 0 2018-01-01 11:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 12:00:00 + dtype: datetime64[ns] + """ + ) + + _ceil_example = ( + """>>> rng.ceil('H') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 13:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.ceil("H") + 0 2018-01-01 12:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 13:00:00 + dtype: datetime64[ns] + """ + ) + def _round(self, freq, rounder): # round the local times values = _ensure_datetimelike_to_i8(self) @@ -110,15 +174,15 @@ def _round(self, freq, rounder): return self._ensure_localized( self._shallow_copy(result, **attribs)) - @Appender(_round_doc % "round") + @Appender((_round_doc + _round_example).format(op="round")) def round(self, freq, *args, **kwargs): return self._round(freq, np.round) - @Appender(_round_doc % "floor") + @Appender((_round_doc + _floor_example).format(op="floor")) def floor(self, freq): return self._round(freq, np.floor) - @Appender(_round_doc % "ceil") + @Appender((_round_doc + _ceil_example).format(op="ceil")) def ceil(self, freq): return self._round(freq, np.ceil) @@ -666,6 +730,9 @@ def _sub_nat(self): def _sub_period(self, other): return NotImplemented + def _add_offset(self, offset): + raise com.AbstractMethodError(self) + def _addsub_offset_array(self, other, op): """ Add or subtract array-like of DateOffset objects @@ -705,14 +772,17 @@ def __add__(self, other): from pandas import DateOffset other = lib.item_from_zerodim(other) - if isinstance(other, ABCSeries): + if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented # scalar others elif other is NaT: result = self._add_nat() - elif isinstance(other, (DateOffset, timedelta, np.timedelta64)): + elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(other) + elif isinstance(other, DateOffset): + # specifically _not_ a Tick + result = self._add_offset(other) elif isinstance(other, (datetime, np.datetime64)): result = self._add_datelike(other) elif is_integer(other): @@ -733,6 +803,12 @@ def __add__(self, other): elif is_integer_dtype(other) and self.freq is None: # GH#19123 raise NullFrequencyError("Cannot shift with no freq") + elif is_float_dtype(other): + # Explicitly catch invalid dtypes + raise TypeError("cannot add {dtype}-dtype to {cls}" + .format(dtype=other.dtype, + cls=type(self).__name__)) + else: # pragma: no cover return NotImplemented @@ -753,17 +829,20 @@ def __radd__(self, other): cls.__radd__ = __radd__ def __sub__(self, other): - from pandas import Index, DateOffset + from pandas import Index other = lib.item_from_zerodim(other) - if isinstance(other, ABCSeries): + if isinstance(other, (ABCSeries, ABCDataFrame)): return 
NotImplemented # scalar others elif other is NaT: result = self._sub_nat() - elif isinstance(other, (DateOffset, timedelta, np.timedelta64)): + elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(-other) + elif isinstance(other, DateOffset): + # specifically _not_ a Tick + result = self._add_offset(-other) elif isinstance(other, (datetime, np.datetime64)): result = self._sub_datelike(other) elif is_integer(other): @@ -790,6 +869,12 @@ def __sub__(self, other): elif is_integer_dtype(other) and self.freq is None: # GH#19123 raise NullFrequencyError("Cannot shift with no freq") + + elif is_float_dtype(other): + # Explicitly catch invalid dtypes + raise TypeError("cannot subtract {dtype}-dtype from {cls}" + .format(dtype=other.dtype, + cls=type(self).__name__)) else: # pragma: no cover return NotImplemented diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index eb8133a1bbf974..b9c4b59536d0c1 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -138,11 +138,9 @@ def wrapper(self, other): result = func(np.asarray(other)) result = com._values_from_object(result) - if isinstance(other, Index): - o_mask = other.values.view('i8') == libts.iNaT - else: - o_mask = other.view('i8') == libts.iNaT - + # Make sure to pass an array to result[...]; indexing with + # Series breaks with older version of numpy + o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result @@ -215,6 +213,10 @@ class DatetimeIndex(DatelikeOps, TimelikeOps, DatetimeIndexOpsMixin, Attempt to infer fall dst-transition hours based on order name : object Name to be stored in the index + dayfirst : bool, default False + If True, parse dates in `data` with the day first order + yearfirst : bool, default False + If True parse dates in `data` with the year first order Attributes ---------- @@ -233,7 +235,6 @@ class DatetimeIndex(DatelikeOps, TimelikeOps, DatetimeIndexOpsMixin, week dayofweek weekday - weekday_name quarter tz freq @@ -262,6 +263,8 @@ class DatetimeIndex(DatelikeOps, TimelikeOps, DatetimeIndexOpsMixin, to_pydatetime to_series to_frame + month_name + day_name Notes ----- @@ -273,6 +276,7 @@ class DatetimeIndex(DatelikeOps, TimelikeOps, DatetimeIndexOpsMixin, Index : The base pandas Index type TimedeltaIndex : Index of timedelta64 data PeriodIndex : Index of Period data + pandas.to_datetime : Convert argument to datetime """ _typ = 'datetimeindex' @@ -320,7 +324,7 @@ def _add_comparison_methods(cls): _datetimelike_methods = ['to_period', 'tz_localize', 'tz_convert', 'normalize', 'strftime', 'round', 'floor', - 'ceil'] + 'ceil', 'month_name', 'day_name'] _is_numeric_dtype = False _infer_as_myclass = True @@ -328,10 +332,10 @@ def _add_comparison_methods(cls): @deprecate_kwarg(old_arg_name='infer_dst', new_arg_name='ambiguous', mapping={True: 'infer', False: 'raise'}) def __new__(cls, data=None, - freq=None, start=None, end=None, periods=None, - copy=False, name=None, tz=None, - verify_integrity=True, normalize=False, - closed=None, ambiguous='raise', dtype=None, **kwargs): + freq=None, start=None, end=None, periods=None, tz=None, + normalize=False, closed=None, ambiguous='raise', + dayfirst=False, yearfirst=False, dtype=None, + copy=False, name=None, verify_integrity=True): # This allows to later ensure that the 'copy' parameter is honored: if isinstance(data, Index): @@ -342,9 +346,6 @@ def __new__(cls, data=None, if name is None and hasattr(data, 'name'): name = data.name - dayfirst = kwargs.pop('dayfirst', 
None)
-        yearfirst = kwargs.pop('yearfirst', None)
-
         freq_infer = False
         if not isinstance(freq, DateOffset):
@@ -933,8 +934,6 @@ def _add_delta(self, delta):
             if not isinstance(delta, TimedeltaIndex):
                 delta = TimedeltaIndex(delta)
             new_values = self._add_delta_tdi(delta)
-        elif isinstance(delta, DateOffset):
-            new_values = self._add_offset(delta).asi8
         else:
             new_values = self.astype('O') + delta
 
@@ -945,6 +944,7 @@ def _add_delta(self, delta):
         return result
 
     def _add_offset(self, offset):
+        assert not isinstance(offset, Tick)
         try:
             if self.tz is not None:
                 values = self.tz_localize(None)
@@ -953,12 +953,13 @@ def _add_offset(self, offset):
             result = offset.apply_index(values)
             if self.tz is not None:
                 result = result.tz_localize(self.tz)
-            return result
 
         except NotImplementedError:
             warnings.warn("Non-vectorized DateOffset being applied to Series "
                           "or DatetimeIndex", PerformanceWarning)
-            return self.astype('O') + offset
+            result = self.astype('O') + offset
+
+        return DatetimeIndex(result, freq='infer')
 
     def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs):
         from pandas.io.formats.format import _get_format_datetime64_from_values
@@ -1715,7 +1716,7 @@ def freq(self, value):
     weekday_name = _field_accessor(
         'weekday_name',
         'weekday_name',
         "The name of day in a week (ex: Friday)\n\n.. deprecated:: 0.23.0")
 
     dayofyear = _field_accessor('dayofyear', 'doy',
                                 "The ordinal day of the year")
@@ -2099,6 +2100,58 @@ def to_julian_date(self):
                 self.nanosecond / 3600.0 / 1e+9
                 ) / 24.0)
 
+    def month_name(self, locale=None):
+        """
+        Return the month names of the DatetimeIndex with specified locale.
+
+        Parameters
+        ----------
+        locale : string, default None (English locale)
+            Locale determining the language in which to return the month name
+
+        Returns
+        -------
+        month_names : Index
+            Index of month names
+
+        .. versionadded:: 0.23.0
+        """
+        values = self.asi8
+        if self.tz is not None:
+            if self.tz is not utc:
+                values = self._local_timestamps()
+
+        result = fields.get_date_name_field(values, 'month_name',
+                                            locale=locale)
+        result = self._maybe_mask_results(result)
+        return Index(result, name=self.name)
+
+    def day_name(self, locale=None):
+        """
+        Return the day names of the DatetimeIndex with specified locale.
+
+        Parameters
+        ----------
+        locale : string, default None (English locale)
+            Locale determining the language in which to return the day name
+
+        Returns
+        -------
+        day_names : Index
+            Index of day names
+
+        .. versionadded:: 0.23.0
+        """
+        values = self.asi8
+        if self.tz is not None:
+            if self.tz is not utc:
+                values = self._local_timestamps()
+
+        result = fields.get_date_name_field(values, 'day_name',
+                                            locale=locale)
+        result = self._maybe_mask_results(result)
+        return Index(result, name=self.name)
+
 
 DatetimeIndex._add_comparison_methods()
 DatetimeIndex._add_numeric_methods_disabled()
@@ -2151,29 +2204,30 @@ def _generate_regular_range(start, end, periods, offset):
 def date_range(start=None, end=None, periods=None, freq='D', tz=None,
                normalize=False, name=None, closed=None, **kwargs):
     """
-    Return a fixed frequency DatetimeIndex, with day (calendar) as the default
-    frequency
+    Return a fixed frequency DatetimeIndex.
+
+    The default frequency is day (calendar).
 
     Parameters
    ----------
     start : string or datetime-like, default None
-        Left bound for generating dates
+        Left bound for generating dates.
     end : string or datetime-like, default None
-        Right bound for generating dates
+        Right bound for generating dates.
periods : integer, default None - Number of periods to generate + Number of periods to generate. freq : string or DateOffset, default 'D' (calendar daily) - Frequency strings can have multiples, e.g. '5H' + Frequency strings can have multiples, e.g. '5H'. tz : string, default None Time zone name for returning localized DatetimeIndex, for example - Asia/Hong_Kong + Asia/Hong_Kong. normalize : bool, default False - Normalize start/end dates to midnight before generating date range + Normalize start/end dates to midnight before generating date range. name : string, default None - Name of the resulting DatetimeIndex + Name of the resulting DatetimeIndex. closed : string, default None Make the interval closed with respect to the given frequency to - the 'left', 'right', or both sides (None) + the 'left', 'right', or both sides (None). Notes ----- @@ -2186,6 +2240,22 @@ def date_range(start=None, end=None, periods=None, freq='D', tz=None, Returns ------- rng : DatetimeIndex + + See Also + -------- + pandas.period_range : Return a fixed frequency PeriodIndex. + pandas.interval_range : Return a fixed frequency IntervalIndex. + + Examples + -------- + >>> pd.date_range('2018-10-03', periods=2) # doctest: +NORMALIZE_WHITESPACE + DatetimeIndex(['2018-10-03', '2018-10-04'], dtype='datetime64[ns]', + freq='D') + + >>> pd.date_range(start='2018-01-01', end='20180103') + ... # doctest: +NORMALIZE_WHITESPACE + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], + dtype='datetime64[ns]', freq='D') """ return DatetimeIndex(start=start, end=end, periods=periods, freq=freq, tz=tz, normalize=normalize, name=name, diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index d431ea1e51e317..ccf2e5e3c44864 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -213,8 +213,8 @@ class IntervalIndex(IntervalMixin, Index): _mask = None - def __new__(cls, data, closed=None, name=None, copy=False, dtype=None, - fastpath=False, verify_integrity=True): + def __new__(cls, data, closed=None, dtype=None, copy=False, + name=None, fastpath=False, verify_integrity=True): if fastpath: return cls._simple_new(data.left, data.right, closed, name, diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 73f4aee1c48808..be64f6f4bfd0f6 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -208,8 +208,8 @@ class MultiIndex(Index): rename = Index.set_names def __new__(cls, levels=None, labels=None, sortorder=None, names=None, - copy=False, verify_integrity=True, _set_identity=True, - name=None, **kwargs): + dtype=None, copy=False, name=None, + verify_integrity=True, _set_identity=True): # compat with Index if name is not None: @@ -1775,22 +1775,45 @@ def droplevel(self, level=0): def swaplevel(self, i=-2, j=-1): """ - Swap level i with level j. Do not change the ordering of anything + Swap level i with level j. + + Calling this method does not change the ordering of the values. Parameters ---------- - i, j : int, string (can be mixed) - Level of index to be swapped. Can pass level name as string. + i : int, str, default -2 + First level of index to be swapped. Can pass level name as string. + Type of parameters can be mixed. + j : int, str, default -1 + Second level of index to be swapped. Can pass level name as string. + Type of parameters can be mixed. Returns ------- - swapped : MultiIndex + MultiIndex + A new MultiIndex .. 
versionchanged:: 0.18.1
 
            The indexes ``i`` and ``j`` are now optional, and default to
            the two innermost levels of the index.
 
+        See Also
+        --------
+        Series.swaplevel : Swap levels i and j in a MultiIndex
+        DataFrame.swaplevel : Swap levels i and j in a MultiIndex on a
+            particular axis
+
+        Examples
+        --------
+        >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
+        ...                    labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
+        >>> mi
+        MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
+                   labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
+        >>> mi.swaplevel(0, 1)
+        MultiIndex(levels=[['bb', 'aa'], ['a', 'b']],
+                   labels=[[0, 1, 0, 1], [0, 0, 1, 1]])
         """
         new_levels = list(self.levels)
         new_labels = list(self.labels)
diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py
index a4558116bfa637..1fe0c8fa289e69 100644
--- a/pandas/core/indexes/numeric.py
+++ b/pandas/core/indexes/numeric.py
@@ -131,7 +131,7 @@ def is_all_dates(self):
 
     Attributes
     ----------
-    inferred_type
+    None
 
     Methods
     -------
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
index b936a4e26af605..705dc36d925226 100644
--- a/pandas/core/indexes/period.py
+++ b/pandas/core/indexes/period.py
@@ -21,10 +21,11 @@
 import pandas.tseries.frequencies as frequencies
 from pandas.tseries.frequencies import get_freq_code as _gfc
+from pandas.tseries.offsets import Tick, DateOffset
+
 from pandas.core.indexes.datetimes import DatetimeIndex, Int64Index, Index
 from pandas.core.indexes.datetimelike import DatelikeOps, DatetimeIndexOpsMixin
 from pandas.core.tools.datetimes import parse_time_string
-import pandas.tseries.offsets as offsets
 
 from pandas._libs.lib import infer_dtype
 from pandas._libs import tslib, index as libindex
@@ -233,8 +234,15 @@ def _add_comparison_methods(cls):
         cls.__ge__ = _period_index_cmp('__ge__', cls)
 
     def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None,
-                periods=None, copy=False, name=None, tz=None, dtype=None,
-                **kwargs):
+                periods=None, tz=None, dtype=None, copy=False, name=None,
+                **fields):
+
+        valid_field_set = {'year', 'month', 'day', 'quarter',
+                           'hour', 'minute', 'second'}
+
+        if not set(fields).issubset(valid_field_set):
+            raise TypeError('__new__() got an unexpected keyword argument {}'.
+ format(list(set(fields) - valid_field_set)[0])) if periods is not None: if is_float(periods): @@ -266,7 +274,7 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, data = np.asarray(ordinal, dtype=np.int64) else: data, freq = cls._generate_range(start, end, periods, - freq, kwargs) + freq, fields) return cls._from_ordinals(data, name=name, freq=freq) if isinstance(data, PeriodIndex): @@ -680,9 +688,9 @@ def to_timestamp(self, freq=None, how='start'): def _maybe_convert_timedelta(self, other): if isinstance( - other, (timedelta, np.timedelta64, offsets.Tick, np.ndarray)): + other, (timedelta, np.timedelta64, Tick, np.ndarray)): offset = frequencies.to_offset(self.freq.rule_code) - if isinstance(offset, offsets.Tick): + if isinstance(offset, Tick): if isinstance(other, np.ndarray): nanos = np.vectorize(delta_to_nanoseconds)(other) else: @@ -691,7 +699,7 @@ def _maybe_convert_timedelta(self, other): check = np.all(nanos % offset_nanos == 0) if check: return nanos // offset_nanos - elif isinstance(other, offsets.DateOffset): + elif isinstance(other, DateOffset): freqstr = other.rule_code base = frequencies.get_base_alias(freqstr) if base == self.freq.rule_code: @@ -707,6 +715,30 @@ def _maybe_convert_timedelta(self, other): msg = "Input has different freq from PeriodIndex(freq={0})" raise IncompatibleFrequency(msg.format(self.freqstr)) + def _add_offset(self, other): + assert not isinstance(other, Tick) + base = frequencies.get_base_alias(other.rule_code) + if base != self.freq.rule_code: + msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) + raise IncompatibleFrequency(msg) + return self.shift(other.n) + + def _add_delta_td(self, other): + assert isinstance(other, (timedelta, np.timedelta64, Tick)) + nanos = delta_to_nanoseconds(other) + own_offset = frequencies.to_offset(self.freq.rule_code) + + if isinstance(own_offset, Tick): + offset_nanos = delta_to_nanoseconds(own_offset) + if np.all(nanos % offset_nanos == 0): + return self.shift(nanos // offset_nanos) + + # raise when input doesn't have freq + raise IncompatibleFrequency("Input has different freq from " + "{cls}(freq={freqstr})" + .format(cls=type(self).__name__, + freqstr=self.freqstr)) + def _add_delta(self, other): ordinal_delta = self._maybe_convert_timedelta(other) return self.shift(ordinal_delta) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 9d770cffb00599..4e192548a1f2d7 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -53,6 +53,10 @@ class RangeIndex(Int64Index): Index : The base pandas Index type Int64Index : Index of int64 data + Attributes + ---------- + None + Methods ------- from_range @@ -61,8 +65,8 @@ class RangeIndex(Int64Index): _typ = 'rangeindex' _engine_type = libindex.Int64Engine - def __new__(cls, start=None, stop=None, step=None, name=None, dtype=None, - fastpath=False, copy=False, **kwargs): + def __new__(cls, start=None, stop=None, step=None, + dtype=None, copy=False, name=None, fastpath=False): if fastpath: return cls._simple_new(start, stop, step, name=name) @@ -546,7 +550,7 @@ def __getitem__(self, key): stop = self._start + self._step * stop step = self._step * step - return RangeIndex(start, stop, step, self.name, fastpath=True) + return RangeIndex(start, stop, step, name=self.name, fastpath=True) # fall back to Int64Index return super_getitem(key) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index c42c0656c585a6..969afccdbc7558 100644 --- 
a/pandas/core/indexes/timedeltas.py
+++ b/pandas/core/indexes/timedeltas.py
@@ -197,10 +197,9 @@ def _add_comparison_methods(cls):
 
     freq = None
 
-    def __new__(cls, data=None, unit=None,
-                freq=None, start=None, end=None, periods=None,
-                copy=False, name=None,
-                closed=None, verify_integrity=True, **kwargs):
+    def __new__(cls, data=None, unit=None, freq=None, start=None, end=None,
+                periods=None, closed=None, dtype=None, copy=False,
+                name=None, verify_integrity=True):
 
         if isinstance(data, TimedeltaIndex) and freq is None and name is None:
             if copy:
@@ -353,6 +352,12 @@ def _maybe_update_attributes(self, attrs):
                 attrs['freq'] = 'infer'
         return attrs
 
+    def _add_offset(self, other):
+        assert not isinstance(other, Tick)
+        raise TypeError("cannot add the type {typ} to a {cls}"
+                        .format(typ=type(other).__name__,
+                                cls=type(self).__name__))
+
     def _add_delta(self, delta):
         """
         Add a timedelta-like, Tick, or TimedeltaIndex-like object
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 2aa490cd02afb1..560e7638b55107 100755
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -905,9 +905,8 @@ def _multi_take(self, tup):
         """
         try:
             o = self.obj
-            d = dict(
-                [(a, self._convert_for_reindex(t, axis=o._get_axis_number(a)))
-                 for t, a in zip(tup, o._AXIS_ORDERS)])
+            d = {a: self._convert_for_reindex(t, axis=o._get_axis_number(a))
+                 for t, a in zip(tup, o._AXIS_ORDERS)}
             return o.reindex(**d)
         except(KeyError, IndexingError):
             raise self._exception
@@ -1921,11 +1920,49 @@ def _convert_key(self, key, is_setter=False):
 
 class _iAtIndexer(_ScalarAccessIndexer):
-    """Fast integer location scalar accessor.
+    """
+    Access a single value for a row/column pair by integer position.
 
-    Similarly to ``iloc``, ``iat`` provides **integer** based lookups.
-    You can also set using these indexers.
+    Similar to ``iloc``, in that both provide integer-based lookups. Use
+    ``iat`` if you only need to get or set a single value in a DataFrame
+    or Series.
+
+    See Also
+    --------
+    DataFrame.at : Access a single value for a row/column label pair
+    DataFrame.loc : Access a group of rows and columns by label(s)
+    DataFrame.iloc : Access a group of rows and columns by integer position(s)
+
+    Examples
+    --------
+    >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]],
+    ...                   columns=['A', 'B', 'C'])
+    >>> df
+        A   B   C
+    0   0   2   3
+    1   0   4   1
+    2  10  20  30
+
+    Get value at specified row/column pair
+
+    >>> df.iat[1, 2]
+    1
+
+    Set value at specified row/column pair
+
+    >>> df.iat[1, 2] = 10
+    >>> df.iat[1, 2]
+    10
+
+    Get value within a series
+
+    >>> df.loc[0].iat[1]
+    2
+
+    Raises
+    ------
+    IndexError
+        When integer position is out of bounds
     """
 
     _takeable = True
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 00ef8f9cef5985..240c9b1f3377cd 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -2905,6 +2905,35 @@ def shift(self, periods, axis=0, mgr=None):
         return [self.make_block_same_class(new_values,
                                            placement=self.mgr_locs)]
 
+    def diff(self, n, axis=0, mgr=None):
+        """First discrete difference
+
+        Parameters
+        ----------
+        n : int, number of periods to diff
+        axis : int, axis to diff upon. default 0
+        mgr : default None
+
+        Returns
+        -------
+        A list with a new TimeDeltaBlock.
+
+        Notes
+        -----
+        The arguments here are mimicking shift so they are called correctly
+        by apply.
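+
+        A sketch of the user-facing behaviour this block-level method
+        enables (illustrative only; output formatting may differ slightly
+        by version):
+
+        >>> df = pd.DataFrame({'ts': pd.date_range('2018-01-01', periods=3)})
+        >>> df.diff()  # doctest: +SKIP
+               ts
+        0     NaT
+        1  1 days
+        2  1 days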
+ """ + if axis == 0: + # Cannot currently calculate diff across multiple blocks since this + # function is invoked via apply + raise NotImplementedError + new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8 + + # Reshape the new_values like how algos.diff does for timedelta data + new_values = new_values.reshape(1, len(new_values)) + new_values = new_values.astype('timedelta64[ns]') + return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] + def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 7bdbac66b4f311..6c6a54993b6697 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -10,8 +10,7 @@ import numpy as np import pandas as pd -from pandas._libs import (lib, index as libindex, - algos as libalgos) +from pandas._libs import algos as libalgos, ops as libops from pandas import compat from pandas.util._decorators import Appender @@ -344,50 +343,93 @@ def _get_op_name(op, special): # ----------------------------------------------------------------------------- # Docstring Generation and Templates +_add_example_FRAME = """ +>>> a = pd.DataFrame([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'], + columns=['one']) +>>> a + one +a 1.0 +b 1.0 +c 1.0 +d NaN +>>> b = pd.DataFrame(dict(one=[1, np.nan, 1, np.nan], + two=[np.nan, 2, np.nan, 2]), + index=['a', 'b', 'd', 'e']) +>>> b + one two +a 1.0 NaN +b NaN 2.0 +d 1.0 NaN +e NaN 2.0 +>>> a.add(b, fill_value=0) + one two +a 2.0 NaN +b 1.0 2.0 +c 1.0 NaN +d 1.0 NaN +e NaN 2.0 +""" + _op_descriptions = { + # Arithmetic Operators 'add': {'op': '+', 'desc': 'Addition', - 'reverse': 'radd'}, + 'reverse': 'radd', + 'df_examples': _add_example_FRAME}, 'sub': {'op': '-', 'desc': 'Subtraction', - 'reverse': 'rsub'}, + 'reverse': 'rsub', + 'df_examples': None}, 'mul': {'op': '*', 'desc': 'Multiplication', - 'reverse': 'rmul'}, + 'reverse': 'rmul', + 'df_examples': None}, 'mod': {'op': '%', 'desc': 'Modulo', - 'reverse': 'rmod'}, + 'reverse': 'rmod', + 'df_examples': None}, 'pow': {'op': '**', 'desc': 'Exponential power', - 'reverse': 'rpow'}, + 'reverse': 'rpow', + 'df_examples': None}, 'truediv': {'op': '/', 'desc': 'Floating division', - 'reverse': 'rtruediv'}, + 'reverse': 'rtruediv', + 'df_examples': None}, 'floordiv': {'op': '//', 'desc': 'Integer division', - 'reverse': 'rfloordiv'}, + 'reverse': 'rfloordiv', + 'df_examples': None}, 'divmod': {'op': 'divmod', 'desc': 'Integer division and modulo', - 'reverse': None}, + 'reverse': None, + 'df_examples': None}, + # Comparison Operators 'eq': {'op': '==', - 'desc': 'Equal to', - 'reverse': None}, + 'desc': 'Equal to', + 'reverse': None, + 'df_examples': None}, 'ne': {'op': '!=', - 'desc': 'Not equal to', - 'reverse': None}, + 'desc': 'Not equal to', + 'reverse': None, + 'df_examples': None}, 'lt': {'op': '<', - 'desc': 'Less than', - 'reverse': None}, + 'desc': 'Less than', + 'reverse': None, + 'df_examples': None}, 'le': {'op': '<=', - 'desc': 'Less than or equal to', - 'reverse': None}, + 'desc': 'Less than or equal to', + 'reverse': None, + 'df_examples': None}, 'gt': {'op': '>', - 'desc': 'Greater than', - 'reverse': None}, + 'desc': 'Greater than', + 'reverse': None, + 'df_examples': None}, 'ge': {'op': '>=', - 'desc': 'Greater than or equal to', - 'reverse': None}} + 'desc': 'Greater than or equal to', + 'reverse': None, + 'df_examples': None}} _op_names = list(_op_descriptions.keys()) for key in _op_names: @@ -533,30 +575,7 @@ def _get_op_name(op, 
special): Examples -------- ->>> a = pd.DataFrame([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'], - columns=['one']) ->>> a - one -a 1.0 -b 1.0 -c 1.0 -d NaN ->>> b = pd.DataFrame(dict(one=[1, np.nan, 1, np.nan], - two=[np.nan, 2, np.nan, 2]), - index=['a', 'b', 'd', 'e']) ->>> b - one two -a 1.0 NaN -b NaN 2.0 -d 1.0 NaN -e NaN 2.0 ->>> a.add(b, fill_value=0) - one two -a 2.0 NaN -b 1.0 2.0 -c 1.0 NaN -d 1.0 NaN -e NaN 2.0 +{df_examples} See also -------- @@ -623,14 +642,19 @@ def _make_flex_doc(op_name, typ): if typ == 'series': base_doc = _flex_doc_SERIES + doc = base_doc.format(desc=op_desc['desc'], op_name=op_name, + equiv=equiv, reverse=op_desc['reverse']) elif typ == 'dataframe': base_doc = _flex_doc_FRAME + doc = base_doc.format(desc=op_desc['desc'], op_name=op_name, + equiv=equiv, reverse=op_desc['reverse'], + df_examples=op_desc['df_examples']) elif typ == 'panel': base_doc = _flex_doc_PANEL + doc = base_doc.format(desc=op_desc['desc'], op_name=op_name, + equiv=equiv, reverse=op_desc['reverse']) else: raise AssertionError('Invalid typ argument.') - doc = base_doc.format(desc=op_desc['desc'], op_name=op_name, - equiv=equiv, reverse=op_desc['reverse']) return doc @@ -972,9 +996,9 @@ def _arith_method_SERIES(cls, op, special): code duplication. """ str_rep = _get_opstr(op, cls) - name = _get_op_name(op, special) - eval_kwargs = _gen_eval_kwargs(name) - fill_zeros = _gen_fill_zeros(name) + op_name = _get_op_name(op, special) + eval_kwargs = _gen_eval_kwargs(op_name) + fill_zeros = _gen_fill_zeros(op_name) construct_result = (_construct_divmod_result if op is divmod else _construct_result) @@ -997,7 +1021,7 @@ def na_op(x, y): result, changed = maybe_upcast_putmask(result, ~mask, np.nan) - result = missing.fill_zeros(result, x, y, name, fill_zeros) + result = missing.fill_zeros(result, x, y, op_name, fill_zeros) return result def safe_na_op(lvalues, rvalues): @@ -1010,7 +1034,7 @@ def safe_na_op(lvalues, rvalues): lambda x: op(x, rvalues)) raise - def wrapper(left, right, name=name, na_op=na_op): + def wrapper(left, right): if isinstance(right, ABCDataFrame): return NotImplemented @@ -1090,9 +1114,9 @@ def _comp_method_OBJECT_ARRAY(op, x, y): if isinstance(y, (ABCSeries, ABCIndex)): y = y.values - result = lib.vec_compare(x, y, op) + result = libops.vec_compare(x, y, op) else: - result = lib.scalar_compare(x, y, op) + result = libops.scalar_compare(x, y, op) return result @@ -1101,8 +1125,8 @@ def _comp_method_SERIES(cls, op, special): Wrapper function for Series arithmetic operations, to avoid code duplication. 
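+
+    A rough illustration of the comparison behaviour the generated
+    wrappers provide (an indicative sketch only; the wrapper itself is
+    internal):
+
+    >>> pd.Series([1, 2, 3]) > 2
+    0    False
+    1    False
+    2     True
+    dtype: bool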
""" - name = _get_op_name(op, special) - masker = _gen_eval_kwargs(name).get('masker', False) + op_name = _get_op_name(op, special) + masker = _gen_eval_kwargs(op_name).get('masker', False) def na_op(x, y): @@ -1127,24 +1151,20 @@ def na_op(x, y): # integer comparisons # we have a datetime/timedelta and may need to convert + assert not needs_i8_conversion(x) mask = None - if (needs_i8_conversion(x) or - (not is_scalar(y) and needs_i8_conversion(y))): - - if is_scalar(y): - mask = isna(x) - y = libindex.convert_scalar(x, com._values_from_object(y)) - else: - mask = isna(x) | isna(y) - y = y.view('i8') + if not is_scalar(y) and needs_i8_conversion(y): + mask = isna(x) | isna(y) + y = y.view('i8') x = x.view('i8') - try: + method = getattr(x, op_name, None) + if method is not None: with np.errstate(all='ignore'): - result = getattr(x, name)(y) + result = method(y) if result is NotImplemented: raise TypeError("invalid type comparison") - except AttributeError: + else: result = op(x, y) if mask is not None and mask.any(): @@ -1174,6 +1194,14 @@ def wrapper(self, other, axis=None): return self._constructor(res_values, index=self.index, name=res_name) + if is_datetime64_dtype(self) or is_datetime64tz_dtype(self): + # Dispatch to DatetimeIndex to ensure identical + # Series/Index behavior + res_values = dispatch_to_index_op(op, self, other, + pd.DatetimeIndex) + return self._constructor(res_values, index=self.index, + name=res_name) + elif is_timedelta64_dtype(self): res_values = dispatch_to_index_op(op, self, other, pd.TimedeltaIndex) @@ -1191,8 +1219,7 @@ def wrapper(self, other, axis=None): elif isinstance(other, (np.ndarray, pd.Index)): # do not check length of zerodim array # as it will broadcast - if (not is_scalar(lib.item_from_zerodim(other)) and - len(self) != len(other)): + if other.ndim != 0 and len(self) != len(other): raise ValueError('Lengths must match to compare') res_values = na_op(self.values, np.asarray(other)) @@ -1215,7 +1242,7 @@ def wrapper(self, other, axis=None): else: res_values = np.zeros(len(self), dtype=bool) return self._constructor(res_values, index=self.index, - name=self.name, dtype='bool') + name=res_name, dtype='bool') else: values = self.get_values() @@ -1230,8 +1257,8 @@ def wrapper(self, other, axis=None): # always return a full value series here res_values = com._values_from_object(res) - return pd.Series(res_values, index=self.index, - name=res_name, dtype='bool') + return self._constructor(res_values, index=self.index, + name=res_name, dtype='bool') return wrapper @@ -1255,13 +1282,13 @@ def na_op(x, y): else: x = _ensure_object(x) y = _ensure_object(y) - result = lib.vec_binop(x, y, op) + result = libops.vec_binop(x, y, op) else: # let null fall thru if not isna(y): y = bool(y) try: - result = lib.scalar_binop(x, y, op) + result = libops.scalar_binop(x, y, op) except: raise TypeError("cannot compare a dtyped [{dtype}] array " "with a scalar of type [{typ}]" @@ -1428,10 +1455,10 @@ def to_series(right): def _arith_method_FRAME(cls, op, special): str_rep = _get_opstr(op, cls) - name = _get_op_name(op, special) - eval_kwargs = _gen_eval_kwargs(name) - fill_zeros = _gen_fill_zeros(name) - default_axis = _get_frame_op_default_axis(name) + op_name = _get_op_name(op, special) + eval_kwargs = _gen_eval_kwargs(op_name) + fill_zeros = _gen_fill_zeros(op_name) + default_axis = _get_frame_op_default_axis(op_name) def na_op(x, y): import pandas.core.computation.expressions as expressions @@ -1441,7 +1468,7 @@ def na_op(x, y): except TypeError: xrav = x.ravel() if 
isinstance(y, (np.ndarray, ABCSeries)): - dtype = np.find_common_type([x.dtype, y.dtype], []) + dtype = find_common_type([x.dtype, y.dtype]) result = np.empty(x.size, dtype=dtype) yrav = y.ravel() mask = notna(xrav) & notna(yrav) @@ -1469,20 +1496,20 @@ def na_op(x, y): else: raise TypeError("cannot perform operation {op} between " "objects of type {x} and {y}" - .format(op=name, x=type(x), y=type(y))) + .format(op=op_name, x=type(x), y=type(y))) result, changed = maybe_upcast_putmask(result, ~mask, np.nan) result = result.reshape(x.shape) - result = missing.fill_zeros(result, x, y, name, fill_zeros) + result = missing.fill_zeros(result, x, y, op_name, fill_zeros) return result - if name in _op_descriptions: + if op_name in _op_descriptions: # i.e. include "add" but not "__add__" - doc = _make_flex_doc(name, 'dataframe') + doc = _make_flex_doc(op_name, 'dataframe') else: - doc = _arith_doc_FRAME % name + doc = _arith_doc_FRAME % op_name @Appender(doc) def f(self, other, axis=default_axis, level=None, fill_value=None): @@ -1501,15 +1528,15 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): return self._combine_const(other, na_op, try_cast=True) - f.__name__ = name + f.__name__ = op_name return f def _flex_comp_method_FRAME(cls, op, special): str_rep = _get_opstr(op, cls) - name = _get_op_name(op, special) - default_axis = _get_frame_op_default_axis(name) + op_name = _get_op_name(op, special) + default_axis = _get_frame_op_default_axis(op_name) def na_op(x, y): try: @@ -1520,7 +1547,7 @@ def na_op(x, y): return result @Appender('Wrapper for flexible comparison methods {name}' - .format(name=name)) + .format(name=op_name)) def f(self, other, axis=default_axis, level=None): other = _align_method_FRAME(self, other, axis) @@ -1539,16 +1566,16 @@ def f(self, other, axis=default_axis, level=None): else: return self._combine_const(other, na_op, try_cast=False) - f.__name__ = name + f.__name__ = op_name return f def _comp_method_FRAME(cls, func, special): str_rep = _get_opstr(func, cls) - name = _get_op_name(func, special) + op_name = _get_op_name(func, special) - @Appender('Wrapper for comparison method {name}'.format(name=name)) + @Appender('Wrapper for comparison method {name}'.format(name=op_name)) def f(self, other): if isinstance(other, ABCDataFrame): # Another DataFrame @@ -1570,7 +1597,7 @@ def f(self, other): try_cast=False) return res.fillna(True).astype(bool) - f.__name__ = name + f.__name__ = op_name return f @@ -1580,7 +1607,7 @@ def f(self, other): def _arith_method_PANEL(cls, op, special): # work only for scalars - name = _get_op_name(op, special) + op_name = _get_op_name(op, special) def f(self, other): if not is_scalar(other): @@ -1590,13 +1617,13 @@ def f(self, other): return self._combine(other, op) - f.__name__ = name + f.__name__ = op_name return f def _comp_method_PANEL(cls, op, special): str_rep = _get_opstr(op, cls) - name = _get_op_name(op, special) + op_name = _get_op_name(op, special) def na_op(x, y): import pandas.core.computation.expressions as expressions @@ -1607,7 +1634,7 @@ def na_op(x, y): result = mask_cmp_op(x, y, op, np.ndarray) return result - @Appender('Wrapper for comparison method {name}'.format(name=name)) + @Appender('Wrapper for comparison method {name}'.format(name=op_name)) def f(self, other, axis=None): # Validate the axis parameter if axis is not None: @@ -1622,16 +1649,16 @@ def f(self, other, axis=None): else: return self._combine_const(other, na_op, try_cast=False) - f.__name__ = name + f.__name__ = op_name return f def 
_flex_method_PANEL(cls, op, special): str_rep = _get_opstr(op, cls) - name = _get_op_name(op, special) - eval_kwargs = _gen_eval_kwargs(name) - fill_zeros = _gen_fill_zeros(name) + op_name = _get_op_name(op, special) + eval_kwargs = _gen_eval_kwargs(op_name) + fill_zeros = _gen_fill_zeros(op_name) def na_op(x, y): import pandas.core.computation.expressions as expressions @@ -1646,20 +1673,20 @@ def na_op(x, y): # handles discrepancy between numpy and numexpr on division/mod # by 0 though, given that these are generally (always?) # non-scalars, I'm not sure whether it's worth it at the moment - result = missing.fill_zeros(result, x, y, name, fill_zeros) + result = missing.fill_zeros(result, x, y, op_name, fill_zeros) return result - if name in _op_descriptions: - doc = _make_flex_doc(name, 'panel') + if op_name in _op_descriptions: + doc = _make_flex_doc(op_name, 'panel') else: # doc strings substitors - doc = _agg_doc_PANEL.format(op_name=name) + doc = _agg_doc_PANEL.format(op_name=op_name) @Appender(doc) def f(self, other, axis=0): return self._combine(other, na_op, axis=axis) - f.__name__ = name + f.__name__ = op_name return f @@ -1701,7 +1728,7 @@ def _arith_method_SPARSE_SERIES(cls, op, special): Wrapper function for Series arithmetic operations, to avoid code duplication. """ - name = _get_op_name(op, special) + op_name = _get_op_name(op, special) def wrapper(self, other): if isinstance(other, ABCDataFrame): @@ -1709,7 +1736,7 @@ def wrapper(self, other): elif isinstance(other, ABCSeries): if not isinstance(other, ABCSparseSeries): other = other.to_sparse(fill_value=self.fill_value) - return _sparse_series_op(self, other, op, name) + return _sparse_series_op(self, other, op, op_name) elif is_scalar(other): with np.errstate(all='ignore'): new_values = op(self.values, other) @@ -1720,7 +1747,7 @@ def wrapper(self, other): raise TypeError('operation with {other} not supported' .format(other=type(other))) - wrapper.__name__ = name + wrapper.__name__ = op_name return wrapper @@ -1740,7 +1767,7 @@ def _arith_method_SPARSE_ARRAY(cls, op, special): Wrapper function for Series arithmetic operations, to avoid code duplication. 
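+
+    A rough illustration of what the generated methods do (an indicative
+    sketch only; ``to_dense`` is used here to keep the printed output
+    simple):
+
+    >>> sp = pd.SparseSeries([1.0, 2.0], fill_value=0.0)
+    >>> (sp + sp).to_dense()  # doctest: +SKIP
+    0    2.0
+    1    4.0
+    dtype: float64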
""" - name = _get_op_name(op, special) + op_name = _get_op_name(op, special) def wrapper(self, other): from pandas.core.sparse.array import ( @@ -1753,16 +1780,16 @@ def wrapper(self, other): dtype = getattr(other, 'dtype', None) other = SparseArray(other, fill_value=self.fill_value, dtype=dtype) - return _sparse_array_op(self, other, op, name) + return _sparse_array_op(self, other, op, op_name) elif is_scalar(other): with np.errstate(all='ignore'): fill = op(_get_fill(self), np.asarray(other)) result = op(self.sp_values, other) - return _wrap_result(name, result, self.sp_index, fill) + return _wrap_result(op_name, result, self.sp_index, fill) else: # pragma: no cover raise TypeError('operation with {other} not supported' .format(other=type(other))) - wrapper.__name__ = name + wrapper.__name__ = op_name return wrapper diff --git a/pandas/core/panel.py b/pandas/core/panel.py index fc7fad861df442..052d555df76f11 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -204,10 +204,8 @@ def _init_dict(self, data, axes, dtype=None): for k, v in compat.iteritems(data) if k in haxis) else: - ks = list(data.keys()) - if not isinstance(data, OrderedDict): - ks = com._try_sort(ks) - haxis = Index(ks) + keys = com._dict_keys_to_ordered_list(data) + haxis = Index(keys) for k, v in compat.iteritems(data): if isinstance(v, dict): diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 359c030157bd35..30132ddc05c406 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -1,6 +1,7 @@ """ Quantilization functions and related stuff """ +from functools import partial from pandas.core.dtypes.missing import isna from pandas.core.dtypes.common import ( @@ -9,6 +10,7 @@ is_categorical_dtype, is_datetime64_dtype, is_timedelta64_dtype, + is_datetime64tz_dtype, _ensure_int64) import pandas.core.algorithms as algos @@ -239,7 +241,8 @@ def _bins_to_cuts(x, bins, right=True, labels=None, ids = _ensure_int64(bins.searchsorted(x, side=side)) if include_lowest: - ids[x == bins[0]] = 1 + # Numpy 1.9 support: ensure this mask is a Numpy array + ids[np.asarray(x == bins[0])] = 1 na_mask = isna(x) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() @@ -284,12 +287,14 @@ def _coerce_to_type(x): """ dtype = None - if is_timedelta64_dtype(x): - x = to_timedelta(x) - dtype = np.timedelta64 + if is_datetime64tz_dtype(x): + dtype = x.dtype elif is_datetime64_dtype(x): x = to_datetime(x) dtype = np.datetime64 + elif is_timedelta64_dtype(x): + x = to_timedelta(x) + dtype = np.timedelta64 if dtype is not None: # GH 19768: force NaT to NaN during integer conversion @@ -305,7 +310,7 @@ def _convert_bin_to_numeric_type(bins, dtype): Parameters ---------- - bins : list-liek of bins + bins : list-like of bins dtype : dtype of data Raises @@ -318,7 +323,7 @@ def _convert_bin_to_numeric_type(bins, dtype): bins = to_timedelta(bins).view(np.int64) else: raise ValueError("bins must be of timedelta64 dtype") - elif is_datetime64_dtype(dtype): + elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): if bins_dtype in ['datetime', 'datetime64']: bins = to_datetime(bins).view(np.int64) else: @@ -333,7 +338,10 @@ def _format_labels(bins, precision, right=True, closed = 'right' if right else 'left' - if is_datetime64_dtype(dtype): + if is_datetime64tz_dtype(dtype): + formatter = partial(Timestamp, tz=dtype.tz) + adjust = lambda x: x - Timedelta('1ns') + elif is_datetime64_dtype(dtype): formatter = Timestamp adjust = lambda x: x - Timedelta('1ns') elif is_timedelta64_dtype(dtype): @@ 
-372,7 +380,13 @@ def _preprocess_for_cut(x): series_index = x.index name = x.name - x = np.asarray(x) + # Check that the passed array is a Pandas or Numpy object + # We don't want to strip away a Pandas data-type here (e.g. datetimetz) + ndim = getattr(x, 'ndim', None) + if ndim is None: + x = np.asarray(x) + if x.ndim != 1: + raise ValueError("Input array must be 1 dimensional") return x_is_series, series_index, name, x diff --git a/pandas/core/series.py b/pandas/core/series.py index 6822f1f6b58b50..19a9a0cf3da0f7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -54,7 +54,7 @@ from pandas import compat from pandas.io.formats.terminal import get_terminal_size from pandas.compat import ( - zip, u, OrderedDict, StringIO, range, get_range_parameters) + zip, u, OrderedDict, StringIO, range, get_range_parameters, PY36) from pandas.compat.numpy import function as nv import pandas.core.ops as ops @@ -130,6 +130,11 @@ class Series(base.IndexOpsMixin, generic.NDFrame): ---------- data : array-like, dict, or scalar value Contains data stored in Series + + .. versionchanged :: 0.23.0 + If data is a dict, argument order is maintained for Python 3.6 + and later. + index : array-like or Index (1d) Values must be hashable and have the same length as `data`. Non-unique index values are allowed. Will default to @@ -297,7 +302,7 @@ def _init_dict(self, data, index=None, dtype=None): # Now we just make sure the order is respected, if any if index is not None: s = s.reindex(index, copy=False) - elif not isinstance(data, OrderedDict): + elif not PY36 and not isinstance(data, OrderedDict): try: s = s.sort_index() except TypeError: @@ -542,6 +547,71 @@ def __len__(self): return len(self._data) def view(self, dtype=None): + """ + Create a new view of the Series. + + This function will return a new Series with a view of the same + underlying values in memory, optionally reinterpreted with a new data + type. The new data type must preserve the same size in bytes as to not + cause index misalignment. + + Parameters + ---------- + dtype : data type + Data type object or one of their string representations. + + Returns + ------- + Series + A new Series object as a view of the same data in memory. + + See Also + -------- + numpy.ndarray.view : Equivalent numpy function to create a new view of + the same data in memory. + + Notes + ----- + Series are instantiated with ``dtype=float64`` by default. While + ``numpy.ndarray.view()`` will return a view with the same data type as + the original array, ``Series.view()`` (without specified dtype) + will try using ``float64`` and may fail if the original data type size + in bytes is not the same. + + Examples + -------- + >>> s = pd.Series([-2, -1, 0, 1, 2], dtype='int8') + >>> s + 0 -2 + 1 -1 + 2 0 + 3 1 + 4 2 + dtype: int8 + + The 8 bit signed integer representation of `-1` is `0b11111111`, but + the same bytes represent 255 if read as an 8 bit unsigned integer: + + >>> us = s.view('uint8') + >>> us + 0 254 + 1 255 + 2 0 + 3 1 + 4 2 + dtype: uint8 + + The views share the same underlying values: + + >>> us[0] = 128 + >>> s + 0 -128 + 1 -1 + 2 0 + 3 1 + 4 2 + dtype: int8 + """ return self._constructor(self._values.view(dtype), index=self.index).__finalize__(self) @@ -1021,31 +1091,31 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): >>> s = pd.Series([1, 2, 3, 4], index=pd.Index(['a', 'b', 'c', 'd'], ... 
name = 'idx'))
        >>> s.reset_index()
-          idx  0
-        0   0  1
-        1   1  2
-        2   2  3
-        3   3  4
+          idx  0
+        0   a  1
+        1   b  2
+        2   c  3
+        3   d  4
 
        >>> arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo',
        ...                     'foo', 'qux', 'qux']),
        ...           np.array(['one', 'two', 'one', 'two', 'one', 'two',
        ...                     'one', 'two'])]
        >>> s2 = pd.Series(
-        ...     np.random.randn(8),
+        ...     range(8),
        ...     index=pd.MultiIndex.from_arrays(arrays,
        ...                                     names=['a', 'b']))
        >>> s2.reset_index(level='a')
-               a         0
+             a  0
        b
-        one  bar -0.286320
-        two  bar -0.587934
-        one  baz  0.710491
-        two  baz -1.429006
-        one  foo  0.790700
-        two  foo  0.824863
-        one  qux -0.718963
-        two  qux -0.055028
+        one  bar  0
+        two  bar  1
+        one  baz  2
+        two  baz  3
+        one  foo  4
+        two  foo  5
+        one  qux  6
+        two  qux  7
        """
        inplace = validate_bool_kwarg(inplace, 'inplace')
        if drop:
@@ -1311,8 +1381,77 @@ def unique(self):
 
        return result
 
-    @Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)
    def drop_duplicates(self, keep='first', inplace=False):
+        """
+        Return Series with duplicate values removed.
+
+        Parameters
+        ----------
+        keep : {'first', 'last', ``False``}, default 'first'
+            - 'first' : Drop duplicates except for the first occurrence.
+            - 'last' : Drop duplicates except for the last occurrence.
+            - ``False`` : Drop all duplicates.
+        inplace : boolean, default ``False``
+            If ``True``, performs operation inplace and returns None.
+
+        Returns
+        -------
+        deduplicated : Series
+
+        See Also
+        --------
+        Index.drop_duplicates : equivalent method on Index
+        DataFrame.drop_duplicates : equivalent method on DataFrame
+        Series.duplicated : related method on Series, indicating duplicate
+            Series values.
+
+        Examples
+        --------
+        Generate a Series with duplicated entries.
+
+        >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'],
+        ...               name='animal')
+        >>> s
+        0      lama
+        1       cow
+        2      lama
+        3    beetle
+        4      lama
+        5     hippo
+        Name: animal, dtype: object
+
+        With the 'keep' parameter, the selection behaviour of duplicated
+        values can be changed. The value 'first' keeps the first occurrence
+        for each set of duplicated entries. The default value of keep is
+        'first'.
+
+        >>> s.drop_duplicates()
+        0      lama
+        1       cow
+        3    beetle
+        5     hippo
+        Name: animal, dtype: object
+
+        The value 'last' for parameter 'keep' keeps the last occurrence for
+        each set of duplicated entries.
+
+        >>> s.drop_duplicates(keep='last')
+        1       cow
+        3    beetle
+        4      lama
+        5     hippo
+        Name: animal, dtype: object
+
+        The value ``False`` for parameter 'keep' discards all sets of
+        duplicated entries. Setting the value of 'inplace' to ``True``
+        performs the operation inplace and returns ``None``.
+
+        >>> s.drop_duplicates(keep=False, inplace=True)
+        >>> s
+        1       cow
+        3    beetle
+        5     hippo
+        Name: animal, dtype: object
+        """
        return super(Series, self).drop_duplicates(keep=keep, inplace=inplace)
 
    @Appender(base._shared_docs['duplicated'] % _shared_doc_kwargs)
@@ -1533,16 +1672,63 @@ def cov(self, other, min_periods=None):
 
    def diff(self, periods=1):
        """
-        1st discrete difference of object
+        First discrete difference of element.
+
+        Calculates the difference of a Series element compared with another
+        element in the Series (default is element in previous row).
 
        Parameters
        ----------
        periods : int, default 1
-            Periods to shift for forming difference
+            Periods to shift for calculating difference, accepts negative
+            values.
 
        Returns
        -------
        diffed : Series
+
+        See Also
+        --------
+        Series.pct_change: Percent change over given number of periods.
+        Series.shift: Shift index by desired number of periods with an
+            optional time freq.
+ DataFrame.diff: First discrete difference of object + + Examples + -------- + Difference with previous row + + >>> s = pd.Series([1, 1, 2, 3, 5, 8]) + >>> s.diff() + 0 NaN + 1 0.0 + 2 1.0 + 3 1.0 + 4 2.0 + 5 3.0 + dtype: float64 + + Difference with 3rd previous row + + >>> s.diff(periods=3) + 0 NaN + 1 NaN + 2 NaN + 3 2.0 + 4 4.0 + 5 6.0 + dtype: float64 + + Difference with following row + + >>> s.diff(periods=-1) + 0 0.0 + 1 -1.0 + 2 -1.0 + 3 -2.0 + 4 -3.0 + 5 NaN + dtype: float64 """ result = algorithms.diff(com._values_from_object(self), periods) return self._constructor(result, index=self.index).__finalize__(self) @@ -2691,28 +2877,54 @@ def reindex_axis(self, labels, axis=0, **kwargs): return self.reindex(index=labels, **kwargs) def memory_usage(self, index=True, deep=False): - """Memory usage of the Series + """ + Return the memory usage of the Series. + + The memory usage can optionally include the contribution of + the index and of elements of `object` dtype. Parameters ---------- - index : bool - Specifies whether to include memory usage of Series index - deep : bool - Introspect the data deeply, interrogate - `object` dtypes for system-level memory consumption + index : bool, default True + Specifies whether to include the memory usage of the Series index. + deep : bool, default False + If True, introspect the data deeply by interrogating + `object` dtypes for system-level memory consumption, and include + it in the returned value. Returns ------- - scalar bytes of memory consumed - - Notes - ----- - Memory usage does not include memory consumed by elements that - are not components of the array if deep=False + int + Bytes of memory consumed. See Also -------- - numpy.ndarray.nbytes + numpy.ndarray.nbytes : Total bytes consumed by the elements of the + array. + DataFrame.memory_usage : Bytes consumed by a DataFrame. + + Examples + -------- + + >>> s = pd.Series(range(3)) + >>> s.memory_usage() + 104 + + Not including the index gives the size of the rest of the data, which + is necessarily smaller: + + >>> s.memory_usage(index=False) + 24 + + The memory footprint of `object` values is ignored by default: + + >>> s = pd.Series(["a", "b"]) + >>> s.values + array(['a', 'b'], dtype=object) + >>> s.memory_usage() + 96 + >>> s.memory_usage(deep=True) + 212 """ v = super(Series, self).memory_usage(deep=deep) if index: @@ -2740,20 +2952,21 @@ def _take(self, indices, axis=0, convert=True, is_copy=False): def isin(self, values): """ - Return a boolean :class:`~pandas.Series` showing whether each element - in the :class:`~pandas.Series` is exactly contained in the passed - sequence of ``values``. + Check whether `values` are contained in Series. + + Return a boolean Series showing whether each element in the Series + matches an element in the passed sequence of `values` exactly. Parameters ---------- values : set or list-like The sequence of values to test. Passing in a single string will raise a ``TypeError``. Instead, turn a single string into a - ``list`` of one element. + list of one element. .. versionadded:: 0.18.1 - Support for values as a set + Support for values as a set. Returns ------- @@ -2762,31 +2975,37 @@ def isin(self, values): Raises ------ TypeError - * If ``values`` is a string + * If `values` is a string See Also -------- - pandas.DataFrame.isin + pandas.DataFrame.isin : equivalent method on DataFrame Examples -------- - >>> s = pd.Series(list('abc')) - >>> s.isin(['a', 'c', 'e']) + >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama', + ... 
'hippo'], name='animal') + >>> s.isin(['cow', 'lama']) 0 True - 1 False + 1 True 2 True - dtype: bool + 3 False + 4 True + 5 False + Name: animal, dtype: bool - Passing a single string as ``s.isin('a')`` will raise an error. Use + Passing a single string as ``s.isin('lama')`` will raise an error. Use a list of one element instead: - >>> s.isin(['a']) + >>> s.isin(['lama']) 0 True 1 False - 2 False - dtype: bool - + 2 True + 3 False + 4 True + 5 False + Name: animal, dtype: bool """ result = algorithms.isin(com._values_from_object(self), values) return self._constructor(result, index=self.index).__finalize__(self) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index d89b1d681c4783..2cefbea7220982 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -39,6 +39,10 @@ class SparseDataFrame(DataFrame): Parameters ---------- data : same types as can be passed to DataFrame or scipy.sparse.spmatrix + .. versionchanged :: 0.23.0 + If data is a dict, argument order is maintained for Python 3.6 + and later. + index : array-like, optional column : array-like, optional default_kind : {'block', 'integer'}, default 'block' @@ -138,7 +142,8 @@ def _init_dict(self, data, index, columns, dtype=None): columns = _ensure_index(columns) data = {k: v for k, v in compat.iteritems(data) if k in columns} else: - columns = Index(com._try_sort(list(data.keys()))) + keys = com._dict_keys_to_ordered_list(data) + columns = Index(keys) if index is None: index = extract_index(list(data.values())) diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index f8b98a1a400811..714cd09a27294e 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -42,6 +42,10 @@ class SparseSeries(Series): Parameters ---------- data : {array-like, Series, SparseSeries, dict} + .. versionchanged :: 0.23.0 + If data is a dict, argument order is maintained for Python 3.6 + and later. + kind : {'block', 'integer'} fill_value : float Code for missing value. Defaults depends on dtype. diff --git a/pandas/core/strings.py b/pandas/core/strings.py index b1c1ede66236cd..fac607f4621a89 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -20,6 +20,7 @@ from pandas.util._decorators import Appender import re import pandas._libs.lib as lib +import pandas._libs.ops as libops import warnings import textwrap import codecs @@ -305,7 +306,7 @@ def str_endswith(arr, pat, na=np.nan): return _na_map(f, arr, na, dtype=bool) -def str_replace(arr, pat, repl, n=-1, case=None, flags=0): +def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): r""" Replace occurrences of pattern/regex in the Series/Index with some other string. Equivalent to :meth:`str.replace` or @@ -336,25 +337,50 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0): flags : int, default 0 (no flags) - re module flags, e.g. re.IGNORECASE - Cannot be set if `pat` is a compiled regex + regex : boolean, default True + - If True, assumes the passed-in pattern is a regular expression. + - If False, treats the pattern as a literal string + - Cannot be set to False if `pat` is a compiled regex or `repl` is + a callable. + + .. versionadded:: 0.23.0 Returns ------- replaced : Series/Index of objects + Raises + ------ + ValueError + * if `regex` is False and `repl` is a callable or `pat` is a compiled + regex + * if `pat` is a compiled regex and `case` or `flags` is set + Notes ----- When `pat` is a compiled regex, all flags should be included in the - compiled regex. 
Use of `case` or `flags` with a compiled regex will - raise an error. + compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled + regex will raise an error. Examples -------- - When `repl` is a string, every `pat` is replaced as with - :meth:`str.replace`. NaN value(s) in the Series are left as is. + When `pat` is a string and `regex` is True (the default), the given `pat` + is compiled as a regex. When `repl` is a string, it replaces matching + regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are + left as is: + + >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True) + 0 bao + 1 baz + 2 NaN + dtype: object + + When `pat` is a string and `regex` is False, every `pat` is replaced with + `repl` as with :meth:`str.replace`: - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', 'b') - 0 boo - 1 buz + >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False) + 0 bao + 1 fuz 2 NaN dtype: object @@ -396,6 +422,7 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0): 1 bar 2 NaN dtype: object + """ # Check whether repl is valid (GH 13438, GH 15055) @@ -403,27 +430,33 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0): raise TypeError("repl must be a string or callable") is_compiled_re = is_re(pat) - if is_compiled_re: - if (case is not None) or (flags != 0): - raise ValueError("case and flags cannot be set" - " when pat is a compiled regex") - else: - # not a compiled regex - # set default case - if case is None: - case = True - - # add case flag, if provided - if case is False: - flags |= re.IGNORECASE - - use_re = is_compiled_re or len(pat) > 1 or flags or callable(repl) - - if use_re: - n = n if n >= 0 else 0 - regex = re.compile(pat, flags=flags) - f = lambda x: regex.sub(repl=repl, string=x, count=n) + if regex: + if is_compiled_re: + if (case is not None) or (flags != 0): + raise ValueError("case and flags cannot be set" + " when pat is a compiled regex") + else: + # not a compiled regex + # set default case + if case is None: + case = True + + # add case flag, if provided + if case is False: + flags |= re.IGNORECASE + if is_compiled_re or len(pat) > 1 or flags or callable(repl): + n = n if n >= 0 else 0 + compiled = re.compile(pat, flags=flags) + f = lambda x: compiled.sub(repl=repl, string=x, count=n) + else: + f = lambda x: x.replace(pat, repl, n) else: + if is_compiled_re: + raise ValueError("Cannot use a compiled regex as replacement " + "pattern with regex=False") + if callable(repl): + raise ValueError("Cannot use a callable replacement when " + "regex=False") f = lambda x: x.replace(pat, repl, n) return _na_map(f, arr) @@ -461,7 +494,7 @@ def rep(x, r): return compat.text_type.__mul__(x, r) repeats = np.asarray(repeats, dtype=object) - result = lib.vec_binop(com._values_from_object(arr), repeats, rep) + result = libops.vec_binop(com._values_from_object(arr), repeats, rep) return result @@ -865,23 +898,94 @@ def str_join(arr, sep): def str_findall(arr, pat, flags=0): """ - Find all occurrences of pattern or regular expression in the - Series/Index. Equivalent to :func:`re.findall`. + Find all occurrences of pattern or regular expression in the Series/Index. + + Equivalent to applying :func:`re.findall` to all the elements in the + Series/Index. Parameters ---------- pat : string - Pattern or regular expression - flags : int, default 0 (no flags) - re module flags, e.g. re.IGNORECASE + Pattern or regular expression. + flags : int, default 0 + ``re`` module flags, e.g. 
`re.IGNORECASE` (default is 0, which means + no flags). Returns ------- - matches : Series/Index of lists + Series/Index of lists of strings + All non-overlapping matches of pattern or regular expression in each + string of this Series/Index. See Also -------- - extractall : returns DataFrame with one column per capture group + count : Count occurrences of pattern or regular expression in each string + of the Series/Index. + extractall : For each string in the Series, extract groups from all matches + of regular expression and return a DataFrame with one row for each + match and one column for each group. + re.findall : The equivalent ``re`` function to all non-overlapping matches + of pattern or regular expression in string, as a list of strings. + + Examples + -------- + + >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) + + The search for the pattern 'Monkey' returns one match: + + >>> s.str.findall('Monkey') + 0 [] + 1 [Monkey] + 2 [] + dtype: object + + On the other hand, the search for the pattern 'MONKEY' doesn't return any + match: + + >>> s.str.findall('MONKEY') + 0 [] + 1 [] + 2 [] + dtype: object + + Flags can be added to the pattern or regular expression. For instance, + to find the pattern 'MONKEY' ignoring the case: + + >>> import re + >>> s.str.findall('MONKEY', flags=re.IGNORECASE) + 0 [] + 1 [Monkey] + 2 [] + dtype: object + + When the pattern matches more than one string in the Series, all matches + are returned: + + >>> s.str.findall('on') + 0 [on] + 1 [on] + 2 [] + dtype: object + + Regular expressions are supported too. For instance, the search for all the + strings ending with the word 'on' is shown next: + + >>> s.str.findall('on$') + 0 [on] + 1 [] + 2 [] + dtype: object + + If the pattern is found more than once in the same string, then a list of + multiple strings is returned: + + >>> s.str.findall('b') + 0 [] + 1 [] + 2 [b, b] + dtype: object + """ regex = re.compile(pat, flags=flags) return _na_map(regex.findall, arr) @@ -1595,9 +1699,9 @@ def match(self, pat, case=True, flags=0, na=np.nan, as_indexer=None): return self._wrap_result(result) @copy(str_replace) - def replace(self, pat, repl, n=-1, case=None, flags=0): + def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): result = str_replace(self._data, pat, repl, n=n, case=case, - flags=flags) + flags=flags, regex=regex) return self._wrap_result(result) @copy(str_repeat) diff --git a/pandas/core/window.py b/pandas/core/window.py index a3f19ef50459d0..5294cdfd5662d1 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -416,11 +416,11 @@ class Window(_Window): A ragged (meaning not-a-regular frequency), time-indexed DataFrame >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - ....: index = [pd.Timestamp('20130101 09:00:00'), - ....: pd.Timestamp('20130101 09:00:02'), - ....: pd.Timestamp('20130101 09:00:03'), - ....: pd.Timestamp('20130101 09:00:05'), - ....: pd.Timestamp('20130101 09:00:06')]) + ... index = [pd.Timestamp('20130101 09:00:00'), + ... pd.Timestamp('20130101 09:00:02'), + ... pd.Timestamp('20130101 09:00:03'), + ... pd.Timestamp('20130101 09:00:05'), + ... pd.Timestamp('20130101 09:00:06')]) >>> df B @@ -899,7 +899,35 @@ def skew(self, **kwargs): return self._apply('roll_skew', 'skew', check_minp=_require_min_periods(3), **kwargs) - _shared_docs['kurt'] = """Unbiased %(name)s kurtosis""" + _shared_docs['kurt'] = dedent(""" + Calculate unbiased %(name)s kurtosis. + + This function uses Fisher's definition of kurtosis without bias. 
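A note on the kurtosis docstring above: "Fisher's definition of kurtosis without bias" means excess kurtosis (a normal distribution scores 0.0) with the standard small-sample bias correction. The following standalone sketch (the helper name is illustrative, not part of this diff) spells out that computation and reproduces the -1.200000 value used in the rolling/expanding examples added further down:

```python
import numpy as np

def unbiased_kurtosis(x):
    # Fisher's definition: excess kurtosis, so a normal sample tends to 0.0,
    # combined with the usual bias correction for sample size n (needs n >= 4,
    # which is why the docstring requires a minimum of 4 periods).
    x = np.asarray(x, dtype=float)
    n = len(x)
    m2 = ((x - x.mean()) ** 2).mean()
    m4 = ((x - x.mean()) ** 4).mean()
    g2 = m4 / m2 ** 2 - 3.0
    return ((n + 1) * g2 + 6.0) * (n - 1) / ((n - 2) * (n - 3))

print(unbiased_kurtosis([1, 2, 3, 4]))  # -1.2, matching scipy.stats.kurtosis(..., bias=False)
```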
+ + Parameters + ---------- + **kwargs + Under Review. + + Returns + ------- + Series or DataFrame + Returned object type is determined by the caller of the %(name)s + calculation + + See Also + -------- + Series.%(name)s : Calling object with Series data + DataFrame.%(name)s : Calling object with DataFrames + Series.kurt : Equivalent method for Series + DataFrame.kurt : Equivalent method for DataFrame + scipy.stats.skew : Third moment of a probability density + scipy.stats.kurtosis : Reference SciPy method + + Notes + ----- + A minimum of 4 periods is required for the %(name)s calculation. + """) def kurt(self, **kwargs): return self._apply('roll_kurt', 'kurt', @@ -1220,8 +1248,32 @@ def var(self, ddof=1, *args, **kwargs): def skew(self, **kwargs): return super(Rolling, self).skew(**kwargs) + _agg_doc = dedent(""" + Examples + -------- + + The example below will show a rolling calculation with a window size of + four matching the equivalent function call using `scipy.stats`. + + >>> arr = [1, 2, 3, 4, 999] + >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits + >>> import scipy.stats + >>> print(fmt.format(scipy.stats.kurtosis(arr[:-1], bias=False))) + -1.200000 + >>> print(fmt.format(scipy.stats.kurtosis(arr[1:], bias=False))) + 3.999946 + >>> s = pd.Series(arr) + >>> s.rolling(4).kurt() + 0 NaN + 1 NaN + 2 NaN + 3 -1.200000 + 4 3.999946 + dtype: float64 + """) + + @Appender(_agg_doc) @Substitution(name='rolling') - @Appender(_doc_template) @Appender(_shared_docs['kurt']) def kurt(self, **kwargs): return super(Rolling, self).kurt(**kwargs) @@ -1460,8 +1512,32 @@ def var(self, ddof=1, *args, **kwargs): def skew(self, **kwargs): return super(Expanding, self).skew(**kwargs) + _agg_doc = dedent(""" + Examples + -------- + + The example below will show an expanding calculation with a window size of + four matching the equivalent function call using `scipy.stats`. + + >>> arr = [1, 2, 3, 4, 999] + >>> import scipy.stats + >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits + >>> print(fmt.format(scipy.stats.kurtosis(arr[:-1], bias=False))) + -1.200000 + >>> print(fmt.format(scipy.stats.kurtosis(arr, bias=False))) + 4.999874 + >>> s = pd.Series(arr) + >>> s.expanding(4).kurt() + 0 NaN + 1 NaN + 2 NaN + 3 -1.200000 + 4 4.999874 + dtype: float64 + """) + + @Appender(_agg_doc) @Substitution(name='expanding') - @Appender(_doc_template) @Appender(_shared_docs['kurt']) def kurt(self, **kwargs): return super(Expanding, self).kurt(**kwargs) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 78af86cc00f7fa..0f9df845117db7 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -779,35 +779,6 @@ def _pop_header_name(row, index_col): return none_fill(row[i]), row[:i] + [''] + row[i + 1:] -def _conv_value(val): - """ Convert numpy types to Python types for the Excel writers. - - Parameters - ---------- - val : object - Value to be written into cells - - Returns - ------- - If val is a numpy int, float, or bool, then the equivalent Python - types are returned. :obj:`datetime`, :obj:`date`, and :obj:`timedelta` - are passed and formatting must be handled in the writer. :obj:`str` - representation is returned for all other types. 
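One behavioral detail worth calling out before the `_value_with_fmt` replacement that follows: timedeltas are handed to the Excel engines as fractional days plus a number format. A quick illustration of the conversion in plain Python (independent of the diff itself):

```python
from datetime import timedelta

# Excel represents durations as fractional days, so the writer divides
# total seconds by 86400 before passing the value to the engine.
td = timedelta(hours=36)
days = td.total_seconds() / 86400.0  # 1.5
```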
- """ - if is_integer(val): - val = int(val) - elif is_float(val): - val = float(val) - elif is_bool(val): - val = bool(val) - elif isinstance(val, (datetime, date, timedelta)): - pass - else: - val = compat.to_str(val) - - return val - - @add_metaclass(abc.ABCMeta) class ExcelWriter(object): """ @@ -953,6 +924,39 @@ def _get_sheet_name(self, sheet_name): 'cur_sheet property') return sheet_name + def _value_with_fmt(self, val): + """Convert numpy types to Python types for the Excel writers. + + Parameters + ---------- + val : object + Value to be written into cells + + Returns + ------- + Tuple with the first element being the converted value and the second + being an optional format + """ + fmt = None + + if is_integer(val): + val = int(val) + elif is_float(val): + val = float(val) + elif is_bool(val): + val = bool(val) + elif isinstance(val, datetime): + fmt = self.datetime_format + elif isinstance(val, date): + fmt = self.date_format + elif isinstance(val, timedelta): + val = val.total_seconds() / float(86400) + fmt = '0' + else: + val = compat.to_str(val) + + return val, fmt + @classmethod def check_extension(cls, ext): """checks that path's extension against the Writer's supported @@ -1382,7 +1386,9 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, row=startrow + cell.row + 1, column=startcol + cell.col + 1 ) - xcell.value = _conv_value(cell.val) + xcell.value, fmt = self._value_with_fmt(cell.val) + if fmt: + xcell.number_format = fmt style_kwargs = {} if cell.style: @@ -1469,25 +1475,16 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, style_dict = {} for cell in cells: - val = _conv_value(cell.val) - - num_format_str = None - if isinstance(cell.val, datetime): - num_format_str = self.datetime_format - elif isinstance(cell.val, date): - num_format_str = self.date_format - elif isinstance(cell.val, timedelta): - delta = cell.val - val = delta.total_seconds() / float(86400) + val, fmt = self._value_with_fmt(cell.val) stylekey = json.dumps(cell.style) - if num_format_str: - stylekey += num_format_str + if fmt: + stylekey += fmt if stylekey in style_dict: style = style_dict[stylekey] else: - style = self._convert_to_style(cell.style, num_format_str) + style = self._convert_to_style(cell.style, fmt) style_dict[stylekey] = style if cell.mergestart is not None and cell.mergeend is not None: @@ -1745,23 +1742,17 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, wks.freeze_panes(*(freeze_panes)) for cell in cells: - val = _conv_value(cell.val) - - num_format_str = None - if isinstance(cell.val, datetime): - num_format_str = self.datetime_format - elif isinstance(cell.val, date): - num_format_str = self.date_format + val, fmt = self._value_with_fmt(cell.val) stylekey = json.dumps(cell.style) - if num_format_str: - stylekey += num_format_str + if fmt: + stylekey += fmt if stylekey in style_dict: style = style_dict[stylekey] else: style = self.book.add_format( - _XlsxStyler.convert(cell.style, num_format_str)) + _XlsxStyler.convert(cell.style, fmt)) style_dict[stylekey] = style if cell.mergestart is not None and cell.mergeend is not None: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 621641747f3761..50b4f11634b78e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1209,6 +1209,9 @@ def write_result(self, buf): frame = self.frame _classes = ['dataframe'] # Default class. 
+ use_mathjax = get_option("display.html.use_mathjax") + if not use_mathjax: + _classes.append('tex2jax_ignore') if self.classes is not None: if isinstance(self.classes, str): self.classes = self.classes.split() diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 525f487d8aa391..f876ceb8a26bfe 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -325,9 +325,19 @@ def format_attr(pair): .format(row=r, col=c)}) body.append(row_es) + table_attr = self.table_attributes + use_mathjax = get_option("display.html.use_mathjax") + if not use_mathjax: + table_attr = table_attr or '' + if 'class="' in table_attr: + table_attr = table_attr.replace('class="', + 'class="tex2jax_ignore ') + else: + table_attr += ' class="tex2jax_ignore"' + return dict(head=head, cellstyle=cellstyle, body=body, uuid=uuid, precision=precision, table_styles=table_styles, - caption=caption, table_attributes=self.table_attributes) + caption=caption, table_attributes=table_attr) def format(self, formatter, subset=None): """ diff --git a/pandas/io/html.py b/pandas/io/html.py index be4854bc19cc68..300a5a151f5d22 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -160,6 +160,14 @@ class _HtmlFrameParser(object): attrs : dict List of HTML
element attributes to match. + encoding : str + Encoding to be used by parser + + displayed_only : bool + Whether or not items with "display:none" should be ignored + + .. versionadded:: 0.23.0 + Attributes ---------- io : str or file-like @@ -172,6 +180,14 @@ class _HtmlFrameParser(object): A dictionary of valid table attributes to use to search for table elements. + encoding : str + Encoding to be used by parser + + displayed_only : bool + Whether or not items with "display:none" should be ignored + + .. versionadded:: 0.23.0 + Notes ----- To subclass this class effectively you must override the following methods: @@ -187,11 +203,12 @@ class _HtmlFrameParser(object): functionality. """ - def __init__(self, io, match, attrs, encoding): + def __init__(self, io, match, attrs, encoding, displayed_only): self.io = io self.match = match self.attrs = attrs self.encoding = encoding + self.displayed_only = displayed_only def parse_tables(self): tables = self._parse_tables(self._build_doc(), self.match, self.attrs) @@ -380,6 +397,27 @@ def _parse_raw_tbody(self, table): res = self._parse_tr(table) return self._parse_raw_data(res) + def _handle_hidden_tables(self, tbl_list, attr_name): + """Returns list of tables, potentially removing hidden elements + + Parameters + ---------- + tbl_list : list of Tag or list of Element + Type of list elements will vary depending upon parser used + attr_name : str + Name of the accessor for retrieving HTML attributes + + Returns + ------- + list of Tag or list of Element + Return type matches `tbl_list` + """ + if not self.displayed_only: + return tbl_list + + return [x for x in tbl_list if "display:none" not in + getattr(x, attr_name).get('style', '').replace(" ", "")] + class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): """HTML to DataFrame parser that uses BeautifulSoup under the hood. @@ -431,8 +469,14 @@ def _parse_tables(self, doc, match, attrs): result = [] unique_tables = set() + tables = self._handle_hidden_tables(tables, "attrs") for table in tables: + if self.displayed_only: + for elem in table.find_all( + style=re.compile(r"display:\s*none")): + elem.decompose() + if (table not in unique_tables and table.find(text=match) is not None): result.append(table) @@ -528,6 +572,17 @@ def _parse_tables(self, doc, match, kwargs): tables = doc.xpath(xpath_expr, namespaces=_re_namespace) + tables = self._handle_hidden_tables(tables, "attrib") + if self.displayed_only: + for table in tables: + # lxml utilizes XPATH 1.0 which does not have regex + # support. 
As a result, we find all elements with a style
+            # attribute and iterate them to check for display:none
+            for elem in table.xpath('.//*[@style]'):
+                if "display:none" in elem.attrib.get(
+                        "style", "").replace(" ", ""):
+                    elem.getparent().remove(elem)
+
         if not tables:
             raise ValueError("No tables found matching regex {patt!r}"
                              .format(patt=pattern))
@@ -729,7 +784,7 @@ def _validate_flavor(flavor):
     return flavor


-def _parse(flavor, io, match, attrs, encoding, **kwargs):
+def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
     flavor = _validate_flavor(flavor)
     compiled_match = re.compile(match)  # you can pass a compiled regex here

@@ -737,7 +792,7 @@
     retained = None
     for flav in flavor:
         parser = _parser_dispatch(flav)
-        p = parser(io, compiled_match, attrs, encoding)
+        p = parser(io, compiled_match, attrs, encoding, displayed_only)

         try:
             tables = p.parse_tables()
@@ -773,7 +828,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
               skiprows=None, attrs=None, parse_dates=False,
               tupleize_cols=None, thousands=',', encoding=None,
               decimal='.', converters=None, na_values=None,
-              keep_default_na=True):
+              keep_default_na=True, displayed_only=True):
     r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.

     Parameters
@@ -877,6 +932,11 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,

         .. versionadded:: 0.19.0

+    displayed_only : bool, default True
+        Whether elements with "display: none" should be parsed.
+
+        .. versionadded:: 0.23.0
+
     Returns
     -------
     dfs : list of DataFrames
@@ -924,4 +984,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
         parse_dates=parse_dates, tupleize_cols=tupleize_cols,
         thousands=thousands, attrs=attrs, encoding=encoding,
         decimal=decimal, converters=converters, na_values=na_values,
-        keep_default_na=keep_default_na)
+        keep_default_na=keep_default_na,
+        displayed_only=displayed_only)
diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py
index 24364fe07405ec..1627b2f4d3ec3f 100644
--- a/pandas/io/json/json.py
+++ b/pandas/io/json/json.py
@@ -348,7 +348,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
     to denote a missing :class:`Index` name, and the subsequent
     :func:`read_json` operation cannot distinguish between the two. The same
     limitation is encountered with a :class:`MultiIndex` and any names
-    beginning with 'level_'.
+    beginning with ``'level_'``.
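Worth pausing on the new `displayed_only` flag added to `read_html` above. A minimal usage sketch, assuming one of the supported parsers (lxml or BeautifulSoup/html5lib) is installed:

```python
import pandas as pd

html = """
<table>
  <tr><th>a</th></tr>
  <tr><td>1</td></tr>
</table>
<table style="display:none">
  <tr><th>b</th></tr>
  <tr><td>2</td></tr>
</table>
"""

len(pd.read_html(html))                        # 1: the hidden table is skipped by default
len(pd.read_html(html, displayed_only=False))  # 2: both tables are parsed
```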
See Also -------- diff --git a/pandas/io/packers.py b/pandas/io/packers.py index d3e6f0cf4a1bc5..f9b1d1574d45cd 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -55,7 +55,8 @@ from pandas import (Timestamp, Period, Series, DataFrame, # noqa Index, MultiIndex, Float64Index, Int64Index, Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT, - Categorical, CategoricalIndex) + Categorical, CategoricalIndex, IntervalIndex, Interval, + TimedeltaIndex) from pandas.core.sparse.api import SparseSeries, SparseDataFrame from pandas.core.sparse.array import BlockIndex, IntIndex from pandas.core.generic import NDFrame @@ -401,6 +402,13 @@ def encode(obj): u'freq': u_safe(getattr(obj, 'freqstr', None)), u'tz': tz, u'compress': compressor} + elif isinstance(obj, IntervalIndex): + return {u'typ': u'interval_index', + u'klass': u(obj.__class__.__name__), + u'name': getattr(obj, 'name', None), + u'left': getattr(obj, '_left', None), + u'right': getattr(obj, '_right', None), + u'closed': getattr(obj, '_closed', None)} elif isinstance(obj, MultiIndex): return {u'typ': u'multi_index', u'klass': u(obj.__class__.__name__), @@ -513,7 +521,12 @@ def encode(obj): elif isinstance(obj, Period): return {u'typ': u'period', u'ordinal': obj.ordinal, - u'freq': u(obj.freq)} + u'freq': u_safe(obj.freqstr)} + elif isinstance(obj, Interval): + return {u'typ': u'interval', + u'left': obj.left, + u'right': obj.right, + u'closed': obj.closed} elif isinstance(obj, BlockIndex): return {u'typ': u'block_index', u'klass': u(obj.__class__.__name__), @@ -597,12 +610,19 @@ def decode(obj): result = result.tz_localize('UTC').tz_convert(tz) return result + elif typ == u'interval_index': + return globals()[obj[u'klass']].from_arrays(obj[u'left'], + obj[u'right'], + obj[u'closed'], + name=obj[u'name']) elif typ == u'category': from_codes = globals()[obj[u'klass']].from_codes return from_codes(codes=obj[u'codes'], categories=obj[u'categories'], ordered=obj[u'ordered']) + elif typ == u'interval': + return Interval(obj[u'left'], obj[u'right'], obj[u'closed']) elif typ == u'series': dtype = dtype_for(obj[u'dtype']) pd_dtype = pandas_dtype(dtype) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4b1385514a0c4e..469cd6d82e4b42 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -43,6 +43,7 @@ import pandas._libs.lib as lib import pandas._libs.parsers as parsers +import pandas._libs.ops as libops from pandas._libs.tslibs import parsing # BOM character (byte order mark) @@ -1616,9 +1617,9 @@ def _infer_types(self, values, na_values, try_num_bool=True): na_count = parsers.sanitize_objects(values, na_values, False) if result.dtype == np.object_ and try_num_bool: - result = lib.maybe_convert_bool(values, - true_values=self.true_values, - false_values=self.false_values) + result = libops.maybe_convert_bool(values, + true_values=self.true_values, + false_values=self.false_values) return result, na_count diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 437e279e909790..ccb8d2d99d7347 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -404,7 +404,7 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True, Parameters ---------- - frame : DataFrame + frame : DataFrame, Series name : string Name of SQL table. 
con : SQLAlchemy connectable(engine/connection) or database string URI
@@ -572,8 +572,29 @@ def create(self):
         else:
             self._execute_create()

-    def insert_statement(self):
-        return self.table.insert()
+    def insert_statement(self, data, conn):
+        """
+        Generate tuple of SQLAlchemy insert statement and any arguments
+        to be executed by connection (via `_execute_insert`).
+
+        Parameters
+        ----------
+        conn : SQLAlchemy connectable(engine/connection)
+            Connection to receive the data
+        data : list of dict
+            The data to be inserted
+
+        Returns
+        -------
+        SQLAlchemy statement
+            insert statement
+        *, optional
+            Additional parameters to be passed when executing insert statement
+        """
+        dialect = getattr(conn, 'dialect', None)
+        if dialect and getattr(dialect, 'supports_multivalues_insert', False):
+            return self.table.insert(data),
+        return self.table.insert(), data

     def insert_data(self):
         if self.index is not None:
@@ -612,8 +633,9 @@ def insert_data(self):
         return column_names, data_list

     def _execute_insert(self, conn, keys, data_iter):
+        """Insert data into this table with database connection"""
         data = [{k: v for k, v in zip(keys, row)} for row in data_iter]
-        conn.execute(self.insert_statement(), data)
+        conn.execute(*self.insert_statement(data, conn))

     def insert(self, chunksize=None):
         keys, data_list = self.insert_data()
diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
index b15c5271ae3215..c22741479a837c 100644
--- a/pandas/plotting/_core.py
+++ b/pandas/plotting/_core.py
@@ -1380,6 +1380,51 @@ def orientation(self):
         return 'vertical'


+_kde_docstring = """
+    Generate Kernel Density Estimate plot using Gaussian kernels.
+
+    In statistics, `kernel density estimation`_ (KDE) is a non-parametric
+    way to estimate the probability density function (PDF) of a random
+    variable. This function uses Gaussian kernels and includes automatic
+    bandwidth determination.
+
+    .. _kernel density estimation:
+        https://en.wikipedia.org/wiki/Kernel_density_estimation
+
+    Parameters
+    ----------
+    bw_method : str, scalar or callable, optional
+        The method used to calculate the estimator bandwidth. This can be
+        'scott', 'silverman', a scalar constant or a callable.
+        If None (default), 'scott' is used.
+        See :class:`scipy.stats.gaussian_kde` for more information.
+    ind : NumPy array or integer, optional
+        Evaluation points for the estimated PDF. If None (default),
+        1000 equally spaced points are used. If `ind` is a NumPy array, the
+        KDE is evaluated at the points passed. If `ind` is an integer,
+        `ind` number of equally spaced points are used.
+    **kwds : optional
+        Additional keyword arguments are documented in
+        :meth:`pandas.%(this-datatype)s.plot`.
+
+    Returns
+    -------
+    axes : matplotlib.axes.Axes or numpy.ndarray of them
+
+    See Also
+    --------
+    scipy.stats.gaussian_kde : Representation of a kernel-density
+        estimate using Gaussian kernels. This is the function used
+        internally to estimate the PDF.
+    %(sibling-datatype)s.plot.kde : Generate a KDE plot for a
+        %(sibling-datatype)s.
+
+    Examples
+    --------
+    %(examples)s
+    """


 class KdePlot(HistPlot):
     _kind = 'kde'
     orientation = 'vertical'
@@ -1873,7 +1918,7 @@ def _plot(data, x=None, y=None, subplots=False,

     Returns
     -------
-    axes : matplotlib.AxesSubplot or np.array of them
+    axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them

     Notes
     -----
@@ -2532,11 +2577,21 @@ def line(self, **kwds):

         Parameters
         ----------
         `**kwds` : optional
-            Keyword arguments to pass on to :py:meth:`pandas.Series.plot`.
+ Additional keyword arguments are documented in + :meth:`pandas.Series.plot`. Returns ------- - axes : matplotlib.AxesSubplot or np.array of them + axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them + + Examples + -------- + + .. plot:: + :context: close-figs + + >>> s = pd.Series([1, 3, 2]) + >>> s.plot.line() """ return self(kind='line', **kwds) @@ -2547,11 +2602,12 @@ def bar(self, **kwds): Parameters ---------- `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.Series.plot`. + Additional keyword arguments are documented in + :meth:`pandas.Series.plot`. Returns ------- - axes : matplotlib.AxesSubplot or np.array of them + axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them """ return self(kind='bar', **kwds) @@ -2562,11 +2618,12 @@ def barh(self, **kwds): Parameters ---------- `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.Series.plot`. + Additional keyword arguments are documented in + :meth:`pandas.Series.plot`. Returns ------- - axes : matplotlib.AxesSubplot or np.array of them + axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them """ return self(kind='barh', **kwds) @@ -2577,11 +2634,12 @@ def box(self, **kwds): Parameters ---------- `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.Series.plot`. + Additional keyword arguments are documented in + :meth:`pandas.Series.plot`. Returns ------- - axes : matplotlib.AxesSubplot or np.array of them + axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them """ return self(kind='box', **kwds) @@ -2594,37 +2652,54 @@ def hist(self, bins=10, **kwds): bins: integer, default 10 Number of histogram bins to be used `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.Series.plot`. + Additional keyword arguments are documented in + :meth:`pandas.Series.plot`. Returns ------- - axes : matplotlib.AxesSubplot or np.array of them + axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them """ return self(kind='hist', bins=bins, **kwds) - def kde(self, bw_method=None, ind=None, **kwds): - """ - Kernel Density Estimate plot + @Appender(_kde_docstring % { + 'this-datatype': 'Series', + 'sibling-datatype': 'DataFrame', + 'examples': """ + Given a Series of points randomly sampled from an unknown + distribution, estimate its PDF using KDE with automatic + bandwidth determination and plot the results, evaluating them at + 1000 equally spaced points (default): - Parameters - ---------- - bw_method: str, scalar or callable, optional - The method used to calculate the estimator bandwidth. This can be - 'scott', 'silverman', a scalar constant or a callable. - If None (default), 'scott' is used. - See :class:`scipy.stats.gaussian_kde` for more information. - ind : NumPy array or integer, optional - Evaluation points. If None (default), 1000 equally spaced points - are used. If `ind` is a NumPy array, the kde is evaluated at the - points passed. If `ind` is an integer, `ind` number of equally - spaced points are used. - `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.Series.plot`. + .. plot:: + :context: close-figs - Returns - ------- - axes : matplotlib.AxesSubplot or np.array of them - """ + >>> s = pd.Series([1, 2, 2.5, 3, 3.5, 4, 5]) + >>> ax = s.plot.kde() + + A scalar bandwidth can be specified. Using a small bandwidth value can + lead to overfitting, while using a large bandwidth value may result + in underfitting: + + .. plot:: + :context: close-figs + + >>> ax = s.plot.kde(bw_method=0.3) + + .. 
plot:: + :context: close-figs + + >>> ax = s.plot.kde(bw_method=3) + + Finally, the `ind` parameter determines the evaluation points for the + plot of the estimated PDF: + + .. plot:: + :context: close-figs + + >>> ax = s.plot.kde(ind=[1, 2, 3, 4, 5]) + """.strip() + }) + def kde(self, bw_method=None, ind=None, **kwds): return self(kind='kde', bw_method=bw_method, ind=ind, **kwds) density = kde @@ -2636,11 +2711,12 @@ def area(self, **kwds): Parameters ---------- `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.Series.plot`. + Additional keyword arguments are documented in + :meth:`pandas.Series.plot`. Returns ------- - axes : matplotlib.AxesSubplot or np.array of them + axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them """ return self(kind='area', **kwds) @@ -2651,11 +2727,12 @@ def pie(self, **kwds): Parameters ---------- `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.Series.plot`. + Additional keyword arguments are documented in + :meth:`pandas.Series.plot`. Returns ------- - axes : matplotlib.AxesSubplot or np.array of them + axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them """ return self(kind='pie', **kwds) @@ -2702,11 +2779,12 @@ def line(self, x=None, y=None, **kwds): x, y : label or position, optional Coordinates for each point. `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`. + Additional keyword arguments are documented in + :meth:`pandas.DataFrame.plot`. Returns ------- - axes : matplotlib.AxesSubplot or np.array of them + axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them """ return self(kind='line', x=x, y=y, **kwds) @@ -2719,28 +2797,92 @@ def bar(self, x=None, y=None, **kwds): x, y : label or position, optional Coordinates for each point. `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`. + Additional keyword arguments are documented in + :meth:`pandas.DataFrame.plot`. Returns ------- - axes : matplotlib.AxesSubplot or np.array of them + axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them """ return self(kind='bar', x=x, y=y, **kwds) def barh(self, x=None, y=None, **kwds): """ - Horizontal bar plot + Make a horizontal bar plot. + + A horizontal bar plot is a plot that presents quantitative data with + rectangular bars with lengths proportional to the values that they + represent. A bar plot shows comparisons among discrete categories. One + axis of the plot shows the specific categories being compared, and the + other axis represents a measured value. Parameters ---------- - x, y : label or position, optional - Coordinates for each point. - `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`. + x : label or position, default DataFrame.index + Column to be used for categories. + y : label or position, default All numeric columns in dataframe + Columns to be plotted from the DataFrame. + **kwds + Keyword arguments to pass on to :meth:`pandas.DataFrame.plot`. Returns ------- - axes : matplotlib.AxesSubplot or np.array of them + axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them. + + See Also + -------- + pandas.DataFrame.plot.bar: Vertical bar plot. + pandas.DataFrame.plot : Make plots of DataFrame using matplotlib. + matplotlib.axes.Axes.bar : Plot a vertical bar plot using matplotlib. + + Examples + -------- + Basic example + + .. 
plot:: + :context: close-figs + + >>> df = pd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]}) + >>> ax = df.plot.barh(x='lab', y='val') + + Plot a whole DataFrame to a horizontal bar plot + + .. plot:: + :context: close-figs + + >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88] + >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28] + >>> index = ['snail', 'pig', 'elephant', + ... 'rabbit', 'giraffe', 'coyote', 'horse'] + >>> df = pd.DataFrame({'speed': speed, + ... 'lifespan': lifespan}, index=index) + >>> ax = df.plot.barh() + + Plot a column of the DataFrame to a horizontal bar plot + + .. plot:: + :context: close-figs + + >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88] + >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28] + >>> index = ['snail', 'pig', 'elephant', + ... 'rabbit', 'giraffe', 'coyote', 'horse'] + >>> df = pd.DataFrame({'speed': speed, + ... 'lifespan': lifespan}, index=index) + >>> ax = df.plot.barh(y='speed') + + Plot DataFrame versus the desired column + + .. plot:: + :context: close-figs + + >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88] + >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28] + >>> index = ['snail', 'pig', 'elephant', + ... 'rabbit', 'giraffe', 'coyote', 'horse'] + >>> df = pd.DataFrame({'speed': speed, + ... 'lifespan': lifespan}, index=index) + >>> ax = df.plot.barh(x='lifespan') """ return self(kind='barh', x=x, y=y, **kwds) @@ -2753,11 +2895,12 @@ def box(self, by=None, **kwds): by : string or sequence Column in the DataFrame to group by. `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`. + Additional keyword arguments are documented in + :meth:`pandas.DataFrame.plot`. Returns ------- - axes : matplotlib.AxesSubplot or np.array of them + axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them """ return self(kind='box', by=by, **kwds) @@ -2772,37 +2915,57 @@ def hist(self, by=None, bins=10, **kwds): bins: integer, default 10 Number of histogram bins to be used `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`. + Additional keyword arguments are documented in + :meth:`pandas.DataFrame.plot`. Returns ------- - axes : matplotlib.AxesSubplot or np.array of them + axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them """ return self(kind='hist', by=by, bins=bins, **kwds) - def kde(self, bw_method=None, ind=None, **kwds): - """ - Kernel Density Estimate plot + @Appender(_kde_docstring % { + 'this-datatype': 'DataFrame', + 'sibling-datatype': 'Series', + 'examples': """ + Given several Series of points randomly sampled from unknown + distributions, estimate their PDFs using KDE with automatic + bandwidth determination and plot the results, evaluating them at + 1000 equally spaced points (default): - Parameters - ---------- - bw_method: str, scalar or callable, optional - The method used to calculate the estimator bandwidth. This can be - 'scott', 'silverman', a scalar constant or a callable. - If None (default), 'scott' is used. - See :class:`scipy.stats.gaussian_kde` for more information. - ind : NumPy array or integer, optional - Evaluation points. If None (default), 1000 equally spaced points - are used. If `ind` is a NumPy array, the kde is evaluated at the - points passed. If `ind` is an integer, `ind` number of equally - spaced points are used. - `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`. + .. plot:: + :context: close-figs - Returns - ------- - axes : matplotlib.AxesSubplot or np.array of them - """ + >>> df = pd.DataFrame({ + ... 
'x': [1, 2, 2.5, 3, 3.5, 4, 5],
+        ...     'y': [4, 4, 4.5, 5, 5.5, 6, 6],
+        ... })
+        >>> ax = df.plot.kde()
+
+        A scalar bandwidth can be specified. Using a small bandwidth value can
+        lead to overfitting, while using a large bandwidth value may result
+        in underfitting:
+
+        .. plot::
+            :context: close-figs
+
+            >>> ax = df.plot.kde(bw_method=0.3)
+
+        .. plot::
+            :context: close-figs
+
+            >>> ax = df.plot.kde(bw_method=3)
+
+        Finally, the `ind` parameter determines the evaluation points for the
+        plot of the estimated PDF:
+
+        .. plot::
+            :context: close-figs
+
+            >>> ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6])
+        """.strip()
+    })
+    def kde(self, bw_method=None, ind=None, **kwds):
         return self(kind='kde', bw_method=bw_method, ind=ind, **kwds)

     density = kde

@@ -2816,11 +2979,12 @@ def area(self, x=None, y=None, **kwds):
         x, y : label or position, optional
             Coordinates for each point.
         `**kwds` : optional
-            Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`.
+            Additional keyword arguments are documented in
+            :meth:`pandas.DataFrame.plot`.

         Returns
         -------
-        axes : matplotlib.AxesSubplot or np.array of them
+        axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them
         """
         return self(kind='area', x=x, y=y, **kwds)

@@ -2833,11 +2997,12 @@ def pie(self, y=None, **kwds):
         y : label or position, optional
             Column to plot.
         `**kwds` : optional
-            Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`.
+            Additional keyword arguments are documented in
+            :meth:`pandas.DataFrame.plot`.

         Returns
         -------
-        axes : matplotlib.AxesSubplot or np.array of them
+        axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them
         """
         return self(kind='pie', y=y, **kwds)

@@ -2854,11 +3019,12 @@ def scatter(self, x, y, s=None, c=None, **kwds):
         c : label or position, optional
             Color of each point.
         `**kwds` : optional
-            Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`.
+            Additional keyword arguments are documented in
+            :meth:`pandas.DataFrame.plot`.

         Returns
         -------
-        axes : matplotlib.AxesSubplot or np.array of them
+        axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them
         """
         return self(kind='scatter', x=x, y=y, c=c, s=s, **kwds)

@@ -2879,11 +3045,12 @@ def hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None,
         gridsize : int, optional
             Number of bins.
         `**kwds` : optional
-            Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`.
+            Additional keyword arguments are documented in
+            :meth:`pandas.DataFrame.plot`.

         Returns
         -------
-        axes : matplotlib.AxesSubplot or np.array of them
+        axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them
         """
         if reduce_C_function is not None:
             kwds['reduce_C_function'] = reduce_C_function
diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py
index 45594e9c6ea956..150c9274d4e5cf 100644
--- a/pandas/plotting/_misc.py
+++ b/pandas/plotting/_misc.py
@@ -147,25 +147,66 @@ def _get_marker_compat(marker):


 def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds):
-    """RadViz - a multivariate data visualization algorithm
+    """
+    Plot a multidimensional dataset in 2D.
+
+    Each Series in the DataFrame is represented as an evenly distributed
+    slice on a circle. Each data point is rendered in the circle according to
+    its value on each Series. Highly correlated `Series` in the `DataFrame`
+    are placed closer on the unit circle.
+
+    RadViz allows projecting an N-dimensional data set into a 2D space, where
+    the influence of each dimension can be interpreted as a balance between
+    the influence of all dimensions.
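To make the geometry concrete: under the usual RadViz formulation (which matches the description above), each column gets an anchor point evenly spaced on the unit circle, and each min-max-normalized row lands at the weighted average of those anchors. A rough sketch of that projection; the helper is illustrative, not the code this diff touches:

```python
import numpy as np

def radviz_point(row):
    # Anchor column j at angle 2*pi*j/p on the unit circle, then place the
    # row at the weighted average of the anchors, using the row's
    # (min-max normalized) values as the weights.
    w = np.asarray(row, dtype=float)
    theta = 2 * np.pi * np.arange(len(w)) / len(w)
    anchors = np.column_stack([np.cos(theta), np.sin(theta)])
    return (w[:, None] * anchors).sum(axis=0) / w.sum()

radviz_point([1.0, 0.0, 0.0])  # array([1., 0.]): pulled fully to the first anchor
```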
+
+    More info available at the `original article
+    `_ describing RadViz.

     Parameters
     ----------
-    frame: DataFrame
-    class_column: str
-        Column name containing class names
-    ax: Matplotlib axis object, optional
-    color: list or tuple, optional
-        Colors to use for the different classes
-    colormap : str or matplotlib colormap object, default None
-        Colormap to select colors from. If string, load colormap with that name
-        from matplotlib.
-    kwds: keywords
-        Options to pass to matplotlib scatter plotting method
+    frame : `DataFrame`
+        Pandas object holding the data.
+    class_column : str
+        Column name containing the name of the data point category.
+    ax : :class:`matplotlib.axes.Axes`, optional
+        A plot instance to which to add the information.
+    color : list[str] or tuple[str], optional
+        Assign a color to each category. Example: ['blue', 'green'].
+    colormap : str or :class:`matplotlib.colors.Colormap`, default None
+        Colormap to select colors from. If string, load colormap with that
+        name from matplotlib.
+    kwds : optional
+        Options to pass to matplotlib scatter plotting method.

     Returns
     -------
-    ax: Matplotlib axis object
+    axes : :class:`matplotlib.axes.Axes`
+
+    See Also
+    --------
+    pandas.plotting.andrews_curves : Plot clustering visualization
+
+    Examples
+    --------
+    .. plot::
+        :context: close-figs
+
+        >>> df = pd.DataFrame({
+        ...     'SepalLength': [6.5, 7.7, 5.1, 5.8, 7.6, 5.0, 5.4, 4.6,
+        ...                     6.7, 4.6],
+        ...     'SepalWidth': [3.0, 3.8, 3.8, 2.7, 3.0, 2.3, 3.0, 3.2,
+        ...                    3.3, 3.6],
+        ...     'PetalLength': [5.5, 6.7, 1.9, 5.1, 6.6, 3.3, 4.5, 1.4,
+        ...                     5.7, 1.0],
+        ...     'PetalWidth': [1.8, 2.2, 0.4, 1.9, 2.1, 1.0, 1.5, 0.2,
+        ...                    2.1, 0.2],
+        ...     'Category': ['virginica', 'virginica', 'setosa',
+        ...                  'virginica', 'virginica', 'versicolor',
+        ...                  'versicolor', 'setosa', 'virginica',
+        ...                  'setosa']
+        ... })
+        >>> rad_viz = pd.plotting.radviz(df, 'Category')
     """
     import matplotlib.pyplot as plt
     import matplotlib.patches as patches
@@ -323,20 +364,51 @@ def f(t):


 def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds):
-    """Bootstrap plot.
+    """
+    Bootstrap plot on mean, median and mid-range statistics.
+
+    The bootstrap plot is used to estimate the uncertainty of a statistic
+    by relying on random sampling with replacement [1]_. This function will
+    generate bootstrapping plots for mean, median and mid-range statistics
+    for the given number of samples of the given size.
+
+    .. [1] "Bootstrapping (statistics)" in \
+    https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29

     Parameters
     ----------
-    series: Time series
-    fig: matplotlib figure object, optional
-    size: number of data points to consider during each sampling
-    samples: number of times the bootstrap procedure is performed
-    kwds: optional keyword arguments for plotting commands, must be accepted
-        by both hist and plot
+    series : pandas.Series
+        Pandas Series from where to get the samplings for the bootstrapping.
+    fig : matplotlib.figure.Figure, default None
+        If given, it will use the `fig` reference for plotting instead of
+        creating a new one with default parameters.
+    size : int, default 50
+        Number of data points to consider during each sampling. It must be
+        less than or equal to the length of the `series`.
+    samples : int, default 500
+        Number of times the bootstrap procedure is performed.
+    **kwds :
+        Options to pass to matplotlib plotting method.
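Before the Returns section below: the "random sampling with replacement" this docstring describes boils down to the following per replicate. A hedged NumPy sketch of one bootstrap draw and the three statistics the plot tracks; names are illustrative, not the function's internals:

```python
import numpy as np

rng = np.random.RandomState(0)
series = rng.uniform(size=100)

# One bootstrap replicate: draw `size` points with replacement, then
# compute the statistics the plot tracks (mean, median, mid-range).
sample = rng.choice(series, size=50, replace=True)
stats = {
    'mean': sample.mean(),
    'median': np.median(sample),
    'midrange': (sample.max() + sample.min()) / 2,
}
```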
Returns
     -------
-    fig: matplotlib figure
+    fig : matplotlib.figure.Figure
+        Matplotlib figure.
+
+    See Also
+    --------
+    pandas.DataFrame.plot : Basic plotting for DataFrame objects.
+    pandas.Series.plot : Basic plotting for Series objects.
+
+    Examples
+    --------
+
+    .. plot::
+        :context: close-figs
+
+        >>> import numpy as np
+        >>> s = pd.Series(np.random.uniform(size=100))
+        >>> fig = pd.plotting.bootstrap_plot(s)
     """
     import random
     import matplotlib.pyplot as plt
diff --git a/pandas/plotting/_timeseries.py b/pandas/plotting/_timeseries.py
index 56b5311326e986..21a03ea3885662 100644
--- a/pandas/plotting/_timeseries.py
+++ b/pandas/plotting/_timeseries.py
@@ -35,7 +35,15 @@ def tsplot(series, plotf, ax=None, **kwargs):
     _____
     Supports same kwargs as Axes.plot
+
+    .. deprecated:: 0.23.0
+       Use Series.plot() instead
     """
+    import warnings
+    warnings.warn("'tsplot' is deprecated and will be removed in a "
+                  "future version. Please use Series.plot() instead.",
+                  FutureWarning, stacklevel=2)
+
     # Used inferred freq is possible, need a test case for inferred
     if ax is None:
         import matplotlib.pyplot as plt
diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py
index 2d5d747aec5a7f..4ac04d71338fdf 100644
--- a/pandas/tests/extension/base/constructors.py
+++ b/pandas/tests/extension/base/constructors.py
@@ -9,6 +9,11 @@

 class BaseConstructorsTests(BaseExtensionTests):

+    def test_array_from_scalars(self, data):
+        scalars = [data[0], data[1], data[2]]
+        result = data._constructor_from_sequence(scalars)
+        assert isinstance(result, type(data))
+
     def test_series_constructor(self, data):
         result = pd.Series(data)
         assert result.dtype == data.dtype
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
index 8b2eaadeca99ee..736556e4be20df 100644
--- a/pandas/tests/extension/decimal/array.py
+++ b/pandas/tests/extension/decimal/array.py
@@ -32,6 +32,10 @@ def __init__(self, values):

         self.values = values

+    @classmethod
+    def _constructor_from_sequence(cls, scalars):
+        return cls(scalars)
+
     def __getitem__(self, item):
         if isinstance(item, numbers.Integral):
             return self.values[item]
diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
index 90aac93c68f64c..21addf9d1549f9 100644
--- a/pandas/tests/extension/json/array.py
+++ b/pandas/tests/extension/json/array.py
@@ -33,11 +33,17 @@ def __init__(self, values):
             raise TypeError
         self.data = values

+    @classmethod
+    def _constructor_from_sequence(cls, scalars):
+        return cls(scalars)
+
     def __getitem__(self, item):
         if isinstance(item, numbers.Integral):
             return self.data[item]
         elif isinstance(item, np.ndarray) and item.dtype == 'bool':
-            return type(self)([x for x, m in zip(self, item) if m])
+            return self._constructor_from_sequence([
+                x for x, m in zip(self, item) if m
+            ])
         else:
             return type(self)(self.data[item])

@@ -77,7 +83,7 @@ def isna(self):
     def take(self, indexer, allow_fill=True, fill_value=None):
         output = [self.data[loc] if loc != -1 else self._na_value
                   for loc in indexer]
-        return type(self)(output)
+        return self._constructor_from_sequence(output)

     def copy(self, deep=False):
         return type(self)(self.data[:])
diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py
index c824f0026af503..3e0ba26c20eb06 100644
--- a/pandas/tests/frame/test_alter_axes.py
+++
b/pandas/tests/frame/test_alter_axes.py @@ -301,7 +301,7 @@ def test_set_index_timezone(self): def test_set_index_dst(self): di = pd.date_range('2006-10-29 00:00:00', periods=3, - req='H', tz='US/Pacific') + freq='H', tz='US/Pacific') df = pd.DataFrame(data={'a': [0, 1, 2], 'b': [3, 4, 5]}, index=di).reset_index() diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index de4a132e0d613e..59a30fc69905f6 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -15,7 +15,8 @@ from pandas.compat import lrange, product from pandas import (compat, isna, notna, DataFrame, Series, - MultiIndex, date_range, Timestamp, Categorical) + MultiIndex, date_range, Timestamp, Categorical, + _np_version_under1p15) import pandas as pd import pandas.core.nanops as nanops import pandas.core.algorithms as algorithms @@ -2057,6 +2058,9 @@ def test_clip_against_list_like(self, inplace, lower, axis, res): result = original tm.assert_frame_equal(result, expected, check_exact=True) + @pytest.mark.xfail( + not _np_version_under1p15, + reason="failing under numpy-dev gh-19976") @pytest.mark.parametrize("axis", [0, 1, None]) def test_clip_against_frame(self, axis): df = DataFrame(np.random.randn(1000, 2)) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index e0b94815878dde..499751e8643318 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -15,7 +15,7 @@ from pandas.core.dtypes.common import is_integer_dtype from pandas.compat import (lmap, long, zip, range, lrange, lzip, - OrderedDict, is_platform_little_endian) + OrderedDict, is_platform_little_endian, PY36) from pandas import compat from pandas import (DataFrame, Index, Series, isna, MultiIndex, Timedelta, Timestamp, @@ -290,6 +290,24 @@ def test_constructor_dict(self): with tm.assert_raises_regex(ValueError, msg): DataFrame({'a': 0.7}, columns=['b']) + @pytest.mark.skipif(not PY36, reason='Insertion order for Python>=3.6') + def test_constructor_dict_order_insertion(self): + # GH19018 + # initialization ordering: by insertion order if python>= 3.6 + d = {'b': self.ts2, 'a': self.ts1} + frame = DataFrame(data=d) + expected = DataFrame(data=d, columns=list('ba')) + tm.assert_frame_equal(frame, expected) + + @pytest.mark.skipif(PY36, reason='order by value for Python<3.6') + def test_constructor_dict_order_by_values(self): + # GH19018 + # initialization ordering: by value if python<3.6 + d = {'b': self.ts2, 'a': self.ts1} + frame = DataFrame(data=d) + expected = DataFrame(data=d, columns=list('ab')) + tm.assert_frame_equal(frame, expected) + def test_constructor_multi_index(self): # GH 4078 # construction error with mi and all-nan frame diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index e9e5b2a447a4aa..90daa9aa882c8b 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -8,11 +8,11 @@ import numpy as np from pandas import (DataFrame, Series, date_range, Timedelta, Timestamp, - compat, concat, option_context) + Categorical, compat, concat, option_context) from pandas.compat import u from pandas import _np_version_under1p14 -from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.dtypes import DatetimeTZDtype, CategoricalDtype from pandas.tests.frame.common import TestData from pandas.util.testing import (assert_series_equal, assert_frame_equal, @@ -619,12 +619,21 @@ def test_astype_duplicate_col(self): 
expected = concat([a1_str, b, a2_str], axis=1) assert_frame_equal(result, expected) - @pytest.mark.parametrize('columns', [['x'], ['x', 'y'], ['x', 'y', 'z']]) - def test_categorical_astype_ndim_raises(self, columns): - # GH 18004 - msg = '> 1 ndim Categorical are not supported at this time' - with tm.assert_raises_regex(NotImplementedError, msg): - DataFrame(columns=columns).astype('category') + @pytest.mark.parametrize('dtype', [ + 'category', + CategoricalDtype(), + CategoricalDtype(ordered=True), + CategoricalDtype(ordered=False), + CategoricalDtype(categories=list('abcdef')), + CategoricalDtype(categories=list('edba'), ordered=False), + CategoricalDtype(categories=list('edcb'), ordered=True)], ids=repr) + def test_astype_categorical(self, dtype): + # GH 18099 + d = {'A': list('abbc'), 'B': list('bccd'), 'C': list('cdde')} + df = DataFrame(d) + result = df.astype(dtype) + expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d}) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("cls", [ pd.api.types.CategoricalDtype, @@ -640,6 +649,15 @@ def test_astype_categoricaldtype_class_raises(self, cls): with tm.assert_raises_regex(TypeError, xpr): df['A'].astype(cls) + @pytest.mark.parametrize('dtype', [ + {100: 'float64', 200: 'uint64'}, 'category', 'float64']) + def test_astype_column_metadata(self, dtype): + # GH 19920 + columns = pd.UInt64Index([100, 200, 300], name='foo') + df = DataFrame(np.arange(15).reshape(5, 3), columns=columns) + df = df.astype(dtype) + tm.assert_index_equal(df.columns, columns) + @pytest.mark.parametrize("dtype", ["M8", "m8"]) @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) def test_astype_from_datetimelike_to_objectt(self, dtype, unit): diff --git a/pandas/tests/frame/test_rank.py b/pandas/tests/frame/test_rank.py index 02fe0edf955777..b8ba408b547158 100644 --- a/pandas/tests/frame/test_rank.py +++ b/pandas/tests/frame/test_rank.py @@ -1,16 +1,16 @@ # -*- coding: utf-8 -*- import pytest -from datetime import timedelta, datetime -from distutils.version import LooseVersion -from numpy import nan import numpy as np +import pandas.util.testing as tm -from pandas import Series, DataFrame +from distutils.version import LooseVersion +from datetime import timedelta, datetime +from numpy import nan -from pandas.compat import product from pandas.util.testing import assert_frame_equal -import pandas.util.testing as tm from pandas.tests.frame.common import TestData +from pandas import Series, DataFrame +from pandas.compat import product class TestRank(TestData): @@ -266,3 +266,34 @@ def _check2d(df, expected, method='average', axis=0): continue frame = df if dtype is None else df.astype(dtype) _check2d(frame, results[method], method=method, axis=axis) + + +@pytest.mark.parametrize( + "method,exp", [("dense", + [[1., 1., 1.], + [1., 0.5, 2. / 3], + [1., 0.5, 1. / 3]]), + ("min", + [[1. / 3, 1., 1.], + [1. / 3, 1. / 3, 2. / 3], + [1. / 3, 1. / 3, 1. / 3]]), + ("max", + [[1., 1., 1.], + [1., 2. / 3, 2. / 3], + [1., 2. / 3, 1. / 3]]), + ("average", + [[2. / 3, 1., 1.], + [2. / 3, 0.5, 2. / 3], + [2. / 3, 0.5, 1. / 3]]), + ("first", + [[1. / 3, 1., 1.], + [2. / 3, 1. / 3, 2. / 3], + [3. / 3, 2. / 3, 1. / 3]])]) +def test_rank_pct_true(method, exp): + # see gh-15630. 
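The parametrized `test_astype_categorical` above covers frame-wide conversion to a shared `CategoricalDtype` (GH 18099), which previously raised `NotImplementedError`. A short sketch of the call it exercises, assuming the patched build:

```python
import pandas as pd
from pandas.api.types import CategoricalDtype

df = pd.DataFrame({'A': list('abbc'), 'B': list('bccd')})

# Every column is converted with the same (ordered) category set;
# values missing from `categories` would become NaN.
dtype = CategoricalDtype(categories=list('abcd'), ordered=True)
result = df.astype(dtype)
print(result.dtypes)            # category for both columns
print(result['A'].cat.ordered)  # True
```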
+ + df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]]) + result = df.rank(method=method, pct=True) + + expected = DataFrame(exp) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index fbc4accd0e41eb..dd83a94b7062a4 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -33,9 +33,6 @@ def test_replace_inplace(self): tsframe.replace(nan, 0, inplace=True) assert_frame_equal(tsframe, self.tsframe.fillna(0)) - pytest.raises(TypeError, self.tsframe.replace, nan, inplace=True) - pytest.raises(TypeError, self.tsframe.replace, nan) - # mixed type mf = self.mixed_frame mf.iloc[5:20, mf.columns.get_loc('foo')] = nan @@ -720,7 +717,6 @@ def test_replace_simple_nested_dict_with_nonexistent_value(self): assert_frame_equal(expected, result) def test_replace_value_is_none(self): - pytest.raises(TypeError, self.tsframe.replace, nan) orig_value = self.tsframe.iloc[0, 0] orig2 = self.tsframe.iloc[1, 0] @@ -1072,3 +1068,36 @@ def test_replace_with_empty_dictlike(self): assert_frame_equal(df, df.replace({'b': {}})) assert_frame_equal(df, df.replace(Series({'b': {}}))) + + @pytest.mark.parametrize("to_replace, method, expected", [ + (0, 'bfill', {'A': [1, 1, 2], + 'B': [5, nan, 7], + 'C': ['a', 'b', 'c']}), + (nan, 'bfill', {'A': [0, 1, 2], + 'B': [5.0, 7.0, 7.0], + 'C': ['a', 'b', 'c']}), + ('d', 'ffill', {'A': [0, 1, 2], + 'B': [5, nan, 7], + 'C': ['a', 'b', 'c']}), + ([0, 2], 'bfill', {'A': [1, 1, 2], + 'B': [5, nan, 7], + 'C': ['a', 'b', 'c']}), + ([1, 2], 'pad', {'A': [0, 0, 0], + 'B': [5, nan, 7], + 'C': ['a', 'b', 'c']}), + ((1, 2), 'bfill', {'A': [0, 2, 2], + 'B': [5, nan, 7], + 'C': ['a', 'b', 'c']}), + (['b', 'c'], 'ffill', {'A': [0, 1, 2], + 'B': [5, nan, 7], + 'C': ['a', 'a', 'a']}), + ]) + def test_replace_method(self, to_replace, method, expected): + # GH 19632 + df = DataFrame({'A': [0, 1, 2], + 'B': [5, nan, 7], + 'C': ['a', 'b', 'c']}) + + result = df.replace(to_replace=to_replace, value=None, method=method) + expected = DataFrame(expected) + assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index e1bc310e1e9347..ceb6c942c81b10 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -57,6 +57,32 @@ def test_diff(self): 1), 'z': pd.Series(1)}).astype('float64') assert_frame_equal(result, expected) + @pytest.mark.parametrize('tz', [None, 'UTC']) + def test_diff_datetime_axis0(self, tz): + # GH 18578 + df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz), + 1: date_range('2010', freq='D', periods=2, tz=tz)}) + + result = df.diff(axis=0) + expected = DataFrame({0: pd.TimedeltaIndex(['NaT', '1 days']), + 1: pd.TimedeltaIndex(['NaT', '1 days'])}) + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('tz', [None, 'UTC']) + def test_diff_datetime_axis1(self, tz): + # GH 18578 + df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz), + 1: date_range('2010', freq='D', periods=2, tz=tz)}) + if tz is None: + result = df.diff(axis=1) + expected = DataFrame({0: pd.TimedeltaIndex(['NaT', 'NaT']), + 1: pd.TimedeltaIndex(['0 days', + '0 days'])}) + assert_frame_equal(result, expected) + else: + with pytest.raises(NotImplementedError): + result = df.diff(axis=1) + def test_diff_timedelta(self): # GH 4533 df = DataFrame(dict(time=[Timestamp('20130101 9:01'), diff --git a/pandas/tests/groupby/test_groupby.py 
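`test_replace_method` above exercises the fill-based form of `DataFrame.replace` (GH 19632): with `value=None` and a `method`, matched cells are filled from neighboring rows instead of a scalar. A sketch against the patched build:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'A': [0, 1, 2], 'B': [5, np.nan, 7]})

# With value=None and method='bfill', each matched cell is filled
# from the next valid observation below it.
result = df.replace(to_replace=0, value=None, method='bfill')
print(result['A'].tolist())  # [1, 1, 2]
```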
b/pandas/tests/groupby/test_groupby.py index 2429e9975fc8e3..be0c32cefa6ffa 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -9,6 +9,7 @@ from pandas import (date_range, bdate_range, Timestamp, Index, MultiIndex, DataFrame, Series, concat, Panel, DatetimeIndex, read_csv) +from pandas.core.dtypes.missing import isna from pandas.errors import UnsupportedFunctionCall, PerformanceWarning from pandas.util.testing import (assert_frame_equal, assert_index_equal, assert_series_equal, assert_almost_equal) @@ -2061,60 +2062,29 @@ def test_rank_object_raises(self, ties_method, ascending, na_option, ascending=ascending, na_option=na_option, pct=pct) - @pytest.mark.parametrize("mix_groupings", [True, False]) - @pytest.mark.parametrize("as_series", [True, False]) - @pytest.mark.parametrize("val1,val2", [ - ('foo', 'bar'), (1, 2), (1., 2.)]) - @pytest.mark.parametrize("fill_method,limit,exp_vals", [ - ("ffill", None, - [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']), - ("ffill", 1, - [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]), - ("bfill", None, - ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]), - ("bfill", 1, - [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan]) + @pytest.mark.parametrize("agg_func", ['any', 'all']) + @pytest.mark.parametrize("skipna", [True, False]) + @pytest.mark.parametrize("vals", [ + ['foo', 'bar', 'baz'], ['foo', '', ''], ['', '', ''], + [1, 2, 3], [1, 0, 0], [0, 0, 0], + [1., 2., 3.], [1., 0., 0.], [0., 0., 0.], + [True, True, True], [True, False, False], [False, False, False], + [np.nan, np.nan, np.nan] ]) - def test_group_fill_methods(self, mix_groupings, as_series, val1, val2, - fill_method, limit, exp_vals): - vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan] - _exp_vals = list(exp_vals) - # Overwrite placeholder values - for index, exp_val in enumerate(_exp_vals): - if exp_val == 'val1': - _exp_vals[index] = val1 - elif exp_val == 'val2': - _exp_vals[index] = val2 - - # Need to modify values and expectations depending on the - # Series / DataFrame that we ultimately want to generate - if mix_groupings: # ['a', 'b', 'a, 'b', ...] - keys = ['a', 'b'] * len(vals) - - def interweave(list_obj): - temp = list() - for x in list_obj: - temp.extend([x, x]) - - return temp - - _exp_vals = interweave(_exp_vals) - vals = interweave(vals) - else: # ['a', 'a', 'a', ... 
'b', 'b', 'b'] - keys = ['a'] * len(vals) + ['b'] * len(vals) - _exp_vals = _exp_vals * 2 - vals = vals * 2 - - df = DataFrame({'key': keys, 'val': vals}) - if as_series: - result = getattr( - df.groupby('key')['val'], fill_method)(limit=limit) - exp = Series(_exp_vals, name='val') - assert_series_equal(result, exp) - else: - result = getattr(df.groupby('key'), fill_method)(limit=limit) - exp = DataFrame({'key': keys, 'val': _exp_vals}) - assert_frame_equal(result, exp) + def test_groupby_bool_aggs(self, agg_func, skipna, vals): + df = DataFrame({'key': ['a'] * 3 + ['b'] * 3, 'val': vals * 2}) + + # Figure out expectation using Python builtin + exp = getattr(compat.builtins, agg_func)(vals) + + # edge case for missing data with skipna and 'any' + if skipna and all(isna(vals)) and agg_func == 'any': + exp = False + + exp_df = DataFrame([exp] * 2, columns=['val'], index=pd.Index( + ['a', 'b'], name='key')) + result = getattr(df.groupby('key'), agg_func)(skipna=skipna) + assert_frame_equal(result, exp_df) def test_dont_clobber_name_column(self): df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 1be7dfdcc64e64..bce38b8cf9eed4 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -498,6 +498,31 @@ def test_cython_transform_series(self, op, args, targop): tm.assert_series_equal(expected, getattr( data.groupby(labels), op)(*args)) + @pytest.mark.parametrize("op", ['cumprod', 'cumsum']) + @pytest.mark.parametrize("skipna", [False, True]) + @pytest.mark.parametrize('input, exp', [ + # When everything is NaN + ({'key': ['b'] * 10, 'value': np.nan}, + pd.Series([np.nan] * 10, name='value')), + # When there is a single NaN + ({'key': ['b'] * 10 + ['a'] * 2, + 'value': [3] * 3 + [np.nan] + [3] * 8}, + {('cumprod', False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0], + ('cumprod', True): [3.0, 9.0, 27.0, np.nan, 81., 243., 729., + 2187., 6561., 19683., 3.0, 9.0], + ('cumsum', False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0], + ('cumsum', True): [3.0, 6.0, 9.0, np.nan, 12., 15., 18., + 21., 24., 27., 3.0, 6.0]})]) + def test_groupby_cum_skipna(self, op, skipna, input, exp): + df = pd.DataFrame(input) + result = df.groupby('key')['value'].transform(op, skipna=skipna) + if isinstance(exp, dict): + expected = exp[(op, skipna)] + else: + expected = exp + expected = pd.Series(expected, name='value') + tm.assert_series_equal(expected, result) + @pytest.mark.parametrize( "op, args, targop", [('cumprod', (), lambda x: x.cumprod()), @@ -611,3 +636,90 @@ def test_transform_numeric_ret(self, cols, exp, comp_func, agg_func): exp = exp.astype('float') comp_func(result, exp) + + @pytest.mark.parametrize("mix_groupings", [True, False]) + @pytest.mark.parametrize("as_series", [True, False]) + @pytest.mark.parametrize("val1,val2", [ + ('foo', 'bar'), (1, 2), (1., 2.)]) + @pytest.mark.parametrize("fill_method,limit,exp_vals", [ + ("ffill", None, + [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']), + ("ffill", 1, + [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]), + ("bfill", None, + ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]), + ("bfill", 1, + [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan]) + ]) + def test_group_fill_methods(self, mix_groupings, as_series, val1, val2, + fill_method, limit, exp_vals): + vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan] + _exp_vals = list(exp_vals) + 
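The new `test_groupby_bool_aggs` above compares each group against Python's builtin `any`/`all`. What that means for users, sketched against the patched build:

```python
import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
                   'val': [True, False, True, True]})

# Boolean reductions per group; skipna=True (the default) ignores
# missing values when deciding group truthiness.
print(df.groupby('key').all())  # a: False, b: True
print(df.groupby('key').any())  # a: True,  b: True
```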
# Overwrite placeholder values + for index, exp_val in enumerate(_exp_vals): + if exp_val == 'val1': + _exp_vals[index] = val1 + elif exp_val == 'val2': + _exp_vals[index] = val2 + + # Need to modify values and expectations depending on the + # Series / DataFrame that we ultimately want to generate + if mix_groupings: # ['a', 'b', 'a, 'b', ...] + keys = ['a', 'b'] * len(vals) + + def interweave(list_obj): + temp = list() + for x in list_obj: + temp.extend([x, x]) + + return temp + + _exp_vals = interweave(_exp_vals) + vals = interweave(vals) + else: # ['a', 'a', 'a', ... 'b', 'b', 'b'] + keys = ['a'] * len(vals) + ['b'] * len(vals) + _exp_vals = _exp_vals * 2 + vals = vals * 2 + + df = DataFrame({'key': keys, 'val': vals}) + if as_series: + result = getattr( + df.groupby('key')['val'], fill_method)(limit=limit) + exp = Series(_exp_vals, name='val') + assert_series_equal(result, exp) + else: + result = getattr(df.groupby('key'), fill_method)(limit=limit) + exp = DataFrame({'key': keys, 'val': _exp_vals}) + assert_frame_equal(result, exp) + + @pytest.mark.parametrize("test_series", [True, False]) + @pytest.mark.parametrize("periods,fill_method,limit", [ + (1, 'ffill', None), (1, 'ffill', 1), + (1, 'bfill', None), (1, 'bfill', 1), + (-1, 'ffill', None), (-1, 'ffill', 1), + (-1, 'bfill', None), (-1, 'bfill', 1)]) + def test_pct_change(self, test_series, periods, fill_method, limit): + vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan] + exp_vals = Series(vals).pct_change(periods=periods, + fill_method=fill_method, + limit=limit).tolist() + + df = DataFrame({'key': ['a'] * len(vals) + ['b'] * len(vals), + 'vals': vals * 2}) + grp = df.groupby('key') + + def get_result(grp_obj): + return grp_obj.pct_change(periods=periods, + fill_method=fill_method, + limit=limit) + + if test_series: + exp = pd.Series(exp_vals * 2) + exp.name = 'vals' + grp = grp['vals'] + result = get_result(grp) + tm.assert_series_equal(result, exp) + else: + exp = DataFrame({'vals': exp_vals * 2}) + result = get_result(grp) + tm.assert_frame_equal(result, exp) diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py b/pandas/tests/indexes/datetimes/test_arithmetic.py index 0c56c6b16fb2f8..8f259a7e788973 100644 --- a/pandas/tests/indexes/datetimes/test_arithmetic.py +++ b/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -14,6 +14,7 @@ from pandas import (Timestamp, Timedelta, Series, DatetimeIndex, TimedeltaIndex, date_range) +from pandas.core import ops from pandas._libs import tslib from pandas._libs.tslibs.offsets import shift_months @@ -307,6 +308,17 @@ def test_dti_cmp_list(self): class TestDatetimeIndexArithmetic(object): + # ------------------------------------------------------------- + # Invalid Operations + + @pytest.mark.parametrize('other', [3.14, np.array([2.0, 3.0])]) + @pytest.mark.parametrize('op', [operator.add, ops.radd, + operator.sub, ops.rsub]) + def test_dti_add_sub_float(self, op, other): + dti = DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') + with pytest.raises(TypeError): + op(dti, other) + def test_dti_add_timestamp_raises(self): idx = DatetimeIndex(['2011-01-01', '2011-01-02']) msg = "cannot add DatetimeIndex and Timestamp" diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 2013b5e6cd6ddf..056924f2c66635 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -1,3 +1,6 @@ +import locale +import calendar + import pytest import numpy as np @@ -87,7 +90,6 @@ def 
test_range_edges(self): class TestDatetime64(object): def test_datetimeindex_accessors(self): - dti_naive = DatetimeIndex(freq='D', start=datetime(1998, 1, 1), periods=365) # GH 13303 @@ -134,23 +136,6 @@ def test_datetimeindex_accessors(self): assert not dti.is_year_end[0] assert dti.is_year_end[364] - # GH 11128 - assert dti.weekday_name[4] == u'Monday' - assert dti.weekday_name[5] == u'Tuesday' - assert dti.weekday_name[6] == u'Wednesday' - assert dti.weekday_name[7] == u'Thursday' - assert dti.weekday_name[8] == u'Friday' - assert dti.weekday_name[9] == u'Saturday' - assert dti.weekday_name[10] == u'Sunday' - - assert Timestamp('2016-04-04').weekday_name == u'Monday' - assert Timestamp('2016-04-05').weekday_name == u'Tuesday' - assert Timestamp('2016-04-06').weekday_name == u'Wednesday' - assert Timestamp('2016-04-07').weekday_name == u'Thursday' - assert Timestamp('2016-04-08').weekday_name == u'Friday' - assert Timestamp('2016-04-09').weekday_name == u'Saturday' - assert Timestamp('2016-04-10').weekday_name == u'Sunday' - assert len(dti.year) == 365 assert len(dti.month) == 365 assert len(dti.day) == 365 @@ -256,6 +241,56 @@ def test_datetimeindex_accessors(self): assert dates.weekofyear.tolist() == expected assert [d.weekofyear for d in dates] == expected + # GH 12806 + @pytest.mark.parametrize('time_locale', [ + None] if tm.get_locales() is None else [None] + tm.get_locales()) + def test_datetime_name_accessors(self, time_locale): + # Test Monday -> Sunday and January -> December, in that sequence + if time_locale is None: + # If the time_locale is None, day-name and month_name should + # return the english attributes + expected_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', + 'Friday', 'Saturday', 'Sunday'] + expected_months = ['January', 'February', 'March', 'April', 'May', + 'June', 'July', 'August', 'September', + 'October', 'November', 'December'] + else: + with tm.set_locale(time_locale, locale.LC_TIME): + expected_days = calendar.day_name[:] + expected_months = calendar.month_name[1:] + + # GH 11128 + dti = DatetimeIndex(freq='D', start=datetime(1998, 1, 1), + periods=365) + english_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', + 'Friday', 'Saturday', 'Sunday'] + for day, name, eng_name in zip(range(4, 11), + expected_days, + english_days): + name = name.capitalize() + assert dti.weekday_name[day] == eng_name + assert dti.day_name(locale=time_locale)[day] == name + ts = Timestamp(datetime(2016, 4, day)) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + assert ts.weekday_name == eng_name + assert ts.day_name(locale=time_locale) == name + dti = dti.append(DatetimeIndex([pd.NaT])) + assert np.isnan(dti.day_name(locale=time_locale)[-1]) + ts = Timestamp(pd.NaT) + assert np.isnan(ts.day_name(locale=time_locale)) + + # GH 12805 + dti = DatetimeIndex(freq='M', start='2012', end='2013') + result = dti.month_name(locale=time_locale) + expected = Index([month.capitalize() for month in expected_months]) + tm.assert_index_equal(result, expected) + for date, expected in zip(dti, expected_months): + result = date.month_name(locale=time_locale) + assert result == expected.capitalize() + dti = dti.append(DatetimeIndex([pd.NaT])) + assert np.isnan(dti.month_name(locale=time_locale)[-1]) + def test_nanosecond_field(self): dti = DatetimeIndex(np.arange(10)) diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 6bb42298835251..f263ac78cd3438 100644 --- 
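The accessor tests above replace hard-coded `weekday_name` assertions with locale-aware `day_name()`/`month_name()` checks (GH 12806). Roughly how the new accessors behave, assuming the patched build and that any non-default locale you request is installed on the system:

```python
import pandas as pd

ts = pd.Timestamp('2018-03-05')       # a Monday
print(ts.day_name())                  # 'Monday'
print(ts.month_name())                # 'March'

# Locale-aware variant; 'de_DE' is only an example locale name.
print(ts.day_name(locale='de_DE'))    # 'Montag', if de_DE is available

idx = pd.date_range('2018-01-31', periods=3, freq='M')
print(idx.month_name().tolist())      # ['January', 'February', 'March']
```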
a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -2,7 +2,7 @@ import pytest -from datetime import datetime, date +from datetime import datetime import numpy as np import pandas as pd import operator as op @@ -349,7 +349,7 @@ def test_loc_datetime_length_one(self): @pytest.mark.parametrize('datetimelike', [ Timestamp('20130101'), datetime(2013, 1, 1), - date(2013, 1, 1), np.datetime64('2013-01-01T00:00', 'ns')]) + np.datetime64('2013-01-01T00:00', 'ns')]) @pytest.mark.parametrize('op,expected', [ (op.lt, [True, False, False, False]), (op.le, [True, True, False, False]), diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 6f0756949edc63..9180bb0af3af36 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -47,7 +47,12 @@ def test_dti_timestamp_fields(self, field): # extra fields from DatetimeIndex like quarter and week idx = tm.makeDateIndex(100) expected = getattr(idx, field)[-1] - result = getattr(Timestamp(idx[-1]), field) + if field == 'weekday_name': + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = getattr(Timestamp(idx[-1]), field) + else: + result = getattr(Timestamp(idx[-1]), field) assert result == expected def test_dti_timestamp_freq_fields(self): diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index fbf0977a04d826..0d42b6e9692feb 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -224,27 +224,34 @@ def test_to_datetime_today(self): # this both of these timezones _and_ UTC will all be in the same day, # so this test will not detect the regression introduced in #18666. 
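The rewrite above compares the normalized `pd.to_datetime('today')` against numpy's day-resolution `'today'`, plus the two `Timestamp` spellings. In sketch form, assuming the patched build:

```python
import numpy as np
import pandas as pd

# pd.to_datetime('today') is the current wall-clock instant, tz-naive;
# normalize() drops the time of day. np.datetime64('today') is the
# current day at midnight, so the two agree whenever the local date
# matches numpy's (UTC-based) date.
now = pd.to_datetime('today')
print(now.tzinfo)        # None
print(now.normalize())
print(np.datetime64('today').astype('datetime64[ns]'))
```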
with tm.set_timezone('Pacific/Auckland'): # 12-13 hours ahead of UTC - nptoday = np.datetime64('today').astype('datetime64[ns]') + nptoday = np.datetime64('today')\ + .astype('datetime64[ns]').astype(np.int64) pdtoday = pd.to_datetime('today') pdtoday2 = pd.to_datetime(['today'])[0] + tstoday = pd.Timestamp('today') + tstoday2 = pd.Timestamp.today() + # These should all be equal with infinite perf; this gives # a generous margin of 10 seconds - assert abs(pdtoday.value - nptoday.astype(np.int64)) < 1e10 - assert abs(pdtoday2.value - nptoday.astype(np.int64)) < 1e10 + assert abs(pdtoday.normalize().value - nptoday) < 1e10 + assert abs(pdtoday2.normalize().value - nptoday) < 1e10 + assert abs(pdtoday.value - tstoday.value) < 1e10 + assert abs(pdtoday.value - tstoday2.value) < 1e10 assert pdtoday.tzinfo is None assert pdtoday2.tzinfo is None with tm.set_timezone('US/Samoa'): # 11 hours behind UTC - nptoday = np.datetime64('today').astype('datetime64[ns]') + nptoday = np.datetime64('today')\ + .astype('datetime64[ns]').astype(np.int64) pdtoday = pd.to_datetime('today') pdtoday2 = pd.to_datetime(['today'])[0] # These should all be equal with infinite perf; this gives # a generous margin of 10 seconds - assert abs(pdtoday.value - nptoday.astype(np.int64)) < 1e10 - assert abs(pdtoday2.value - nptoday.astype(np.int64)) < 1e10 + assert abs(pdtoday.normalize().value - nptoday) < 1e10 + assert abs(pdtoday2.normalize().value - nptoday) < 1e10 assert pdtoday.tzinfo is None assert pdtoday2.tzinfo is None diff --git a/pandas/tests/indexes/period/test_arithmetic.py b/pandas/tests/indexes/period/test_arithmetic.py index d7bf1e0210f622..c75fdd35a974c6 100644 --- a/pandas/tests/indexes/period/test_arithmetic.py +++ b/pandas/tests/indexes/period/test_arithmetic.py @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- from datetime import timedelta +import operator + import pytest import numpy as np @@ -9,6 +11,7 @@ period_range, Period, PeriodIndex, _np_version_under1p10) import pandas.core.indexes.period as period +from pandas.core import ops from pandas.errors import PerformanceWarning @@ -256,6 +259,18 @@ def test_comp_nat(self, dtype): class TestPeriodIndexArithmetic(object): + # ------------------------------------------------------------- + # Invalid Operations + + @pytest.mark.parametrize('other', [3.14, np.array([2.0, 3.0])]) + @pytest.mark.parametrize('op', [operator.add, ops.radd, + operator.sub, ops.rsub]) + def test_pi_add_sub_float(self, op, other): + dti = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') + pi = dti.to_period('D') + with pytest.raises(TypeError): + op(pi, other) + # ----------------------------------------------------------------- # __add__/__sub__ with ndarray[datetime64] and ndarray[timedelta64] diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index d7f185853ca452..eb429f46a33557 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -519,7 +519,6 @@ def test_is_(self): assert not ind.is_(ind.copy()) assert not ind.is_(ind.copy(deep=False)) assert not ind.is_(ind[:]) - assert not ind.is_(ind.view(np.ndarray).view(Index)) assert not ind.is_(np.array(range(10))) # quasi-implementation dependent @@ -2327,3 +2326,10 @@ def test_generated_op_names(opname, indices): opname = '__{name}__'.format(name=opname) method = getattr(index, opname) assert method.__name__ == opname + + +@pytest.mark.parametrize('idx_maker', tm.index_subclass_makers_generator()) +def test_index_subclass_constructor_wrong_kwargs(idx_maker): + # GH 
#19348 + with tm.assert_raises_regex(TypeError, 'unexpected keyword argument'): + idx_maker(foo='bar') diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index 9ffffb6ff06d5e..9035434046ccbd 100644 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +import operator + import pytest import numpy as np from datetime import timedelta @@ -11,6 +13,7 @@ Series, Timestamp, Timedelta) from pandas.errors import PerformanceWarning, NullFrequencyError +from pandas.core import ops @pytest.fixture(params=[pd.offsets.Hour(2), timedelta(hours=2), @@ -270,6 +273,15 @@ class TestTimedeltaIndexArithmetic(object): # ------------------------------------------------------------- # Invalid Operations + @pytest.mark.parametrize('other', [3.14, np.array([2.0, 3.0])]) + @pytest.mark.parametrize('op', [operator.add, ops.radd, + operator.sub, ops.rsub]) + def test_tdi_add_sub_float(self, op, other): + dti = DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') + tdi = dti - dti.shift(1) + with pytest.raises(TypeError): + op(tdi, other) + def test_tdi_add_str_invalid(self): # GH 13624 tdi = TimedeltaIndex(['1 day', '2 days']) diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index 16326845de1d54..3eb5f38ba0c806 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -1,6 +1,5 @@ import pytest import numpy as np -import pandas as pd from pandas import Series, IntervalIndex, Interval import pandas.util.testing as tm @@ -170,17 +169,17 @@ def test_loc_with_overlap(self): # interval expected = 0 - result = s.loc[pd.interval(1, 5)] + result = s.loc[Interval(1, 5)] tm.assert_series_equal(expected, result) - result = s[pd.interval(1, 5)] + result = s[Interval(1, 5)] tm.assert_series_equal(expected, result) expected = s - result = s.loc[[pd.interval(1, 5), pd.Interval(3, 7)]] + result = s.loc[[Interval(1, 5), Interval(3, 7)]] tm.assert_series_equal(expected, result) - result = s[[pd.interval(1, 5), pd.Interval(3, 7)]] + result = s[[Interval(1, 5), Interval(3, 7)]] tm.assert_series_equal(expected, result) with pytest.raises(KeyError): @@ -197,17 +196,17 @@ def test_loc_with_overlap(self): # slices with interval (only exact matches) expected = s - result = s.loc[pd.interval(1, 5):pd.Interval(3, 7)] + result = s.loc[Interval(1, 5):Interval(3, 7)] tm.assert_series_equal(expected, result) - result = s[pd.interval(1, 5):pd.Interval(3, 7)] + result = s[Interval(1, 5):Interval(3, 7)] tm.assert_series_equal(expected, result) with pytest.raises(KeyError): - s.loc[pd.interval(1, 6):pd.Interval(3, 8)] + s.loc[Interval(1, 6):Interval(3, 8)] with pytest.raises(KeyError): - s[pd.interval(1, 6):pd.Interval(3, 8)] + s[Interval(1, 6):Interval(3, 8)] # slices with scalar raise for overlapping intervals # TODO KeyError is the appropriate error? 
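These interval tests also swap the nonexistent `pd.interval` for the proper `Interval` constructor. Under the exact-match lookup semantics they encode (a sketch of the intended behavior, not necessarily the released default):

```python
from pandas import Interval, IntervalIndex, Series

s = Series([0, 1], index=IntervalIndex.from_tuples([(1, 5), (3, 7)]))

# An Interval key matches only an identical interval in the index...
print(s.loc[Interval(1, 5)])                 # 0

# ...and slices of Intervals likewise match on exact endpoints.
print(s.loc[Interval(1, 5):Interval(3, 7)])  # both rows
```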
@@ -217,7 +216,7 @@ def test_loc_with_overlap(self): def test_non_unique(self): idx = IntervalIndex.from_tuples([(1, 3), (3, 7)]) - s = pd.Series(range(len(idx)), index=idx) + s = Series(range(len(idx)), index=idx) result = s.loc[Interval(1, 3)] assert result == 0 diff --git a/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.14.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.14.pickle new file mode 100644 index 00000000000000..6341fa26d1f258 Binary files /dev/null and b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.14.pickle differ diff --git a/pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_AMD64_windows_2.7.14.pickle b/pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_AMD64_windows_2.7.14.pickle new file mode 100644 index 00000000000000..ddd88f77aa2a49 Binary files /dev/null and b/pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_AMD64_windows_2.7.14.pickle differ diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 03c071dbe4bc53..6c3b75cdfa6df3 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1434,6 +1434,13 @@ def test_repr_html(self): tm.reset_display_options() + def test_repr_html_mathjax(self): + df = DataFrame([[1, 2], [3, 4]]) + assert 'tex2jax_ignore' not in df._repr_html_() + + with pd.option_context('display.html.use_mathjax', False): + assert 'tex2jax_ignore' in df._repr_html_() + def test_repr_html_wide(self): max_cols = get_option('display.max_columns') df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index adf8e14b756c26..c1ab9cd1843408 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -46,6 +46,13 @@ def test_init_series(self): def test_repr_html_ok(self): self.styler._repr_html_() + def test_repr_html_mathjax(self): + # gh-19824 + assert 'tex2jax_ignore' not in self.styler._repr_html_() + + with pd.option_context('display.html.use_mathjax', False): + assert 'tex2jax_ignore' in self.styler._repr_html_() + def test_update_ctx(self): self.styler._update_ctx(self.attrs) expected = {(0, 0): ['color: red'], diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py index 67f95c828c80e8..9f1ac8b1e677b2 100755 --- a/pandas/tests/io/generate_legacy_storage_files.py +++ b/pandas/tests/io/generate_legacy_storage_files.py @@ -40,7 +40,7 @@ from pandas import (Series, DataFrame, Panel, SparseSeries, SparseDataFrame, Index, MultiIndex, bdate_range, to_msgpack, - date_range, period_range, + date_range, period_range, timedelta_range, Timestamp, NaT, Categorical, Period) from pandas.tseries.offsets import ( DateOffset, Hour, Minute, Day, @@ -116,7 +116,18 @@ def create_data(): index = dict(int=Index(np.arange(10)), date=date_range('20130101', periods=10), - period=period_range('2013-01-01', freq='M', periods=10)) + period=period_range('2013-01-01', freq='M', periods=10), + float=Index(np.arange(10, dtype=np.float64)), + uint=Index(np.arange(10, dtype=np.uint64)), + timedelta=timedelta_range('00:00:00', freq='30T', periods=10)) + + if _loose_version >= LooseVersion('0.18'): + from pandas import RangeIndex + index['range'] = RangeIndex(10) + + if _loose_version >= LooseVersion('0.21'): + from pandas import interval_range + index['interval'] = interval_range(0, periods=10) mi = dict(reg2=MultiIndex.from_tuples( 
tuple(zip(*[[u'bar', u'bar', u'baz', u'baz', u'foo', @@ -276,6 +287,9 @@ def create_msgpack_data(): del data['frame']['cat_onecol'] del data['frame']['cat_and_float'] del data['scalars']['period'] + if _loose_version < LooseVersion('0.23.0'): + del data['index']['interval'] + del data['offsets'] return _u(data) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index fdf9954285db88..6b39717213c0d1 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -762,17 +762,17 @@ def test_read_excel_multiindex_empty_level(self, ext): # GH 12453 with ensure_clean('.xlsx') as path: df = DataFrame({ - ('Zero', ''): {0: 0}, ('One', 'x'): {0: 1}, ('Two', 'X'): {0: 3}, - ('Two', 'Y'): {0: 7} + ('Two', 'Y'): {0: 7}, + ('Zero', ''): {0: 0} }) expected = DataFrame({ - ('Zero', 'Unnamed: 3_level_1'): {0: 0}, ('One', u'x'): {0: 1}, ('Two', u'X'): {0: 3}, - ('Two', u'Y'): {0: 7} + ('Two', u'Y'): {0: 7}, + ('Zero', 'Unnamed: 3_level_1'): {0: 0} }) df.to_excel(path) @@ -1014,7 +1014,7 @@ class _WriterBase(SharedItems): def set_engine_and_path(self, request, merge_cells, engine, ext): """Fixture to set engine and open file for use in each test case - Rather than requiring `engine=...` to be provided explictly as an + Rather than requiring `engine=...` to be provided explicitly as an argument in each test, this fixture sets a global option to dictate which engine should be used to write Excel files. After executing the test it rolls back said change to the global option. @@ -1373,11 +1373,6 @@ def test_to_excel_interval_labels(self, merge_cells, engine, ext): def test_to_excel_timedelta(self, merge_cells, engine, ext): # GH 19242, GH9155 - test writing timedelta to xls - if engine == 'openpyxl': - pytest.xfail('Timedelta roundtrip broken with openpyxl') - if engine == 'xlsxwriter' and (sys.version_info[0] == 2 and - sys.platform.startswith('linux')): - pytest.xfail('Not working on linux with Py2 and xlsxwriter') frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), columns=['A'], dtype=np.int64 diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 151a0750b7f6e7..b18104e9515049 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -674,6 +674,39 @@ def test_wikipedia_states_table(self): result = self.read_html(data, 'Arizona', header=1)[0] assert result['sq mi'].dtype == np.dtype('float64') + @pytest.mark.parametrize("displayed_only,exp0,exp1", [ + (True, DataFrame(["foo"]), None), + (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))]) + def test_displayed_only(self, displayed_only, exp0, exp1): + # GH 20027 + data = StringIO(""" + +
+ <table> + <tr> + <td> + foo + <span style="display:none;text-align:center">bar</span> + <span style="display:none">baz</span> + <span style="display: none">qux</span> + </td> + </tr> + </table> + <table style="display: none"> + <tr> + <td>foo</td> + </tr> + </table> + """) + + dfs = self.read_html(data, displayed_only=displayed_only) + tm.assert_frame_equal(dfs[0], exp0) + + if exp1 is not None: + tm.assert_frame_equal(dfs[1], exp1) + else: + assert len(dfs) == 1 # Should not parse hidden table + def test_decimal_rows(self): # GH 12907
@@ -896,6 +929,39 @@ def test_computer_sales_page(self): data = os.path.join(DATA_PATH, 'computer_sales_page.html') self.read_html(data, header=[0, 1]) + @pytest.mark.parametrize("displayed_only,exp0,exp1", [ + (True, DataFrame(["foo"]), None), + (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))]) + def test_displayed_only(self, displayed_only, exp0, exp1): + # GH 20027 + data = StringIO(""" + <table> + <tr> + <td> + foo + <span style="display:none;text-align:center">bar</span> + <span style="display:none">baz</span> + <span style="display: none">qux</span> + </td> + </tr> + </table> + <table style="display: none"> + <tr> + <td>foo</td> + </tr> + </table>
+ + """) + + dfs = self.read_html(data, displayed_only=displayed_only) + tm.assert_frame_equal(dfs[0], exp0) + + if exp1 is not None: + tm.assert_frame_equal(dfs[1], exp1) + else: + assert len(dfs) == 1 # Should not parse hidden table + def test_invalid_flavor(): url = 'google.com' diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index c343e0105eb4f7..919b34dc09f6fe 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -10,7 +10,8 @@ from pandas import compat from pandas.compat import u, PY3 from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, - date_range, period_range, Index, Categorical) + date_range, period_range, Index, Categorical, + Period, Interval) from pandas.errors import PerformanceWarning from pandas.io.packers import to_msgpack, read_msgpack import pandas.util.testing as tm @@ -317,6 +318,19 @@ def test_timedeltas(self): i_rec = self.encode_decode(i) assert i == i_rec + def test_periods(self): + # 13463 + for i in [Period('2010-09', 'M'), Period('2014-Q1', 'Q')]: + i_rec = self.encode_decode(i) + assert i == i_rec + + def test_intervals(self): + # 19967 + for i in [Interval(0, 1), Interval(0, 1, 'left'), + Interval(10, 25., 'right')]: + i_rec = self.encode_decode(i) + assert i == i_rec + class TestIndex(TestPackers): @@ -334,7 +348,9 @@ def setup_method(self, method): 'period': Index(period_range('2012-1-1', freq='M', periods=3)), 'date2': Index(date_range('2013-01-1', periods=10)), 'bdate': Index(bdate_range('2013-01-02', periods=10)), - 'cat': tm.makeCategoricalIndex(100) + 'cat': tm.makeCategoricalIndex(100), + 'interval': tm.makeIntervalIndex(100), + 'timedelta': tm.makeTimedeltaIndex(100, 'H') } self.mi = { diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 04da6da74059be..e690b1e302d8bf 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -2034,7 +2034,7 @@ def test_table_values_dtypes_roundtrip(self): 'bool': 1, 'int16': 1, 'int8': 1, 'int64': 1, 'object': 1, 'datetime64[ns]': 2}) result = result.sort_index() - result = expected.sort_index() + expected = expected.sort_index() tm.assert_series_equal(result, expected) def test_table_mixed_dtypes(self): diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index f3ab74d37a2bc9..4530cc9d2fba9a 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1665,6 +1665,29 @@ class Temporary(Base): tm.assert_frame_equal(df, expected) + def test_insert_multivalues(self): + # issues addressed + # https://github.com/pandas-dev/pandas/issues/14315 + # https://github.com/pandas-dev/pandas/issues/8953 + + db = sql.SQLDatabase(self.conn) + df = DataFrame({'A': [1, 0, 0], 'B': [1.1, 0.2, 4.3]}) + table = sql.SQLTable("test_table", db, frame=df) + data = [ + {'A': 1, 'B': 0.46}, + {'A': 0, 'B': -2.06} + ] + statement = table.insert_statement(data, conn=self.conn)[0] + + if self.supports_multivalues_insert: + assert statement.parameters == data, ( + 'insert statement should be multivalues' + ) + else: + assert statement.parameters is None, ( + 'insert statement should not be multivalues' + ) + class _TestSQLAlchemyConn(_EngineToConnMixin, _TestSQLAlchemy): @@ -1679,6 +1702,7 @@ class _TestSQLiteAlchemy(object): """ flavor = 'sqlite' + supports_multivalues_insert = True @classmethod def connect(cls): @@ -1727,6 +1751,7 @@ class _TestMySQLAlchemy(object): """ flavor = 'mysql' + supports_multivalues_insert = True @classmethod def connect(cls): 
@@ -1796,6 +1821,7 @@ class _TestPostgreSQLAlchemy(object): """ flavor = 'postgresql' + supports_multivalues_insert = True @classmethod def connect(cls): diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 08a047a2e77070..2f2931c9c86acf 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -100,14 +100,26 @@ def test_nonnumeric_exclude(self): pytest.raises(TypeError, df['A'].plot) + def test_tsplot_deprecated(self): + from pandas.tseries.plotting import tsplot + + _, ax = self.plt.subplots() + ts = tm.makeTimeSeries() + + with tm.assert_produces_warning(FutureWarning): + tsplot(ts, self.plt.Axes.plot, ax=ax) + @pytest.mark.slow def test_tsplot(self): + from pandas.tseries.plotting import tsplot _, ax = self.plt.subplots() ts = tm.makeTimeSeries() - f = lambda *args, **kwds: tsplot(s, self.plt.Axes.plot, *args, **kwds) + def f(*args, **kwds): + with tm.assert_produces_warning(FutureWarning): + return tsplot(s, self.plt.Axes.plot, *args, **kwds) for s in self.period_ser: _check_plot_works(f, s.index.freq, ax=ax, series=s) @@ -179,11 +191,13 @@ def check_format_of_first_point(ax, expected_string): tm.close() # tsplot - _, ax = self.plt.subplots() from pandas.tseries.plotting import tsplot - tsplot(annual, self.plt.Axes.plot, ax=ax) + _, ax = self.plt.subplots() + with tm.assert_produces_warning(FutureWarning): + tsplot(annual, self.plt.Axes.plot, ax=ax) check_format_of_first_point(ax, 't = 2014 y = 1.000000') - tsplot(daily, self.plt.Axes.plot, ax=ax) + with tm.assert_produces_warning(FutureWarning): + tsplot(daily, self.plt.Axes.plot, ax=ax) check_format_of_first_point(ax, 't = 2014-01-01 y = 1.000000') @pytest.mark.slow @@ -870,12 +884,12 @@ def test_to_weekly_resampling(self): for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq - # tsplot - from pandas.tseries.plotting import tsplot - _, ax = self.plt.subplots() - tsplot(high, self.plt.Axes.plot, ax=ax) - lines = tsplot(low, self.plt.Axes.plot, ax=ax) + _, ax = self.plt.subplots() + from pandas.tseries.plotting import tsplot + with tm.assert_produces_warning(FutureWarning): + tsplot(high, self.plt.Axes.plot, ax=ax) + with tm.assert_produces_warning(FutureWarning): + lines = tsplot(low, self.plt.Axes.plot, ax=ax) for l in lines: assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq @@ -901,12 +915,12 @@ def test_from_weekly_resampling(self): tm.assert_numpy_array_equal(xdata, expected_h) tm.close() - # tsplot - from pandas.tseries.plotting import tsplot - _, ax = self.plt.subplots() - tsplot(low, self.plt.Axes.plot, ax=ax) - lines = tsplot(high, self.plt.Axes.plot, ax=ax) + _, ax = self.plt.subplots() + from pandas.tseries.plotting import tsplot + with tm.assert_produces_warning(FutureWarning): + tsplot(low, self.plt.Axes.plot, ax=ax) + with tm.assert_produces_warning(FutureWarning): + lines = tsplot(high, self.plt.Axes.plot, ax=ax) for l in lines: assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq xdata = l.get_xdata(orig=False) @@ -1027,6 +1041,7 @@ def test_irreg_dtypes(self): _, ax = self.plt.subplots() _check_plot_works(df.plot, ax=ax) + @pytest.mark.xfail(not PY3, reason="failing on mpl 1.4.3 on PY2") @pytest.mark.slow def test_time(self): t = datetime(1, 1, 1, 3, 30, 0) @@ -1342,9 +1357,11 @@ def test_plot_outofbounds_datetime(self): values = [datetime(1677, 1, 1, 12), datetime(1677, 1, 2, 12)] ax.plot(values) + @td.xfail_if_mpl_2_2 + @pytest.mark.skipif( + is_platform_mac(), + reason="skip on mac for precision display issue on older mpl") def
test_format_timedelta_ticks_narrow(self): - if is_platform_mac(): - pytest.skip("skip on mac for precision display issue on older mpl") if self.mpl_ge_2_0_0: expected_labels = [''] + [ @@ -1365,9 +1382,11 @@ def test_format_timedelta_ticks_narrow(self): for l, l_expected in zip(labels, expected_labels): assert l.get_text() == l_expected + @td.xfail_if_mpl_2_2 + @pytest.mark.skipif( + is_platform_mac(), + reason="skip on mac for precision display issue on older mpl") def test_format_timedelta_ticks_wide(self): - if is_platform_mac(): - pytest.skip("skip on mac for precision display issue on older mpl") if self.mpl_ge_2_0_0: expected_labels = [ diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 3d25b0b51e0520..b29afcb404ac6c 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -2461,6 +2461,7 @@ def test_errorbar_asymmetrical(self): tm.close() + @td.xfail_if_mpl_2_2 def test_table(self): df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 9e538ae130a856..c5ce8aba9d80ec 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -52,6 +52,7 @@ def test_bootstrap_plot(self): @td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): + @td.xfail_if_mpl_2_2 @td.skip_if_no_scipy def test_scatter_matrix_axis(self): scatter_matrix = plotting.scatter_matrix diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 278be433183fa9..5dc7d52e057785 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -792,6 +792,7 @@ def test_errorbar_plot(self): with pytest.raises((ValueError, TypeError)): s.plot(yerr=s_err) + @td.xfail_if_mpl_2_2 def test_table(self): _check_plot_works(self.series.plot, table=True) _check_plot_works(self.series.plot, table=self.series) diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py index ff914273d47b1d..8d093f2784ba14 100644 --- a/pandas/tests/reshape/test_tile.py +++ b/pandas/tests/reshape/test_tile.py @@ -4,7 +4,7 @@ import numpy as np from pandas.compat import zip -from pandas import (Series, isna, to_datetime, DatetimeIndex, +from pandas import (DataFrame, Series, isna, to_datetime, DatetimeIndex, Index, Timestamp, Interval, IntervalIndex, Categorical, cut, qcut, date_range, NaT, TimedeltaIndex) from pandas.tseries.offsets import Nano, Day @@ -104,6 +104,12 @@ def test_cut_corner(self): pytest.raises(ValueError, cut, [1, 2, 3], 0.5) + @pytest.mark.parametrize('arg', [2, np.eye(2), DataFrame(np.eye(2))]) + @pytest.mark.parametrize('cut_func', [cut, qcut]) + def test_cut_not_1d_arg(self, arg, cut_func): + with pytest.raises(ValueError): + cut_func(arg, 2) + def test_cut_out_of_range_more(self): # #1511 s = Series([0, -1, 0, 1, -3], name='x') @@ -251,18 +257,6 @@ def test_qcut_nas(self): result = qcut(arr, 4) assert isna(result[:20]).all() - @pytest.mark.parametrize('s', [ - Series(DatetimeIndex(['20180101', NaT, '20180103'])), - Series(TimedeltaIndex(['0 days', NaT, '2 days']))], - ids=lambda x: str(x.dtype)) - def test_qcut_nat(self, s): - # GH 19768 - intervals = IntervalIndex.from_tuples( - [(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])]) - expected = Series(Categorical(intervals, ordered=True)) - result = qcut(s, 2) - tm.assert_series_equal(result, expected) - def test_qcut_index(self): result = qcut([0, 2], 2) intervals =
[Interval(-0.001, 1), Interval(1, 2)] @@ -452,6 +446,37 @@ def test_single_bin(self): result = cut(s, 1, labels=False) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "array_1_writeable, array_2_writeable", + [(True, True), (True, False), (False, False)]) + def test_cut_read_only(self, array_1_writeable, array_2_writeable): + # issue 18773 + array_1 = np.arange(0, 100, 10) + array_1.flags.writeable = array_1_writeable + + array_2 = np.arange(0, 100, 10) + array_2.flags.writeable = array_2_writeable + + hundred_elements = np.arange(100) + + tm.assert_categorical_equal(cut(hundred_elements, array_1), + cut(hundred_elements, array_2)) + + +class TestDatelike(object): + + @pytest.mark.parametrize('s', [ + Series(DatetimeIndex(['20180101', NaT, '20180103'])), + Series(TimedeltaIndex(['0 days', NaT, '2 days']))], + ids=lambda x: str(x.dtype)) + def test_qcut_nat(self, s): + # GH 19768 + intervals = IntervalIndex.from_tuples( + [(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])]) + expected = Series(Categorical(intervals, ordered=True)) + result = qcut(s, 2) + tm.assert_series_equal(result, expected) + def test_datetime_cut(self): # GH 14714 # testing for time data to be present as series @@ -488,6 +513,47 @@ def test_datetime_cut(self): result, bins = cut(data, 3, retbins=True) tm.assert_series_equal(Series(result), expected) + @pytest.mark.parametrize('bins', [ + 3, [Timestamp('2013-01-01 04:57:07.200000'), + Timestamp('2013-01-01 21:00:00'), + Timestamp('2013-01-02 13:00:00'), + Timestamp('2013-01-03 05:00:00')]]) + @pytest.mark.parametrize('box', [list, np.array, Index, Series]) + def test_datetimetz_cut(self, bins, box): + # GH 19872 + tz = 'US/Eastern' + s = Series(date_range('20130101', periods=3, tz=tz)) + if not isinstance(bins, int): + bins = box(bins) + result = cut(s, bins) + expected = ( + Series(IntervalIndex([ + Interval(Timestamp('2012-12-31 23:57:07.200000', tz=tz), + Timestamp('2013-01-01 16:00:00', tz=tz)), + Interval(Timestamp('2013-01-01 16:00:00', tz=tz), + Timestamp('2013-01-02 08:00:00', tz=tz)), + Interval(Timestamp('2013-01-02 08:00:00', tz=tz), + Timestamp('2013-01-03 00:00:00', tz=tz))])) + .astype(CDT(ordered=True))) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('bins', [3, np.linspace(0, 1, 4)]) + def test_datetimetz_qcut(self, bins): + # GH 19872 + tz = 'US/Eastern' + s = Series(date_range('20130101', periods=3, tz=tz)) + result = qcut(s, bins) + expected = ( + Series(IntervalIndex([ + Interval(Timestamp('2012-12-31 23:59:59.999999999', tz=tz), + Timestamp('2013-01-01 16:00:00', tz=tz)), + Interval(Timestamp('2013-01-01 16:00:00', tz=tz), + Timestamp('2013-01-02 08:00:00', tz=tz)), + Interval(Timestamp('2013-01-02 08:00:00', tz=tz), + Timestamp('2013-01-03 00:00:00', tz=tz))])) + .astype(CDT(ordered=True))) + tm.assert_series_equal(result, expected) + def test_datetime_bin(self): data = [np.datetime64('2012-12-13'), np.datetime64('2012-12-15')] bin_data = ['2012-12-12', '2012-12-14', '2012-12-16'] @@ -523,19 +589,3 @@ def f(): mask = result.isna() tm.assert_numpy_array_equal( mask, np.array([False, True, True, True, True])) - - @pytest.mark.parametrize( - "array_1_writeable, array_2_writeable", - [(True, True), (True, False), (False, False)]) - def test_cut_read_only(self, array_1_writeable, array_2_writeable): - # issue 18773 - array_1 = np.arange(0, 100, 10) - array_1.flags.writeable = array_1_writeable - - array_2 = np.arange(0, 100, 10) - array_2.flags.writeable = array_2_writeable - - hundred_elements = 
np.arange(100) - - tm.assert_categorical_equal(cut(hundred_elements, array_1), - cut(hundred_elements, array_2)) diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 7695c944092323..cde5baf47c18e7 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -4,6 +4,7 @@ import pytest import dateutil import calendar +import locale import numpy as np from dateutil.tz import tzutc @@ -21,7 +22,7 @@ from pandas.errors import OutOfBoundsDatetime from pandas.compat import long, PY3 from pandas.compat.numpy import np_datetime64_compat -from pandas import Timestamp, Period, Timedelta +from pandas import Timestamp, Period, Timedelta, NaT class TestTimestampProperties(object): @@ -95,13 +96,33 @@ def check(value, equal): for end in ends: assert getattr(ts, end) - @pytest.mark.parametrize('data, expected', - [(Timestamp('2017-08-28 23:00:00'), 'Monday'), - (Timestamp('2017-08-28 23:00:00', tz='EST'), - 'Monday')]) - def test_weekday_name(self, data, expected): + # GH 12806 + @pytest.mark.parametrize('data', + [Timestamp('2017-08-28 23:00:00'), + Timestamp('2017-08-28 23:00:00', tz='EST')]) + @pytest.mark.parametrize('time_locale', [ + None] if tm.get_locales() is None else [None] + tm.get_locales()) + def test_names(self, data, time_locale): # GH 17354 - assert data.weekday_name == expected + # Test .weekday_name, .day_name(), .month_name + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + assert data.weekday_name == 'Monday' + if time_locale is None: + expected_day = 'Monday' + expected_month = 'August' + else: + with tm.set_locale(time_locale, locale.LC_TIME): + expected_day = calendar.day_name[0].capitalize() + expected_month = calendar.month_name[8].capitalize() + + assert data.day_name(time_locale) == expected_day + assert data.month_name(time_locale) == expected_month + + # Test NaT + nan_ts = Timestamp(NaT) + assert np.isnan(nan_ts.day_name(time_locale)) + assert np.isnan(nan_ts.month_name(time_locale)) @pytest.mark.parametrize('tz', [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']) def test_is_leap_year(self, tz): @@ -385,6 +406,27 @@ def test_constructor_fromordinal(self): ts = Timestamp.fromordinal(dt_tz.toordinal(), tz='US/Eastern') assert ts.to_pydatetime() == dt_tz + @pytest.mark.parametrize('result', [ + Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), nanosecond=1), + Timestamp(year=2000, month=1, day=2, hour=3, minute=4, second=5, + microsecond=6, nanosecond=1), + Timestamp(year=2000, month=1, day=2, hour=3, minute=4, second=5, + microsecond=6, nanosecond=1, tz='UTC'), + Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, None), + Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, pytz.UTC)]) + def test_constructor_nanosecond(self, result): + # GH 18898 + expected = Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), tz=result.tz) + expected = expected + Timedelta(nanoseconds=1) + assert result == expected + + @pytest.mark.parametrize('arg', ['year', 'month', 'day', 'hour', 'minute', + 'second', 'microsecond', 'nanosecond']) + def test_invalid_date_kwarg_with_string_input(self, arg): + kwarg = {arg: 1} + with pytest.raises(ValueError): + Timestamp('2010-10-10 12:59:59.999999999', **kwarg) + def test_out_of_bounds_value(self): one_us = np.timedelta64(1).astype('timedelta64[us]') diff --git a/pandas/tests/series/indexing/__init__.py b/pandas/tests/series/indexing/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/pandas/tests/series/indexing/conftest.py 
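`test_constructor_nanosecond` above pins down the new `nanosecond` field in the `Timestamp` constructor (GH 18898); it is accepted as a keyword or as the positional argument after `microsecond`, with `tzinfo` last. A sketch, assuming the patched build:

```python
import pandas as pd

# nanosecond as a keyword...
t1 = pd.Timestamp(year=2000, month=1, day=2, hour=3, minute=4,
                  second=5, microsecond=6, nanosecond=1)
# ...or positionally, after microsecond and before tzinfo.
t2 = pd.Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, None)

print(t1 == t2)        # True
print(t1.nanosecond)   # 1
```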
b/pandas/tests/series/indexing/conftest.py new file mode 100644 index 00000000000000..0e06f6b8e4640a --- /dev/null +++ b/pandas/tests/series/indexing/conftest.py @@ -0,0 +1,8 @@ +import pytest + +from pandas.tests.series.common import TestData + + +@pytest.fixture(scope='module') +def test_data(): + return TestData() diff --git a/pandas/tests/series/indexing/test_alter_index.py b/pandas/tests/series/indexing/test_alter_index.py new file mode 100644 index 00000000000000..c1b6d0a4522328 --- /dev/null +++ b/pandas/tests/series/indexing/test_alter_index.py @@ -0,0 +1,520 @@ +# coding=utf-8 +# pylint: disable-msg=E1101,W0612 + +import pytest + +from datetime import datetime + +import pandas as pd +import numpy as np + +from numpy import nan + +from pandas import compat + +from pandas import (Series, date_range, isna, Categorical) +from pandas.compat import lrange, range + +from pandas.util.testing import (assert_series_equal) +import pandas.util.testing as tm + +JOIN_TYPES = ['inner', 'outer', 'left', 'right'] + + +@pytest.mark.parametrize( + 'first_slice,second_slice', [ + [[2, None], [None, -5]], + [[None, 0], [None, -5]], + [[None, -5], [None, 0]], + [[None, 0], [None, 0]] + ]) +@pytest.mark.parametrize('join_type', JOIN_TYPES) +@pytest.mark.parametrize('fill', [None, -1]) +def test_align(test_data, first_slice, second_slice, join_type, fill): + a = test_data.ts[slice(*first_slice)] + b = test_data.ts[slice(*second_slice)] + + aa, ab = a.align(b, join=join_type, fill_value=fill) + + join_index = a.index.join(b.index, how=join_type) + if fill is not None: + diff_a = aa.index.difference(join_index) + diff_b = ab.index.difference(join_index) + if len(diff_a) > 0: + assert (aa.reindex(diff_a) == fill).all() + if len(diff_b) > 0: + assert (ab.reindex(diff_b) == fill).all() + + ea = a.reindex(join_index) + eb = b.reindex(join_index) + + if fill is not None: + ea = ea.fillna(fill) + eb = eb.fillna(fill) + + assert_series_equal(aa, ea) + assert_series_equal(ab, eb) + assert aa.name == 'ts' + assert ea.name == 'ts' + assert ab.name == 'ts' + assert eb.name == 'ts' + + +@pytest.mark.parametrize( + 'first_slice,second_slice', [ + [[2, None], [None, -5]], + [[None, 0], [None, -5]], + [[None, -5], [None, 0]], + [[None, 0], [None, 0]] + ]) +@pytest.mark.parametrize('join_type', JOIN_TYPES) +@pytest.mark.parametrize('method', ['pad', 'bfill']) +@pytest.mark.parametrize('limit', [None, 1]) +def test_align_fill_method(test_data, + first_slice, second_slice, + join_type, method, limit): + a = test_data.ts[slice(*first_slice)] + b = test_data.ts[slice(*second_slice)] + + aa, ab = a.align(b, join=join_type, method=method, limit=limit) + + join_index = a.index.join(b.index, how=join_type) + ea = a.reindex(join_index) + eb = b.reindex(join_index) + + ea = ea.fillna(method=method, limit=limit) + eb = eb.fillna(method=method, limit=limit) + + assert_series_equal(aa, ea) + assert_series_equal(ab, eb) + + +def test_align_nocopy(test_data): + b = test_data.ts[:5].copy() + + # do copy + a = test_data.ts.copy() + ra, _ = a.align(b, join='left') + ra[:5] = 5 + assert not (a[:5] == 5).any() + + # do not copy + a = test_data.ts.copy() + ra, _ = a.align(b, join='left', copy=False) + ra[:5] = 5 + assert (a[:5] == 5).all() + + # do copy + a = test_data.ts.copy() + b = test_data.ts[:5].copy() + _, rb = a.align(b, join='right') + rb[:3] = 5 + assert not (b[:3] == 5).any() + + # do not copy + a = test_data.ts.copy() + b = test_data.ts[:5].copy() + _, rb = a.align(b, join='right', copy=False) + rb[:2] = 5 + assert (b[:2] == 
5).all() + + +def test_align_same_index(test_data): + a, b = test_data.ts.align(test_data.ts, copy=False) + assert a.index is test_data.ts.index + assert b.index is test_data.ts.index + + a, b = test_data.ts.align(test_data.ts, copy=True) + assert a.index is not test_data.ts.index + assert b.index is not test_data.ts.index + + +def test_align_multiindex(): + # GH 10665 + + midx = pd.MultiIndex.from_product([range(2), range(3), range(2)], + names=('a', 'b', 'c')) + idx = pd.Index(range(2), name='b') + s1 = pd.Series(np.arange(12, dtype='int64'), index=midx) + s2 = pd.Series(np.arange(2, dtype='int64'), index=idx) + + # these must give the same results, with left and right swapped + res1l, res1r = s1.align(s2, join='left') + res2l, res2r = s2.align(s1, join='right') + + expl = s1 + tm.assert_series_equal(expl, res1l) + tm.assert_series_equal(expl, res2r) + expr = pd.Series([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) + tm.assert_series_equal(expr, res1r) + tm.assert_series_equal(expr, res2l) + + res1l, res1r = s1.align(s2, join='right') + res2l, res2r = s2.align(s1, join='left') + + exp_idx = pd.MultiIndex.from_product([range(2), range(2), range(2)], + names=('a', 'b', 'c')) + expl = pd.Series([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) + tm.assert_series_equal(expl, res1l) + tm.assert_series_equal(expl, res2r) + expr = pd.Series([0, 0, 1, 1] * 2, index=exp_idx) + tm.assert_series_equal(expr, res1r) + tm.assert_series_equal(expr, res2l) + + +def test_reindex(test_data): + identity = test_data.series.reindex(test_data.series.index) + + # __array_interface__ is not defined on older numpy versions + # and some Python builds + try: + assert np.may_share_memory(test_data.series.index, identity.index) + except AttributeError: + pass + + assert identity.index.is_(test_data.series.index) + assert identity.index.identical(test_data.series.index) + + subIndex = test_data.series.index[10:20] + subSeries = test_data.series.reindex(subIndex) + + for idx, val in compat.iteritems(subSeries): + assert val == test_data.series[idx] + + subIndex2 = test_data.ts.index[10:20] + subTS = test_data.ts.reindex(subIndex2) + + for idx, val in compat.iteritems(subTS): + assert val == test_data.ts[idx] + + stuffSeries = test_data.ts.reindex(subIndex) + + assert np.isnan(stuffSeries).all() + + # non-contiguous index: the Cython take path must handle this correctly + nonContigIndex = test_data.ts.index[::2] + subNonContig = test_data.ts.reindex(nonContigIndex) + for idx, val in compat.iteritems(subNonContig): + assert val == test_data.ts[idx] + + # reindex with no index argument should return a copy with the same index + result = test_data.ts.reindex() + assert not (result is test_data.ts) + + +def test_reindex_nan(): + ts = Series([2, 3, 5, 7], index=[1, 4, nan, 8]) + + i, j = [nan, 1, nan, 8, 4, nan], [2, 0, 2, 3, 1, 2] + assert_series_equal(ts.reindex(i), ts.iloc[j]) + + ts.index = ts.index.astype('object') + + # reindex coerces index.dtype to float, loc/iloc doesn't + assert_series_equal(ts.reindex(i), ts.iloc[j], check_index_type=False) + + +def test_reindex_series_add_nat(): + rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s') + series = Series(rng) + + result = series.reindex(lrange(15)) + assert np.issubdtype(result.dtype, np.dtype('M8[ns]')) + + mask = result.isna() + assert mask[-5:].all() + assert not mask[:-5].any() + + +def test_reindex_with_datetimes(): + rng = date_range('1/1/2000', periods=20) + ts = Series(np.random.randn(20), index=rng) + + result = ts.reindex(list(ts.index[5:10])) + expected = ts[5:10] + tm.assert_series_equal(result, expected) + + result = ts[list(ts.index[5:10])] + tm.assert_series_equal(result, expected) + + +def test_reindex_corner(test_data): + # corner case: pad an empty series; should not raise + reindexed = test_data.empty.reindex(test_data.ts.index, method='pad') + + # pass non-Index + reindexed = test_data.ts.reindex(list(test_data.ts.index)) + assert_series_equal(test_data.ts, reindexed) + + # bad fill method + ts = test_data.ts[::2] + pytest.raises(Exception, ts.reindex, test_data.ts.index, method='foo') + + +def test_reindex_pad(): + s = Series(np.arange(10), dtype='int64') + s2 = s[::2] + + reindexed = s2.reindex(s.index, method='pad') + reindexed2 = s2.reindex(s.index, method='ffill') + assert_series_equal(reindexed, reindexed2) + + expected = Series([0, 0, 2, 2, 4, 4, 6, 6, 8, 8], index=np.arange(10)) + assert_series_equal(reindexed, expected) + + # GH4604 + s = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e']) + new_index = ['a', 'g', 'c', 'f'] + expected = Series([1, 1, 3, 3], index=new_index) + + # this changes dtype because the ffill happens after the reindex + # has already introduced NaN + result = s.reindex(new_index).ffill() + assert_series_equal(result, expected.astype('float64')) + + result = s.reindex(new_index).ffill(downcast='infer') + assert_series_equal(result, expected) + + expected = Series([1, 5, 3, 5], index=new_index) + result = s.reindex(new_index, method='ffill') + assert_series_equal(result, expected) + + # inference of new dtype + s = Series([True, False, False, True], index=list('abcd')) + new_index = 'agc' + result = s.reindex(list(new_index)).ffill() + expected = Series([True, True, False], index=list(new_index)) + assert_series_equal(result, expected) + + # GH4618 shifted series downcasting + s = Series(False, index=lrange(0, 5)) + result = s.shift(1).fillna(method='bfill') + expected = Series(False, index=lrange(0, 5)) + assert_series_equal(result, expected) + + +def test_reindex_nearest(): + s = Series(np.arange(10, dtype='int64')) + target = [0.1, 0.9, 1.5, 2.0] + actual = s.reindex(target, method='nearest') + expected = Series(np.around(target).astype('int64'), target) + assert_series_equal(expected, actual) + + actual = s.reindex_like(actual, method='nearest') + assert_series_equal(expected, actual) + + actual = s.reindex_like(actual, method='nearest', tolerance=1) + assert_series_equal(expected, actual) + actual = s.reindex_like(actual, method='nearest', + tolerance=[1, 2, 3, 4]) + assert_series_equal(expected, actual) + + actual = s.reindex(target, method='nearest', tolerance=0.2) + expected = Series([0, 1, np.nan, 2], target) + assert_series_equal(expected, actual) + + actual = s.reindex(target, method='nearest', + tolerance=[0.3, 0.01, 0.4, 3]) + expected = Series([0, np.nan, np.nan, 2], target) + assert_series_equal(expected, actual) + + +def test_reindex_backfill(): + pass + + +def test_reindex_int(test_data): + ts = test_data.ts[::2] + int_ts = Series(np.zeros(len(ts), dtype=int), index=ts.index) + + # this should work fine + reindexed_int = int_ts.reindex(test_data.ts.index) + + # if NaNs are introduced, the dtype is upcast to float + assert reindexed_int.dtype == np.float_ + + # NO NaNs introduced + reindexed_int = int_ts.reindex(int_ts.index[::2]) + assert reindexed_int.dtype == np.int_ + + +def test_reindex_bool(test_data): + # A series other than float, int, string, or object + ts = test_data.ts[::2] + bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index) + + # this should work fine + reindexed_bool = 
bool_ts.reindex(test_data.ts.index) + + # if NaNs introduced + assert reindexed_bool.dtype == np.object_ + + # NO NaNs introduced + reindexed_bool = bool_ts.reindex(bool_ts.index[::2]) + assert reindexed_bool.dtype == np.bool_ + + +def test_reindex_bool_pad(test_data): + # fail + ts = test_data.ts[5:] + bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index) + filled_bool = bool_ts.reindex(test_data.ts.index, method='pad') + assert isna(filled_bool[:5]).all() + + +def test_reindex_categorical(): + index = date_range('20000101', periods=3) + + # reindexing to an invalid Categorical + s = Series(['a', 'b', 'c'], dtype='category') + result = s.reindex(index) + expected = Series(Categorical(values=[np.nan, np.nan, np.nan], + categories=['a', 'b', 'c'])) + expected.index = index + tm.assert_series_equal(result, expected) + + # partial reindexing + expected = Series(Categorical(values=['b', 'c'], categories=['a', 'b', + 'c'])) + expected.index = [1, 2] + result = s.reindex([1, 2]) + tm.assert_series_equal(result, expected) + + expected = Series(Categorical( + values=['c', np.nan], categories=['a', 'b', 'c'])) + expected.index = [2, 3] + result = s.reindex([2, 3]) + tm.assert_series_equal(result, expected) + + +def test_reindex_like(test_data): + other = test_data.ts[::2] + assert_series_equal(test_data.ts.reindex(other.index), + test_data.ts.reindex_like(other)) + + # GH 7179 + day1 = datetime(2013, 3, 5) + day2 = datetime(2013, 5, 5) + day3 = datetime(2014, 3, 5) + + series1 = Series([5, None, None], [day1, day2, day3]) + series2 = Series([None, None], [day1, day3]) + + result = series1.reindex_like(series2, method='pad') + expected = Series([5, np.nan], index=[day1, day3]) + assert_series_equal(result, expected) + + +def test_reindex_fill_value(): + # ----------------------------------------------------------- + # floats + floats = Series([1., 2., 3.]) + result = floats.reindex([1, 2, 3]) + expected = Series([2., 3., np.nan], index=[1, 2, 3]) + assert_series_equal(result, expected) + + result = floats.reindex([1, 2, 3], fill_value=0) + expected = Series([2., 3., 0], index=[1, 2, 3]) + assert_series_equal(result, expected) + + # ----------------------------------------------------------- + # ints + ints = Series([1, 2, 3]) + + result = ints.reindex([1, 2, 3]) + expected = Series([2., 3., np.nan], index=[1, 2, 3]) + assert_series_equal(result, expected) + + # don't upcast + result = ints.reindex([1, 2, 3], fill_value=0) + expected = Series([2, 3, 0], index=[1, 2, 3]) + assert issubclass(result.dtype.type, np.integer) + assert_series_equal(result, expected) + + # ----------------------------------------------------------- + # objects + objects = Series([1, 2, 3], dtype=object) + + result = objects.reindex([1, 2, 3]) + expected = Series([2, 3, np.nan], index=[1, 2, 3], dtype=object) + assert_series_equal(result, expected) + + result = objects.reindex([1, 2, 3], fill_value='foo') + expected = Series([2, 3, 'foo'], index=[1, 2, 3], dtype=object) + assert_series_equal(result, expected) + + # ------------------------------------------------------------ + # bools + bools = Series([True, False, True]) + + result = bools.reindex([1, 2, 3]) + expected = Series([False, True, np.nan], index=[1, 2, 3], dtype=object) + assert_series_equal(result, expected) + + result = bools.reindex([1, 2, 3], fill_value=False) + expected = Series([False, True, False], index=[1, 2, 3]) + assert_series_equal(result, expected) + + +def test_rename(): + # GH 17407 + s = Series(range(1, 6), index=pd.Index(range(2, 7), 
name='IntIndex')) + result = s.rename(str) + expected = s.rename(lambda i: str(i)) + assert_series_equal(result, expected) + + assert result.name == expected.name + + +def test_drop(): + # unique + s = Series([1, 2], index=['one', 'two']) + expected = Series([1], index=['one']) + result = s.drop(['two']) + assert_series_equal(result, expected) + result = s.drop('two', axis='rows') + assert_series_equal(result, expected) + + # non-unique + # GH 5248 + s = Series([1, 1, 2], index=['one', 'two', 'one']) + expected = Series([1, 2], index=['one', 'one']) + result = s.drop(['two'], axis=0) + assert_series_equal(result, expected) + result = s.drop('two') + assert_series_equal(result, expected) + + expected = Series([1], index=['two']) + result = s.drop(['one']) + assert_series_equal(result, expected) + result = s.drop('one') + assert_series_equal(result, expected) + + # single string/tuple-like + s = Series(range(3), index=list('abc')) + pytest.raises(KeyError, s.drop, 'bc') + pytest.raises(KeyError, s.drop, ('a',)) + + # errors='ignore' + s = Series(range(3), index=list('abc')) + result = s.drop('bc', errors='ignore') + assert_series_equal(result, s) + result = s.drop(['a', 'd'], errors='ignore') + expected = s.iloc[1:] + assert_series_equal(result, expected) + + # bad axis + pytest.raises(ValueError, s.drop, 'one', axis='columns') + + # GH 8522 + s = Series([2, 3], index=[True, False]) + assert s.index.is_object() + result = s.drop(True) + expected = Series([3], index=[False]) + assert_series_equal(result, expected) + + # GH 16877 + s = Series([2, 3], index=[0, 1]) + with tm.assert_raises_regex(KeyError, 'not contained in axis'): + s.drop([False, True]) diff --git a/pandas/tests/series/indexing/test_boolean.py b/pandas/tests/series/indexing/test_boolean.py new file mode 100644 index 00000000000000..f1f4a5a05697da --- /dev/null +++ b/pandas/tests/series/indexing/test_boolean.py @@ -0,0 +1,603 @@ +# coding=utf-8 +# pylint: disable-msg=E1101,W0612 + +import pytest + +import pandas as pd +import numpy as np + +from pandas import (Series, date_range, isna, Index, Timestamp) +from pandas.compat import lrange, range +from pandas.core.dtypes.common import is_integer + +from pandas.core.indexing import IndexingError +from pandas.tseries.offsets import BDay + +from pandas.util.testing import (assert_series_equal) +import pandas.util.testing as tm + +JOIN_TYPES = ['inner', 'outer', 'left', 'right'] + + +def test_getitem_boolean(test_data): + s = test_data.series + mask = s > s.median() + + # passing list is OK + result = s[list(mask)] + expected = s[mask] + assert_series_equal(result, expected) + tm.assert_index_equal(result.index, s.index[mask]) + + +def test_getitem_boolean_empty(): + s = Series([], dtype=np.int64) + s.index.name = 'index_name' + s = s[s.isna()] + assert s.index.name == 'index_name' + assert s.dtype == np.int64 + + # GH5877 + # indexing with empty series + s = Series(['A', 'B']) + expected = Series(np.nan, index=['C'], dtype=object) + result = s[Series(['C'], dtype=object)] + assert_series_equal(result, expected) + + s = Series(['A', 'B']) + expected = Series(dtype=object, index=Index([], dtype='int64')) + result = s[Series([], dtype=object)] + assert_series_equal(result, expected) + + # invalid because of the boolean indexer + # that's empty or not-aligned + def f(): + s[Series([], dtype=bool)] + + pytest.raises(IndexingError, f) + + def f(): + s[Series([True], dtype=bool)] + + pytest.raises(IndexingError, f) + + +def test_getitem_boolean_object(test_data): + # using column from 
DataFrame + + s = test_data.series + mask = s > s.median() + omask = mask.astype(object) + + # getitem + result = s[omask] + expected = s[mask] + assert_series_equal(result, expected) + + # setitem + s2 = s.copy() + cop = s.copy() + cop[omask] = 5 + s2[mask] = 5 + assert_series_equal(cop, s2) + + # NaNs in the object mask raise an exception + omask[5:10] = np.nan + pytest.raises(Exception, s.__getitem__, omask) + pytest.raises(Exception, s.__setitem__, omask, 5) + + +def test_getitem_setitem_boolean_corner(test_data): + ts = test_data.ts + mask_shifted = ts.shift(1, freq=BDay()) > ts.median() + + # a boolean indexer whose index is not aligned with the Series + # should raise + + pytest.raises(Exception, ts.__getitem__, mask_shifted) + pytest.raises(Exception, ts.__setitem__, mask_shifted, 1) + # ts[mask_shifted] + # ts[mask_shifted] = 1 + + pytest.raises(Exception, ts.loc.__getitem__, mask_shifted) + pytest.raises(Exception, ts.loc.__setitem__, mask_shifted, 1) + # ts.loc[mask_shifted] + # ts.loc[mask_shifted] = 2 + + +def test_setitem_boolean(test_data): + mask = test_data.series > test_data.series.median() + + # similar indexed series + result = test_data.series.copy() + result[mask] = test_data.series * 2 + expected = test_data.series * 2 + assert_series_equal(result[mask], expected[mask]) + + # needs alignment + result = test_data.series.copy() + result[mask] = (test_data.series * 2)[0:5] + expected = (test_data.series * 2)[0:5].reindex_like(test_data.series) + expected[-mask] = test_data.series[mask] + assert_series_equal(result[mask], expected[mask]) + + +def test_get_set_boolean_different_order(test_data): + ordered = test_data.series.sort_values() + + # setting + copy = test_data.series.copy() + copy[ordered > 0] = 0 + + expected = test_data.series.copy() + expected[expected > 0] = 0 + + assert_series_equal(copy, expected) + + # getting + sel = test_data.series[ordered > 0] + exp = test_data.series[test_data.series > 0] + assert_series_equal(sel, exp) + + +def test_where_unsafe(): + # unsafe dtype changes + for dtype in [np.int8, np.int16, np.int32, np.int64, np.float16, + np.float32, np.float64]: + s = Series(np.arange(10), dtype=dtype) + mask = s < 5 + s[mask] = lrange(2, 7) + expected = Series(lrange(2, 7) + lrange(5, 10), dtype=dtype) + assert_series_equal(s, expected) + assert s.dtype == expected.dtype + + # these are allowed operations, but the result is upcast to float64 + for dtype in [np.int64, np.float64]: + s = Series(np.arange(10), dtype=dtype) + mask = s < 5 + values = [2.5, 3.5, 4.5, 5.5, 6.5] + s[mask] = values + expected = Series(values + lrange(5, 10), dtype='float64') + assert_series_equal(s, expected) + assert s.dtype == expected.dtype + + # GH 9731 + s = Series(np.arange(10), dtype='int64') + mask = s > 5 + values = [2.5, 3.5, 4.5, 5.5] + s[mask] = values + expected = Series(lrange(6) + values, dtype='float64') + assert_series_equal(s, expected) + + # these raise: the values would force the input's itemsize to change, + # which cannot be done safely + for dtype in [np.int8, np.int16, np.int32, np.float16, np.float32]: + s = Series(np.arange(10), dtype=dtype) + mask = s < 5 + values = [2.5, 3.5, 4.5, 5.5, 6.5] + pytest.raises(Exception, s.__setitem__, tuple(mask), values) + + # GH3235 + s = Series(np.arange(10), dtype='int64') + mask = s < 5 + s[mask] = lrange(2, 7) + expected = Series(lrange(2, 7) + lrange(5, 10), dtype='int64') + assert_series_equal(s, expected) + assert s.dtype == expected.dtype + + s = Series(np.arange(10), dtype='int64') + mask = s > 5 + s[mask] = [0] * 4 + expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype='int64') + 
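For readers skimming these cases: the invariant they pin down is that masked assignment keeps the existing dtype when the new values fit it, and upcasts the whole Series when they do not. A minimal sketch (the values are illustrative, not taken from the suite):

```
import numpy as np
import pandas as pd

s = pd.Series(np.arange(10), dtype='int64')
s[s < 5] = [2.5, 3.5, 4.5, 5.5, 6.5]  # floats into an int64 block
assert s.dtype == np.float64          # whole Series is upcast

s2 = pd.Series(np.arange(10), dtype='int64')
s2[s2 < 5] = list(range(2, 7))        # ints fit, so no upcast
assert s2.dtype == np.int64
```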
assert_series_equal(s, expected) + + s = Series(np.arange(10)) + mask = s > 5 + + def f(): + s[mask] = [5, 4, 3, 2, 1] + + pytest.raises(ValueError, f) + + def f(): + s[mask] = [0] * 5 + + pytest.raises(ValueError, f) + + # dtype changes + s = Series([1, 2, 3, 4]) + result = s.where(s > 2, np.nan) + expected = Series([np.nan, np.nan, 3, 4]) + assert_series_equal(result, expected) + + # GH 4667 + # setting with None changes dtype + s = Series(range(10)).astype(float) + s[8] = None + result = s[8] + assert isna(result) + + s = Series(range(10)).astype(float) + s[s > 8] = None + result = s[isna(s)] + expected = Series(np.nan, index=[9]) + assert_series_equal(result, expected) + + +def test_where_raise_on_error_deprecation(): + # gh-14968 + # deprecation of raise_on_error + s = Series(np.random.randn(5)) + cond = s > 0 + with tm.assert_produces_warning(FutureWarning): + s.where(cond, raise_on_error=True) + with tm.assert_produces_warning(FutureWarning): + s.mask(cond, raise_on_error=True) + + +def test_where(): + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.where(cond).dropna() + rs2 = s[cond] + assert_series_equal(rs, rs2) + + rs = s.where(cond, -s) + assert_series_equal(rs, s.abs()) + + rs = s.where(cond) + assert (s.shape == rs.shape) + assert (rs is not s) + + # test alignment + cond = Series([True, False, False, True, False], index=s.index) + s2 = -(s.abs()) + + expected = s2[cond].reindex(s2.index[:3]).reindex(s2.index) + rs = s2.where(cond[:3]) + assert_series_equal(rs, expected) + + expected = s2.abs() + expected.iloc[0] = s2[0] + rs = s2.where(cond[:3], -s2) + assert_series_equal(rs, expected) + + +def test_where_error(): + s = Series(np.random.randn(5)) + cond = s > 0 + + pytest.raises(ValueError, s.where, 1) + pytest.raises(ValueError, s.where, cond[:3].values, -s) + + # GH 2745 + s = Series([1, 2]) + s[[True, False]] = [0, 1] + expected = Series([0, 2]) + assert_series_equal(s, expected) + + # failures + pytest.raises(ValueError, s.__setitem__, tuple([[[True, False]]]), + [0, 2, 3]) + pytest.raises(ValueError, s.__setitem__, tuple([[[True, False]]]), + []) + + +@pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) +def test_where_array_like(klass): + # see gh-15414 + s = Series([1, 2, 3]) + cond = [False, True, True] + expected = Series([np.nan, 2, 3]) + + result = s.where(klass(cond)) + assert_series_equal(result, expected) + + +@pytest.mark.parametrize('cond', [ + [1, 0, 1], + Series([2, 5, 7]), + ["True", "False", "True"], + [Timestamp("2017-01-01"), pd.NaT, Timestamp("2017-01-02")] +]) +def test_where_invalid_input(cond): + # see gh-15414: only boolean arrays accepted + s = Series([1, 2, 3]) + msg = "Boolean array expected for the condition" + + with tm.assert_raises_regex(ValueError, msg): + s.where(cond) + + msg = "Array conditional must be same shape as self" + with tm.assert_raises_regex(ValueError, msg): + s.where([True]) + + +def test_where_ndframe_align(): + msg = "Array conditional must be same shape as self" + s = Series([1, 2, 3]) + + cond = [True] + with tm.assert_raises_regex(ValueError, msg): + s.where(cond) + + expected = Series([1, np.nan, np.nan]) + + out = s.where(Series(cond)) + tm.assert_series_equal(out, expected) + + cond = np.array([False, True, False, True]) + with tm.assert_raises_regex(ValueError, msg): + s.where(cond) + + expected = Series([np.nan, 2, np.nan]) + + out = s.where(Series(cond)) + tm.assert_series_equal(out, expected) + + +def test_where_setitem_invalid(): + # GH 2702 + # make sure correct exceptions are raised on 
invalid list assignment + + # slice + s = Series(list('abc')) + + def f(): + s[0:3] = list(range(27)) + + pytest.raises(ValueError, f) + + s[0:3] = list(range(3)) + expected = Series([0, 1, 2]) + assert_series_equal(s.astype(np.int64), expected, ) + + # slice with step + s = Series(list('abcdef')) + + def f(): + s[0:4:2] = list(range(27)) + + pytest.raises(ValueError, f) + + s = Series(list('abcdef')) + s[0:4:2] = list(range(2)) + expected = Series([0, 'b', 1, 'd', 'e', 'f']) + assert_series_equal(s, expected) + + # neg slices + s = Series(list('abcdef')) + + def f(): + s[:-1] = list(range(27)) + + pytest.raises(ValueError, f) + + s[-3:-1] = list(range(2)) + expected = Series(['a', 'b', 'c', 0, 1, 'f']) + assert_series_equal(s, expected) + + # list + s = Series(list('abc')) + + def f(): + s[[0, 1, 2]] = list(range(27)) + + pytest.raises(ValueError, f) + + s = Series(list('abc')) + + def f(): + s[[0, 1, 2]] = list(range(2)) + + pytest.raises(ValueError, f) + + # scalar + s = Series(list('abc')) + s[0] = list(range(10)) + expected = Series([list(range(10)), 'b', 'c']) + assert_series_equal(s, expected) + + +@pytest.mark.parametrize('size', range(2, 6)) +@pytest.mark.parametrize('mask', [ + [True, False, False, False, False], + [True, False], + [False] +]) +@pytest.mark.parametrize('item', [ + 2.0, np.nan, np.finfo(np.float).max, np.finfo(np.float).min +]) +# Test numpy arrays, lists and tuples as the input to be +# broadcast +@pytest.mark.parametrize('box', [ + lambda x: np.array([x]), + lambda x: [x], + lambda x: (x,) +]) +def test_broadcast(size, mask, item, box): + selection = np.resize(mask, size) + + data = np.arange(size, dtype=float) + + # Construct the expected series by taking the source + # data or item based on the selection + expected = Series([item if use_item else data[ + i] for i, use_item in enumerate(selection)]) + + s = Series(data) + s[selection] = box(item) + assert_series_equal(s, expected) + + s = Series(data) + result = s.where(~selection, box(item)) + assert_series_equal(result, expected) + + s = Series(data) + result = s.mask(selection, box(item)) + assert_series_equal(result, expected) + + +def test_where_inplace(): + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.copy() + + rs.where(cond, inplace=True) + assert_series_equal(rs.dropna(), s[cond]) + assert_series_equal(rs, s.where(cond)) + + rs = s.copy() + rs.where(cond, -s, inplace=True) + assert_series_equal(rs, s.where(cond, -s)) + + +def test_where_dups(): + # GH 4550 + # where crashes with dups in index + s1 = Series(list(range(3))) + s2 = Series(list(range(3))) + comb = pd.concat([s1, s2]) + result = comb.where(comb < 2) + expected = Series([0, 1, np.nan, 0, 1, np.nan], + index=[0, 1, 2, 0, 1, 2]) + assert_series_equal(result, expected) + + # GH 4548 + # inplace updating not working with dups + comb[comb < 1] = 5 + expected = Series([5, 1, 2, 5, 1, 2], index=[0, 1, 2, 0, 1, 2]) + assert_series_equal(comb, expected) + + comb[comb < 2] += 10 + expected = Series([5, 11, 2, 5, 11, 2], index=[0, 1, 2, 0, 1, 2]) + assert_series_equal(comb, expected) + + +def test_where_numeric_with_string(): + # GH 9280 + s = pd.Series([1, 2, 3]) + w = s.where(s > 1, 'X') + + assert not is_integer(w[0]) + assert is_integer(w[1]) + assert is_integer(w[2]) + assert isinstance(w[0], str) + assert w.dtype == 'object' + + w = s.where(s > 1, ['X', 'Y', 'Z']) + assert not is_integer(w[0]) + assert is_integer(w[1]) + assert is_integer(w[2]) + assert isinstance(w[0], str) + assert w.dtype == 'object' + + w = s.where(s > 1, 
np.array(['X', 'Y', 'Z'])) + assert not is_integer(w[0]) + assert is_integer(w[1]) + assert is_integer(w[2]) + assert isinstance(w[0], str) + assert w.dtype == 'object' + + +def test_where_timedelta_coerce(): + s = Series([1, 2], dtype='timedelta64[ns]') + expected = Series([10, 10]) + mask = np.array([False, False]) + + rs = s.where(mask, [10, 10]) + assert_series_equal(rs, expected) + + rs = s.where(mask, 10) + assert_series_equal(rs, expected) + + rs = s.where(mask, 10.0) + assert_series_equal(rs, expected) + + rs = s.where(mask, [10.0, 10.0]) + assert_series_equal(rs, expected) + + rs = s.where(mask, [10.0, np.nan]) + expected = Series([10, None], dtype='object') + assert_series_equal(rs, expected) + + +def test_where_datetime_conversion(): + s = Series(date_range('20130102', periods=2)) + expected = Series([10, 10]) + mask = np.array([False, False]) + + rs = s.where(mask, [10, 10]) + assert_series_equal(rs, expected) + + rs = s.where(mask, 10) + assert_series_equal(rs, expected) + + rs = s.where(mask, 10.0) + assert_series_equal(rs, expected) + + rs = s.where(mask, [10.0, 10.0]) + assert_series_equal(rs, expected) + + rs = s.where(mask, [10.0, np.nan]) + expected = Series([10, None], dtype='object') + assert_series_equal(rs, expected) + + # GH 15701 + timestamps = ['2016-12-31 12:00:04+00:00', + '2016-12-31 12:00:04.010000+00:00'] + s = Series([pd.Timestamp(t) for t in timestamps]) + rs = s.where(Series([False, True])) + expected = Series([pd.NaT, s[1]]) + assert_series_equal(rs, expected) + + +def test_mask(): + # compare with tested results in test_where + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.where(~cond, np.nan) + assert_series_equal(rs, s.mask(cond)) + + rs = s.where(~cond) + rs2 = s.mask(cond) + assert_series_equal(rs, rs2) + + rs = s.where(~cond, -s) + rs2 = s.mask(cond, -s) + assert_series_equal(rs, rs2) + + cond = Series([True, False, False, True, False], index=s.index) + s2 = -(s.abs()) + rs = s2.where(~cond[:3]) + rs2 = s2.mask(cond[:3]) + assert_series_equal(rs, rs2) + + rs = s2.where(~cond[:3], -s2) + rs2 = s2.mask(cond[:3], -s2) + assert_series_equal(rs, rs2) + + pytest.raises(ValueError, s.mask, 1) + pytest.raises(ValueError, s.mask, cond[:3].values, -s) + + # dtype changes + s = Series([1, 2, 3, 4]) + result = s.mask(s > 2, np.nan) + expected = Series([1, 2, np.nan, np.nan]) + assert_series_equal(result, expected) + + +def test_mask_inplace(): + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.copy() + rs.mask(cond, inplace=True) + assert_series_equal(rs.dropna(), s[~cond]) + assert_series_equal(rs, s.mask(cond)) + + rs = s.copy() + rs.mask(cond, -s, inplace=True) + assert_series_equal(rs, s.mask(cond, -s)) diff --git a/pandas/tests/series/indexing/test_callable.py b/pandas/tests/series/indexing/test_callable.py new file mode 100644 index 00000000000000..b6561375459039 --- /dev/null +++ b/pandas/tests/series/indexing/test_callable.py @@ -0,0 +1,33 @@ +import pandas as pd +import pandas.util.testing as tm + + +def test_getitem_callable(): + # GH 12533 + s = pd.Series(4, index=list('ABCD')) + result = s[lambda x: 'A'] + assert result == s.loc['A'] + + result = s[lambda x: ['A', 'B']] + tm.assert_series_equal(result, s.loc[['A', 'B']]) + + result = s[lambda x: [True, False, True, True]] + tm.assert_series_equal(result, s.iloc[[0, 2, 3]]) + + +def test_setitem_callable(): + # GH 12533 + s = pd.Series([1, 2, 3, 4], index=list('ABCD')) + s[lambda x: 'A'] = -1 + tm.assert_series_equal(s, pd.Series([-1, 2, 3, 4], index=list('ABCD'))) + + +def 
test_setitem_other_callable(): + # GH 13299 + inc = lambda x: x + 1 + + s = pd.Series([1, 2, -1, 4]) + s[s < 0] = inc + + expected = pd.Series([1, 2, inc, 4]) + tm.assert_series_equal(s, expected) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py new file mode 100644 index 00000000000000..f484cdea2e09fb --- /dev/null +++ b/pandas/tests/series/indexing/test_datetime.py @@ -0,0 +1,710 @@ +# coding=utf-8 +# pylint: disable-msg=E1101,W0612 + +import pytest + +from datetime import datetime, timedelta + +import numpy as np +import pandas as pd + +from pandas import (Series, DataFrame, + date_range, Timestamp, DatetimeIndex, NaT) + +from pandas.compat import lrange, range +from pandas.util.testing import (assert_series_equal, + assert_frame_equal, assert_almost_equal) + +import pandas.util.testing as tm + +import pandas._libs.index as _index +from pandas._libs import tslib + +JOIN_TYPES = ['inner', 'outer', 'left', 'right'] + +""" +Also test support for datetime64[ns] in Series / DataFrame +""" + + +def test_fancy_getitem(): + dti = DatetimeIndex(freq='WOM-1FRI', start=datetime(2005, 1, 1), + end=datetime(2010, 1, 1)) + + s = Series(np.arange(len(dti)), index=dti) + + assert s[48] == 48 + assert s['1/2/2009'] == 48 + assert s['2009-1-2'] == 48 + assert s[datetime(2009, 1, 2)] == 48 + assert s[Timestamp(datetime(2009, 1, 2))] == 48 + pytest.raises(KeyError, s.__getitem__, '2009-1-3') + + assert_series_equal(s['3/6/2009':'2009-06-05'], + s[datetime(2009, 3, 6):datetime(2009, 6, 5)]) + + +def test_fancy_setitem(): + dti = DatetimeIndex(freq='WOM-1FRI', start=datetime(2005, 1, 1), + end=datetime(2010, 1, 1)) + + s = Series(np.arange(len(dti)), index=dti) + s[48] = -1 + assert s[48] == -1 + s['1/2/2009'] = -2 + assert s[48] == -2 + s['1/2/2009':'2009-06-05'] = -3 + assert (s[48:54] == -3).all() + + +def test_dti_snap(): + dti = DatetimeIndex(['1/1/2002', '1/2/2002', '1/3/2002', '1/4/2002', + '1/5/2002', '1/6/2002', '1/7/2002'], freq='D') + + res = dti.snap(freq='W-MON') + exp = date_range('12/31/2001', '1/7/2002', freq='w-mon') + exp = exp.repeat([3, 4]) + assert (res == exp).all() + + res = dti.snap(freq='B') + + exp = date_range('1/1/2002', '1/7/2002', freq='b') + exp = exp.repeat([1, 1, 1, 2, 2]) + assert (res == exp).all() + + +def test_dti_reset_index_round_trip(): + dti = DatetimeIndex(start='1/1/2001', end='6/1/2001', freq='D') + d1 = DataFrame({'v': np.random.rand(len(dti))}, index=dti) + d2 = d1.reset_index() + assert d2.dtypes[0] == np.dtype('M8[ns]') + d3 = d2.set_index('index') + assert_frame_equal(d1, d3, check_names=False) + + # #2329 + stamp = datetime(2012, 11, 22) + df = DataFrame([[stamp, 12.1]], columns=['Date', 'Value']) + df = df.set_index('Date') + + assert df.index[0] == stamp + assert df.reset_index()['Date'][0] == stamp + + +def test_series_set_value(): + # #1561 + + dates = [datetime(2001, 1, 1), datetime(2001, 1, 2)] + index = DatetimeIndex(dates) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + s = Series().set_value(dates[0], 1.) 
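The FutureWarning asserted here comes from the deprecation of Series.set_value; the replacement is label-based assignment, which also enlarges the Series. A sketch of the modern spelling, assuming that deprecation path:

```
from datetime import datetime

import numpy as np
import pandas as pd

dates = [datetime(2001, 1, 1), datetime(2001, 1, 2)]

# deprecated: s = Series().set_value(dates[0], 1.)
s = pd.Series(index=pd.DatetimeIndex([]))
s.loc[dates[0]] = 1.0   # enlarges the Series, like set_value did
s.loc[dates[1]] = np.nan
```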
+ with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + s2 = s.set_value(dates[1], np.nan) + + exp = Series([1., np.nan], index=index) + + assert_series_equal(s2, exp) + + # s = Series(index[:1], index[:1]) + # s2 = s.set_value(dates[1], index[1]) + # assert s2.values.dtype == 'M8[ns]' + + +@pytest.mark.slow +def test_slice_locs_indexerror(): + times = [datetime(2000, 1, 1) + timedelta(minutes=i * 10) + for i in range(100000)] + s = Series(lrange(100000), times) + s.loc[datetime(1900, 1, 1):datetime(2100, 1, 1)] + + +def test_slicing_datetimes(): + # GH 7523 + + # unique + df = DataFrame(np.arange(4., dtype='float64'), + index=[datetime(2001, 1, i, 10, 00) + for i in [1, 2, 3, 4]]) + result = df.loc[datetime(2001, 1, 1, 10):] + assert_frame_equal(result, df) + result = df.loc[:datetime(2001, 1, 4, 10)] + assert_frame_equal(result, df) + result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] + assert_frame_equal(result, df) + + result = df.loc[datetime(2001, 1, 1, 11):] + expected = df.iloc[1:] + assert_frame_equal(result, expected) + result = df.loc['20010101 11':] + assert_frame_equal(result, expected) + + # duplicates + df = pd.DataFrame(np.arange(5., dtype='float64'), + index=[datetime(2001, 1, i, 10, 00) + for i in [1, 2, 2, 3, 4]]) + + result = df.loc[datetime(2001, 1, 1, 10):] + assert_frame_equal(result, df) + result = df.loc[:datetime(2001, 1, 4, 10)] + assert_frame_equal(result, df) + result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] + assert_frame_equal(result, df) + + result = df.loc[datetime(2001, 1, 1, 11):] + expected = df.iloc[1:] + assert_frame_equal(result, expected) + result = df.loc['20010101 11':] + assert_frame_equal(result, expected) + + +def test_frame_datetime64_duplicated(): + dates = date_range('2010-07-01', end='2010-08-05') + + tst = DataFrame({'symbol': 'AAA', 'date': dates}) + result = tst.duplicated(['date', 'symbol']) + assert (-result).all() + + tst = DataFrame({'date': dates}) + result = tst.duplicated() + assert (-result).all() + + +def test_getitem_setitem_datetime_tz_pytz(): + from pytz import timezone as tz + from pandas import date_range + + N = 50 + # testing with timezone, GH #2785 + rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') + ts = Series(np.random.randn(N), index=rng) + + # also test Timestamp tz handling, GH #2789 + result = ts.copy() + result["1990-01-01 09:00:00+00:00"] = 0 + result["1990-01-01 09:00:00+00:00"] = ts[4] + assert_series_equal(result, ts) + + result = ts.copy() + result["1990-01-01 03:00:00-06:00"] = 0 + result["1990-01-01 03:00:00-06:00"] = ts[4] + assert_series_equal(result, ts) + + # repeat with datetimes + result = ts.copy() + result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0 + result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4] + assert_series_equal(result, ts) + + result = ts.copy() + + # comparison dates with datetime MUST be localized! 
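The localization requirement flagged in the comment above exists because a naive datetime is ambiguous against a tz-aware index, while a localized one names a single absolute instant. A minimal sketch of the working pattern:

```
from datetime import datetime

import numpy as np
import pandas as pd
from pytz import timezone

rng = pd.date_range('1/1/1990', periods=5, freq='H', tz='US/Eastern')
ts = pd.Series(np.arange(5), index=rng)

# localize first; a tz-aware key resolves to one absolute instant
key = timezone('US/Eastern').localize(datetime(1990, 1, 1, 3))
assert ts[key] == 3
```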
+ date = tz('US/Central').localize(datetime(1990, 1, 1, 3)) + result[date] = 0 + result[date] = ts[4] + assert_series_equal(result, ts) + + +def test_getitem_setitem_datetime_tz_dateutil(): + from dateutil.tz import tzutc + from pandas._libs.tslibs.timezones import dateutil_gettz as gettz + + tz = lambda x: tzutc() if x == 'UTC' else gettz( + x) # handle special case for utc in dateutil + + from pandas import date_range + + N = 50 + + # testing with timezone, GH #2785 + rng = date_range('1/1/1990', periods=N, freq='H', + tz='America/New_York') + ts = Series(np.random.randn(N), index=rng) + + # also test Timestamp tz handling, GH #2789 + result = ts.copy() + result["1990-01-01 09:00:00+00:00"] = 0 + result["1990-01-01 09:00:00+00:00"] = ts[4] + assert_series_equal(result, ts) + + result = ts.copy() + result["1990-01-01 03:00:00-06:00"] = 0 + result["1990-01-01 03:00:00-06:00"] = ts[4] + assert_series_equal(result, ts) + + # repeat with datetimes + result = ts.copy() + result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0 + result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4] + assert_series_equal(result, ts) + + result = ts.copy() + result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = 0 + result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = ts[4] + assert_series_equal(result, ts) + + +def test_getitem_setitem_datetimeindex(): + N = 50 + # testing with timezone, GH #2785 + rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') + ts = Series(np.random.randn(N), index=rng) + + result = ts["1990-01-01 04:00:00"] + expected = ts[4] + assert result == expected + + result = ts.copy() + result["1990-01-01 04:00:00"] = 0 + result["1990-01-01 04:00:00"] = ts[4] + assert_series_equal(result, ts) + + result = ts["1990-01-01 04:00:00":"1990-01-01 07:00:00"] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts.copy() + result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = 0 + result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = ts[4:8] + assert_series_equal(result, ts) + + lb = "1990-01-01 04:00:00" + rb = "1990-01-01 07:00:00" + # GH#18435 strings get a pass from tzawareness compat + result = ts[(ts.index >= lb) & (ts.index <= rb)] + expected = ts[4:8] + assert_series_equal(result, expected) + + lb = "1990-01-01 04:00:00-0500" + rb = "1990-01-01 07:00:00-0500" + result = ts[(ts.index >= lb) & (ts.index <= rb)] + expected = ts[4:8] + assert_series_equal(result, expected) + + # repeat all the above with naive datetimes + result = ts[datetime(1990, 1, 1, 4)] + expected = ts[4] + assert result == expected + + result = ts.copy() + result[datetime(1990, 1, 1, 4)] = 0 + result[datetime(1990, 1, 1, 4)] = ts[4] + assert_series_equal(result, ts) + + result = ts[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts.copy() + result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = 0 + result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = ts[4:8] + assert_series_equal(result, ts) + + lb = datetime(1990, 1, 1, 4) + rb = datetime(1990, 1, 1, 7) + with pytest.raises(TypeError): + # tznaive vs tzaware comparison is invalid + # see GH#18376, GH#18162 + ts[(ts.index >= lb) & (ts.index <= rb)] + + lb = pd.Timestamp(datetime(1990, 1, 1, 4)).tz_localize(rng.tzinfo) + rb = pd.Timestamp(datetime(1990, 1, 1, 7)).tz_localize(rng.tzinfo) + result = ts[(ts.index >= lb) & (ts.index <= rb)] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts[ts.index[4]] + expected = 
ts[4] + assert result == expected + + result = ts[ts.index[4:8]] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts.copy() + result[ts.index[4:8]] = 0 + result[4:8] = ts[4:8] + assert_series_equal(result, ts) + + # also test partial date slicing + result = ts["1990-01-02"] + expected = ts[24:48] + assert_series_equal(result, expected) + + result = ts.copy() + result["1990-01-02"] = 0 + result["1990-01-02"] = ts[24:48] + assert_series_equal(result, ts) + + +def test_getitem_setitem_periodindex(): + from pandas import period_range + + N = 50 + rng = period_range('1/1/1990', periods=N, freq='H') + ts = Series(np.random.randn(N), index=rng) + + result = ts["1990-01-01 04"] + expected = ts[4] + assert result == expected + + result = ts.copy() + result["1990-01-01 04"] = 0 + result["1990-01-01 04"] = ts[4] + assert_series_equal(result, ts) + + result = ts["1990-01-01 04":"1990-01-01 07"] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts.copy() + result["1990-01-01 04":"1990-01-01 07"] = 0 + result["1990-01-01 04":"1990-01-01 07"] = ts[4:8] + assert_series_equal(result, ts) + + lb = "1990-01-01 04" + rb = "1990-01-01 07" + result = ts[(ts.index >= lb) & (ts.index <= rb)] + expected = ts[4:8] + assert_series_equal(result, expected) + + # GH 2782 + result = ts[ts.index[4]] + expected = ts[4] + assert result == expected + + result = ts[ts.index[4:8]] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts.copy() + result[ts.index[4:8]] = 0 + result[4:8] = ts[4:8] + assert_series_equal(result, ts) + + +def test_getitem_median_slice_bug(): + index = date_range('20090415', '20090519', freq='2B') + s = Series(np.random.randn(13), index=index) + + indexer = [slice(6, 7, None)] + result = s[indexer] + expected = s[indexer[0]] + assert_series_equal(result, expected) + + +def test_datetime_indexing(): + from pandas import date_range + + index = date_range('1/1/2000', '1/7/2000') + index = index.repeat(3) + + s = Series(len(index), index=index) + stamp = Timestamp('1/8/2000') + + pytest.raises(KeyError, s.__getitem__, stamp) + s[stamp] = 0 + assert s[stamp] == 0 + + # not monotonic + s = Series(len(index), index=index) + s = s[::-1] + + pytest.raises(KeyError, s.__getitem__, stamp) + s[stamp] = 0 + assert s[stamp] == 0 + + +""" +test duplicates in time series +""" + + +@pytest.fixture(scope='module') +def dups(): + dates = [datetime(2000, 1, 2), datetime(2000, 1, 2), + datetime(2000, 1, 2), datetime(2000, 1, 3), + datetime(2000, 1, 3), datetime(2000, 1, 3), + datetime(2000, 1, 4), datetime(2000, 1, 4), + datetime(2000, 1, 4), datetime(2000, 1, 5)] + + return Series(np.random.randn(len(dates)), index=dates) + + +def test_constructor(dups): + assert isinstance(dups, Series) + assert isinstance(dups.index, DatetimeIndex) + + +def test_is_unique_monotonic(dups): + assert not dups.index.is_unique + + +def test_index_unique(dups): + uniques = dups.index.unique() + expected = DatetimeIndex([datetime(2000, 1, 2), datetime(2000, 1, 3), + datetime(2000, 1, 4), datetime(2000, 1, 5)]) + assert uniques.dtype == 'M8[ns]' # sanity + tm.assert_index_equal(uniques, expected) + assert dups.index.nunique() == 4 + + # #2563 + assert isinstance(uniques, DatetimeIndex) + + dups_local = dups.index.tz_localize('US/Eastern') + dups_local.name = 'foo' + result = dups_local.unique() + expected = DatetimeIndex(expected, name='foo') + expected = expected.tz_localize('US/Eastern') + assert result.tz is not None + assert result.name == 'foo' + 
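The partial-date cases in these tests rest on partial string indexing: a string at coarser resolution than the DatetimeIndex selects the whole span it covers. A small sketch:

```
import numpy as np
import pandas as pd

rng = pd.date_range('1/1/1990', periods=48, freq='H')
ts = pd.Series(np.arange(48), index=rng)

# a day-resolution string selects all 24 hourly rows of that day
assert len(ts['1990-01-02']) == 24
assert (ts['1990-01-02'] == ts[24:48]).all()
```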
tm.assert_index_equal(result, expected) + + # NaT, note this is excluded + arr = [1370745748 + t for t in range(20)] + [tslib.iNaT] + idx = DatetimeIndex(arr * 3) + tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) + assert idx.nunique() == 20 + assert idx.nunique(dropna=False) == 21 + + arr = [Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) + for t in range(20)] + [NaT] + idx = DatetimeIndex(arr * 3) + tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) + assert idx.nunique() == 20 + assert idx.nunique(dropna=False) == 21 + + +def test_index_dupes_contains(): + d = datetime(2011, 12, 5, 20, 30) + ix = DatetimeIndex([d, d]) + assert d in ix + + +def test_duplicate_dates_indexing(dups): + ts = dups + + uniques = ts.index.unique() + for date in uniques: + result = ts[date] + + mask = ts.index == date + total = (ts.index == date).sum() + expected = ts[mask] + if total > 1: + assert_series_equal(result, expected) + else: + assert_almost_equal(result, expected[0]) + + cp = ts.copy() + cp[date] = 0 + expected = Series(np.where(mask, 0, ts), index=ts.index) + assert_series_equal(cp, expected) + + pytest.raises(KeyError, ts.__getitem__, datetime(2000, 1, 6)) + + # new index + ts[datetime(2000, 1, 6)] = 0 + assert ts[datetime(2000, 1, 6)] == 0 + + +def test_range_slice(): + idx = DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/3/2000', + '1/4/2000']) + + ts = Series(np.random.randn(len(idx)), index=idx) + + result = ts['1/2/2000':] + expected = ts[1:] + assert_series_equal(result, expected) + + result = ts['1/2/2000':'1/3/2000'] + expected = ts[1:4] + assert_series_equal(result, expected) + + +def test_groupby_average_dup_values(dups): + result = dups.groupby(level=0).mean() + expected = dups.groupby(dups.index).mean() + assert_series_equal(result, expected) + + +def test_indexing_over_size_cutoff(): + import datetime + # #1821 + + old_cutoff = _index._SIZE_CUTOFF + try: + _index._SIZE_CUTOFF = 1000 + + # create large list of non periodic datetime + dates = [] + sec = datetime.timedelta(seconds=1) + half_sec = datetime.timedelta(microseconds=500000) + d = datetime.datetime(2011, 12, 5, 20, 30) + n = 1100 + for i in range(n): + dates.append(d) + dates.append(d + sec) + dates.append(d + sec + half_sec) + dates.append(d + sec + sec + half_sec) + d += 3 * sec + + # duplicate some values in the list + duplicate_positions = np.random.randint(0, len(dates) - 1, 20) + for p in duplicate_positions: + dates[p + 1] = dates[p] + + df = DataFrame(np.random.randn(len(dates), 4), + index=dates, + columns=list('ABCD')) + + pos = n * 3 + timestamp = df.index[pos] + assert timestamp in df.index + + # it works! + df.loc[timestamp] + assert len(df.loc[[timestamp]]) > 0 + finally: + _index._SIZE_CUTOFF = old_cutoff + + +def test_indexing_unordered(): + # GH 2437 + rng = date_range(start='2011-01-01', end='2011-01-15') + ts = Series(np.random.rand(len(rng)), index=rng) + ts2 = pd.concat([ts[0:4], ts[-4:], ts[4:-4]]) + + for t in ts.index: + # TODO: unused? 
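The duplicate-date fixture exercises a general rule worth keeping in mind while reading these tests: scalar lookup returns a Series when the label repeats and a scalar when it is unique. Sketch:

```
from datetime import datetime

import numpy as np
import pandas as pd

dates = [datetime(2000, 1, 2)] * 2 + [datetime(2000, 1, 3)]
dups = pd.Series(np.arange(3.), index=dates)

assert isinstance(dups[datetime(2000, 1, 2)], pd.Series)  # repeated label
assert np.isscalar(dups[datetime(2000, 1, 3)])            # unique label
```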
+ s = str(t) # noqa + + expected = ts[t] + result = ts2[t] + assert expected == result + + # GH 3448 (ranges) + def compare(slobj): + result = ts2[slobj].copy() + result = result.sort_index() + expected = ts[slobj] + assert_series_equal(result, expected) + + compare(slice('2011-01-01', '2011-01-15')) + compare(slice('2010-12-30', '2011-01-15')) + compare(slice('2011-01-01', '2011-01-16')) + + # partial ranges + compare(slice('2011-01-01', '2011-01-6')) + compare(slice('2011-01-06', '2011-01-8')) + compare(slice('2011-01-06', '2011-01-12')) + + # single values + result = ts2['2011'].sort_index() + expected = ts['2011'] + assert_series_equal(result, expected) + + # diff freq + rng = date_range(datetime(2005, 1, 1), periods=20, freq='M') + ts = Series(np.arange(len(rng)), index=rng) + ts = ts.take(np.random.permutation(20)) + + result = ts['2005'] + for t in result.index: + assert t.year == 2005 + + +def test_indexing(): + idx = date_range("2001-1-1", periods=20, freq='M') + ts = Series(np.random.rand(len(idx)), index=idx) + + # getting + + # GH 3070, make sure semantics work on Series/Frame + expected = ts['2001'] + expected.name = 'A' + + df = DataFrame(dict(A=ts)) + result = df['2001']['A'] + assert_series_equal(expected, result) + + # setting + ts['2001'] = 1 + expected = ts['2001'] + expected.name = 'A' + + df.loc['2001', 'A'] = 1 + + result = df['2001']['A'] + assert_series_equal(expected, result) + + # GH3546 (not including times on the last day) + idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:00', + freq='H') + ts = Series(lrange(len(idx)), index=idx) + expected = ts['2013-05'] + assert_series_equal(expected, ts) + + idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:59', + freq='S') + ts = Series(lrange(len(idx)), index=idx) + expected = ts['2013-05'] + assert_series_equal(expected, ts) + + idx = [Timestamp('2013-05-31 00:00'), + Timestamp(datetime(2013, 5, 31, 23, 59, 59, 999999))] + ts = Series(lrange(len(idx)), index=idx) + expected = ts['2013'] + assert_series_equal(expected, ts) + + # GH14826, indexing with a seconds resolution string / datetime object + df = DataFrame(np.random.rand(5, 5), + columns=['open', 'high', 'low', 'close', 'volume'], + index=date_range('2012-01-02 18:01:00', + periods=5, tz='US/Central', freq='s')) + expected = df.loc[[df.index[2]]] + + # this is a single date, so will raise + pytest.raises(KeyError, df.__getitem__, '2012-01-02 18:01:02', ) + pytest.raises(KeyError, df.__getitem__, df.index[2], ) + + +""" +test NaT support +""" + + +def test_set_none_nan(): + series = Series(date_range('1/1/2000', periods=10)) + series[3] = None + assert series[3] is NaT + + series[3:5] = None + assert series[4] is NaT + + series[5] = np.nan + assert series[5] is NaT + + series[5:7] = np.nan + assert series[6] is NaT + + +def test_nat_operations(): + # GH 8617 + s = Series([0, pd.NaT], dtype='m8[ns]') + exp = s[0] + assert s.median() == exp + assert s.min() == exp + assert s.max() == exp + + +@pytest.mark.parametrize('method', ["round", "floor", "ceil"]) +@pytest.mark.parametrize('freq', ["s", "5s", "min", "5min", "h", "5h"]) +def test_round_nat(method, freq): + # GH14940 + s = Series([pd.NaT]) + expected = Series(pd.NaT) + round_method = getattr(s.dt, method) + assert_series_equal(round_method(freq), expected) diff --git a/pandas/tests/series/indexing/test_iloc.py b/pandas/tests/series/indexing/test_iloc.py new file mode 100644 index 00000000000000..648a37ce0262be --- /dev/null +++ b/pandas/tests/series/indexing/test_iloc.py @@ -0,0 +1,38 @@ 
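test_iloc below depends on the positional/label split that often trips people up on non-default integer indexes: .iloc is always positional, while [] and .loc are label-based when the index holds integers. A sketch of the distinction:

```
import numpy as np
import pandas as pd

s = pd.Series(np.arange(5.), index=[0, 2, 4, 6, 8])

assert s.iloc[1] == 1.0  # position 1
assert s.loc[2] == 1.0   # label 2, the same element
assert s[2] == 1.0       # [] on an integer index is label-based
```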
+# coding=utf-8 +# pylint: disable-msg=E1101,W0612 + +import numpy as np + +from pandas import Series + +from pandas.compat import lrange, range +from pandas.util.testing import (assert_series_equal, + assert_almost_equal) + + +def test_iloc(): + s = Series(np.random.randn(10), index=lrange(0, 20, 2)) + + for i in range(len(s)): + result = s.iloc[i] + exp = s[s.index[i]] + assert_almost_equal(result, exp) + + # pass a slice + result = s.iloc[slice(1, 3)] + expected = s.loc[2:4] + assert_series_equal(result, expected) + + # test slice is a view + result[:] = 0 + assert (s[1:3] == 0).all() + + # list of integers + result = s.iloc[[0, 2, 3, 4, 5]] + expected = s.reindex(s.index[[0, 2, 3, 4, 5]]) + assert_series_equal(result, expected) + + +def test_iloc_nonunique(): + s = Series([0, 1, 2], index=[0, 1, 0]) + assert s.iloc[2] == 2 diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py new file mode 100644 index 00000000000000..5cc1a8ff1c451b --- /dev/null +++ b/pandas/tests/series/indexing/test_indexing.py @@ -0,0 +1,760 @@ +# coding=utf-8 +# pylint: disable-msg=E1101,W0612 + +""" test get/set & misc """ + +import pytest + +from datetime import timedelta + +import numpy as np +import pandas as pd + +from pandas.core.dtypes.common import is_scalar +from pandas import (Series, DataFrame, MultiIndex, + Timestamp, Timedelta, Categorical) +from pandas.tseries.offsets import BDay + +from pandas.compat import lrange, range + +from pandas.util.testing import (assert_series_equal) +import pandas.util.testing as tm + + +def test_basic_indexing(): + s = Series(np.random.randn(5), index=['a', 'b', 'a', 'a', 'b']) + + pytest.raises(IndexError, s.__getitem__, 5) + pytest.raises(IndexError, s.__setitem__, 5, 0) + + pytest.raises(KeyError, s.__getitem__, 'c') + + s = s.sort_index() + + pytest.raises(IndexError, s.__getitem__, 5) + pytest.raises(IndexError, s.__setitem__, 5, 0) + + +def test_basic_getitem_with_labels(test_data): + indices = test_data.ts.index[[5, 10, 15]] + + result = test_data.ts[indices] + expected = test_data.ts.reindex(indices) + assert_series_equal(result, expected) + + result = test_data.ts[indices[0]:indices[2]] + expected = test_data.ts.loc[indices[0]:indices[2]] + assert_series_equal(result, expected) + + # integer indexes, be careful + s = Series(np.random.randn(10), index=lrange(0, 20, 2)) + inds = [0, 2, 5, 7, 8] + arr_inds = np.array([0, 2, 5, 7, 8]) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = s[inds] + expected = s.reindex(inds) + assert_series_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = s[arr_inds] + expected = s.reindex(arr_inds) + assert_series_equal(result, expected) + + # GH12089 + # with tz for values + s = Series(pd.date_range("2011-01-01", periods=3, tz="US/Eastern"), + index=['a', 'b', 'c']) + expected = Timestamp('2011-01-01', tz='US/Eastern') + result = s.loc['a'] + assert result == expected + result = s.iloc[0] + assert result == expected + result = s['a'] + assert result == expected + + +def test_getitem_setitem_ellipsis(): + s = Series(np.random.randn(10)) + + np.fix(s) + + result = s[...] + assert_series_equal(result, s) + + s[...] 
= 5 + assert (result == 5).all() + + +def test_getitem_get(test_data): + test_series = test_data.series + test_obj_series = test_data.objSeries + + idx1 = test_series.index[5] + idx2 = test_obj_series.index[5] + + assert test_series[idx1] == test_series.get(idx1) + assert test_obj_series[idx2] == test_obj_series.get(idx2) + + assert test_series[idx1] == test_series[5] + assert test_obj_series[idx2] == test_obj_series[5] + + assert test_series.get(-1) == test_series.get(test_series.index[-1]) + assert test_series[5] == test_series.get(test_series.index[5]) + + # missing + d = test_data.ts.index[0] - BDay() + pytest.raises(KeyError, test_data.ts.__getitem__, d) + + # None + # GH 5652 + for s in [Series(), Series(index=list('abc'))]: + result = s.get(None) + assert result is None + + +def test_getitem_fancy(test_data): + slice1 = test_data.series[[1, 2, 3]] + slice2 = test_data.objSeries[[1, 2, 3]] + assert test_data.series.index[2] == slice1.index[1] + assert test_data.objSeries.index[2] == slice2.index[1] + assert test_data.series[2] == slice1[1] + assert test_data.objSeries[2] == slice2[1] + + +def test_getitem_generator(test_data): + gen = (x > 0 for x in test_data.series) + result = test_data.series[gen] + result2 = test_data.series[iter(test_data.series > 0)] + expected = test_data.series[test_data.series > 0] + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + +def test_type_promotion(): + # GH12599 + s = pd.Series() + s["a"] = pd.Timestamp("2016-01-01") + s["b"] = 3.0 + s["c"] = "foo" + expected = Series([pd.Timestamp("2016-01-01"), 3.0, "foo"], + index=["a", "b", "c"]) + assert_series_equal(s, expected) + + +@pytest.mark.parametrize( + 'result_1, duplicate_item, expected_1', + [ + [ + pd.Series({1: 12, 2: [1, 2, 2, 3]}), pd.Series({1: 313}), + pd.Series({1: 12, }, dtype=object), + ], + [ + pd.Series({1: [1, 2, 3], 2: [1, 2, 2, 3]}), + pd.Series({1: [1, 2, 3]}), pd.Series({1: [1, 2, 3], }), + ], + ]) +def test_getitem_with_duplicates_indices( + result_1, duplicate_item, expected_1): + # GH 17610 + result = result_1.append(duplicate_item) + expected = expected_1.append(duplicate_item) + assert_series_equal(result[1], expected) + assert result[2] == result_1[2] + + +def test_getitem_out_of_bounds(test_data): + # don't segfault, GH #495 + pytest.raises(IndexError, test_data.ts.__getitem__, len(test_data.ts)) + + # GH #917 + s = Series([]) + pytest.raises(IndexError, s.__getitem__, -1) + + +def test_getitem_setitem_integers(): + # caused bug without test + s = Series([1, 2, 3], ['a', 'b', 'c']) + + assert s.iloc[0] == s['a'] + s.iloc[0] = 5 + tm.assert_almost_equal(s['a'], 5) + + +def test_getitem_box_float64(test_data): + value = test_data.ts[5] + assert isinstance(value, np.float64) + + +def test_series_box_timestamp(): + rng = pd.date_range('20090415', '20090519', freq='B') + ser = Series(rng) + + assert isinstance(ser[5], pd.Timestamp) + + rng = pd.date_range('20090415', '20090519', freq='B') + ser = Series(rng, index=rng) + assert isinstance(ser[5], pd.Timestamp) + + assert isinstance(ser.iat[5], pd.Timestamp) + + +def test_getitem_ambiguous_keyerror(): + s = Series(lrange(10), index=lrange(0, 20, 2)) + pytest.raises(KeyError, s.__getitem__, 1) + pytest.raises(KeyError, s.loc.__getitem__, 1) + + +def test_getitem_unordered_dup(): + obj = Series(lrange(5), index=['c', 'a', 'a', 'b', 'b']) + assert is_scalar(obj['c']) + assert obj['c'] == 0 + + +def test_getitem_dups_with_missing(): + # breaks reindex, so need to use .loc internally + # GH 4246 + s = 
Series([1, 2, 3, 4], ['foo', 'bar', 'foo', 'bah']) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + expected = s.loc[['foo', 'bar', 'bah', 'bam']] + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = s[['foo', 'bar', 'bah', 'bam']] + assert_series_equal(result, expected) + + +def test_getitem_dups(): + s = Series(range(5), index=['A', 'A', 'B', 'C', 'C'], dtype=np.int64) + expected = Series([3, 4], index=['C', 'C'], dtype=np.int64) + result = s['C'] + assert_series_equal(result, expected) + + +def test_setitem_ambiguous_keyerror(): + s = Series(lrange(10), index=lrange(0, 20, 2)) + + # equivalent of an append + s2 = s.copy() + s2[1] = 5 + expected = s.append(Series([5], index=[1])) + assert_series_equal(s2, expected) + + s2 = s.copy() + s2.loc[1] = 5 + expected = s.append(Series([5], index=[1])) + assert_series_equal(s2, expected) + + +def test_getitem_dataframe(): + rng = list(range(10)) + s = pd.Series(10, index=rng) + df = pd.DataFrame(rng, index=rng) + pytest.raises(TypeError, s.__getitem__, df > 5) + + +def test_setitem(test_data): + test_data.ts[test_data.ts.index[5]] = np.NaN + test_data.ts[[1, 2, 17]] = np.NaN + test_data.ts[6] = np.NaN + assert np.isnan(test_data.ts[6]) + assert np.isnan(test_data.ts[2]) + test_data.ts[np.isnan(test_data.ts)] = 5 + assert not np.isnan(test_data.ts[2]) + + # caught this bug when writing tests + series = Series(tm.makeIntIndex(20).astype(float), + index=tm.makeIntIndex(20)) + + series[::2] = 0 + assert (series[::2] == 0).all() + + # set item that's not contained + s = test_data.series.copy() + s['foobar'] = 1 + + app = Series([1], index=['foobar'], name='series') + expected = test_data.series.append(app) + assert_series_equal(s, expected) + + # Test for issue #10193 + key = pd.Timestamp('2012-01-01') + series = pd.Series() + series[key] = 47 + expected = pd.Series(47, [key]) + assert_series_equal(series, expected) + + series = pd.Series([], pd.DatetimeIndex([], freq='D')) + series[key] = 47 + expected = pd.Series(47, pd.DatetimeIndex([key], freq='D')) + assert_series_equal(series, expected) + + +def test_setitem_dtypes(): + # change dtypes + # GH 4463 + expected = Series([np.nan, 2, 3]) + + s = Series([1, 2, 3]) + s.iloc[0] = np.nan + assert_series_equal(s, expected) + + s = Series([1, 2, 3]) + s.loc[0] = np.nan + assert_series_equal(s, expected) + + s = Series([1, 2, 3]) + s[0] = np.nan + assert_series_equal(s, expected) + + s = Series([False]) + s.loc[0] = np.nan + assert_series_equal(s, Series([np.nan])) + + s = Series([False, True]) + s.loc[0] = np.nan + assert_series_equal(s, Series([np.nan, 1.0])) + + +def test_set_value(test_data): + idx = test_data.ts.index[10] + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + res = test_data.ts.set_value(idx, 0) + assert res is test_data.ts + assert test_data.ts[idx] == 0 + + # equiv + s = test_data.series.copy() + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + res = s.set_value('foobar', 0) + assert res is s + assert res.index[-1] == 'foobar' + assert res['foobar'] == 0 + + s = test_data.series.copy() + s.loc['foobar'] = 0 + assert s.index[-1] == 'foobar' + assert s['foobar'] == 0 + + +def test_setslice(test_data): + sl = test_data.ts[5:20] + assert len(sl) == len(sl.index) + assert sl.index.is_unique + + +def test_basic_getitem_setitem_corner(test_data): + # invalid tuples, e.g. td.ts[:, None] vs. 
td.ts[:, 2] + with tm.assert_raises_regex(ValueError, 'tuple-index'): + test_data.ts[:, 2] + with tm.assert_raises_regex(ValueError, 'tuple-index'): + test_data.ts[:, 2] = 2 + + # weird lists. [slice(0, 5)] will work but not two slices + result = test_data.ts[[slice(None, 5)]] + expected = test_data.ts[:5] + assert_series_equal(result, expected) + + # OK + pytest.raises(Exception, test_data.ts.__getitem__, + [5, slice(None, None)]) + pytest.raises(Exception, test_data.ts.__setitem__, + [5, slice(None, None)], 2) + + +@pytest.mark.parametrize('tz', ['US/Eastern', 'UTC', 'Asia/Tokyo']) +def test_setitem_with_tz(tz): + orig = pd.Series(pd.date_range('2016-01-01', freq='H', periods=3, + tz=tz)) + assert orig.dtype == 'datetime64[ns, {0}]'.format(tz) + + # scalar + s = orig.copy() + s[1] = pd.Timestamp('2011-01-01', tz=tz) + exp = pd.Series([pd.Timestamp('2016-01-01 00:00', tz=tz), + pd.Timestamp('2011-01-01 00:00', tz=tz), + pd.Timestamp('2016-01-01 02:00', tz=tz)]) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.loc[1] = pd.Timestamp('2011-01-01', tz=tz) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.iloc[1] = pd.Timestamp('2011-01-01', tz=tz) + tm.assert_series_equal(s, exp) + + # vector + vals = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2012-01-01', tz=tz)], index=[1, 2]) + assert vals.dtype == 'datetime64[ns, {0}]'.format(tz) + + s[[1, 2]] = vals + exp = pd.Series([pd.Timestamp('2016-01-01 00:00', tz=tz), + pd.Timestamp('2011-01-01 00:00', tz=tz), + pd.Timestamp('2012-01-01 00:00', tz=tz)]) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.loc[[1, 2]] = vals + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.iloc[[1, 2]] = vals + tm.assert_series_equal(s, exp) + + +def test_setitem_with_tz_dst(): + # GH XXX + tz = 'US/Eastern' + orig = pd.Series(pd.date_range('2016-11-06', freq='H', periods=3, + tz=tz)) + assert orig.dtype == 'datetime64[ns, {0}]'.format(tz) + + # scalar + s = orig.copy() + s[1] = pd.Timestamp('2011-01-01', tz=tz) + exp = pd.Series([pd.Timestamp('2016-11-06 00:00-04:00', tz=tz), + pd.Timestamp('2011-01-01 00:00-05:00', tz=tz), + pd.Timestamp('2016-11-06 01:00-05:00', tz=tz)]) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.loc[1] = pd.Timestamp('2011-01-01', tz=tz) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.iloc[1] = pd.Timestamp('2011-01-01', tz=tz) + tm.assert_series_equal(s, exp) + + # vector + vals = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2012-01-01', tz=tz)], index=[1, 2]) + assert vals.dtype == 'datetime64[ns, {0}]'.format(tz) + + s[[1, 2]] = vals + exp = pd.Series([pd.Timestamp('2016-11-06 00:00', tz=tz), + pd.Timestamp('2011-01-01 00:00', tz=tz), + pd.Timestamp('2012-01-01 00:00', tz=tz)]) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.loc[[1, 2]] = vals + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.iloc[[1, 2]] = vals + tm.assert_series_equal(s, exp) + + +def test_categorial_assigning_ops(): + orig = Series(Categorical(["b", "b"], categories=["a", "b"])) + s = orig.copy() + s[:] = "a" + exp = Series(Categorical(["a", "a"], categories=["a", "b"])) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s[1] = "a" + exp = Series(Categorical(["b", "a"], categories=["a", "b"])) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s[s.index > 0] = "a" + exp = Series(Categorical(["b", "a"], categories=["a", "b"])) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s[[False, True]] = "a" + exp = Series(Categorical(["b", "a"], categories=["a", "b"])) 
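The invariant behind the tz setitem cases that follow: assigning a Timestamp whose tz matches the Series keeps the datetime64[ns, tz] dtype rather than degrading to object. A sketch mirroring the first scalar case:

```
import pandas as pd

s = pd.Series(pd.date_range('2016-01-01', periods=3, freq='H',
                            tz='US/Eastern'))
s[1] = pd.Timestamp('2011-01-01', tz='US/Eastern')
assert s.dtype == 'datetime64[ns, US/Eastern]'
```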
+ tm.assert_series_equal(s, exp) + + s = orig.copy() + s.index = ["x", "y"] + s["y"] = "a" + exp = Series(Categorical(["b", "a"], categories=["a", "b"]), + index=["x", "y"]) + tm.assert_series_equal(s, exp) + + # ensure that one can set something to np.nan + s = Series(Categorical([1, 2, 3])) + exp = Series(Categorical([1, np.nan, 3], categories=[1, 2, 3])) + s[1] = np.nan + tm.assert_series_equal(s, exp) + + +def test_slice(test_data): + numSlice = test_data.series[10:20] + numSliceEnd = test_data.series[-10:] + objSlice = test_data.objSeries[10:20] + + assert test_data.series.index[9] not in numSlice.index + assert test_data.objSeries.index[9] not in objSlice.index + + assert len(numSlice) == len(numSlice.index) + assert test_data.series[numSlice.index[0]] == numSlice[numSlice.index[0]] + + assert numSlice.index[1] == test_data.series.index[11] + assert tm.equalContents(numSliceEnd, np.array(test_data.series)[-10:]) + + # Test return view. + sl = test_data.series[10:20] + sl[:] = 0 + + assert (test_data.series[10:20] == 0).all() + + +def test_slice_can_reorder_not_uniquely_indexed(): + s = Series(1, index=['a', 'a', 'b', 'b', 'c']) + s[::-1] # it works! + + +def test_ix_setitem(test_data): + inds = test_data.series.index[[3, 4, 7]] + + result = test_data.series.copy() + result.loc[inds] = 5 + + expected = test_data.series.copy() + expected[[3, 4, 7]] = 5 + assert_series_equal(result, expected) + + result.iloc[5:10] = 10 + expected[5:10] = 10 + assert_series_equal(result, expected) + + # set slice with indices + d1, d2 = test_data.series.index[[5, 15]] + result.loc[d1:d2] = 6 + expected[5:16] = 6 # because .loc slicing includes both endpoints + assert_series_equal(result, expected) + + # set index value + test_data.series.loc[d1] = 4 + test_data.series.loc[d2] = 6 + assert test_data.series[d1] == 4 + assert test_data.series[d2] == 6 + + +def test_setitem_na(): + # these induce dtype changes + expected = Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]) + s = Series([2, 3, 4, 5, 6, 7, 8, 9, 10]) + s[::2] = np.nan + assert_series_equal(s, expected) + + # a bool Series gets coerced to float when NaN is set
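+ # (e.g. s = Series([True, False]); s[0] = np.nan leaves s as float64 + # with values [nan, 0.0])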
+ expected = Series([np.nan, 1, np.nan, 0]) + s = Series([True, True, False, False]) + s[::2] = np.nan + assert_series_equal(s, expected) + + expected = Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, + 9]) + s = Series(np.arange(10)) + s[:5] = np.nan + assert_series_equal(s, expected) + + +def test_timedelta_assignment(): + # GH 8209 + s = Series([]) + s.loc['B'] = timedelta(1) + tm.assert_series_equal(s, Series(Timedelta('1 days'), index=['B'])) + + s = s.reindex(s.index.insert(0, 'A')) + tm.assert_series_equal(s, Series( + [np.nan, Timedelta('1 days')], index=['A', 'B'])) + + result = s.fillna(timedelta(1)) + expected = Series(Timedelta('1 days'), index=['A', 'B']) + tm.assert_series_equal(result, expected) + + s.loc['A'] = timedelta(1) + tm.assert_series_equal(s, expected) + + # GH 14155 + s = Series(10 * [np.timedelta64(10, 'm')]) + s.loc[[1, 2, 3]] = np.timedelta64(20, 'm') + expected = pd.Series(10 * [np.timedelta64(10, 'm')]) + expected.loc[[1, 2, 3]] = pd.Timedelta(np.timedelta64(20, 'm')) + tm.assert_series_equal(s, expected) + + +def test_underlying_data_conversion(): + # GH 4080 + df = DataFrame({c: [1, 2, 3] for c in ['a', 'b', 'c']}) + df.set_index(['a', 'b', 'c'], inplace=True) + s = Series([1], index=[(2, 2, 2)]) + df['val'] = 0 + df + df['val'].update(s) + + expected = DataFrame( + dict(a=[1, 2, 3], b=[1, 2, 3], c=[1, 2, 3], val=[0, 1, 0])) + expected.set_index(['a', 'b', 'c'], inplace=True) + tm.assert_frame_equal(df, expected) + + # GH 3970 + # these are chained assignments as well + pd.set_option('chained_assignment', None) + df = DataFrame({"aa": range(5), "bb": [2.2] * 5}) + df["cc"] = 0.0 + + ck = [True] * len(df) + + df["bb"].iloc[0] = .13 + + # TODO: unused + df_tmp = df.iloc[ck] # noqa + + df["bb"].iloc[0] = .15 + assert df['bb'].iloc[0] == 0.15 + pd.set_option('chained_assignment', 'raise') + + # GH 3217 + df = DataFrame(dict(a=[1, 3], b=[np.nan, 2])) + df['c'] = np.nan + df['c'].update(pd.Series(['foo'], index=[0])) + + expected = DataFrame(dict(a=[1, 3], b=[np.nan, 2], c=['foo', np.nan])) + tm.assert_frame_equal(df, expected) + + +def test_preserve_refs(test_data): + seq = test_data.ts[[5, 10, 15]] + seq[1] = np.NaN + assert not np.isnan(test_data.ts[10]) + + +def test_cast_on_putmask(): + # GH 2746 + + # need to upcast + s = Series([1, 2], index=[1, 2], dtype='int64') + s[[True, False]] = Series([0], index=[1], dtype='int64') + expected = Series([0, 2], index=[1, 2], dtype='int64') + + assert_series_equal(s, expected) + + +def test_type_promote_putmask(): + # GH8387: test that changing types does not break alignment + ts = Series(np.random.randn(100), index=np.arange(100, 0, -1)).round(5) + left, mask = ts.copy(), ts > 0 + right = ts[mask].copy().map(str) + left[mask] = right + assert_series_equal(left, ts.map(lambda t: str(t) if t > 0 else t)) + + s = Series([0, 1, 2, 0]) + mask = s > 0 + s2 = s[mask].map(str) + s[mask] = s2 + assert_series_equal(s, Series([0, '1', '2', 0])) + + s = Series([0, 'foo', 'bar', 0]) + mask = Series([False, True, True, False]) + s2 = s[mask] + s[mask] = s2 + assert_series_equal(s, Series([0, 'foo', 'bar', 0])) + + +def test_multilevel_preserve_name(): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', + 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + s = Series(np.random.randn(len(index)), index=index, name='sth') + + result = s['foo'] + result2 = s.loc['foo'] + assert result.name == s.name + assert result2.name == s.name + + 
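+# GH14359 background: Series wraps the passed ndarray without copying by +# default; when that buffer is frozen with arr.flags.writeable = False, +# Series.__setitem__ has to raise ValueError rather than mutate it, which +# is what the next two tests exercise for scalar and slice assignment.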
+def test_setitem_scalar_into_readonly_backing_data(): + # GH14359: test that you cannot mutate a read only buffer + + array = np.zeros(5) + array.flags.writeable = False # make the array immutable + series = Series(array) + + for n in range(len(series)): + with pytest.raises(ValueError): + series[n] = 1 + + assert array[n] == 0 + + +def test_setitem_slice_into_readonly_backing_data(): + # GH14359: test that you cannot mutate a read only buffer + + array = np.zeros(5) + array.flags.writeable = False # make the array immutable + series = Series(array) + + with pytest.raises(ValueError): + series[1:3] = 1 + + assert not array.any() + + +""" +miscellaneous methods +""" + + +def test_select(test_data): + # deprecated: gh-12410 + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + n = len(test_data.ts) + result = test_data.ts.select(lambda x: x >= test_data.ts.index[n // 2]) + expected = test_data.ts.reindex(test_data.ts.index[n // 2:]) + assert_series_equal(result, expected) + + result = test_data.ts.select(lambda x: x.weekday() == 2) + expected = test_data.ts[test_data.ts.index.weekday == 2] + assert_series_equal(result, expected) + + +def test_pop(): + # GH 6600 + df = DataFrame({'A': 0, 'B': np.arange(5, dtype='int64'), 'C': 0, }) + k = df.iloc[4] + + result = k.pop('B') + assert result == 4 + + expected = Series([0, 0], index=['A', 'C'], name=4) + assert_series_equal(k, expected) + + +def test_take(): + s = Series([-1, 5, 6, 2, 4]) + + actual = s.take([1, 3, 4]) + expected = Series([5, 2, 4], index=[1, 3, 4]) + tm.assert_series_equal(actual, expected) + + actual = s.take([-1, 3, 4]) + expected = Series([4, 2, 4], index=[4, 3, 4]) + tm.assert_series_equal(actual, expected) + + pytest.raises(IndexError, s.take, [1, 10]) + pytest.raises(IndexError, s.take, [2, 5]) + + with tm.assert_produces_warning(FutureWarning): + s.take([-1, 3, 4], convert=False) + + +def test_head_tail(test_data): + assert_series_equal(test_data.series.head(), test_data.series[:5]) + assert_series_equal(test_data.series.head(0), test_data.series[0:0]) + assert_series_equal(test_data.series.tail(), test_data.series[-5:]) + assert_series_equal(test_data.series.tail(0), test_data.series[0:0]) diff --git a/pandas/tests/series/indexing/test_loc.py b/pandas/tests/series/indexing/test_loc.py new file mode 100644 index 00000000000000..088406e0a1db67 --- /dev/null +++ b/pandas/tests/series/indexing/test_loc.py @@ -0,0 +1,150 @@ +# coding=utf-8 +# pylint: disable-msg=E1101,W0612 + +import pytest + +import numpy as np +import pandas as pd + +from pandas import (Series, Timestamp) + +from pandas.compat import lrange +from pandas.util.testing import (assert_series_equal) + + +def test_loc_getitem(test_data): + inds = test_data.series.index[[3, 4, 7]] + assert_series_equal( + test_data.series.loc[inds], + test_data.series.reindex(inds)) + assert_series_equal(test_data.series.iloc[5::2], test_data.series[5::2]) + + # slice with indices + d1, d2 = test_data.ts.index[[5, 15]] + result = test_data.ts.loc[d1:d2] + expected = test_data.ts.truncate(d1, d2) + assert_series_equal(result, expected) + + # boolean + mask = test_data.series > test_data.series.median() + assert_series_equal(test_data.series.loc[mask], test_data.series[mask]) + + # ask for index value + assert test_data.ts.loc[d1] == test_data.ts[d1] + assert test_data.ts.loc[d2] == test_data.ts[d2] + + +def test_loc_getitem_not_monotonic(test_data): + d1, d2 = test_data.ts.index[[5, 15]] + + ts2 = test_data.ts[::2][[1, 2, 0]] + + pytest.raises(KeyError, 
ts2.loc.__getitem__, slice(d1, d2)) + pytest.raises(KeyError, ts2.loc.__setitem__, slice(d1, d2), 0) + + +def test_loc_getitem_setitem_integer_slice_keyerrors(): + s = Series(np.random.randn(10), index=lrange(0, 20, 2)) + + # this is OK + cp = s.copy() + cp.iloc[4:10] = 0 + assert (cp.iloc[4:10] == 0).all() + + # so is this + cp = s.copy() + cp.iloc[3:11] = 0 + assert (cp.iloc[3:11] == 0).values.all() + + result = s.iloc[2:6] + result2 = s.loc[3:11] + expected = s.reindex([4, 6, 8, 10]) + + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + # non-monotonic, raise KeyError + s2 = s.iloc[lrange(5) + lrange(5, 10)[::-1]] + pytest.raises(KeyError, s2.loc.__getitem__, slice(3, 11)) + pytest.raises(KeyError, s2.loc.__setitem__, slice(3, 11), 0) + + +def test_loc_getitem_iterator(test_data): + idx = iter(test_data.series.index[:10]) + result = test_data.series.loc[idx] + assert_series_equal(result, test_data.series[:10]) + + +def test_loc_setitem_boolean(test_data): + mask = test_data.series > test_data.series.median() + + result = test_data.series.copy() + result.loc[mask] = 0 + expected = test_data.series + expected[mask] = 0 + assert_series_equal(result, expected) + + +def test_loc_setitem_corner(test_data): + inds = list(test_data.series.index[[5, 8, 12]]) + test_data.series.loc[inds] = 5 + pytest.raises(Exception, test_data.series.loc.__setitem__, + inds + ['foo'], 5) + + +def test_basic_setitem_with_labels(test_data): + indices = test_data.ts.index[[5, 10, 15]] + + cp = test_data.ts.copy() + exp = test_data.ts.copy() + cp[indices] = 0 + exp.loc[indices] = 0 + assert_series_equal(cp, exp) + + cp = test_data.ts.copy() + exp = test_data.ts.copy() + cp[indices[0]:indices[2]] = 0 + exp.loc[indices[0]:indices[2]] = 0 + assert_series_equal(cp, exp) + + # integer indexes, be careful + s = Series(np.random.randn(10), index=lrange(0, 20, 2)) + inds = [0, 4, 6] + arr_inds = np.array([0, 4, 6]) + + cp = s.copy() + exp = s.copy() + cp[inds] = 0 + exp.loc[inds] = 0 + assert_series_equal(cp, exp) + + cp = s.copy() + exp = s.copy() + cp[arr_inds] = 0 + exp.loc[arr_inds] = 0 + assert_series_equal(cp, exp) + + inds_notfound = [0, 4, 5, 6] + arr_inds_notfound = np.array([0, 4, 5, 6]) + pytest.raises(Exception, s.__setitem__, inds_notfound, 0) + pytest.raises(Exception, s.__setitem__, arr_inds_notfound, 0) + + # GH12089 + # with tz for values + s = Series(pd.date_range("2011-01-01", periods=3, tz="US/Eastern"), + index=['a', 'b', 'c']) + s2 = s.copy() + expected = Timestamp('2011-01-03', tz='US/Eastern') + s2.loc['a'] = expected + result = s2.loc['a'] + assert result == expected + + s2 = s.copy() + s2.iloc[0] = expected + result = s2.iloc[0] + assert result == expected + + s2 = s.copy() + s2['a'] = expected + result = s2['a'] + assert result == expected diff --git a/pandas/tests/series/indexing/test_numeric.py b/pandas/tests/series/indexing/test_numeric.py new file mode 100644 index 00000000000000..b964ec3874998b --- /dev/null +++ b/pandas/tests/series/indexing/test_numeric.py @@ -0,0 +1,236 @@ +# coding=utf-8 +# pylint: disable-msg=E1101,W0612 + +import pytest + +import numpy as np +import pandas as pd + +from pandas import (Index, Series, DataFrame) + +from pandas.compat import lrange, range +from pandas.util.testing import (assert_series_equal) + +import pandas.util.testing as tm + + +def test_get(): + # GH 6383 + s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56, 45, + 51, 39, 55, 43, 54, 52, 51, 54])) + + result = s.get(25, 0) + expected = 0 + assert result ==
expected + + s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56, + 45, 51, 39, 55, 43, 54, 52, 51, 54]), + index=pd.Float64Index( + [25.0, 36.0, 49.0, 64.0, 81.0, 100.0, + 121.0, 144.0, 169.0, 196.0, 1225.0, + 1296.0, 1369.0, 1444.0, 1521.0, 1600.0, + 1681.0, 1764.0, 1849.0, 1936.0], + dtype='object')) + + result = s.get(25, 0) + expected = 43 + assert result == expected + + # GH 7407 + # with a boolean accessor + df = pd.DataFrame({'i': [0] * 3, 'b': [False] * 3}) + vc = df.i.value_counts() + result = vc.get(99, default='Missing') + assert result == 'Missing' + + vc = df.b.value_counts() + result = vc.get(False, default='Missing') + assert result == 3 + + result = vc.get(True, default='Missing') + assert result == 'Missing' + + +def test_get_nan(): + # GH 8569 + s = pd.Float64Index(range(10)).to_series() + assert s.get(np.nan) is None + assert s.get(np.nan, default='Missing') == 'Missing' + + # ensure that fixing the above hasn't broken get + # with multiple elements + idx = [20, 30] + assert_series_equal(s.get(idx), + Series([np.nan] * 2, index=idx)) + idx = [np.nan, np.nan] + assert_series_equal(s.get(idx), + Series([np.nan] * 2, index=idx)) + + +def test_delitem(): + # GH 5542 + # should delete the item inplace + s = Series(lrange(5)) + del s[0] + + expected = Series(lrange(1, 5), index=lrange(1, 5)) + assert_series_equal(s, expected) + + del s[1] + expected = Series(lrange(2, 5), index=lrange(2, 5)) + assert_series_equal(s, expected) + + # empty + s = Series() + + def f(): + del s[0] + + pytest.raises(KeyError, f) + + # only 1 left, del, add, del + s = Series(1) + del s[0] + assert_series_equal(s, Series(dtype='int64', index=Index( + [], dtype='int64'))) + s[0] = 1 + assert_series_equal(s, Series(1)) + del s[0] + assert_series_equal(s, Series(dtype='int64', index=Index( + [], dtype='int64'))) + + # Index(dtype=object) + s = Series(1, index=['a']) + del s['a'] + assert_series_equal(s, Series(dtype='int64', index=Index( + [], dtype='object'))) + s['a'] = 1 + assert_series_equal(s, Series(1, index=['a'])) + del s['a'] + assert_series_equal(s, Series(dtype='int64', index=Index( + [], dtype='object'))) + + +def test_slice_float64(): + values = np.arange(10., 50., 2) + index = Index(values) + + start, end = values[[5, 15]] + + s = Series(np.random.randn(20), index=index) + + result = s[start:end] + expected = s.iloc[5:16] + assert_series_equal(result, expected) + + result = s.loc[start:end] + assert_series_equal(result, expected) + + df = DataFrame(np.random.randn(20, 3), index=index) + + result = df[start:end] + expected = df.iloc[5:16] + tm.assert_frame_equal(result, expected) + + result = df.loc[start:end] + tm.assert_frame_equal(result, expected) + + +def test_getitem_negative_out_of_bounds(): + s = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10)) + + pytest.raises(IndexError, s.__getitem__, -11) + pytest.raises(IndexError, s.__setitem__, -11, 'foo') + + +def test_getitem_regression(): + s = Series(lrange(5), index=lrange(5)) + result = s[lrange(5)] + assert_series_equal(result, s) + + +def test_getitem_setitem_slice_bug(): + s = Series(lrange(10), lrange(10)) + result = s[-12:] + assert_series_equal(result, s) + + result = s[-7:] + assert_series_equal(result, s[3:]) + + result = s[:-12] + assert_series_equal(result, s[:0]) + + s = Series(lrange(10), lrange(10)) + s[-12:] = 0 + assert (s == 0).all() + + s[:-12] = 5 + assert (s == 0).all() + + +def test_getitem_setitem_slice_integers(): + s = Series(np.random.randn(8), index=[2, 4, 6, 8, 10, 12, 14, 16]) + + 
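# with an integer index, s[:4] slices positionally, taking the first + # four rows (labels 2, 4, 6, 8) rather than the labels up to 4 +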
result = s[:4] + expected = s.reindex([2, 4, 6, 8]) + assert_series_equal(result, expected) + + s[:4] = 0 + assert (s[:4] == 0).all() + assert not (s[4:] == 0).any() + + +def test_setitem_float_labels(): + # note labels are floats + s = Series(['a', 'b', 'c'], index=[0, 0.5, 1]) + tmp = s.copy() + + s.loc[1] = 'zoo' + tmp.iloc[2] = 'zoo' + + assert_series_equal(s, tmp) + + +def test_slice_float_get_set(test_data): + pytest.raises(TypeError, lambda: test_data.ts[4.0:10.0]) + + def f(): + test_data.ts[4.0:10.0] = 0 + + pytest.raises(TypeError, f) + + pytest.raises(TypeError, test_data.ts.__getitem__, slice(4.5, 10.0)) + pytest.raises(TypeError, test_data.ts.__setitem__, slice(4.5, 10.0), 0) + + +def test_slice_floats2(): + s = Series(np.random.rand(10), index=np.arange(10, 20, dtype=float)) + + assert len(s.loc[12.0:]) == 8 + assert len(s.loc[12.5:]) == 7 + + i = np.arange(10, 20, dtype=float) + i[2] = 12.2 + s.index = i + assert len(s.loc[12.0:]) == 8 + assert len(s.loc[12.5:]) == 7 + + +def test_int_indexing(): + s = Series(np.random.randn(6), index=[0, 0, 1, 1, 2, 2]) + + pytest.raises(KeyError, s.__getitem__, 5) + + pytest.raises(KeyError, s.__getitem__, 'c') + + # not monotonic + s = Series(np.random.randn(6), index=[2, 2, 0, 0, 1, 1]) + + pytest.raises(KeyError, s.__getitem__, 5) + + pytest.raises(KeyError, s.__getitem__, 'c') + + +def test_getitem_int64(test_data): + idx = np.int64(5) + assert test_data.ts[idx] == test_data.ts[5] diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 5b8d9cfab3e0d3..ec0d7296e540e5 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -88,6 +88,23 @@ def test_ser_cmp_result_names(self, names, op): class TestTimestampSeriesComparison(object): + def test_dt64ser_cmp_date_invalid(self): + # GH#19800 datetime.date comparison raises to + # match DatetimeIndex/Timestamp. 
This also matches the behavior + # of stdlib datetime.datetime + ser = pd.Series(pd.date_range('20010101', periods=10), name='dates') + date = ser.iloc[0].to_pydatetime().date() + assert not (ser == date).any() + assert (ser != date).all() + with pytest.raises(TypeError): + ser > date + with pytest.raises(TypeError): + ser < date + with pytest.raises(TypeError): + ser >= date + with pytest.raises(TypeError): + ser <= date + def test_dt64ser_cmp_period_scalar(self): ser = Series(pd.period_range('2000-01-01', periods=10, freq='D')) val = Period('2000-01-04', freq='D') diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 25f425ffa00215..e0bfe41645a3f8 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -22,7 +22,7 @@ from pandas._libs import lib from pandas._libs.tslib import iNaT -from pandas.compat import lrange, range, zip, long +from pandas.compat import lrange, range, zip, long, PY36 from pandas.util.testing import assert_series_equal import pandas.util.testing as tm @@ -811,6 +811,18 @@ def test_constructor_dict(self): expected.iloc[1] = 1 assert_series_equal(result, expected) + def test_constructor_dict_order(self): + # GH19018 + # initialization ordering: by insertion order if python >= 3.6, else + # sorted by key + d = {'b': 1, 'a': 0, 'c': 2} + result = Series(d) + if PY36: + expected = Series([1, 0, 2], index=list('bac')) + else: + expected = Series([0, 1, 2], index=list('abc')) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')]) def test_constructor_dict_nan_key(self, value): # GH 18480 diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 93c8ebc5f05dfc..3abc0f724db256 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -1,6 +1,8 @@ # coding=utf-8 # pylint: disable-msg=E1101,W0612 +import locale +import calendar import pytest from datetime import datetime, date @@ -32,7 +34,7 @@ def test_dt_namespace_accessor(self): ok_for_dt = DatetimeIndex._datetimelike_ops ok_for_dt_methods = ['to_period', 'to_pydatetime', 'tz_localize', 'tz_convert', 'normalize', 'strftime', 'round', - 'floor', 'ceil', 'weekday_name'] + 'floor', 'ceil', 'day_name', 'month_name'] ok_for_td = TimedeltaIndex._datetimelike_ops ok_for_td_methods = ['components', 'to_pytimedelta', 'total_seconds', 'round', 'floor', 'ceil'] @@ -274,6 +276,46 @@ def test_dt_accessor_no_new_attributes(self): "You cannot add any new attribute"): s.dt.xlabel = "a" + @pytest.mark.parametrize('time_locale', [ + None] if tm.get_locales() is None else [None] + tm.get_locales()) + def test_dt_accessor_datetime_name_accessors(self, time_locale): + # Test Monday -> Sunday and January -> December, in that sequence + if time_locale is None: + # If the time_locale is None, day_name and month_name should + # return the English names + expected_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', + 'Friday', 'Saturday', 'Sunday'] + expected_months = ['January', 'February', 'March', 'April', 'May', + 'June', 'July', 'August', 'September', + 'October', 'November', 'December'] + else: + with tm.set_locale(time_locale, locale.LC_TIME): + expected_days = calendar.day_name[:] + expected_months = calendar.month_name[1:] + + s = Series(DatetimeIndex(freq='D', start=datetime(1998, 1, 1), + periods=365)) + english_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', + 'Friday',
'Saturday', 'Sunday'] + for day, name, eng_name in zip(range(4, 11), + expected_days, + english_days): + name = name.capitalize() + assert s.dt.weekday_name[day] == eng_name + assert s.dt.day_name(locale=time_locale)[day] == name + s = s.append(Series([pd.NaT])) + assert np.isnan(s.dt.day_name(locale=time_locale).iloc[-1]) + + s = Series(DatetimeIndex(freq='M', start='2012', end='2013')) + result = s.dt.month_name(locale=time_locale) + expected = Series([month.capitalize() for month in expected_months]) + tm.assert_series_equal(result, expected) + for s_date, expected in zip(s, expected_months): + result = s_date.month_name(locale=time_locale) + assert result == expected.capitalize() + s = s.append(Series([pd.NaT])) + assert np.isnan(s.dt.month_name(locale=time_locale).iloc[-1]) + def test_strftime(self): # GH 10086 s = Series(date_range('20130101', periods=5)) diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py deleted file mode 100644 index e5c3d6f7d3ee16..00000000000000 --- a/pandas/tests/series/test_indexing.py +++ /dev/null @@ -1,2874 +0,0 @@ -# coding=utf-8 -# pylint: disable-msg=E1101,W0612 - -import pytest - -from datetime import datetime, timedelta - -from numpy import nan -import numpy as np -import pandas as pd - -import pandas._libs.index as _index -from pandas.core.dtypes.common import is_integer, is_scalar -from pandas import (Index, Series, DataFrame, isna, - date_range, NaT, MultiIndex, - Timestamp, DatetimeIndex, Timedelta, - Categorical) -from pandas.core.indexing import IndexingError -from pandas.tseries.offsets import BDay -from pandas._libs import tslib - -from pandas.compat import lrange, range -from pandas import compat -from pandas.util.testing import (assert_series_equal, - assert_almost_equal, - assert_frame_equal) -import pandas.util.testing as tm - -from pandas.tests.series.common import TestData - -JOIN_TYPES = ['inner', 'outer', 'left', 'right'] - - -class TestSeriesIndexing(TestData): - - def test_get(self): - - # GH 6383 - s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56, 45, - 51, 39, 55, 43, 54, 52, 51, 54])) - - result = s.get(25, 0) - expected = 0 - assert result == expected - - s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56, - 45, 51, 39, 55, 43, 54, 52, 51, 54]), - index=pd.Float64Index( - [25.0, 36.0, 49.0, 64.0, 81.0, 100.0, - 121.0, 144.0, 169.0, 196.0, 1225.0, - 1296.0, 1369.0, 1444.0, 1521.0, 1600.0, - 1681.0, 1764.0, 1849.0, 1936.0], - dtype='object')) - - result = s.get(25, 0) - expected = 43 - assert result == expected - - # GH 7407 - # with a boolean accessor - df = pd.DataFrame({'i': [0] * 3, 'b': [False] * 3}) - vc = df.i.value_counts() - result = vc.get(99, default='Missing') - assert result == 'Missing' - - vc = df.b.value_counts() - result = vc.get(False, default='Missing') - assert result == 3 - - result = vc.get(True, default='Missing') - assert result == 'Missing' - - def test_get_nan(self): - # GH 8569 - s = pd.Float64Index(range(10)).to_series() - assert s.get(np.nan) is None - assert s.get(np.nan, default='Missing') == 'Missing' - - # ensure that fixing the above hasn't broken get - # with multiple elements - idx = [20, 30] - assert_series_equal(s.get(idx), - Series([np.nan] * 2, index=idx)) - idx = [np.nan, np.nan] - assert_series_equal(s.get(idx), - Series([np.nan] * 2, index=idx)) - - def test_delitem(self): - - # GH 5542 - # should delete the item inplace - s = Series(lrange(5)) - del s[0] - - expected = Series(lrange(1, 5), index=lrange(1, 5)) - 
assert_series_equal(s, expected) - - del s[1] - expected = Series(lrange(2, 5), index=lrange(2, 5)) - assert_series_equal(s, expected) - - # empty - s = Series() - - def f(): - del s[0] - - pytest.raises(KeyError, f) - - # only 1 left, del, add, del - s = Series(1) - del s[0] - assert_series_equal(s, Series(dtype='int64', index=Index( - [], dtype='int64'))) - s[0] = 1 - assert_series_equal(s, Series(1)) - del s[0] - assert_series_equal(s, Series(dtype='int64', index=Index( - [], dtype='int64'))) - - # Index(dtype=object) - s = Series(1, index=['a']) - del s['a'] - assert_series_equal(s, Series(dtype='int64', index=Index( - [], dtype='object'))) - s['a'] = 1 - assert_series_equal(s, Series(1, index=['a'])) - del s['a'] - assert_series_equal(s, Series(dtype='int64', index=Index( - [], dtype='object'))) - - def test_getitem_setitem_ellipsis(self): - s = Series(np.random.randn(10)) - - np.fix(s) - - result = s[...] - assert_series_equal(result, s) - - s[...] = 5 - assert (result == 5).all() - - def test_getitem_negative_out_of_bounds(self): - s = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10)) - - pytest.raises(IndexError, s.__getitem__, -11) - pytest.raises(IndexError, s.__setitem__, -11, 'foo') - - def test_pop(self): - # GH 6600 - df = DataFrame({'A': 0, 'B': np.arange(5, dtype='int64'), 'C': 0, }) - k = df.iloc[4] - - result = k.pop('B') - assert result == 4 - - expected = Series([0, 0], index=['A', 'C'], name=4) - assert_series_equal(k, expected) - - def test_getitem_get(self): - idx1 = self.series.index[5] - idx2 = self.objSeries.index[5] - - assert self.series[idx1] == self.series.get(idx1) - assert self.objSeries[idx2] == self.objSeries.get(idx2) - - assert self.series[idx1] == self.series[5] - assert self.objSeries[idx2] == self.objSeries[5] - - assert self.series.get(-1) == self.series.get(self.series.index[-1]) - assert self.series[5] == self.series.get(self.series.index[5]) - - # missing - d = self.ts.index[0] - BDay() - pytest.raises(KeyError, self.ts.__getitem__, d) - - # None - # GH 5652 - for s in [Series(), Series(index=list('abc'))]: - result = s.get(None) - assert result is None - - def test_iloc(self): - - s = Series(np.random.randn(10), index=lrange(0, 20, 2)) - - for i in range(len(s)): - result = s.iloc[i] - exp = s[s.index[i]] - assert_almost_equal(result, exp) - - # pass a slice - result = s.iloc[slice(1, 3)] - expected = s.loc[2:4] - assert_series_equal(result, expected) - - # test slice is a view - result[:] = 0 - assert (s[1:3] == 0).all() - - # list of integers - result = s.iloc[[0, 2, 3, 4, 5]] - expected = s.reindex(s.index[[0, 2, 3, 4, 5]]) - assert_series_equal(result, expected) - - def test_iloc_nonunique(self): - s = Series([0, 1, 2], index=[0, 1, 0]) - assert s.iloc[2] == 2 - - def test_getitem_regression(self): - s = Series(lrange(5), index=lrange(5)) - result = s[lrange(5)] - assert_series_equal(result, s) - - def test_getitem_setitem_slice_bug(self): - s = Series(lrange(10), lrange(10)) - result = s[-12:] - assert_series_equal(result, s) - - result = s[-7:] - assert_series_equal(result, s[3:]) - - result = s[:-12] - assert_series_equal(result, s[:0]) - - s = Series(lrange(10), lrange(10)) - s[-12:] = 0 - assert (s == 0).all() - - s[:-12] = 5 - assert (s == 0).all() - - def test_getitem_int64(self): - idx = np.int64(5) - assert self.ts[idx] == self.ts[5] - - def test_getitem_fancy(self): - slice1 = self.series[[1, 2, 3]] - slice2 = self.objSeries[[1, 2, 3]] - assert self.series.index[2] == slice1.index[1] - assert self.objSeries.index[2] == 
slice2.index[1] - assert self.series[2] == slice1[1] - assert self.objSeries[2] == slice2[1] - - def test_getitem_boolean(self): - s = self.series - mask = s > s.median() - - # passing list is OK - result = s[list(mask)] - expected = s[mask] - assert_series_equal(result, expected) - tm.assert_index_equal(result.index, s.index[mask]) - - def test_getitem_boolean_empty(self): - s = Series([], dtype=np.int64) - s.index.name = 'index_name' - s = s[s.isna()] - assert s.index.name == 'index_name' - assert s.dtype == np.int64 - - # GH5877 - # indexing with empty series - s = Series(['A', 'B']) - expected = Series(np.nan, index=['C'], dtype=object) - result = s[Series(['C'], dtype=object)] - assert_series_equal(result, expected) - - s = Series(['A', 'B']) - expected = Series(dtype=object, index=Index([], dtype='int64')) - result = s[Series([], dtype=object)] - assert_series_equal(result, expected) - - # invalid because of the boolean indexer - # that's empty or not-aligned - def f(): - s[Series([], dtype=bool)] - - pytest.raises(IndexingError, f) - - def f(): - s[Series([True], dtype=bool)] - - pytest.raises(IndexingError, f) - - def test_getitem_generator(self): - gen = (x > 0 for x in self.series) - result = self.series[gen] - result2 = self.series[iter(self.series > 0)] - expected = self.series[self.series > 0] - assert_series_equal(result, expected) - assert_series_equal(result2, expected) - - def test_type_promotion(self): - # GH12599 - s = pd.Series() - s["a"] = pd.Timestamp("2016-01-01") - s["b"] = 3.0 - s["c"] = "foo" - expected = Series([pd.Timestamp("2016-01-01"), 3.0, "foo"], - index=["a", "b", "c"]) - assert_series_equal(s, expected) - - def test_getitem_boolean_object(self): - # using column from DataFrame - - s = self.series - mask = s > s.median() - omask = mask.astype(object) - - # getitem - result = s[omask] - expected = s[mask] - assert_series_equal(result, expected) - - # setitem - s2 = s.copy() - cop = s.copy() - cop[omask] = 5 - s2[mask] = 5 - assert_series_equal(cop, s2) - - # nans raise exception - omask[5:10] = np.nan - pytest.raises(Exception, s.__getitem__, omask) - pytest.raises(Exception, s.__setitem__, omask, 5) - - def test_getitem_setitem_boolean_corner(self): - ts = self.ts - mask_shifted = ts.shift(1, freq=BDay()) > ts.median() - - # these used to raise...?? 
- - pytest.raises(Exception, ts.__getitem__, mask_shifted) - pytest.raises(Exception, ts.__setitem__, mask_shifted, 1) - # ts[mask_shifted] - # ts[mask_shifted] = 1 - - pytest.raises(Exception, ts.loc.__getitem__, mask_shifted) - pytest.raises(Exception, ts.loc.__setitem__, mask_shifted, 1) - # ts.loc[mask_shifted] - # ts.loc[mask_shifted] = 2 - - def test_getitem_setitem_slice_integers(self): - s = Series(np.random.randn(8), index=[2, 4, 6, 8, 10, 12, 14, 16]) - - result = s[:4] - expected = s.reindex([2, 4, 6, 8]) - assert_series_equal(result, expected) - - s[:4] = 0 - assert (s[:4] == 0).all() - assert not (s[4:] == 0).any() - - def test_getitem_setitem_datetime_tz_pytz(self): - from pytz import timezone as tz - from pandas import date_range - - N = 50 - # testing with timezone, GH #2785 - rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') - ts = Series(np.random.randn(N), index=rng) - - # also test Timestamp tz handling, GH #2789 - result = ts.copy() - result["1990-01-01 09:00:00+00:00"] = 0 - result["1990-01-01 09:00:00+00:00"] = ts[4] - assert_series_equal(result, ts) - - result = ts.copy() - result["1990-01-01 03:00:00-06:00"] = 0 - result["1990-01-01 03:00:00-06:00"] = ts[4] - assert_series_equal(result, ts) - - # repeat with datetimes - result = ts.copy() - result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0 - result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4] - assert_series_equal(result, ts) - - result = ts.copy() - - # comparison dates with datetime MUST be localized! - date = tz('US/Central').localize(datetime(1990, 1, 1, 3)) - result[date] = 0 - result[date] = ts[4] - assert_series_equal(result, ts) - - def test_getitem_setitem_datetime_tz_dateutil(self): - from dateutil.tz import tzutc - from pandas._libs.tslibs.timezones import dateutil_gettz as gettz - - tz = lambda x: tzutc() if x == 'UTC' else gettz( - x) # handle special case for utc in dateutil - - from pandas import date_range - - N = 50 - - # testing with timezone, GH #2785 - rng = date_range('1/1/1990', periods=N, freq='H', - tz='America/New_York') - ts = Series(np.random.randn(N), index=rng) - - # also test Timestamp tz handling, GH #2789 - result = ts.copy() - result["1990-01-01 09:00:00+00:00"] = 0 - result["1990-01-01 09:00:00+00:00"] = ts[4] - assert_series_equal(result, ts) - - result = ts.copy() - result["1990-01-01 03:00:00-06:00"] = 0 - result["1990-01-01 03:00:00-06:00"] = ts[4] - assert_series_equal(result, ts) - - # repeat with datetimes - result = ts.copy() - result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0 - result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4] - assert_series_equal(result, ts) - - result = ts.copy() - result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = 0 - result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = ts[4] - assert_series_equal(result, ts) - - def test_getitem_setitem_datetimeindex(self): - N = 50 - # testing with timezone, GH #2785 - rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') - ts = Series(np.random.randn(N), index=rng) - - result = ts["1990-01-01 04:00:00"] - expected = ts[4] - assert result == expected - - result = ts.copy() - result["1990-01-01 04:00:00"] = 0 - result["1990-01-01 04:00:00"] = ts[4] - assert_series_equal(result, ts) - - result = ts["1990-01-01 04:00:00":"1990-01-01 07:00:00"] - expected = ts[4:8] - assert_series_equal(result, expected) - - result = ts.copy() - result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = 0 - result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = ts[4:8] 
- assert_series_equal(result, ts) - - lb = "1990-01-01 04:00:00" - rb = "1990-01-01 07:00:00" - # GH#18435 strings get a pass from tzawareness compat - result = ts[(ts.index >= lb) & (ts.index <= rb)] - expected = ts[4:8] - assert_series_equal(result, expected) - - lb = "1990-01-01 04:00:00-0500" - rb = "1990-01-01 07:00:00-0500" - result = ts[(ts.index >= lb) & (ts.index <= rb)] - expected = ts[4:8] - assert_series_equal(result, expected) - - # repeat all the above with naive datetimes - result = ts[datetime(1990, 1, 1, 4)] - expected = ts[4] - assert result == expected - - result = ts.copy() - result[datetime(1990, 1, 1, 4)] = 0 - result[datetime(1990, 1, 1, 4)] = ts[4] - assert_series_equal(result, ts) - - result = ts[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] - expected = ts[4:8] - assert_series_equal(result, expected) - - result = ts.copy() - result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = 0 - result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = ts[4:8] - assert_series_equal(result, ts) - - lb = datetime(1990, 1, 1, 4) - rb = datetime(1990, 1, 1, 7) - with pytest.raises(TypeError): - # tznaive vs tzaware comparison is invalid - # see GH#18376, GH#18162 - ts[(ts.index >= lb) & (ts.index <= rb)] - - lb = pd.Timestamp(datetime(1990, 1, 1, 4)).tz_localize(rng.tzinfo) - rb = pd.Timestamp(datetime(1990, 1, 1, 7)).tz_localize(rng.tzinfo) - result = ts[(ts.index >= lb) & (ts.index <= rb)] - expected = ts[4:8] - assert_series_equal(result, expected) - - result = ts[ts.index[4]] - expected = ts[4] - assert result == expected - - result = ts[ts.index[4:8]] - expected = ts[4:8] - assert_series_equal(result, expected) - - result = ts.copy() - result[ts.index[4:8]] = 0 - result[4:8] = ts[4:8] - assert_series_equal(result, ts) - - # also test partial date slicing - result = ts["1990-01-02"] - expected = ts[24:48] - assert_series_equal(result, expected) - - result = ts.copy() - result["1990-01-02"] = 0 - result["1990-01-02"] = ts[24:48] - assert_series_equal(result, ts) - - def test_getitem_setitem_periodindex(self): - from pandas import period_range - - N = 50 - rng = period_range('1/1/1990', periods=N, freq='H') - ts = Series(np.random.randn(N), index=rng) - - result = ts["1990-01-01 04"] - expected = ts[4] - assert result == expected - - result = ts.copy() - result["1990-01-01 04"] = 0 - result["1990-01-01 04"] = ts[4] - assert_series_equal(result, ts) - - result = ts["1990-01-01 04":"1990-01-01 07"] - expected = ts[4:8] - assert_series_equal(result, expected) - - result = ts.copy() - result["1990-01-01 04":"1990-01-01 07"] = 0 - result["1990-01-01 04":"1990-01-01 07"] = ts[4:8] - assert_series_equal(result, ts) - - lb = "1990-01-01 04" - rb = "1990-01-01 07" - result = ts[(ts.index >= lb) & (ts.index <= rb)] - expected = ts[4:8] - assert_series_equal(result, expected) - - # GH 2782 - result = ts[ts.index[4]] - expected = ts[4] - assert result == expected - - result = ts[ts.index[4:8]] - expected = ts[4:8] - assert_series_equal(result, expected) - - result = ts.copy() - result[ts.index[4:8]] = 0 - result[4:8] = ts[4:8] - assert_series_equal(result, ts) - - @pytest.mark.parametrize( - 'result_1, duplicate_item, expected_1', - [ - [ - pd.Series({1: 12, 2: [1, 2, 2, 3]}), pd.Series({1: 313}), - pd.Series({1: 12, }, dtype=object), - ], - [ - pd.Series({1: [1, 2, 3], 2: [1, 2, 2, 3]}), - pd.Series({1: [1, 2, 3]}), pd.Series({1: [1, 2, 3], }), - ], - ]) - def test_getitem_with_duplicates_indices( - self, result_1, duplicate_item, expected_1): - # GH 17610 - result = 
result_1.append(duplicate_item) - expected = expected_1.append(duplicate_item) - assert_series_equal(result[1], expected) - assert result[2] == result_1[2] - - def test_getitem_median_slice_bug(self): - index = date_range('20090415', '20090519', freq='2B') - s = Series(np.random.randn(13), index=index) - - indexer = [slice(6, 7, None)] - result = s[indexer] - expected = s[indexer[0]] - assert_series_equal(result, expected) - - def test_getitem_out_of_bounds(self): - # don't segfault, GH #495 - pytest.raises(IndexError, self.ts.__getitem__, len(self.ts)) - - # GH #917 - s = Series([]) - pytest.raises(IndexError, s.__getitem__, -1) - - def test_getitem_setitem_integers(self): - # caused bug without test - s = Series([1, 2, 3], ['a', 'b', 'c']) - - assert s.iloc[0] == s['a'] - s.iloc[0] = 5 - tm.assert_almost_equal(s['a'], 5) - - def test_getitem_box_float64(self): - value = self.ts[5] - assert isinstance(value, np.float64) - - def test_series_box_timestamp(self): - rng = pd.date_range('20090415', '20090519', freq='B') - ser = Series(rng) - - assert isinstance(ser[5], pd.Timestamp) - - rng = pd.date_range('20090415', '20090519', freq='B') - ser = Series(rng, index=rng) - assert isinstance(ser[5], pd.Timestamp) - - assert isinstance(ser.iat[5], pd.Timestamp) - - def test_getitem_ambiguous_keyerror(self): - s = Series(lrange(10), index=lrange(0, 20, 2)) - pytest.raises(KeyError, s.__getitem__, 1) - pytest.raises(KeyError, s.loc.__getitem__, 1) - - def test_getitem_unordered_dup(self): - obj = Series(lrange(5), index=['c', 'a', 'a', 'b', 'b']) - assert is_scalar(obj['c']) - assert obj['c'] == 0 - - def test_getitem_dups_with_missing(self): - - # breaks reindex, so need to use .loc internally - # GH 4246 - s = Series([1, 2, 3, 4], ['foo', 'bar', 'foo', 'bah']) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - expected = s.loc[['foo', 'bar', 'bah', 'bam']] - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = s[['foo', 'bar', 'bah', 'bam']] - assert_series_equal(result, expected) - - def test_getitem_dups(self): - s = Series(range(5), index=['A', 'A', 'B', 'C', 'C'], dtype=np.int64) - expected = Series([3, 4], index=['C', 'C'], dtype=np.int64) - result = s['C'] - assert_series_equal(result, expected) - - def test_getitem_dataframe(self): - rng = list(range(10)) - s = pd.Series(10, index=rng) - df = pd.DataFrame(rng, index=rng) - pytest.raises(TypeError, s.__getitem__, df > 5) - - def test_getitem_callable(self): - # GH 12533 - s = pd.Series(4, index=list('ABCD')) - result = s[lambda x: 'A'] - assert result == s.loc['A'] - - result = s[lambda x: ['A', 'B']] - tm.assert_series_equal(result, s.loc[['A', 'B']]) - - result = s[lambda x: [True, False, True, True]] - tm.assert_series_equal(result, s.iloc[[0, 2, 3]]) - - def test_setitem_ambiguous_keyerror(self): - s = Series(lrange(10), index=lrange(0, 20, 2)) - - # equivalent of an append - s2 = s.copy() - s2[1] = 5 - expected = s.append(Series([5], index=[1])) - assert_series_equal(s2, expected) - - s2 = s.copy() - s2.loc[1] = 5 - expected = s.append(Series([5], index=[1])) - assert_series_equal(s2, expected) - - def test_setitem_float_labels(self): - # note labels are floats - s = Series(['a', 'b', 'c'], index=[0, 0.5, 1]) - tmp = s.copy() - - s.loc[1] = 'zoo' - tmp.iloc[2] = 'zoo' - - assert_series_equal(s, tmp) - - def test_setitem_callable(self): - # GH 12533 - s = pd.Series([1, 2, 3, 4], index=list('ABCD')) - s[lambda x: 'A'] = -1 - tm.assert_series_equal(s, pd.Series([-1, 2, 3, 4], 
index=list('ABCD'))) - - def test_setitem_other_callable(self): - # GH 13299 - inc = lambda x: x + 1 - - s = pd.Series([1, 2, -1, 4]) - s[s < 0] = inc - - expected = pd.Series([1, 2, inc, 4]) - tm.assert_series_equal(s, expected) - - def test_slice(self): - numSlice = self.series[10:20] - numSliceEnd = self.series[-10:] - objSlice = self.objSeries[10:20] - - assert self.series.index[9] not in numSlice.index - assert self.objSeries.index[9] not in objSlice.index - - assert len(numSlice) == len(numSlice.index) - assert self.series[numSlice.index[0]] == numSlice[numSlice.index[0]] - - assert numSlice.index[1] == self.series.index[11] - assert tm.equalContents(numSliceEnd, np.array(self.series)[-10:]) - - # Test return view. - sl = self.series[10:20] - sl[:] = 0 - - assert (self.series[10:20] == 0).all() - - def test_slice_can_reorder_not_uniquely_indexed(self): - s = Series(1, index=['a', 'a', 'b', 'b', 'c']) - s[::-1] # it works! - - def test_slice_float_get_set(self): - - pytest.raises(TypeError, lambda: self.ts[4.0:10.0]) - - def f(): - self.ts[4.0:10.0] = 0 - - pytest.raises(TypeError, f) - - pytest.raises(TypeError, self.ts.__getitem__, slice(4.5, 10.0)) - pytest.raises(TypeError, self.ts.__setitem__, slice(4.5, 10.0), 0) - - def test_slice_floats2(self): - s = Series(np.random.rand(10), index=np.arange(10, 20, dtype=float)) - - assert len(s.loc[12.0:]) == 8 - assert len(s.loc[12.5:]) == 7 - - i = np.arange(10, 20, dtype=float) - i[2] = 12.2 - s.index = i - assert len(s.loc[12.0:]) == 8 - assert len(s.loc[12.5:]) == 7 - - def test_slice_float64(self): - - values = np.arange(10., 50., 2) - index = Index(values) - - start, end = values[[5, 15]] - - s = Series(np.random.randn(20), index=index) - - result = s[start:end] - expected = s.iloc[5:16] - assert_series_equal(result, expected) - - result = s.loc[start:end] - assert_series_equal(result, expected) - - df = DataFrame(np.random.randn(20, 3), index=index) - - result = df[start:end] - expected = df.iloc[5:16] - tm.assert_frame_equal(result, expected) - - result = df.loc[start:end] - tm.assert_frame_equal(result, expected) - - def test_setitem(self): - self.ts[self.ts.index[5]] = np.NaN - self.ts[[1, 2, 17]] = np.NaN - self.ts[6] = np.NaN - assert np.isnan(self.ts[6]) - assert np.isnan(self.ts[2]) - self.ts[np.isnan(self.ts)] = 5 - assert not np.isnan(self.ts[2]) - - # caught this bug when writing tests - series = Series(tm.makeIntIndex(20).astype(float), - index=tm.makeIntIndex(20)) - - series[::2] = 0 - assert (series[::2] == 0).all() - - # set item that's not contained - s = self.series.copy() - s['foobar'] = 1 - - app = Series([1], index=['foobar'], name='series') - expected = self.series.append(app) - assert_series_equal(s, expected) - - # Test for issue #10193 - key = pd.Timestamp('2012-01-01') - series = pd.Series() - series[key] = 47 - expected = pd.Series(47, [key]) - assert_series_equal(series, expected) - - series = pd.Series([], pd.DatetimeIndex([], freq='D')) - series[key] = 47 - expected = pd.Series(47, pd.DatetimeIndex([key], freq='D')) - assert_series_equal(series, expected) - - def test_setitem_dtypes(self): - - # change dtypes - # GH 4463 - expected = Series([np.nan, 2, 3]) - - s = Series([1, 2, 3]) - s.iloc[0] = np.nan - assert_series_equal(s, expected) - - s = Series([1, 2, 3]) - s.loc[0] = np.nan - assert_series_equal(s, expected) - - s = Series([1, 2, 3]) - s[0] = np.nan - assert_series_equal(s, expected) - - s = Series([False]) - s.loc[0] = np.nan - assert_series_equal(s, Series([np.nan])) - - s = Series([False, 
True]) - s.loc[0] = np.nan - assert_series_equal(s, Series([np.nan, 1.0])) - - def test_set_value(self): - idx = self.ts.index[10] - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res = self.ts.set_value(idx, 0) - assert res is self.ts - assert self.ts[idx] == 0 - - # equiv - s = self.series.copy() - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res = s.set_value('foobar', 0) - assert res is s - assert res.index[-1] == 'foobar' - assert res['foobar'] == 0 - - s = self.series.copy() - s.loc['foobar'] = 0 - assert s.index[-1] == 'foobar' - assert s['foobar'] == 0 - - def test_setslice(self): - sl = self.ts[5:20] - assert len(sl) == len(sl.index) - assert sl.index.is_unique - - def test_basic_getitem_setitem_corner(self): - # invalid tuples, e.g. self.ts[:, None] vs. self.ts[:, 2] - with tm.assert_raises_regex(ValueError, 'tuple-index'): - self.ts[:, 2] - with tm.assert_raises_regex(ValueError, 'tuple-index'): - self.ts[:, 2] = 2 - - # weird lists. [slice(0, 5)] will work but not two slices - result = self.ts[[slice(None, 5)]] - expected = self.ts[:5] - assert_series_equal(result, expected) - - # OK - pytest.raises(Exception, self.ts.__getitem__, - [5, slice(None, None)]) - pytest.raises(Exception, self.ts.__setitem__, - [5, slice(None, None)], 2) - - def test_basic_getitem_with_labels(self): - indices = self.ts.index[[5, 10, 15]] - - result = self.ts[indices] - expected = self.ts.reindex(indices) - assert_series_equal(result, expected) - - result = self.ts[indices[0]:indices[2]] - expected = self.ts.loc[indices[0]:indices[2]] - assert_series_equal(result, expected) - - # integer indexes, be careful - s = Series(np.random.randn(10), index=lrange(0, 20, 2)) - inds = [0, 2, 5, 7, 8] - arr_inds = np.array([0, 2, 5, 7, 8]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = s[inds] - expected = s.reindex(inds) - assert_series_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = s[arr_inds] - expected = s.reindex(arr_inds) - assert_series_equal(result, expected) - - # GH12089 - # with tz for values - s = Series(pd.date_range("2011-01-01", periods=3, tz="US/Eastern"), - index=['a', 'b', 'c']) - expected = Timestamp('2011-01-01', tz='US/Eastern') - result = s.loc['a'] - assert result == expected - result = s.iloc[0] - assert result == expected - result = s['a'] - assert result == expected - - def test_basic_setitem_with_labels(self): - indices = self.ts.index[[5, 10, 15]] - - cp = self.ts.copy() - exp = self.ts.copy() - cp[indices] = 0 - exp.loc[indices] = 0 - assert_series_equal(cp, exp) - - cp = self.ts.copy() - exp = self.ts.copy() - cp[indices[0]:indices[2]] = 0 - exp.loc[indices[0]:indices[2]] = 0 - assert_series_equal(cp, exp) - - # integer indexes, be careful - s = Series(np.random.randn(10), index=lrange(0, 20, 2)) - inds = [0, 4, 6] - arr_inds = np.array([0, 4, 6]) - - cp = s.copy() - exp = s.copy() - s[inds] = 0 - s.loc[inds] = 0 - assert_series_equal(cp, exp) - - cp = s.copy() - exp = s.copy() - s[arr_inds] = 0 - s.loc[arr_inds] = 0 - assert_series_equal(cp, exp) - - inds_notfound = [0, 4, 5, 6] - arr_inds_notfound = np.array([0, 4, 5, 6]) - pytest.raises(Exception, s.__setitem__, inds_notfound, 0) - pytest.raises(Exception, s.__setitem__, arr_inds_notfound, 0) - - # GH12089 - # with tz for values - s = Series(pd.date_range("2011-01-01", periods=3, tz="US/Eastern"), - index=['a', 'b', 'c']) - s2 = s.copy() - expected = 
Timestamp('2011-01-03', tz='US/Eastern') - s2.loc['a'] = expected - result = s2.loc['a'] - assert result == expected - - s2 = s.copy() - s2.iloc[0] = expected - result = s2.iloc[0] - assert result == expected - - s2 = s.copy() - s2['a'] = expected - result = s2['a'] - assert result == expected - - def test_loc_getitem(self): - inds = self.series.index[[3, 4, 7]] - assert_series_equal(self.series.loc[inds], self.series.reindex(inds)) - assert_series_equal(self.series.iloc[5::2], self.series[5::2]) - - # slice with indices - d1, d2 = self.ts.index[[5, 15]] - result = self.ts.loc[d1:d2] - expected = self.ts.truncate(d1, d2) - assert_series_equal(result, expected) - - # boolean - mask = self.series > self.series.median() - assert_series_equal(self.series.loc[mask], self.series[mask]) - - # ask for index value - assert self.ts.loc[d1] == self.ts[d1] - assert self.ts.loc[d2] == self.ts[d2] - - def test_loc_getitem_not_monotonic(self): - d1, d2 = self.ts.index[[5, 15]] - - ts2 = self.ts[::2][[1, 2, 0]] - - pytest.raises(KeyError, ts2.loc.__getitem__, slice(d1, d2)) - pytest.raises(KeyError, ts2.loc.__setitem__, slice(d1, d2), 0) - - def test_loc_getitem_setitem_integer_slice_keyerrors(self): - s = Series(np.random.randn(10), index=lrange(0, 20, 2)) - - # this is OK - cp = s.copy() - cp.iloc[4:10] = 0 - assert (cp.iloc[4:10] == 0).all() - - # so is this - cp = s.copy() - cp.iloc[3:11] = 0 - assert (cp.iloc[3:11] == 0).values.all() - - result = s.iloc[2:6] - result2 = s.loc[3:11] - expected = s.reindex([4, 6, 8, 10]) - - assert_series_equal(result, expected) - assert_series_equal(result2, expected) - - # non-monotonic, raise KeyError - s2 = s.iloc[lrange(5) + lrange(5, 10)[::-1]] - pytest.raises(KeyError, s2.loc.__getitem__, slice(3, 11)) - pytest.raises(KeyError, s2.loc.__setitem__, slice(3, 11), 0) - - def test_loc_getitem_iterator(self): - idx = iter(self.series.index[:10]) - result = self.series.loc[idx] - assert_series_equal(result, self.series[:10]) - - def test_setitem_with_tz(self): - for tz in ['US/Eastern', 'UTC', 'Asia/Tokyo']: - orig = pd.Series(pd.date_range('2016-01-01', freq='H', periods=3, - tz=tz)) - assert orig.dtype == 'datetime64[ns, {0}]'.format(tz) - - # scalar - s = orig.copy() - s[1] = pd.Timestamp('2011-01-01', tz=tz) - exp = pd.Series([pd.Timestamp('2016-01-01 00:00', tz=tz), - pd.Timestamp('2011-01-01 00:00', tz=tz), - pd.Timestamp('2016-01-01 02:00', tz=tz)]) - tm.assert_series_equal(s, exp) - - s = orig.copy() - s.loc[1] = pd.Timestamp('2011-01-01', tz=tz) - tm.assert_series_equal(s, exp) - - s = orig.copy() - s.iloc[1] = pd.Timestamp('2011-01-01', tz=tz) - tm.assert_series_equal(s, exp) - - # vector - vals = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2012-01-01', tz=tz)], index=[1, 2]) - assert vals.dtype == 'datetime64[ns, {0}]'.format(tz) - - s[[1, 2]] = vals - exp = pd.Series([pd.Timestamp('2016-01-01 00:00', tz=tz), - pd.Timestamp('2011-01-01 00:00', tz=tz), - pd.Timestamp('2012-01-01 00:00', tz=tz)]) - tm.assert_series_equal(s, exp) - - s = orig.copy() - s.loc[[1, 2]] = vals - tm.assert_series_equal(s, exp) - - s = orig.copy() - s.iloc[[1, 2]] = vals - tm.assert_series_equal(s, exp) - - def test_setitem_with_tz_dst(self): - # GH XXX - tz = 'US/Eastern' - orig = pd.Series(pd.date_range('2016-11-06', freq='H', periods=3, - tz=tz)) - assert orig.dtype == 'datetime64[ns, {0}]'.format(tz) - - # scalar - s = orig.copy() - s[1] = pd.Timestamp('2011-01-01', tz=tz) - exp = pd.Series([pd.Timestamp('2016-11-06 00:00-04:00', tz=tz), - 
pd.Timestamp('2011-01-01 00:00-05:00', tz=tz), - pd.Timestamp('2016-11-06 01:00-05:00', tz=tz)]) - tm.assert_series_equal(s, exp) - - s = orig.copy() - s.loc[1] = pd.Timestamp('2011-01-01', tz=tz) - tm.assert_series_equal(s, exp) - - s = orig.copy() - s.iloc[1] = pd.Timestamp('2011-01-01', tz=tz) - tm.assert_series_equal(s, exp) - - # vector - vals = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2012-01-01', tz=tz)], index=[1, 2]) - assert vals.dtype == 'datetime64[ns, {0}]'.format(tz) - - s[[1, 2]] = vals - exp = pd.Series([pd.Timestamp('2016-11-06 00:00', tz=tz), - pd.Timestamp('2011-01-01 00:00', tz=tz), - pd.Timestamp('2012-01-01 00:00', tz=tz)]) - tm.assert_series_equal(s, exp) - - s = orig.copy() - s.loc[[1, 2]] = vals - tm.assert_series_equal(s, exp) - - s = orig.copy() - s.iloc[[1, 2]] = vals - tm.assert_series_equal(s, exp) - - def test_take(self): - s = Series([-1, 5, 6, 2, 4]) - - actual = s.take([1, 3, 4]) - expected = Series([5, 2, 4], index=[1, 3, 4]) - tm.assert_series_equal(actual, expected) - - actual = s.take([-1, 3, 4]) - expected = Series([4, 2, 4], index=[4, 3, 4]) - tm.assert_series_equal(actual, expected) - - pytest.raises(IndexError, s.take, [1, 10]) - pytest.raises(IndexError, s.take, [2, 5]) - - with tm.assert_produces_warning(FutureWarning): - s.take([-1, 3, 4], convert=False) - - def test_where_raise_on_error_deprecation(self): - - # gh-14968 - # deprecation of raise_on_error - s = Series(np.random.randn(5)) - cond = s > 0 - with tm.assert_produces_warning(FutureWarning): - s.where(cond, raise_on_error=True) - with tm.assert_produces_warning(FutureWarning): - s.mask(cond, raise_on_error=True) - - def test_where(self): - s = Series(np.random.randn(5)) - cond = s > 0 - - rs = s.where(cond).dropna() - rs2 = s[cond] - assert_series_equal(rs, rs2) - - rs = s.where(cond, -s) - assert_series_equal(rs, s.abs()) - - rs = s.where(cond) - assert (s.shape == rs.shape) - assert (rs is not s) - - # test alignment - cond = Series([True, False, False, True, False], index=s.index) - s2 = -(s.abs()) - - expected = s2[cond].reindex(s2.index[:3]).reindex(s2.index) - rs = s2.where(cond[:3]) - assert_series_equal(rs, expected) - - expected = s2.abs() - expected.iloc[0] = s2[0] - rs = s2.where(cond[:3], -s2) - assert_series_equal(rs, expected) - - def test_where_error(self): - - s = Series(np.random.randn(5)) - cond = s > 0 - - pytest.raises(ValueError, s.where, 1) - pytest.raises(ValueError, s.where, cond[:3].values, -s) - - # GH 2745 - s = Series([1, 2]) - s[[True, False]] = [0, 1] - expected = Series([0, 2]) - assert_series_equal(s, expected) - - # failures - pytest.raises(ValueError, s.__setitem__, tuple([[[True, False]]]), - [0, 2, 3]) - pytest.raises(ValueError, s.__setitem__, tuple([[[True, False]]]), - []) - - def test_where_unsafe(self): - - # unsafe dtype changes - for dtype in [np.int8, np.int16, np.int32, np.int64, np.float16, - np.float32, np.float64]: - s = Series(np.arange(10), dtype=dtype) - mask = s < 5 - s[mask] = lrange(2, 7) - expected = Series(lrange(2, 7) + lrange(5, 10), dtype=dtype) - assert_series_equal(s, expected) - assert s.dtype == expected.dtype - - # these are allowed operations, but are upcasted - for dtype in [np.int64, np.float64]: - s = Series(np.arange(10), dtype=dtype) - mask = s < 5 - values = [2.5, 3.5, 4.5, 5.5, 6.5] - s[mask] = values - expected = Series(values + lrange(5, 10), dtype='float64') - assert_series_equal(s, expected) - assert s.dtype == expected.dtype - - # GH 9731 - s = Series(np.arange(10), dtype='int64') - mask = 
s > 5 - values = [2.5, 3.5, 4.5, 5.5] - s[mask] = values - expected = Series(lrange(6) + values, dtype='float64') - assert_series_equal(s, expected) - - # can't do these as we are forced to change the itemsize of the input - # to something we cannot - for dtype in [np.int8, np.int16, np.int32, np.float16, np.float32]: - s = Series(np.arange(10), dtype=dtype) - mask = s < 5 - values = [2.5, 3.5, 4.5, 5.5, 6.5] - pytest.raises(Exception, s.__setitem__, tuple(mask), values) - - # GH3235 - s = Series(np.arange(10), dtype='int64') - mask = s < 5 - s[mask] = lrange(2, 7) - expected = Series(lrange(2, 7) + lrange(5, 10), dtype='int64') - assert_series_equal(s, expected) - assert s.dtype == expected.dtype - - s = Series(np.arange(10), dtype='int64') - mask = s > 5 - s[mask] = [0] * 4 - expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype='int64') - assert_series_equal(s, expected) - - s = Series(np.arange(10)) - mask = s > 5 - - def f(): - s[mask] = [5, 4, 3, 2, 1] - - pytest.raises(ValueError, f) - - def f(): - s[mask] = [0] * 5 - - pytest.raises(ValueError, f) - - # dtype changes - s = Series([1, 2, 3, 4]) - result = s.where(s > 2, np.nan) - expected = Series([np.nan, np.nan, 3, 4]) - assert_series_equal(result, expected) - - # GH 4667 - # setting with None changes dtype - s = Series(range(10)).astype(float) - s[8] = None - result = s[8] - assert isna(result) - - s = Series(range(10)).astype(float) - s[s > 8] = None - result = s[isna(s)] - expected = Series(np.nan, index=[9]) - assert_series_equal(result, expected) - - def test_where_array_like(self): - # see gh-15414 - s = Series([1, 2, 3]) - cond = [False, True, True] - expected = Series([np.nan, 2, 3]) - klasses = [list, tuple, np.array, Series] - - for klass in klasses: - result = s.where(klass(cond)) - assert_series_equal(result, expected) - - def test_where_invalid_input(self): - # see gh-15414: only boolean arrays accepted - s = Series([1, 2, 3]) - msg = "Boolean array expected for the condition" - - conds = [ - [1, 0, 1], - Series([2, 5, 7]), - ["True", "False", "True"], - [Timestamp("2017-01-01"), - pd.NaT, Timestamp("2017-01-02")] - ] - - for cond in conds: - with tm.assert_raises_regex(ValueError, msg): - s.where(cond) - - msg = "Array conditional must be same shape as self" - with tm.assert_raises_regex(ValueError, msg): - s.where([True]) - - def test_where_ndframe_align(self): - msg = "Array conditional must be same shape as self" - s = Series([1, 2, 3]) - - cond = [True] - with tm.assert_raises_regex(ValueError, msg): - s.where(cond) - - expected = Series([1, np.nan, np.nan]) - - out = s.where(Series(cond)) - tm.assert_series_equal(out, expected) - - cond = np.array([False, True, False, True]) - with tm.assert_raises_regex(ValueError, msg): - s.where(cond) - - expected = Series([np.nan, 2, np.nan]) - - out = s.where(Series(cond)) - tm.assert_series_equal(out, expected) - - def test_where_setitem_invalid(self): - - # GH 2702 - # make sure correct exceptions are raised on invalid list assignment - - # slice - s = Series(list('abc')) - - def f(): - s[0:3] = list(range(27)) - - pytest.raises(ValueError, f) - - s[0:3] = list(range(3)) - expected = Series([0, 1, 2]) - assert_series_equal(s.astype(np.int64), expected, ) - - # slice with step - s = Series(list('abcdef')) - - def f(): - s[0:4:2] = list(range(27)) - - pytest.raises(ValueError, f) - - s = Series(list('abcdef')) - s[0:4:2] = list(range(2)) - expected = Series([0, 'b', 1, 'd', 'e', 'f']) - assert_series_equal(s, expected) - - # neg slices - s = Series(list('abcdef')) - - def 
f(): - s[:-1] = list(range(27)) - - pytest.raises(ValueError, f) - - s[-3:-1] = list(range(2)) - expected = Series(['a', 'b', 'c', 0, 1, 'f']) - assert_series_equal(s, expected) - - # list - s = Series(list('abc')) - - def f(): - s[[0, 1, 2]] = list(range(27)) - - pytest.raises(ValueError, f) - - s = Series(list('abc')) - - def f(): - s[[0, 1, 2]] = list(range(2)) - - pytest.raises(ValueError, f) - - # scalar - s = Series(list('abc')) - s[0] = list(range(10)) - expected = Series([list(range(10)), 'b', 'c']) - assert_series_equal(s, expected) - - def test_where_broadcast(self): - # Test a variety of differently sized series - for size in range(2, 6): - # Test a variety of boolean indices - for selection in [ - # First element should be set - np.resize([True, False, False, False, False], size), - # Set alternating elements] - np.resize([True, False], size), - # No element should be set - np.resize([False], size)]: - - # Test a variety of different numbers as content - for item in [2.0, np.nan, np.finfo(np.float).max, - np.finfo(np.float).min]: - # Test numpy arrays, lists and tuples as the input to be - # broadcast - for arr in [np.array([item]), [item], (item, )]: - data = np.arange(size, dtype=float) - s = Series(data) - s[selection] = arr - # Construct the expected series by taking the source - # data or item based on the selection - expected = Series([item if use_item else data[ - i] for i, use_item in enumerate(selection)]) - assert_series_equal(s, expected) - - s = Series(data) - result = s.where(~selection, arr) - assert_series_equal(result, expected) - - def test_where_inplace(self): - s = Series(np.random.randn(5)) - cond = s > 0 - - rs = s.copy() - - rs.where(cond, inplace=True) - assert_series_equal(rs.dropna(), s[cond]) - assert_series_equal(rs, s.where(cond)) - - rs = s.copy() - rs.where(cond, -s, inplace=True) - assert_series_equal(rs, s.where(cond, -s)) - - def test_where_dups(self): - # GH 4550 - # where crashes with dups in index - s1 = Series(list(range(3))) - s2 = Series(list(range(3))) - comb = pd.concat([s1, s2]) - result = comb.where(comb < 2) - expected = Series([0, 1, np.nan, 0, 1, np.nan], - index=[0, 1, 2, 0, 1, 2]) - assert_series_equal(result, expected) - - # GH 4548 - # inplace updating not working with dups - comb[comb < 1] = 5 - expected = Series([5, 1, 2, 5, 1, 2], index=[0, 1, 2, 0, 1, 2]) - assert_series_equal(comb, expected) - - comb[comb < 2] += 10 - expected = Series([5, 11, 2, 5, 11, 2], index=[0, 1, 2, 0, 1, 2]) - assert_series_equal(comb, expected) - - def test_where_datetime_conversion(self): - s = Series(date_range('20130102', periods=2)) - expected = Series([10, 10]) - mask = np.array([False, False]) - - rs = s.where(mask, [10, 10]) - assert_series_equal(rs, expected) - - rs = s.where(mask, 10) - assert_series_equal(rs, expected) - - rs = s.where(mask, 10.0) - assert_series_equal(rs, expected) - - rs = s.where(mask, [10.0, 10.0]) - assert_series_equal(rs, expected) - - rs = s.where(mask, [10.0, np.nan]) - expected = Series([10, None], dtype='object') - assert_series_equal(rs, expected) - - # GH 15701 - timestamps = ['2016-12-31 12:00:04+00:00', - '2016-12-31 12:00:04.010000+00:00'] - s = Series([pd.Timestamp(t) for t in timestamps]) - rs = s.where(Series([False, True])) - expected = Series([pd.NaT, s[1]]) - assert_series_equal(rs, expected) - - def test_where_timedelta_coerce(self): - s = Series([1, 2], dtype='timedelta64[ns]') - expected = Series([10, 10]) - mask = np.array([False, False]) - - rs = s.where(mask, [10, 10]) - assert_series_equal(rs, 
expected) - - rs = s.where(mask, 10) - assert_series_equal(rs, expected) - - rs = s.where(mask, 10.0) - assert_series_equal(rs, expected) - - rs = s.where(mask, [10.0, 10.0]) - assert_series_equal(rs, expected) - - rs = s.where(mask, [10.0, np.nan]) - expected = Series([10, None], dtype='object') - assert_series_equal(rs, expected) - - def test_mask(self): - # compare with tested results in test_where - s = Series(np.random.randn(5)) - cond = s > 0 - - rs = s.where(~cond, np.nan) - assert_series_equal(rs, s.mask(cond)) - - rs = s.where(~cond) - rs2 = s.mask(cond) - assert_series_equal(rs, rs2) - - rs = s.where(~cond, -s) - rs2 = s.mask(cond, -s) - assert_series_equal(rs, rs2) - - cond = Series([True, False, False, True, False], index=s.index) - s2 = -(s.abs()) - rs = s2.where(~cond[:3]) - rs2 = s2.mask(cond[:3]) - assert_series_equal(rs, rs2) - - rs = s2.where(~cond[:3], -s2) - rs2 = s2.mask(cond[:3], -s2) - assert_series_equal(rs, rs2) - - pytest.raises(ValueError, s.mask, 1) - pytest.raises(ValueError, s.mask, cond[:3].values, -s) - - # dtype changes - s = Series([1, 2, 3, 4]) - result = s.mask(s > 2, np.nan) - expected = Series([1, 2, np.nan, np.nan]) - assert_series_equal(result, expected) - - def test_mask_broadcast(self): - # GH 8801 - # copied from test_where_broadcast - for size in range(2, 6): - for selection in [ - # First element should be set - np.resize([True, False, False, False, False], size), - # Set alternating elements] - np.resize([True, False], size), - # No element should be set - np.resize([False], size)]: - for item in [2.0, np.nan, np.finfo(np.float).max, - np.finfo(np.float).min]: - for arr in [np.array([item]), [item], (item, )]: - data = np.arange(size, dtype=float) - s = Series(data) - result = s.mask(selection, arr) - expected = Series([item if use_item else data[ - i] for i, use_item in enumerate(selection)]) - assert_series_equal(result, expected) - - def test_mask_inplace(self): - s = Series(np.random.randn(5)) - cond = s > 0 - - rs = s.copy() - rs.mask(cond, inplace=True) - assert_series_equal(rs.dropna(), s[~cond]) - assert_series_equal(rs, s.mask(cond)) - - rs = s.copy() - rs.mask(cond, -s, inplace=True) - assert_series_equal(rs, s.mask(cond, -s)) - - def test_ix_setitem(self): - inds = self.series.index[[3, 4, 7]] - - result = self.series.copy() - result.loc[inds] = 5 - - expected = self.series.copy() - expected[[3, 4, 7]] = 5 - assert_series_equal(result, expected) - - result.iloc[5:10] = 10 - expected[5:10] = 10 - assert_series_equal(result, expected) - - # set slice with indices - d1, d2 = self.series.index[[5, 15]] - result.loc[d1:d2] = 6 - expected[5:16] = 6 # because it's inclusive - assert_series_equal(result, expected) - - # set index value - self.series.loc[d1] = 4 - self.series.loc[d2] = 6 - assert self.series[d1] == 4 - assert self.series[d2] == 6 - - def test_where_numeric_with_string(self): - # GH 9280 - s = pd.Series([1, 2, 3]) - w = s.where(s > 1, 'X') - - assert not is_integer(w[0]) - assert is_integer(w[1]) - assert is_integer(w[2]) - assert isinstance(w[0], str) - assert w.dtype == 'object' - - w = s.where(s > 1, ['X', 'Y', 'Z']) - assert not is_integer(w[0]) - assert is_integer(w[1]) - assert is_integer(w[2]) - assert isinstance(w[0], str) - assert w.dtype == 'object' - - w = s.where(s > 1, np.array(['X', 'Y', 'Z'])) - assert not is_integer(w[0]) - assert is_integer(w[1]) - assert is_integer(w[2]) - assert isinstance(w[0], str) - assert w.dtype == 'object' - - def test_setitem_boolean(self): - mask = self.series > self.series.median() 
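(For readers skimming this hunk: the removed tests above all pin down the same contract, namely that `Series.mask` is the mirror image of `Series.where`. A minimal sketch of that duality, using plain pandas and made-up data, not part of the diff:

```python
import pandas as pd

s = pd.Series([1.0, -2.0, 3.0, -4.0])
cond = s > 0

# mask(cond) blanks out values where cond is True;
# where(cond) blanks out values where cond is False.
assert s.mask(cond).equals(s.where(~cond))

# Supplying a replacement avoids the NaN holes (and, for integer
# Series, the upcast to float that NaN would force).
print(s.where(cond, 0.0))  # 1.0, 0.0, 3.0, 0.0
```
)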
- - # similar indexed series - result = self.series.copy() - result[mask] = self.series * 2 - expected = self.series * 2 - assert_series_equal(result[mask], expected[mask]) - - # needs alignment - result = self.series.copy() - result[mask] = (self.series * 2)[0:5] - expected = (self.series * 2)[0:5].reindex_like(self.series) - expected[-mask] = self.series[mask] - assert_series_equal(result[mask], expected[mask]) - - def test_ix_setitem_boolean(self): - mask = self.series > self.series.median() - - result = self.series.copy() - result.loc[mask] = 0 - expected = self.series - expected[mask] = 0 - assert_series_equal(result, expected) - - def test_ix_setitem_corner(self): - inds = list(self.series.index[[5, 8, 12]]) - self.series.loc[inds] = 5 - pytest.raises(Exception, self.series.loc.__setitem__, - inds + ['foo'], 5) - - def test_get_set_boolean_different_order(self): - ordered = self.series.sort_values() - - # setting - copy = self.series.copy() - copy[ordered > 0] = 0 - - expected = self.series.copy() - expected[expected > 0] = 0 - - assert_series_equal(copy, expected) - - # getting - sel = self.series[ordered > 0] - exp = self.series[self.series > 0] - assert_series_equal(sel, exp) - - def test_setitem_na(self): - # these induce dtype changes - expected = Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]) - s = Series([2, 3, 4, 5, 6, 7, 8, 9, 10]) - s[::2] = np.nan - assert_series_equal(s, expected) - - # gets coerced to float, right? - expected = Series([np.nan, 1, np.nan, 0]) - s = Series([True, True, False, False]) - s[::2] = np.nan - assert_series_equal(s, expected) - - expected = Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, - 9]) - s = Series(np.arange(10)) - s[:5] = np.nan - assert_series_equal(s, expected) - - def test_basic_indexing(self): - s = Series(np.random.randn(5), index=['a', 'b', 'a', 'a', 'b']) - - pytest.raises(IndexError, s.__getitem__, 5) - pytest.raises(IndexError, s.__setitem__, 5, 0) - - pytest.raises(KeyError, s.__getitem__, 'c') - - s = s.sort_index() - - pytest.raises(IndexError, s.__getitem__, 5) - pytest.raises(IndexError, s.__setitem__, 5, 0) - - def test_int_indexing(self): - s = Series(np.random.randn(6), index=[0, 0, 1, 1, 2, 2]) - - pytest.raises(KeyError, s.__getitem__, 5) - - pytest.raises(KeyError, s.__getitem__, 'c') - - # not monotonic - s = Series(np.random.randn(6), index=[2, 2, 0, 0, 1, 1]) - - pytest.raises(KeyError, s.__getitem__, 5) - - pytest.raises(KeyError, s.__getitem__, 'c') - - def test_datetime_indexing(self): - from pandas import date_range - - index = date_range('1/1/2000', '1/7/2000') - index = index.repeat(3) - - s = Series(len(index), index=index) - stamp = Timestamp('1/8/2000') - - pytest.raises(KeyError, s.__getitem__, stamp) - s[stamp] = 0 - assert s[stamp] == 0 - - # not monotonic - s = Series(len(index), index=index) - s = s[::-1] - - pytest.raises(KeyError, s.__getitem__, stamp) - s[stamp] = 0 - assert s[stamp] == 0 - - def test_timedelta_assignment(self): - # GH 8209 - s = Series([]) - s.loc['B'] = timedelta(1) - tm.assert_series_equal(s, Series(Timedelta('1 days'), index=['B'])) - - s = s.reindex(s.index.insert(0, 'A')) - tm.assert_series_equal(s, Series( - [np.nan, Timedelta('1 days')], index=['A', 'B'])) - - result = s.fillna(timedelta(1)) - expected = Series(Timedelta('1 days'), index=['A', 'B']) - tm.assert_series_equal(result, expected) - - s.loc['A'] = timedelta(1) - tm.assert_series_equal(s, expected) - - # GH 14155 - s = Series(10 * [np.timedelta64(10, 'm')]) - s.loc[[1, 2, 3]] = 
np.timedelta64(20, 'm') - expected = pd.Series(10 * [np.timedelta64(10, 'm')]) - expected.loc[[1, 2, 3]] = pd.Timedelta(np.timedelta64(20, 'm')) - tm.assert_series_equal(s, expected) - - def test_underlying_data_conversion(self): - - # GH 4080 - df = DataFrame({c: [1, 2, 3] for c in ['a', 'b', 'c']}) - df.set_index(['a', 'b', 'c'], inplace=True) - s = Series([1], index=[(2, 2, 2)]) - df['val'] = 0 - df - df['val'].update(s) - - expected = DataFrame( - dict(a=[1, 2, 3], b=[1, 2, 3], c=[1, 2, 3], val=[0, 1, 0])) - expected.set_index(['a', 'b', 'c'], inplace=True) - tm.assert_frame_equal(df, expected) - - # GH 3970 - # these are chained assignments as well - pd.set_option('chained_assignment', None) - df = DataFrame({"aa": range(5), "bb": [2.2] * 5}) - df["cc"] = 0.0 - - ck = [True] * len(df) - - df["bb"].iloc[0] = .13 - - # TODO: unused - df_tmp = df.iloc[ck] # noqa - - df["bb"].iloc[0] = .15 - assert df['bb'].iloc[0] == 0.15 - pd.set_option('chained_assignment', 'raise') - - # GH 3217 - df = DataFrame(dict(a=[1, 3], b=[np.nan, 2])) - df['c'] = np.nan - df['c'].update(pd.Series(['foo'], index=[0])) - - expected = DataFrame(dict(a=[1, 3], b=[np.nan, 2], c=['foo', np.nan])) - tm.assert_frame_equal(df, expected) - - def test_preserveRefs(self): - seq = self.ts[[5, 10, 15]] - seq[1] = np.NaN - assert not np.isnan(self.ts[10]) - - def test_drop(self): - - # unique - s = Series([1, 2], index=['one', 'two']) - expected = Series([1], index=['one']) - result = s.drop(['two']) - assert_series_equal(result, expected) - result = s.drop('two', axis='rows') - assert_series_equal(result, expected) - - # non-unique - # GH 5248 - s = Series([1, 1, 2], index=['one', 'two', 'one']) - expected = Series([1, 2], index=['one', 'one']) - result = s.drop(['two'], axis=0) - assert_series_equal(result, expected) - result = s.drop('two') - assert_series_equal(result, expected) - - expected = Series([1], index=['two']) - result = s.drop(['one']) - assert_series_equal(result, expected) - result = s.drop('one') - assert_series_equal(result, expected) - - # single string/tuple-like - s = Series(range(3), index=list('abc')) - pytest.raises(KeyError, s.drop, 'bc') - pytest.raises(KeyError, s.drop, ('a', )) - - # errors='ignore' - s = Series(range(3), index=list('abc')) - result = s.drop('bc', errors='ignore') - assert_series_equal(result, s) - result = s.drop(['a', 'd'], errors='ignore') - expected = s.iloc[1:] - assert_series_equal(result, expected) - - # bad axis - pytest.raises(ValueError, s.drop, 'one', axis='columns') - - # GH 8522 - s = Series([2, 3], index=[True, False]) - assert s.index.is_object() - result = s.drop(True) - expected = Series([3], index=[False]) - assert_series_equal(result, expected) - - # GH 16877 - s = Series([2, 3], index=[0, 1]) - with tm.assert_raises_regex(KeyError, 'not contained in axis'): - s.drop([False, True]) - - def test_align(self): - def _check_align(a, b, how='left', fill=None): - aa, ab = a.align(b, join=how, fill_value=fill) - - join_index = a.index.join(b.index, how=how) - if fill is not None: - diff_a = aa.index.difference(join_index) - diff_b = ab.index.difference(join_index) - if len(diff_a) > 0: - assert (aa.reindex(diff_a) == fill).all() - if len(diff_b) > 0: - assert (ab.reindex(diff_b) == fill).all() - - ea = a.reindex(join_index) - eb = b.reindex(join_index) - - if fill is not None: - ea = ea.fillna(fill) - eb = eb.fillna(fill) - - assert_series_equal(aa, ea) - assert_series_equal(ab, eb) - assert aa.name == 'ts' - assert ea.name == 'ts' - assert ab.name == 'ts' - assert 
eb.name == 'ts' - - for kind in JOIN_TYPES: - _check_align(self.ts[2:], self.ts[:-5], how=kind) - _check_align(self.ts[2:], self.ts[:-5], how=kind, fill=-1) - - # empty left - _check_align(self.ts[:0], self.ts[:-5], how=kind) - _check_align(self.ts[:0], self.ts[:-5], how=kind, fill=-1) - - # empty right - _check_align(self.ts[:-5], self.ts[:0], how=kind) - _check_align(self.ts[:-5], self.ts[:0], how=kind, fill=-1) - - # both empty - _check_align(self.ts[:0], self.ts[:0], how=kind) - _check_align(self.ts[:0], self.ts[:0], how=kind, fill=-1) - - def test_align_fill_method(self): - def _check_align(a, b, how='left', method='pad', limit=None): - aa, ab = a.align(b, join=how, method=method, limit=limit) - - join_index = a.index.join(b.index, how=how) - ea = a.reindex(join_index) - eb = b.reindex(join_index) - - ea = ea.fillna(method=method, limit=limit) - eb = eb.fillna(method=method, limit=limit) - - assert_series_equal(aa, ea) - assert_series_equal(ab, eb) - - for kind in JOIN_TYPES: - for meth in ['pad', 'bfill']: - _check_align(self.ts[2:], self.ts[:-5], how=kind, method=meth) - _check_align(self.ts[2:], self.ts[:-5], how=kind, method=meth, - limit=1) - - # empty left - _check_align(self.ts[:0], self.ts[:-5], how=kind, method=meth) - _check_align(self.ts[:0], self.ts[:-5], how=kind, method=meth, - limit=1) - - # empty right - _check_align(self.ts[:-5], self.ts[:0], how=kind, method=meth) - _check_align(self.ts[:-5], self.ts[:0], how=kind, method=meth, - limit=1) - - # both empty - _check_align(self.ts[:0], self.ts[:0], how=kind, method=meth) - _check_align(self.ts[:0], self.ts[:0], how=kind, method=meth, - limit=1) - - def test_align_nocopy(self): - b = self.ts[:5].copy() - - # do copy - a = self.ts.copy() - ra, _ = a.align(b, join='left') - ra[:5] = 5 - assert not (a[:5] == 5).any() - - # do not copy - a = self.ts.copy() - ra, _ = a.align(b, join='left', copy=False) - ra[:5] = 5 - assert (a[:5] == 5).all() - - # do copy - a = self.ts.copy() - b = self.ts[:5].copy() - _, rb = a.align(b, join='right') - rb[:3] = 5 - assert not (b[:3] == 5).any() - - # do not copy - a = self.ts.copy() - b = self.ts[:5].copy() - _, rb = a.align(b, join='right', copy=False) - rb[:2] = 5 - assert (b[:2] == 5).all() - - def test_align_same_index(self): - a, b = self.ts.align(self.ts, copy=False) - assert a.index is self.ts.index - assert b.index is self.ts.index - - a, b = self.ts.align(self.ts, copy=True) - assert a.index is not self.ts.index - assert b.index is not self.ts.index - - def test_align_multiindex(self): - # GH 10665 - - midx = pd.MultiIndex.from_product([range(2), range(3), range(2)], - names=('a', 'b', 'c')) - idx = pd.Index(range(2), name='b') - s1 = pd.Series(np.arange(12, dtype='int64'), index=midx) - s2 = pd.Series(np.arange(2, dtype='int64'), index=idx) - - # these must be the same results (but flipped) - res1l, res1r = s1.align(s2, join='left') - res2l, res2r = s2.align(s1, join='right') - - expl = s1 - tm.assert_series_equal(expl, res1l) - tm.assert_series_equal(expl, res2r) - expr = pd.Series([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) - tm.assert_series_equal(expr, res1r) - tm.assert_series_equal(expr, res2l) - - res1l, res1r = s1.align(s2, join='right') - res2l, res2r = s2.align(s1, join='left') - - exp_idx = pd.MultiIndex.from_product([range(2), range(2), range(2)], - names=('a', 'b', 'c')) - expl = pd.Series([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) - tm.assert_series_equal(expl, res1l) - tm.assert_series_equal(expl, res2r) - expr = pd.Series([0, 0, 1, 1] * 2, index=exp_idx) - 
tm.assert_series_equal(expr, res1r) - tm.assert_series_equal(expr, res2l) - - def test_reindex(self): - - identity = self.series.reindex(self.series.index) - - # __array_interface__ is not defined for older numpies - # and on some pythons - try: - assert np.may_share_memory(self.series.index, identity.index) - except AttributeError: - pass - - assert identity.index.is_(self.series.index) - assert identity.index.identical(self.series.index) - - subIndex = self.series.index[10:20] - subSeries = self.series.reindex(subIndex) - - for idx, val in compat.iteritems(subSeries): - assert val == self.series[idx] - - subIndex2 = self.ts.index[10:20] - subTS = self.ts.reindex(subIndex2) - - for idx, val in compat.iteritems(subTS): - assert val == self.ts[idx] - stuffSeries = self.ts.reindex(subIndex) - - assert np.isnan(stuffSeries).all() - - # This is extremely important for the Cython code to not screw up - nonContigIndex = self.ts.index[::2] - subNonContig = self.ts.reindex(nonContigIndex) - for idx, val in compat.iteritems(subNonContig): - assert val == self.ts[idx] - - # return a copy the same index here - result = self.ts.reindex() - assert not (result is self.ts) - - def test_reindex_nan(self): - ts = Series([2, 3, 5, 7], index=[1, 4, nan, 8]) - - i, j = [nan, 1, nan, 8, 4, nan], [2, 0, 2, 3, 1, 2] - assert_series_equal(ts.reindex(i), ts.iloc[j]) - - ts.index = ts.index.astype('object') - - # reindex coerces index.dtype to float, loc/iloc doesn't - assert_series_equal(ts.reindex(i), ts.iloc[j], check_index_type=False) - - def test_reindex_series_add_nat(self): - rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s') - series = Series(rng) - - result = series.reindex(lrange(15)) - assert np.issubdtype(result.dtype, np.dtype('M8[ns]')) - - mask = result.isna() - assert mask[-5:].all() - assert not mask[:-5].any() - - def test_reindex_with_datetimes(self): - rng = date_range('1/1/2000', periods=20) - ts = Series(np.random.randn(20), index=rng) - - result = ts.reindex(list(ts.index[5:10])) - expected = ts[5:10] - tm.assert_series_equal(result, expected) - - result = ts[list(ts.index[5:10])] - tm.assert_series_equal(result, expected) - - def test_reindex_corner(self): - # (don't forget to fix this) I think it's fixed - self.empty.reindex(self.ts.index, method='pad') # it works - - # corner case: pad empty series - reindexed = self.empty.reindex(self.ts.index, method='pad') - - # pass non-Index - reindexed = self.ts.reindex(list(self.ts.index)) - assert_series_equal(self.ts, reindexed) - - # bad fill method - ts = self.ts[::2] - pytest.raises(Exception, ts.reindex, self.ts.index, method='foo') - - def test_reindex_pad(self): - - s = Series(np.arange(10), dtype='int64') - s2 = s[::2] - - reindexed = s2.reindex(s.index, method='pad') - reindexed2 = s2.reindex(s.index, method='ffill') - assert_series_equal(reindexed, reindexed2) - - expected = Series([0, 0, 2, 2, 4, 4, 6, 6, 8, 8], index=np.arange(10)) - assert_series_equal(reindexed, expected) - - # GH4604 - s = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e']) - new_index = ['a', 'g', 'c', 'f'] - expected = Series([1, 1, 3, 3], index=new_index) - - # this changes dtype because the ffill happens after - result = s.reindex(new_index).ffill() - assert_series_equal(result, expected.astype('float64')) - - result = s.reindex(new_index).ffill(downcast='infer') - assert_series_equal(result, expected) - - expected = Series([1, 5, 3, 5], index=new_index) - result = s.reindex(new_index, method='ffill') - assert_series_equal(result, expected) - - # 
inference of new dtype - s = Series([True, False, False, True], index=list('abcd')) - new_index = 'agc' - result = s.reindex(list(new_index)).ffill() - expected = Series([True, True, False], index=list(new_index)) - assert_series_equal(result, expected) - - # GH4618 shifted series downcasting - s = Series(False, index=lrange(0, 5)) - result = s.shift(1).fillna(method='bfill') - expected = Series(False, index=lrange(0, 5)) - assert_series_equal(result, expected) - - def test_reindex_nearest(self): - s = Series(np.arange(10, dtype='int64')) - target = [0.1, 0.9, 1.5, 2.0] - actual = s.reindex(target, method='nearest') - expected = Series(np.around(target).astype('int64'), target) - assert_series_equal(expected, actual) - - actual = s.reindex_like(actual, method='nearest') - assert_series_equal(expected, actual) - - actual = s.reindex_like(actual, method='nearest', tolerance=1) - assert_series_equal(expected, actual) - actual = s.reindex_like(actual, method='nearest', - tolerance=[1, 2, 3, 4]) - assert_series_equal(expected, actual) - - actual = s.reindex(target, method='nearest', tolerance=0.2) - expected = Series([0, 1, np.nan, 2], target) - assert_series_equal(expected, actual) - - actual = s.reindex(target, method='nearest', - tolerance=[0.3, 0.01, 0.4, 3]) - expected = Series([0, np.nan, np.nan, 2], target) - assert_series_equal(expected, actual) - - def test_reindex_backfill(self): - pass - - def test_reindex_int(self): - ts = self.ts[::2] - int_ts = Series(np.zeros(len(ts), dtype=int), index=ts.index) - - # this should work fine - reindexed_int = int_ts.reindex(self.ts.index) - - # if NaNs introduced - assert reindexed_int.dtype == np.float_ - - # NO NaNs introduced - reindexed_int = int_ts.reindex(int_ts.index[::2]) - assert reindexed_int.dtype == np.int_ - - def test_reindex_bool(self): - - # A series other than float, int, string, or object - ts = self.ts[::2] - bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index) - - # this should work fine - reindexed_bool = bool_ts.reindex(self.ts.index) - - # if NaNs introduced - assert reindexed_bool.dtype == np.object_ - - # NO NaNs introduced - reindexed_bool = bool_ts.reindex(bool_ts.index[::2]) - assert reindexed_bool.dtype == np.bool_ - - def test_reindex_bool_pad(self): - # fail - ts = self.ts[5:] - bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index) - filled_bool = bool_ts.reindex(self.ts.index, method='pad') - assert isna(filled_bool[:5]).all() - - def test_reindex_like(self): - other = self.ts[::2] - assert_series_equal(self.ts.reindex(other.index), - self.ts.reindex_like(other)) - - # GH 7179 - day1 = datetime(2013, 3, 5) - day2 = datetime(2013, 5, 5) - day3 = datetime(2014, 3, 5) - - series1 = Series([5, None, None], [day1, day2, day3]) - series2 = Series([None, None], [day1, day3]) - - result = series1.reindex_like(series2, method='pad') - expected = Series([5, np.nan], index=[day1, day3]) - assert_series_equal(result, expected) - - def test_reindex_fill_value(self): - # ----------------------------------------------------------- - # floats - floats = Series([1., 2., 3.]) - result = floats.reindex([1, 2, 3]) - expected = Series([2., 3., np.nan], index=[1, 2, 3]) - assert_series_equal(result, expected) - - result = floats.reindex([1, 2, 3], fill_value=0) - expected = Series([2., 3., 0], index=[1, 2, 3]) - assert_series_equal(result, expected) - - # ----------------------------------------------------------- - # ints - ints = Series([1, 2, 3]) - - result = ints.reindex([1, 2, 3]) - expected = Series([2., 3., 
np.nan], index=[1, 2, 3]) - assert_series_equal(result, expected) - - # don't upcast - result = ints.reindex([1, 2, 3], fill_value=0) - expected = Series([2, 3, 0], index=[1, 2, 3]) - assert issubclass(result.dtype.type, np.integer) - assert_series_equal(result, expected) - - # ----------------------------------------------------------- - # objects - objects = Series([1, 2, 3], dtype=object) - - result = objects.reindex([1, 2, 3]) - expected = Series([2, 3, np.nan], index=[1, 2, 3], dtype=object) - assert_series_equal(result, expected) - - result = objects.reindex([1, 2, 3], fill_value='foo') - expected = Series([2, 3, 'foo'], index=[1, 2, 3], dtype=object) - assert_series_equal(result, expected) - - # ------------------------------------------------------------ - # bools - bools = Series([True, False, True]) - - result = bools.reindex([1, 2, 3]) - expected = Series([False, True, np.nan], index=[1, 2, 3], dtype=object) - assert_series_equal(result, expected) - - result = bools.reindex([1, 2, 3], fill_value=False) - expected = Series([False, True, False], index=[1, 2, 3]) - assert_series_equal(result, expected) - - def test_reindex_categorical(self): - - index = date_range('20000101', periods=3) - - # reindexing to an invalid Categorical - s = Series(['a', 'b', 'c'], dtype='category') - result = s.reindex(index) - expected = Series(Categorical(values=[np.nan, np.nan, np.nan], - categories=['a', 'b', 'c'])) - expected.index = index - tm.assert_series_equal(result, expected) - - # partial reindexing - expected = Series(Categorical(values=['b', 'c'], categories=['a', 'b', - 'c'])) - expected.index = [1, 2] - result = s.reindex([1, 2]) - tm.assert_series_equal(result, expected) - - expected = Series(Categorical( - values=['c', np.nan], categories=['a', 'b', 'c'])) - expected.index = [2, 3] - result = s.reindex([2, 3]) - tm.assert_series_equal(result, expected) - - def test_rename(self): - - # GH 17407 - s = Series(range(1, 6), index=pd.Index(range(2, 7), name='IntIndex')) - result = s.rename(str) - expected = s.rename(lambda i: str(i)) - assert_series_equal(result, expected) - - assert result.name == expected.name - - def test_select(self): - - # deprecated: gh-12410 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - n = len(self.ts) - result = self.ts.select(lambda x: x >= self.ts.index[n // 2]) - expected = self.ts.reindex(self.ts.index[n // 2:]) - assert_series_equal(result, expected) - - result = self.ts.select(lambda x: x.weekday() == 2) - expected = self.ts[self.ts.index.weekday == 2] - assert_series_equal(result, expected) - - def test_cast_on_putmask(self): - - # GH 2746 - - # need to upcast - s = Series([1, 2], index=[1, 2], dtype='int64') - s[[True, False]] = Series([0], index=[1], dtype='int64') - expected = Series([0, 2], index=[1, 2], dtype='int64') - - assert_series_equal(s, expected) - - def test_type_promote_putmask(self): - - # GH8387: test that changing types does not break alignment - ts = Series(np.random.randn(100), index=np.arange(100, 0, -1)).round(5) - left, mask = ts.copy(), ts > 0 - right = ts[mask].copy().map(str) - left[mask] = right - assert_series_equal(left, ts.map(lambda t: str(t) if t > 0 else t)) - - s = Series([0, 1, 2, 0]) - mask = s > 0 - s2 = s[mask].map(str) - s[mask] = s2 - assert_series_equal(s, Series([0, '1', '2', 0])) - - s = Series([0, 'foo', 'bar', 0]) - mask = Series([False, True, True, False]) - s2 = s[mask] - s[mask] = s2 - assert_series_equal(s, Series([0, 'foo', 'bar', 0])) - - def test_head_tail(self): - 
assert_series_equal(self.series.head(), self.series[:5]) - assert_series_equal(self.series.head(0), self.series[0:0]) - assert_series_equal(self.series.tail(), self.series[-5:]) - assert_series_equal(self.series.tail(0), self.series[0:0]) - - def test_multilevel_preserve_name(self): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - s = Series(np.random.randn(len(index)), index=index, name='sth') - - result = s['foo'] - result2 = s.loc['foo'] - assert result.name == s.name - assert result2.name == s.name - - def test_setitem_scalar_into_readonly_backing_data(self): - # GH14359: test that you cannot mutate a read only buffer - - array = np.zeros(5) - array.flags.writeable = False # make the array immutable - series = Series(array) - - for n in range(len(series)): - with pytest.raises(ValueError): - series[n] = 1 - - assert array[n] == 0 - - def test_setitem_slice_into_readonly_backing_data(self): - # GH14359: test that you cannot mutate a read only buffer - - array = np.zeros(5) - array.flags.writeable = False # make the array immutable - series = Series(array) - - with pytest.raises(ValueError): - series[1:3] = 1 - - assert not array.any() - - def test_categorial_assigning_ops(self): - orig = Series(Categorical(["b", "b"], categories=["a", "b"])) - s = orig.copy() - s[:] = "a" - exp = Series(Categorical(["a", "a"], categories=["a", "b"])) - tm.assert_series_equal(s, exp) - - s = orig.copy() - s[1] = "a" - exp = Series(Categorical(["b", "a"], categories=["a", "b"])) - tm.assert_series_equal(s, exp) - - s = orig.copy() - s[s.index > 0] = "a" - exp = Series(Categorical(["b", "a"], categories=["a", "b"])) - tm.assert_series_equal(s, exp) - - s = orig.copy() - s[[False, True]] = "a" - exp = Series(Categorical(["b", "a"], categories=["a", "b"])) - tm.assert_series_equal(s, exp) - - s = orig.copy() - s.index = ["x", "y"] - s["y"] = "a" - exp = Series(Categorical(["b", "a"], categories=["a", "b"]), - index=["x", "y"]) - tm.assert_series_equal(s, exp) - - # ensure that one can set something to np.nan - s = Series(Categorical([1, 2, 3])) - exp = Series(Categorical([1, np.nan, 3], categories=[1, 2, 3])) - s[1] = np.nan - tm.assert_series_equal(s, exp) - - -class TestTimeSeriesDuplicates(object): - - def setup_method(self, method): - dates = [datetime(2000, 1, 2), datetime(2000, 1, 2), - datetime(2000, 1, 2), datetime(2000, 1, 3), - datetime(2000, 1, 3), datetime(2000, 1, 3), - datetime(2000, 1, 4), datetime(2000, 1, 4), - datetime(2000, 1, 4), datetime(2000, 1, 5)] - - self.dups = Series(np.random.randn(len(dates)), index=dates) - - def test_constructor(self): - assert isinstance(self.dups, Series) - assert isinstance(self.dups.index, DatetimeIndex) - - def test_is_unique_monotonic(self): - assert not self.dups.index.is_unique - - def test_index_unique(self): - uniques = self.dups.index.unique() - expected = DatetimeIndex([datetime(2000, 1, 2), datetime(2000, 1, 3), - datetime(2000, 1, 4), datetime(2000, 1, 5)]) - assert uniques.dtype == 'M8[ns]' # sanity - tm.assert_index_equal(uniques, expected) - assert self.dups.index.nunique() == 4 - - # #2563 - assert isinstance(uniques, DatetimeIndex) - - dups_local = self.dups.index.tz_localize('US/Eastern') - dups_local.name = 'foo' - result = dups_local.unique() - expected = DatetimeIndex(expected, name='foo') - expected = expected.tz_localize('US/Eastern') - assert result.tz is not None - assert result.name == 
'foo' - tm.assert_index_equal(result, expected) - - # NaT, note this is excluded - arr = [1370745748 + t for t in range(20)] + [tslib.iNaT] - idx = DatetimeIndex(arr * 3) - tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) - assert idx.nunique() == 20 - assert idx.nunique(dropna=False) == 21 - - arr = [Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) - for t in range(20)] + [NaT] - idx = DatetimeIndex(arr * 3) - tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) - assert idx.nunique() == 20 - assert idx.nunique(dropna=False) == 21 - - def test_index_dupes_contains(self): - d = datetime(2011, 12, 5, 20, 30) - ix = DatetimeIndex([d, d]) - assert d in ix - - def test_duplicate_dates_indexing(self): - ts = self.dups - - uniques = ts.index.unique() - for date in uniques: - result = ts[date] - - mask = ts.index == date - total = (ts.index == date).sum() - expected = ts[mask] - if total > 1: - assert_series_equal(result, expected) - else: - assert_almost_equal(result, expected[0]) - - cp = ts.copy() - cp[date] = 0 - expected = Series(np.where(mask, 0, ts), index=ts.index) - assert_series_equal(cp, expected) - - pytest.raises(KeyError, ts.__getitem__, datetime(2000, 1, 6)) - - # new index - ts[datetime(2000, 1, 6)] = 0 - assert ts[datetime(2000, 1, 6)] == 0 - - def test_range_slice(self): - idx = DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/3/2000', - '1/4/2000']) - - ts = Series(np.random.randn(len(idx)), index=idx) - - result = ts['1/2/2000':] - expected = ts[1:] - assert_series_equal(result, expected) - - result = ts['1/2/2000':'1/3/2000'] - expected = ts[1:4] - assert_series_equal(result, expected) - - def test_groupby_average_dup_values(self): - result = self.dups.groupby(level=0).mean() - expected = self.dups.groupby(self.dups.index).mean() - assert_series_equal(result, expected) - - def test_indexing_over_size_cutoff(self): - import datetime - # #1821 - - old_cutoff = _index._SIZE_CUTOFF - try: - _index._SIZE_CUTOFF = 1000 - - # create large list of non periodic datetime - dates = [] - sec = datetime.timedelta(seconds=1) - half_sec = datetime.timedelta(microseconds=500000) - d = datetime.datetime(2011, 12, 5, 20, 30) - n = 1100 - for i in range(n): - dates.append(d) - dates.append(d + sec) - dates.append(d + sec + half_sec) - dates.append(d + sec + sec + half_sec) - d += 3 * sec - - # duplicate some values in the list - duplicate_positions = np.random.randint(0, len(dates) - 1, 20) - for p in duplicate_positions: - dates[p + 1] = dates[p] - - df = DataFrame(np.random.randn(len(dates), 4), - index=dates, - columns=list('ABCD')) - - pos = n * 3 - timestamp = df.index[pos] - assert timestamp in df.index - - # it works! - df.loc[timestamp] - assert len(df.loc[[timestamp]]) > 0 - finally: - _index._SIZE_CUTOFF = old_cutoff - - def test_indexing_unordered(self): - # GH 2437 - rng = date_range(start='2011-01-01', end='2011-01-15') - ts = Series(np.random.rand(len(rng)), index=rng) - ts2 = pd.concat([ts[0:4], ts[-4:], ts[4:-4]]) - - for t in ts.index: - # TODO: unused? 
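(The `TestTimeSeriesDuplicates` cases above rely on a getitem convention that is easy to miss: label lookup on a duplicated index returns a sub-Series, while a unique label returns a scalar. A small illustration of that behavior — generic pandas, not tied to this diff:

```python
from datetime import datetime

import numpy as np
import pandas as pd

dates = [datetime(2000, 1, 2)] * 2 + [datetime(2000, 1, 3)]
dups = pd.Series(np.arange(3.0), index=dates)

print(dups[datetime(2000, 1, 2)])  # two matches -> a sub-Series
print(dups[datetime(2000, 1, 3)])  # one match   -> the scalar 2.0
print(dups.index.unique())         # deduplicated DatetimeIndex
```
)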
- s = str(t) # noqa - - expected = ts[t] - result = ts2[t] - assert expected == result - - # GH 3448 (ranges) - def compare(slobj): - result = ts2[slobj].copy() - result = result.sort_index() - expected = ts[slobj] - assert_series_equal(result, expected) - - compare(slice('2011-01-01', '2011-01-15')) - compare(slice('2010-12-30', '2011-01-15')) - compare(slice('2011-01-01', '2011-01-16')) - - # partial ranges - compare(slice('2011-01-01', '2011-01-6')) - compare(slice('2011-01-06', '2011-01-8')) - compare(slice('2011-01-06', '2011-01-12')) - - # single values - result = ts2['2011'].sort_index() - expected = ts['2011'] - assert_series_equal(result, expected) - - # diff freq - rng = date_range(datetime(2005, 1, 1), periods=20, freq='M') - ts = Series(np.arange(len(rng)), index=rng) - ts = ts.take(np.random.permutation(20)) - - result = ts['2005'] - for t in result.index: - assert t.year == 2005 - - def test_indexing(self): - - idx = date_range("2001-1-1", periods=20, freq='M') - ts = Series(np.random.rand(len(idx)), index=idx) - - # getting - - # GH 3070, make sure semantics work on Series/Frame - expected = ts['2001'] - expected.name = 'A' - - df = DataFrame(dict(A=ts)) - result = df['2001']['A'] - assert_series_equal(expected, result) - - # setting - ts['2001'] = 1 - expected = ts['2001'] - expected.name = 'A' - - df.loc['2001', 'A'] = 1 - - result = df['2001']['A'] - assert_series_equal(expected, result) - - # GH3546 (not including times on the last day) - idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:00', - freq='H') - ts = Series(lrange(len(idx)), index=idx) - expected = ts['2013-05'] - assert_series_equal(expected, ts) - - idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:59', - freq='S') - ts = Series(lrange(len(idx)), index=idx) - expected = ts['2013-05'] - assert_series_equal(expected, ts) - - idx = [Timestamp('2013-05-31 00:00'), - Timestamp(datetime(2013, 5, 31, 23, 59, 59, 999999))] - ts = Series(lrange(len(idx)), index=idx) - expected = ts['2013'] - assert_series_equal(expected, ts) - - # GH14826, indexing with a seconds resolution string / datetime object - df = DataFrame(np.random.rand(5, 5), - columns=['open', 'high', 'low', 'close', 'volume'], - index=date_range('2012-01-02 18:01:00', - periods=5, tz='US/Central', freq='s')) - expected = df.loc[[df.index[2]]] - - # this is a single date, so will raise - pytest.raises(KeyError, df.__getitem__, '2012-01-02 18:01:02', ) - pytest.raises(KeyError, df.__getitem__, df.index[2], ) - - -class TestDatetimeIndexing(object): - """ - Also test support for datetime64[ns] in Series / DataFrame - """ - - def setup_method(self, method): - dti = DatetimeIndex(start=datetime(2005, 1, 1), - end=datetime(2005, 1, 10), freq='Min') - self.series = Series(np.random.rand(len(dti)), dti) - - def test_fancy_getitem(self): - dti = DatetimeIndex(freq='WOM-1FRI', start=datetime(2005, 1, 1), - end=datetime(2010, 1, 1)) - - s = Series(np.arange(len(dti)), index=dti) - - assert s[48] == 48 - assert s['1/2/2009'] == 48 - assert s['2009-1-2'] == 48 - assert s[datetime(2009, 1, 2)] == 48 - assert s[Timestamp(datetime(2009, 1, 2))] == 48 - pytest.raises(KeyError, s.__getitem__, '2009-1-3') - - assert_series_equal(s['3/6/2009':'2009-06-05'], - s[datetime(2009, 3, 6):datetime(2009, 6, 5)]) - - def test_fancy_setitem(self): - dti = DatetimeIndex(freq='WOM-1FRI', start=datetime(2005, 1, 1), - end=datetime(2010, 1, 1)) - - s = Series(np.arange(len(dti)), index=dti) - s[48] = -1 - assert s[48] == -1 - s['1/2/2009'] = -2 - assert s[48] == 
-2 - s['1/2/2009':'2009-06-05'] = -3 - assert (s[48:54] == -3).all() - - def test_dti_snap(self): - dti = DatetimeIndex(['1/1/2002', '1/2/2002', '1/3/2002', '1/4/2002', - '1/5/2002', '1/6/2002', '1/7/2002'], freq='D') - - res = dti.snap(freq='W-MON') - exp = date_range('12/31/2001', '1/7/2002', freq='w-mon') - exp = exp.repeat([3, 4]) - assert (res == exp).all() - - res = dti.snap(freq='B') - - exp = date_range('1/1/2002', '1/7/2002', freq='b') - exp = exp.repeat([1, 1, 1, 2, 2]) - assert (res == exp).all() - - def test_dti_reset_index_round_trip(self): - dti = DatetimeIndex(start='1/1/2001', end='6/1/2001', freq='D') - d1 = DataFrame({'v': np.random.rand(len(dti))}, index=dti) - d2 = d1.reset_index() - assert d2.dtypes[0] == np.dtype('M8[ns]') - d3 = d2.set_index('index') - assert_frame_equal(d1, d3, check_names=False) - - # #2329 - stamp = datetime(2012, 11, 22) - df = DataFrame([[stamp, 12.1]], columns=['Date', 'Value']) - df = df.set_index('Date') - - assert df.index[0] == stamp - assert df.reset_index()['Date'][0] == stamp - - def test_series_set_value(self): - # #1561 - - dates = [datetime(2001, 1, 1), datetime(2001, 1, 2)] - index = DatetimeIndex(dates) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - s = Series().set_value(dates[0], 1.) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - s2 = s.set_value(dates[1], np.nan) - - exp = Series([1., np.nan], index=index) - - assert_series_equal(s2, exp) - - # s = Series(index[:1], index[:1]) - # s2 = s.set_value(dates[1], index[1]) - # assert s2.values.dtype == 'M8[ns]' - - @pytest.mark.slow - def test_slice_locs_indexerror(self): - times = [datetime(2000, 1, 1) + timedelta(minutes=i * 10) - for i in range(100000)] - s = Series(lrange(100000), times) - s.loc[datetime(1900, 1, 1):datetime(2100, 1, 1)] - - def test_slicing_datetimes(self): - - # GH 7523 - - # unique - df = DataFrame(np.arange(4., dtype='float64'), - index=[datetime(2001, 1, i, 10, 00) - for i in [1, 2, 3, 4]]) - result = df.loc[datetime(2001, 1, 1, 10):] - assert_frame_equal(result, df) - result = df.loc[:datetime(2001, 1, 4, 10)] - assert_frame_equal(result, df) - result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] - assert_frame_equal(result, df) - - result = df.loc[datetime(2001, 1, 1, 11):] - expected = df.iloc[1:] - assert_frame_equal(result, expected) - result = df.loc['20010101 11':] - assert_frame_equal(result, expected) - - # duplicates - df = pd.DataFrame(np.arange(5., dtype='float64'), - index=[datetime(2001, 1, i, 10, 00) - for i in [1, 2, 2, 3, 4]]) - - result = df.loc[datetime(2001, 1, 1, 10):] - assert_frame_equal(result, df) - result = df.loc[:datetime(2001, 1, 4, 10)] - assert_frame_equal(result, df) - result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] - assert_frame_equal(result, df) - - result = df.loc[datetime(2001, 1, 1, 11):] - expected = df.iloc[1:] - assert_frame_equal(result, expected) - result = df.loc['20010101 11':] - assert_frame_equal(result, expected) - - def test_frame_datetime64_duplicated(self): - dates = date_range('2010-07-01', end='2010-08-05') - - tst = DataFrame({'symbol': 'AAA', 'date': dates}) - result = tst.duplicated(['date', 'symbol']) - assert (-result).all() - - tst = DataFrame({'date': dates}) - result = tst.duplicated() - assert (-result).all() - - -class TestNatIndexing(object): - - def setup_method(self, method): - self.series = Series(date_range('1/1/2000', periods=10)) - - # 
--------------------------------------------------------------------- - # NaT support - - def test_set_none_nan(self): - self.series[3] = None - assert self.series[3] is NaT - - self.series[3:5] = None - assert self.series[4] is NaT - - self.series[5] = np.nan - assert self.series[5] is NaT - - self.series[5:7] = np.nan - assert self.series[6] is NaT - - def test_nat_operations(self): - # GH 8617 - s = Series([0, pd.NaT], dtype='m8[ns]') - exp = s[0] - assert s.median() == exp - assert s.min() == exp - assert s.max() == exp - - def test_round_nat(self): - # GH14940 - s = Series([pd.NaT]) - expected = Series(pd.NaT) - for method in ["round", "floor", "ceil"]: - round_method = getattr(s.dt, method) - for freq in ["s", "5s", "min", "5min", "h", "5h"]: - assert_series_equal(round_method(freq), expected) diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index 6220ce8ff76692..d15325ca8ef0e7 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -376,3 +376,96 @@ def test_rank_modify_inplace(self): s.rank() result = s assert_series_equal(result, expected) + + +# GH15630, pct should be on 100% basis when method='dense' + +@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) +@pytest.mark.parametrize('ser, exp', [ + ([1], [1.]), + ([1, 2], [1. / 2, 2. / 2]), + ([2, 2], [1., 1.]), + ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), + ([1, 2, 2], [1. / 2, 2. / 2, 2. / 2]), + ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), + ([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]), + ([1, 1, 3, 3, 5, 5], [1. / 3, 1. / 3, 2. / 3, 2. / 3, 3. / 3, 3. / 3]), + ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]) +def test_rank_dense_pct(dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method='dense', pct=True) + expected = Series(exp).astype(result.dtype) + assert_series_equal(result, expected) + + +@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) +@pytest.mark.parametrize('ser, exp', [ + ([1], [1.]), + ([1, 2], [1. / 2, 2. / 2]), + ([2, 2], [1. / 2, 1. / 2]), + ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), + ([1, 2, 2], [1. / 3, 2. / 3, 2. / 3]), + ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), + ([1, 1, 5, 5, 3], [1. / 5, 1. / 5, 4. / 5, 4. / 5, 3. / 5]), + ([1, 1, 3, 3, 5, 5], [1. / 6, 1. / 6, 3. / 6, 3. / 6, 5. / 6, 5. / 6]), + ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]) +def test_rank_min_pct(dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method='min', pct=True) + expected = Series(exp).astype(result.dtype) + assert_series_equal(result, expected) + + +@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) +@pytest.mark.parametrize('ser, exp', [ + ([1], [1.]), + ([1, 2], [1. / 2, 2. / 2]), + ([2, 2], [1., 1.]), + ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), + ([1, 2, 2], [1. / 3, 3. / 3, 3. / 3]), + ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), + ([1, 1, 5, 5, 3], [2. / 5, 2. / 5, 5. / 5, 5. / 5, 3. / 5]), + ([1, 1, 3, 3, 5, 5], [2. / 6, 2. / 6, 4. / 6, 4. / 6, 6. / 6, 6. / 6]), + ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]) +def test_rank_max_pct(dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method='max', pct=True) + expected = Series(exp).astype(result.dtype) + assert_series_equal(result, expected) + + +@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) +@pytest.mark.parametrize('ser, exp', [ + ([1], [1.]), + ([1, 2], [1. / 2, 2. / 2]), + ([2, 2], [1.5 / 2, 1.5 / 2]), + ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), + ([1, 2, 2], [1. 
/ 3, 2.5 / 3, 2.5 / 3]), + ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), + ([1, 1, 5, 5, 3], [1.5 / 5, 1.5 / 5, 4.5 / 5, 4.5 / 5, 3. / 5]), + ([1, 1, 3, 3, 5, 5], + [1.5 / 6, 1.5 / 6, 3.5 / 6, 3.5 / 6, 5.5 / 6, 5.5 / 6]), + ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]) +def test_rank_average_pct(dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method='average', pct=True) + expected = Series(exp).astype(result.dtype) + assert_series_equal(result, expected) + + +@pytest.mark.parametrize('dtype', ['f8', 'i8']) +@pytest.mark.parametrize('ser, exp', [ + ([1], [1.]), + ([1, 2], [1. / 2, 2. / 2]), + ([2, 2], [1. / 2, 2. / 2.]), + ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), + ([1, 2, 2], [1. / 3, 2. / 3, 3. / 3]), + ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), + ([1, 1, 5, 5, 3], [1. / 5, 2. / 5, 4. / 5, 5. / 5, 3. / 5]), + ([1, 1, 3, 3, 5, 5], [1. / 6, 2. / 6, 3. / 6, 4. / 6, 5. / 6, 6. / 6]), + ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]) +def test_rank_first_pct(dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method='first', pct=True) + expected = Series(exp).astype(result.dtype) + assert_series_equal(result, expected) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index ee0d63aff73672..1062de3119efc0 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -139,6 +139,18 @@ def test_constructor(self): repr(self.frame) + def test_constructor_dict_order(self): + # GH19018 + # initialization ordering: by insertion order if python>= 3.6, else + # order by value + d = {'b': [2, 3], 'a': [0, 1]} + frame = SparseDataFrame(data=d) + if compat.PY36: + expected = SparseDataFrame(data=d, columns=list('ba')) + else: + expected = SparseDataFrame(data=d, columns=list('ab')) + tm.assert_sp_frame_equal(frame, expected) + def test_constructor_ndarray(self): # no index or columns sp = SparseDataFrame(self.frame.values) diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 3f5d5a59cc5402..eb63c87820070e 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -14,7 +14,7 @@ from pandas.tseries.offsets import BDay import pandas.util.testing as tm import pandas.util._test_decorators as td -from pandas.compat import range +from pandas.compat import range, PY36 from pandas.core.reshape.util import cartesian_product import pandas.core.sparse.frame as spf @@ -114,6 +114,18 @@ def test_constructor_dict_input(self): result = SparseSeries(constructor_dict) tm.assert_sp_series_equal(result, expected) + def test_constructor_dict_order(self): + # GH19018 + # initialization ordering: by insertion order if python>= 3.6, else + # order by value + d = {'b': 1, 'a': 0, 'c': 2} + result = SparseSeries(d) + if PY36: + expected = SparseSeries([1, 0, 2], index=list('bac')) + else: + expected = SparseSeries([0, 1, 2], index=list('abc')) + tm.assert_sp_series_equal(result, expected) + def test_constructor_dtype(self): arr = SparseSeries([np.nan, 1, 2, np.nan]) assert arr.dtype == np.float64 diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 4b5ad336139b02..9f7b06ed2d61c5 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -10,7 +10,7 @@ import pandas as pd import pandas.compat as compat from pandas.core.dtypes.common import ( - is_object_dtype, is_datetimetz, + is_object_dtype, is_datetimetz, is_datetime64_dtype, needs_i8_conversion) 
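(Stepping back to the `test_rank_*_pct` parametrizations added in `test_rank.py` above: they lock in the GH15630 behavior that with `method='dense'`, `pct=True` divides ranks by the number of distinct values, so the top group always lands on 100%, while the other methods keep dividing by the number of rows. A worked example mirroring one of the parametrized cases, for illustration only:

```python
import pandas as pd

s = pd.Series([1, 1, 5, 5, 3])

# dense ranks are 1, 1, 3, 3, 2 over 3 distinct values
print(s.rank(method='dense', pct=True))  # 1/3, 1/3, 3/3, 3/3, 2/3

# min ranks are 1, 1, 4, 4, 3 over 5 rows
print(s.rank(method='min', pct=True))    # 0.2, 0.2, 0.8, 0.8, 0.6
```
)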
import pandas.util.testing as tm from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, @@ -296,14 +296,21 @@ def test_none_comparison(self): # result = None != o # noqa # assert result.iat[0] # assert result.iat[1] + if (is_datetime64_dtype(o) or is_datetimetz(o)): + # Following DatetimeIndex (and Timestamp) convention, + # inequality comparisons with Series[datetime64] raise + with pytest.raises(TypeError): + None > o + with pytest.raises(TypeError): + o > None + else: + result = None > o + assert not result.iat[0] + assert not result.iat[1] - result = None > o - assert not result.iat[0] - assert not result.iat[1] - - result = o < None - assert not result.iat[0] - assert not result.iat[1] + result = o < None + assert not result.iat[0] + assert not result.iat[1] def test_ndarray_compat_properties(self): @@ -1217,10 +1224,11 @@ def test_values_consistent(array, expected_type, dtype): (pd.DatetimeIndex(['2017-01-01T00:00:00'], tz="US/Eastern"), np.array(['2017-01-01T05:00:00'], dtype='M8[ns]')), (pd.TimedeltaIndex([10**10]), np.array([10**10], dtype='m8[ns]')), - pytest.mark.xfail(reason='PeriodArray not implemented')(( + pytest.param( pd.PeriodIndex(['2017', '2018'], freq='D'), np.array([17167, 17532]), - )), + marks=pytest.mark.xfail(reason="PeriodArray Not implemented") + ), ]) def test_ndarray_values(array, expected): l_values = pd.Series(array)._ndarray_values diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 502f0c3bced611..3e34b48fb67951 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -3,6 +3,7 @@ import pytest import numpy as np +from pandas import Index from pandas._libs import lib, writers as libwriters import pandas.util.testing as tm @@ -198,3 +199,8 @@ def test_get_reverse_indexer(self): result = lib.get_reverse_indexer(indexer, 5) expected = np.array([4, 2, 3, 6, 7], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) + + +def test_cache_readonly_preserve_docstrings(): + # GH18197 + assert Index.hasnans.__doc__ is not None diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 1955fc301be9b0..301a7fc437fcfc 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -2368,14 +2368,16 @@ def test_update_from_dict(self): pan.update(other) expected = Panel( - {'two': DataFrame([[3.6, 2., 3], - [1.5, np.nan, 7], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.]]), - 'one': DataFrame([[1.5, np.nan, 3.], + {'one': DataFrame([[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.], - [1.5, np.nan, 3.]])}) + [1.5, np.nan, 3.]]), + 'two': DataFrame([[3.6, 2., 3], + [1.5, np.nan, 7], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]) + } + ) assert_panel_equal(pan, expected) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 178c5ff655b040..a878d6ed7b0524 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -530,6 +530,27 @@ def test_replace_compiled_regex(self): exp = Series(['foObaD__baRbaD', NA]) tm.assert_series_equal(result, exp) + def test_replace_literal(self): + # GH16808 literal replace (regex=False vs regex=True) + values = Series(['f.o', 'foo', NA]) + exp = Series(['bao', 'bao', NA]) + result = values.str.replace('f.', 'ba') + tm.assert_series_equal(result, exp) + + exp = Series(['bao', 'foo', NA]) + result = values.str.replace('f.', 'ba', regex=False) + tm.assert_series_equal(result, exp) + + # Cannot do a literal replace if given a callable repl or compiled + # pattern + callable_repl = lambda m: m.group(0).swapcase() + compiled_pat = 
re.compile('[a-z][A-Z]{2}') + + pytest.raises(ValueError, values.str.replace, 'abc', callable_repl, + regex=False) + pytest.raises(ValueError, values.str.replace, compiled_pat, '', + regex=False) + def test_repeat(self): values = Series(['a', 'b', NA, 'c', NA, 'd']) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index b2745ab5eec77a..8ad73538fbec11 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -89,6 +89,17 @@ def _skip_if_mpl_1_5(): mod.use("Agg", warn=False) +def _skip_if_mpl_2_2(): + mod = safe_import("matplotlib") + + if mod: + v = mod.__version__ + if LooseVersion(v) > LooseVersion('2.1.2'): + return True + else: + mod.use("Agg", warn=False) + + def _skip_if_has_locale(): lang, _ = locale.getlocale() if lang is not None: @@ -151,6 +162,8 @@ def decorated_func(func): reason="Missing matplotlib dependency") skip_if_mpl_1_5 = pytest.mark.skipif(_skip_if_mpl_1_5(), reason="matplotlib 1.5") +xfail_if_mpl_2_2 = pytest.mark.xfail(_skip_if_mpl_2_2(), + reason="matplotlib 2.2") skip_if_32bit = pytest.mark.skipif(is_platform_32bit(), reason="skipping for 32 bit") skip_if_windows = pytest.mark.skipif(is_platform_windows(), diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 942416408e4f08..a223e4d8fd23e9 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1539,16 +1539,16 @@ def makeUnicodeIndex(k=10, name=None): return Index(randu_array(nchars=10, size=k), name=name) -def makeCategoricalIndex(k=10, n=3, name=None): +def makeCategoricalIndex(k=10, n=3, name=None, **kwargs): """ make a length k index or n categories """ x = rands_array(nchars=4, size=n) - return CategoricalIndex(np.random.choice(x, k), name=name) + return CategoricalIndex(np.random.choice(x, k), name=name, **kwargs) -def makeIntervalIndex(k=10, name=None): +def makeIntervalIndex(k=10, name=None, **kwargs): """ make a length k IntervalIndex """ x = np.linspace(0, 100, num=(k + 1)) - return IntervalIndex.from_breaks(x, name=name) + return IntervalIndex.from_breaks(x, name=name, **kwargs) def makeBoolIndex(k=10, name=None): @@ -1567,8 +1567,8 @@ def makeUIntIndex(k=10, name=None): return Index([2**63 + i for i in lrange(k)], name=name) -def makeRangeIndex(k=10, name=None): - return RangeIndex(0, k, 1, name=name) +def makeRangeIndex(k=10, name=None, **kwargs): + return RangeIndex(0, k, 1, name=name, **kwargs) def makeFloatIndex(k=10, name=None): @@ -1576,22 +1576,28 @@ def makeFloatIndex(k=10, name=None): return Index(values * (10 ** np.random.randint(0, 9)), name=name) -def makeDateIndex(k=10, freq='B', name=None): +def makeDateIndex(k=10, freq='B', name=None, **kwargs): dt = datetime(2000, 1, 1) dr = bdate_range(dt, periods=k, freq=freq, name=name) - return DatetimeIndex(dr, name=name) + return DatetimeIndex(dr, name=name, **kwargs) -def makeTimedeltaIndex(k=10, freq='D', name=None): - return TimedeltaIndex(start='1 day', periods=k, freq=freq, name=name) +def makeTimedeltaIndex(k=10, freq='D', name=None, **kwargs): + return TimedeltaIndex(start='1 day', periods=k, freq=freq, + name=name, **kwargs) -def makePeriodIndex(k=10, name=None): +def makePeriodIndex(k=10, name=None, **kwargs): dt = datetime(2000, 1, 1) - dr = PeriodIndex(start=dt, periods=k, freq='B', name=name) + dr = PeriodIndex(start=dt, periods=k, freq='B', name=name, **kwargs) return dr +def makeMultiIndex(k=10, names=None, **kwargs): + return MultiIndex.from_product( + (('foo', 'bar'), (1, 2)), names=names, **kwargs) + + def all_index_generator(k=10): """Generator 
which can be iterated over to get instances of all the
    various index classes.
@@ -1609,6 +1615,17 @@ def all_index_generator(k=10):
         yield make_index_func(k=k)
 
 
+def index_subclass_makers_generator():
+    make_index_funcs = [
+        makeDateIndex, makePeriodIndex,
+        makeTimedeltaIndex, makeRangeIndex,
+        makeIntervalIndex, makeCategoricalIndex,
+        makeMultiIndex
+    ]
+    for make_index_func in make_index_funcs:
+        yield make_index_func
+
+
 def all_timeseries_index_generator(k=10):
     """Generator which can be iterated over to get instances of all the
     classes which represent time-series.
diff --git a/scripts/api_rst_coverage.py b/scripts/api_rst_coverage.py
deleted file mode 100755
index 4800e80d828919..00000000000000
--- a/scripts/api_rst_coverage.py
+++ /dev/null
@@ -1,98 +0,0 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-"""
-Script to generate a report with the coverage of the API in the docs.
-
-The output of this script shows the existing methods that are not
-included in the API documentation, as well as the methods documented
-that do not exist. Ideally, no method should be listed. Currently it
-considers the methods of Series, DataFrame and Panel.
-
-Deprecated methods are usually removed from the documentation, while
-still available for three minor versions. They are listed with the
-word deprecated and the version number next to them.
-
-Usage::
-
-    $ PYTHONPATH=.. ./api_rst_coverage.py
-
-"""
-import os
-import re
-import inspect
-import pandas as pd
-
-
-def main():
-    # classes whose members to check
-    classes = [pd.Series, pd.DataFrame, pd.Panel]
-
-    def class_name_sort_key(x):
-        if x.startswith('Series'):
-            # make sure Series precedes DataFrame, and Panel.
-            return ' ' + x
-        else:
-            return x
-
-    def get_docstring(x):
-        class_name, method = x.split('.')
-        obj = getattr(getattr(pd, class_name), method)
-        return obj.__doc__
-
-    def deprecation_version(x):
-        pattern = re.compile('\.\. deprecated:: ([0-9]+\.[0-9]+\.[0-9]+)')
-        doc = get_docstring(x)
-        match = pattern.search(doc)
-        if match:
-            return match.groups()[0]
-
-    def add_notes(x):
-        # Some methods are not documented in api.rst because they
-        # have been deprecated. Adding a comment to detect them easier.
-        doc = get_docstring(x)
-        note = None
-        if not doc:
-            note = 'no docstring'
-        else:
-            version = deprecation_version(x)
-            if version:
-                note = 'deprecated in {}'.format(version)
-
-        return '{} ({})'.format(x, note) if note else x
-
-    # class members
-    class_members = set()
-    for cls in classes:
-        for member in inspect.getmembers(cls):
-            class_members.add('{cls}.{member}'.format(cls=cls.__name__,
-                                                      member=member[0]))
-
-    # class members referenced in api.rst
-    api_rst_members = set()
-    base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-    api_rst_fname = os.path.join(base_path, 'doc', 'source', 'api.rst')
-    class_names = (cls.__name__ for cls in classes)
-    pattern = re.compile('({})\.(\w+)'.format('|'.join(class_names)))
-    with open(api_rst_fname, 'r') as f:
-        for line in f:
-            match = pattern.search(line)
-            if match:
-                api_rst_members.add(match.group(0))
-
-    print()
-    print("Documented members in api.rst that aren't actual class members:")
-    for x in sorted(api_rst_members.difference(class_members),
-                    key=class_name_sort_key):
-        print(x)
-
-    print()
-    print("Class members (other than those beginning with '_') "
-          "missing from api.rst:")
-    for x in sorted(class_members.difference(api_rst_members),
-                    key=class_name_sort_key):
-        if '._' not in x:
-            print(add_notes(x))
-
-
-if __name__ == "__main__":
-    main()
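The deleted script above is superseded by `scripts/validate_docstrings.py`, added next. As a rough sketch (not from the patch) of how the new module can be driven from a Python session — assuming a pandas source checkout, so that the `numpydoc` copy under `doc/sphinxext` resolves, with the repo root as the working directory:

```
# Hedged sketch: programmatic use of the validate_docstrings module added
# below; the sys.path tweak assumes the pandas repo root is the working dir.
import sys
sys.path.insert(0, 'scripts')

from validate_docstrings import Docstring, _load_obj, validate_one

# Full report for one docstring (written to stderr); returns the error count.
n_errors = validate_one('pandas.DataFrame.head')

# The Docstring wrapper can also be queried piecemeal.
doc = Docstring('pandas.DataFrame.head', _load_obj('pandas.DataFrame.head'))
print(doc.summary)               # one-line summary parsed by numpydoc
print(doc.parameter_mismatches)  # signature vs. 'Parameters' section
```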
diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py
new file mode 100755
index 00000000000000..8425882f07be1a
--- /dev/null
+++ b/scripts/validate_docstrings.py
@@ -0,0 +1,499 @@
+#!/usr/bin/env python
+"""
+Analyze docstrings to detect errors.
+
+If no argument is provided, it does a quick check of docstrings and returns
+a csv with all API functions and results of basic checks.
+
+If a function or method is provided in the form "pandas.function",
+"pandas.module.class.method", etc., a list of all errors in the docstring
+of the specified function or method is reported.
+
+Usage::
+    $ ./validate_docstrings.py
+    $ ./validate_docstrings.py pandas.DataFrame.head
+"""
+import os
+import sys
+import csv
+import re
+import functools
+import collections
+import argparse
+import contextlib
+import pydoc
+import inspect
+import importlib
+import doctest
+try:
+    from io import StringIO
+except ImportError:
+    from cStringIO import StringIO
+import numpy
+
+BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+sys.path.insert(0, os.path.join(BASE_PATH))
+import pandas
+
+sys.path.insert(1, os.path.join(BASE_PATH, 'doc', 'sphinxext'))
+from numpydoc.docscrape import NumpyDocString
+
+
+PRIVATE_CLASSES = ['NDFrame', 'IndexOpsMixin']
+
+
+def _load_obj(obj_name):
+    for maxsplit in range(1, obj_name.count('.') + 1):
+        # TODO when py3 only replace by: module, *func_parts = ...
+        func_name_split = obj_name.rsplit('.', maxsplit=maxsplit)
+        module = func_name_split[0]
+        func_parts = func_name_split[1:]
+        try:
+            obj = importlib.import_module(module)
+        except ImportError:
+            pass
+        else:
+            break
+
+    if 'obj' not in locals():
+        raise ImportError('No module can be imported '
+                          'from "{}"'.format(obj_name))
+
+    for part in func_parts:
+        obj = getattr(obj, part)
+    return obj
+
+
+def _to_original_callable(obj):
+    while True:
+        if inspect.isfunction(obj) or inspect.isclass(obj):
+            f = inspect.getfile(obj)
+            if f.startswith('<') and f.endswith('>'):
+                return None
+            return obj
+        if inspect.ismethod(obj):
+            obj = obj.__func__
+        elif isinstance(obj, functools.partial):
+            obj = obj.func
+        elif isinstance(obj, property):
+            obj = obj.fget
+        else:
+            return None
+
+
+def _output_header(title, width=80, char='#'):
+    full_line = char * width
+    side_len = (width - len(title) - 2) // 2
+    adj = '' if len(title) % 2 == 0 else ' '
+    title_line = '{side} {title}{adj} {side}'.format(side=char * side_len,
+                                                     title=title,
+                                                     adj=adj)
+
+    return '\n{full_line}\n{title_line}\n{full_line}\n\n'.format(
+        full_line=full_line, title_line=title_line)
+
+
+class Docstring:
+    def __init__(self, method_name, method_obj):
+        self.method_name = method_name
+        self.method_obj = method_obj
+        self.raw_doc = method_obj.__doc__ or ''
+        self.clean_doc = pydoc.getdoc(self.method_obj)
+        self.doc = NumpyDocString(self.clean_doc)
+
+    def __len__(self):
+        return len(self.raw_doc)
+
+    @property
+    def is_function_or_method(self):
+        return inspect.isfunction(self.method_obj)
+
+    @property
+    def source_file_name(self):
+        fname = inspect.getsourcefile(self.method_obj)
+        if fname:
+            fname = os.path.relpath(fname, BASE_PATH)
+        return fname
+
+    @property
+    def source_file_def_line(self):
+        try:
+            return inspect.getsourcelines(self.method_obj)[-1]
+        except OSError:
+            pass
+
+    @property
+    def github_url(self):
+        url = 'https://github.com/pandas-dev/pandas/blob/master/'
+        url += '{}#L{}'.format(self.source_file_name,
+                               self.source_file_def_line)
+        return url
+
+    @property
+    def start_blank_lines(self):
+        i = None
+        if self.raw_doc:
+            for i, row in enumerate(self.raw_doc.split('\n')):
+                if row.strip():
+                    break
+        return i
+
+    @property
+    def end_blank_lines(self):
+        i = None
+        if self.raw_doc:
+            for i, row in enumerate(reversed(self.raw_doc.split('\n'))):
+                if row.strip():
+                    break
+        return i
+
+    @property
+    def double_blank_lines(self):
+        prev = True
+        for row in self.raw_doc.split('\n'):
+            if not prev and not row.strip():
+                return True
+            prev = row.strip()
+        return False
+
+    @property
+    def summary(self):
+        if not self.doc['Extended Summary'] and len(self.doc['Summary']) > 1:
+            return ''
+        return ' '.join(self.doc['Summary'])
+
+    @property
+    def extended_summary(self):
+        if not self.doc['Extended Summary'] and len(self.doc['Summary']) > 1:
+            return ' '.join(self.doc['Summary'])
+        return ' '.join(self.doc['Extended Summary'])
+
+    @property
+    def needs_summary(self):
+        return not (bool(self.summary) and bool(self.extended_summary))
+
+    @property
+    def doc_parameters(self):
+        return collections.OrderedDict((name, (type_, ''.join(desc)))
+                                       for name, type_, desc
+                                       in self.doc['Parameters'])
+
+    @property
+    def signature_parameters(self):
+        if (inspect.isclass(self.method_obj)
+                and self.method_name.split('.')[-1] in {'dt', 'str', 'cat'}):
+            # accessor classes have a signature, but don't want to show this
+            return tuple()
+        try:
+            signature = inspect.signature(self.method_obj)
+        except (TypeError, ValueError):
+            # Some objects, mainly in C extensions do not support introspection
+            # of the signature
+            return tuple()
+        params = tuple(signature.parameters.keys())
+        if params and params[0] in ('self', 'cls'):
+            return params[1:]
+        return params
+
+    @property
+    def parameter_mismatches(self):
+        errs = []
+        signature_params = self.signature_parameters
+        doc_params = tuple(self.doc_parameters)
+        missing = set(signature_params) - set(doc_params)
+        if missing:
+            errs.append('Parameters {!r} not documented'.format(missing))
+        extra = set(doc_params) - set(signature_params)
+        if extra:
+            errs.append('Unknown parameters {!r}'.format(extra))
+        if (not missing and not extra and signature_params != doc_params
+                and not (not signature_params and not doc_params)):
+            errs.append('Wrong parameters order. ' +
+                        'Actual: {!r}. '.format(signature_params) +
+                        'Documented: {!r}'.format(doc_params))
+
+        return errs
+
+    @property
+    def correct_parameters(self):
+        return not bool(self.parameter_mismatches)
+
+    def parameter_type(self, param):
+        return self.doc_parameters[param][0]
+
+    def parameter_desc(self, param):
+        return self.doc_parameters[param][1]
+
+    @property
+    def see_also(self):
+        return collections.OrderedDict((name, ''.join(desc))
+                                       for name, desc, _
+                                       in self.doc['See Also'])
+
+    @property
+    def examples(self):
+        return self.doc['Examples']
+
+    @property
+    def returns(self):
+        return self.doc['Returns']
+
+    @property
+    def first_line_ends_in_dot(self):
+        if self.raw_doc:
+            return self.raw_doc.split('\n')[0][-1] == '.'
+
+    @property
+    def deprecated(self):
+        pattern = re.compile('.. deprecated:: ')
+        return (self.method_name.startswith('pandas.Panel') or
+                bool(pattern.search(self.summary)) or
+                bool(pattern.search(self.extended_summary)))
+
+    @property
+    def mentioned_private_classes(self):
+        return [klass for klass in PRIVATE_CLASSES if klass in self.raw_doc]
+
+    @property
+    def examples_errors(self):
+        flags = doctest.NORMALIZE_WHITESPACE | doctest.IGNORE_EXCEPTION_DETAIL
+        finder = doctest.DocTestFinder()
+        runner = doctest.DocTestRunner(optionflags=flags)
+        context = {'np': numpy, 'pd': pandas}
+        error_msgs = ''
+        for test in finder.find(self.raw_doc, self.method_name, globs=context):
+            f = StringIO()
+            with contextlib.redirect_stdout(f):
+                runner.run(test)
+            error_msgs += f.getvalue()
+        return error_msgs
+
+
+def get_api_items():
+    api_fname = os.path.join(BASE_PATH, 'doc', 'source', 'api.rst')
+
+    previous_line = current_section = current_subsection = ''
+    position = None
+    with open(api_fname) as f:
+        for line in f:
+            line = line.strip()
+            if len(line) == len(previous_line):
+                if set(line) == set('-'):
+                    current_section = previous_line
+                    continue
+                if set(line) == set('~'):
+                    current_subsection = previous_line
+                    continue
+
+            if line.startswith('.. currentmodule::'):
+                current_module = line.replace('.. currentmodule::', '').strip()
+                continue
+
+            if line == '.. autosummary::':
+                position = 'autosummary'
+                continue
+
+            if position == 'autosummary':
+                if line == '':
+                    position = 'items'
+                    continue
+
+            if position == 'items':
+                if line == '':
+                    position = None
+                    continue
+                item = line.strip()
+                func = importlib.import_module(current_module)
+                for part in item.split('.'):
+                    func = getattr(func, part)
+
+                yield ('.'.join([current_module, item]), func,
+                       current_section, current_subsection)
+
+            previous_line = line
+
+
+def _csv_row(func_name, func_obj, section, subsection, in_api, seen={}):
+    obj_type = type(func_obj).__name__
+    original_callable = _to_original_callable(func_obj)
+    if original_callable is None:
+        return [func_name, obj_type] + [''] * 12, ''
+    else:
+        doc = Docstring(func_name, original_callable)
+        key = doc.source_file_name, doc.source_file_def_line
+        shared_code = seen.get(key, '')
+        return [func_name,
+                obj_type,
+                in_api,
+                int(doc.deprecated),
+                section,
+                subsection,
+                doc.source_file_name,
+                doc.source_file_def_line,
+                doc.github_url,
+                int(bool(doc.summary)),
+                int(bool(doc.extended_summary)),
+                int(doc.correct_parameters),
+                int(bool(doc.examples)),
+                shared_code], key
+
+
+def validate_all():
+    writer = csv.writer(sys.stdout)
+    cols = ('Function or method',
+            'Type',
+            'In API doc',
+            'Is deprecated',
+            'Section',
+            'Subsection',
+            'File',
+            'Code line',
+            'GitHub link',
+            'Has summary',
+            'Has extended summary',
+            'Parameters ok',
+            'Has examples',
+            'Shared code with')
+    writer.writerow(cols)
+    seen = {}
+    api_items = list(get_api_items())
+    for func_name, func, section, subsection in api_items:
+        row, key = _csv_row(func_name, func, section, subsection,
+                            in_api=1, seen=seen)
+        seen[key] = func_name
+        writer.writerow(row)
+
+    api_item_names = set(list(zip(*api_items))[0])
+    for class_ in (pandas.Series, pandas.DataFrame, pandas.Panel):
+        for member in inspect.getmembers(class_):
+            func_name = 'pandas.{}.{}'.format(class_.__name__, member[0])
+            if (not member[0].startswith('_') and
+                    func_name not in api_item_names):
+                func = _load_obj(func_name)
+                row, key = _csv_row(func_name, func, section='', subsection='',
+                                    in_api=0)
+                writer.writerow(row)
+
+    return 0
+
+
+def validate_one(func_name):
+    func_obj = _load_obj(func_name)
+    doc = Docstring(func_name, func_obj)
+
+    sys.stderr.write(_output_header('Docstring ({})'.format(func_name)))
+    sys.stderr.write('{}\n'.format(doc.clean_doc))
+
+    errs = []
+    if doc.start_blank_lines != 1:
+        errs.append('Docstring text (summary) should start in the line '
+                    'immediately after the opening quotes (not in the same '
+                    'line, or leaving a blank line in between)')
+    if doc.end_blank_lines != 1:
+        errs.append('Closing quotes should be placed in the line after '
+                    'the last text in the docstring (do not close the '
+                    'quotes in the same line as the text, or leave a '
+                    'blank line between the last text and the quotes)')
+    if doc.double_blank_lines:
+        errs.append('Use only one blank line to separate sections or '
+                    'paragraphs')
+
+    if not doc.summary:
+        errs.append('No summary found (a short summary in a single line '
+                    'should be present at the beginning of the docstring)')
+    else:
+        if not doc.summary[0].isupper():
+            errs.append('Summary does not start with capital')
+        if doc.summary[-1] != '.':
+            errs.append('Summary does not end with dot')
+        if (doc.is_function_or_method and
+                doc.summary.split(' ')[0][-1] == 's'):
+            errs.append('Summary must start with infinitive verb, '
+                        'not third person (e.g. use "Generate" instead of '
+                        '"Generates")')
+    if not doc.extended_summary:
+        errs.append('No extended summary found')
+
+    param_errs = doc.parameter_mismatches
+    for param in doc.doc_parameters:
+        if not doc.parameter_type(param):
+            param_errs.append('Parameter "{}" has no type'.format(param))
+        else:
+            if doc.parameter_type(param)[-1] == '.':
+                param_errs.append('Parameter "{}" type '
+                                  'should not finish with "."'.format(param))
+
+        if not doc.parameter_desc(param):
+            param_errs.append('Parameter "{}" '
+                              'has no description'.format(param))
+        else:
+            if not doc.parameter_desc(param)[0].isupper():
+                param_errs.append('Parameter "{}" description '
+                                  'should start with '
+                                  'capital letter'.format(param))
+            if doc.parameter_desc(param)[-1] != '.':
+                param_errs.append('Parameter "{}" description '
+                                  'should finish with "."'.format(param))
+    if param_errs:
+        errs.append('Errors in parameters section')
+        for param_err in param_errs:
+            errs.append('\t{}'.format(param_err))
+
+    if not doc.returns:
+        errs.append('No returns section found')
+
+    mentioned_errs = doc.mentioned_private_classes
+    if mentioned_errs:
+        errs.append('Private classes ({}) should not be mentioned in public '
+                    'docstring.'.format(mentioned_errs))
+
+    if not doc.see_also:
+        errs.append('See Also section not found')
+    else:
+        for rel_name, rel_desc in doc.see_also.items():
+            if not rel_desc:
+                errs.append('Missing description for '
+                            'See Also "{}" reference'.format(rel_name))
+    examples_errs = ''
+    if not doc.examples:
+        errs.append('No examples section found')
+    else:
+        examples_errs = doc.examples_errors
+        if examples_errs:
+            errs.append('Examples do not pass tests')
+
+    sys.stderr.write(_output_header('Validation'))
+    if errs:
+        sys.stderr.write('Errors found:\n')
+        for err in errs:
+            sys.stderr.write('\t{}\n'.format(err))
+    else:
+        sys.stderr.write('Docstring for "{}" correct. :)\n'.format(func_name))
+
+    if examples_errs:
+        sys.stderr.write(_output_header('Doctests'))
+        sys.stderr.write(examples_errs)
+
+    return len(errs)
+
+
+def main(function):
+    if function is None:
+        return validate_all()
+    else:
+        return validate_one(function)
+
+
+if __name__ == '__main__':
+    argparser = argparse.ArgumentParser(
+        description='validate pandas docstrings')
+    argparser.add_argument('function',
+                           nargs='?',
+                           default=None,
+                           help=('function or method to validate '
+                                 '(e.g. pandas.DataFrame.head) '
+                                 'if not provided, all docstrings '
+                                 'are validated'))
+    args = argparser.parse_args()
+    sys.exit(main(args.function))
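Because the no-argument mode streams its report as CSV to stdout, the output can be loaded straight back into pandas for triage — a sketch only, again assuming the repo root as the working directory; the column names are the ones defined in `validate_all` above:

```
# Sketch only: load the CSV emitted by `validate_docstrings.py` (no args)
# back into pandas.  Assumes the pandas repo root is the working directory.
import subprocess
from io import StringIO

import pandas as pd

csv_text = subprocess.check_output(
    ['python', 'scripts/validate_docstrings.py']).decode('utf-8')
report = pd.read_csv(StringIO(csv_text))

# e.g. docstrings that are in the API docs but still lack an Examples section
todo = report[(report['In API doc'] == 1) & (report['Has examples'] == 0)]
print(todo['Function or method'])
```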
use "Generate" instead of ' + '"Generates")') + if not doc.extended_summary: + errs.append('No extended summary found') + + param_errs = doc.parameter_mismatches + for param in doc.doc_parameters: + if not doc.parameter_type(param): + param_errs.append('Parameter "{}" has no type'.format(param)) + else: + if doc.parameter_type(param)[-1] == '.': + param_errs.append('Parameter "{}" type ' + 'should not finish with "."'.format(param)) + + if not doc.parameter_desc(param): + param_errs.append('Parameter "{}" ' + 'has no description'.format(param)) + else: + if not doc.parameter_desc(param)[0].isupper(): + param_errs.append('Parameter "{}" description ' + 'should start with ' + 'capital letter'.format(param)) + if doc.parameter_desc(param)[-1] != '.': + param_errs.append('Parameter "{}" description ' + 'should finish with "."'.format(param)) + if param_errs: + errs.append('Errors in parameters section') + for param_err in param_errs: + errs.append('\t{}'.format(param_err)) + + if not doc.returns: + errs.append('No returns section found') + + mentioned_errs = doc.mentioned_private_classes + if mentioned_errs: + errs.append('Private classes ({}) should not be mentioned in public ' + 'docstring.'.format(mentioned_errs)) + + if not doc.see_also: + errs.append('See Also section not found') + else: + for rel_name, rel_desc in doc.see_also.items(): + if not rel_desc: + errs.append('Missing description for ' + 'See Also "{}" reference'.format(rel_name)) + examples_errs = '' + if not doc.examples: + errs.append('No examples section found') + else: + examples_errs = doc.examples_errors + if examples_errs: + errs.append('Examples do not pass tests') + + sys.stderr.write(_output_header('Validation')) + if errs: + sys.stderr.write('Errors found:\n') + for err in errs: + sys.stderr.write('\t{}\n'.format(err)) + else: + sys.stderr.write('Docstring for "{}" correct. :)\n'.format(func_name)) + + if examples_errs: + sys.stderr.write(_output_header('Doctests')) + sys.stderr.write(examples_errs) + + return len(errs) + + +def main(function): + if function is None: + return validate_all() + else: + return validate_one(function) + + +if __name__ == '__main__': + argparser = argparse.ArgumentParser( + description='validate pandas docstrings') + argparser.add_argument('function', + nargs='?', + default=None, + help=('function or method to validate ' + '(e.g. pandas.DataFrame.head) ' + 'if not provided, all docstrings ' + 'are validated')) + args = argparser.parse_args() + sys.exit(main(args.function)) diff --git a/setup.py b/setup.py index c7784260d79ca7..7fb5358d0950b4 100755 --- a/setup.py +++ b/setup.py @@ -313,6 +313,7 @@ class CheckSDist(sdist_class): 'pandas/_libs/testing.pyx', 'pandas/_libs/skiplist.pyx', 'pandas/_libs/sparse.pyx', + 'pandas/_libs/ops.pyx', 'pandas/_libs/parsers.pyx', 'pandas/_libs/tslibs/ccalendar.pyx', 'pandas/_libs/tslibs/period.pyx', @@ -525,6 +526,10 @@ def pxd(name): '_libs.reduction': { 'pyxfile': '_libs/reduction', 'pxdfiles': ['_libs/src/util']}, + '_libs.ops': { + 'pyxfile': '_libs/ops', + 'pxdfiles': ['_libs/src/util', + '_libs/missing']}, '_libs.tslibs.period': { 'pyxfile': '_libs/tslibs/period', 'pxdfiles': ['_libs/src/util',