diff --git a/.travis.yml b/.travis.yml index bd5cac8955c8d..4cbe7f86bd2fa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -73,6 +73,10 @@ matrix: env: - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" # In allow_failures + - dist: trusty + env: + - JOB="3.6_ASV" ASV=true + # In allow_failures - dist: trusty env: - JOB="3.6_DOC" DOC=true @@ -93,6 +97,9 @@ matrix: - dist: trusty env: - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" + - dist: trusty + env: + - JOB="3.6_ASV" ASV=true - dist: trusty env: - JOB="3.6_DOC" DOC=true @@ -128,6 +135,7 @@ script: - ci/script_single.sh - ci/script_multi.sh - ci/lint.sh + - ci/asv.sh - echo "checking imports" - source activate pandas && python ci/check_imports.py - echo "script done" diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 45d62163ae80b..cccd38ef11251 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -1,3 +1,4 @@ +import warnings from importlib import import_module import numpy as np @@ -83,7 +84,8 @@ def setup(self): self.all = self.uniques.repeat(10) def time_match_string(self): - pd.match(self.all, self.uniques) + with warnings.catch_warnings(record=True): + pd.match(self.all, self.uniques) class Hashing(object): diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 1613ca1b97f4b..7743921003353 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np import pandas as pd import pandas.util.testing as tm @@ -119,11 +121,15 @@ def setup(self): self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str) self.s_str_cat = self.s_str.astype('category') - self.s_str_cat_ordered = self.s_str.astype('category', ordered=True) + with warnings.catch_warnings(record=True): + self.s_str_cat_ordered = self.s_str.astype('category', + ordered=True) self.s_int = pd.Series(np.random.randint(0, ncats, size=N)) self.s_int_cat = self.s_int.astype('category') - self.s_int_cat_ordered = self.s_int.astype('category', ordered=True) + with warnings.catch_warnings(record=True): + self.s_int_cat_ordered = self.s_int.astype('category', + ordered=True) def time_rank_string(self): self.s_str.rank() diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 4cecf12a27042..4ff71c706cd34 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -1,4 +1,6 @@ import string +import warnings + import numpy as np import pandas.util.testing as tm from pandas import (DataFrame, Series, MultiIndex, date_range, period_range, @@ -15,7 +17,8 @@ def setup(self): self.df = DataFrame(np.random.randn(10000, 25)) self.df['foo'] = 'bar' self.df['bar'] = 'baz' - self.df = self.df.consolidate() + with warnings.catch_warnings(record=True): + self.df = self.df.consolidate() def time_frame_get_numeric_data(self): self.df._get_numeric_data() @@ -141,8 +144,8 @@ class Repr(object): def setup(self): nrows = 10000 data = np.random.randn(nrows, 10) - idx = MultiIndex.from_arrays(np.tile(np.random.randn(3, nrows / 100), - 100)) + arrays = np.tile(np.random.randn(3, int(nrows / 100)), 100) + idx = MultiIndex.from_arrays(arrays) self.df3 = DataFrame(data, index=idx) self.df4 = DataFrame(data, index=np.random.randn(nrows)) self.df_tall = DataFrame(np.random.randn(nrows, 10)) diff --git a/asv_bench/benchmarks/gil.py 
b/asv_bench/benchmarks/gil.py index 7d63d78084270..21c1ccf46e1c4 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -1,9 +1,13 @@ import numpy as np import pandas.util.testing as tm -from pandas import (DataFrame, Series, rolling_median, rolling_mean, - rolling_min, rolling_max, rolling_var, rolling_skew, - rolling_kurt, rolling_std, read_csv, factorize, date_range) +from pandas import DataFrame, Series, read_csv, factorize, date_range from pandas.core.algorithms import take_1d +try: + from pandas import (rolling_median, rolling_mean, rolling_min, rolling_max, + rolling_var, rolling_skew, rolling_kurt, rolling_std) + have_rolling_methods = True +except ImportError: + have_rolling_methods = False try: from pandas._libs import algos except ImportError: @@ -171,8 +175,7 @@ def run(period): class ParallelRolling(object): goal_time = 0.2 - params = ['rolling_median', 'rolling_mean', 'rolling_min', 'rolling_max', - 'rolling_var', 'rolling_skew', 'rolling_kurt', 'rolling_std'] + params = ['median', 'mean', 'min', 'max', 'var', 'skew', 'kurt', 'std'] param_names = ['method'] def setup(self, method): @@ -181,34 +184,28 @@ def setup(self, method): win = 100 arr = np.random.rand(100000) if hasattr(DataFrame, 'rolling'): - rolling = {'rolling_median': 'median', - 'rolling_mean': 'mean', - 'rolling_min': 'min', - 'rolling_max': 'max', - 'rolling_var': 'var', - 'rolling_skew': 'skew', - 'rolling_kurt': 'kurt', - 'rolling_std': 'std'} df = DataFrame(arr).rolling(win) @test_parallel(num_threads=2) def parallel_rolling(): - getattr(df, rolling[method])() + getattr(df, method)() self.parallel_rolling = parallel_rolling - else: - rolling = {'rolling_median': rolling_median, - 'rolling_mean': rolling_mean, - 'rolling_min': rolling_min, - 'rolling_max': rolling_max, - 'rolling_var': rolling_var, - 'rolling_skew': rolling_skew, - 'rolling_kurt': rolling_kurt, - 'rolling_std': rolling_std} + elif have_rolling_methods: + rolling = {'median': rolling_median, + 'mean': rolling_mean, + 'min': rolling_min, + 'max': rolling_max, + 'var': rolling_var, + 'skew': rolling_skew, + 'kurt': rolling_kurt, + 'std': rolling_std} @test_parallel(num_threads=2) def parallel_rolling(): rolling[method](arr, win) self.parallel_rolling = parallel_rolling + else: + raise NotImplementedError def time_rolling(self, method): self.parallel_rolling() diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 4dfd215e6dc3a..61db39528a5fb 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -1,3 +1,4 @@ +import warnings from string import ascii_letters from itertools import product from functools import partial @@ -159,6 +160,22 @@ def time_series_nth(self, df): df[1].groupby(df[0]).nth(0) +class NthObject(object): + + goal_time = 0.2 + + def setup_cache(self): + df = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g']) + df['obj'] = ['a'] * 5000 + ['b'] * 5000 + return df + + def time_nth(self, df): + df.groupby('g').nth(5) + + def time_nth_last(self, df): + df.groupby('g').last() + + class DateAttributes(object): goal_time = 0.2 @@ -340,7 +357,8 @@ def time_dt_size(self): self.df.groupby(['dates']).size() def time_dt_timegrouper_size(self): - self.df.groupby(TimeGrouper(key='dates', freq='M')).size() + with warnings.catch_warnings(record=True): + self.df.groupby(TimeGrouper(key='dates', freq='M')).size() def time_category_size(self): self.draws.groupby(self.cats).size() @@ -467,7 +485,7 @@ class SumMultiLevel(object): def setup(self): N = 50 - 
self.df = DataFrame({'A': range(N) * 2, + self.df = DataFrame({'A': list(range(N)) * 2, 'B': range(N * 2), 'C': 1}).set_index(['A', 'B']) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 970760373632a..f1703e163917a 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -147,6 +147,11 @@ def setup(self, dtype): self.idx = getattr(tm, 'make{}Index'.format(dtype))(N) self.array_mask = (np.arange(N) % 3) == 0 self.series_mask = Series(self.array_mask) + self.sorted = self.idx.sort_values() + half = N // 2 + self.non_unique = self.idx[:half].append(self.idx[:half]) + self.non_unique_sorted = self.sorted[:half].append(self.sorted[:half]) + self.key = self.sorted[N // 4] def time_boolean_array(self, dtype): self.idx[self.array_mask] @@ -163,6 +168,18 @@ def time_slice(self, dtype): def time_slice_step(self, dtype): self.idx[::2] + def time_get_loc(self, dtype): + self.idx.get_loc(self.key) + + def time_get_loc_sorted(self, dtype): + self.sorted.get_loc(self.key) + + def time_get_loc_non_unique(self, dtype): + self.non_unique.get_loc(self.key) + + def time_get_loc_non_unique_sorted(self, dtype): + self.non_unique_sorted.get_loc(self.key) + class Float64IndexMethod(object): # GH 13166 diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index b35f00db2b054..77e013e1e4fb0 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np import pandas.util.testing as tm from pandas import (Series, DataFrame, MultiIndex, Int64Index, Float64Index, @@ -91,7 +93,8 @@ def time_getitem_pos_slice(self, index): self.s[:80000] def time_get_value(self, index): - self.s.get_value(self.lbl) + with warnings.catch_warnings(record=True): + self.s.get_value(self.lbl) def time_getitem_scalar(self, index): self.s[self.lbl] @@ -112,7 +115,8 @@ def setup(self): self.bool_obj_indexer = self.bool_indexer.astype(object) def time_get_value(self): - self.df.get_value(self.idx_scalar, self.col_scalar) + with warnings.catch_warnings(record=True): + self.df.get_value(self.idx_scalar, self.col_scalar) def time_ix(self): self.df.ix[self.idx_scalar, self.col_scalar] @@ -231,11 +235,13 @@ class PanelIndexing(object): goal_time = 0.2 def setup(self): - self.p = Panel(np.random.randn(100, 100, 100)) - self.inds = range(0, 100, 10) + with warnings.catch_warnings(record=True): + self.p = Panel(np.random.randn(100, 100, 100)) + self.inds = range(0, 100, 10) def time_subset(self): - self.p.ix[(self.inds, self.inds, self.inds)] + with warnings.catch_warnings(record=True): + self.p.ix[(self.inds, self.inds, self.inds)] class MethodLookup(object): @@ -295,7 +301,8 @@ def setup(self): def time_insert(self): np.random.seed(1234) for i in range(100): - self.df.insert(0, i, np.random.randn(self.N)) + self.df.insert(0, i, np.random.randn(self.N), + allow_duplicates=True) def time_assign_with_setitem(self): np.random.seed(1234) diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index 5c0e9586c1cb5..4b6e1d69af92d 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np from pandas import DataFrame, Panel, date_range, HDFStore, read_hdf import pandas.util.testing as tm @@ -105,22 +107,25 @@ class HDFStorePanel(BaseIO): def setup(self): self.fname = '__test__.h5' - self.p = Panel(np.random.randn(20, 1000, 25), - items=['Item%03d' % i for i in range(20)], - 
major_axis=date_range('1/1/2000', periods=1000), - minor_axis=['E%03d' % i for i in range(25)]) - self.store = HDFStore(self.fname) - self.store.append('p1', self.p) + with warnings.catch_warnings(record=True): + self.p = Panel(np.random.randn(20, 1000, 25), + items=['Item%03d' % i for i in range(20)], + major_axis=date_range('1/1/2000', periods=1000), + minor_axis=['E%03d' % i for i in range(25)]) + self.store = HDFStore(self.fname) + self.store.append('p1', self.p) def teardown(self): self.store.close() self.remove(self.fname) def time_read_store_table_panel(self): - self.store.select('p1') + with warnings.catch_warnings(record=True): + self.store.select('p1') def time_write_store_table_panel(self): - self.store.append('p2', self.p) + with warnings.catch_warnings(record=True): + self.store.append('p2', self.p) class HDF(BaseIO): diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 5b40a29d54683..de0a3b33da147 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -1,3 +1,4 @@ +import warnings import string import numpy as np @@ -26,7 +27,8 @@ def setup(self): self.mdf1['obj2'] = 'bar' self.mdf1['int1'] = 5 try: - self.mdf1.consolidate(inplace=True) + with warnings.catch_warnings(record=True): + self.mdf1.consolidate(inplace=True) except: pass self.mdf2 = self.mdf1.copy() @@ -75,16 +77,23 @@ class ConcatPanels(object): param_names = ['axis', 'ignore_index'] def setup(self, axis, ignore_index): - panel_c = Panel(np.zeros((10000, 200, 2), dtype=np.float32, order='C')) - self.panels_c = [panel_c] * 20 - panel_f = Panel(np.zeros((10000, 200, 2), dtype=np.float32, order='F')) - self.panels_f = [panel_f] * 20 + with warnings.catch_warnings(record=True): + panel_c = Panel(np.zeros((10000, 200, 2), + dtype=np.float32, + order='C')) + self.panels_c = [panel_c] * 20 + panel_f = Panel(np.zeros((10000, 200, 2), + dtype=np.float32, + order='F')) + self.panels_f = [panel_f] * 20 def time_c_ordered(self, axis, ignore_index): - concat(self.panels_c, axis=axis, ignore_index=ignore_index) + with warnings.catch_warnings(record=True): + concat(self.panels_c, axis=axis, ignore_index=ignore_index) def time_f_ordered(self, axis, ignore_index): - concat(self.panels_f, axis=axis, ignore_index=ignore_index) + with warnings.catch_warnings(record=True): + concat(self.panels_f, axis=axis, ignore_index=ignore_index) class ConcatDataFrames(object): diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py index 034e861e7fc01..e161b887ee86f 100644 --- a/asv_bench/benchmarks/offset.py +++ b/asv_bench/benchmarks/offset.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import warnings from datetime import datetime import numpy as np @@ -76,7 +77,8 @@ def setup(self, offset): self.data = pd.Series(rng) def time_add_offset(self, offset): - self.data + offset + with warnings.catch_warnings(record=True): + self.data + offset class OffsetDatetimeIndexArithmetic(object): @@ -90,7 +92,8 @@ def setup(self, offset): self.data = pd.date_range(start='1/1/2000', periods=N, freq='T') def time_add_offset(self, offset): - self.data + offset + with warnings.catch_warnings(record=True): + self.data + offset class OffestDatetimeArithmetic(object): diff --git a/asv_bench/benchmarks/panel_ctor.py b/asv_bench/benchmarks/panel_ctor.py index 456fe959c5aa3..ce946c76ed199 100644 --- a/asv_bench/benchmarks/panel_ctor.py +++ b/asv_bench/benchmarks/panel_ctor.py @@ -1,3 +1,4 @@ +import warnings from datetime import datetime, timedelta from pandas import DataFrame, 
DatetimeIndex, date_range @@ -19,7 +20,8 @@ def setup(self): self.data_frames[x] = df def time_from_dict(self): - Panel.from_dict(self.data_frames) + with warnings.catch_warnings(record=True): + Panel.from_dict(self.data_frames) class SameIndexes(object): @@ -34,7 +36,8 @@ def setup(self): self.data_frames = dict(enumerate([df] * 100)) def time_from_dict(self): - Panel.from_dict(self.data_frames) + with warnings.catch_warnings(record=True): + Panel.from_dict(self.data_frames) class TwoIndexes(object): @@ -53,4 +56,5 @@ def setup(self): self.data_frames = dict(enumerate(dfs)) def time_from_dict(self): - Panel.from_dict(self.data_frames) + with warnings.catch_warnings(record=True): + Panel.from_dict(self.data_frames) diff --git a/asv_bench/benchmarks/panel_methods.py b/asv_bench/benchmarks/panel_methods.py index 9ee1949b311db..a5b1a92e9cf67 100644 --- a/asv_bench/benchmarks/panel_methods.py +++ b/asv_bench/benchmarks/panel_methods.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np from .pandas_vb_common import Panel, setup # noqa @@ -10,10 +12,13 @@ class PanelMethods(object): param_names = ['axis'] def setup(self, axis): - self.panel = Panel(np.random.randn(100, 1000, 100)) + with warnings.catch_warnings(record=True): + self.panel = Panel(np.random.randn(100, 1000, 100)) def time_pct_change(self, axis): - self.panel.pct_change(1, axis=axis) + with warnings.catch_warnings(record=True): + self.panel.pct_change(1, axis=axis) def time_shift(self, axis): - self.panel.shift(1, axis=axis) + with warnings.catch_warnings(record=True): + self.panel.shift(1, axis=axis) diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 69a1a604b1ccc..413427a16f40b 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -167,10 +167,6 @@ def setup(self): col_array2 = col_array.copy() col_array2[:, :10000] = np.nan self.col_array_list = list(col_array) - self.col_array_list2 = list(col_array2) def time_lib_fast_zip(self): lib.fast_zip(self.col_array_list) - - def time_lib_fast_zip_fillna(self): - lib.fast_zip_fillna(self.col_array_list2) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index bd3b580d9d130..9044b080c45f9 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -104,9 +104,9 @@ def setup(self): self.letters = list('ABCD') yrvars = [l + str(num) for l, num in product(self.letters, range(1, nyrs + 1))] - + columns = [str(i) for i in range(nidvars)] + yrvars self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)), - columns=list(range(nidvars)) + yrvars) + columns=columns) self.df['id'] = self.df.index def time_wide_to_long_big(self): diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 4435327e1eb38..b203c8b0fa5c9 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np from pandas import Series import pandas.util.testing as tm @@ -23,7 +25,8 @@ def time_endswith(self): self.s.str.endswith('A') def time_extract(self): - self.s.str.extract('(\\w*)A(\\w*)') + with warnings.catch_warnings(record=True): + self.s.str.extract('(\\w*)A(\\w*)') def time_findall(self): self.s.str.findall('[A-Z]+') diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index ea2f077f980d0..e1a6bc7a68e9d 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -1,3 +1,4 @@ +import warnings from datetime import timedelta import 
numpy as np @@ -74,7 +75,8 @@ def setup(self): freq='S')) def time_infer_dst(self): - self.index.tz_localize('US/Eastern', infer_dst=True) + with warnings.catch_warnings(record=True): + self.index.tz_localize('US/Eastern', infer_dst=True) class ResetIndex(object): @@ -365,7 +367,7 @@ class ToDatetimeCache(object): def setup(self, cache): N = 10000 - self.unique_numeric_seconds = range(N) + self.unique_numeric_seconds = list(range(N)) self.dup_numeric_seconds = [1000] * N self.dup_string_dates = ['2000-02-11'] * N self.dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * N diff --git a/ci/asv.sh b/ci/asv.sh new file mode 100755 index 0000000000000..1e9a8d6380eb5 --- /dev/null +++ b/ci/asv.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +echo "inside $0" + +source activate pandas + +RET=0 + +if [ "$ASV" ]; then + echo "Check for failed asv benchmarks" + + cd asv_bench + + asv machine --yes + + time asv dev | tee failed_asv.txt + + echo "The following asvs benchmarks (if any) failed." + + cat failed_asv.txt | grep "failed" failed_asv.txt + + if [ $? = "0" ]; then + RET=1 + fi + + echo "DONE displaying failed asvs benchmarks." + + rm failed_asv.txt + + echo "Check for failed asv benchmarks DONE" +else + echo "NOT checking for failed asv benchmarks" +fi + +exit $RET diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 4ec5b0a9d8820..6e270519e60c3 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -110,7 +110,7 @@ if [ -e ${REQ} ]; then fi time conda install -n pandas pytest>=3.1.0 -time pip install pytest-xdist moto +time pip install -q pytest-xdist moto if [ "$LINT" ]; then conda install flake8=3.4.1 @@ -181,10 +181,10 @@ elif [ "$CONDA_BUILD_TEST" ]; then # build & install testing echo "[building conda recipe]" - time conda build ./conda.recipe --numpy 1.13 --python 3.5 -q --no-test + time conda build ./conda.recipe --python 3.5 -q --no-test || exit 1 echo "[installing]" - conda install pandas --use-local + conda install pandas --use-local || exit 1 else diff --git a/ci/requirements-3.5_CONDA_BUILD_TEST.build b/ci/requirements-3.5_CONDA_BUILD_TEST.build index 6648e3778777c..f7befe3b31865 100644 --- a/ci/requirements-3.5_CONDA_BUILD_TEST.build +++ b/ci/requirements-3.5_CONDA_BUILD_TEST.build @@ -2,5 +2,5 @@ python=3.5* python-dateutil pytz nomkl -numpy=1.13* +numpy cython diff --git a/ci/requirements-3.5_CONDA_BUILD_TEST.run b/ci/requirements-3.5_CONDA_BUILD_TEST.run index 19d9a91e86585..669cf437f2164 100644 --- a/ci/requirements-3.5_CONDA_BUILD_TEST.run +++ b/ci/requirements-3.5_CONDA_BUILD_TEST.run @@ -1,5 +1,5 @@ pytz -numpy=1.13* +numpy openpyxl xlsxwriter xlrd diff --git a/ci/requirements-3.5_CONDA_BUILD_TEST.sh b/ci/requirements-3.5_CONDA_BUILD_TEST.sh index 09d6775cfc894..093fdbcf21d78 100644 --- a/ci/requirements-3.5_CONDA_BUILD_TEST.sh +++ b/ci/requirements-3.5_CONDA_BUILD_TEST.sh @@ -8,4 +8,4 @@ echo "install 35 CONDA_BUILD_TEST" conda remove -n pandas python-dateutil --force pip install python-dateutil -conda install -n pandas -c conda-forge feather-format pyarrow=0.5.0 +conda install -n pandas -c conda-forge feather-format pyarrow=0.7.1 diff --git a/ci/requirements-3.6.build b/ci/requirements-3.6.build index 94e1152450d87..1c4b46aea3865 100644 --- a/ci/requirements-3.6.build +++ b/ci/requirements-3.6.build @@ -2,5 +2,5 @@ python=3.6* python-dateutil pytz nomkl -numpy=1.13.* +numpy cython diff --git a/ci/requirements-3.6.run b/ci/requirements-3.6.run index e30461d06b8ea..822144a80bc9a 100644 --- a/ci/requirements-3.6.run +++ b/ci/requirements-3.6.run @@ -13,7 +13,7 @@ lxml 
html5lib jinja2 sqlalchemy -pymysql<0.8.0 +pymysql feather-format pyarrow psycopg2 diff --git a/ci/requirements-3.6_ASV.build b/ci/requirements-3.6_ASV.build new file mode 100644 index 0000000000000..bc72eed2a0d4e --- /dev/null +++ b/ci/requirements-3.6_ASV.build @@ -0,0 +1,5 @@ +python=3.6* +python-dateutil +pytz +numpy=1.13* +cython diff --git a/ci/requirements-3.6_ASV.run b/ci/requirements-3.6_ASV.run new file mode 100644 index 0000000000000..6c45e3371e9cf --- /dev/null +++ b/ci/requirements-3.6_ASV.run @@ -0,0 +1,25 @@ +ipython +ipykernel +ipywidgets +sphinx=1.5* +nbconvert +nbformat +notebook +matplotlib +seaborn +scipy +lxml +beautifulsoup4 +html5lib +pytables +python-snappy +openpyxl +xlrd +xlwt +xlsxwriter +sqlalchemy +numexpr +bottleneck +statsmodels +xarray +pyqt diff --git a/ci/requirements-3.6_ASV.sh b/ci/requirements-3.6_ASV.sh new file mode 100755 index 0000000000000..8a46f85dbb6bc --- /dev/null +++ b/ci/requirements-3.6_ASV.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +source activate pandas + +echo "[install ASV_BUILD deps]" + +pip install git+https://github.com/spacetelescope/asv diff --git a/ci/script_multi.sh b/ci/script_multi.sh index c1fa756ece965..766e51625fbe6 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -37,6 +37,9 @@ if [ "$PIP_BUILD_TEST" ] || [ "$CONDA_BUILD_TEST" ]; then elif [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" +elif [ "$ASV" ]; then + echo "We are not running pytest as this is an asv-build" + elif [ "$COVERAGE" ]; then echo pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas diff --git a/ci/script_single.sh b/ci/script_single.sh index 005c648ee025f..153847ab2e8c9 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -22,6 +22,9 @@ if [ "$PIP_BUILD_TEST" ] || [ "$CONDA_BUILD_TEST" ]; then elif [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" +elif [ "$ASV" ]; then + echo "We are not running pytest as this is an asv-build" + elif [ "$COVERAGE" ]; then echo pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index 87a79f7e5a987..86bed996c8aab 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -14,14 +14,14 @@ requirements: build: - python - cython - - {{ pin_compatible('numpy', upper_bound='1.14') }} + - numpy 1.11.* - setuptools >=3.3 - python-dateutil >=2.5.0 - pytz run: - python - - {{ pin_compatible('numpy', upper_bound='1.14') }} + - numpy >=1.11.* - python-dateutil >=2.5.0 - pytz diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 25f7c5a3ad948..ca903dadc6eb1 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -672,7 +672,7 @@ The ``CategoricalIndex`` is **preserved** after indexing: df2.loc['a'].index Sorting the index will sort by the order of the categories (Recall that we -created the index with with ``CategoricalDtype(list('cab'))``, so the sorted +created the index with ``CategoricalDtype(list('cab'))``, so the sorted order is ``cab``.). .. 
ipython:: python diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 18da53506f018..749d4be11ad45 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -774,9 +774,9 @@ We encourage you to view the source code of :meth:`~DataFrame.pipe`. Row or Column-wise Function Application ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Arbitrary functions can be applied along the axes of a DataFrame or Panel +Arbitrary functions can be applied along the axes of a DataFrame using the :meth:`~DataFrame.apply` method, which, like the descriptive -statistics methods, take an optional ``axis`` argument: +statistics methods, takes an optional ``axis`` argument: .. ipython:: python @@ -793,8 +793,16 @@ The :meth:`~DataFrame.apply` method will also dispatch on a string method name. df.apply('mean') df.apply('mean', axis=1) -Depending on the return type of the function passed to :meth:`~DataFrame.apply`, -the result will either be of lower dimension or the same dimension. +The return type of the function passed to :meth:`~DataFrame.apply` affects the +type of the final output from ``DataFrame.apply`` for the default behaviour: + +* If the applied function returns a ``Series``, the final output is a ``DataFrame``. + The columns match the index of the ``Series`` returned by the applied function. +* If the applied function returns any other type, the final output is a ``Series``. + +This default behaviour can be overridden using the ``result_type``, which +accepts three options: ``reduce``, ``broadcast``, and ``expand``. +These will determine how list-likes return values expand (or not) to a ``DataFrame``. :meth:`~DataFrame.apply` combined with some cleverness can be used to answer many questions about a data set. For example, suppose we wanted to extract the date where the diff --git a/doc/source/comparison_with_sas.rst b/doc/source/comparison_with_sas.rst index e9e0d7716af3a..214667119f7e0 100644 --- a/doc/source/comparison_with_sas.rst +++ b/doc/source/comparison_with_sas.rst @@ -279,7 +279,7 @@ date/datetime columns. The equivalent pandas operations are shown below. In addition to these functions pandas supports other Time Series features -not available in Base SAS (such as resampling and and custom offsets) - +not available in Base SAS (such as resampling and custom offsets) - see the :ref:`timeseries documentation` for more details. .. ipython:: python @@ -584,7 +584,7 @@ For example, in SAS you could do this to filter missing values. if value_x ^= .; run; -Which doesn't work in in pandas. Instead, the ``pd.isna`` or ``pd.notna`` functions +Which doesn't work in pandas. Instead, the ``pd.isna`` or ``pd.notna`` functions should be used for comparisons. .. ipython:: python diff --git a/doc/source/computation.rst b/doc/source/computation.rst index a64542fa71705..4285767654e25 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -512,7 +512,7 @@ a same sized result as the input. When using ``.resample()`` with an offset. Construct a new index that is the frequency of the offset. For each frequency bin, aggregate points from the input within a backwards-in-time looking window that fall in that bin. The result of this -aggregation is the output for that frequency point. The windows are fixed size size in the frequency space. Your result +aggregation is the output for that frequency point. The windows are fixed size in the frequency space. Your result will have the shape of a regular frequency between the min and the max of the original input object. 
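As a quick illustration of the ``apply`` return-type rules and the ``result_type`` override described in the ``basics.rst`` changes above (a sketch only, assuming a pandas version where ``result_type`` is available, i.e. 0.23+; the frame and lambdas are made up for the example):

.. code-block:: python

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=['A', 'B', 'C'])

    # Returning a Series -> the result is a DataFrame whose columns come
    # from the returned Series' index.
    df.apply(lambda row: pd.Series({'lo': row.min(), 'hi': row.max()}), axis=1)

    # Returning any other type (here a list) -> the result is a Series.
    df.apply(lambda row: [row.min(), row.max()], axis=1)

    # result_type='expand' expands list-like return values into columns.
    df.apply(lambda row: [row.min(), row.max()], axis=1, result_type='expand')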
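And a minimal sketch of the ``.rolling()`` versus ``.resample()`` contrast described just above (toy data; the 2-second window is an arbitrary choice for the example):

.. code-block:: python

    import numpy as np
    import pandas as pd

    s = pd.Series(np.arange(5, dtype=float),
                  index=pd.date_range('2018-01-01', periods=5, freq='s'))

    # Time-based rolling: one output value per input point, each aggregating
    # the points inside the 2-second window ending at that point.
    s.rolling('2s').sum()

    # Resampling: one output value per 2-second bin of a new regular index.
    s.resample('2s').sum()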
To summarize, ``.rolling()`` is a time-based window operation, while ``.resample()`` is a frequency-based window operation. diff --git a/doc/source/conf.py b/doc/source/conf.py index c188f83f80250..7c4edd0486636 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -15,6 +15,8 @@ import re import inspect import importlib +import warnings + from pandas.compat import u, PY3 try: @@ -375,6 +377,13 @@ 'wiki': ('https://github.com/pandas-dev/pandas/wiki/%s', 'wiki ')} + +# ignore all deprecation warnings from Panel during doc build +# (to avoid the need to add :okwarning: in many places) +warnings.filterwarnings("ignore", message="\nPanel is deprecated", + category=FutureWarning) + + ipython_exec_lines = [ 'import numpy as np', 'import pandas as pd', diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index d7650b6b0938f..78e2fdb46f659 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -95,7 +95,7 @@ constructed from the sorted keys of the dict, if possible. NaN (not a number) is the standard missing data marker used in pandas. -**From scalar value** +**From scalar value** If ``data`` is a scalar value, an index must be provided. The value will be repeated to match the length of **index**. @@ -154,7 +154,7 @@ See also the :ref:`section on attribute access`. Vectorized operations and label alignment with Series ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -When working with raw NumPy arrays, looping through value-by-value is usually +When working with raw NumPy arrays, looping through value-by-value is usually not necessary. The same is true when working with Series in pandas. Series can also be passed into most NumPy methods expecting an ndarray. @@ -324,7 +324,7 @@ From a list of dicts From a dict of tuples ~~~~~~~~~~~~~~~~~~~~~ -You can automatically create a multi-indexed frame by passing a tuples +You can automatically create a multi-indexed frame by passing a tuples dictionary. .. ipython:: python @@ -347,7 +347,7 @@ column name provided). **Missing Data** Much more will be said on this topic in the :ref:`Missing data ` -section. To construct a DataFrame with missing data, we use ``np.nan`` to +section. To construct a DataFrame with missing data, we use ``np.nan`` to represent missing values. Alternatively, you may pass a ``numpy.MaskedArray`` as the data argument to the DataFrame constructor, and its masked entries will be considered missing. @@ -370,7 +370,7 @@ set to ``'index'`` in order to use the dict keys as row labels. ``DataFrame.from_records`` takes a list of tuples or an ndarray with structured dtype. It works analogously to the normal ``DataFrame`` constructor, except that -the resulting DataFrame index may be a specific field of the structured +the resulting DataFrame index may be a specific field of the structured dtype. For example: .. ipython:: python @@ -506,25 +506,70 @@ to be inserted (for example, a ``Series`` or NumPy array), or a function of one argument to be called on the ``DataFrame``. A *copy* of the original DataFrame is returned, with the new values inserted. +.. versionmodified:: 0.23.0 + +Starting with Python 3.6 the order of ``**kwargs`` is preserved. This allows +for *dependent* assignment, where an expression later in ``**kwargs`` can refer +to a column created earlier in the same :meth:`~DataFrame.assign`. + +.. 
ipython:: python + + dfa = pd.DataFrame({"A": [1, 2, 3], + "B": [4, 5, 6]}) + dfa.assign(C=lambda x: x['A'] + x['B'], + D=lambda x: x['A'] + x['C']) + +In the second expression, ``x['C']`` will refer to the newly created column, +that's equal to ``dfa['A'] + dfa['B']``. + +To write code compatible with all versions of Python, split the assignment in two. + +.. ipython:: python + + dependent = pd.DataFrame({"A": [1, 1, 1]}) + (dependent.assign(A=lambda x: x['A'] + 1) + .assign(B=lambda x: x['A'] + 2)) + .. warning:: - Since the function signature of ``assign`` is ``**kwargs``, a dictionary, - the order of the new columns in the resulting DataFrame cannot be guaranteed - to match the order you pass in. To make things predictable, items are inserted - alphabetically (by key) at the end of the DataFrame. + Dependent assignment maybe subtly change the behavior of your code between + Python 3.6 and older versions of Python. + + If you wish write code that supports versions of python before and after 3.6, + you'll need to take care when passing ``assign`` expressions that + + * Updating an existing column + * Refering to the newly updated column in the same ``assign`` + + For example, we'll update column "A" and then refer to it when creating "B". + + .. code-block:: python + + >>> dependent = pd.DataFrame({"A": [1, 1, 1]}) + >>> dependent.assign(A=lambda x: x["A"] + 1, + B=lambda x: x["A"] + 2) + + For Python 3.5 and earlier the expression creating ``B`` refers to the + "old" value of ``A``, ``[1, 1, 1]``. The output is then + + .. code-block:: python + + A B + 0 2 3 + 1 2 3 + 2 2 3 + + For Python 3.6 and later, the expression creating ``A`` refers to the + "new" value of ``A``, ``[2, 2, 2]``, which results in + + .. code-block:: python - All expressions are computed first, and then assigned. So you can't refer - to another column being assigned in the same call to ``assign``. For example: + A B + 0 2 4 + 1 2 4 + 2 2 4 - .. ipython:: - :verbatim: - In [1]: # Don't do this, bad reference to `C` - df.assign(C = lambda x: x['A'] + x['B'], - D = lambda x: x['A'] + x['C']) - In [2]: # Instead, break it into two assigns - (df.assign(C = lambda x: x['A'] + x['B']) - .assign(D = lambda x: x['A'] + x['C'])) Indexing / Selection ~~~~~~~~~~~~~~~~~~~~ @@ -914,7 +959,7 @@ For example, using the earlier example data, we could do: Squeezing ~~~~~~~~~ -Another way to change the dimensionality of an object is to ``squeeze`` a 1-len +Another way to change the dimensionality of an object is to ``squeeze`` a 1-len object, similar to ``wp['Item1']``. .. ipython:: python diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 413138b1e52fc..407fad39ba232 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -1219,8 +1219,8 @@ see :ref:`here `. Combining ``.groupby`` and ``.pipe`` is often useful when you need to reuse GroupBy objects. -For an example, imagine having a DataFrame with columns for stores, products, -revenue and sold quantity. We'd like to do a groupwise calculation of *prices* +As an example, imagine having a DataFrame with columns for stores, products, +revenue and quantity sold. We'd like to do a groupwise calculation of *prices* (i.e. revenue/quantity) per store and per product. We could do this in a multi-step operation, but expressing it in terms of piping can make the code more readable. First we set the data: @@ -1230,7 +1230,8 @@ code more readable. 
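As a rough sketch of the kind of piped expression this example builds toward (the column names are the hypothetical Store/Product/Revenue/Quantity ones just described, and the final chain is an illustration rather than the documented continuation):

.. code-block:: python

    import numpy as np
    import pandas as pd

    n = 1000
    df = pd.DataFrame({'Store': np.random.choice(['Store_1', 'Store_2'], n),
                       'Product': np.random.choice(['Product_1', 'Product_2'], n),
                       'Revenue': (np.random.random(n) * 50 + 10).round(2),
                       'Quantity': np.random.randint(1, 10, size=n)})

    # Reuse the GroupBy object through .pipe instead of binding it to a name.
    prices = (df.groupby(['Store', 'Product'])
                .pipe(lambda grp: grp.Revenue.sum() / grp.Quantity.sum())
                .round(2))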
First we set the data: import numpy as np n = 1000 df = pd.DataFrame({'Store': np.random.choice(['Store_1', 'Store_2'], n), - 'Product': np.random.choice(['Product_1', 'Product_2', 'Product_3'], n), + 'Product': np.random.choice(['Product_1', + 'Product_2'], n), 'Revenue': (np.random.random(n)*50+10).round(2), 'Quantity': np.random.randint(1, 10, size=n)}) df.head(2) diff --git a/doc/source/internals.rst b/doc/source/internals.rst index ee4df879d9478..957f82fd9eba7 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -89,6 +89,25 @@ not check (or care) whether the levels themselves are sorted. Fortunately, the constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but if you compute the levels and labels yourself, please be careful. +Values +~~~~~~ + +Pandas extends NumPy's type system with custom types, like ``Categorical`` or +datetimes with a timezone, so we have multiple notions of "values". For 1-D +containers (``Index`` classes and ``Series``) we have the following convention: + +* ``cls._ndarray_values`` is *always* a NumPy ``ndarray``. Ideally, + ``_ndarray_values`` is cheap to compute. For example, for a ``Categorical``, + this returns the codes, not the array of objects. +* ``cls._values`` refers is the "best possible" array. This could be an + ``ndarray``, ``ExtensionArray``, or in ``Index`` subclass (note: we're in the + process of removing the index subclasses here so that it's always an + ``ndarray`` or ``ExtensionArray``). + +So, for example, ``Series[category]._values`` is a ``Categorical``, while +``Series[category]._ndarray_values`` is the underlying codes. + + .. _ref-subclassing-pandas: Subclassing pandas Data Structures diff --git a/doc/source/io.rst b/doc/source/io.rst index 60dc89f8fd495..1785de54b7dd6 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4529,7 +4529,7 @@ Several caveats. on an attempt at serialization. You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``. -If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``, then +If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``, then ``pyarrow`` is tried, and falling back to ``fastparquet``. See the documentation for `pyarrow `__ and `fastparquet `__ diff --git a/doc/source/release.rst b/doc/source/release.rst index cd763de42d162..8e063116cbf07 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -406,7 +406,7 @@ of all enhancements and bugs that have been fixed in 0.20.1. .. note:: - This is a combined release for 0.20.0 and and 0.20.1. + This is a combined release for 0.20.0 and 0.20.1. Version 0.20.1 contains one additional change for backwards-compatibility with downstream projects using pandas' ``utils`` routines. (:issue:`16250`) Thanks @@ -2918,7 +2918,7 @@ Improvements to existing features - clipboard functions use pyperclip (no dependencies on Windows, alternative dependencies offered for Linux) (:issue:`3837`). - Plotting functions now raise a ``TypeError`` before trying to plot anything - if the associated objects have have a dtype of ``object`` (:issue:`1818`, + if the associated objects have a dtype of ``object`` (:issue:`1818`, :issue:`3572`, :issue:`3911`, :issue:`3912`), but they will try to convert object arrays to numeric arrays if possible so that you can still plot, for example, an object array with floats. 
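To make the ``_values`` / ``_ndarray_values`` convention added to the internals docs above concrete (note these are private attributes of this era of pandas, so this is a sketch, not public API):

.. code-block:: python

    import pandas as pd

    s = pd.Series(pd.Categorical(['a', 'b', 'a'], categories=['a', 'b']))

    # "Best possible" array: the Categorical itself.
    s._values

    # Always a plain NumPy ndarray; for a Categorical these are the
    # integer codes, which are cheap to extract.
    s._ndarray_values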
This happens before any drawing takes place which @@ -4082,7 +4082,7 @@ Bug Fixes columns (:issue:`1943`) - Fix time zone localization bug causing improper fields (e.g. hours) in time zones that have not had a UTC transition in a long time (:issue:`1946`) -- Fix errors when parsing and working with with fixed offset timezones +- Fix errors when parsing and working with fixed offset timezones (:issue:`1922`, :issue:`1928`) - Fix text parser bug when handling UTC datetime objects generated by dateutil (:issue:`1693`) @@ -4383,7 +4383,7 @@ Bug Fixes error (:issue:`1090`) - Consistently set name on groupby pieces (:issue:`184`) - Treat dict return values as Series in GroupBy.apply (:issue:`823`) -- Respect column selection for DataFrame in in GroupBy.transform (:issue:`1365`) +- Respect column selection for DataFrame in GroupBy.transform (:issue:`1365`) - Fix MultiIndex partial indexing bug (:issue:`1352`) - Enable assignment of rows in mixed-type DataFrame via .ix (:issue:`1432`) - Reset index mapping when grouping Series in Cython (:issue:`1423`) @@ -5040,7 +5040,7 @@ New Features - Add `melt` function to `pandas.core.reshape` - Add `level` parameter to group by level in Series and DataFrame descriptive statistics (:issue:`313`) -- Add `head` and `tail` methods to Series, analogous to to DataFrame (PR +- Add `head` and `tail` methods to Series, analogous to DataFrame (PR :issue:`296`) - Add `Series.isin` function which checks if each value is contained in a passed sequence (:issue:`289`) diff --git a/doc/source/text.rst b/doc/source/text.rst index 2b6459b581c1e..1e620acb1f88a 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -218,7 +218,8 @@ Extract first match in each subject (extract) ``DataFrame``, depending on the subject and regular expression pattern (same behavior as pre-0.18.0). When ``expand=True`` it always returns a ``DataFrame``, which is more consistent and less - confusing from the perspective of a user. + confusing from the perspective of a user. ``expand=True`` is the + default since version 0.23.0. The ``extract`` method accepts a `regular expression `__ with at least one diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst index 43ccd372d9d5b..710212bc237cd 100644 --- a/doc/source/tutorials.rst +++ b/doc/source/tutorials.rst @@ -19,7 +19,7 @@ pandas Cookbook The goal of this cookbook (by `Julia Evans `_) is to give you some concrete examples for getting started with pandas. These are examples with real-world data, and all the bugs and weirdness that -that entails. +entails. Here are links to the v0.1 release. For an up-to-date table of contents, see the `pandas-cookbook GitHub repository `_. To run the examples in this tutorial, you'll need to diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index af3a297fa0ca7..94001d1ff5fe3 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -142,7 +142,7 @@ Previous Behavior: 4 NaN dtype: float64 -Current Behavior +Current Behavior: .. ipython:: python @@ -167,7 +167,7 @@ Previous Behavior: 3 2.5 dtype: float64 -Current Behavior +Current Behavior: .. ipython:: python @@ -204,11 +204,96 @@ Please note that the string `index` is not supported with the round trip format, new_df print(new_df.index.name) +.. 
_whatsnew_0230.enhancements.index_division_by_zero: + +Index Division By Zero Fills Correctly +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Division operations on ``Index`` and subclasses will now fill division of positive numbers by zero with ``np.inf``, division of negative numbers by zero with ``-np.inf`` and `0 / 0` with ``np.nan``. This matches existing ``Series`` behavior. (:issue:`19322`, :issue:`19347`) + +Previous Behavior: + +.. code-block:: ipython + + In [6]: index = pd.Int64Index([-1, 0, 1]) + + In [7]: index / 0 + Out[7]: Int64Index([0, 0, 0], dtype='int64') + + # Previous behavior yielded different results depending on the type of zero in the divisor + In [8]: index / 0.0 + Out[8]: Float64Index([-inf, nan, inf], dtype='float64') + + In [9]: index = pd.UInt64Index([0, 1]) + + In [10]: index / np.array([0, 0], dtype=np.uint64) + Out[10]: UInt64Index([0, 0], dtype='uint64') + + In [11]: pd.RangeIndex(1, 5) / 0 + ZeroDivisionError: integer division or modulo by zero + +Current Behavior: + +.. ipython:: python + + index = pd.Int64Index([-1, 0, 1]) + # division by zero gives -infinity where negative, +infinity where positive, and NaN for 0 / 0 + index / 0 + + # The result of division by zero should not depend on whether the zero is int or float + index / 0.0 + + index = pd.UInt64Index([0, 1]) + index / np.array([0, 0], dtype=np.uint64) + + pd.RangeIndex(1, 5) / 0 + +.. _whatsnew_0230.enhancements.assign_dependent: + +``.assign()`` accepts dependent arguments +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :func:`DataFrame.assign` now accepts dependent keyword arguments for python version later than 3.6 (see also `PEP 468 +`_). Later keyword arguments may now refer to earlier ones if the argument is a callable. See the +:ref:`documentation here ` (:issue:`14207`) + +.. ipython:: python + + df = pd.DataFrame({'A': [1, 2, 3]}) + df + df.assign(B=df.A, C=lambda x:x['A']+ x['B']) + +.. warning:: + + This may subtly change the behavior of your code when you're + using ``.assign()`` to update an existing column. Previously, callables + referring to other variables being updated would get the "old" values + + Previous Behaviour: + + .. code-block:: ipython + + In [2]: df = pd.DataFrame({"A": [1, 2, 3]}) + + In [3]: df.assign(A=lambda df: df.A + 1, C=lambda df: df.A * -1) + Out[3]: + A C + 0 2 -1 + 1 3 -2 + 2 4 -3 + + New Behaviour: + + .. ipython:: python + + df.assign(A=df.A+1, C= lambda df: df.A* -1) + .. _whatsnew_0230.enhancements.other: Other Enhancements ^^^^^^^^^^^^^^^^^^ +- Unary ``+`` now permitted for ``Series`` and ``DataFrame`` as numeric operator (:issue:`16073`) - Better support for :func:`Dataframe.style.to_excel` output with the ``xlsxwriter`` engine. (:issue:`16149`) - :func:`pandas.tseries.frequencies.to_offset` now accepts leading '+' signs e.g. '+1h'. (:issue:`18171`) - :func:`MultiIndex.unique` now supports the ``level=`` argument, to get unique values from a specific index level (:issue:`17896`) @@ -288,6 +373,76 @@ Convert to an xarray DataArray p.to_xarray() +.. _whatsnew_0230.api_breaking.apply: + +Changes to make output of ``DataFrame.apply`` consistent +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:func:`DataFrame.apply` was inconsistent when applying an arbitrary user-defined-function that returned a list-like with ``axis=1``. Several bugs and inconsistencies +are resolved. If the applied function returns a Series, then pandas will return a DataFrame; otherwise a Series will be returned, this includes the case +where a list-like (e.g. 
``tuple`` or ``list`` is returned) (:issue:`16353`, :issue:`17437`, :issue:`17970`, :issue:`17348`, :issue:`17892`, :issue:`18573`, +:issue:`17602`, :issue:`18775`, :issue:`18901`, :issue:`18919`). + +.. ipython:: python + + df = pd.DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, columns=['A', 'B', 'C']) + df + +Previous Behavior: if the returned shape happened to match the length of original columns, this would return a ``DataFrame``. +If the return shape did not match, a ``Series`` with lists was returned. + +.. code-block:: python + + In [3]: df.apply(lambda x: [1, 2, 3], axis=1) + Out[3]: + A B C + 0 1 2 3 + 1 1 2 3 + 2 1 2 3 + 3 1 2 3 + 4 1 2 3 + 5 1 2 3 + + In [4]: df.apply(lambda x: [1, 2], axis=1) + Out[4]: + 0 [1, 2] + 1 [1, 2] + 2 [1, 2] + 3 [1, 2] + 4 [1, 2] + 5 [1, 2] + dtype: object + + +New Behavior: When the applied function returns a list-like, this will now *always* return a ``Series``. + +.. ipython:: python + + df.apply(lambda x: [1, 2, 3], axis=1) + df.apply(lambda x: [1, 2], axis=1) + +To have expanded columns, you can use ``result_type='expand'`` + +.. ipython:: python + + df.apply(lambda x: [1, 2, 3], axis=1, result_type='expand') + +To broadcast the result across the original columns (the old behaviour for +list-likes of the correct length), you can use ``result_type='broadcast'``. +The shape must match the original columns. + +.. ipython:: python + + df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast') + +Returning a ``Series`` allows one to control the exact return structure and column names: + +.. ipython:: python + + df.apply(lambda x: Series([1, 2, 3], index=['D', 'E', 'F']]), axis=1) + + +.. _whatsnew_0230.api_breaking.build_changes: Build Changes ^^^^^^^^^^^^^ @@ -296,6 +451,78 @@ Build Changes - Building from source now explicitly requires ``setuptools`` in ``setup.py`` (:issue:`18113`) - Updated conda recipe to be in compliance with conda-build 3.0+ (:issue:`18002`) +.. _whatsnew_0230.api_breaking.extract: + +Extraction of matching patterns from strings +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +By default, extracting matching patterns from strings with :func:`str.extract` used to return a +``Series`` if a single group was being extracted (a ``DataFrame`` if more than one group was +extracted``). As of Pandas 0.23.0 :func:`str.extract` always returns a ``DataFrame``, unless +``expand`` is set to ``False`` (:issue:`11386`). + +Also, ``None`` was an accepted value for the ``expand`` parameter (which was equivalent to +``False``), but now raises a ``ValueError``. + +Previous Behavior: + +.. code-block:: ipython + + In [1]: s = pd.Series(['number 10', '12 eggs']) + + In [2]: extracted = s.str.extract('.*(\d\d).*') + + In [3]: extracted + Out [3]: + 0 10 + 1 12 + dtype: object + + In [4]: type(extracted) + Out [4]: + pandas.core.series.Series + +New Behavior: + +.. ipython:: python + + s = pd.Series(['number 10', '12 eggs']) + extracted = s.str.extract('.*(\d\d).*') + extracted + type(extracted) + +To restore previous behavior, simply set ``expand`` to ``False``: + +.. ipython:: python + + s = pd.Series(['number 10', '12 eggs']) + extracted = s.str.extract('.*(\d\d).*', expand=False) + extracted + type(extracted) + +.. 
_whatsnew_0230.api_breaking.cdt_ordered: + +Default value for the ``ordered`` parameter of ``CategoricalDtype`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The default value of the ``ordered`` parameter for :class:`~pandas.api.types.CategoricalDtype` has changed from ``False`` to ``None`` to allow updating of ``categories`` without impacting ``ordered``. Behavior should remain consistent for downstream objects, such as :class:`Categorical` (:issue:`18790`) + +In previous versions, the default value for the ``ordered`` parameter was ``False``. This could potentially lead to the ``ordered`` parameter unintentionally being changed from ``True`` to ``False`` when users attempt to update ``categories`` if ``ordered`` is not explicitly specified, as it would silently default to ``False``. The new behavior for ``ordered=None`` is to retain the existing value of ``ordered``. + +New Behavior: + +.. ipython:: python + + from pandas.api.types import CategoricalDtype + cat = pd.Categorical(list('abcaba'), ordered=True, categories=list('cba')) + cat + cdt = CategoricalDtype(categories=list('cbad')) + cat.astype(cdt) + +Notice in the example above that the converted ``Categorical`` has retained ``ordered=True``. Had the default value for ``ordered`` remained as ``False``, the converted ``Categorical`` would have become unordered, despite ``ordered=False`` never being explicitly specified. To change the value of ``ordered``, explicitly pass it to the new dtype, e.g. ``CategoricalDtype(categories=list('cbad'), ordered=False)``. + +Note that the unintenional conversion of ``ordered`` discussed above did not arise in previous versions due to separate bugs that prevented ``astype`` from doing any type of category to category conversion (:issue:`10696`, :issue:`18593`). These bugs have been fixed in this release, and motivated changing the default value of ``ordered``. + .. _whatsnew_0230.api: Other API Changes @@ -362,6 +589,8 @@ Deprecations - The ``is_copy`` attribute is deprecated and will be removed in a future version (:issue:`18801`). - ``IntervalIndex.from_intervals`` is deprecated in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) - :func:``DataFrame.from_items`` is deprecated. Use :func:``DataFrame.from_dict()`` instead, or :func:``DataFrame.from_dict(OrderedDict())`` if you wish to preserve the key order (:issue:`17320`) +- The ``broadcast`` parameter of ``.apply()`` is deprecated in favor of ``result_type='broadcast'`` (:issue:`18577`) +- The ``reduce`` parameter of ``.apply()`` is deprecated in favor of ``result_type='reduce'`` (:issue:`18577`) .. _whatsnew_0230.prior_deprecations: @@ -416,6 +645,7 @@ Performance Improvements - Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`) - Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`) - Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`) +- Improved performance of :func:`DataFrameGroupBy.rank` (:issue:`15779`) .. _whatsnew_0230.docs: @@ -434,6 +664,32 @@ Documentation Changes Bug Fixes ~~~~~~~~~ +Categorical +^^^^^^^^^^^ + +.. 
warning:: + + A class of bugs were introduced in pandas 0.21 with ``CategoricalDtype`` that + affects the correctness of operations like ``merge``, ``concat``, and + indexing when comparing multiple unordered ``Categorical`` arrays that have + the same categories, but in a different order. We highly recommend upgrading + or manually aligning your categories before doing these operations. + +- Bug in ``Categorical.equals`` returning the wrong result when comparing two + unordered ``Categorical`` arrays with the same categories, but in a different + order (:issue:`16603`) +- Bug in :func:`pandas.api.types.union_categoricals` returning the wrong result + when for unordered categoricals with the categories in a different order. + This affected :func:`pandas.concat` with Categorical data (:issue:`19096`). +- Bug in :func:`pandas.merge` returning the wrong result when joining on an + unordered ``Categorical`` that had the same categories but in a different + order (:issue:`19551`) +- Bug in :meth:`CategoricalIndex.get_indexer` returning the wrong result when + ``target`` was an unordered ``Categorical`` that had the same categories as + ``self`` but in a different order (:issue:`19551`) +- Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`) +- Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`) +- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`) Datetimelike ^^^^^^^^^^^^ @@ -448,7 +704,7 @@ Datetimelike - Bug in :class:`Series` floor-division where operating on a scalar ``timedelta`` raises an exception (:issue:`18846`) - Bug in :class:`Series`` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` had results cast to ``dtype='int64'`` (:issue:`17250`) - Bug in :class:`TimedeltaIndex` where division by a ``Series`` would return a ``TimedeltaIndex`` instead of a ``Series`` (issue:`19042`) -- Bug in :class:`Series` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` could return a ``Series`` with an incorrect name (issue:`19043`) +- Bug in :class:`Series` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` could return a ``Series`` with an incorrect name (:issue:`19043`) - Bug in :class:`DatetimeIndex` where the repr was not showing high-precision time values at the end of a day (e.g., 23:59:59.999999999) (:issue:`19030`) - Bug where dividing a scalar timedelta-like object with :class:`TimedeltaIndex` performed the reciprocal operation (:issue:`19125`) - Bug in ``.astype()`` to non-ns timedelta units would hold the incorrect dtype (:issue:`19176`, :issue:`19223`, :issue:`12425`) @@ -456,6 +712,9 @@ Datetimelike - Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`) - Bug in :func:`~DataFrame.pct_change` using ``periods`` and ``freq`` returned different length outputs (:issue:`7292`) - Bug in comparison of :class:`DatetimeIndex` against ``None`` or ``datetime.date`` objects raising ``TypeError`` for ``==`` and ``!=`` comparisons instead of all-``False`` and all-``True``, respectively (:issue:`19301`) +- Bug in :class:`Timestamp` and :func:`to_datetime` where a string representing a barely out-of-bounds timestamp would be incorrectly rounded down instead 
of raising ``OutOfBoundsDatetime`` (:issue:`19382`) +- Bug in :func:`Timestamp.floor` :func:`DatetimeIndex.floor` where time stamps far in the future and past were not rounded correctly (:issue:`19206`) +- Bug in :func:`to_datetime` where passing an out-of-bounds datetime with ``errors='coerce'`` and ``utc=True`` would raise ``OutOfBoundsDatetime`` instead of parsing to ``NaT`` (:issue:`19612`) - Timezones @@ -470,6 +729,7 @@ Timezones - Bug in tz-aware :class:`DatetimeIndex` where addition/subtraction with a :class:`TimedeltaIndex` or array with ``dtype='timedelta64[ns]'`` was incorrect (:issue:`17558`) - Bug in :func:`DatetimeIndex.insert` where inserting ``NaT`` into a timezone-aware index incorrectly raised (:issue:`16357`) - Bug in the :class:`DataFrame` constructor, where tz-aware Datetimeindex and a given column name will result in an empty ``DataFrame`` (:issue:`19157`) +- Bug in :func:`Timestamp.tz_localize` where localizing a timestamp near the minimum or maximum valid values could overflow and return a timestamp with an incorrect nanosecond value (:issue:`12677`) Offsets ^^^^^^^ @@ -487,6 +747,7 @@ Numeric - Bug in :class:`Index` multiplication and division methods where operating with a ``Series`` would return an ``Index`` object instead of a ``Series`` object (:issue:`19042`) - Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`) - Bug in :class:`Index` constructor with ``dtype='uint64'`` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`) +- Bug in :class:`DataFrame` flex arithmetic (e.g. `df.add(other, fill_value=foo)`) with a `fill_value` other than ``None`` failed to raise ``NotImplementedError`` in corner cases where either the frame or ``other`` has length zero (:issue:`19522`) Indexing @@ -523,6 +784,7 @@ I/O ^^^ - :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`) +- :meth:`DataFrame.to_html` now has an option to add an id to the leading `` tag (:issue:`8496`) - Bug in :func:`read_msgpack` with a non existent file is passed in Python 2 (:issue:`15296`) - Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`) - Bug in :func:`read_csv` where missing values were not being handled properly when ``keep_default_na=False`` with dictionary ``na_values`` (:issue:`19227`) @@ -532,6 +794,7 @@ I/O - Bug in :func:`DataFrame.to_parquet` where an exception was raised if the write destination is S3 (:issue:`19134`) - :class:`Interval` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`) - :class:`Timedelta` now supported in :func:`DataFrame.to_excel` for xls file type (:issue:`19242`, :issue:`9155`) +- Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. 
Now returns an empty dict (:issue:`19417`) Plotting ^^^^^^^^ @@ -548,15 +811,19 @@ Groupby/Resample/Rolling - Fixed regression in :func:`DataFrame.groupby` which would not emit an error when called with a tuple key not in the index (:issue:`18798`) - Bug in :func:`DataFrame.resample` which silently ignored unsupported (or mistyped) options for ``label``, ``closed`` and ``convention`` (:issue:`19303`) - Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`) -- Bug in ``transform`` where particular aggregation functions were being incorrectly cast to match the dtype(s) of the grouped data (:issue:`19200`) -- +- Bug in :func:`DataFrame.groupby` where aggregation by ``first``/``last``/``min``/``max`` was causing timestamps to lose precision (:issue:`19526`) +- Bug in :func:`DataFrame.transform` where particular aggregation functions were being incorrectly cast to match the dtype(s) of the grouped data (:issue:`19200`) +- Bug in :func:`DataFrame.groupby` passing the `on=` kwarg, and subsequently using ``.apply()`` (:issue:`17813`) +- Bug in :func:`DataFrame.resample().aggregate` not raising a ``KeyError`` when aggregating a non-existent column (:issue:`16766`, :issue:`19566`) +- Fixed a performance regression for ``GroupBy.nth`` and ``GroupBy.last`` with some object columns (:issue:`19283`) Sparse ^^^^^^ - Bug in which creating a ``SparseDataFrame`` from a dense ``Series`` or an unsupported type raised an uncontrolled exception (:issue:`19374`) - Bug in :class:`SparseDataFrame.to_csv` causing exception (:issue:`19384`) -- +- Bug in :class:`SparseSeries.memory_usage` which caused segfault by accessing non sparse elements (:issue:`19368`) +- Bug in constructing a ``SparseArray``: if ``data`` is a scalar and ``index`` is defined it will coerce to ``float64`` regardless of scalar's dtype. (:issue:`19163`) Reshaping ^^^^^^^^^ @@ -572,21 +839,8 @@ Reshaping - Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`) - Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`) - Bug in :func:`concat` when concatting sparse and dense series it returns only a ``SparseDataFrame``. Should be a ``DataFrame``. (:issue:`18914`, :issue:`18686`, and :issue:`16874`) -- - - -Categorical -^^^^^^^^^^^ - -- -- Bug in :func:`pandas.api.types.union_categoricals` returning the wrong result - when all the categoricals had the same categories, but in a different order. - This affected :func:`pandas.concat` with Categorical data (:issue:`19096`). 
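To make the class of problems called out in the warning above concrete, here is a minimal sketch (a hypothetical reproduction, not taken from the pandas test suite, assuming pandas 0.23+ for the fixed behaviour) of two unordered categoricals that share the same categories in a different order; this is the situation addressed by the ``union_categoricals``, ``merge`` and ``get_indexer`` fixes listed above:

    import pandas as pd
    from pandas.api.types import union_categoricals

    # Same categories, different order, both unordered.
    a = pd.Categorical(['a', 'b'], categories=['a', 'b'])
    b = pd.Categorical(['a', 'b'], categories=['b', 'a'])

    # With the fix the union is computed on the values; on the affected
    # releases the underlying integer codes could be combined as-is,
    # silently swapping 'a' and 'b' in the second block.
    print(union_categoricals([a, b]))  # expect values ['a', 'b', 'a', 'b']

    # Manually aligning the categories first is the workaround recommended
    # in the warning for older releases.
    b_aligned = b.reorder_categories(list(a.categories))
    print(union_categoricals([a, b_aligned]))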
-- Bug in ``Categorical.equals`` between two unordered categories with the same categories, but in a different order (:issue:`16603`) -- Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`) -- Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`) -- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`) -- +- Improved error message for :func:`DataFrame.merge` when there is no common merge key (:issue:`19427`) +- Bug in :func:`DataFrame.join` which does an *outer* instead of a *left* join when being called with multiple DataFrames and some have non-unique indices (:issue:`19624`) Other ^^^^^ diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd index 6d80e6f0073eb..a535872ff7279 100644 --- a/pandas/_libs/algos.pxd +++ b/pandas/_libs/algos.pxd @@ -11,3 +11,11 @@ cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil: a[0] = b[0] b[0] = t return 0 + +cdef enum TiebreakEnumType: + TIEBREAK_AVERAGE + TIEBREAK_MIN, + TIEBREAK_MAX + TIEBREAK_FIRST + TIEBREAK_FIRST_DESCENDING + TIEBREAK_DENSE diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 5d17488963b1c..a418e54e4da9b 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -31,14 +31,6 @@ cdef double nan = NaN cdef int64_t iNaT = get_nat() -cdef: - int TIEBREAK_AVERAGE = 0 - int TIEBREAK_MIN = 1 - int TIEBREAK_MAX = 2 - int TIEBREAK_FIRST = 3 - int TIEBREAK_FIRST_DESCENDING = 4 - int TIEBREAK_DENSE = 5 - tiebreakers = { 'average': TIEBREAK_AVERAGE, 'min': TIEBREAK_MIN, diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 9cc15fb6692d9..866683ce378ab 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -16,8 +16,9 @@ from numpy cimport (ndarray, from libc.stdlib cimport malloc, free from util cimport numeric, get_nat -from algos cimport swap -from algos import take_2d_axis1_float64_float64, groupsort_indexer +from algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, TIEBREAK_MIN, + TIEBREAK_MAX, TIEBREAK_FIRST, TIEBREAK_DENSE) +from algos import take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers cdef int64_t iNaT = get_nat() @@ -25,99 +26,6 @@ cdef double NaN = np.NaN cdef double nan = NaN -# TODO: aggregate multiple columns in single pass -# ---------------------------------------------------------------------- -# first, nth, last - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_nth_object(ndarray[object, ndim=2] out, - ndarray[int64_t] counts, - ndarray[object, ndim=2] values, - ndarray[int64_t] labels, - int64_t rank): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - object val - float64_t count - ndarray[int64_t, ndim=2] nobs - ndarray[object, ndim=2] resx - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty(( out).shape, dtype=object) - - N, K = ( values).shape - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def 
group_last_object(ndarray[object, ndim=2] out, - ndarray[int64_t] counts, - ndarray[object, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - object val - float64_t count - ndarray[object, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty(( out).shape, dtype=object) - - N, K = ( values).shape - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] - - cdef inline float64_t median_linear(float64_t* a, int n) nogil: cdef int i, j, na_count = 0 cdef float64_t result diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index a751fadaf48cf..1d77a373bb7dd 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -317,7 +317,7 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, {{endfor}} #---------------------------------------------------------------------- -# group_nth, group_last +# group_nth, group_last, group_rank #---------------------------------------------------------------------- {{py: @@ -325,7 +325,8 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, # name, c_type, dest_type2, nan_val dtypes = [('float64', 'float64_t', 'float64_t', 'NAN'), ('float32', 'float32_t', 'float32_t', 'NAN'), - ('int64', 'int64_t', 'int64_t', 'iNaT')] + ('int64', 'int64_t', 'int64_t', 'iNaT'), + ('object', 'object', 'object', 'NAN')] def get_dispatch(dtypes): @@ -350,7 +351,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val, count + {{dest_type2}} val ndarray[{{dest_type2}}, ndim=2] resx ndarray[int64_t, ndim=2] nobs @@ -360,11 +361,19 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, raise AssertionError("len(index) != len(labels)") nobs = np.zeros(( out).shape, dtype=np.int64) + {{if name=='object'}} + resx = np.empty(( out).shape, dtype=object) + {{else}} resx = np.empty_like(out) + {{endif}} N, K = ( values).shape + {{if name == "object"}} + if True: # make templating happy + {{else}} with nogil: + {{endif}} for i in range(N): lab = labels[i] if lab < 0: @@ -375,11 +384,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan - {{if name == 'int64'}} - if val != {{nan_val}}: - {{else}} if val == val and val != {{nan_val}}: - {{endif}} nobs[lab, j] += 1 resx[lab, j] = val @@ -390,7 +395,6 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, else: out[i, j] = resx[i, j] - @cython.wraparound(False) @cython.boundscheck(False) def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, @@ -403,7 +407,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val, count + {{dest_type2}} val ndarray[{{dest_type2}}, ndim=2] resx ndarray[int64_t, ndim=2] nobs @@ -413,11 +417,19 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, raise AssertionError("len(index) != len(labels)") nobs = np.zeros(( out).shape, dtype=np.int64) + {{if name=='object'}} + resx = np.empty(( out).shape, dtype=object) + {{else}} resx = np.empty_like(out) + {{endif}} N, K = ( values).shape + {{if name 
== "object"}} + if True: # make templating happy + {{else}} with nogil: + {{endif}} for i in range(N): lab = labels[i] if lab < 0: @@ -428,11 +440,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan - {{if name == 'int64'}} - if val != {{nan_val}}: - {{else}} if val == val and val != {{nan_val}}: - {{endif}} nobs[lab, j] += 1 if nobs[lab, j] == rank: resx[lab, j] = val @@ -444,8 +452,175 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, else: out[i, j] = resx[i, j] + +{{if name != 'object'}} +@cython.boundscheck(False) +@cython.wraparound(False) +def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, + ndarray[{{c_type}}, ndim=2] values, + ndarray[int64_t] labels, + bint is_datetimelike, object ties_method, + bint ascending, bint pct, object na_option): + """Provides the rank of values within each group + + Parameters + ---------- + out : array of float64_t values which this method will write its results to + values : array of {{c_type}} values to be ranked + labels : array containing unique label for each group, with its ordering + matching up to the corresponding record in `values` + is_datetimelike : bool + unused in this method but provided for call compatability with other + Cython transformations + ties_method : {'keep', 'top', 'bottom'} + * keep: leave NA values where they are + * top: smallest rank if ascending + * bottom: smallest rank if descending + ascending : boolean + False for ranks by high (1) to low (N) + pct : boolean + Compute percentage rank of data within each group + + Notes + ----- + This method modifies the `out` parameter rather than returning an object + """ + cdef: + TiebreakEnumType tiebreak + Py_ssize_t i, j, N, K, val_start=0, grp_start=0, dups=0, sum_ranks=0 + Py_ssize_t grp_vals_seen=1, grp_na_count=0 + ndarray[int64_t] _as + ndarray[float64_t, ndim=2] grp_sizes + ndarray[{{c_type}}] masked_vals + ndarray[uint8_t] mask + bint keep_na + {{c_type}} nan_fill_val + + tiebreak = tiebreakers[ties_method] + keep_na = na_option == 'keep' + N, K = ( values).shape + grp_sizes = np.ones_like(out) + + # Copy values into new array in order to fill missing data + # with mask, without obfuscating location of missing data + # in values array + masked_vals = np.array(values[:, 0], copy=True) + {{if name=='int64'}} + mask = (masked_vals == {{nan_val}}).astype(np.uint8) + {{else}} + mask = np.isnan(masked_vals).astype(np.uint8) + {{endif}} + + if ascending ^ (na_option == 'top'): + {{if name == 'int64'}} + nan_fill_val = np.iinfo(np.int64).max + {{else}} + nan_fill_val = np.inf + {{endif}} + order = (masked_vals, mask, labels) + else: + {{if name == 'int64'}} + nan_fill_val = np.iinfo(np.int64).min + {{else}} + nan_fill_val = -np.inf + {{endif}} + order = (masked_vals, ~mask, labels) + np.putmask(masked_vals, mask, nan_fill_val) + + # lexsort using labels, then mask, then actual values + # each label corresponds to a different group value, + # the mask helps you differentiate missing values before + # performing sort on the actual values + _as = np.lexsort(order).view(dtype=np.int64) + + if not ascending: + _as = _as[::-1] + + with nogil: + # Loop over the length of the value array + # each incremental i value can be looked up in the _as array + # that we sorted previously, which gives us the location of + # that sorted value for retrieval back from the original + # values / masked_vals arrays + for i in range(N): + # dups and sum_ranks will be incremented each loop where + # the value / group remains the same, and should 
be reset + # when either of those change + # Used to calculate tiebreakers + dups += 1 + sum_ranks += i - grp_start + 1 + + # if keep_na, check for missing values and assign back + # to the result where appropriate + if keep_na and masked_vals[_as[i]] == nan_fill_val: + grp_na_count += 1 + out[_as[i], 0] = nan + else: + # this implementation is inefficient because it will + # continue overwriting previously encountered dups + # i.e. if 5 duplicated values are encountered it will + # write to the result as follows (assumes avg tiebreaker): + # 1 + # .5 .5 + # .33 .33 .33 + # .25 .25 .25 .25 + # .2 .2 .2 .2 .2 + # + # could potentially be optimized to only write to the + # result once the last duplicate value is encountered + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = i - grp_start + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + if ascending: + out[_as[j], 0] = j + 1 - grp_start + else: + out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start + elif tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = grp_vals_seen + + # look forward to the next value (using the sorting in _as) + # if the value does not equal the current value then we need to + # reset the dups and sum_ranks, knowing that a new value is coming + # up. the conditional also needs to handle nan equality and the + # end of iteration + if (i == N - 1 or ( + (masked_vals[_as[i]] != masked_vals[_as[i+1]]) and not + (mask[_as[i]] and mask[_as[i+1]]))): + dups = sum_ranks = 0 + val_start = i + grp_vals_seen += 1 + + # Similar to the previous conditional, check now if we are moving + # to a new group. If so, keep track of the index where the new + # group occurs, so the tiebreaker calculations can decrement that + # from their position. fill in the size of each group encountered + # (used by pct calculations later). also be sure to reset any of + # the items helping to calculate dups + if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count + dups = sum_ranks = 0 + grp_na_count = 0 + val_start = i + 1 + grp_start = i + 1 + grp_vals_seen = 1 + + if pct: + for i in range(N): + out[i, 0] = out[i, 0] / grp_sizes[i, 0] +{{endif}} {{endfor}} + #---------------------------------------------------------------------- # group_min, group_max #---------------------------------------------------------------------- diff --git a/pandas/_libs/src/period_helper.c b/pandas/_libs/src/period_helper.c index f1367978bd6c9..7c4de8e42e73b 100644 --- a/pandas/_libs/src/period_helper.c +++ b/pandas/_libs/src/period_helper.c @@ -14,6 +14,7 @@ See end of file for stuff pandas uses (search for 'pandas'). 
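The ``group_rank_{{name}}`` kernel added above implements groupwise ranking with the ``TIEBREAK_*`` strategies and the ``keep_na`` handling described in its comments. As a hedged illustration of the user-facing behaviour it supports (assuming a pandas version in which groupby ``rank`` is backed by this kernel; the data below is made up):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'key': ['a', 'a', 'a', 'b', 'b'],
                       'val': [2.0, 2.0, 5.0, 1.0, np.nan]})
    grouped = df.groupby('key')['val']

    # method='average' corresponds to TIEBREAK_AVERAGE: ties share the mean rank.
    print(grouped.rank(method='average'))

    # 'min', 'max', 'first' and 'dense' map onto the other TIEBREAK_* branches.
    print(grouped.rank(method='dense'))

    # na_option='keep' leaves missing values unranked (the keep_na path);
    # 'top' and 'bottom' push them to one end of each group instead.
    print(grouped.rank(na_option='keep'))

    # pct=True divides each rank by its group size, as in the final loop.
    print(grouped.rank(pct=True))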
*/ #include "period_helper.h" +#include "../datetime/np_datetime.h" /* ------------------------------------------------------------------ * Code derived from scikits.timeseries @@ -37,212 +38,37 @@ static int floordiv(int x, int divisor) { } } -/* Table with day offsets for each month (0-based, without and with leap) */ -static int month_offset[2][13] = { - {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, - {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}}; - -/* Table of number of days in a month (0-based, without and with leap) */ -static int days_in_month[2][12] = { - {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, - {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; - -/* Return 1/0 iff year points to a leap year in calendar. */ -static int dInfoCalc_Leapyear(npy_int64 year, int calendar) { - if (calendar == GREGORIAN_CALENDAR) { - return (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0)); - } else { - return (year % 4 == 0); - } -} - -/* Return the day of the week for the given absolute date. */ -static int dInfoCalc_DayOfWeek(npy_int64 absdate) { - int day_of_week; - - if (absdate >= 1) { - day_of_week = (absdate - 1) % 7; - } else { - day_of_week = 6 - ((-absdate) % 7); - } - return day_of_week; -} static int monthToQuarter(int month) { return ((month - 1) / 3) + 1; } -/* Return the year offset, that is the absolute date of the day - 31.12.(year-1) in the given calendar. - - Note: - For the Julian calendar we shift the absdate (which is measured - using the Gregorian Epoch) value by two days because the Epoch - (0001-01-01) in the Julian calendar lies 2 days before the Epoch in - the Gregorian calendar. */ -static int dInfoCalc_YearOffset(npy_int64 year, int calendar) { - year--; - if (calendar == GREGORIAN_CALENDAR) { - if (year >= 0 || -1 / 4 == -1) - return year * 365 + year / 4 - year / 100 + year / 400; - else - return year * 365 + (year - 3) / 4 - (year - 99) / 100 + - (year - 399) / 400; - } else if (calendar == JULIAN_CALENDAR) { - if (year >= 0 || -1 / 4 == -1) - return year * 365 + year / 4 - 2; - else - return year * 365 + (year - 3) / 4 - 2; - } - Py_Error(PyExc_ValueError, "unknown calendar"); -onError: - return INT_ERR_CODE; -} -/* Set the instance's value using the given date and time. calendar may be set - * to the flags: GREGORIAN_CALENDAR, JULIAN_CALENDAR to indicate the calendar - * to be used. */ - -static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, int year, - int month, int day, int hour, - int minute, double second, - int calendar) { +/* Find the absdate (days elapsed since datetime(1, 1, 1) + * for the given year/month/day. + * Assumes GREGORIAN_CALENDAR */ +npy_int64 absdate_from_ymd(int year, int month, int day) { /* Calculate the absolute date */ - { - int leap; - npy_int64 absdate; - int yearoffset; - - /* Range check */ - Py_AssertWithArg(year > -(INT_MAX / 366) && year < (INT_MAX / 366), - PyExc_ValueError, "year out of range: %i", year); - - /* Is it a leap year ? 
*/ - leap = dInfoCalc_Leapyear(year, calendar); - - /* Negative month values indicate months relative to the years end */ - if (month < 0) month += 13; - Py_AssertWithArg(month >= 1 && month <= 12, PyExc_ValueError, - "month out of range (1-12): %i", month); - - /* Negative values indicate days relative to the months end */ - if (day < 0) day += days_in_month[leap][month - 1] + 1; - Py_AssertWithArg(day >= 1 && day <= days_in_month[leap][month - 1], - PyExc_ValueError, "day out of range: %i", day); - - yearoffset = dInfoCalc_YearOffset(year, calendar); - if (yearoffset == INT_ERR_CODE) goto onError; + pandas_datetimestruct dts; + npy_int64 unix_date; - absdate = day + month_offset[leap][month - 1] + yearoffset; - - dinfo->absdate = absdate; - - dinfo->year = year; - dinfo->month = month; - dinfo->quarter = ((month - 1) / 3) + 1; - dinfo->day = day; - - dinfo->day_of_week = dInfoCalc_DayOfWeek(absdate); - dinfo->day_of_year = (short)(absdate - yearoffset); - - dinfo->calendar = calendar; - } - - /* Calculate the absolute time */ - { - Py_AssertWithArg(hour >= 0 && hour <= 23, PyExc_ValueError, - "hour out of range (0-23): %i", hour); - Py_AssertWithArg(minute >= 0 && minute <= 59, PyExc_ValueError, - "minute out of range (0-59): %i", minute); - Py_AssertWithArg( - second >= (double)0.0 && - (second < (double)60.0 || - (hour == 23 && minute == 59 && second < (double)61.0)), - PyExc_ValueError, - "second out of range (0.0 - <60.0; <61.0 for 23:59): %f", second); - - dinfo->abstime = (double)(hour * 3600 + minute * 60) + second; - - dinfo->hour = hour; - dinfo->minute = minute; - dinfo->second = second; - } - return 0; - -onError: - return INT_ERR_CODE; + memset(&dts, 0, sizeof(pandas_datetimestruct)); + dts.year = year; + dts.month = month; + dts.day = day; + unix_date = pandas_datetimestruct_to_datetime(PANDAS_FR_D, &dts); + return ORD_OFFSET + unix_date; } -/* Sets the date part of the date_info struct using the indicated - calendar. - - XXX This could also be done using some integer arithmetics rather - than with this iterative approach... 
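The rewritten ``absdate_from_ymd`` above delegates the calendar math to ``pandas_datetimestruct_to_datetime`` and then shifts by ``ORD_OFFSET``, giving the proleptic Gregorian day number with 0001-01-01 as day 1. A small standard-library sketch of that convention (the 719163 offset between that epoch and 1970-01-01 is stated here as an assumption, taken from ``period_helper.h`` which is not shown in this hunk):

    from datetime import date

    ORD_OFFSET = 719163  # assumed: days from 0001-01-01 (day 1) to 1970-01-01

    def absdate_from_ymd(year, month, day):
        # Python analogue of the C helper: proleptic Gregorian ordinal,
        # i.e. ORD_OFFSET plus the number of days since the Unix epoch.
        unix_days = (date(year, month, day) - date(1970, 1, 1)).days
        return ORD_OFFSET + unix_days

    # The same number falls out of the standard library directly.
    assert absdate_from_ymd(1970, 1, 1) == date(1970, 1, 1).toordinal() == ORD_OFFSET
    assert absdate_from_ymd(2018, 2, 1) == date(2018, 2, 1).toordinal()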
*/ +/* Sets the date part of the date_info struct + Assumes GREGORIAN_CALENDAR */ static int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, - npy_int64 absdate, int calendar) { - register npy_int64 year; - npy_int64 yearoffset; - int leap, dayoffset; - int *monthoffset; - - /* Approximate year */ - if (calendar == GREGORIAN_CALENDAR) { - year = (npy_int64)(((double)absdate) / 365.2425); - } else if (calendar == JULIAN_CALENDAR) { - year = (npy_int64)(((double)absdate) / 365.25); - } else { - Py_Error(PyExc_ValueError, "unknown calendar"); - } - - if (absdate > 0) year++; - - /* Apply corrections to reach the correct year */ - while (1) { - /* Calculate the year offset */ - yearoffset = dInfoCalc_YearOffset(year, calendar); - if (yearoffset == INT_ERR_CODE) goto onError; - - /* Backward correction: absdate must be greater than the - yearoffset */ - if (yearoffset >= absdate) { - year--; - continue; - } - - dayoffset = absdate - yearoffset; - leap = dInfoCalc_Leapyear(year, calendar); - - /* Forward correction: non leap years only have 365 days */ - if (dayoffset > 365 && !leap) { - year++; - continue; - } - break; - } - - dinfo->year = year; - dinfo->calendar = calendar; - - /* Now iterate to find the month */ - monthoffset = month_offset[leap]; - { - register int month; - - for (month = 1; month < 13; month++) { - if (monthoffset[month] >= dayoffset) break; - } - - dinfo->month = month; - dinfo->quarter = monthToQuarter(month); - dinfo->day = dayoffset - month_offset[leap][month - 1]; - } - - dinfo->day_of_week = dInfoCalc_DayOfWeek(absdate); - dinfo->day_of_year = dayoffset; - dinfo->absdate = absdate; + npy_int64 absdate) { + pandas_datetimestruct dts; + pandas_datetime_to_datetimestruct(absdate - ORD_OFFSET, PANDAS_FR_D, &dts); + dinfo->year = dts.year; + dinfo->month = dts.month; + dinfo->day = dts.day; return 0; - -onError: - return INT_ERR_CODE; } /////////////////////////////////////////////// @@ -254,11 +80,14 @@ static int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, // helpers for frequency conversion routines // -static int daytime_conversion_factors[][2] = { - {FR_DAY, 1}, {FR_HR, 24}, {FR_MIN, 60}, {FR_SEC, 60}, - {FR_MS, 1000}, {FR_US, 1000}, {FR_NS, 1000}, {0, 0}}; - -static npy_int64 **daytime_conversion_factor_matrix = NULL; +static npy_int64 daytime_conversion_factor_matrix[7][7] = { + {1, 24, 1440, 86400, 86400000, 86400000000, 86400000000000}, + {0, 1, 60, 3600, 3600000, 3600000000, 3600000000000}, + {0, 0, 1, 60, 60000, 60000000, 60000000000}, + {0, 0, 0, 1, 1000, 1000000, 1000000000}, + {0, 0, 0, 0, 1, 1000, 1000000}, + {0, 0, 0, 0, 0, 1, 1000}, + {0, 0, 0, 0, 0, 0, 1}}; PANDAS_INLINE int max_value(int a, int b) { return a > b ? 
a : b; } @@ -268,100 +97,23 @@ PANDAS_INLINE int get_freq_group(int freq) { return (freq / 1000) * 1000; } PANDAS_INLINE int get_freq_group_index(int freq) { return freq / 1000; } -static int calc_conversion_factors_matrix_size(void) { - int matrix_size = 0; - int index; - for (index = 0;; index++) { - int period_value = - get_freq_group_index(daytime_conversion_factors[index][0]); - if (period_value == 0) { - break; - } - matrix_size = max_value(matrix_size, period_value); - } - return matrix_size + 1; -} - -static void alloc_conversion_factors_matrix(int matrix_size) { - int row_index; - int column_index; - daytime_conversion_factor_matrix = - malloc(matrix_size * sizeof(**daytime_conversion_factor_matrix)); - for (row_index = 0; row_index < matrix_size; row_index++) { - daytime_conversion_factor_matrix[row_index] = - malloc(matrix_size * sizeof(**daytime_conversion_factor_matrix)); - for (column_index = 0; column_index < matrix_size; column_index++) { - daytime_conversion_factor_matrix[row_index][column_index] = 0; - } - } -} - -static npy_int64 calculate_conversion_factor(int start_value, int end_value) { - npy_int64 conversion_factor = 0; - int index; - for (index = 0;; index++) { - int freq_group = daytime_conversion_factors[index][0]; - - if (freq_group == 0) { - conversion_factor = 0; - break; - } - - if (freq_group == start_value) { - conversion_factor = 1; - } else { - conversion_factor *= daytime_conversion_factors[index][1]; - } - if (freq_group == end_value) { - break; - } - } - return conversion_factor; -} - -static void populate_conversion_factors_matrix(void) { - int row_index_index; - int row_value, row_index; - int column_index_index; - int column_value, column_index; - - for (row_index_index = 0;; row_index_index++) { - row_value = daytime_conversion_factors[row_index_index][0]; - if (row_value == 0) { - break; - } - row_index = get_freq_group_index(row_value); - for (column_index_index = row_index_index;; column_index_index++) { - column_value = daytime_conversion_factors[column_index_index][0]; - if (column_value == 0) { - break; - } - column_index = get_freq_group_index(column_value); - - daytime_conversion_factor_matrix[row_index][column_index] = - calculate_conversion_factor(row_value, column_value); - } - } -} - -void initialize_daytime_conversion_factor_matrix() { - if (daytime_conversion_factor_matrix == NULL) { - int matrix_size = calc_conversion_factors_matrix_size(); - alloc_conversion_factors_matrix(matrix_size); - populate_conversion_factors_matrix(); +npy_int64 get_daytime_conversion_factor(int from_index, int to_index) { + int row = min_value(from_index, to_index); + int col = max_value(from_index, to_index); + // row or col < 6 means frequency strictly lower than Daily, which + // do not use daytime_conversion_factors + if (row < 6) { + return 0; + } else if (col < 6) { + return 0; } -} - -PANDAS_INLINE npy_int64 get_daytime_conversion_factor(int from_index, - int to_index) { - return daytime_conversion_factor_matrix[min_value(from_index, to_index)] - [max_value(from_index, to_index)]; + return daytime_conversion_factor_matrix[row - 6][col - 6]; } PANDAS_INLINE npy_int64 upsample_daytime(npy_int64 ordinal, - asfreq_info *af_info, int atEnd) { - if (atEnd) { + asfreq_info *af_info) { + if (af_info->is_end) { return (ordinal + 1) * af_info->intraday_conversion_factor - 1; } else { return ordinal * af_info->intraday_conversion_factor; @@ -369,21 +121,18 @@ PANDAS_INLINE npy_int64 upsample_daytime(npy_int64 ordinal, } PANDAS_INLINE npy_int64 
downsample_daytime(npy_int64 ordinal, - asfreq_info *af_info, int atEnd) { + asfreq_info *af_info) { return ordinal / (af_info->intraday_conversion_factor); } -PANDAS_INLINE npy_int64 transform_via_day(npy_int64 ordinal, char relation, +PANDAS_INLINE npy_int64 transform_via_day(npy_int64 ordinal, asfreq_info *af_info, freq_conv_func first_func, freq_conv_func second_func) { - // printf("transform_via_day(%ld, %ld, %d)\n", ordinal, - // af_info->intraday_conversion_factor, - // af_info->intraday_conversion_upsample); npy_int64 result; - result = (*first_func)(ordinal, relation, af_info); - result = (*second_func)(result, relation, af_info); + result = (*first_func)(ordinal, af_info); + result = (*second_func)(result, af_info); return result; } @@ -392,40 +141,31 @@ static npy_int64 DtoB_weekday(npy_int64 absdate) { return (((absdate) / 7) * 5) + (absdate) % 7 - BDAY_OFFSET; } -static npy_int64 DtoB_WeekendToMonday(npy_int64 absdate, int day_of_week) { - if (day_of_week > 4) { - // change to Monday after weekend - absdate += (7 - day_of_week); - } - return DtoB_weekday(absdate); -} +static npy_int64 DtoB(struct date_info *dinfo, + int roll_back, npy_int64 absdate) { + int day_of_week = dayofweek(dinfo->year, dinfo->month, dinfo->day); -static npy_int64 DtoB_WeekendToFriday(npy_int64 absdate, int day_of_week) { - if (day_of_week > 4) { - // change to friday before weekend - absdate -= (day_of_week - 4); + if (roll_back == 1) { + if (day_of_week > 4) { + // change to friday before weekend + absdate -= (day_of_week - 4); + } + } else { + if (day_of_week > 4) { + // change to Monday after weekend + absdate += (7 - day_of_week); + } } return DtoB_weekday(absdate); } -static npy_int64 absdate_from_ymd(int y, int m, int d) { - struct date_info tempDate; - if (dInfoCalc_SetFromDateAndTime(&tempDate, y, m, d, 0, 0, 0, - GREGORIAN_CALENDAR)) { - return INT_ERR_CODE; - } - return tempDate.absdate; -} //************ FROM DAILY *************** -static npy_int64 asfreq_DTtoA(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_DTtoA(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; - ordinal = downsample_daytime(ordinal, af_info, 0); - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, - GREGORIAN_CALENDAR)) - return INT_ERR_CODE; + ordinal = downsample_daytime(ordinal, af_info); + dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET); if (dinfo.month > af_info->to_a_year_end) { return (npy_int64)(dinfo.year + 1 - BASE_YEAR); } else { @@ -436,9 +176,7 @@ static npy_int64 asfreq_DTtoA(npy_int64 ordinal, char relation, static npy_int64 DtoQ_yq(npy_int64 ordinal, asfreq_info *af_info, int *year, int *quarter) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, - GREGORIAN_CALENDAR)) - return INT_ERR_CODE; + dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET); if (af_info->to_q_year_end != 12) { dinfo.month -= af_info->to_q_year_end; if (dinfo.month <= 0) { @@ -446,164 +184,120 @@ static npy_int64 DtoQ_yq(npy_int64 ordinal, asfreq_info *af_info, int *year, } else { dinfo.year += 1; } - dinfo.quarter = monthToQuarter(dinfo.month); } *year = dinfo.year; - *quarter = dinfo.quarter; + *quarter = monthToQuarter(dinfo.month); return 0; } -static npy_int64 asfreq_DTtoQ(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_DTtoQ(npy_int64 ordinal, asfreq_info *af_info) { int year, quarter; - ordinal = downsample_daytime(ordinal, af_info, 0); - - if (DtoQ_yq(ordinal, af_info, &year, &quarter) 
== INT_ERR_CODE) { - return INT_ERR_CODE; - } + ordinal = downsample_daytime(ordinal, af_info); + DtoQ_yq(ordinal, af_info, &year, &quarter); return (npy_int64)((year - BASE_YEAR) * 4 + quarter - 1); } -static npy_int64 asfreq_DTtoM(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_DTtoM(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; - ordinal = downsample_daytime(ordinal, af_info, 0); + ordinal = downsample_daytime(ordinal, af_info); - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, - GREGORIAN_CALENDAR)) - return INT_ERR_CODE; + dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET); return (npy_int64)((dinfo.year - BASE_YEAR) * 12 + dinfo.month - 1); } -static npy_int64 asfreq_DTtoW(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - ordinal = downsample_daytime(ordinal, af_info, 0); +static npy_int64 asfreq_DTtoW(npy_int64 ordinal, asfreq_info *af_info) { + ordinal = downsample_daytime(ordinal, af_info); return (ordinal + ORD_OFFSET - (1 + af_info->to_week_end)) / 7 + 1 - WEEK_OFFSET; } -static npy_int64 asfreq_DTtoB(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_DTtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; + npy_int64 absdate; + int roll_back; - ordinal = downsample_daytime(ordinal, af_info, 0); - - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, - GREGORIAN_CALENDAR)) - return INT_ERR_CODE; + ordinal = downsample_daytime(ordinal, af_info); + absdate = ordinal + ORD_OFFSET; + dInfoCalc_SetFromAbsDate(&dinfo, absdate); - if (relation == 'S') { - return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); - } else { - return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); - } + // This usage defines roll_back the opposite way from the others + roll_back = 1 - af_info->is_end; + return DtoB(&dinfo, roll_back, absdate); } // all intra day calculations are now done within one function -static npy_int64 asfreq_DownsampleWithinDay(npy_int64 ordinal, char relation, +static npy_int64 asfreq_DownsampleWithinDay(npy_int64 ordinal, asfreq_info *af_info) { - return downsample_daytime(ordinal, af_info, relation == 'E'); + return downsample_daytime(ordinal, af_info); } -static npy_int64 asfreq_UpsampleWithinDay(npy_int64 ordinal, char relation, +static npy_int64 asfreq_UpsampleWithinDay(npy_int64 ordinal, asfreq_info *af_info) { - return upsample_daytime(ordinal, af_info, relation == 'E'); + return upsample_daytime(ordinal, af_info); } //************ FROM BUSINESS *************** -static npy_int64 asfreq_BtoDT(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_BtoDT(npy_int64 ordinal, asfreq_info *af_info) { ordinal += BDAY_OFFSET; ordinal = (((ordinal - 1) / 5) * 7 + mod_compat(ordinal - 1, 5) + 1 - ORD_OFFSET); - return upsample_daytime(ordinal, af_info, relation != 'S'); + return upsample_daytime(ordinal, af_info); } -static npy_int64 asfreq_BtoA(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, - asfreq_DTtoA); +static npy_int64 asfreq_BtoA(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_BtoDT, asfreq_DTtoA); } -static npy_int64 asfreq_BtoQ(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, - asfreq_DTtoQ); +static npy_int64 asfreq_BtoQ(npy_int64 ordinal, asfreq_info *af_info) { + return 
transform_via_day(ordinal, af_info, asfreq_BtoDT, asfreq_DTtoQ); } -static npy_int64 asfreq_BtoM(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, - asfreq_DTtoM); +static npy_int64 asfreq_BtoM(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_BtoDT, asfreq_DTtoM); } -static npy_int64 asfreq_BtoW(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, - asfreq_DTtoW); +static npy_int64 asfreq_BtoW(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_BtoDT, asfreq_DTtoW); } //************ FROM WEEKLY *************** -static npy_int64 asfreq_WtoDT(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - ordinal += WEEK_OFFSET; - if (relation != 'S') { - ordinal += 1; - } - - ordinal = ordinal * 7 - 6 + af_info->from_week_end - ORD_OFFSET; - - if (relation != 'S') { - ordinal -= 1; - } - - return upsample_daytime(ordinal, af_info, relation != 'S'); +static npy_int64 asfreq_WtoDT(npy_int64 ordinal, asfreq_info *af_info) { + ordinal = (ordinal + WEEK_OFFSET) * 7 + + af_info->from_week_end - ORD_OFFSET + + (7 - 1) * (af_info->is_end - 1); + return upsample_daytime(ordinal, af_info); } -static npy_int64 asfreq_WtoA(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, - asfreq_DTtoA); +static npy_int64 asfreq_WtoA(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_WtoDT, asfreq_DTtoA); } -static npy_int64 asfreq_WtoQ(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, - asfreq_DTtoQ); +static npy_int64 asfreq_WtoQ(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_WtoDT, asfreq_DTtoQ); } -static npy_int64 asfreq_WtoM(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, - asfreq_DTtoM); +static npy_int64 asfreq_WtoM(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_WtoDT, asfreq_DTtoM); } -static npy_int64 asfreq_WtoW(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, - asfreq_DTtoW); +static npy_int64 asfreq_WtoW(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_WtoDT, asfreq_DTtoW); } -static npy_int64 asfreq_WtoB(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_WtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_WtoDT(ordinal, relation, af_info) + ORD_OFFSET, - GREGORIAN_CALENDAR)) - return INT_ERR_CODE; + npy_int64 absdate = asfreq_WtoDT(ordinal, af_info) + ORD_OFFSET; + int roll_back = af_info->is_end; + dInfoCalc_SetFromAbsDate(&dinfo, absdate); - if (relation == 'S') { - return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); - } else { - return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); - } + return DtoB(&dinfo, roll_back, absdate); } //************ FROM MONTHLY *************** @@ -612,58 +306,39 @@ static void MtoD_ym(npy_int64 ordinal, int *y, int *m) { *m = mod_compat(ordinal, 12) + 1; } -static npy_int64 asfreq_MtoDT(npy_int64 ordinal, char 
relation, - asfreq_info *af_info) { +static npy_int64 asfreq_MtoDT(npy_int64 ordinal, asfreq_info *af_info) { npy_int64 absdate; int y, m; - if (relation == 'E') { - ordinal += 1; - } + ordinal += af_info->is_end; MtoD_ym(ordinal, &y, &m); - if ((absdate = absdate_from_ymd(y, m, 1)) == INT_ERR_CODE) - return INT_ERR_CODE; + absdate = absdate_from_ymd(y, m, 1); ordinal = absdate - ORD_OFFSET; - if (relation == 'E') { - ordinal -= 1; - } - - return upsample_daytime(ordinal, af_info, relation != 'S'); + ordinal -= af_info->is_end; + return upsample_daytime(ordinal, af_info); } -static npy_int64 asfreq_MtoA(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_MtoDT, - asfreq_DTtoA); +static npy_int64 asfreq_MtoA(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_MtoDT, asfreq_DTtoA); } -static npy_int64 asfreq_MtoQ(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_MtoDT, - asfreq_DTtoQ); +static npy_int64 asfreq_MtoQ(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_MtoDT, asfreq_DTtoQ); } -static npy_int64 asfreq_MtoW(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_MtoDT, - asfreq_DTtoW); +static npy_int64 asfreq_MtoW(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_MtoDT, asfreq_DTtoW); } -static npy_int64 asfreq_MtoB(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_MtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; + npy_int64 absdate = asfreq_MtoDT(ordinal, af_info) + ORD_OFFSET; + int roll_back = af_info->is_end; - if (dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_MtoDT(ordinal, relation, af_info) + ORD_OFFSET, - GREGORIAN_CALENDAR)) - return INT_ERR_CODE; + dInfoCalc_SetFromAbsDate(&dinfo, absdate); - if (relation == 'S') { - return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); - } else { - return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); - } + return DtoB(&dinfo, roll_back, absdate); } //************ FROM QUARTERLY *************** @@ -682,143 +357,94 @@ static void QtoD_ym(npy_int64 ordinal, int *y, int *m, asfreq_info *af_info) { } } -static npy_int64 asfreq_QtoDT(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_QtoDT(npy_int64 ordinal, asfreq_info *af_info) { npy_int64 absdate; int y, m; - if (relation == 'E') { - ordinal += 1; - } - + ordinal += af_info->is_end; QtoD_ym(ordinal, &y, &m, af_info); - if ((absdate = absdate_from_ymd(y, m, 1)) == INT_ERR_CODE) - return INT_ERR_CODE; + absdate = absdate_from_ymd(y, m, 1); - if (relation == 'E') { - absdate -= 1; - } - - return upsample_daytime(absdate - ORD_OFFSET, af_info, relation != 'S'); + absdate -= af_info->is_end; + return upsample_daytime(absdate - ORD_OFFSET, af_info); } -static npy_int64 asfreq_QtoQ(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, - asfreq_DTtoQ); +static npy_int64 asfreq_QtoQ(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_QtoDT, asfreq_DTtoQ); } -static npy_int64 asfreq_QtoA(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, - asfreq_DTtoA); +static 
npy_int64 asfreq_QtoA(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_QtoDT, asfreq_DTtoA); } -static npy_int64 asfreq_QtoM(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, - asfreq_DTtoM); +static npy_int64 asfreq_QtoM(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_QtoDT, asfreq_DTtoM); } -static npy_int64 asfreq_QtoW(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, - asfreq_DTtoW); +static npy_int64 asfreq_QtoW(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_QtoDT, asfreq_DTtoW); } -static npy_int64 asfreq_QtoB(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_QtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_QtoDT(ordinal, relation, af_info) + ORD_OFFSET, - GREGORIAN_CALENDAR)) - return INT_ERR_CODE; + npy_int64 absdate = asfreq_QtoDT(ordinal, af_info) + ORD_OFFSET; + int roll_back = af_info->is_end; - if (relation == 'S') { - return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); - } else { - return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); - } + dInfoCalc_SetFromAbsDate(&dinfo, absdate); + + return DtoB(&dinfo, roll_back, absdate); } //************ FROM ANNUAL *************** -static npy_int64 asfreq_AtoDT(npy_int64 year, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_AtoDT(npy_int64 ordinal, asfreq_info *af_info) { npy_int64 absdate; - int month = (af_info->from_a_year_end) % 12; // start from 1970 - year += BASE_YEAR; - - month += 1; + npy_int64 year = ordinal + BASE_YEAR; + int month = (af_info->from_a_year_end % 12) + 1; if (af_info->from_a_year_end != 12) { year -= 1; } - if (relation == 'E') { - year += 1; - } - + year += af_info->is_end; absdate = absdate_from_ymd(year, month, 1); - if (absdate == INT_ERR_CODE) { - return INT_ERR_CODE; - } - - if (relation == 'E') { - absdate -= 1; - } - - return upsample_daytime(absdate - ORD_OFFSET, af_info, relation != 'S'); + absdate -= af_info->is_end; + return upsample_daytime(absdate - ORD_OFFSET, af_info); } -static npy_int64 asfreq_AtoA(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, - asfreq_DTtoA); +static npy_int64 asfreq_AtoA(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_AtoDT, asfreq_DTtoA); } -static npy_int64 asfreq_AtoQ(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, - asfreq_DTtoQ); +static npy_int64 asfreq_AtoQ(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_AtoDT, asfreq_DTtoQ); } -static npy_int64 asfreq_AtoM(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, - asfreq_DTtoM); +static npy_int64 asfreq_AtoM(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_AtoDT, asfreq_DTtoM); } -static npy_int64 asfreq_AtoW(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, - asfreq_DTtoW); +static npy_int64 asfreq_AtoW(npy_int64 ordinal, asfreq_info 
*af_info) { + return transform_via_day(ordinal, af_info, asfreq_AtoDT, asfreq_DTtoW); } -static npy_int64 asfreq_AtoB(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_AtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_AtoDT(ordinal, relation, af_info) + ORD_OFFSET, - GREGORIAN_CALENDAR)) - return INT_ERR_CODE; + npy_int64 absdate = asfreq_AtoDT(ordinal, af_info) + ORD_OFFSET; + int roll_back = af_info->is_end; + dInfoCalc_SetFromAbsDate(&dinfo, absdate); - if (relation == 'S') { - return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); - } else { - return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); - } + return DtoB(&dinfo, roll_back, absdate); } -static npy_int64 nofunc(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 nofunc(npy_int64 ordinal, asfreq_info *af_info) { return INT_ERR_CODE; } -static npy_int64 no_op(npy_int64 ordinal, char relation, asfreq_info *af_info) { +static npy_int64 no_op(npy_int64 ordinal, asfreq_info *af_info) { return ordinal; } @@ -835,18 +461,21 @@ static int calc_a_year_end(int freq, int group) { static int calc_week_end(int freq, int group) { return freq - group; } -void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info) { +void get_asfreq_info(int fromFreq, int toFreq, char relation, + asfreq_info *af_info) { int fromGroup = get_freq_group(fromFreq); int toGroup = get_freq_group(toFreq); + if (relation == 'E') { + af_info->is_end = 1; + } else { + af_info->is_end = 0; + } + af_info->intraday_conversion_factor = get_daytime_conversion_factor( get_freq_group_index(max_value(fromGroup, FR_DAY)), get_freq_group_index(max_value(toGroup, FR_DAY))); - // printf("get_asfreq_info(%d, %d) %ld, %d\n", fromFreq, toFreq, - // af_info->intraday_conversion_factor, - // af_info->intraday_conversion_upsample); - switch (fromGroup) { case FR_WK: af_info->from_week_end = calc_week_end(fromFreq, fromGroup); @@ -1041,83 +670,6 @@ freq_conv_func get_asfreq_func(int fromFreq, int toFreq) { } } -double get_abs_time(int freq, npy_int64 date_ordinal, npy_int64 ordinal) { - // printf("get_abs_time %d %lld %lld\n", freq, date_ordinal, ordinal); - - int freq_index, day_index, base_index; - npy_int64 per_day, start_ord; - double unit, result; - - if (freq <= FR_DAY) { - return 0; - } - - freq_index = get_freq_group_index(freq); - day_index = get_freq_group_index(FR_DAY); - base_index = get_freq_group_index(FR_SEC); - - // printf(" indices: day %d, freq %d, base %d\n", day_index, freq_index, - // base_index); - - per_day = get_daytime_conversion_factor(day_index, freq_index); - unit = get_daytime_conversion_factor(freq_index, base_index); - - // printf(" per_day: %lld, unit: %f\n", per_day, unit); - - if (base_index < freq_index) { - unit = 1 / unit; - // printf(" corrected unit: %f\n", unit); - } - - start_ord = date_ordinal * per_day; - // printf("start_ord: %lld\n", start_ord); - result = (double)(unit * (ordinal - start_ord)); - // printf(" result: %f\n", result); - return result; -} - -/* Sets the time part of the DateTime object. 
*/ -static int dInfoCalc_SetFromAbsTime(struct date_info *dinfo, double abstime) { - int inttime; - int hour, minute; - double second; - - inttime = (int)abstime; - hour = inttime / 3600; - minute = (inttime % 3600) / 60; - second = abstime - (double)(hour * 3600 + minute * 60); - - dinfo->hour = hour; - dinfo->minute = minute; - dinfo->second = second; - - dinfo->abstime = abstime; - - return 0; -} - -/* Set the instance's value using the given date and time. calendar - may be set to the flags: GREGORIAN_CALENDAR, JULIAN_CALENDAR to - indicate the calendar to be used. */ -static int dInfoCalc_SetFromAbsDateTime(struct date_info *dinfo, - npy_int64 absdate, double abstime, - int calendar) { - /* Bounds check */ - Py_AssertWithArg(abstime >= 0.0 && abstime <= SECONDS_PER_DAY, - PyExc_ValueError, - "abstime out of range (0.0 - 86400.0): %f", abstime); - - /* Calculate the date */ - if (dInfoCalc_SetFromAbsDate(dinfo, absdate, calendar)) goto onError; - - /* Calculate the time */ - if (dInfoCalc_SetFromAbsTime(dinfo, abstime)) goto onError; - - return 0; -onError: - return INT_ERR_CODE; -} - /* ------------------------------------------------------------------ * New pandas API-helper code, to expose to cython * ------------------------------------------------------------------*/ @@ -1130,357 +682,7 @@ npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, func = get_asfreq_func(freq1, freq2); - get_asfreq_info(freq1, freq2, &finfo); - - // printf("\n%x %d %d %ld %ld\n", func, freq1, freq2, - // finfo.intraday_conversion_factor, -finfo.intraday_conversion_factor); - - val = (*func)(period_ordinal, relation, &finfo); - - if (val == INT_ERR_CODE) { - // Py_Error(PyExc_ValueError, "Unable to convert to desired - // frequency."); - goto onError; - } + get_asfreq_info(freq1, freq2, relation, &finfo); + val = (*func)(period_ordinal, &finfo); return val; -onError: - return INT_ERR_CODE; -} - -/* generate an ordinal in period space */ -npy_int64 get_period_ordinal(int year, int month, int day, int hour, int minute, - int second, int microseconds, int picoseconds, - int freq) { - npy_int64 absdays, delta, seconds; - npy_int64 weeks, days; - npy_int64 ordinal, day_adj; - int freq_group, fmonth, mdiff; - freq_group = get_freq_group(freq); - - if (freq == FR_SEC || freq == FR_MS || freq == FR_US || freq == FR_NS) { - absdays = absdate_from_ymd(year, month, day); - delta = (absdays - ORD_OFFSET); - seconds = - (npy_int64)(delta * 86400 + hour * 3600 + minute * 60 + second); - - switch (freq) { - case FR_MS: - return seconds * 1000 + microseconds / 1000; - - case FR_US: - return seconds * 1000000 + microseconds; - - case FR_NS: - return seconds * 1000000000 + microseconds * 1000 + - picoseconds / 1000; - } - - return seconds; - } - - if (freq == FR_MIN) { - absdays = absdate_from_ymd(year, month, day); - delta = (absdays - ORD_OFFSET); - return (npy_int64)(delta * 1440 + hour * 60 + minute); - } - - if (freq == FR_HR) { - if ((absdays = absdate_from_ymd(year, month, day)) == INT_ERR_CODE) { - goto onError; - } - delta = (absdays - ORD_OFFSET); - return (npy_int64)(delta * 24 + hour); - } - - if (freq == FR_DAY) { - return (npy_int64)(absdate_from_ymd(year, month, day) - ORD_OFFSET); - } - - if (freq == FR_UND) { - return (npy_int64)(absdate_from_ymd(year, month, day) - ORD_OFFSET); - } - - if (freq == FR_BUS) { - if ((days = absdate_from_ymd(year, month, day)) == INT_ERR_CODE) { - goto onError; - } - // calculate the current week assuming sunday as last day of a week - weeks = (days - 
BASE_WEEK_TO_DAY_OFFSET) / DAYS_PER_WEEK; - // calculate the current weekday (in range 1 .. 7) - delta = (days - BASE_WEEK_TO_DAY_OFFSET) % DAYS_PER_WEEK + 1; - // return the number of business days in full weeks plus the business - // days in the last - possible partial - week - return (npy_int64)(weeks * BUSINESS_DAYS_PER_WEEK) + - (delta <= BUSINESS_DAYS_PER_WEEK ? delta - : BUSINESS_DAYS_PER_WEEK + 1) - - BDAY_OFFSET; - } - - if (freq_group == FR_WK) { - if ((ordinal = (npy_int64)absdate_from_ymd(year, month, day)) == - INT_ERR_CODE) { - goto onError; - } - day_adj = freq - FR_WK; - return (ordinal - (1 + day_adj)) / 7 + 1 - WEEK_OFFSET; - } - - if (freq == FR_MTH) { - return (year - BASE_YEAR) * 12 + month - 1; - } - - if (freq_group == FR_QTR) { - fmonth = freq - FR_QTR; - if (fmonth == 0) fmonth = 12; - - mdiff = month - fmonth; - if (mdiff < 0) mdiff += 12; - if (month >= fmonth) mdiff += 12; - - return (year - BASE_YEAR) * 4 + (mdiff - 1) / 3; - } - - if (freq_group == FR_ANN) { - fmonth = freq - FR_ANN; - if (fmonth == 0) fmonth = 12; - if (month <= fmonth) { - return year - BASE_YEAR; - } else { - return year - BASE_YEAR + 1; - } - } - - Py_Error(PyExc_RuntimeError, "Unable to generate frequency ordinal"); - -onError: - return INT_ERR_CODE; -} - -/* - Returns the proleptic Gregorian ordinal of the date, as an integer. - This corresponds to the number of days since Jan., 1st, 1AD. - When the instance has a frequency less than daily, the proleptic date - is calculated for the last day of the period. - */ - -npy_int64 get_python_ordinal(npy_int64 period_ordinal, int freq) { - asfreq_info af_info; - freq_conv_func toDaily = NULL; - - if (freq == FR_DAY) return period_ordinal + ORD_OFFSET; - - toDaily = get_asfreq_func(freq, FR_DAY); - get_asfreq_info(freq, FR_DAY, &af_info); - - return toDaily(period_ordinal, 'E', &af_info) + ORD_OFFSET; -} - - -// function to generate a nice string representation of the period -// object, originally from DateObject_strftime - -char *c_strftime(struct date_info *tmp, char *fmt) { - struct tm c_date; - char *result; - struct date_info dinfo = *tmp; - int result_len = strlen(fmt) + 50; - - c_date.tm_sec = (int)dinfo.second; - c_date.tm_min = dinfo.minute; - c_date.tm_hour = dinfo.hour; - c_date.tm_mday = dinfo.day; - c_date.tm_mon = dinfo.month - 1; - c_date.tm_year = dinfo.year - 1900; - c_date.tm_wday = (dinfo.day_of_week + 1) % 7; - c_date.tm_yday = dinfo.day_of_year - 1; - c_date.tm_isdst = -1; - - result = malloc(result_len * sizeof(char)); - - strftime(result, result_len, fmt, &c_date); - - return result; -} - -int get_yq(npy_int64 ordinal, int freq, int *quarter, int *year) { - asfreq_info af_info; - int qtr_freq; - npy_int64 daily_ord; - npy_int64 (*toDaily)(npy_int64, char, asfreq_info *) = NULL; - - toDaily = get_asfreq_func(freq, FR_DAY); - get_asfreq_info(freq, FR_DAY, &af_info); - - daily_ord = toDaily(ordinal, 'E', &af_info); - - if (get_freq_group(freq) == FR_QTR) { - qtr_freq = freq; - } else { - qtr_freq = FR_QTR; - } - get_asfreq_info(FR_DAY, qtr_freq, &af_info); - - if (DtoQ_yq(daily_ord, &af_info, year, quarter) == INT_ERR_CODE) return -1; - - return 0; -} - -static int _quarter_year(npy_int64 ordinal, int freq, int *year, int *quarter) { - asfreq_info af_info; - int qtr_freq; - - ordinal = get_python_ordinal(ordinal, freq) - ORD_OFFSET; - - if (get_freq_group(freq) == FR_QTR) - qtr_freq = freq; - else - qtr_freq = FR_QTR; - - get_asfreq_info(FR_DAY, qtr_freq, &af_info); - - if (DtoQ_yq(ordinal, &af_info, year, quarter) == 
INT_ERR_CODE) - return INT_ERR_CODE; - - if ((qtr_freq % 1000) > 12) *year -= 1; - - return 0; -} - -static int _ISOWeek(struct date_info *dinfo) { - int week; - - /* Estimate */ - week = (dinfo->day_of_year - 1) - dinfo->day_of_week + 3; - if (week >= 0) week = week / 7 + 1; - - /* Verify */ - if (week < 0) { - /* The day lies in last week of the previous year */ - if ((week > -2) || (week == -2 && dInfoCalc_Leapyear(dinfo->year - 1, - dinfo->calendar))) - week = 53; - else - week = 52; - } else if (week == 53) { - /* Check if the week belongs to year or year+1 */ - if (31 - dinfo->day + dinfo->day_of_week < 3) { - week = 1; - } - } - - return week; -} - -int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo) { - npy_int64 absdate = get_python_ordinal(ordinal, freq); - double abstime = get_abs_time(freq, absdate - ORD_OFFSET, ordinal); - - while (abstime < 0) { - abstime += 86400; - absdate -= 1; - } - while (abstime >= 86400) { - abstime -= 86400; - absdate += 1; - } - - if (dInfoCalc_SetFromAbsDateTime(dinfo, absdate, abstime, - GREGORIAN_CALENDAR)) - return INT_ERR_CODE; - - return 0; -} - -int pyear(npy_int64 ordinal, int freq) { - struct date_info dinfo; - get_date_info(ordinal, freq, &dinfo); - return dinfo.year; -} - -int pqyear(npy_int64 ordinal, int freq) { - int year, quarter; - if (_quarter_year(ordinal, freq, &year, &quarter) == INT_ERR_CODE) - return INT_ERR_CODE; - return year; -} - -int pquarter(npy_int64 ordinal, int freq) { - int year, quarter; - if (_quarter_year(ordinal, freq, &year, &quarter) == INT_ERR_CODE) - return INT_ERR_CODE; - return quarter; -} - -int pmonth(npy_int64 ordinal, int freq) { - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - return dinfo.month; -} - -int pday(npy_int64 ordinal, int freq) { - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - return dinfo.day; -} - -int pweekday(npy_int64 ordinal, int freq) { - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - return dinfo.day_of_week; -} - -int pday_of_week(npy_int64 ordinal, int freq) { - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - return dinfo.day_of_week; -} - -int pday_of_year(npy_int64 ordinal, int freq) { - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - return dinfo.day_of_year; -} - -int pweek(npy_int64 ordinal, int freq) { - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - return _ISOWeek(&dinfo); -} - -int phour(npy_int64 ordinal, int freq) { - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - return dinfo.hour; -} - -int pminute(npy_int64 ordinal, int freq) { - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - return dinfo.minute; -} - -int psecond(npy_int64 ordinal, int freq) { - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - return (int)dinfo.second; -} - -int pdays_in_month(npy_int64 ordinal, int freq) { - int days; - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - - days = days_in_month[dInfoCalc_Leapyear(dinfo.year, dinfo.calendar)] - [dinfo.month - 1]; - return days; } 
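The hard-coded ``daytime_conversion_factor_matrix`` that replaces the lazily built one is just the running product of the step sizes between adjacent sub-daily frequencies (day, hour, minute, second, millisecond, microsecond, nanosecond). A short Python sketch of that relationship (an illustration only, not pandas API; the frequency letters are used informally):

    # Step sizes between consecutive sub-daily frequencies:
    # day -> hour -> minute -> second -> millisecond -> microsecond -> nanosecond
    STEPS = [24, 60, 60, 1000, 1000, 1000]
    FREQS = ['D', 'H', 'T', 'S', 'L', 'U', 'N']

    def conversion_factor(a, b):
        # Number of periods of the finer frequency per period of the coarser
        # one, i.e. the product of the step sizes between the two entries;
        # this reproduces the upper triangle of the C matrix.
        i, j = sorted((FREQS.index(a), FREQS.index(b)))
        factor = 1
        for step in STEPS[i:j]:
            factor *= step
        return factor

    assert conversion_factor('D', 'S') == 86400            # seconds per day
    assert conversion_factor('D', 'N') == 86400 * 10**9    # nanoseconds per day
    assert conversion_factor('T', 'U') == 60 * 10**6       # microseconds per minute

When converting, ``upsample_daytime`` then maps an ordinal to ``ordinal * factor`` for the start of the span or ``(ordinal + 1) * factor - 1`` for its end, which is what the new ``is_end`` flag selects.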
diff --git a/pandas/_libs/src/period_helper.h b/pandas/_libs/src/period_helper.h index 35dd20848a2ec..1573b1eeec74b 100644 --- a/pandas/_libs/src/period_helper.h +++ b/pandas/_libs/src/period_helper.h @@ -24,18 +24,6 @@ frequency conversion routines. * declarations from period here */ -#define GREGORIAN_CALENDAR 0 -#define JULIAN_CALENDAR 1 - -#define SECONDS_PER_DAY ((double)86400.0) - -#define Py_AssertWithArg(x, errortype, errorstr, a1) \ - { \ - if (!(x)) { \ - PyErr_Format(errortype, errorstr, a1); \ - goto onError; \ - } \ - } #define Py_Error(errortype, errorstr) \ { \ PyErr_SetString(errortype, errorstr); \ @@ -113,6 +101,10 @@ frequency conversion routines. #define INT_ERR_CODE INT32_MIN typedef struct asfreq_info { + int is_end; + // char relation == 'S' (for START) --> is_end = 0 + // char relation == 'E' (for END) --> is_end = 1 + int from_week_end; // day the week ends on in the "from" frequency int to_week_end; // day the week ends on in the "to" frequency @@ -126,22 +118,15 @@ typedef struct asfreq_info { } asfreq_info; typedef struct date_info { - npy_int64 absdate; - double abstime; - double second; int minute; int hour; int day; int month; - int quarter; int year; - int day_of_week; - int day_of_year; - int calendar; } date_info; -typedef npy_int64 (*freq_conv_func)(npy_int64, char, asfreq_info *); +typedef npy_int64 (*freq_conv_func)(npy_int64, asfreq_info *af_info); /* * new pandas API helper functions here @@ -149,33 +134,10 @@ typedef npy_int64 (*freq_conv_func)(npy_int64, char, asfreq_info *); npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, char relation); -npy_int64 get_period_ordinal(int year, int month, int day, int hour, int minute, - int second, int microseconds, int picoseconds, - int freq); - -npy_int64 get_python_ordinal(npy_int64 period_ordinal, int freq); - -int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo); freq_conv_func get_asfreq_func(int fromFreq, int toFreq); -void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info); - -int pyear(npy_int64 ordinal, int freq); -int pqyear(npy_int64 ordinal, int freq); -int pquarter(npy_int64 ordinal, int freq); -int pmonth(npy_int64 ordinal, int freq); -int pday(npy_int64 ordinal, int freq); -int pweekday(npy_int64 ordinal, int freq); -int pday_of_week(npy_int64 ordinal, int freq); -int pday_of_year(npy_int64 ordinal, int freq); -int pweek(npy_int64 ordinal, int freq); -int phour(npy_int64 ordinal, int freq); -int pminute(npy_int64 ordinal, int freq); -int psecond(npy_int64 ordinal, int freq); -int pdays_in_month(npy_int64 ordinal, int freq); - -char *c_strftime(struct date_info *dinfo, char *fmt); -int get_yq(npy_int64 ordinal, int freq, int *quarter, int *year); - -void initialize_daytime_conversion_factor_matrix(void); +void get_asfreq_info(int fromFreq, int toFreq, char relation, + asfreq_info *af_info); + +npy_int64 get_daytime_conversion_factor(int from_index, int to_index); #endif // PANDAS__LIBS_SRC_PERIOD_HELPER_H_ diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 81df7981096ba..85e667521e5f2 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -7,7 +7,7 @@ import numpy as np cnp.import_array() -from cpython cimport PyFloat_Check +from cpython cimport PyFloat_Check, PyUnicode_Check from util cimport (is_integer_object, is_float_object, is_string_object, is_datetime64_object) @@ -56,6 +56,8 @@ from tslibs.timestamps cimport (create_timestamp_from_ts, _NS_UPPER_BOUND, _NS_LOWER_BOUND) from tslibs.timestamps import Timestamp +cdef 
bint PY2 = str == bytes + cdef inline object create_datetime_from_ts( int64_t value, pandas_datetimestruct dts, @@ -522,11 +524,10 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', seen_datetime = 1 if val.tzinfo is not None: if utc_convert: - _ts = convert_datetime_to_tsobject(val, None) - iresult[i] = _ts.value try: - check_dts_bounds(&_ts.dts) - except ValueError: + _ts = convert_datetime_to_tsobject(val, None) + iresult[i] = _ts.value + except OutOfBoundsDatetime: if is_coerce: iresult[i] = NPY_NAT continue @@ -542,31 +543,31 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', iresult[i] += val.nanosecond try: check_dts_bounds(&dts) - except ValueError: + except OutOfBoundsDatetime: if is_coerce: iresult[i] = NPY_NAT continue raise elif PyDate_Check(val): + seen_datetime = 1 iresult[i] = pydate_to_dt64(val, &dts) try: check_dts_bounds(&dts) - seen_datetime = 1 - except ValueError: + except OutOfBoundsDatetime: if is_coerce: iresult[i] = NPY_NAT continue raise elif is_datetime64_object(val): + seen_datetime = 1 if get_datetime64_value(val) == NPY_NAT: iresult[i] = NPY_NAT else: try: iresult[i] = get_datetime64_nanos(val) - seen_datetime = 1 - except ValueError: + except OutOfBoundsDatetime: if is_coerce: iresult[i] = NPY_NAT continue @@ -574,19 +575,18 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', elif is_integer_object(val) or is_float_object(val): # these must be ns unit by-definition + seen_integer = 1 if val != val or val == NPY_NAT: iresult[i] = NPY_NAT elif is_raise or is_ignore: iresult[i] = val - seen_integer = 1 else: # coerce # we now need to parse this as if unit='ns' # we can ONLY accept integers at this point # if we have previously (or in future accept # datetimes/strings, then we must coerce) - seen_integer = 1 try: iresult[i] = cast_from_unit(val, 'ns') except: @@ -594,42 +594,37 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', elif is_string_object(val): # string + seen_string = 1 if len(val) == 0 or val in nat_strings: iresult[i] = NPY_NAT continue - - seen_string = 1 + if PyUnicode_Check(val) and PY2: + val = val.encode('utf-8') try: _string_to_dts(val, &dts, &out_local, &out_tzoffset) - value = dtstruct_to_dt64(&dts) - if out_local == 1: - tz = pytz.FixedOffset(out_tzoffset) - value = tz_convert_single(value, tz, 'UTC') - iresult[i] = value - check_dts_bounds(&dts) except ValueError: - # if requiring iso8601 strings, skip trying other formats - if require_iso8601: - if _parse_today_now(val, &iresult[i]): - continue + # A ValueError at this point is a _parsing_ error + # specifically _not_ OutOfBoundsDatetime + if _parse_today_now(val, &iresult[i]): + continue + elif require_iso8601: + # if requiring iso8601 strings, skip trying + # other formats if is_coerce: iresult[i] = NPY_NAT continue elif is_raise: - raise ValueError( - "time data %r doesn't match format " - "specified" % (val,)) - else: - return values + raise ValueError("time data {val} doesn't match " + "format specified" + .format(val=val)) + return values try: py_dt = parse_datetime_string(val, dayfirst=dayfirst, yearfirst=yearfirst) except Exception: - if _parse_today_now(val, &iresult[i]): - continue if is_coerce: iresult[i] = NPY_NAT continue @@ -638,16 +633,42 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', try: _ts = convert_datetime_to_tsobject(py_dt, None) iresult[i] = _ts.value - except ValueError: + except OutOfBoundsDatetime: if is_coerce: iresult[i] = NPY_NAT continue raise except: + # TODO: What exception 
are we concerned with here? if is_coerce: iresult[i] = NPY_NAT continue raise + else: + # No error raised by string_to_dts, pick back up + # where we left off + value = dtstruct_to_dt64(&dts) + if out_local == 1: + tz = pytz.FixedOffset(out_tzoffset) + value = tz_convert_single(value, tz, 'UTC') + iresult[i] = value + try: + check_dts_bounds(&dts) + except OutOfBoundsDatetime: + # GH#19382 for just-barely-OutOfBounds falling back to + # dateutil parser will return incorrect result because + # it will ignore nanoseconds + if is_coerce: + iresult[i] = NPY_NAT + continue + elif require_iso8601: + if is_raise: + raise ValueError("time data {val} doesn't " + "match format specified" + .format(val=val)) + return values + raise + else: if is_coerce: iresult[i] = NPY_NAT diff --git a/pandas/_libs/tslibs/ccalendar.pxd b/pandas/_libs/tslibs/ccalendar.pxd index a1bbeea1cb69a..42473a97a7150 100644 --- a/pandas/_libs/tslibs/ccalendar.pxd +++ b/pandas/_libs/tslibs/ccalendar.pxd @@ -10,3 +10,4 @@ cdef int dayofweek(int y, int m, int m) nogil cdef bint is_leapyear(int64_t year) nogil cpdef int32_t get_days_in_month(int year, Py_ssize_t month) nogil cpdef int32_t get_week_of_year(int year, int month, int day) nogil +cpdef int32_t get_day_of_year(int year, int month, int day) nogil diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index ae52f7dd30165..9bd315b43ea9e 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -142,17 +142,13 @@ cpdef int32_t get_week_of_year(int year, int month, int day) nogil: Assumes the inputs describe a valid date. """ cdef: - bint isleap, isleap_prev - int32_t mo_off + bint isleap int32_t doy, dow int woy isleap = is_leapyear(year) - isleap_prev = is_leapyear(year - 1) - - mo_off = _month_offset[isleap * 13 + month - 1] - doy = mo_off + day + doy = get_day_of_year(year, month, day) dow = dayofweek(year, month, day) # estimate @@ -162,7 +158,7 @@ cpdef int32_t get_week_of_year(int year, int month, int day) nogil: # verify if woy < 0: - if (woy > -2) or (woy == -2 and isleap_prev): + if (woy > -2) or (woy == -2 and is_leapyear(year - 1)): woy = 53 else: woy = 52 @@ -171,3 +167,35 @@ cpdef int32_t get_week_of_year(int year, int month, int day) nogil: woy = 1 return woy + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef int32_t get_day_of_year(int year, int month, int day) nogil: + """Return the ordinal day-of-year for the given day. + + Parameters + ---------- + year : int + month : int + day : int + + Returns + ------- + day_of_year : int32_t + + Notes + ----- + Assumes the inputs describe a valid date. 
+ """ + cdef: + bint isleap + int32_t mo_off + int day_of_year + + isleap = is_leapyear(year) + + mo_off = _month_offset[isleap * 13 + month - 1] + + day_of_year = mo_off + day + return day_of_year diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 6e7df10e7c424..868c2641b34db 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # cython: profile=False -from cpython.datetime cimport datetime +from cpython.datetime cimport datetime, tzinfo from numpy cimport int64_t, int32_t @@ -21,8 +21,6 @@ cdef convert_to_tsobject(object ts, object tz, object unit, cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, int32_t nanos=*) -cdef void _localize_tso(_TSObject obj, object tz) - cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2) cdef int64_t get_datetime64_nanos(object val) except? -1 @@ -30,3 +28,5 @@ cdef int64_t get_datetime64_nanos(object val) except? -1 cpdef int64_t pydt_to_i8(object pydt) except? -1 cdef maybe_datetimelike_to_i8(object val) + +cdef int64_t tz_convert_utc_to_tzlocal(int64_t utc_val, tzinfo tz) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index a32bfc1f6836c..beaca1a8483c7 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -26,6 +26,7 @@ from np_datetime cimport (check_dts_bounds, dt64_to_dtstruct, dtstruct_to_dt64, get_datetime64_unit, get_datetime64_value, pydatetime_to_dt64) +from np_datetime import OutOfBoundsDatetime from util cimport (is_string_object, is_datetime64_object, @@ -308,12 +309,13 @@ cdef convert_to_tsobject(object ts, object tz, object unit, raise TypeError('Cannot convert input [{}] of type {} to ' 'Timestamp'.format(ts, type(ts))) - if obj.value != NPY_NAT: - check_dts_bounds(&obj.dts) - if tz is not None: - _localize_tso(obj, tz) + localize_tso(obj, tz) + if obj.value != NPY_NAT: + # check_overflows needs to run after localize_tso + check_dts_bounds(&obj.dts) + check_overflows(obj) return obj @@ -390,6 +392,7 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, obj.dts.ps = nanos * 1000 check_dts_bounds(&obj.dts) + check_overflows(obj) return obj @@ -453,6 +456,7 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, obj.value = tz_convert_single(obj.value, obj.tzinfo, 'UTC') if tz is None: check_dts_bounds(&obj.dts) + check_overflows(obj) return obj else: # Keep the converter same as PyDateTime's @@ -468,10 +472,17 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, else: ts = obj.value if tz is not None: - # shift for _localize_tso + # shift for localize_tso ts = tz_localize_to_utc(np.array([ts], dtype='i8'), tz, ambiguous='raise', errors='raise')[0] + + except OutOfBoundsDatetime: + # GH#19382 for just-barely-OutOfBounds falling back to dateutil + # parser will return incorrect result because it will ignore + # nanoseconds + raise + except ValueError: try: ts = parse_datetime_string(ts, dayfirst=dayfirst, @@ -482,16 +493,55 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, return convert_to_tsobject(ts, tz, unit, dayfirst, yearfirst) +cdef inline check_overflows(_TSObject obj): + """ + Check that we haven't silently overflowed in timezone conversion + + Parameters + ---------- + obj : _TSObject + + Returns + ------- + None + + Raises + ------ + OutOfBoundsDatetime + """ + # GH#12677 + if obj.dts.year == 1677: + if not (obj.value < 0): 
+ raise OutOfBoundsDatetime + elif obj.dts.year == 2262: + if not (obj.value > 0): + raise OutOfBoundsDatetime + + # ---------------------------------------------------------------------- # Localization -cdef inline void _localize_tso(_TSObject obj, object tz): +cdef inline void localize_tso(_TSObject obj, tzinfo tz): """ - Take a TSObject in UTC and localizes to timezone tz. + Given the UTC nanosecond timestamp in obj.value, find the wall-clock + representation of that timestamp in the given timezone. + + Parameters + ---------- + obj : _TSObject + tz : tzinfo + + Returns + ------- + None + + Notes + ----- + Sets obj.tzinfo inplace, alters obj.dts inplace. """ cdef: ndarray[int64_t] trans, deltas - int64_t delta + int64_t delta, local_val Py_ssize_t posn datetime dt @@ -502,11 +552,8 @@ cdef inline void _localize_tso(_TSObject obj, object tz): elif obj.value == NPY_NAT: pass elif is_tzlocal(tz): - dt64_to_dtstruct(obj.value, &obj.dts) - dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, obj.dts.hour, - obj.dts.min, obj.dts.sec, obj.dts.us, tz) - delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 - dt64_to_dtstruct(obj.value + delta, &obj.dts) + local_val = tz_convert_utc_to_tzlocal(obj.value, tz) + dt64_to_dtstruct(local_val, &obj.dts) else: # Adjust datetime64 timestamp, recompute datetimestruct trans, deltas, typ = get_dst_info(tz) @@ -548,6 +595,66 @@ cdef inline datetime _localize_pydatetime(datetime dt, tzinfo tz): # ---------------------------------------------------------------------- # Timezone Conversion +cdef inline int64_t tz_convert_tzlocal_to_utc(int64_t val, tzinfo tz): + """ + Parameters + ---------- + val : int64_t + tz : tzinfo + + Returns + ------- + utc_date : int64_t + + See Also + -------- + tz_convert_utc_to_tzlocal + """ + cdef: + pandas_datetimestruct dts + int64_t utc_date, delta + datetime dt + + dt64_to_dtstruct(val, &dts) + dt = datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, tz) + delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 + utc_date = val - delta + return utc_date + + +cdef inline int64_t tz_convert_utc_to_tzlocal(int64_t utc_val, tzinfo tz): + """ + Parameters + ---------- + utc_val : int64_t + tz : tzinfo + + Returns + ------- + local_val : int64_t + + See Also + -------- + tz_convert_tzlocal_to_utc + + Notes + ----- + The key difference between this and tz_convert_tzlocal_to_utc is a + an addition flipped to a subtraction in the last line. 
+ """ + cdef: + pandas_datetimestruct dts + int64_t local_val, delta + datetime dt + + dt64_to_dtstruct(utc_val, &dts) + dt = datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, tz) + delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 + local_val = utc_val + delta + return local_val + cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2): """ @@ -582,11 +689,7 @@ cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2): # Convert to UTC if is_tzlocal(tz1): - dt64_to_dtstruct(val, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz1) - delta = int(get_utcoffset(tz1, dt).total_seconds()) * 1000000000 - utc_date = val - delta + utc_date = tz_convert_tzlocal_to_utc(val, tz1) elif get_timezone(tz1) != 'UTC': trans, deltas, typ = get_dst_info(tz1) pos = trans.searchsorted(val, side='right') - 1 @@ -600,11 +703,7 @@ cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2): if get_timezone(tz2) == 'UTC': return utc_date elif is_tzlocal(tz2): - dt64_to_dtstruct(utc_date, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz2) - delta = int(get_utcoffset(tz2, dt).total_seconds()) * 1000000000 - return utc_date + delta + return tz_convert_utc_to_tzlocal(utc_date, tz2) # Convert UTC to other timezone trans, deltas, typ = get_dst_info(tz2) @@ -654,12 +753,7 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): if v == NPY_NAT: utc_dates[i] = NPY_NAT else: - dt64_to_dtstruct(v, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz1) - delta = (int(get_utcoffset(tz1, dt).total_seconds()) * - 1000000000) - utc_dates[i] = v - delta + utc_dates[i] = tz_convert_tzlocal_to_utc(v, tz1) else: trans, deltas, typ = get_dst_info(tz1) @@ -694,12 +788,7 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): if v == NPY_NAT: result[i] = NPY_NAT else: - dt64_to_dtstruct(v, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz2) - delta = (int(get_utcoffset(tz2, dt).total_seconds()) * - 1000000000) - result[i] = v + delta + result[i] = tz_convert_utc_to_tzlocal(v, tz2) return result # Convert UTC to other timezone @@ -769,11 +858,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, if is_tzlocal(tz): for i in range(n): v = vals[i] - dt64_to_dtstruct(v, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz) - delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 - result[i] = v - delta + result[i] = tz_convert_tzlocal_to_utc(v, tz) return result if is_string_object(ambiguous): @@ -1016,11 +1101,8 @@ cdef ndarray[int64_t] _normalize_local(ndarray[int64_t] stamps, object tz): if stamps[i] == NPY_NAT: result[i] = NPY_NAT continue - dt64_to_dtstruct(stamps[i], &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz) - delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 - dt64_to_dtstruct(stamps[i] + delta, &dts) + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + dt64_to_dtstruct(local_val, &dts) result[i] = _normalized_stamp(&dts) else: # Adjust datetime64 timestamp, recompute datetimestruct @@ -1089,7 +1171,7 @@ def is_date_array_normalized(ndarray[int64_t] stamps, tz=None): Py_ssize_t i, n = len(stamps) ndarray[int64_t] trans, deltas pandas_datetimestruct dts - datetime dt + int64_t local_val if tz is None or is_utc(tz): for i in 
range(n): @@ -1098,11 +1180,9 @@ def is_date_array_normalized(ndarray[int64_t] stamps, tz=None): return False elif is_tzlocal(tz): for i in range(n): - dt64_to_dtstruct(stamps[i], &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, - dts.sec, dts.us, tz) - dt = dt + tz.utcoffset(dt) - if (dt.hour + dt.minute + dt.second + dt.microsecond) > 0: + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + dt64_to_dtstruct(local_val, &dts) + if (dts.hour + dts.min + dts.sec + dts.us) > 0: return False else: trans, deltas, typ = get_dst_info(tz) diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index a8a865eec38dd..7a4b9775bd56e 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -15,7 +15,7 @@ cnp.import_array() from ccalendar cimport (get_days_in_month, is_leapyear, dayofweek, - get_week_of_year) + get_week_of_year, get_day_of_year) from np_datetime cimport (pandas_datetimestruct, pandas_timedeltastruct, dt64_to_dtstruct, td64_to_tdstruct) from nattype cimport NPY_NAT @@ -374,15 +374,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field): cdef: Py_ssize_t i, count = 0 ndarray[int32_t] out - ndarray[int32_t, ndim=2] _month_offset - int isleap, isleap_prev pandas_datetimestruct dts - int mo_off, doy, dow - - _month_offset = np.array( - [[0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365], - [0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366]], - dtype=np.int32 ) count = len(dtindex) out = np.empty(count, dtype='i4') @@ -482,8 +474,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field): continue dt64_to_dtstruct(dtindex[i], &dts) - isleap = is_leapyear(dts.year) - out[i] = _month_offset[isleap, dts.month -1] + dts.day + out[i] = get_day_of_year(dts.year, dts.month, dts.day) return out elif field == 'dow': diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index e2caebe4c4afc..c11a8b149bc13 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -11,7 +11,9 @@ from numpy cimport int64_t, import_array, ndarray import numpy as np import_array() -from libc.stdlib cimport free +from libc.stdlib cimport free, malloc +from libc.time cimport strftime, tm +from libc.string cimport strlen, memset from pandas.compat import PY2 @@ -22,7 +24,15 @@ from cpython.datetime cimport PyDateTime_Check, PyDateTime_IMPORT PyDateTime_IMPORT from np_datetime cimport (pandas_datetimestruct, dtstruct_to_dt64, - dt64_to_dtstruct, is_leapyear) + dt64_to_dtstruct, + PANDAS_FR_D, + pandas_datetime_to_datetimestruct, + PANDAS_DATETIMEUNIT) + +cdef extern from "../src/datetime/np_datetime.h": + int64_t pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, + pandas_datetimestruct *d + ) nogil cimport util from util cimport is_period_object, is_string_object, INT32_MIN @@ -33,12 +43,16 @@ from timestamps import Timestamp from timezones cimport is_utc, is_tzlocal, get_utcoffset, get_dst_info from timedeltas cimport delta_to_nanoseconds +cimport ccalendar +from ccalendar cimport dayofweek, get_day_of_year from ccalendar import MONTH_NUMBERS +from ccalendar cimport is_leapyear +from conversion cimport tz_convert_utc_to_tzlocal from frequencies cimport (get_freq_code, get_base_alias, get_to_timestamp_base, get_freq_str, get_rule_month) from parsing import parse_time_string, NAT_SENTINEL -from resolution import resolution, Resolution +from resolution import Resolution from nattype import nat_strings, NaT, iNaT from nattype cimport _nat_scalar_rules, NPY_NAT @@ -47,21 
+61,35 @@ from pandas.tseries import frequencies cdef extern from "period_helper.h": + int FR_ANN + int FR_QTR + int FR_MTH + int FR_WK + int FR_DAY + int FR_HR + int FR_MIN + int FR_SEC + int FR_MS + int FR_US + int FR_NS + int FR_BUS + int FR_UND + + int ORD_OFFSET + int WEEK_OFFSET + int BDAY_OFFSET + ctypedef struct date_info: - int64_t absdate - double abstime double second int minute int hour int day int month - int quarter int year - int day_of_week - int day_of_year - int calendar ctypedef struct asfreq_info: + int is_end + int from_week_end int to_week_end @@ -71,41 +99,332 @@ cdef extern from "period_helper.h": int from_q_year_end int to_q_year_end - ctypedef int64_t (*freq_conv_func)(int64_t, char, asfreq_info*) + ctypedef int64_t (*freq_conv_func)(int64_t, asfreq_info*) nogil - void initialize_daytime_conversion_factor_matrix() int64_t asfreq(int64_t dtordinal, int freq1, int freq2, char relation) except INT32_MIN - freq_conv_func get_asfreq_func(int fromFreq, int toFreq) - void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info) - - int64_t get_period_ordinal(int year, int month, int day, - int hour, int minute, int second, - int microseconds, int picoseconds, - int freq) nogil except INT32_MIN - - int get_date_info(int64_t ordinal, int freq, - date_info *dinfo) nogil except INT32_MIN - - int pyear(int64_t ordinal, int freq) except INT32_MIN - int pqyear(int64_t ordinal, int freq) except INT32_MIN - int pquarter(int64_t ordinal, int freq) except INT32_MIN - int pmonth(int64_t ordinal, int freq) except INT32_MIN - int pday(int64_t ordinal, int freq) except INT32_MIN - int pweekday(int64_t ordinal, int freq) except INT32_MIN - int pday_of_week(int64_t ordinal, int freq) except INT32_MIN - # TODO: pday_of_week and pweekday are identical. Make one an alias instead - # of importing them separately. 
- int pday_of_year(int64_t ordinal, int freq) except INT32_MIN - int pweek(int64_t ordinal, int freq) except INT32_MIN - int phour(int64_t ordinal, int freq) except INT32_MIN - int pminute(int64_t ordinal, int freq) except INT32_MIN - int psecond(int64_t ordinal, int freq) except INT32_MIN - int pdays_in_month(int64_t ordinal, int freq) except INT32_MIN - char *c_strftime(date_info *dinfo, char *fmt) - int get_yq(int64_t ordinal, int freq, int *quarter, int *year) - -initialize_daytime_conversion_factor_matrix() + freq_conv_func get_asfreq_func(int fromFreq, int toFreq) nogil + void get_asfreq_info(int fromFreq, int toFreq, char relation, + asfreq_info *af_info) nogil + + int64_t get_daytime_conversion_factor(int from_index, int to_index) nogil + + +@cython.cdivision +cdef char* c_strftime(date_info *dinfo, char *fmt): + """ + Generate a nice string representation of the period + object, originally from DateObject_strftime + + Parameters + ---------- + dinfo : date_info* + fmt : char* + + Returns + ------- + result : char* + """ + cdef: + tm c_date + char *result + int result_len = strlen(fmt) + 50 + + c_date.tm_sec = dinfo.second + c_date.tm_min = dinfo.minute + c_date.tm_hour = dinfo.hour + c_date.tm_mday = dinfo.day + c_date.tm_mon = dinfo.month - 1 + c_date.tm_year = dinfo.year - 1900 + c_date.tm_wday = (dayofweek(dinfo.year, dinfo.month, dinfo.day) + 1) % 7 + c_date.tm_yday = get_day_of_year(dinfo.year, dinfo.month, dinfo.day) - 1 + c_date.tm_isdst = -1 + + result = malloc(result_len * sizeof(char)) + + strftime(result, result_len, fmt, &c_date) + + return result + + +# ---------------------------------------------------------------------- +# Conversion between date_info and pandas_datetimestruct + +cdef inline int get_freq_group(int freq) nogil: + return (freq // 1000) * 1000 + + +@cython.cdivision +cdef int64_t get_period_ordinal(int year, int month, int day, + int hour, int minute, int second, + int microseconds, int picoseconds, + int freq) nogil: + """generate an ordinal in period space""" + cdef: + int64_t absdays, unix_date, seconds, delta + int64_t weeks + int64_t day_adj + int freq_group, fmonth, mdiff + + freq_group = get_freq_group(freq) + + if freq_group == FR_ANN: + fmonth = freq - FR_ANN + if fmonth == 0: + fmonth = 12 + if month <= fmonth: + return year - 1970 + else: + return year - 1970 + 1 + + elif freq_group == FR_QTR: + fmonth = freq - FR_QTR + if fmonth == 0: + fmonth = 12 + + mdiff = month - fmonth + # TODO: Aren't the next two conditions equivalent to + # unconditional incrementing? 
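Regarding the TODO just above: the two branches do appear to reduce to a single unconditional mdiff += 12, since they are mutually exclusive and exhaustive (the first fires exactly when month < fmonth, the second exactly when month >= fmonth), so every path leaves mdiff equal to month - fmonth + 12; for example month=1, fmonth=12 gives -11 + 12 = 1, and month=12, fmonth=12 gives 0 + 12 = 12.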
+ if mdiff < 0: + mdiff += 12 + if month >= fmonth: + mdiff += 12 + + return (year - 1970) * 4 + (mdiff - 1) / 3 + + elif freq == FR_MTH: + return (year - 1970) * 12 + month - 1 + + absdays = absdate_from_ymd(year, month, day) + unix_date = absdays - ORD_OFFSET + + if freq >= FR_SEC: + seconds = unix_date * 86400 + hour * 3600 + minute * 60 + second + + if freq == FR_MS: + return seconds * 1000 + microseconds / 1000 + + elif freq == FR_US: + return seconds * 1000000 + microseconds + + elif freq == FR_NS: + return (seconds * 1000000000 + + microseconds * 1000 + picoseconds / 1000) + + else: + return seconds + + elif freq == FR_MIN: + return unix_date * 1440 + hour * 60 + minute + + elif freq == FR_HR: + return unix_date * 24 + hour + + elif freq == FR_DAY: + return unix_date + + elif freq == FR_UND: + return unix_date + + elif freq == FR_BUS: + # calculate the current week assuming sunday as last day of a week + # Jan 1 0001 is a Monday, so subtract 1 to get to end-of-week + weeks = (unix_date + ORD_OFFSET - 1) / 7 + # calculate the current weekday (in range 1 .. 7) + delta = (unix_date + ORD_OFFSET - 1) % 7 + 1 + # return the number of business days in full weeks plus the business + # days in the last - possible partial - week + if delta <= 5: + return (weeks * 5) + delta - BDAY_OFFSET + else: + return (weeks * 5) + (5 + 1) - BDAY_OFFSET + + elif freq_group == FR_WK: + day_adj = freq - FR_WK + return (unix_date + ORD_OFFSET - (1 + day_adj)) / 7 + 1 - WEEK_OFFSET + + # raise ValueError + + +cdef int get_date_info(int64_t ordinal, int freq, date_info *dinfo) nogil: + cdef: + int64_t absdate + double abstime + + absdate = get_python_ordinal(ordinal, freq); + abstime = get_abs_time(freq, absdate - ORD_OFFSET, ordinal) + + while abstime < 0: + abstime += 86400 + absdate -= 1 + + while abstime >= 86400: + abstime -= 86400 + absdate += 1 + + dInfoCalc_SetFromAbsDateTime(dinfo, absdate, abstime) + return 0 + + +cdef int64_t get_python_ordinal(int64_t period_ordinal, int freq) nogil: + """ + Returns the proleptic Gregorian ordinal of the date, as an integer. + This corresponds to the number of days since Jan., 1st, 1AD. + When the instance has a frequency less than daily, the proleptic date + is calculated for the last day of the period. + """ + cdef: + asfreq_info af_info + freq_conv_func toDaily = NULL + + if freq == FR_DAY: + return period_ordinal + ORD_OFFSET + + toDaily = get_asfreq_func(freq, FR_DAY) + get_asfreq_info(freq, FR_DAY, 'E', &af_info) + return toDaily(period_ordinal, &af_info) + ORD_OFFSET + + +cdef int dInfoCalc_SetFromAbsDateTime(date_info *dinfo, + int64_t absdate, double abstime) nogil: + """ + Set the instance's value using the given date and time. + Assumes GREGORIAN_CALENDAR. + """ + # Bounds check + # The calling function is responsible for ensuring that + # abstime >= 0.0 and abstime <= 86400 + + # Calculate the date + dInfoCalc_SetFromAbsDate(dinfo, absdate) + + # Calculate the time + dInfoCalc_SetFromAbsTime(dinfo, abstime) + return 0 + + +cdef int dInfoCalc_SetFromAbsDate(date_info *dinfo, int64_t absdate) nogil: + """ + Sets the date part of the date_info struct + Assumes GREGORIAN_CALENDAR + """ + cdef: + pandas_datetimestruct dts + + pandas_datetime_to_datetimestruct(absdate - ORD_OFFSET, PANDAS_FR_D, &dts) + dinfo.year = dts.year + dinfo.month = dts.month + dinfo.day = dts.day + return 0 + + +@cython.cdivision +cdef int dInfoCalc_SetFromAbsTime(date_info *dinfo, double abstime) nogil: + """ + Sets the time part of the DateTime object. 
+ """ + cdef: + int inttime + int hour, minute + double second + + inttime = abstime + hour = inttime / 3600 + minute = (inttime % 3600) / 60 + second = abstime - (hour * 3600 + minute * 60) + + dinfo.hour = hour + dinfo.minute = minute + dinfo.second = second + return 0 + + +@cython.cdivision +cdef double get_abs_time(int freq, int64_t date_ordinal, + int64_t ordinal) nogil: + cdef: + int freq_index, day_index, base_index + int64_t per_day, start_ord + double unit, result + + if freq <= FR_DAY: + return 0 + + freq_index = freq // 1000 + day_index = FR_DAY // 1000 + base_index = FR_SEC // 1000 + + per_day = get_daytime_conversion_factor(day_index, freq_index) + unit = get_daytime_conversion_factor(freq_index, base_index) + + if base_index < freq_index: + unit = 1 / unit + + start_ord = date_ordinal * per_day + result = (unit * (ordinal - start_ord)) + return result + + +cdef int64_t absdate_from_ymd(int year, int month, int day) nogil: + """ + Find the absdate (days elapsed since datetime(1, 1, 1) + for the given year/month/day. + Assumes GREGORIAN_CALENDAR + """ + # /* Calculate the absolute date + cdef: + pandas_datetimestruct dts + int64_t unix_date + + memset(&dts, 0, sizeof(pandas_datetimestruct)) + dts.year = year + dts.month = month + dts.day = day + unix_date = pandas_datetimestruct_to_datetime(PANDAS_FR_D, &dts) + return ORD_OFFSET + unix_date + + +cdef int get_yq(int64_t ordinal, int freq, int *quarter, int *year): + cdef: + asfreq_info af_info + int qtr_freq + int64_t daily_ord + + daily_ord = get_python_ordinal(ordinal, freq) - ORD_OFFSET + + if get_freq_group(freq) == FR_QTR: + qtr_freq = freq + else: + qtr_freq = FR_QTR + + get_asfreq_info(FR_DAY, qtr_freq, 'E', &af_info) + + DtoQ_yq(daily_ord, &af_info, year, quarter) + return qtr_freq + + +cdef int64_t DtoQ_yq(int64_t ordinal, asfreq_info *af_info, + int *year, int *quarter): + cdef: + date_info dinfo + + dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET) + + if af_info.to_q_year_end != 12: + dinfo.month -= af_info.to_q_year_end + if dinfo.month <= 0: + dinfo.month += 12 + else: + dinfo.year += 1 + + year[0] = dinfo.year + quarter[0] = monthToQuarter(dinfo.month) + return 0 + + +cdef inline int monthToQuarter(int month): + return (month - 1) // 3 + 1 + # ---------------------------------------------------------------------- # Period logic @@ -171,8 +490,7 @@ cdef char START = 'S' cdef char END = 'E' -cpdef int64_t period_asfreq(int64_t period_ordinal, int freq1, int freq2, - bint end): +cpdef int64_t period_asfreq(int64_t ordinal, int freq1, int freq2, bint end): """ Convert period ordinal from one frequency to another, and if upsampling, choose to use start ('S') or end ('E') of period. 
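One way to sanity-check the ordinal arithmetic ported into get_period_ordinal above is through the public Period API, whose ordinals use the same 1970 anchor (an illustration only, not part of this patch):

    import pandas as pd

    # monthly ordinals count months since January 1970,
    # matching (year - 1970) * 12 + month - 1 above
    pd.Period('1970-01', freq='M').ordinal     # 0
    pd.Period('2018-03', freq='M').ordinal     # (2018 - 1970) * 12 + 2 == 578

    # daily ordinals are the unix date, i.e. days since 1970-01-01
    pd.Period('1970-01-02', freq='D').ordinal  # 1
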
@@ -180,13 +498,13 @@ cpdef int64_t period_asfreq(int64_t period_ordinal, int freq1, int freq2, cdef: int64_t retval - if period_ordinal == iNaT: + if ordinal == iNaT: return iNaT if end: - retval = asfreq(period_ordinal, freq1, freq2, END) + retval = asfreq(ordinal, freq1, freq2, END) else: - retval = asfreq(period_ordinal, freq1, freq2, START) + retval = asfreq(ordinal, freq1, freq2, START) if retval == INT32_MIN: raise ValueError('Frequency conversion failed') @@ -203,33 +521,33 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): ndarray[int64_t] result Py_ssize_t i, n freq_conv_func func - asfreq_info finfo + asfreq_info af_info int64_t val char relation n = len(arr) result = np.empty(n, dtype=np.int64) - func = get_asfreq_func(freq1, freq2) - get_asfreq_info(freq1, freq2, &finfo) - if end: relation = END else: relation = START + func = get_asfreq_func(freq1, freq2) + get_asfreq_info(freq1, freq2, relation, &af_info) + mask = arr == iNaT if mask.any(): # NaT process for i in range(n): val = arr[i] if val != iNaT: - val = func(val, relation, &finfo) + val = func(val, &af_info) if val == INT32_MIN: raise ValueError("Unable to convert to desired frequency.") result[i] = val else: for i in range(n): - val = func(arr[i], relation, &finfo) + val = func(arr[i], &af_info) if val == INT32_MIN: raise ValueError("Unable to convert to desired frequency.") result[i] = val @@ -367,19 +685,110 @@ cdef object _period_strftime(int64_t value, int freq, object fmt): return result + +# ---------------------------------------------------------------------- # period accessors ctypedef int (*accessor)(int64_t ordinal, int freq) except INT32_MIN +cdef int pyear(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + return dinfo.year + + +@cython.cdivision +cdef int pqyear(int64_t ordinal, int freq): + cdef: + int year, quarter, qtr_freq + qtr_freq = get_yq(ordinal, freq, &quarter, &year) + if (qtr_freq % 1000) > 12: + year -= 1 + return year + + +cdef int pquarter(int64_t ordinal, int freq): + cdef: + int year, quarter, qtr_freq + qtr_freq = get_yq(ordinal, freq, &quarter, &year) + if (qtr_freq % 1000) > 12: + year -= 1 + return quarter + + +cdef int pmonth(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + return dinfo.month + + +cdef int pday(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + return dinfo.day + + +cdef int pweekday(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + return dayofweek(dinfo.year, dinfo.month, dinfo.day) + + +cdef int pday_of_year(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + return get_day_of_year(dinfo.year, dinfo.month, dinfo.day) + + +cdef int pweek(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + return ccalendar.get_week_of_year(dinfo.year, dinfo.month, dinfo.day) + + +cdef int phour(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + return dinfo.hour + + +cdef int pminute(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + return dinfo.minute + + +cdef int psecond(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + return dinfo.second + + +cdef int pdays_in_month(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) 
+ return ccalendar.get_days_in_month(dinfo.year, dinfo.month) + + def get_period_field_arr(int code, ndarray[int64_t] arr, int freq): cdef: Py_ssize_t i, sz ndarray[int64_t] out accessor f - f = _get_accessor_func(code) - if f is NULL: + func = _get_accessor_func(code) + if func is NULL: raise ValueError('Unrecognized period code: %d' % code) sz = len(arr) @@ -389,36 +798,36 @@ def get_period_field_arr(int code, ndarray[int64_t] arr, int freq): if arr[i] == iNaT: out[i] = -1 continue - out[i] = f(arr[i], freq) + out[i] = func(arr[i], freq) return out cdef accessor _get_accessor_func(int code): if code == 0: - return &pyear + return pyear elif code == 1: - return &pqyear + return pqyear elif code == 2: - return &pquarter + return pquarter elif code == 3: - return &pmonth + return pmonth elif code == 4: - return &pday + return pday elif code == 5: - return &phour + return phour elif code == 6: - return &pminute + return pminute elif code == 7: - return &psecond + return psecond elif code == 8: - return &pweek + return pweek elif code == 9: - return &pday_of_year + return pday_of_year elif code == 10: - return &pweekday + return pweekday elif code == 11: - return &pdays_in_month + return pdays_in_month return NULL @@ -483,6 +892,7 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, ndarray[int64_t] result = np.empty(n, dtype=np.int64) ndarray[int64_t] trans, deltas, pos pandas_datetimestruct dts + int64_t local_val if is_utc(tz): for i in range(n): @@ -499,11 +909,8 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, if stamps[i] == NPY_NAT: result[i] = NPY_NAT continue - dt64_to_dtstruct(stamps[i], &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz) - delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 - dt64_to_dtstruct(stamps[i] + delta, &dts) + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + dt64_to_dtstruct(local_val, &dts) result[i] = get_period_ordinal(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index b166babe5992c..d0a9501afe566 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -23,6 +23,7 @@ from timezones cimport (is_utc, is_tzlocal, maybe_get_tz, get_dst_info, get_utcoffset) from fields import build_field_sarray from conversion import tz_convert +from conversion cimport tz_convert_utc_to_tzlocal from ccalendar import MONTH_ALIASES, int_to_weekday from pandas._libs.properties import cache_readonly @@ -78,6 +79,7 @@ cdef _reso_local(ndarray[int64_t] stamps, object tz): int reso = RESO_DAY, curr_reso ndarray[int64_t] trans, deltas, pos pandas_datetimestruct dts + int64_t local_val if is_utc(tz): for i in range(n): @@ -91,11 +93,8 @@ cdef _reso_local(ndarray[int64_t] stamps, object tz): for i in range(n): if stamps[i] == NPY_NAT: continue - dt64_to_dtstruct(stamps[i], &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz) - delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 - dt64_to_dtstruct(stamps[i] + delta, &dts) + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + dt64_to_dtstruct(local_val, &dts) curr_reso = _reso_stamp(&dts) if curr_reso < reso: reso = curr_reso diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index b9be9c16eb6c3..ed77916a1d887 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ 
b/pandas/_libs/tslibs/timestamps.pyx @@ -29,8 +29,7 @@ from nattype import NaT from nattype cimport NPY_NAT from np_datetime import OutOfBoundsDatetime from np_datetime cimport (reverse_ops, cmp_scalar, check_dts_bounds, - pandas_datetimestruct, dt64_to_dtstruct, - is_leapyear) + pandas_datetimestruct, dt64_to_dtstruct) from timedeltas import Timedelta from timedeltas cimport delta_to_nanoseconds from timezones cimport ( @@ -59,6 +58,46 @@ cdef inline object create_timestamp_from_ts(int64_t value, return ts_base +def round_ns(values, rounder, freq): + """ + Applies rounding function at given frequency + + Parameters + ---------- + values : int, :obj:`ndarray` + rounder : function + freq : str, obj + + Returns + ------- + int or :obj:`ndarray` + """ + from pandas.tseries.frequencies import to_offset + unit = to_offset(freq).nanos + if unit < 1000: + # for nano rounding, work with the last 6 digits separately + # due to float precision + buff = 1000000 + r = (buff * (values // buff) + unit * + (rounder((values % buff) * (1 / float(unit)))).astype('i8')) + else: + if unit % 1000 != 0: + msg = 'Precision will be lost using frequency: {}' + warnings.warn(msg.format(freq)) + + # GH19206 + # to deal with round-off when unit is large + if unit >= 1e9: + divisor = 10 ** int(np.log10(unit / 1e7)) + else: + divisor = 10 + + r = (unit * rounder((values * (divisor / float(unit))) / divisor) + .astype('i8')) + + return r + + # This is PITA. Because we inherit from datetime, which has very specific # construction requirements, we need to do object instantiation in python # (see Timestamp class above). This will serve as a C extension type that @@ -291,14 +330,6 @@ cdef class _Timestamp(datetime): val = tz_convert_single(self.value, 'UTC', self.tz) return val - cpdef int _get_field(self, field): - cdef: - int64_t val - ndarray[int32_t] out - val = self._maybe_convert_value_to_local() - out = get_date_field(np.array([val], dtype=np.int64), field) - return int(out[0]) - cpdef bint _get_start_end_field(self, str field): cdef: int64_t val @@ -590,28 +621,12 @@ class Timestamp(_Timestamp): return create_timestamp_from_ts(ts.value, ts.dts, ts.tzinfo, freq) def _round(self, freq, rounder): - - cdef: - int64_t unit, r, value, buff = 1000000 - object result - - from pandas.tseries.frequencies import to_offset - unit = to_offset(freq).nanos if self.tz is not None: value = self.tz_localize(None).value else: value = self.value - if unit < 1000 and unit % 1000 != 0: - # for nano rounding, work with the last 6 digits separately - # due to float precision - r = (buff * (value // buff) + unit * - (rounder((value % buff) / float(unit))).astype('i8')) - elif unit >= 1000 and unit % 1000 != 0: - msg = 'Precision will be lost using frequency: {}' - warnings.warn(msg.format(freq)) - r = (unit * rounder(value / float(unit)).astype('i8')) - else: - r = (unit * rounder(value / float(unit)).astype('i8')) + + r = round_ns(value, rounder, freq) result = Timestamp(r, unit='ns') if self.tz is not None: result = result.tz_localize(self.tz) @@ -695,14 +710,11 @@ class Timestamp(_Timestamp): @property def dayofyear(self): - return self._get_field('doy') + return ccalendar.get_day_of_year(self.year, self.month, self.day) @property def week(self): - if self.freq is None: - # fastpath for non-business - return ccalendar.get_week_of_year(self.year, self.month, self.day) - return self._get_field('woy') + return ccalendar.get_week_of_year(self.year, self.month, self.day) weekofyear = week @@ -764,7 +776,7 @@ class Timestamp(_Timestamp): 
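The new round_ns helper above centralizes the frequency-rounding math behind Timestamp.round, floor and ceil (its divisor branch is the GH19206 guard against float round-off for coarse frequencies). A quick illustration through the public API, not part of this patch:

    import pandas as pd

    ts = pd.Timestamp('2017-01-01 11:59:30')
    ts.round('H')    # Timestamp('2017-01-01 12:00:00')
    ts.floor('T')    # Timestamp('2017-01-01 11:59:00')
    ts.ceil('15T')   # Timestamp('2017-01-01 12:00:00')
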
@property def is_leap_year(self): - return bool(is_leapyear(self.year)) + return bool(ccalendar.is_leapyear(self.year)) def tz_localize(self, tz, ambiguous='raise', errors='raise'): """ diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 4cdec54b9a07a..c65943fbbb201 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,15 +1,20 @@ +import warnings import numpy as np from pandas import compat from pandas._libs import reduction +from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.common import ( is_extension_type, is_sequence) +from pandas.util._decorators import cache_readonly from pandas.io.formats.printing import pprint_thing -def frame_apply(obj, func, axis=0, broadcast=False, - raw=False, reduce=None, args=(), **kwds): +def frame_apply(obj, func, axis=0, broadcast=None, + raw=False, reduce=None, result_type=None, + ignore_failures=False, + args=None, kwds=None): """ construct and return a row or column based frame apply object """ axis = obj._get_axis_number(axis) @@ -19,20 +24,49 @@ def frame_apply(obj, func, axis=0, broadcast=False, klass = FrameColumnApply return klass(obj, func, broadcast=broadcast, - raw=raw, reduce=reduce, args=args, kwds=kwds) + raw=raw, reduce=reduce, result_type=result_type, + ignore_failures=ignore_failures, + args=args, kwds=kwds) class FrameApply(object): - def __init__(self, obj, func, broadcast, raw, reduce, args, kwds): + def __init__(self, obj, func, broadcast, raw, reduce, result_type, + ignore_failures, args, kwds): self.obj = obj - self.broadcast = broadcast self.raw = raw - self.reduce = reduce - self.args = args - - self.ignore_failures = kwds.pop('ignore_failures', False) - self.kwds = kwds + self.ignore_failures = ignore_failures + self.args = args or () + self.kwds = kwds or {} + + if result_type not in [None, 'reduce', 'broadcast', 'expand']: + raise ValueError("invalid value for result_type, must be one " + "of {None, 'reduce', 'broadcast', 'expand'}") + + if broadcast is not None: + warnings.warn("The broadcast argument is deprecated and will " + "be removed in a future version. You can specify " + "result_type='broadcast' to broadcast the result " + "to the original dimensions", + FutureWarning, stacklevel=4) + if broadcast: + result_type = 'broadcast' + + if reduce is not None: + warnings.warn("The reduce argument is deprecated and will " + "be removed in a future version. 
You can specify " + "result_type='reduce' to try to reduce the result " + "to the original dimensions", + FutureWarning, stacklevel=4) + if reduce: + + if result_type is not None: + raise ValueError( + "cannot pass both reduce=True and result_type") + + result_type = 'reduce' + + self.result_type = result_type # curry if needed if kwds or args and not isinstance(func, np.ufunc): @@ -43,6 +77,11 @@ def f(x): self.f = f + # results + self.result = None + self.res_index = None + self.res_columns = None + @property def columns(self): return self.obj.columns @@ -51,10 +90,14 @@ def columns(self): def index(self): return self.obj.index - @property + @cache_readonly def values(self): return self.obj.values + @cache_readonly + def dtypes(self): + return self.obj.dtypes + @property def agg_axis(self): return self.obj._get_agg_axis(self.axis) @@ -68,8 +111,7 @@ def get_result(self): # string dispatch if isinstance(self.f, compat.string_types): - if self.axis: - self.kwds['axis'] = self.axis + self.kwds['axis'] = self.axis return getattr(self.obj, self.f)(*self.args, **self.kwds) # ufunc @@ -80,25 +122,37 @@ def get_result(self): columns=self.columns, copy=False) # broadcasting - if self.broadcast: + if self.result_type == 'broadcast': return self.apply_broadcast() # one axis empty - if not all(self.obj.shape): + elif not all(self.obj.shape): return self.apply_empty_result() # raw - if self.raw and not self.obj._is_mixed_type: + elif self.raw and not self.obj._is_mixed_type: return self.apply_raw() return self.apply_standard() def apply_empty_result(self): - from pandas import Series - reduce = self.reduce + """ + we have an empty result; at least 1 axis is 0 + + we will try to apply the function to an empty + series in order to see if this is a reduction function + """ + + # we are not asked to reduce or infer reduction + # so just return a copy of the existing object + if self.result_type not in ['reduce', None]: + return self.obj.copy() + + # we may need to infer + reduce = self.result_type == 'reduce' - if reduce is None: - reduce = False + from pandas import Series + if not reduce: EMPTY_SERIES = Series([]) try: @@ -113,6 +167,8 @@ def apply_empty_result(self): return self.obj.copy() def apply_raw(self): + """ apply to the values as a numpy array """ + try: result = reduction.reduce(self.values, self.f, axis=self.axis) except Exception: @@ -125,49 +181,70 @@ def apply_raw(self): else: return Series(result, index=self.agg_axis) - def apply_standard(self): - from pandas import Series + def apply_broadcast(self, target): + result_values = np.empty_like(target.values) + + # axis which we want to compare compliance + result_compare = target.shape[0] + + for i, col in enumerate(target.columns): + res = self.f(target[col]) + ares = np. asarray(res).ndim + + # must be a scalar or 1d + if ares > 1: + raise ValueError("too many dims to broadcast") + elif ares == 1: + + # must match return dim + if result_compare != len(res): + raise ValueError("cannot broadcast result") - reduce = self.reduce - if reduce is None: - reduce = True + result_values[:, i] = res + + # we *always* preserve the original index / columns + result = self.obj._constructor(result_values, + index=target.index, + columns=target.columns) + return result + + def apply_standard(self): # try to reduce first (by default) # this only matters if the reduction in values is of different dtype # e.g. 
if we want to apply to a SparseFrame, then can't directly reduce - if reduce: - values = self.values - # we cannot reduce using non-numpy dtypes, - # as demonstrated in gh-12244 - if not is_extension_type(values): + # we cannot reduce using non-numpy dtypes, + # as demonstrated in gh-12244 + if (self.result_type in ['reduce', None] and + not self.dtypes.apply(is_extension_type).any()): - # Create a dummy Series from an empty array - index = self.obj._get_axis(self.axis) - empty_arr = np.empty(len(index), dtype=values.dtype) - - dummy = Series(empty_arr, index=index, dtype=values.dtype) + # Create a dummy Series from an empty array + from pandas import Series + values = self.values + index = self.obj._get_axis(self.axis) + labels = self.agg_axis + empty_arr = np.empty(len(index), dtype=values.dtype) + dummy = Series(empty_arr, index=index, dtype=values.dtype) - try: - labels = self.agg_axis - result = reduction.reduce(values, self.f, - axis=self.axis, - dummy=dummy, - labels=labels) - return Series(result, index=labels) - except Exception: - pass + try: + result = reduction.reduce(values, self.f, + axis=self.axis, + dummy=dummy, + labels=labels) + return Series(result, index=labels) + except Exception: + pass # compute the result using the series generator - results, res_index, res_columns = self._apply_series_generator() + self.apply_series_generator() # wrap results - return self.wrap_results(results, res_index, res_columns) + return self.wrap_results() - def _apply_series_generator(self): + def apply_series_generator(self): series_gen = self.series_generator res_index = self.result_index - res_columns = self.result_columns i = None keys = [] @@ -201,40 +278,23 @@ def _apply_series_generator(self): pprint_thing(k), ) raise - return results, res_index, res_columns + self.results = results + self.res_index = res_index + self.res_columns = self.result_columns - def wrap_results(self, results, res_index, res_columns): - from pandas import Series + def wrap_results(self): + results = self.results + # see if we can infer the results if len(results) > 0 and is_sequence(results[0]): - if not isinstance(results[0], Series): - index = res_columns - else: - index = None - result = self.obj._constructor(data=results, index=index) - result.columns = res_index + return self.wrap_results_for_axis() - if self.axis == 1: - result = result.T - result = result._convert( - datetime=True, timedelta=True, copy=False) - - else: - - result = Series(results) - result.index = res_index - - return result - - def _apply_broadcast(self, target): - result_values = np.empty_like(target.values) - columns = target.columns - for i, col in enumerate(columns): - result_values[:, i] = self.f(target[col]) + # dict of scalars + from pandas import Series + result = Series(results) + result.index = self.res_index - result = self.obj._constructor(result_values, index=target.index, - columns=target.columns) return result @@ -251,7 +311,7 @@ def get_result(self): return super(FrameRowApply, self).get_result() def apply_broadcast(self): - return self._apply_broadcast(self.obj) + return super(FrameRowApply, self).apply_broadcast(self.obj) @property def series_generator(self): @@ -266,29 +326,37 @@ def result_index(self): def result_columns(self): return self.index + def wrap_results_for_axis(self): + """ return the results for the rows """ -class FrameColumnApply(FrameApply): - axis = 1 + results = self.results + result = self.obj._constructor(data=results) - def __init__(self, obj, func, broadcast, raw, reduce, args, kwds): - 
super(FrameColumnApply, self).__init__(obj, func, broadcast, - raw, reduce, args, kwds) + if not isinstance(results[0], ABCSeries): + try: + result.index = self.res_columns + except ValueError: + pass - # skip if we are mixed datelike and trying reduce across axes - # GH6125 - if self.reduce: - if self.obj._is_mixed_type and self.obj._is_datelike_mixed_type: - self.reduce = False + try: + result.columns = self.res_index + except ValueError: + pass + + return result + + +class FrameColumnApply(FrameApply): + axis = 1 def apply_broadcast(self): - return self._apply_broadcast(self.obj.T).T + result = super(FrameColumnApply, self).apply_broadcast(self.obj.T) + return result.T @property def series_generator(self): - from pandas import Series - dtype = object if self.obj._is_mixed_type else None - return (Series._from_array(arr, index=self.columns, name=name, - dtype=dtype) + constructor = self.obj._constructor_sliced + return (constructor(arr, index=self.columns, name=name) for i, (arr, name) in enumerate(zip(self.values, self.index))) @@ -299,3 +367,39 @@ def result_index(self): @property def result_columns(self): return self.columns + + def wrap_results_for_axis(self): + """ return the results for the columns """ + results = self.results + + # we have requested to expand + if self.result_type == 'expand': + result = self.infer_to_same_shape() + + # we have a non-series and don't want inference + elif not isinstance(results[0], ABCSeries): + from pandas import Series + + result = Series(results) + result.index = self.res_index + + # we may want to infer results + else: + result = self.infer_to_same_shape() + + return result + + def infer_to_same_shape(self): + """ infer the results to the same shape as the input object """ + results = self.results + + result = self.obj._constructor(data=results) + result = result.T + + # set the index + result.index = self.res_index + + # infer dtypes + result = result.infer_objects() + + return result diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1556b653819a6..e618dc6b69b2d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1,4 +1,6 @@ """An interface for extending pandas with custom arrays.""" +import numpy as np + from pandas.errors import AbstractMethodError _not_implemented_message = "{} does not implement {}." @@ -138,6 +140,25 @@ def nbytes(self): # ------------------------------------------------------------------------ # Additional Methods # ------------------------------------------------------------------------ + def astype(self, dtype, copy=True): + """Cast to a NumPy array with 'dtype'. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + copy : bool, default True + Whether to copy the data, even if not necessary. If False, + a copy is made only if the old dtype does not match the + new dtype. + + Returns + ------- + array : ndarray + NumPy ndarray with 'dtype' for its dtype. + """ + return np.array(self, dtype=dtype, copy=copy) + def isna(self): # type: () -> np.ndarray """Boolean NumPy array indicating if each value is missing. @@ -245,3 +266,15 @@ def _can_hold_na(self): Setting this to false will optimize some operations like fillna. """ return True + + @property + def _ndarray_values(self): + # type: () -> np.ndarray + """Internal pandas method for lossy conversion to a NumPy ndarray. + + This method is not part of the pandas interface. 
+ + The expectation is that this is cheap to compute, and is primarily + used for interacting with our indexers. + """ + return np.array(self) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 62c6a6b16cbe9..bcf9cb7646704 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -243,7 +243,7 @@ class Categorical(ExtensionArray, PandasObject): # For comparisons, so that numpy uses our implementation if the compare # ops, which raise __array_priority__ = 1000 - _dtype = CategoricalDtype() + _dtype = CategoricalDtype(ordered=False) _deprecations = frozenset(['labels']) _typ = 'categorical' @@ -294,7 +294,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, if fastpath: self._codes = coerce_indexer_dtype(values, categories) - self._dtype = dtype + self._dtype = self._dtype.update_dtype(dtype) return # null_mask indicates missing values we want to exclude from inference. @@ -358,7 +358,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, full_codes[~null_mask] = codes codes = full_codes - self._dtype = dtype + self._dtype = self._dtype.update_dtype(dtype) self._codes = coerce_indexer_dtype(codes, dtype.categories) @property @@ -410,6 +410,10 @@ def dtype(self): """The :class:`~pandas.api.types.CategoricalDtype` for this instance""" return self._dtype + @property + def _ndarray_values(self): + return self.codes + @property def _constructor(self): return Categorical @@ -438,7 +442,7 @@ def astype(self, dtype, copy=True): """ if is_categorical_dtype(dtype): # GH 10696/18593 - dtype = self.dtype._update_dtype(dtype) + dtype = self.dtype.update_dtype(dtype) self = self.copy() if copy else self if dtype == self.dtype: return self @@ -560,7 +564,7 @@ def from_codes(cls, codes, categories, ordered=False): raise ValueError( "codes need to be convertible to an arrays of integers") - categories = CategoricalDtype._validate_categories(categories) + categories = CategoricalDtype.validate_categories(categories) if len(codes) and (codes.max() >= len(categories) or codes.min() < -1): raise ValueError("codes need to be between -1 and " @@ -1165,7 +1169,7 @@ def __setstate__(self, state): # Provide compatibility with pre-0.15.0 Categoricals. 
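Aside (illustrative, not part of the patch): with ``update_dtype`` resolving unspecified fields, casting a Categorical to a dtype that only pins ``ordered`` keeps the existing categories, and the new ``_ndarray_values`` property exposes the integer codes:

    import pandas as pd
    from pandas.api.types import CategoricalDtype

    cat = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b'])
    ordered_cat = cat.astype(CategoricalDtype(ordered=True))
    # ordered_cat.dtype -> CategoricalDtype(categories=['a', 'b'], ordered=True)
    cat._ndarray_values   # same as cat.codes: array([0, 1, 0], dtype=int8)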
if '_categories' not in state and '_levels' in state: - state['_categories'] = self.dtype._validate_categories(state.pop( + state['_categories'] = self.dtype.validate_categories(state.pop( '_levels')) if '_codes' not in state and 'labels' in state: state['_codes'] = coerce_indexer_dtype( diff --git a/pandas/core/base.py b/pandas/core/base.py index 54d25a16a10a3..0ca029ffd4c25 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -13,7 +13,8 @@ is_list_like, is_scalar, is_datetimelike, - is_extension_type) + is_extension_type, + is_extension_array_dtype) from pandas.util._validators import validate_bool_kwarg @@ -392,6 +393,10 @@ def nested_renaming_depr(level=4): elif isinstance(obj, ABCSeries): nested_renaming_depr() + elif isinstance(obj, ABCDataFrame) and \ + k not in obj.columns: + raise KeyError( + "Column '{col}' does not exist!".format(col=k)) arg = new_arg @@ -734,7 +739,7 @@ def data(self): @property def itemsize(self): """ return the size of the dtype of the item of the underlying data """ - return self._values.itemsize + return self._ndarray_values.itemsize @property def nbytes(self): @@ -744,7 +749,7 @@ def nbytes(self): @property def strides(self): """ return the strides of the underlying data """ - return self._values.strides + return self._ndarray_values.strides @property def size(self): @@ -764,8 +769,17 @@ def base(self): return self.values.base @property - def _values(self): - """ the internal implementation """ + def _ndarray_values(self): + """The data as an ndarray, possibly losing information. + + The expectation is that this is cheap to compute, and is primarily + used for interacting with our indexers. + + - categorical -> codes + """ + # type: () -> np.ndarray + if is_extension_array_dtype(self): + return self.values._ndarray_values return self.values @property @@ -975,6 +989,7 @@ def unique(self): values = self._values if hasattr(values, 'unique'): + result = values.unique() else: from pandas.core.algorithms import unique1d @@ -1048,7 +1063,7 @@ def is_monotonic_decreasing(self): def memory_usage(self, deep=False): """ - Memory usage of my values + Memory usage of the values Parameters ---------- diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b2816343fc8eb..55919fb2bea0d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -927,7 +927,7 @@ def try_timedelta(v): # will try first with a string & object conversion from pandas import to_timedelta try: - return to_timedelta(v)._values.reshape(shape) + return to_timedelta(v)._ndarray_values.reshape(shape) except Exception: return v.reshape(shape) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c66e7fcfc6978..c2b71bc316fe8 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1709,7 +1709,7 @@ def is_extension_array_dtype(arr_or_dtype): from pandas.core.arrays import ExtensionArray # we want to unpack series, anything else? - if isinstance(arr_or_dtype, ABCSeries): + if isinstance(arr_or_dtype, (ABCIndexClass, ABCSeries)): arr_or_dtype = arr_or_dtype._values return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray)) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index ddecbe85087d8..d306d0d78f1f4 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -488,12 +488,14 @@ def _concat_index_asobject(to_concat, name=None): concat all inputs as object. 
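Illustrative note, not part of the diff: at the Series/Index level the new ``_ndarray_values`` hook unwraps extension values, so for categorical data the indexing machinery sees the codes rather than the boxed Categorical:

    import pandas as pd

    s = pd.Series(['a', 'b', 'a'], dtype='category')
    s.values            # the Categorical itself
    s._ndarray_values   # the underlying integer codes as an ndarray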
DatetimeIndex, TimedeltaIndex and PeriodIndex are converted to object dtype before concatenation """ + from pandas import Index + from pandas.core.arrays import ExtensionArray - klasses = ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex + klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex, + ExtensionArray) to_concat = [x.astype(object) if isinstance(x, klasses) else x for x in to_concat] - from pandas import Index self = to_concat[0] attribs = self._get_attributes_dict() attribs['name'] = name diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index d8d3a96992757..99e4033f104db 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -159,11 +159,11 @@ class CategoricalDtype(PandasExtensionDtype): _metadata = ['categories', 'ordered'] _cache = {} - def __init__(self, categories=None, ordered=False): + def __init__(self, categories=None, ordered=None): self._finalize(categories, ordered, fastpath=False) @classmethod - def _from_fastpath(cls, categories=None, ordered=False): + def _from_fastpath(cls, categories=None, ordered=None): self = cls.__new__(cls) self._finalize(categories, ordered, fastpath=True) return self @@ -180,14 +180,12 @@ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None): def _finalize(self, categories, ordered, fastpath=False): - if ordered is None: - ordered = False - else: - self._validate_ordered(ordered) + if ordered is not None: + self.validate_ordered(ordered) if categories is not None: - categories = self._validate_categories(categories, - fastpath=fastpath) + categories = self.validate_categories(categories, + fastpath=fastpath) self._categories = categories self._ordered = ordered @@ -208,6 +206,17 @@ def __hash__(self): return int(self._hash_categories(self.categories, self.ordered)) def __eq__(self, other): + """ + Rules for CDT equality: + 1) Any CDT is equal to the string 'category' + 2) Any CDT is equal to a CDT with categories=None regardless of ordered + 3) A CDT with ordered=True is only equal to another CDT with + ordered=True and identical categories in the same order + 4) A CDT with ordered={False, None} is only equal to another CDT with + ordered={False, None} and identical categories, but same order is + not required. There is no distinction between False/None. + 5) Any other comparison returns False + """ if isinstance(other, compat.string_types): return other == self.name @@ -220,12 +229,16 @@ def __eq__(self, other): # CDT(., .) = CDT(None, False) and *all* # CDT(., .) = CDT(None, True). return True - elif self.ordered: - return other.ordered and self.categories.equals(other.categories) - elif other.ordered: - return False + elif self.ordered or other.ordered: + # At least one has ordered=True; equal if both have ordered=True + # and the same values for categories in the same order. + return ((self.ordered == other.ordered) and + self.categories.equals(other.categories)) else: - # both unordered; this could probably be optimized / cached + # Neither has ordered=True; equal if both have the same categories, + # but same order is not necessary. There is no distinction between + # ordered=False and ordered=None: CDT(., False) and CDT(., None) + # will be equal if they have the same categories. 
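            # Illustration of the rules above (not part of the patch):
            #   CategoricalDtype(['a', 'b']) == 'category'                     -> True  (rule 1)
            #   CategoricalDtype(['a', 'b']) == CategoricalDtype(['b', 'a'])   -> True  (rule 4)
            #   CategoricalDtype(['a', 'b'], ordered=True) ==
            #       CategoricalDtype(['b', 'a'], ordered=True)                 -> False (rule 3)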
return hash(self) == hash(other) def __repr__(self): @@ -288,7 +301,7 @@ def construct_from_string(cls, string): raise TypeError("cannot construct a CategoricalDtype") @staticmethod - def _validate_ordered(ordered): + def validate_ordered(ordered): """ Validates that we have a valid ordered parameter. If it is not a boolean, a TypeError will be raised. @@ -308,7 +321,7 @@ def _validate_ordered(ordered): raise TypeError("'ordered' must either be 'True' or 'False'") @staticmethod - def _validate_categories(categories, fastpath=False): + def validate_categories(categories, fastpath=False): """ Validates that we have good categories @@ -340,7 +353,7 @@ def _validate_categories(categories, fastpath=False): return categories - def _update_dtype(self, dtype): + def update_dtype(self, dtype): """ Returns a CategoricalDtype with categories and ordered taken from dtype if specified, otherwise falling back to self if unspecified @@ -361,11 +374,16 @@ def _update_dtype(self, dtype): 'got {dtype!r}').format(dtype=dtype) raise ValueError(msg) - # dtype is CDT: keep current categories if None (ordered can't be None) + # dtype is CDT: keep current categories/ordered if None new_categories = dtype.categories if new_categories is None: new_categories = self.categories - return CategoricalDtype(new_categories, dtype.ordered) + + new_ordered = dtype.ordered + if new_ordered is None: + new_ordered = self.ordered + + return CategoricalDtype(new_categories, new_ordered) @property def categories(self): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 201d8ba427c8a..bc045d74cee52 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -233,7 +233,7 @@ -------- merge_ordered merge_asof - +DataFrame.join """ # ----------------------------------------------------------------------- @@ -1059,7 +1059,7 @@ def to_gbq(self, destination_table, project_id, chunksize=10000, private_key : str (optional) Service account private key in JSON format. Can be file path or string contents. This is useful for remote server - authentication (eg. jupyter iPython notebook on remote host) + authentication (eg. Jupyter/IPython notebook on remote host) """ from pandas.io import gbq @@ -1727,7 +1727,7 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, sparsify=None, index_names=True, justify=None, bold_rows=True, classes=None, escape=True, max_rows=None, max_cols=None, show_dimensions=False, notebook=False, decimal='.', - border=None): + border=None, table_id=None): """ Render a DataFrame as an HTML table. @@ -1755,6 +1755,12 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, `
<table>` tag. Default ``pd.options.html.border``. .. versionadded:: 0.19.0 + + table_id : str, optional + A css id is included in the opening `<table>
` tag if specified. + + .. versionadded:: 0.23.0 + """ if (justify is not None and @@ -1772,7 +1778,7 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, max_rows=max_rows, max_cols=max_cols, show_dimensions=show_dimensions, - decimal=decimal) + decimal=decimal, table_id=table_id) # TODO: a generic formatter wld b in DataFrameFormatter formatter.to_html(classes=classes, notebook=notebook, border=border) @@ -2681,12 +2687,17 @@ def assign(self, **kwargs): Notes ----- - For python 3.6 and above, the columns are inserted in the order of - \*\*kwargs. For python 3.5 and earlier, since \*\*kwargs is unordered, - the columns are inserted in alphabetical order at the end of your - DataFrame. Assigning multiple columns within the same ``assign`` - is possible, but you cannot reference other columns created within - the same ``assign`` call. + Assigning multiple columns within the same ``assign`` is possible. + For Python 3.6 and above, later items in '\*\*kwargs' may refer to + newly created or modified columns in 'df'; items are computed and + assigned into 'df' in order. For Python 3.5 and below, the order of + keyword arguments is not specified, you cannot refer to newly created + or modified columns. All items are computed first, and then assigned + in alphabetical order. + + .. versionmodified :: 0.23.0 + + Keyword argument order is maintained for Python 3.6 and later. Examples -------- @@ -2722,22 +2733,34 @@ def assign(self, **kwargs): 7 8 -1.495604 2.079442 8 9 0.549296 2.197225 9 10 -0.758542 2.302585 + + Where the keyword arguments depend on each other + + >>> df = pd.DataFrame({'A': [1, 2, 3]}) + + >>> df.assign(B=df.A, C=lambda x:x['A']+ x['B']) + A B C + 0 1 1 2 + 1 2 2 4 + 2 3 3 6 """ data = self.copy() - # do all calculations first... - results = OrderedDict() - for k, v in kwargs.items(): - results[k] = com._apply_if_callable(v, data) - - # preserve order for 3.6 and later, but sort by key for 3.5 and earlier + # >= 3.6 preserve order of kwargs if PY36: - results = results.items() + for k, v in kwargs.items(): + data[k] = com._apply_if_callable(v, data) else: + # <= 3.5: do all calculations first... + results = OrderedDict() + for k, v in kwargs.items(): + results[k] = com._apply_if_callable(v, data) + + # <= 3.5 and earlier results = sorted(results.items()) - # ... and then assign - for k, v in results: - data[k] = v + # ... 
and then assign + for k, v in results: + data[k] = v return data def _sanitize_column(self, key, value, broadcast=True): @@ -3915,23 +3938,12 @@ def reorder_levels(self, order, axis=0): # ---------------------------------------------------------------------- # Arithmetic / combination related - def _combine_frame(self, other, func, fill_value=None, level=None, - try_cast=True): + def _combine_frame(self, other, func, fill_value=None, level=None): this, other = self.align(other, join='outer', level=level, copy=False) new_index, new_columns = this.index, this.columns def _arith_op(left, right): - if fill_value is not None: - left_mask = isna(left) - right_mask = isna(right) - left = left.copy() - right = right.copy() - - # one but not both - mask = left_mask ^ right_mask - left[left_mask & mask] = fill_value - right[right_mask & mask] = fill_value - + left, right = ops.fill_binop(left, right, fill_value) return func(left, right) if this._is_mixed_type or other._is_mixed_type: @@ -3968,52 +3980,40 @@ def f(i): def _combine_series(self, other, func, fill_value=None, axis=None, level=None, try_cast=True): + if fill_value is not None: + raise NotImplementedError("fill_value {fill} not supported." + .format(fill=fill_value)) + if axis is not None: axis = self._get_axis_name(axis) if axis == 'index': - return self._combine_match_index(other, func, level=level, - fill_value=fill_value, - try_cast=try_cast) + return self._combine_match_index(other, func, level=level) else: return self._combine_match_columns(other, func, level=level, - fill_value=fill_value, try_cast=try_cast) - return self._combine_series_infer(other, func, level=level, - fill_value=fill_value, - try_cast=try_cast) - - def _combine_series_infer(self, other, func, level=None, - fill_value=None, try_cast=True): - if len(other) == 0: - return self * np.nan + else: + if not len(other): + return self * np.nan - if len(self) == 0: - # Ambiguous case, use _series so works with DataFrame - return self._constructor(data=self._series, index=self.index, - columns=self.columns) + if not len(self): + # Ambiguous case, use _series so works with DataFrame + return self._constructor(data=self._series, index=self.index, + columns=self.columns) - return self._combine_match_columns(other, func, level=level, - fill_value=fill_value, - try_cast=try_cast) + # default axis is columns + return self._combine_match_columns(other, func, level=level, + try_cast=try_cast) - def _combine_match_index(self, other, func, level=None, - fill_value=None, try_cast=True): + def _combine_match_index(self, other, func, level=None): left, right = self.align(other, join='outer', axis=0, level=level, copy=False) - if fill_value is not None: - raise NotImplementedError("fill_value %r not supported." 
% - fill_value) return self._constructor(func(left.values.T, right.values).T, index=left.index, columns=self.columns, copy=False) - def _combine_match_columns(self, other, func, level=None, - fill_value=None, try_cast=True): + def _combine_match_columns(self, other, func, level=None, try_cast=True): left, right = self.align(other, join='outer', axis=1, level=level, copy=False) - if fill_value is not None: - raise NotImplementedError("fill_value %r not supported" % - fill_value) new_data = left._data.eval(func=func, other=right, axes=[left.columns, self.index], @@ -4833,14 +4833,14 @@ def aggregate(self, func, axis=0, *args, **kwargs): agg = aggregate - def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, - args=(), **kwds): - """Applies function along input axis of DataFrame. + def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, + result_type=None, args=(), **kwds): + """Applies function along an axis of the DataFrame. Objects passed to functions are Series objects having index either the DataFrame's index (axis=0) or the columns (axis=1). - Return type depends on whether passed function aggregates, or the - reduce argument if the DataFrame is empty. + Final return type depends on the return type of the applied function, + or on the `result_type` argument. Parameters ---------- @@ -4849,9 +4849,14 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, axis : {0 or 'index', 1 or 'columns'}, default 0 * 0 or 'index': apply function to each column * 1 or 'columns': apply function to each row - broadcast : boolean, default False + broadcast : boolean, optional For aggregation functions, return object of same size with values propagated + + .. deprecated:: 0.23.0 + This argument will be removed in a future version, replaced + by result_type='broadcast'. + raw : boolean, default False If False, convert each row or column into a Series. If raw=True the passed function will receive ndarray objects instead. If you are @@ -4865,6 +4870,27 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, while guessing, exceptions raised by func will be ignored). If reduce is True a Series will always be returned, and if False a DataFrame will always be returned. + + .. deprecated:: 0.23.0 + This argument will be removed in a future version, replaced + by result_type='reduce'. + + result_type : {'expand', 'reduce', 'broadcast, None} + These only act when axis=1 {columns}: + + * 'expand' : list-like results will be turned into columns. + * 'reduce' : return a Series if possible rather than expanding + list-like results. This is the opposite to 'expand'. + * 'broadcast' : results will be broadcast to the original shape + of the frame, the original index & columns will be retained. + + The default behaviour (None) depends on the return value of the + applied function: list-like results will be returned as a Series + of those. However if the apply function returns a Series these + are expanded to columns. + + .. versionadded:: 0.23.0 + args : tuple Positional arguments to pass to function in addition to the array/series @@ -4880,9 +4906,97 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, Examples -------- - >>> df.apply(numpy.sqrt) # returns DataFrame - >>> df.apply(numpy.sum, axis=0) # equiv to df.sum(0) - >>> df.apply(numpy.sum, axis=1) # equiv to df.sum(1) + + We use this DataFrame to illustrate + + >>> df = pd.DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, + ... 
columns=['A', 'B', 'C']) + >>> df + A B C + 0 1 2 3 + 1 1 2 3 + 2 1 2 3 + 3 1 2 3 + 4 1 2 3 + 5 1 2 3 + + Using a numpy universal function (in this case the same as + ``np.sqrt(df)``): + + >>> df.apply(np.sqrt) + A B C + 0 1.0 1.414214 1.732051 + 1 1.0 1.414214 1.732051 + 2 1.0 1.414214 1.732051 + 3 1.0 1.414214 1.732051 + 4 1.0 1.414214 1.732051 + 5 1.0 1.414214 1.732051 + + Using a reducing function on either axis + + >>> df.apply(np.sum, axis=0) + A 6 + B 12 + C 18 + dtype: int64 + + >>> df.apply(np.sum, axis=1) + 0 6 + 1 6 + 2 6 + 3 6 + 4 6 + 5 6 + dtype: int64 + + Retuning a list-like will result in a Series + + >>> df.apply(lambda x: [1, 2], axis=1) + 0 [1, 2] + 1 [1, 2] + 2 [1, 2] + 3 [1, 2] + 4 [1, 2] + 5 [1, 2] + + Passing result_type='expand' will expand list-like results + to columns of a Dataframe + + >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand') + 0 1 + 0 1 2 + 1 1 2 + 2 1 2 + 3 1 2 + 4 1 2 + 5 1 2 + + Returning a Series inside the function is similar to passing + ``result_type='expand'``. The resulting column names + will be the Series index. + + >>> df.apply(lambda x: Series([1, 2], index=['foo', 'bar']), axis=1) + foo bar + 0 1 2 + 1 1 2 + 2 1 2 + 3 1 2 + 4 1 2 + 5 1 2 + + Passing ``result_type='broadcast'`` will ensure the same shape + result, whether list-like or scalar is returned by the function, + and broadcast it along the axis. The resulting column names will + be the originals. + + >>> df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast') + A B C + 0 1 2 3 + 1 1 2 3 + 2 1 2 3 + 3 1 2 3 + 4 1 2 3 + 5 1 2 3 See also -------- @@ -4901,7 +5015,9 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, broadcast=broadcast, raw=raw, reduce=reduce, - args=args, **kwds) + result_type=result_type, + args=args, + kwds=kwds) return op.get_result() def applymap(self, func): @@ -5219,18 +5335,17 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', raise ValueError('Joining multiple DataFrames only supported' ' for joining on index') - # join indexes only using concat - if how == 'left': - how = 'outer' - join_axes = [self.index] - else: - join_axes = None - frames = [self] + list(other) can_concat = all(df.index.is_unique for df in frames) + # join indexes only using concat if can_concat: + if how == 'left': + how = 'outer' + join_axes = [self.index] + else: + join_axes = None return concat(frames, axis=1, join=how, join_axes=join_axes, verify_integrity=True) @@ -5605,12 +5720,16 @@ def f(x): # numeric_only and yet we have tried a # column-by-column reduction, where we have mixed type. 
# So let's just do what we can - result = self.apply(f, reduce=False, - ignore_failures=True) + from pandas.core.apply import frame_apply + opa = frame_apply(self, + func=f, + result_type='expand', + ignore_failures=True) + result = opa.get_result() if result.ndim == self.ndim: result = result.iloc[0] return result - except: + except Exception: pass if filter_type is None or filter_type == 'numeric': diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0f038cd687dfd..35f866c9e7d58 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -25,6 +25,7 @@ is_list_like, is_dict_like, is_re_compilable, + is_period_arraylike, pandas_dtype) from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask from pandas.core.dtypes.inference import is_hashable @@ -1027,10 +1028,24 @@ def _indexed_same(self, other): def __neg__(self): values = com._values_from_object(self) - if values.dtype == np.bool_: + if is_bool_dtype(values): arr = operator.inv(values) - else: + elif (is_numeric_dtype(values) or is_timedelta64_dtype(values)): arr = operator.neg(values) + else: + raise TypeError("Unary negative expects numeric dtype, not {}" + .format(values.dtype)) + return self.__array_wrap__(arr) + + def __pos__(self): + values = com._values_from_object(self) + if (is_bool_dtype(values) or is_period_arraylike(values)): + arr = values + elif (is_numeric_dtype(values) or is_timedelta64_dtype(values)): + arr = operator.pos(values) + else: + raise TypeError("Unary plus expects numeric dtype, not {}" + .format(values.dtype)) return self.__array_wrap__(arr) def __invert__(self): @@ -5691,6 +5706,10 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, reduce the dimensionality of the return type if possible, otherwise return a consistent type + Returns + ------- + GroupBy object + Examples -------- DataFrame results @@ -5702,10 +5721,15 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, >>> data.groupby(['col1', 'col2']).mean() - Returns - ------- - GroupBy object + Notes + ----- + See the `user guide + `_ for more. + See also + -------- + resample : Convenience method for frequency conversion and resampling + of time series. """ from pandas.core.groupby import groupby @@ -5904,8 +5928,16 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, .. versionadded:: 0.19.0 + Returns + ------- + Resampler object + Notes ----- + See the `user guide + `_ + for more. + To learn more about the offset strings, please see `this link `__. @@ -6071,6 +6103,10 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, a b c d 2000-01-01 00:00:00 0 6 12 18 2000-01-01 00:03:00 0 4 8 12 + + See also + -------- + groupby : Group by mapping, function, label, or list of labels. 
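Aside (illustrative, not part of the patch): the stricter unary checks added to ``__neg__``/``__pos__`` above mean non-numeric data now fails loudly instead of falling through to NumPy:

    import pandas as pd

    try:
        -pd.Series(['a', 'b'])
    except TypeError as err:
        print(err)              # Unary negative expects numeric dtype, not object

    +pd.Series([True, False])   # booleans pass through unary plus unchanged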
""" from pandas.core.resample import (resample, _maybe_process_deprecations) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 2c1deb9db7bba..0363bcd02aa16 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -37,6 +37,7 @@ _ensure_categorical, _ensure_float) from pandas.core.dtypes.cast import maybe_downcast_to_dtype +from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import isna, notna, _maybe_fill from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, @@ -230,7 +231,7 @@ Notes ----- See more `here -`_ +`_ Examples -------- @@ -423,6 +424,7 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False): self.obj = None self.indexer = None self.binner = None + self._grouper = None @property def ax(self): @@ -465,12 +467,22 @@ def _set_grouper(self, obj, sort=False): raise ValueError( "The Grouper cannot specify both a key and a level!") + # Keep self.grouper value before overriding + if self._grouper is None: + self._grouper = self.grouper + # the key must be a valid info item if self.key is not None: key = self.key - if key not in obj._info_axis: - raise KeyError("The grouper name {0} is not found".format(key)) - ax = Index(obj[key], name=key) + # The 'on' is already defined + if getattr(self.grouper, 'name', None) == key and \ + isinstance(obj, ABCSeries): + ax = self._grouper.take(obj.index) + else: + if key not in obj._info_axis: + raise KeyError( + "The grouper name {0} is not found".format(key)) + ax = Index(obj[key], name=key) else: ax = obj._get_axis(self.axis) @@ -982,7 +994,7 @@ def _transform_should_cast(self, func_nm): return (self.size().fillna(0) > 0).any() and (func_nm not in _cython_cast_blacklist) - def _cython_transform(self, how, numeric_only=True): + def _cython_transform(self, how, numeric_only=True, **kwargs): output = collections.OrderedDict() for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) @@ -990,12 +1002,16 @@ def _cython_transform(self, how, numeric_only=True): continue try: - result, names = self.grouper.transform(obj.values, how) + result, names = self.grouper.transform(obj.values, how, + **kwargs) except NotImplementedError: continue except AssertionError as e: raise GroupByError(str(e)) - output[name] = self._try_cast(result, obj) + if self._transform_should_cast(how): + output[name] = self._try_cast(result, obj) + else: + output[name] = result if len(output) == 0: raise DataError('No numeric types to aggregate') @@ -1756,6 +1772,37 @@ def cumcount(self, ascending=True): cumcounts = self._cumcount_array(ascending=ascending) return Series(cumcounts, index) + @Substitution(name='groupby') + @Appender(_doc_template) + def rank(self, method='average', ascending=True, na_option='keep', + pct=False, axis=0): + """Provides the rank of values within each group + + Parameters + ---------- + method : {'average', 'min', 'max', 'first', 'dense'}, efault 'average' + * average: average rank of group + * min: lowest rank in group + * max: highest rank in group + * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups + method : {'keep', 'top', 'bottom'}, default 'keep' + * keep: leave NA values where they are + * top: smallest rank if ascending + * bottom: smallest rank if descending + ascending : boolean, default True + False for ranks by high (1) to low (N) + pct : boolean, default False + Compute percentage rank of data within each group + + Returns + ----- + DataFrame with 
ranking of values within each group + """ + return self._cython_transform('rank', numeric_only=False, + ties_method=method, ascending=ascending, + na_option=na_option, pct=pct, axis=axis) + @Substitution(name='groupby') @Appender(_doc_template) def cumprod(self, axis=0, *args, **kwargs): @@ -2171,6 +2218,16 @@ def get_group_levels(self): 'cumsum': 'group_cumsum', 'cummin': 'group_cummin', 'cummax': 'group_cummax', + 'rank': { + 'name': 'group_rank', + 'f': lambda func, a, b, c, d, **kwargs: func( + a, b, c, d, + kwargs.get('ties_method', 'average'), + kwargs.get('ascending', True), + kwargs.get('pct', False), + kwargs.get('na_option', 'keep') + ) + } } } @@ -2228,9 +2285,10 @@ def wrapper(*args, **kwargs): raise NotImplementedError("function is not implemented for this" "dtype: [how->%s,dtype->%s]" % (how, dtype_str)) - return func, dtype_str + return func - def _cython_operation(self, kind, values, how, axis, min_count=-1): + def _cython_operation(self, kind, values, how, axis, min_count=-1, + **kwargs): assert kind in ['transform', 'aggregate'] # can we do this operation with our cython functions @@ -2292,20 +2350,23 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1): values = values.astype(object) try: - func, dtype_str = self._get_cython_function( + func = self._get_cython_function( kind, how, values, is_numeric) except NotImplementedError: if is_numeric: values = _ensure_float64(values) - func, dtype_str = self._get_cython_function( + func = self._get_cython_function( kind, how, values, is_numeric) else: raise - if is_numeric: - out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize) + if how == 'rank': + out_dtype = 'float' else: - out_dtype = 'object' + if is_numeric: + out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize) + else: + out_dtype = 'object' labels, _, _ = self.group_info @@ -2322,9 +2383,10 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1): # TODO: min_count result = self._transform( - result, values, labels, func, is_numeric, is_datetimelike) + result, values, labels, func, is_numeric, is_datetimelike, + **kwargs) - if is_integer_dtype(result): + if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype('float64') @@ -2361,8 +2423,8 @@ def aggregate(self, values, how, axis=0, min_count=-1): return self._cython_operation('aggregate', values, how, axis, min_count=min_count) - def transform(self, values, how, axis=0): - return self._cython_operation('transform', values, how, axis) + def transform(self, values, how, axis=0, **kwargs): + return self._cython_operation('transform', values, how, axis, **kwargs) def _aggregate(self, result, counts, values, comp_ids, agg_func, is_numeric, is_datetimelike, min_count=-1): @@ -2382,7 +2444,7 @@ def _aggregate(self, result, counts, values, comp_ids, agg_func, return result def _transform(self, result, values, comp_ids, transform_func, - is_numeric, is_datetimelike): + is_numeric, is_datetimelike, **kwargs): comp_ids, _, ngroups = self.group_info if values.ndim > 3: @@ -2394,9 +2456,9 @@ def _transform(self, result, values, comp_ids, transform_func, chunk = chunk.squeeze() transform_func(result[:, :, i], values, - comp_ids, is_datetimelike) + comp_ids, is_datetimelike, **kwargs) else: - transform_func(result, values, comp_ids, is_datetimelike) + transform_func(result, values, comp_ids, is_datetimelike, **kwargs) return result diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 
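Aside (illustrative, not part of the patch): the ``GroupBy.rank`` method added above ranks values within each group through the cython transform path, for example:

    import pandas as pd

    df = pd.DataFrame({'g': ['a', 'a', 'b', 'b'],
                       'x': [2, 1, 3, 3]})
    df.groupby('g').rank(method='min')
    #      x
    # 0  2.0
    # 1  1.0
    # 2  1.0
    # 3  1.0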
626f3dc86556a..be7c1624936bf 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -31,12 +31,14 @@ is_object_dtype, is_categorical_dtype, is_interval_dtype, + is_period_dtype, is_bool, is_bool_dtype, is_signed_integer_dtype, is_unsigned_integer_dtype, is_integer_dtype, is_float_dtype, is_datetime64_any_dtype, + is_datetime64tz_dtype, is_timedelta64_dtype, needs_i8_conversion, is_iterator, is_list_like, @@ -80,6 +82,26 @@ def _try_get_item(x): return x +def _make_invalid_op(name): + """ + Return a binary method that always raises a TypeError. + + Parameters + ---------- + name : str + + Returns + ------- + invalid_op : function + """ + def invalid_op(self, other=None): + raise TypeError("cannot perform {name} with this index type: " + "{typ}".format(name=name, typ=type(self))) + + invalid_op.__name__ = name + return invalid_op + + class InvalidIndexError(Exception): pass @@ -392,7 +414,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): values = np.array(values, copy=False) if is_object_dtype(values): values = cls(values, name=name, dtype=dtype, - **kwargs)._values + **kwargs)._ndarray_values result = object.__new__(cls) result._data = values @@ -574,6 +596,40 @@ def values(self): """ return the underlying data as an ndarray """ return self._data.view(np.ndarray) + @property + def _values(self): + # type: () -> Union[ExtensionArray, Index] + # TODO(EA): remove index types as they become extension arrays + """The best array representation. + + This is an ndarray, ExtensionArray, or Index subclass. This differs + from ``_ndarray_values``, which always returns an ndarray. + + Both ``_values`` and ``_ndarray_values`` are consistent between + ``Series`` and ``Index``. + + It may differ from the public '.values' method. 
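Illustrative note, not part of the diff: the distinction documented here, in concrete terms consistent with the table that follows:

    import pandas as pd

    ci = pd.CategoricalIndex(['a', 'b', 'a'])
    ci.values            # the Categorical
    ci._values           # also the Categorical
    ci._ndarray_values   # the integer codes

    pi = pd.period_range('2000-01', periods=3, freq='M')
    pi.values            # object ndarray of Period scalars
    pi._ndarray_values   # the int64 ordinals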
+ + index | values | _values | _ndarray_values | + ----------------- | -------------- -| ----------- | --------------- | + CategoricalIndex | Categorical | Categorical | codes | + DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] | + + For the following, the ``._values`` is currently ``ndarray[object]``, + but will soon be an ``ExtensionArray`` + + index | values | _values | _ndarray_values | + ----------------- | --------------- | ------------ | --------------- | + PeriodIndex | ndarray[object] | ndarray[obj] | ndarray[int] | + IntervalIndex | ndarray[object] | ndarray[obj] | ndarray[object] | + + See Also + -------- + values + _ndarray_values + """ + return self.values + def get_values(self): """ return the underlying data as an ndarray """ return self.values @@ -644,7 +700,7 @@ def ravel(self, order='C'): -------- numpy.ndarray.ravel """ - return self._values.ravel(order=order) + return self._ndarray_values.ravel(order=order) # construction helpers @classmethod @@ -1577,7 +1633,7 @@ def _constructor(self): @cache_readonly def _engine(self): # property, for now, slow to look up - return self._engine_type(lambda: self._values, len(self)) + return self._engine_type(lambda: self._ndarray_values, len(self)) def _validate_index_level(self, level): """ @@ -2208,27 +2264,37 @@ def union(self, other): other = other.astype('O') return this.union(other) + # TODO(EA): setops-refactor, clean all this up + if is_period_dtype(self) or is_datetime64tz_dtype(self): + lvals = self._ndarray_values + else: + lvals = self._values + if is_period_dtype(other) or is_datetime64tz_dtype(other): + rvals = other._ndarray_values + else: + rvals = other._values + if self.is_monotonic and other.is_monotonic: try: - result = self._outer_indexer(self._values, other._values)[0] + result = self._outer_indexer(lvals, rvals)[0] except TypeError: # incomparable objects - result = list(self._values) + result = list(lvals) # worth making this faster? 
a very unusual case - value_set = set(self._values) - result.extend([x for x in other._values if x not in value_set]) + value_set = set(lvals) + result.extend([x for x in rvals if x not in value_set]) else: indexer = self.get_indexer(other) indexer, = (indexer == -1).nonzero() if len(indexer) > 0: - other_diff = algos.take_nd(other._values, indexer, + other_diff = algos.take_nd(rvals, indexer, allow_fill=False) - result = _concat._concat_compat((self._values, other_diff)) + result = _concat._concat_compat((lvals, other_diff)) try: - self._values[0] < other_diff[0] + lvals[0] < other_diff[0] except TypeError as e: warnings.warn("%s, sort order is undefined for " "incomparable objects" % e, RuntimeWarning, @@ -2240,7 +2306,7 @@ def union(self, other): result.sort() else: - result = self._values + result = lvals try: result = np.sort(result) @@ -2291,20 +2357,30 @@ def intersection(self, other): other = other.astype('O') return this.intersection(other) + # TODO(EA): setops-refactor, clean all this up + if is_period_dtype(self): + lvals = self._ndarray_values + else: + lvals = self._values + if is_period_dtype(other): + rvals = other._ndarray_values + else: + rvals = other._values + if self.is_monotonic and other.is_monotonic: try: - result = self._inner_indexer(self._values, other._values)[0] + result = self._inner_indexer(lvals, rvals)[0] return self._wrap_union_result(other, result) except TypeError: pass try: - indexer = Index(other._values).get_indexer(self._values) + indexer = Index(rvals).get_indexer(lvals) indexer = indexer.take((indexer != -1).nonzero()[0]) except Exception: # duplicates indexer = algos.unique1d( - Index(other._values).get_indexer_non_unique(self._values)[0]) + Index(rvals).get_indexer_non_unique(lvals)[0]) indexer = indexer[indexer != -1] taken = other.take(indexer) @@ -2680,7 +2756,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): raise ValueError('limit argument only valid if doing pad, ' 'backfill or nearest reindexing') - indexer = self._engine.get_indexer(target._values) + indexer = self._engine.get_indexer(target._ndarray_values) return _ensure_platform_int(indexer) @@ -2696,12 +2772,13 @@ def _get_fill_indexer(self, target, method, limit=None, tolerance=None): if self.is_monotonic_increasing and target.is_monotonic_increasing: method = (self._engine.get_pad_indexer if method == 'pad' else self._engine.get_backfill_indexer) - indexer = method(target._values, limit) + indexer = method(target._ndarray_values, limit) else: indexer = self._get_fill_indexer_searchsorted(target, method, limit) if tolerance is not None: - indexer = self._filter_indexer_tolerance(target._values, indexer, + indexer = self._filter_indexer_tolerance(target._ndarray_values, + indexer, tolerance) return indexer @@ -2792,7 +2869,7 @@ def get_indexer_non_unique(self, target): self = Index(self.asi8) tgt_values = target.asi8 else: - tgt_values = target._values + tgt_values = target._ndarray_values indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return _ensure_platform_int(indexer), missing @@ -3227,16 +3304,17 @@ def _join_multi(self, other, how, return_indexers=True): def _join_non_unique(self, other, how='left', return_indexers=False): from pandas.core.reshape.merge import _get_join_indexers - left_idx, right_idx = _get_join_indexers([self._values], - [other._values], how=how, + left_idx, right_idx = _get_join_indexers([self._ndarray_values], + [other._ndarray_values], + how=how, sort=True) left_idx = _ensure_platform_int(left_idx) right_idx = 
_ensure_platform_int(right_idx) - join_index = np.asarray(self._values.take(left_idx)) + join_index = np.asarray(self._ndarray_values.take(left_idx)) mask = left_idx == -1 - np.putmask(join_index, mask, other._values.take(right_idx)) + np.putmask(join_index, mask, other._ndarray_values.take(right_idx)) join_index = self._wrap_joined_index(join_index, other) @@ -3383,8 +3461,8 @@ def _join_monotonic(self, other, how='left', return_indexers=False): else: return ret_index - sv = self._values - ov = other._values + sv = self._ndarray_values + ov = other._ndarray_values if self.is_unique and other.is_unique: # We can perform much better than the general case @@ -3736,7 +3814,7 @@ def insert(self, loc, item): item = self._na_value _self = np.asarray(self) - item = self._coerce_scalar_to_index(item)._values + item = self._coerce_scalar_to_index(item)._ndarray_values idx = np.concatenate((_self[:loc], item, _self[loc:])) return self._shallow_copy_with_infer(idx) @@ -3916,30 +3994,12 @@ def _evaluate_compare(self, other): @classmethod def _add_numeric_methods_add_sub_disabled(cls): """ add in the numeric add/sub methods to disable """ - - def _make_invalid_op(name): - def invalid_op(self, other=None): - raise TypeError("cannot perform {name} with this index type: " - "{typ}".format(name=name, typ=type(self))) - - invalid_op.__name__ = name - return invalid_op - cls.__add__ = cls.__radd__ = __iadd__ = _make_invalid_op('__add__') # noqa cls.__sub__ = __isub__ = _make_invalid_op('__sub__') # noqa @classmethod def _add_numeric_methods_disabled(cls): """ add in numeric methods to disable other than add/sub """ - - def _make_invalid_op(name): - def invalid_op(self, other=None): - raise TypeError("cannot perform {name} with this index type: " - "{typ}".format(name=name, typ=type(self))) - - invalid_op.__name__ = name - return invalid_op - cls.__pow__ = cls.__rpow__ = _make_invalid_op('__pow__') cls.__mul__ = cls.__rmul__ = _make_invalid_op('__mul__') cls.__floordiv__ = cls.__rfloordiv__ = _make_invalid_op('__floordiv__') @@ -4040,6 +4100,8 @@ def _evaluate_numeric_binop(self, other): attrs = self._maybe_update_attributes(attrs) with np.errstate(all='ignore'): result = op(values, other) + + result = missing.dispatch_missing(op, values, other, result) return constructor(result, **attrs) return _evaluate_numeric_binop @@ -4145,15 +4207,6 @@ def logical_func(self, *args, **kwargs): @classmethod def _add_logical_methods_disabled(cls): """ add in logical methods to disable """ - - def _make_invalid_op(name): - def invalid_op(self, other=None): - raise TypeError("cannot perform {name} with this index type: " - "{typ}".format(name=name, typ=type(self))) - - invalid_op.__name__ = name - return invalid_op - cls.all = _make_invalid_op('all') cls.any = _make_invalid_op('any') diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 2c7be2b21f959..a4d0f787cc6ec 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -293,6 +293,11 @@ def values(self): """ return the underlying data, which is a Categorical """ return self._data + @property + def itemsize(self): + # Size of the items in categories, not codes. 
+ return self.values.itemsize + def get_values(self): """ return the underlying data as an ndarray """ return self._data.get_values() @@ -344,7 +349,7 @@ def astype(self, dtype, copy=True): return IntervalIndex(np.array(self)) elif is_categorical_dtype(dtype): # GH 18630 - dtype = self.dtype._update_dtype(dtype) + dtype = self.dtype.update_dtype(dtype) if dtype == self.dtype: return self.copy() if copy else self @@ -386,8 +391,8 @@ def is_monotonic_decreasing(self): def unique(self, level=None): if level is not None: self._validate_index_level(level) - result = base.IndexOpsMixin.unique(self) - # CategoricalIndex._shallow_copy uses keeps original categories + result = self.values.unique() + # CategoricalIndex._shallow_copy keeps original categories # and ordered if not otherwise specified return self._shallow_copy(result, categories=result.categories, ordered=result.ordered) @@ -553,6 +558,8 @@ def _reindex_non_unique(self, target): @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): + from pandas.core.arrays.categorical import _recode_for_categories + method = missing.clean_reindex_fill_method(method) target = ibase._ensure_index(target) @@ -568,8 +575,13 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): if (isinstance(target, CategoricalIndex) and self.values.is_dtype_equal(target)): - # we have the same codes - codes = target.codes + if self.values.equals(target.values): + # we have the same codes + codes = target.codes + else: + codes = _recode_for_categories(target.codes, + target.categories, + self.values.categories) else: if isinstance(target, CategoricalIndex): code_indexer = self.categories.get_indexer(target.categories) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 8e77c7a7fa48c..c98f8ceea0ffa 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -36,6 +36,7 @@ from pandas._libs import lib, iNaT, NaT from pandas._libs.tslibs.period import Period from pandas._libs.tslibs.timedeltas import delta_to_nanoseconds +from pandas._libs.tslibs.timestamps import round_ns from pandas.core.indexes.base import Index, _index_shared_docs from pandas.util._decorators import Appender, cache_readonly @@ -90,23 +91,9 @@ class TimelikeOps(object): """) def _round(self, freq, rounder): - - from pandas.tseries.frequencies import to_offset - unit = to_offset(freq).nanos # round the local times values = _ensure_datetimelike_to_i8(self) - if unit < 1000 and unit % 1000 != 0: - # for nano rounding, work with the last 6 digits separately - # due to float precision - buff = 1000000 - result = (buff * (values // buff) + unit * - (rounder((values % buff) / float(unit))).astype('i8')) - elif unit >= 1000 and unit % 1000 != 0: - msg = 'Precision will be lost using frequency: {}' - warnings.warn(msg.format(freq)) - result = (unit * rounder(values / float(unit)).astype('i8')) - else: - result = (unit * rounder(values / float(unit)).astype('i8')) + result = round_ns(values, rounder, freq) result = self._maybe_mask_results(result, fill_value=NaT) attribs = self._get_attributes_dict() @@ -389,7 +376,7 @@ def sort_values(self, return_indexer=False, ascending=True): sorted_index = self.take(_as) return sorted_index, _as else: - sorted_values = np.sort(self._values) + sorted_values = np.sort(self._ndarray_values) attribs = self._get_attributes_dict() freq = attribs['freq'] diff --git a/pandas/core/indexes/datetimes.py 
b/pandas/core/indexes/datetimes.py index e09fa87477122..cc9ce1f3fd5eb 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -55,7 +55,7 @@ from pandas._libs import (lib, index as libindex, tslib as libts, join as libjoin, Timestamp) from pandas._libs.tslibs import (timezones, conversion, fields, parsing, - period as libperiod) + resolution as libresolution) # -------- some conversion wrapper functions @@ -678,6 +678,15 @@ def _assert_tzawareness_compat(self, other): raise TypeError('Cannot compare tz-naive and tz-aware ' 'datetime-like objects') + @property + def _values(self): + # tz-naive -> ndarray + # tz-aware -> DatetimeIndex + if self.tz is not None: + return self + else: + return self.values + @property def tzinfo(self): """ @@ -685,6 +694,27 @@ def tzinfo(self): """ return self.tz + @property + def size(self): + # TODO: Remove this when we have a DatetimeTZArray + # Necessary to avoid recursion error since DTI._values is a DTI + # for TZ-aware + return self._ndarray_values.size + + @property + def shape(self): + # TODO: Remove this when we have a DatetimeTZArray + # Necessary to avoid recursion error since DTI._values is a DTI + # for TZ-aware + return self._ndarray_values.shape + + @property + def nbytes(self): + # TODO: Remove this when we have a DatetimeTZArray + # Necessary to avoid recursion error since DTI._values is a DTI + # for TZ-aware + return self._ndarray_values.nbytes + @cache_readonly def _timezone(self): """ Comparable timezone both for pytz / dateutil""" @@ -1086,6 +1116,19 @@ def snap(self, freq='S'): # we know it conforms; skip check return DatetimeIndex(snapped, freq=freq, verify_integrity=False) + def unique(self, level=None): + # Override here since IndexOpsMixin.unique uses self._values.unique + # For DatetimeIndex with TZ, that's a DatetimeIndex -> recursion error + # So we extract the tz-naive DatetimeIndex, unique that, and wrap the + # result with out TZ. + if self.tz is not None: + naive = type(self)(self._ndarray_values, copy=False) + else: + naive = self + result = super(DatetimeIndex, naive).unique(level=level) + return self._simple_new(result, name=self.name, tz=self.tz, + freq=self.freq) + def union(self, other): """ Specialized union for DatetimeIndex objects. If combine @@ -1795,7 +1838,7 @@ def is_normalized(self): @cache_readonly def _resolution(self): - return libperiod.resolution(self.asi8, self.tz) + return libresolution.resolution(self.asi8, self.tz) def insert(self, loc, item): """ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 3bf783b5a2faa..d431ea1e51e31 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -680,6 +680,16 @@ def length(self): 'e.g. Intervals with string endpoints') raise TypeError(msg) + @property + def size(self): + # Avoid materializing self.values + return self.left.size + + @property + def shape(self): + # Avoid materializing self.values + return self.left.shape + def __len__(self): return len(self.left) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 510f7245cebd8..94dbd8b884e47 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -799,9 +799,11 @@ def values(self): box = hasattr(lev, '_box_values') # Try to minimize boxing. 
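Aside (illustrative, not part of the patch): for tz-aware data the three representations now differ, per the properties added above:

    import pandas as pd

    dti = pd.date_range('2018-01-01', periods=2, tz='US/Eastern')
    dti._values           # the DatetimeIndex itself, timezone preserved
    dti.values            # tz-naive datetime64[ns] ndarray (UTC)
    dti._ndarray_values   # same tz-naive datetime64[ns] ndarray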
if box and len(lev) > len(lab): - taken = lev._box_values(algos.take_1d(lev._values, lab)) + taken = lev._box_values(algos.take_1d(lev._ndarray_values, + lab)) elif box: - taken = algos.take_1d(lev._box_values(lev._values), lab, + taken = algos.take_1d(lev._box_values(lev._ndarray_values), + lab, fill_value=_get_na_value(lev.dtype.type)) else: taken = algos.take_1d(np.asarray(lev._values), lab) @@ -2410,7 +2412,7 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): mapper = Series(indexer) indexer = labels.take(_ensure_platform_int(indexer)) result = Series(Index(indexer).isin(r).nonzero()[0]) - m = result.map(mapper)._values + m = result.map(mapper)._ndarray_values else: m = np.zeros(len(labels), dtype=bool) @@ -2505,6 +2507,7 @@ def get_locs(self, seq): MultiIndex.slice_locs : Get slice location given start label(s) and end label(s). """ + from .numeric import Int64Index # must be lexsorted to at least as many levels true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s] @@ -2530,7 +2533,6 @@ def _convert_to_indexer(r): "that is not the same length as the " "index") r = r.nonzero()[0] - from .numeric import Int64Index return Int64Index(r) def _update_indexer(idxr, indexer=indexer): @@ -2567,9 +2569,8 @@ def _update_indexer(idxr, indexer=indexer): if indexers is not None: indexer = _update_indexer(indexers, indexer=indexer) else: - from .numeric import Int64Index # no matches we are done - return Int64Index([])._values + return Int64Index([])._ndarray_values elif com.is_null_slice(k): # empty slice @@ -2589,8 +2590,8 @@ def _update_indexer(idxr, indexer=indexer): # empty indexer if indexer is None: - return Int64Index([])._values - return indexer._values + return Int64Index([])._ndarray_values + return indexer._ndarray_values def truncate(self, before=None, after=None): """ @@ -2639,7 +2640,7 @@ def equals(self, other): if not isinstance(other, MultiIndex): other_vals = com._values_from_object(_ensure_index(other)) - return array_equivalent(self._values, other_vals) + return array_equivalent(self._ndarray_values, other_vals) if self.nlevels != other.nlevels: return False @@ -2655,8 +2656,9 @@ def equals(self, other): olabels = other.labels[i] olabels = olabels[olabels != -1] - ovalues = algos.take_nd(np.asarray(other.levels[i]._values), - olabels, allow_fill=False) + ovalues = algos.take_nd( + np.asarray(other.levels[i]._values), + olabels, allow_fill=False) # since we use NaT both datetime64 and timedelta64 # we can have a situation where a level is typed say @@ -2704,7 +2706,8 @@ def union(self, other): if len(other) == 0 or self.equals(other): return self - uniq_tuples = lib.fast_unique_multiple([self._values, other._values]) + uniq_tuples = lib.fast_unique_multiple([self._ndarray_values, + other._ndarray_values]) return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, names=result_names) @@ -2726,8 +2729,8 @@ def intersection(self, other): if self.equals(other): return self - self_tuples = self._values - other_tuples = other._values + self_tuples = self._ndarray_values + other_tuples = other._ndarray_values uniq_tuples = sorted(set(self_tuples) & set(other_tuples)) if len(uniq_tuples) == 0: return MultiIndex(levels=[[]] * self.nlevels, @@ -2756,7 +2759,8 @@ def difference(self, other): labels=[[]] * self.nlevels, names=result_names, verify_integrity=False) - difference = sorted(set(self._values) - set(other._values)) + difference = sorted(set(self._ndarray_values) - + set(other._ndarray_values)) if len(difference) == 0: return 
MultiIndex(levels=[[]] * self.nlevels, diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index b02aee0495d8c..a4558116bfa63 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -378,7 +378,7 @@ def equals(self, other): if (not is_dtype_equal(self.dtype, other.dtype) or self.shape != other.shape): return False - left, right = self._values, other._values + left, right = self._ndarray_values, other._ndarray_values return ((left == right) | (self._isnan & other._isnan)).all() except (TypeError, ValueError): return False diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 1f8542ed5ee60..8f2d7d382a16e 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -54,7 +54,7 @@ def _field_accessor(name, alias, docstring=None): def f(self): base, mult = _gfc(self.freq) - result = get_period_field_arr(alias, self._values, base) + result = get_period_field_arr(alias, self._ndarray_values, base) return Index(result, name=self.name) f.__name__ = name f.__doc__ = docstring @@ -82,7 +82,7 @@ def _period_index_cmp(opname, cls, nat_result=False): def wrapper(self, other): if isinstance(other, Period): - func = getattr(self._values, opname) + func = getattr(self._ndarray_values, opname) other_base, _ = _gfc(other.freq) if other.freq != self.freq: msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) @@ -94,7 +94,8 @@ def wrapper(self, other): msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - result = getattr(self._values, opname)(other._values) + op = getattr(self._ndarray_values, opname) + result = op(other._ndarray_values) mask = self._isnan | other._isnan if mask.any(): @@ -102,11 +103,11 @@ def wrapper(self, other): return result elif other is tslib.NaT: - result = np.empty(len(self._values), dtype=bool) + result = np.empty(len(self._ndarray_values), dtype=bool) result.fill(nat_result) else: other = Period(other, freq=self.freq) - func = getattr(self._values, opname) + func = getattr(self._ndarray_values, opname) result = func(other.ordinal) if self.hasnans: @@ -275,11 +276,11 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, if isinstance(data, PeriodIndex): if freq is None or freq == data.freq: # no freq change freq = data.freq - data = data._values + data = data._ndarray_values else: base1, _ = _gfc(data.freq) base2, _ = _gfc(freq) - data = period.period_asfreq_arr(data._values, + data = period.period_asfreq_arr(data._ndarray_values, base1, base2, 1) return cls._simple_new(data, name=name, freq=freq) @@ -374,7 +375,7 @@ def _shallow_copy(self, values=None, freq=None, **kwargs): if freq is None: freq = self.freq if values is None: - values = self._values + values = self._ndarray_values return super(PeriodIndex, self)._shallow_copy(values=values, freq=freq, **kwargs) @@ -407,7 +408,7 @@ def __contains__(self, key): @property def asi8(self): - return self._values.view('i8') + return self._ndarray_values.view('i8') @cache_readonly def _int64index(self): @@ -418,7 +419,8 @@ def values(self): return self.astype(object).values @property - def _values(self): + def _ndarray_values(self): + # Ordinals return self._data def __array__(self, dtype=None): @@ -475,6 +477,16 @@ def _to_embed(self, keep_tz=False, dtype=None): return self.astype(object).values + @property + def size(self): + # Avoid materializing self._values + return self._ndarray_values.size + + @property + def shape(self): + # Avoid materializing 
self._values + return self._ndarray_values.shape + @property def _formatter_func(self): return lambda x: "'%s'" % x @@ -489,13 +501,15 @@ def asof_locs(self, where, mask): if isinstance(where_idx, DatetimeIndex): where_idx = PeriodIndex(where_idx.values, freq=self.freq) - locs = self._values[mask].searchsorted(where_idx._values, side='right') + locs = self._ndarray_values[mask].searchsorted( + where_idx._ndarray_values, side='right') locs = np.where(locs > 0, locs - 1, 0) result = np.arange(len(self))[mask].take(locs) first = mask.argmax() - result[(locs == 0) & (where_idx._values < self._values[first])] = -1 + result[(locs == 0) & (where_idx._ndarray_values < + self._ndarray_values[first])] = -1 return result @@ -523,7 +537,8 @@ def searchsorted(self, value, side='left', sorter=None): elif isinstance(value, compat.string_types): value = Period(value, freq=self.freq).ordinal - return self._values.searchsorted(value, side=side, sorter=sorter) + return self._ndarray_values.searchsorted(value, side=side, + sorter=sorter) @property def is_all_dates(self): @@ -664,7 +679,7 @@ def to_timestamp(self, freq=None, how='start'): base, mult = _gfc(freq) new_data = self.asfreq(freq, how) - new_data = period.periodarr_to_dt64arr(new_data._values, base) + new_data = period.periodarr_to_dt64arr(new_data._ndarray_values, base) return DatetimeIndex(new_data, freq='infer', name=self.name) def _maybe_convert_timedelta(self, other): @@ -744,7 +759,7 @@ def shift(self, n): ------- shifted : PeriodIndex """ - values = self._values + n * self.freq.n + values = self._ndarray_values + n * self.freq.n if self.hasnans: values[self._isnan] = tslib.iNaT return self._shallow_copy(values=values) @@ -775,7 +790,7 @@ def get_value(self, series, key): grp = resolution.Resolution.get_freq_group(reso) freqn = resolution.get_freq_group(self.freq) - vals = self._values + vals = self._ndarray_values # if our data is higher resolution than requested key, slice if grp < freqn: @@ -786,7 +801,7 @@ def get_value(self, series, key): if ord2 < vals[0] or ord1 > vals[-1]: raise KeyError(key) - pos = np.searchsorted(self._values, [ord1, ord2]) + pos = np.searchsorted(self._ndarray_values, [ord1, ord2]) key = slice(pos[0], pos[1] + 1) return series[key] elif grp == freqn: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index a82ee6b2b44af..0ed92a67c7e14 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -550,7 +550,7 @@ def __getitem__(self, key): return super_getitem(key) def __floordiv__(self, other): - if is_integer(other): + if is_integer(other) and other != 0: if (len(self) == 0 or self._start % other == 0 and self._step % other == 0): @@ -592,14 +592,15 @@ def _evaluate_numeric_binop(self, other): attrs = self._get_attributes_dict() attrs = self._maybe_update_attributes(attrs) + left, right = self, other if reversed: - self, other = other, self + left, right = right, left try: # apply if we have an override if step: with np.errstate(all='ignore'): - rstep = step(self._step, other) + rstep = step(left._step, right) # we don't have a representable op # so return a base index @@ -607,11 +608,11 @@ def _evaluate_numeric_binop(self, other): raise ValueError else: - rstep = self._step + rstep = left._step with np.errstate(all='ignore'): - rstart = op(self._start, other) - rstop = op(self._stop, other) + rstart = op(left._start, right) + rstop = op(left._stop, right) result = RangeIndex(rstart, rstop, @@ -627,18 +628,12 @@ def _evaluate_numeric_binop(self, other): return result - 
except (ValueError, TypeError, AttributeError): - pass - - # convert to Int64Index ops - if isinstance(self, RangeIndex): - self = self.values - if isinstance(other, RangeIndex): - other = other.values - - with np.errstate(all='ignore'): - results = op(self, other) - return Index(results, **attrs) + except (ValueError, TypeError, AttributeError, + ZeroDivisionError): + # Defer to Int64Index implementation + if reversed: + return op(other, self._int64index) + return op(self._int64index, other) return _evaluate_numeric_binop diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9463512ac11de..352ce921d1d44 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1300,6 +1300,9 @@ class _IXIndexer(_NDFrameIndexer): """A primarily label-location based indexer, with integer position fallback. + Warning: Starting in 0.20.0, the .ix indexer is deprecated, in + favor of the more strict .iloc and .loc indexers. + ``.ix[]`` supports mixed integer and label based access. It is primarily label based, but will fall back to integer positional access unless the corresponding axis is of integer type. diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 2eccc5777bca6..31c489e2f8941 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -1,6 +1,7 @@ """ Routines for filling missing data """ +import operator import numpy as np from distutils.version import LooseVersion @@ -650,6 +651,87 @@ def fill_zeros(result, x, y, name, fill): return result +def mask_zero_div_zero(x, y, result, copy=False): + """ + Set results of 0 / 0 or 0 // 0 to np.nan, regardless of the dtypes + of the numerator or the denominator. + + Parameters + ---------- + x : ndarray + y : ndarray + result : ndarray + copy : bool (default False) + Whether to always create a new array or try to fill in the existing + array if possible. + + Returns + ------- + filled_result : ndarray + + Examples + -------- + >>> x = np.array([1, 0, -1], dtype=np.int64) + >>> y = 0 # int 0; numpy behavior is different with float + >>> result = x / y + >>> result # raw numpy result does not fill division by zero + array([0, 0, 0]) + >>> mask_zero_div_zero(x, y, result) + array([ inf, nan, -inf]) + """ + if is_scalar(y): + y = np.array(y) + + zmask = y == 0 + if zmask.any(): + shape = result.shape + + nan_mask = (zmask & (x == 0)).ravel() + neginf_mask = (zmask & (x < 0)).ravel() + posinf_mask = (zmask & (x > 0)).ravel() + + if nan_mask.any() or neginf_mask.any() or posinf_mask.any(): + # Fill negative/0 with -inf, positive/0 with +inf, 0/0 with NaN + result = result.astype('float64', copy=copy).ravel() + + np.putmask(result, nan_mask, np.nan) + np.putmask(result, posinf_mask, np.inf) + np.putmask(result, neginf_mask, -np.inf) + + result = result.reshape(shape) + + return result + + +def dispatch_missing(op, left, right, result): + """ + Fill nulls caused by division by zero, casting to a diffferent dtype + if necessary. + + Parameters + ---------- + op : function (operator.add, operator.div, ...) 
+ left : object (Index for non-reversed ops) + right : object (Index fof reversed ops) + result : ndarray + + Returns + ------- + result : ndarray + """ + opstr = '__{opname}__'.format(opname=op.__name__).replace('____', '__') + if op in [operator.truediv, operator.floordiv, + getattr(operator, 'div', None)]: + result = mask_zero_div_zero(left, right, result) + elif op is operator.mod: + result = fill_zeros(result, left, right, opstr, np.nan) + elif op is divmod: + res0 = mask_zero_div_zero(left, right, result[0]) + res1 = fill_zeros(result[1], left, right, opstr, np.nan) + result = (res0, res1) + return result + + def _interp_limit(invalid, fw_limit, bw_limit): """ Get indexers of values that won't be filled diff --git a/pandas/core/ops.py b/pandas/core/ops.py index b9b8cc33a148e..d60e078b54018 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -397,6 +397,79 @@ def _make_flex_doc(op_name, typ): return doc +# ----------------------------------------------------------------------------- +# Masking NA values and fallbacks for operations numpy does not support + +def fill_binop(left, right, fill_value): + """ + If a non-None fill_value is given, replace null entries in left and right + with this value, but only in positions where _one_ of left/right is null, + not both. + + Parameters + ---------- + left : array-like + right : array-like + fill_value : object + + Returns + ------- + left : array-like + right : array-like + + Notes + ----- + Makes copies if fill_value is not None + """ + # TODO: can we make a no-copy implementation? + if fill_value is not None: + left_mask = isna(left) + right_mask = isna(right) + left = left.copy() + right = right.copy() + + # one but not both + mask = left_mask ^ right_mask + left[left_mask & mask] = fill_value + right[right_mask & mask] = fill_value + return left, right + + +def mask_cmp_op(x, y, op, allowed_types): + """ + Apply the function `op` to only non-null points in x and y. + + Parameters + ---------- + x : array-like + y : array-like + op : binary operation + allowed_types : class or tuple of classes + + Returns + ------- + result : ndarray[bool] + """ + # TODO: Can we make the allowed_types arg unnecessary? 
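# Illustrative note (added for this write-up, not part of the patch): mask_cmp_op
# applies `op` only at positions where the inputs are non-null and fills the
# remaining slots with False (or True when op is operator.ne). Assuming, say,
#     x = np.array([[1.0, np.nan], [2.0, 3.0]]); y = 2; op = operator.gt
# the helper would produce [[False, False], [False, True]] after reshaping to x.shape.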
+ xrav = x.ravel() + result = np.empty(x.size, dtype=bool) + if isinstance(y, allowed_types): + yrav = y.ravel() + mask = notna(xrav) & notna(yrav) + result[mask] = op(np.array(list(xrav[mask])), + np.array(list(yrav[mask]))) + else: + mask = notna(xrav) + result[mask] = op(np.array(list(xrav[mask])), y) + + if op == operator.ne: # pragma: no cover + np.putmask(result, ~mask, True) + else: + np.putmask(result, ~mask, False) + result = result.reshape(x.shape) + return result + + # ----------------------------------------------------------------------------- # Functions that add arithmetic methods to objects, given arithmetic factory # methods @@ -1118,12 +1191,13 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): if isinstance(other, ABCDataFrame): # Another DataFrame return self._combine_frame(other, na_op, fill_value, level) elif isinstance(other, ABCSeries): - return self._combine_series(other, na_op, fill_value, axis, level) + return self._combine_series(other, na_op, fill_value, axis, level, + try_cast=True) else: if fill_value is not None: self = self.fillna(fill_value) - return self._combine_const(other, na_op) + return self._combine_const(other, na_op, try_cast=True) f.__name__ = name @@ -1138,23 +1212,7 @@ def na_op(x, y): with np.errstate(invalid='ignore'): result = op(x, y) except TypeError: - xrav = x.ravel() - result = np.empty(x.size, dtype=bool) - if isinstance(y, (np.ndarray, ABCSeries)): - yrav = y.ravel() - mask = notna(xrav) & notna(yrav) - result[mask] = op(np.array(list(xrav[mask])), - np.array(list(yrav[mask]))) - else: - mask = notna(xrav) - result[mask] = op(np.array(list(xrav[mask])), y) - - if op == operator.ne: # pragma: no cover - np.putmask(result, ~mask, True) - else: - np.putmask(result, ~mask, False) - result = result.reshape(x.shape) - + result = mask_cmp_op(x, y, op, (np.ndarray, ABCSeries)) return result @Appender('Wrapper for flexible comparison methods {name}' @@ -1184,7 +1242,8 @@ def f(self, other): if isinstance(other, ABCDataFrame): # Another DataFrame return self._compare_frame(other, func, str_rep) elif isinstance(other, ABCSeries): - return self._combine_series_infer(other, func, try_cast=False) + return self._combine_series(other, func, + axis=None, try_cast=False) else: # straight boolean comparisons we want to allow all columns @@ -1231,23 +1290,7 @@ def na_op(x, y): try: result = expressions.evaluate(op, str_rep, x, y) except TypeError: - xrav = x.ravel() - result = np.empty(x.size, dtype=bool) - if isinstance(y, np.ndarray): - yrav = y.ravel() - mask = notna(xrav) & notna(yrav) - result[mask] = op(np.array(list(xrav[mask])), - np.array(list(yrav[mask]))) - else: - mask = notna(xrav) - result[mask] = op(np.array(list(xrav[mask])), y) - - if op == operator.ne: # pragma: no cover - np.putmask(result, ~mask, True) - else: - np.putmask(result, ~mask, False) - result = result.reshape(x.shape) - + result = mask_cmp_op(x, y, op, np.ndarray) return result @Appender('Wrapper for comparison method {name}'.format(name=name)) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 3ec78ce52c6e5..4b99b0407cfcc 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -12,6 +12,7 @@ from pandas import (Categorical, DataFrame, Index, MultiIndex, Timedelta) +from pandas.core.arrays.categorical import _recode_for_categories from pandas.core.frame import _merge_doc from pandas.core.dtypes.common import ( is_datetime64tz_dtype, @@ -1021,7 +1022,12 @@ def _validate_specification(self): common_cols = 
self.left.columns.intersection( self.right.columns) if len(common_cols) == 0: - raise MergeError('No common columns to perform merge on') + raise MergeError( + 'No common columns to perform merge on. ' + 'Merge options: left_on={lon}, right_on={ron}, ' + 'left_index={lidx}, right_index={ridx}' + .format(lon=self.left_on, ron=self.right_on, + lidx=self.left_index, ridx=self.right_index)) if not common_cols.is_unique: raise MergeError("Data columns not unique: {common!r}" .format(common=common_cols)) @@ -1535,8 +1541,15 @@ def _factorize_keys(lk, rk, sort=True): is_categorical_dtype(rk) and lk.is_dtype_equal(rk)): klass = libhashtable.Int64Factorizer + + if lk.categories.equals(rk.categories): + rk = rk.codes + else: + # Same categories in different orders -> recode + rk = _recode_for_categories(rk.codes, rk.categories, lk.categories) + lk = _ensure_int64(lk.codes) - rk = _ensure_int64(rk.codes) + rk = _ensure_int64(rk) elif is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk): klass = libhashtable.Int64Factorizer lk = _ensure_int64(com._values_from_object(lk)) diff --git a/pandas/core/series.py b/pandas/core/series.py index e4b8979d6393a..655eaa5373f5a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1725,19 +1725,8 @@ def _binop(self, other, func, level=None, fill_value=None): copy=False) new_index = this.index - this_vals = this.values - other_vals = other.values - - if fill_value is not None: - this_mask = isna(this_vals) - other_mask = isna(other_vals) - this_vals = this_vals.copy() - other_vals = other_vals.copy() - - # one but not both - mask = this_mask ^ other_mask - this_vals[this_mask & mask] = fill_value - other_vals[other_mask & mask] = fill_value + this_vals, other_vals = ops.fill_binop(this.values, other.values, + fill_value) with np.errstate(all='ignore'): result = func(this_vals, other_vals) diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index fa07400a0706e..3cbae717d0e07 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -8,10 +8,10 @@ import warnings import pandas as pd -from pandas.core.base import PandasObject +from pandas.core.base import PandasObject, IndexOpsMixin from pandas import compat -from pandas.compat import range +from pandas.compat import range, PYPY from pandas.compat.numpy import function as nv from pandas.core.dtypes.generic import ABCSparseSeries @@ -26,10 +26,12 @@ is_scalar, is_dtype_equal) from pandas.core.dtypes.cast import ( maybe_convert_platform, maybe_promote, - astype_nansafe, find_common_type) + astype_nansafe, find_common_type, infer_dtype_from_scalar, + construct_1d_arraylike_from_scalar) from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype import pandas._libs.sparse as splib +import pandas._libs.lib as lib from pandas._libs.sparse import SparseIndex, BlockIndex, IntIndex from pandas._libs import index as libindex import pandas.core.algorithms as algos @@ -161,9 +163,9 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', data = np.nan if not is_scalar(data): raise Exception("must only pass scalars with an index ") - values = np.empty(len(index), dtype='float64') - values.fill(data) - data = values + dtype = infer_dtype_from_scalar(data)[0] + data = construct_1d_arraylike_from_scalar( + data, len(index), dtype) if isinstance(data, ABCSparseSeries): data = data.values @@ -238,6 +240,17 @@ def kind(self): elif isinstance(self.sp_index, IntIndex): return 'integer' + @Appender(IndexOpsMixin.memory_usage.__doc__) + def 
memory_usage(self, deep=False): + values = self.sp_values + + v = values.nbytes + + if deep and is_object_dtype(self) and not PYPY: + v += lib.memory_usage_of_objects(values) + + return v + def __array_wrap__(self, out_arr, context=None): """ NumPy calls this method when ufunc is applied diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 91dc44e3f185e..19b126216db81 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -540,8 +540,7 @@ def xs(self, key, axis=0, copy=False): # ---------------------------------------------------------------------- # Arithmetic-related methods - def _combine_frame(self, other, func, fill_value=None, level=None, - try_cast=True): + def _combine_frame(self, other, func, fill_value=None, level=None): this, other = self.align(other, join='outer', level=level, copy=False) new_index, new_columns = this.index, this.columns @@ -584,12 +583,9 @@ def _combine_frame(self, other, func, fill_value=None, level=None, default_fill_value=new_fill_value ).__finalize__(self) - def _combine_match_index(self, other, func, level=None, fill_value=None, - try_cast=True): + def _combine_match_index(self, other, func, level=None): new_data = {} - if fill_value is not None: - raise NotImplementedError("'fill_value' argument is not supported") if level is not None: raise NotImplementedError("'level' argument is not supported") @@ -605,6 +601,7 @@ def _combine_match_index(self, other, func, level=None, fill_value=None, new_data[col] = func(series.values, other.values) # fill_value is a function of our operator + fill_value = None if isna(other.fill_value) or isna(self.default_fill_value): fill_value = np.nan else: @@ -615,15 +612,12 @@ def _combine_match_index(self, other, func, level=None, fill_value=None, new_data, index=new_index, columns=self.columns, default_fill_value=fill_value).__finalize__(self) - def _combine_match_columns(self, other, func, level=None, fill_value=None, - try_cast=True): + def _combine_match_columns(self, other, func, level=None, try_cast=True): # patched version of DataFrame._combine_match_columns to account for # NumPy circumventing __rsub__ with float64 types, e.g.: 3.0 - series, # where 3.0 is numpy.float64 and series is a SparseSeries. Still # possible for this to happen, which is bothersome - if fill_value is not None: - raise NotImplementedError("'fill_value' argument is not supported") if level is not None: raise NotImplementedError("'level' argument is not supported") @@ -835,7 +829,8 @@ def notna(self): return self._apply_columns(lambda x: x.notna()) notnull = notna - def apply(self, func, axis=0, broadcast=False, reduce=False): + def apply(self, func, axis=0, broadcast=None, reduce=None, + result_type=None): """ Analogous to DataFrame.apply, for SparseDataFrame @@ -848,6 +843,39 @@ def apply(self, func, axis=0, broadcast=False, reduce=False): For aggregation functions, return object of same size with values propagated + .. deprecated:: 0.23.0 + This argument will be removed in a future version, replaced + by result_type='broadcast'. + + reduce : boolean or None, default None + Try to apply reduction procedures. If the DataFrame is empty, + apply will use reduce to determine whether the result should be a + Series or a DataFrame. If reduce is None (the default), apply's + return value will be guessed by calling func an empty Series (note: + while guessing, exceptions raised by func will be ignored). 
If + reduce is True a Series will always be returned, and if False a + DataFrame will always be returned. + + .. deprecated:: 0.23.0 + This argument will be removed in a future version, replaced + by result_type='reduce'. + + result_type : {'expand', 'reduce', 'broadcast, None} + These only act when axis=1 {columns}: + + * 'expand' : list-like results will be turned into columns. + * 'reduce' : return a Series if possible rather than expanding + list-like results. This is the opposite to 'expand'. + * 'broadcast' : results will be broadcast to the original shape + of the frame, the original index & columns will be retained. + + The default behaviour (None) depends on the return value of the + applied function: list-like results will be returned as a Series + of those. However if the apply function returns a Series these + are expanded to columns. + + .. versionadded:: 0.23.0 + Returns ------- applied : Series or SparseDataFrame @@ -871,12 +899,10 @@ def apply(self, func, axis=0, broadcast=False, reduce=False): op = frame_apply(self, func=func, axis=axis, - reduce=reduce) - - if broadcast: - return op.apply_broadcast() - - return op.apply_standard() + reduce=reduce, + broadcast=broadcast, + result_type=result_type) + return op.get_result() def applymap(self, func): """ diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 12c7feb5f2b15..b1c1ede66236c 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -598,7 +598,7 @@ def _str_extract_frame(arr, pat, flags=0): dtype=object) -def str_extract(arr, pat, flags=0, expand=None): +def str_extract(arr, pat, flags=0, expand=True): r""" For each subject string in the Series, extract groups from the first match of regular expression pat. @@ -610,7 +610,7 @@ def str_extract(arr, pat, flags=0, expand=None): flags : int, default 0 (no flags) re module flags, e.g. re.IGNORECASE - expand : bool, default False + expand : bool, default True * If True, return DataFrame. * If False, return Series/Index/DataFrame. 
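A minimal sketch of what the new expand default means for callers (illustrative only, not taken from the patch; the Series below is made up):

import pandas as pd

s = pd.Series(['a1', 'b2', 'c3'])
s.str.extract(r'([ab])(\d)')                # expand=True is now the default -> DataFrame
s.str.extract(r'([ab])(\d)', expand=False)  # multiple capture groups still return a DataFrame
s.str.extract(r'(\d)', expand=False)        # a single capture group returns a Series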
@@ -676,15 +676,6 @@ def str_extract(arr, pat, flags=0, expand=None): dtype: object """ - if expand is None: - warnings.warn( - "currently extract(expand=None) " + - "means expand=False (return Index/Series/DataFrame) " + - "but in a future version of pandas this will be changed " + - "to expand=True (return DataFrame)", - FutureWarning, - stacklevel=3) - expand = False if not isinstance(expand, bool): raise ValueError("expand must be True or False") if expand: @@ -1739,7 +1730,7 @@ def translate(self, table, deletechars=None): findall = _pat_wrapper(str_findall, flags=True) @copy(str_extract) - def extract(self, pat, flags=0, expand=None): + def extract(self, pat, flags=0, expand=True): return str_extract(self, pat, flags=flags, expand=expand) @copy(str_extractall) diff --git a/pandas/io/common.py b/pandas/io/common.py index 4ba969f0abac4..e312181f08512 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -183,7 +183,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, Returns ------- - a filepath_ or buffer or S3File instance, the encoding, the compression + tuple of ({a filepath_ or buffer or S3File instance}, + encoding, str, + compression, str, + should_close, bool) """ filepath_or_buffer = _stringify_path(filepath_or_buffer) @@ -194,7 +197,8 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, # Override compression based on Content-Encoding header compression = 'gzip' reader = BytesIO(req.read()) - return reader, encoding, compression + req.close() + return reader, encoding, compression, True if is_s3_url(filepath_or_buffer): from pandas.io import s3 @@ -206,13 +210,13 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, if isinstance(filepath_or_buffer, (compat.string_types, compat.binary_type, mmap.mmap)): - return _expand_user(filepath_or_buffer), None, compression + return _expand_user(filepath_or_buffer), None, compression, False if not is_file_like(filepath_or_buffer): msg = "Invalid file path or buffer object type: {_type}" raise ValueError(msg.format(_type=type(filepath_or_buffer))) - return filepath_or_buffer, None, compression + return filepath_or_buffer, None, compression, False def file_path_to_url(path): @@ -309,6 +313,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, is_text : boolean, default True whether file/buffer is in text format (csv, json, etc.), or in binary mode (pickle, etc.) + Returns ------- f : file-like diff --git a/pandas/io/excel.py b/pandas/io/excel.py index b03987e933bff..0d3d4286f5a3c 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -381,7 +381,7 @@ def __init__(self, io, **kwds): if _is_url(self._io): io = _urlopen(self._io) elif not isinstance(self.io, (ExcelFile, xlrd.Book)): - io, _, _ = get_filepath_or_buffer(self._io) + io, _, _, _ = get_filepath_or_buffer(self._io) if engine == 'xlrd' and isinstance(io, xlrd.Book): self.book = io diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 269c81b380b5e..621641747f376 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -77,7 +77,11 @@ index_names : bool, optional Prints the names of the indexes, default True line_width : int, optional - Width to wrap a line in characters, default no wrap""" + Width to wrap a line in characters, default no wrap + table_id : str, optional + id for the
element create by to_html + + .. versionadded:: 0.23.0""" _VALID_JUSTIFY_PARAMETERS = ("left", "right", "center", "justify", "justify-all", "start", "end", "inherit", @@ -387,7 +391,8 @@ def __init__(self, frame, buf=None, columns=None, col_space=None, header=True, index=True, na_rep='NaN', formatters=None, justify=None, float_format=None, sparsify=None, index_names=True, line_width=None, max_rows=None, - max_cols=None, show_dimensions=False, decimal='.', **kwds): + max_cols=None, show_dimensions=False, decimal='.', + table_id=None, **kwds): self.frame = frame if buf is not None: self.buf = _expand_user(_stringify_path(buf)) @@ -413,6 +418,7 @@ def __init__(self, frame, buf=None, columns=None, col_space=None, self.max_rows_displayed = min(max_rows or len(self.frame), len(self.frame)) self.show_dimensions = show_dimensions + self.table_id = table_id if justify is None: self.justify = get_option("display.colheader_justify") @@ -740,7 +746,8 @@ def to_html(self, classes=None, notebook=False, border=None): max_rows=self.max_rows, max_cols=self.max_cols, notebook=notebook, - border=border) + border=border, + table_id=self.table_id) if hasattr(self.buf, 'write'): html_renderer.write_result(self.buf) elif isinstance(self.buf, compat.string_types): @@ -1082,7 +1089,7 @@ class HTMLFormatter(TableFormatter): indent_delta = 2 def __init__(self, formatter, classes=None, max_rows=None, max_cols=None, - notebook=False, border=None): + notebook=False, border=None, table_id=None): self.fmt = formatter self.classes = classes @@ -1101,6 +1108,7 @@ def __init__(self, formatter, classes=None, max_rows=None, max_cols=None, if border is None: border = get_option('display.html.border') self.border = border + self.table_id = table_id def write(self, s, indent=0): rs = pprint_thing(s) @@ -1197,6 +1205,7 @@ def write_style(self): def write_result(self, buf): indent = 0 + id_section = "" frame = self.frame _classes = ['dataframe'] # Default class. @@ -1220,8 +1229,12 @@ def write_result(self, buf): self.write(''.format(style=div_style)) self.write_style() - self.write('
<table border="{border}" class="{cls}">' - .format(border=self.border, cls=' '.join(_classes)), indent) + + if self.table_id is not None: + id_section = ' id="{table_id}"'.format(table_id=self.table_id) + self.write('<table border="{border}" class="{cls}"{id_section}>
' + .format(border=self.border, cls=' '.join(_classes), + id_section=id_section), indent) indent += self.indent_delta indent = self._write_header(indent) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 58796aa30f0bf..525f487d8aa39 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -364,7 +364,7 @@ def format(self, formatter, subset=None): >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b']) >>> df.style.format("{:.2%}") >>> df['c'] = ['a', 'b', 'c', 'd'] - >>> df.style.format({'C': str.upper}) + >>> df.style.format({'c': str.upper}) """ if subset is None: row_locs = range(len(self.data)) @@ -509,7 +509,9 @@ def _apply(self, func, axis=0, subset=None, **kwargs): subset = _non_reducing_slice(subset) data = self.data.loc[subset] if axis is not None: - result = data.apply(func, axis=axis, **kwargs) + result = data.apply(func, axis=axis, + result_type='expand', **kwargs) + result.columns = data.columns else: result = func(data, **kwargs) if not isinstance(result, pd.DataFrame): diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index b452b0cf5ddd4..f9bc6ae1a5451 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -65,7 +65,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, private_key : str (optional) Service account private key in JSON format. Can be file path or string contents. This is useful for remote server - authentication (eg. jupyter iPython notebook on remote host) + authentication (eg. Jupyter/IPython notebook on remote host) dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index e3a1321336fb3..24364fe07405e 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -404,7 +404,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, """ compression = _infer_compression(path_or_buf, compression) - filepath_or_buffer, _, compression = get_filepath_or_buffer( + filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=compression, ) @@ -419,7 +419,13 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, if chunksize: return json_reader - return json_reader.read() + result = json_reader.read() + if should_close: + try: + filepath_or_buffer.close() + except: # noqa: flake8 + pass + return result class JsonReader(BaseIterator): diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 9289853a1bbfd..d3e6f0cf4a1bc 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -180,7 +180,7 @@ def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs): obj : type of object stored in file """ - path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf) + path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf) if iterator: return Iterator(path_or_buf) @@ -188,6 +188,12 @@ def read(fh): l = list(unpack(fh, encoding=encoding, **kwargs)) if len(l) == 1: return l[0] + + if should_close: + try: + path_or_buf.close() + except: # noqa: flake8 + pass return l # see if we have an actual file diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 6e1b6e14861c3..1c22a305c089d 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -107,7 +107,7 @@ def write(self, df, path, compression='snappy', self.validate_dataframe(df) if self._pyarrow_lt_070: self._validate_write_lt_070(df) - path, _, _ = get_filepath_or_buffer(path, mode='wb') + path, _, _, _ = 
get_filepath_or_buffer(path, mode='wb') if self._pyarrow_lt_060: table = self.api.Table.from_pandas(df, timestamps_to_ms=True) @@ -121,13 +121,21 @@ def write(self, df, path, compression='snappy', coerce_timestamps=coerce_timestamps, **kwargs) def read(self, path, columns=None, **kwargs): - path, _, _ = get_filepath_or_buffer(path) + path, _, _, should_close = get_filepath_or_buffer(path) if self._pyarrow_lt_070: - return self.api.parquet.read_pandas(path, columns=columns, - **kwargs).to_pandas() - kwargs['use_pandas_metadata'] = True - return self.api.parquet.read_table(path, columns=columns, - **kwargs).to_pandas() + result = self.api.parquet.read_pandas(path, columns=columns, + **kwargs).to_pandas() + else: + kwargs['use_pandas_metadata'] = True + result = self.api.parquet.read_table(path, columns=columns, + **kwargs).to_pandas() + if should_close: + try: + path.close() + except: # noqa: flake8 + pass + + return result def _validate_write_lt_070(self, df): # Compatibility shim for pyarrow < 0.7.0 @@ -199,11 +207,11 @@ def write(self, df, path, compression='snappy', **kwargs): # path is s3:// so we need to open the s3file in 'wb' mode. # TODO: Support 'ab' - path, _, _ = get_filepath_or_buffer(path, mode='wb') + path, _, _, _ = get_filepath_or_buffer(path, mode='wb') # And pass the opened s3file to the fastparquet internal impl. kwargs['open_with'] = lambda path, _: path else: - path, _, _ = get_filepath_or_buffer(path) + path, _, _, _ = get_filepath_or_buffer(path) with catch_warnings(record=True): self.api.write(path, df, @@ -214,13 +222,13 @@ def read(self, path, columns=None, **kwargs): # When path is s3:// an S3File is returned. # We need to retain the original path(str) while also # pass the S3File().open function to fsatparquet impl. - s3, _, _ = get_filepath_or_buffer(path) + s3, _, _, should_close = get_filepath_or_buffer(path) try: parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open) finally: s3.close() else: - path, _, _ = get_filepath_or_buffer(path) + path, _, _, _ = get_filepath_or_buffer(path) parquet_file = self.api.ParquetFile(path) return parquet_file.to_pandas(columns=columns, **kwargs) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index af1441f4a0fc9..7ea6d321e0fdd 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -413,7 +413,7 @@ def _read(filepath_or_buffer, kwds): compression = kwds.get('compression') compression = _infer_compression(filepath_or_buffer, compression) - filepath_or_buffer, _, compression = get_filepath_or_buffer( + filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( filepath_or_buffer, encoding, compression) kwds['compression'] = compression @@ -439,6 +439,13 @@ def _read(filepath_or_buffer, kwds): data = parser.read(nrows) finally: parser.close() + + if should_close: + try: + filepath_or_buffer.close() + except: # noqa: flake8 + pass + return data diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index fa953f7d876cc..756096dd0c9ce 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -1,4 +1,5 @@ """ pickle compat """ +import warnings import numpy as np from numpy.lib.format import read_array, write_array @@ -96,7 +97,9 @@ def try_read(path, encoding=None): # cpickle # GH 6899 try: - return read_wrapper(lambda f: pkl.load(f)) + with warnings.catch_warnings(record=True): + # We want to silencce any warnings about, e.g. moved modules. 
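# Illustrative note (not part of the patch): catch_warnings(record=True) diverts any
# warnings raised while unpickling into an in-memory list instead of emitting them,
# and the original warning filters are restored when the block exits, so the load
# runs silently without permanently changing warning state.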
+ return read_wrapper(lambda f: pkl.load(f)) except Exception: # reg/patched pickle try: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 0d833807602e1..2437b7d396e84 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4430,7 +4430,7 @@ def _convert_index(index, encoding=None, format_type=None): elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() # avoid to store ndarray of Period objects - return IndexCol(index._values, 'integer', atom, + return IndexCol(index._ndarray_values, 'integer', atom, freq=getattr(index, 'freq', None), index_name=index_name) diff --git a/pandas/io/s3.py b/pandas/io/s3.py index e2650e29c0db3..bd2286c5c8569 100644 --- a/pandas/io/s3.py +++ b/pandas/io/s3.py @@ -27,7 +27,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, fs = s3fs.S3FileSystem(anon=False) try: filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode) - except (OSError, NoCredentialsError): + except (compat.FileNotFoundError, NoCredentialsError): # boto3 has troubles when trying to access a public file # when credentialed... # An OSError is raised if you have credentials, but they @@ -36,4 +36,4 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, # for that bucket. fs = s3fs.S3FileSystem(anon=True) filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode) - return filepath_or_buffer, None, compression + return filepath_or_buffer, None, compression, True diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 26e39f0df8b29..806cbddaa2ee2 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -90,7 +90,7 @@ def __init__(self, path_or_buf, index=None, convert_dates=True, self._current_row_on_page_index = 0 self._current_row_in_file_index = 0 - self._path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf) + self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf) if isinstance(self._path_or_buf, compat.string_types): self._path_or_buf = open(self._path_or_buf, 'rb') self.handle = self._path_or_buf diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index c14524f7d7cd6..7994517b9f303 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -236,7 +236,8 @@ def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1', self._chunksize = chunksize if isinstance(filepath_or_buffer, str): - filepath_or_buffer, encoding, compression = get_filepath_or_buffer( + (filepath_or_buffer, encoding, + compression, should_close) = get_filepath_or_buffer( filepath_or_buffer, encoding=encoding) if isinstance(filepath_or_buffer, (str, compat.text_type, bytes)): diff --git a/pandas/io/stata.py b/pandas/io/stata.py index adbff06364dbe..9646831cb612c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -988,7 +988,7 @@ def __init__(self, path_or_buf, convert_dates=True, self._native_byteorder = _set_endianness(sys.byteorder) path_or_buf = _stringify_path(path_or_buf) if isinstance(path_or_buf, str): - path_or_buf, encoding, _ = get_filepath_or_buffer( + path_or_buf, encoding, _, should_close = get_filepath_or_buffer( path_or_buf, encoding=self._default_encoding ) @@ -1341,12 +1341,14 @@ def _null_terminate(self, s): return s def _read_value_labels(self): - if self.format_version <= 108: - # Value labels are not supported in version 108 and earlier. - return if self._value_labels_read: # Don't read twice return + if self.format_version <= 108: + # Value labels are not supported in version 108 and earlier. 
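# Illustrative note (not part of the patch): marking the labels as read and giving
# value_label_dict an empty dict up front means later callers can rely on the
# attribute existing even for format 108 and earlier files, which carry no value labels.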
+ self._value_labels_read = True + self.value_label_dict = dict() + return if self.format_version >= 117: self.path_or_buf.seek(self.seek_value_labels) diff --git a/pandas/plotting/_converter.py b/pandas/plotting/_converter.py index 07163615c6ba4..9ca06475290e4 100644 --- a/pandas/plotting/_converter.py +++ b/pandas/plotting/_converter.py @@ -249,11 +249,11 @@ def _convert_1d(values, units, axis): is_float(values)): return get_datevalue(values, axis.freq) if isinstance(values, PeriodIndex): - return values.asfreq(axis.freq)._values + return values.asfreq(axis.freq)._ndarray_values if isinstance(values, Index): return values.map(lambda x: get_datevalue(x, axis.freq)) if is_period_arraylike(values): - return PeriodIndex(values, freq=axis.freq)._values + return PeriodIndex(values, freq=axis.freq)._ndarray_values if isinstance(values, (list, tuple, np.ndarray, Index)): return [get_datevalue(x, axis.freq) for x in values] return values @@ -642,7 +642,7 @@ def _daily_finder(vmin, vmax, freq): info = np.zeros(span, dtype=[('val', np.int64), ('maj', bool), ('min', bool), ('fmt', '|S20')]) - info['val'][:] = dates_._values + info['val'][:] = dates_._ndarray_values info['fmt'][:] = '' info['maj'][[0, -1]] = True # .. and set some shortcuts diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 9c3572f9ffe72..07ba0b681418e 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -542,66 +542,42 @@ def test_frame_pos(self): # float lhs = DataFrame(randn(5, 2)) - if self.engine == 'python': - with pytest.raises(TypeError): - result = pd.eval(expr, engine=self.engine, parser=self.parser) - else: - expect = lhs - result = pd.eval(expr, engine=self.engine, parser=self.parser) - assert_frame_equal(expect, result) + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) # int lhs = DataFrame(randint(5, size=(5, 2))) - if self.engine == 'python': - with pytest.raises(TypeError): - result = pd.eval(expr, engine=self.engine, parser=self.parser) - else: - expect = lhs - result = pd.eval(expr, engine=self.engine, parser=self.parser) - assert_frame_equal(expect, result) + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) # bool doesn't work with numexpr but works elsewhere lhs = DataFrame(rand(5, 2) > 0.5) - if self.engine == 'python': - with pytest.raises(TypeError): - result = pd.eval(expr, engine=self.engine, parser=self.parser) - else: - expect = lhs - result = pd.eval(expr, engine=self.engine, parser=self.parser) - assert_frame_equal(expect, result) + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) def test_series_pos(self): expr = self.ex('+') # float lhs = Series(randn(5)) - if self.engine == 'python': - with pytest.raises(TypeError): - result = pd.eval(expr, engine=self.engine, parser=self.parser) - else: - expect = lhs - result = pd.eval(expr, engine=self.engine, parser=self.parser) - assert_series_equal(expect, result) + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) # int lhs = Series(randint(5, size=5)) - if self.engine == 'python': - with pytest.raises(TypeError): - result = pd.eval(expr, engine=self.engine, parser=self.parser) - else: - expect = lhs - result = pd.eval(expr, engine=self.engine, parser=self.parser) - assert_series_equal(expect, result) + expect = 
lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) # bool doesn't work with numexpr but works elsewhere lhs = Series(rand(5) > 0.5) - if self.engine == 'python': - with pytest.raises(TypeError): - result = pd.eval(expr, engine=self.engine, parser=self.parser) - else: - expect = lhs - result = pd.eval(expr, engine=self.engine, parser=self.parser) - assert_series_equal(expect, result) + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) def test_scalar_unary(self): with pytest.raises(TypeError): diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index eca4dd4cf2106..cc833af03ae66 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -10,14 +10,12 @@ Series, Categorical, CategoricalIndex, IntervalIndex, date_range) from pandas.compat import string_types -from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, PeriodDtype, - IntervalDtype, CategoricalDtype, ExtensionDtype) + IntervalDtype, CategoricalDtype) from pandas.core.dtypes.common import ( is_categorical_dtype, is_categorical, is_datetime64tz_dtype, is_datetimetz, - is_extension_array_dtype, is_period_dtype, is_period, is_dtype_equal, is_datetime64_ns_dtype, is_datetime64_dtype, is_interval_dtype, @@ -26,6 +24,11 @@ import pandas.util.testing as tm +@pytest.fixture(params=[True, False, None]) +def ordered(request): + return request.param + + class Base(object): def setup_method(self, method): @@ -126,41 +129,6 @@ def test_tuple_categories(self): result = CategoricalDtype(categories) assert all(result.categories == categories) - @pytest.mark.parametrize('dtype', [ - CategoricalDtype(list('abc'), False), - CategoricalDtype(list('abc'), True)]) - @pytest.mark.parametrize('new_dtype', [ - 'category', - CategoricalDtype(None, False), - CategoricalDtype(None, True), - CategoricalDtype(list('abc'), False), - CategoricalDtype(list('abc'), True), - CategoricalDtype(list('cba'), False), - CategoricalDtype(list('cba'), True), - CategoricalDtype(list('wxyz'), False), - CategoricalDtype(list('wxyz'), True)]) - def test_update_dtype(self, dtype, new_dtype): - if isinstance(new_dtype, string_types) and new_dtype == 'category': - expected_categories = dtype.categories - expected_ordered = dtype.ordered - else: - expected_categories = new_dtype.categories - if expected_categories is None: - expected_categories = dtype.categories - expected_ordered = new_dtype.ordered - - result = dtype._update_dtype(new_dtype) - tm.assert_index_equal(result.categories, expected_categories) - assert result.ordered is expected_ordered - - @pytest.mark.parametrize('bad_dtype', [ - 'foo', object, np.int64, PeriodDtype('Q')]) - def test_update_dtype_errors(self, bad_dtype): - dtype = CategoricalDtype(list('abc'), False) - msg = 'a CategoricalDtype must be passed to perform an update, ' - with tm.assert_raises_regex(ValueError, msg): - dtype._update_dtype(bad_dtype) - class TestDatetimeTZDtype(Base): @@ -611,17 +579,12 @@ def test_caching(self): class TestCategoricalDtypeParametrized(object): - @pytest.mark.parametrize('categories, ordered', [ - (['a', 'b', 'c', 'd'], False), - (['a', 'b', 'c', 'd'], True), - (np.arange(1000), False), - (np.arange(1000), True), - (['a', 'b', 10, 2, 1.3, True], False), - ([True, False], True), - ([True, False], False), - (pd.date_range('2017', periods=4), True), - (pd.date_range('2017', periods=4), False), - 
]) + @pytest.mark.parametrize('categories', [ + list('abcd'), + np.arange(1000), + ['a', 'b', 10, 2, 1.3, True], + [True, False], + pd.date_range('2017', periods=4)]) def test_basic(self, categories, ordered): c1 = CategoricalDtype(categories, ordered=ordered) tm.assert_index_equal(c1.categories, pd.Index(categories)) @@ -629,21 +592,24 @@ def test_basic(self, categories, ordered): def test_order_matters(self): categories = ['a', 'b'] - c1 = CategoricalDtype(categories, ordered=False) - c2 = CategoricalDtype(categories, ordered=True) + c1 = CategoricalDtype(categories, ordered=True) + c2 = CategoricalDtype(categories, ordered=False) + c3 = CategoricalDtype(categories, ordered=None) assert c1 is not c2 + assert c1 is not c3 - def test_unordered_same(self): - c1 = CategoricalDtype(['a', 'b']) - c2 = CategoricalDtype(['b', 'a']) + @pytest.mark.parametrize('ordered', [False, None]) + def test_unordered_same(self, ordered): + c1 = CategoricalDtype(['a', 'b'], ordered=ordered) + c2 = CategoricalDtype(['b', 'a'], ordered=ordered) assert hash(c1) == hash(c2) def test_categories(self): result = CategoricalDtype(['a', 'b', 'c']) tm.assert_index_equal(result.categories, pd.Index(['a', 'b', 'c'])) - assert result.ordered is False + assert result.ordered is None - def test_equal_but_different(self): + def test_equal_but_different(self, ordered): c1 = CategoricalDtype([1, 2, 3]) c2 = CategoricalDtype([1., 2., 3.]) assert c1 is not c2 @@ -654,9 +620,11 @@ def test_equal_but_different(self): ([1, 2, 3], [3, 2, 1]), ]) def test_order_hashes_different(self, v1, v2): - c1 = CategoricalDtype(v1) + c1 = CategoricalDtype(v1, ordered=False) c2 = CategoricalDtype(v2, ordered=True) + c3 = CategoricalDtype(v1, ordered=None) assert c1 is not c2 + assert c1 is not c3 def test_nan_invalid(self): with pytest.raises(ValueError): @@ -671,26 +639,46 @@ def test_same_categories_different_order(self): c2 = CategoricalDtype(['b', 'a'], ordered=True) assert c1 is not c2 - @pytest.mark.parametrize('ordered, other, expected', [ - (True, CategoricalDtype(['a', 'b'], True), True), - (False, CategoricalDtype(['a', 'b'], False), True), - (True, CategoricalDtype(['a', 'b'], False), False), - (False, CategoricalDtype(['a', 'b'], True), False), - (True, CategoricalDtype([1, 2], False), False), - (False, CategoricalDtype([1, 2], True), False), - (False, CategoricalDtype(None, True), True), - (True, CategoricalDtype(None, True), True), - (False, CategoricalDtype(None, False), True), - (True, CategoricalDtype(None, False), True), - (True, 'category', True), - (False, 'category', True), - (True, 'not a category', False), - (False, 'not a category', False), - ]) - def test_categorical_equality(self, ordered, other, expected): - c1 = CategoricalDtype(['a', 'b'], ordered) + @pytest.mark.parametrize('ordered1', [True, False, None]) + @pytest.mark.parametrize('ordered2', [True, False, None]) + def test_categorical_equality(self, ordered1, ordered2): + # same categories, same order + # any combination of None/False are equal + # True/True is the only combination with True that are equal + c1 = CategoricalDtype(list('abc'), ordered1) + c2 = CategoricalDtype(list('abc'), ordered2) + result = c1 == c2 + expected = bool(ordered1) is bool(ordered2) + assert result is expected + + # same categories, different order + # any combination of None/False are equal (order doesn't matter) + # any combination with True are not equal (different order of cats) + c1 = CategoricalDtype(list('abc'), ordered1) + c2 = CategoricalDtype(list('cab'), ordered2) + result 
= c1 == c2 + expected = (bool(ordered1) is False) and (bool(ordered2) is False) + assert result is expected + + # different categories + c2 = CategoricalDtype([1, 2, 3], ordered2) + assert c1 != c2 + + # none categories + c1 = CategoricalDtype(list('abc'), ordered1) + c2 = CategoricalDtype(None, ordered2) + c3 = CategoricalDtype(None, ordered1) + assert c1 == c2 + assert c2 == c1 + assert c2 == c3 + + @pytest.mark.parametrize('categories', [list('abc'), None]) + @pytest.mark.parametrize('other', ['category', 'not a category']) + def test_categorical_equality_strings(self, categories, ordered, other): + c1 = CategoricalDtype(categories, ordered) result = c1 == other - assert result == expected + expected = other == 'category' + assert result is expected def test_invalid_raises(self): with tm.assert_raises_regex(TypeError, 'ordered'): @@ -731,12 +719,12 @@ def test_from_categorical_dtype_both(self): c1, categories=[1, 2], ordered=False) assert result == CategoricalDtype([1, 2], ordered=False) - def test_str_vs_repr(self): - c1 = CategoricalDtype(['a', 'b']) + def test_str_vs_repr(self, ordered): + c1 = CategoricalDtype(['a', 'b'], ordered=ordered) assert str(c1) == 'category' # Py2 will have unicode prefixes - pat = r"CategoricalDtype\(categories=\[.*\], ordered=False\)" - assert re.match(pat, repr(c1)) + pat = r"CategoricalDtype\(categories=\[.*\], ordered={ordered}\)" + assert re.match(pat.format(ordered=ordered), repr(c1)) def test_categorical_categories(self): # GH17884 @@ -745,30 +733,37 @@ def test_categorical_categories(self): c1 = CategoricalDtype(CategoricalIndex(['a', 'b'])) tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) + @pytest.mark.parametrize('new_categories', [ + list('abc'), list('cba'), list('wxyz'), None]) + @pytest.mark.parametrize('new_ordered', [True, False, None]) + def test_update_dtype(self, ordered, new_categories, new_ordered): + dtype = CategoricalDtype(list('abc'), ordered) + new_dtype = CategoricalDtype(new_categories, new_ordered) -class DummyArray(ExtensionArray): - pass - - -class DummyDtype(ExtensionDtype): - pass + expected_categories = new_dtype.categories + if expected_categories is None: + expected_categories = dtype.categories + expected_ordered = new_dtype.ordered + if expected_ordered is None: + expected_ordered = dtype.ordered -class TestExtensionArrayDtype(object): + result = dtype.update_dtype(new_dtype) + tm.assert_index_equal(result.categories, expected_categories) + assert result.ordered is expected_ordered - @pytest.mark.parametrize('values', [ - pd.Categorical([]), - pd.Categorical([]).dtype, - pd.Series(pd.Categorical([])), - DummyDtype(), - DummyArray(), - ]) - def test_is_extension_array_dtype(self, values): - assert is_extension_array_dtype(values) + def test_update_dtype_string(self, ordered): + dtype = CategoricalDtype(list('abc'), ordered) + expected_categories = dtype.categories + expected_ordered = dtype.ordered + result = dtype.update_dtype('category') + tm.assert_index_equal(result.categories, expected_categories) + assert result.ordered is expected_ordered - @pytest.mark.parametrize('values', [ - np.array([]), - pd.Series(np.array([])), - ]) - def test_is_not_extension_array_dtype(self, values): - assert not is_extension_array_dtype(values) + @pytest.mark.parametrize('bad_dtype', [ + 'foo', object, np.int64, PeriodDtype('Q')]) + def test_update_dtype_errors(self, bad_dtype): + dtype = CategoricalDtype(list('abc'), False) + msg = 'a CategoricalDtype must be passed to perform an update, ' + with 
tm.assert_raises_regex(ValueError, msg): + dtype.update_dtype(bad_dtype) diff --git a/pandas/tests/extension/__init__.py b/pandas/tests/extension/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py new file mode 100644 index 0000000000000..1f4582f687415 --- /dev/null +++ b/pandas/tests/extension/test_common.py @@ -0,0 +1,67 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas.util.testing as tm +from pandas.core.arrays import ExtensionArray +from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.dtypes import ExtensionDtype + + +class DummyDtype(ExtensionDtype): + pass + + +class DummyArray(ExtensionArray): + + def __init__(self, data): + self.data = data + + def __array__(self, dtype): + return self.data + + @property + def dtype(self): + return self.data.dtype + + +class TestExtensionArrayDtype(object): + + @pytest.mark.parametrize('values', [ + pd.Categorical([]), + pd.Categorical([]).dtype, + pd.Series(pd.Categorical([])), + DummyDtype(), + DummyArray(np.array([1, 2])), + ]) + def test_is_extension_array_dtype(self, values): + assert is_extension_array_dtype(values) + + @pytest.mark.parametrize('values', [ + np.array([]), + pd.Series(np.array([])), + ]) + def test_is_not_extension_array_dtype(self, values): + assert not is_extension_array_dtype(values) + + +def test_astype(): + + arr = DummyArray(np.array([1, 2, 3])) + expected = np.array([1, 2, 3], dtype=object) + + result = arr.astype(object) + tm.assert_numpy_array_equal(result, expected) + + result = arr.astype('object') + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_no_copy(): + arr = DummyArray(np.array([1, 2, 3], dtype=np.int64)) + result = arr.astype(arr.dtype, copy=False) + + assert arr.data is result + + result = arr.astype(arr.dtype) + assert arr.data is not result diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index e0fc6c470fe57..d1ad9f71e6350 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -82,24 +82,30 @@ def test_apply_empty(self): rs = xp.apply(lambda x: x['a'], axis=1) assert_frame_equal(xp, rs) + def test_apply_with_reduce_empty(self): # reduce with an empty DataFrame x = [] - result = self.empty.apply(x.append, axis=1, reduce=False) + result = self.empty.apply(x.append, axis=1, result_type='expand') assert_frame_equal(result, self.empty) - result = self.empty.apply(x.append, axis=1, reduce=True) + result = self.empty.apply(x.append, axis=1, result_type='reduce') assert_series_equal(result, Series( [], index=pd.Index([], dtype=object))) empty_with_cols = DataFrame(columns=['a', 'b', 'c']) - result = empty_with_cols.apply(x.append, axis=1, reduce=False) + result = empty_with_cols.apply(x.append, axis=1, result_type='expand') assert_frame_equal(result, empty_with_cols) - result = empty_with_cols.apply(x.append, axis=1, reduce=True) + result = empty_with_cols.apply(x.append, axis=1, result_type='reduce') assert_series_equal(result, Series( [], index=pd.Index([], dtype=object))) # Ensure that x.append hasn't been called assert x == [] + def test_apply_deprecate_reduce(self): + with warnings.catch_warnings(record=True): + x = [] + self.empty.apply(x.append, axis=1, result_type='reduce') + def test_apply_standard_nonunique(self): df = DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c']) @@ -121,17 +127,79 @@ def test_with_string_args(self): expected = 
getattr(self.frame, arg)(axis=1) tm.assert_series_equal(result, expected) + def test_apply_broadcast_deprecated(self): + with tm.assert_produces_warning(FutureWarning): + self.frame.apply(np.mean, broadcast=True) + def test_apply_broadcast(self): - broadcasted = self.frame.apply(np.mean, broadcast=True) - agged = self.frame.apply(np.mean) - for col, ts in compat.iteritems(broadcasted): - assert (ts == agged[col]).all() + # scalars + result = self.frame.apply(np.mean, result_type='broadcast') + expected = DataFrame([self.frame.mean()], index=self.frame.index) + tm.assert_frame_equal(result, expected) + + result = self.frame.apply(np.mean, axis=1, result_type='broadcast') + m = self.frame.mean(axis=1) + expected = DataFrame({c: m for c in self.frame.columns}) + tm.assert_frame_equal(result, expected) + + # lists + result = self.frame.apply( + lambda x: list(range(len(self.frame.columns))), + axis=1, + result_type='broadcast') + m = list(range(len(self.frame.columns))) + expected = DataFrame([m] * len(self.frame.index), + dtype='float64', + index=self.frame.index, + columns=self.frame.columns) + tm.assert_frame_equal(result, expected) + + result = self.frame.apply(lambda x: list(range(len(self.frame.index))), + result_type='broadcast') + m = list(range(len(self.frame.index))) + expected = DataFrame({c: m for c in self.frame.columns}, + dtype='float64', + index=self.frame.index) + tm.assert_frame_equal(result, expected) + + # preserve columns + df = DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, + columns=list('ABC')) + result = df.apply(lambda x: [1, 2, 3], + axis=1, + result_type='broadcast') + tm.assert_frame_equal(result, df) + + df = DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, + columns=list('ABC')) + result = df.apply(lambda x: Series([1, 2, 3], index=list('abc')), + axis=1, + result_type='broadcast') + expected = df.copy() + tm.assert_frame_equal(result, expected) - broadcasted = self.frame.apply(np.mean, axis=1, broadcast=True) - agged = self.frame.apply(np.mean, axis=1) - for idx in broadcasted.index: - assert (broadcasted.xs(idx) == agged[idx]).all() + def test_apply_broadcast_error(self): + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + # > 1 ndim + with pytest.raises(ValueError): + df.apply(lambda x: np.array([1, 2]).reshape(-1, 2), + axis=1, + result_type='broadcast') + + # cannot broadcast + with pytest.raises(ValueError): + df.apply(lambda x: [1, 2], + axis=1, + result_type='broadcast') + + with pytest.raises(ValueError): + df.apply(lambda x: Series([1, 2]), + axis=1, + result_type='broadcast') def test_apply_raw(self): result0 = self.frame.apply(np.mean, raw=True) @@ -208,7 +276,7 @@ def _checkit(axis=0, raw=False): _check(no_index, lambda x: x) _check(no_index, lambda x: x.mean()) - result = no_cols.apply(lambda x: x.mean(), broadcast=True) + result = no_cols.apply(lambda x: x.mean(), result_type='broadcast') assert isinstance(result, DataFrame) def test_apply_with_args_kwds(self): @@ -350,33 +418,37 @@ def test_apply_attach_name(self): result = self.frame.apply(lambda x: np.repeat(x.name, len(x)), axis=1) - expected = DataFrame(np.tile(self.frame.index, - (len(self.frame.columns), 1)).T, - index=self.frame.index, - columns=self.frame.columns) - assert_frame_equal(result, expected) + expected = Series(np.repeat(t[0], len(self.frame.columns)) + for t in self.frame.itertuples()) + expected.index = self.frame.index + assert_series_equal(result, expected) def test_apply_multi_index(self): - s = 
DataFrame([[1, 2], [3, 4], [5, 6]]) - s.index = MultiIndex.from_arrays([['a', 'a', 'b'], ['c', 'd', 'd']]) - s.columns = ['col1', 'col2'] - res = s.apply(lambda x: Series({'min': min(x), 'max': max(x)}), 1) - assert isinstance(res.index, MultiIndex) + index = MultiIndex.from_arrays([['a', 'a', 'b'], ['c', 'd', 'd']]) + s = DataFrame([[1, 2], [3, 4], [5, 6]], + index=index, + columns=['col1', 'col2']) + result = s.apply( + lambda x: Series({'min': min(x), 'max': max(x)}), 1) + expected = DataFrame([[1, 2], [3, 4], [5, 6]], + index=index, + columns=['min', 'max']) + assert_frame_equal(result, expected, check_like=True) def test_apply_dict(self): # GH 8735 A = DataFrame([['foo', 'bar'], ['spam', 'eggs']]) - A_dicts = pd.Series([dict([(0, 'foo'), (1, 'spam')]), - dict([(0, 'bar'), (1, 'eggs')])]) + A_dicts = Series([dict([(0, 'foo'), (1, 'spam')]), + dict([(0, 'bar'), (1, 'eggs')])]) B = DataFrame([[0, 1], [2, 3]]) - B_dicts = pd.Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])]) + B_dicts = Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])]) fn = lambda x: x.to_dict() for df, dicts in [(A, A_dicts), (B, B_dicts)]: - reduce_true = df.apply(fn, reduce=True) - reduce_false = df.apply(fn, reduce=False) - reduce_none = df.apply(fn, reduce=None) + reduce_true = df.apply(fn, result_type='reduce') + reduce_false = df.apply(fn, result_type='expand') + reduce_none = df.apply(fn) assert_series_equal(reduce_true, dicts) assert_frame_equal(reduce_false, df) @@ -465,8 +537,8 @@ def test_frame_apply_dont_convert_datetime64(self): assert df.x1.dtype == 'M8[ns]' - # See gh-12244 def test_apply_non_numpy_dtype(self): + # See gh-12244 df = DataFrame({'dt': pd.date_range( "2015-01-01", periods=3, tz='Europe/Brussels')}) result = df.apply(lambda x: x) @@ -482,6 +554,256 @@ def test_apply_non_numpy_dtype(self): assert_frame_equal(result, df) +class TestInferOutputShape(object): + # the user has supplied an opaque UDF where + # they are transforming the input that requires + # us to infer the output + + def test_infer_row_shape(self): + # gh-17437 + # if row shape is changing, infer it + df = pd.DataFrame(np.random.rand(10, 2)) + result = df.apply(np.fft.fft, axis=0) + assert result.shape == (10, 2) + + result = df.apply(np.fft.rfft, axis=0) + assert result.shape == (6, 2) + + def test_with_dictlike_columns(self): + # gh 17602 + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.apply(lambda x: {'s': x['a'] + x['b']}, + axis=1) + expected = Series([{'s': 3} for t in df.itertuples()]) + assert_series_equal(result, expected) + + df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'), + pd.Timestamp('2017-05-02 00:00:00')] + result = df.apply(lambda x: {'s': x['a'] + x['b']}, + axis=1) + assert_series_equal(result, expected) + + # compose a series + result = (df['a'] + df['b']).apply(lambda x: {'s': x}) + expected = Series([{'s': 3}, {'s': 3}]) + assert_series_equal(result, expected) + + # gh-18775 + df = DataFrame() + df["author"] = ["X", "Y", "Z"] + df["publisher"] = ["BBC", "NBC", "N24"] + df["date"] = pd.to_datetime(['17-10-2010 07:15:30', + '13-05-2011 08:20:35', + '15-01-2013 09:09:09']) + result = df.apply(lambda x: {}, axis=1) + expected = Series([{}, {}, {}]) + assert_series_equal(result, expected) + + def test_with_dictlike_columns_with_infer(self): + # gh 17602 + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.apply(lambda x: {'s': x['a'] + x['b']}, + axis=1, result_type='expand') + expected = DataFrame({'s': [3, 3]}) + assert_frame_equal(result, expected) + + df['tm'] 
= [pd.Timestamp('2017-05-01 00:00:00'), + pd.Timestamp('2017-05-02 00:00:00')] + result = df.apply(lambda x: {'s': x['a'] + x['b']}, + axis=1, result_type='expand') + assert_frame_equal(result, expected) + + def test_with_listlike_columns(self): + # gh-17348 + df = DataFrame({'a': Series(np.random.randn(4)), + 'b': ['a', 'list', 'of', 'words'], + 'ts': date_range('2016-10-01', periods=4, freq='H')}) + + result = df[['a', 'b']].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[['a', 'b']].itertuples()]) + assert_series_equal(result, expected) + + result = df[['a', 'ts']].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[['a', 'ts']].itertuples()]) + assert_series_equal(result, expected) + + # gh-18919 + df = DataFrame({'x': Series([['a', 'b'], ['q']]), + 'y': Series([['z'], ['q', 't']])}) + df.index = MultiIndex.from_tuples([('i0', 'j0'), ('i1', 'j1')]) + + result = df.apply( + lambda row: [el for el in row['x'] if el in row['y']], + axis=1) + expected = Series([[], ['q']], index=df.index) + assert_series_equal(result, expected) + + def test_infer_output_shape_columns(self): + # gh-18573 + + df = DataFrame({'number': [1., 2.], + 'string': ['foo', 'bar'], + 'datetime': [pd.Timestamp('2017-11-29 03:30:00'), + pd.Timestamp('2017-11-29 03:45:00')]}) + result = df.apply(lambda row: (row.number, row.string), axis=1) + expected = Series([t[2:] for t in df.itertuples()]) + assert_series_equal(result, expected) + + def test_infer_output_shape_listlike_columns(self): + # gh-16353 + + df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C']) + + result = df.apply(lambda x: [1, 2, 3], axis=1) + expected = Series([[1, 2, 3] for t in df.itertuples()]) + assert_series_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1) + expected = Series([[1, 2] for t in df.itertuples()]) + assert_series_equal(result, expected) + + # gh-17970 + df = DataFrame({"a": [1, 2, 3]}, index=list('abc')) + + result = df.apply(lambda row: np.ones(1), axis=1) + expected = Series([np.ones(1) for t in df.itertuples()], + index=df.index) + assert_series_equal(result, expected) + + result = df.apply(lambda row: np.ones(2), axis=1) + expected = Series([np.ones(2) for t in df.itertuples()], + index=df.index) + assert_series_equal(result, expected) + + # gh-17892 + df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'), + pd.Timestamp('2010-02-04'), + pd.Timestamp('2010-02-05'), + pd.Timestamp('2010-02-06')], + 'b': [9, 5, 4, 3], + 'c': [5, 3, 4, 2], + 'd': [1, 2, 3, 4]}) + + def fun(x): + return (1, 2) + + result = df.apply(fun, axis=1) + expected = Series([(1, 2) for t in df.itertuples()]) + assert_series_equal(result, expected) + + def test_consistent_coerce_for_shapes(self): + # we want column names to NOT be propagated + # just because the shape matches the input shape + df = DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C']) + + result = df.apply(lambda x: [1, 2, 3], axis=1) + expected = Series([[1, 2, 3] for t in df.itertuples()]) + assert_series_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1) + expected = Series([[1, 2] for t in df.itertuples()]) + assert_series_equal(result, expected) + + def test_consistent_names(self): + # if a Series is returned, we should use the resulting index names + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + result = df.apply(lambda x: Series([1, 2, 3], + index=['test', 'other', 'cols']), + axis=1) + expected = DataFrame( + np.tile(np.arange(3, dtype='int64'), 
6).reshape(6, -1) + 1, + columns=['test', 'other', 'cols']) + assert_frame_equal(result, expected) + + result = df.apply( + lambda x: pd.Series([1, 2], index=['test', 'other']), axis=1) + expected = DataFrame( + np.tile(np.arange(2, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['test', 'other']) + assert_frame_equal(result, expected) + + def test_result_type(self): + # result_type should be consistent no matter which + # path we take in the code + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type='expand') + expected = df.copy() + expected.columns = [0, 1, 2] + assert_frame_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1, result_type='expand') + expected = df[['A', 'B']].copy() + expected.columns = [0, 1] + assert_frame_equal(result, expected) + + # broadcast result + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast') + expected = df.copy() + assert_frame_equal(result, expected) + + columns = ['other', 'col', 'names'] + result = df.apply( + lambda x: pd.Series([1, 2, 3], + index=columns), + axis=1, + result_type='broadcast') + expected = df.copy() + assert_frame_equal(result, expected) + + # series result + result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1) + expected = df.copy() + assert_frame_equal(result, expected) + + # series result with other index + columns = ['other', 'col', 'names'] + result = df.apply( + lambda x: pd.Series([1, 2, 3], index=columns), + axis=1) + expected = df.copy() + expected.columns = columns + assert_frame_equal(result, expected) + + @pytest.mark.parametrize("result_type", ['foo', 1]) + def test_result_type_error(self, result_type): + # allowed result_type + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + with pytest.raises(ValueError): + df.apply(lambda x: [1, 2, 3], + axis=1, + result_type=result_type) + + @pytest.mark.parametrize( + "box", + [lambda x: list(x), + lambda x: tuple(x), + lambda x: np.array(x, dtype='int64')], + ids=['list', 'tuple', 'array']) + def test_consistency_for_boxed(self, box): + # passing an array or list should not affect the output shape + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + result = df.apply(lambda x: box([1, 2]), axis=1) + expected = Series([box([1, 2]) for t in df.itertuples()]) + assert_series_equal(result, expected) + + result = df.apply(lambda x: box([1, 2]), axis=1, result_type='expand') + expected = DataFrame( + np.tile(np.arange(2, dtype='int64'), 6).reshape(6, -1) + 1) + assert_frame_equal(result, expected) + + def zip_frames(*frames): """ take a list of frames, zip the columns together for each @@ -496,8 +818,6 @@ def zip_frames(*frames): class TestDataFrameAggregate(TestData): - _multiprocess_can_split_ = True - def test_agg_transform(self): with np.errstate(all='ignore'): @@ -659,13 +979,13 @@ def test_non_callable_aggregates(self): # Function aggregate result = df.agg({'A': 'count'}) - expected = pd.Series({'A': 2}) + expected = Series({'A': 2}) assert_series_equal(result, expected) # Non-function aggregate result = df.agg({'A': 'size'}) - expected = pd.Series({'A': 3}) + expected = Series({'A': 3}) assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 9b99a7b73b82b..a3a799aed1c55 100644 --- 
a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1,8 +1,9 @@ # -*- coding: utf-8 -*- - import pytest import numpy as np +from pandas.compat import range + import pandas as pd import pandas.util.testing as tm @@ -58,10 +59,129 @@ def test_df_flex_cmp_constant_return_types_empty(self, opname): result = getattr(empty, opname)(const).get_dtype_counts() tm.assert_series_equal(result, pd.Series([2], ['bool'])) + @pytest.mark.parametrize('timestamps', [ + [pd.Timestamp('2012-01-01 13:00:00+00:00')] * 2, + [pd.Timestamp('2012-01-01 13:00:00')] * 2]) + def test_tz_aware_scalar_comparison(self, timestamps): + # Test for issue #15966 + df = pd.DataFrame({'test': timestamps}) + expected = pd.DataFrame({'test': [False, False]}) + tm.assert_frame_equal(df == -1, expected) + # ------------------------------------------------------------------- # Arithmetic +class TestFrameMulDiv(object): + """Tests for DataFrame multiplication and division""" + # ------------------------------------------------------------------ + # Mod By Zero + + def test_df_mod_zero_df(self): + # GH#3590, modulo as ints + df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + + # this is technically wrong, as the integer portion is coerced to float + # ### + first = pd.Series([0, 0, 0, 0], dtype='float64') + second = pd.Series([np.nan, np.nan, np.nan, 0]) + expected = pd.DataFrame({'first': first, 'second': second}) + result = df % df + tm.assert_frame_equal(result, expected) + + def test_df_mod_zero_array(self): + # GH#3590, modulo as ints + df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + + # this is technically wrong, as the integer portion is coerced to float + # ### + first = pd.Series([0, 0, 0, 0], dtype='float64') + second = pd.Series([np.nan, np.nan, np.nan, 0]) + expected = pd.DataFrame({'first': first, 'second': second}) + + # numpy has a slightly different (wrong) treatment + with np.errstate(all='ignore'): + arr = df.values % df.values + result2 = pd.DataFrame(arr, index=df.index, + columns=df.columns, dtype='float64') + result2.iloc[0:3, 1] = np.nan + tm.assert_frame_equal(result2, expected) + + def test_df_mod_zero_int(self): + # GH#3590, modulo as ints + df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + + result = df % 0 + expected = pd.DataFrame(np.nan, index=df.index, columns=df.columns) + tm.assert_frame_equal(result, expected) + + # numpy has a slightly different (wrong) treatment + with np.errstate(all='ignore'): + arr = df.values.astype('float64') % 0 + result2 = pd.DataFrame(arr, index=df.index, columns=df.columns) + tm.assert_frame_equal(result2, expected) + + def test_df_mod_zero_series_does_not_commute(self): + # GH#3590, modulo as ints + # not commutative with series + df = pd.DataFrame(np.random.randn(10, 5)) + ser = df[0] + res = ser % df + res2 = df % ser + assert not res.fillna(0).equals(res2.fillna(0)) + + # ------------------------------------------------------------------ + # Division By Zero + + def test_df_div_zero_df(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + result = df / df + + first = pd.Series([1.0, 1.0, 1.0, 1.0]) + second = pd.Series([np.nan, np.nan, np.nan, 1]) + expected = pd.DataFrame({'first': first, 'second': second}) + tm.assert_frame_equal(result, expected) + + def test_df_div_zero_array(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + + 
first = pd.Series([1.0, 1.0, 1.0, 1.0]) + second = pd.Series([np.nan, np.nan, np.nan, 1]) + expected = pd.DataFrame({'first': first, 'second': second}) + + with np.errstate(all='ignore'): + arr = df.values.astype('float') / df.values + result = pd.DataFrame(arr, index=df.index, + columns=df.columns) + tm.assert_frame_equal(result, expected) + + def test_df_div_zero_int(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + + result = df / 0 + expected = pd.DataFrame(np.inf, index=df.index, columns=df.columns) + expected.iloc[0:3, 1] = np.nan + tm.assert_frame_equal(result, expected) + + # numpy has a slightly different (wrong) treatment + with np.errstate(all='ignore'): + arr = df.values.astype('float64') / 0 + result2 = pd.DataFrame(arr, index=df.index, + columns=df.columns) + tm.assert_frame_equal(result2, expected) + + def test_df_div_zero_series_does_not_commute(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame(np.random.randn(10, 5)) + ser = df[0] + res = ser / df + res2 = df / ser + assert not res.fillna(0).equals(res2.fillna(0)) + + class TestFrameArithmetic(object): @pytest.mark.xfail(reason='GH#7996 datetime64 units not converted to nano') @@ -125,7 +245,7 @@ def test_ops_frame_period(self): exp = pd.DataFrame({'A': np.array([2, 1], dtype=object), 'B': np.array([14, 13], dtype=object)}) tm.assert_frame_equal(p - df, exp) - tm.assert_frame_equal(df - p, -exp) + tm.assert_frame_equal(df - p, -1 * exp) df2 = pd.DataFrame({'A': [pd.Period('2015-05', freq='M'), pd.Period('2015-06', freq='M')], @@ -137,4 +257,4 @@ def test_ops_frame_period(self): exp = pd.DataFrame({'A': np.array([4, 4], dtype=object), 'B': np.array([16, 16], dtype=object)}) tm.assert_frame_equal(df2 - df, exp) - tm.assert_frame_equal(df - df2, -exp) + tm.assert_frame_equal(df - df2, -1 * exp) diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index afecba2026dd7..ccdba6df2521a 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -165,3 +165,20 @@ def test_join_period_index(frame_with_period_index): index=frame_with_period_index.index) tm.assert_frame_equal(joined, expected) + + +def test_join_left_sequence_non_unique_index(): + # https://github.com/pandas-dev/pandas/issues/19607 + df1 = DataFrame({'a': [0, 10, 20]}, index=[1, 2, 3]) + df2 = DataFrame({'b': [100, 200, 300]}, index=[4, 3, 2]) + df3 = DataFrame({'c': [400, 500, 600]}, index=[2, 2, 4]) + + joined = df1.join([df2, df3], how='left') + + expected = DataFrame({ + 'a': [0, 10, 10, 20], + 'b': [np.nan, 300, 300, 200], + 'c': [np.nan, 400, 500, np.nan] + }, index=[1, 2, 2, 3]) + + tm.assert_frame_equal(joined, expected) diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 9acdf2f17d86a..8236a41d00243 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -89,11 +89,35 @@ def test_assign_bad(self): df.assign(lambda x: x.A) with pytest.raises(AttributeError): df.assign(C=df.A, D=df.A + df.C) + + @pytest.mark.skipif(PY36, reason="""Issue #14207: valid for python + 3.6 and above""") + def test_assign_dependent_old_python(self): + df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + + # Key C does not exist at defition time of df with pytest.raises(KeyError): - df.assign(C=lambda df: df.A, D=lambda df: df['A'] + df['C']) + df.assign(C=lambda df: df.A, + D=lambda df: df['A'] + df['C']) with 
pytest.raises(KeyError): df.assign(C=df.A, D=lambda x: x['A'] + x['C']) + @pytest.mark.skipif(not PY36, reason="""Issue #14207: not valid for + python 3.5 and below""") + def test_assign_dependent(self): + df = DataFrame({'A': [1, 2], 'B': [3, 4]}) + + result = df.assign(C=df.A, D=lambda x: x['A'] + x['C']) + expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], + columns=list('ABCD')) + assert_frame_equal(result, expected) + + result = df.assign(C=lambda df: df.A, + D=lambda df: df['A'] + df['C']) + expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], + columns=list('ABCD')) + assert_frame_equal(result, expected) + def test_insert_error_msmgs(self): # GH 7432 diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index bdccbec6111d3..5df50f3d7835b 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -203,76 +203,6 @@ def test_timestamp_compare(self): result = right_f(Timestamp('nat'), df) assert_frame_equal(result, expected) - def test_modulo(self): - # GH3590, modulo as ints - p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) - - # this is technically wrong as the integer portion is coerced to float - # ### - expected = DataFrame({'first': Series([0, 0, 0, 0], dtype='float64'), - 'second': Series([np.nan, np.nan, np.nan, 0])}) - result = p % p - assert_frame_equal(result, expected) - - # numpy has a slightly different (wrong) treatement - with np.errstate(all='ignore'): - arr = p.values % p.values - result2 = DataFrame(arr, index=p.index, - columns=p.columns, dtype='float64') - result2.iloc[0:3, 1] = np.nan - assert_frame_equal(result2, expected) - - result = p % 0 - expected = DataFrame(np.nan, index=p.index, columns=p.columns) - assert_frame_equal(result, expected) - - # numpy has a slightly different (wrong) treatement - with np.errstate(all='ignore'): - arr = p.values.astype('float64') % 0 - result2 = DataFrame(arr, index=p.index, columns=p.columns) - assert_frame_equal(result2, expected) - - # not commutative with series - p = DataFrame(np.random.randn(10, 5)) - s = p[0] - res = s % p - res2 = p % s - assert not res.fillna(0).equals(res2.fillna(0)) - - def test_div(self): - - # integer div, but deal with the 0's (GH 9144) - p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) - result = p / p - - expected = DataFrame({'first': Series([1.0, 1.0, 1.0, 1.0]), - 'second': Series([nan, nan, nan, 1])}) - assert_frame_equal(result, expected) - - with np.errstate(all='ignore'): - arr = p.values.astype('float') / p.values - result2 = DataFrame(arr, index=p.index, - columns=p.columns) - assert_frame_equal(result2, expected) - - result = p / 0 - expected = DataFrame(np.inf, index=p.index, columns=p.columns) - expected.iloc[0:3, 1] = nan - assert_frame_equal(result, expected) - - # numpy has a slightly different (wrong) treatement - with np.errstate(all='ignore'): - arr = p.values.astype('float64') / 0 - result2 = DataFrame(arr, index=p.index, - columns=p.columns) - assert_frame_equal(result2, expected) - - p = DataFrame(np.random.randn(10, 5)) - s = p[0] - res = s / p - res2 = p / s - assert not res.fillna(0).equals(res2.fillna(0)) - def test_logical_operators(self): def _check_bin_op(op): @@ -341,13 +271,50 @@ def test_logical_with_nas(self): expected = Series([True, True]) assert_series_equal(result, expected) - def test_neg(self): - # what to do? 
- assert_frame_equal(-self.frame, -1 * self.frame) + @pytest.mark.parametrize('df,expected', [ + (pd.DataFrame({'a': [-1, 1]}), pd.DataFrame({'a': [1, -1]})), + (pd.DataFrame({'a': [False, True]}), + pd.DataFrame({'a': [True, False]})), + (pd.DataFrame({'a': pd.Series(pd.to_timedelta([-1, 1]))}), + pd.DataFrame({'a': pd.Series(pd.to_timedelta([1, -1]))})) + ]) + def test_neg_numeric(self, df, expected): + assert_frame_equal(-df, expected) + assert_series_equal(-df['a'], expected['a']) + + @pytest.mark.parametrize('df', [ + pd.DataFrame({'a': ['a', 'b']}), + pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}), + ]) + def test_neg_raises(self, df): + with pytest.raises(TypeError): + (- df) + with pytest.raises(TypeError): + (- df['a']) def test_invert(self): assert_frame_equal(-(self.frame < 0), ~(self.frame < 0)) + @pytest.mark.parametrize('df', [ + pd.DataFrame({'a': [-1, 1]}), + pd.DataFrame({'a': [False, True]}), + pd.DataFrame({'a': pd.Series(pd.to_timedelta([-1, 1]))}), + ]) + def test_pos_numeric(self, df): + # GH 16073 + assert_frame_equal(+df, df) + assert_series_equal(+df['a'], df['a']) + + @pytest.mark.parametrize('df', [ + pd.DataFrame({'a': ['a', 'b']}), + pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}), + ]) + def test_pos_raises(self, df): + with pytest.raises(TypeError): + (+ df) + with pytest.raises(TypeError): + (+ df['a']) + def test_arith_flex_frame(self): ops = ['add', 'sub', 'mul', 'div', 'truediv', 'pow', 'floordiv', 'mod'] if not compat.PY3: @@ -451,6 +418,19 @@ def test_arith_flex_frame(self): with tm.assert_raises_regex(NotImplementedError, 'fill_value'): self.frame.add(self.frame.iloc[0], axis='index', fill_value=3) + def test_arith_flex_zero_len_raises(self): + # GH#19522 passing fill_value to frame flex arith methods should + # raise even in the zero-length special cases + ser_len0 = pd.Series([]) + df_len0 = pd.DataFrame([], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + + with tm.assert_raises_regex(NotImplementedError, 'fill_value'): + df.add(ser_len0, fill_value='E') + + with tm.assert_raises_regex(NotImplementedError, 'fill_value'): + df_len0.sub(df['A'], axis=None, fill_value=3) + def test_binary_ops_align(self): # test aligning binary ops diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index e6b47fd69cb05..25dd285e883a0 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -738,12 +738,3 @@ def test_tz_convert_and_localize(self, fn): with assert_raises_regex(ValueError, 'not valid'): df = DataFrame(index=l0) df = getattr(df, fn)('US/Pacific', level=1) - - @pytest.mark.parametrize('timestamps', [ - [Timestamp('2012-01-01 13:00:00+00:00')] * 2, - [Timestamp('2012-01-01 13:00:00')] * 2]) - def test_tz_aware_scalar_comparison(self, timestamps): - # Test for issue #15966 - df = DataFrame({'test': timestamps}) - expected = DataFrame({'test': [False, False]}) - assert_frame_equal(df == -1, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index c8ee05ddbb74f..cef3a699ed24b 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -12,7 +12,8 @@ from numpy import nan import pandas as pd -from pandas import bdate_range, DataFrame, Index, Series +from pandas import (bdate_range, DataFrame, Index, Series, Timestamp, + Timedelta, NaT) from pandas.core.groupby import DataError import 
pandas.util.testing as tm @@ -187,3 +188,19 @@ def test_cython_agg_empty_buckets_nanops(): {"a": [1, 1, 1716, 1]}, index=pd.CategoricalIndex(intervals, name='a', ordered=True)) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize('op', ['first', 'last', 'max', 'min']) +@pytest.mark.parametrize('data', [ + Timestamp('2016-10-14 21:00:44.557'), + Timedelta('17088 days 21:00:44.557'), ]) +def test_cython_with_timestamp_and_nat(op, data): + # https://github.com/pandas-dev/pandas/issues/19526 + df = DataFrame({'a': [0, 1], 'b': [data, NaT]}) + index = Index([0, 1], name='a') + + # We will group by a and test the cython aggregations + expected = DataFrame({'b': [data, NaT]}, index=index) + + result = df.groupby('a').aggregate(op) + tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 575eae1916f4c..4c407ad8a0d93 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -7,6 +7,7 @@ from __future__ import print_function import pytest +from collections import OrderedDict import datetime as dt from functools import partial @@ -81,7 +82,7 @@ def test_agg_period_index(): s1 = Series(np.random.rand(len(index)), index=index) s2 = Series(np.random.rand(len(index)), index=index) series = [('s1', s1), ('s2', s2)] - df = DataFrame.from_items(series) + df = DataFrame.from_dict(OrderedDict(series)) grouped = df.groupby(df.index.month) list(grouped) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 5172efe25d697..6eacd45deb7bc 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1895,6 +1895,172 @@ def test_rank_apply(self): expected = expected.reindex(result.index) assert_series_equal(result, expected) + @pytest.mark.parametrize("grps", [ + ['qux'], ['qux', 'quux']]) + @pytest.mark.parametrize("vals", [ + [2, 2, 8, 2, 6], + [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), + pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), + pd.Timestamp('2018-01-06')]]) + @pytest.mark.parametrize("ties_method,ascending,pct,exp", [ + ('average', True, False, [2., 2., 5., 2., 4.]), + ('average', True, True, [0.4, 0.4, 1.0, 0.4, 0.8]), + ('average', False, False, [4., 4., 1., 4., 2.]), + ('average', False, True, [.8, .8, .2, .8, .4]), + ('min', True, False, [1., 1., 5., 1., 4.]), + ('min', True, True, [0.2, 0.2, 1.0, 0.2, 0.8]), + ('min', False, False, [3., 3., 1., 3., 2.]), + ('min', False, True, [.6, .6, .2, .6, .4]), + ('max', True, False, [3., 3., 5., 3., 4.]), + ('max', True, True, [0.6, 0.6, 1.0, 0.6, 0.8]), + ('max', False, False, [5., 5., 1., 5., 2.]), + ('max', False, True, [1., 1., .2, 1., .4]), + ('first', True, False, [1., 2., 5., 3., 4.]), + ('first', True, True, [0.2, 0.4, 1.0, 0.6, 0.8]), + ('first', False, False, [3., 4., 1., 5., 2.]), + ('first', False, True, [.6, .8, .2, 1., .4]), + ('dense', True, False, [1., 1., 3., 1., 2.]), + ('dense', True, True, [0.2, 0.2, 0.6, 0.2, 0.4]), + ('dense', False, False, [3., 3., 1., 3., 2.]), + ('dense', False, True, [.6, .6, .2, .6, .4]), + ]) + def test_rank_args(self, grps, vals, ties_method, ascending, pct, exp): + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({'key': key, 'val': vals}) + result = df.groupby('key').rank(method=ties_method, + ascending=ascending, pct=pct) + + exp_df = DataFrame(exp * len(grps), columns=['val']) + assert_frame_equal(result, exp_df) + + 
@pytest.mark.parametrize("grps", [ + ['qux'], ['qux', 'quux']]) + @pytest.mark.parametrize("vals", [ + [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats + [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan, + pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), + pd.Timestamp('2018-01-06'), np.nan, np.nan] + ]) + @pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [ + ('average', True, 'keep', False, + [2., 2., np.nan, 5., 2., 4., np.nan, np.nan]), + ('average', True, 'keep', True, + [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan]), + ('average', False, 'keep', False, + [4., 4., np.nan, 1., 4., 2., np.nan, np.nan]), + ('average', False, 'keep', True, + [.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan]), + ('min', True, 'keep', False, + [1., 1., np.nan, 5., 1., 4., np.nan, np.nan]), + ('min', True, 'keep', True, + [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]), + ('min', False, 'keep', False, + [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), + ('min', False, 'keep', True, + [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), + ('max', True, 'keep', False, + [3., 3., np.nan, 5., 3., 4., np.nan, np.nan]), + ('max', True, 'keep', True, + [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), + ('max', False, 'keep', False, + [5., 5., np.nan, 1., 5., 2., np.nan, np.nan]), + ('max', False, 'keep', True, + [1., 1., np.nan, 0.2, 1., 0.4, np.nan, np.nan]), + ('first', True, 'keep', False, + [1., 2., np.nan, 5., 3., 4., np.nan, np.nan]), + ('first', True, 'keep', True, + [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), + ('first', False, 'keep', False, + [3., 4., np.nan, 1., 5., 2., np.nan, np.nan]), + ('first', False, 'keep', True, + [.6, 0.8, np.nan, 0.2, 1., 0.4, np.nan, np.nan]), + ('dense', True, 'keep', False, + [1., 1., np.nan, 3., 1., 2., np.nan, np.nan]), + ('dense', True, 'keep', True, + [0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan]), + ('dense', False, 'keep', False, + [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), + ('dense', False, 'keep', True, + [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), + ('average', True, 'no_na', False, [2., 2., 7., 5., 2., 4., 7., 7.]), + ('average', True, 'no_na', True, + [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]), + ('average', False, 'no_na', False, [4., 4., 7., 1., 4., 2., 7., 7.]), + ('average', False, 'no_na', True, + [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875]), + ('min', True, 'no_na', False, [1., 1., 6., 5., 1., 4., 6., 6.]), + ('min', True, 'no_na', True, + [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75]), + ('min', False, 'no_na', False, [3., 3., 6., 1., 3., 2., 6., 6.]), + ('min', False, 'no_na', True, + [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75]), + ('max', True, 'no_na', False, [3., 3., 8., 5., 3., 4., 8., 8.]), + ('max', True, 'no_na', True, + [0.375, 0.375, 1., 0.625, 0.375, 0.5, 1., 1.]), + ('max', False, 'no_na', False, [5., 5., 8., 1., 5., 2., 8., 8.]), + ('max', False, 'no_na', True, + [0.625, 0.625, 1., 0.125, 0.625, 0.25, 1., 1.]), + ('first', True, 'no_na', False, [1., 2., 6., 5., 3., 4., 7., 8.]), + ('first', True, 'no_na', True, + [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.]), + ('first', False, 'no_na', False, [3., 4., 6., 1., 5., 2., 7., 8.]), + ('first', False, 'no_na', True, + [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]), + ('dense', True, 'no_na', False, [1., 1., 4., 3., 1., 2., 4., 4.]), + ('dense', True, 'no_na', True, + [0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5]), + ('dense', False, 'no_na', False, [3., 3., 4., 1., 3., 2., 4., 4.]), + 
('dense', False, 'no_na', True, + [0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5]) + ]) + def test_rank_args_missing(self, grps, vals, ties_method, ascending, + na_option, pct, exp): + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({'key': key, 'val': vals}) + result = df.groupby('key').rank(method=ties_method, + ascending=ascending, + na_option=na_option, pct=pct) + + exp_df = DataFrame(exp * len(grps), columns=['val']) + assert_frame_equal(result, exp_df) + + @pytest.mark.parametrize("pct,exp", [ + (False, [3., 3., 3., 3., 3.]), + (True, [.6, .6, .6, .6, .6])]) + def test_rank_resets_each_group(self, pct, exp): + df = DataFrame( + {'key': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'], + 'val': [1] * 10} + ) + result = df.groupby('key').rank(pct=pct) + exp_df = DataFrame(exp * 2, columns=['val']) + assert_frame_equal(result, exp_df) + + def test_rank_avg_even_vals(self): + df = DataFrame({'key': ['a'] * 4, 'val': [1] * 4}) + result = df.groupby('key').rank() + exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=['val']) + assert_frame_equal(result, exp_df) + + @pytest.mark.parametrize("ties_method", [ + 'average', 'min', 'max', 'first', 'dense']) + @pytest.mark.parametrize("ascending", [True, False]) + @pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) + @pytest.mark.parametrize("pct", [True, False]) + @pytest.mark.parametrize("vals", [ + ['bar', 'bar', 'foo', 'bar', 'baz'], + ['bar', np.nan, 'foo', np.nan, 'baz'] + ]) + def test_rank_object_raises(self, ties_method, ascending, na_option, + pct, vals): + df = DataFrame({'key': ['foo'] * 5, 'val': vals}) + with tm.assert_raises_regex(TypeError, "not callable"): + df.groupby('key').rank(method=ties_method, + ascending=ascending, + na_option=na_option, pct=pct) + def test_dont_clobber_name_column(self): df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], 'name': ['foo', 'bar', 'baz'] * 2}) @@ -2086,7 +2252,19 @@ def test_median_empty_bins(self): expected = df.groupby(bins).agg(lambda x: x.median()) assert_frame_equal(result, expected) - def test_groupby_non_arithmetic_agg_types(self): + @pytest.mark.parametrize("dtype", [ + 'int8', 'int16', 'int32', 'int64', 'float32', 'float64']) + @pytest.mark.parametrize("method,data", [ + ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), + ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), + ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), + ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), + ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}], + 'args': [1]}), + ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}], + 'out_type': 'int64'}) + ]) + def test_groupby_non_arithmetic_agg_types(self, dtype, method, data): # GH9311, GH6620 df = pd.DataFrame( [{'a': 1, 'b': 1}, @@ -2094,39 +2272,25 @@ def test_groupby_non_arithmetic_agg_types(self): {'a': 2, 'b': 3}, {'a': 2, 'b': 4}]) - dtypes = ['int8', 'int16', 'int32', 'int64', 'float32', 'float64'] - - grp_exp = {'first': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}, - 'last': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}, - 'min': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}, - 'max': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}, - 'nth': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}], - 'args': [1]}, - 'count': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}], - 'out_type': 'int64'}} - - for dtype in dtypes: - df_in = df.copy() - df_in['b'] = df_in.b.astype(dtype) + df['b'] = df.b.astype(dtype) - for method, data in compat.iteritems(grp_exp): - if 'args' not in data: - data['args'] = [] + if 
'args' not in data: + data['args'] = [] - if 'out_type' in data: - out_type = data['out_type'] - else: - out_type = dtype + if 'out_type' in data: + out_type = data['out_type'] + else: + out_type = dtype - exp = data['df'] - df_out = pd.DataFrame(exp) + exp = data['df'] + df_out = pd.DataFrame(exp) - df_out['b'] = df_out.b.astype(out_type) - df_out.set_index('a', inplace=True) + df_out['b'] = df_out.b.astype(out_type) + df_out.set_index('a', inplace=True) - grpd = df_in.groupby('a') - t = getattr(grpd, method)(*data['args']) - assert_frame_equal(t, df_out) + grpd = df.groupby('a') + t = getattr(grpd, method)(*data['args']) + assert_frame_equal(t, df_out) def test_groupby_non_arithmetic_agg_intlike_precision(self): # GH9311, GH6620 diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 8948c5f79900d..2d8d70aa2ac84 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -314,7 +314,8 @@ def test_ensure_copied_data(self): # .values an object array of Period, thus copied result = index_type(ordinal=index.asi8, copy=False, **init_kwargs) - tm.assert_numpy_array_equal(index._values, result._values, + tm.assert_numpy_array_equal(index._ndarray_values, + result._ndarray_values, check_same='same') elif isinstance(index, IntervalIndex): # checked in test_interval.py @@ -323,7 +324,8 @@ def test_ensure_copied_data(self): result = index_type(index.values, copy=False, **init_kwargs) tm.assert_numpy_array_equal(index.values, result.values, check_same='same') - tm.assert_numpy_array_equal(index._values, result._values, + tm.assert_numpy_array_equal(index._ndarray_values, + result._ndarray_values, check_same='same') def test_copy_and_deepcopy(self, indices): diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index 217ee07affa84..6d88ef0cfa6c5 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -1,9 +1,10 @@ import pytest import numpy as np +import pandas as pd import pandas.util.testing as tm from pandas.core.indexes.api import Index, MultiIndex -from pandas.compat import lzip +from pandas.compat import lzip, long @pytest.fixture(params=[tm.makeUnicodeIndex(100), @@ -29,3 +30,18 @@ def indices(request): def one(request): # zero-dim integer array behaves like an integer return request.param + + +zeros = [box([0] * 5, dtype=dtype) + for box in [pd.Index, np.array] + for dtype in [np.int64, np.uint64, np.float64]] +zeros.extend([np.array(0, dtype=dtype) + for dtype in [np.int64, np.uint64, np.float64]]) +zeros.extend([0, 0.0, long(0)]) + + +@pytest.fixture(params=zeros) +def zero(request): + # For testing division by (or of) zero for Index with length 5, this + # gives several scalar-zeros and length-5 vector-zeros + return request.param diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py b/pandas/tests/indexes/datetimes/test_arithmetic.py index 09a6b35a0ff0e..ddc97636ae0a8 100644 --- a/pandas/tests/indexes/datetimes/test_arithmetic.py +++ b/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -15,6 +15,7 @@ DatetimeIndex, TimedeltaIndex, date_range) from pandas._libs import tslib +from pandas._libs.tslibs.offsets import shift_months @pytest.fixture(params=[None, 'UTC', 'Asia/Tokyo', @@ -412,6 +413,14 @@ def test_dti_shift_no_freq(self): with pytest.raises(NullFrequencyError): dti.shift(2) + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_dti_shift_localized(self, tzstr): + dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') + dr_tz = 
dr.tz_localize(tzstr) + + result = dr_tz.shift(1, '10T') + assert result.tz == dr_tz.tz + # ------------------------------------------------------------- # Binary operations DatetimeIndex and timedelta-like @@ -767,6 +776,24 @@ def test_dti_with_offset_series(self, tz, names): res3 = dti - other tm.assert_series_equal(res3, expected_sub) + def test_dti_add_offset_tzaware(self): + dates = date_range('2012-11-01', periods=3, tz='US/Pacific') + offset = dates + pd.offsets.Hour(5) + assert dates[0] + pd.offsets.Hour(5) == offset[0] + + # GH#6818 + for tz in ['UTC', 'US/Pacific', 'Asia/Tokyo']: + dates = date_range('2010-11-01 00:00', periods=3, tz=tz, freq='H') + expected = DatetimeIndex(['2010-11-01 05:00', '2010-11-01 06:00', + '2010-11-01 07:00'], freq='H', tz=tz) + + offset = dates + pd.offsets.Hour(5) + tm.assert_index_equal(offset, expected) + offset = dates + np.timedelta64(5, 'h') + tm.assert_index_equal(offset, expected) + offset = dates + timedelta(hours=5) + tm.assert_index_equal(offset, expected) + @pytest.mark.parametrize('klass,assert_func', [ (Series, tm.assert_series_equal), @@ -907,3 +934,19 @@ def test_datetime64_with_DateOffset(klass, assert_func): Timestamp('2000-02-29', tz='US/Central')], name='a') assert_func(result, exp) assert_func(result2, exp) + + +@pytest.mark.parametrize('years', [-1, 0, 1]) +@pytest.mark.parametrize('months', [-2, 0, 2]) +def test_shift_months(years, months): + s = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), + Timestamp('2000-01-31 00:23:00'), + Timestamp('2000-01-01'), + Timestamp('2000-02-29'), + Timestamp('2000-12-31')]) + actual = DatetimeIndex(shift_months(s.asi8, years * 12 + months)) + + raw = [x + pd.offsets.DateOffset(years=years, months=months) + for x in s] + expected = DatetimeIndex(raw) + tm.assert_index_equal(actual, expected) diff --git a/pandas/tests/indexes/datetimes/test_astype.py b/pandas/tests/indexes/datetimes/test_astype.py index 4b989eb35e900..8acdd301f241a 100644 --- a/pandas/tests/indexes/datetimes/test_astype.py +++ b/pandas/tests/indexes/datetimes/test_astype.py @@ -138,6 +138,30 @@ def test_astype_object(self): tm.assert_index_equal(casted, Index(exp_values, dtype=np.object_)) assert casted.tolist() == exp_values + @pytest.mark.parametrize('tz', [None, 'Asia/Tokyo']) + def test_astype_object_tz(self, tz): + idx = pd.date_range(start='2013-01-01', periods=4, freq='M', + name='idx', tz=tz) + expected_list = [Timestamp('2013-01-31', tz=tz), + Timestamp('2013-02-28', tz=tz), + Timestamp('2013-03-31', tz=tz), + Timestamp('2013-04-30', tz=tz)] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.astype(object) + tm.assert_index_equal(result, expected) + assert idx.tolist() == expected_list + + def test_astype_object_with_nat(self): + idx = DatetimeIndex([datetime(2013, 1, 1), datetime(2013, 1, 2), + pd.NaT, datetime(2013, 1, 4)], name='idx') + expected_list = [Timestamp('2013-01-01'), + Timestamp('2013-01-02'), pd.NaT, + Timestamp('2013-01-04')] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.astype(object) + tm.assert_index_equal(result, expected) + assert idx.tolist() == expected_list + @pytest.mark.parametrize('dtype', [ float, 'timedelta64', 'timedelta64[ns]', 'datetime64', 'datetime64[D]']) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index a75ace2933b71..05678b0c8dd45 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -469,3 
+469,12 @@ def test_factorize_dst(self): arr, res = obj.factorize() tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) tm.assert_index_equal(res, idx) + + @pytest.mark.parametrize('arr, expected', [ + (pd.DatetimeIndex(['2017', '2017']), pd.DatetimeIndex(['2017'])), + (pd.DatetimeIndex(['2017', '2017'], tz='US/Eastern'), + pd.DatetimeIndex(['2017'], tz='US/Eastern')), + ]) + def test_unique(self, arr, expected): + result = arr.unique() + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 4f386eb28cc0f..bc43b427fe0aa 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -5,10 +5,8 @@ import numpy as np from datetime import datetime -from itertools import product import pandas as pd import pandas._libs.tslib as tslib -from pandas._libs.tslibs.offsets import shift_months import pandas.util.testing as tm from pandas import (DatetimeIndex, PeriodIndex, Series, Timestamp, date_range, _np_version_under1p10, Index, @@ -51,49 +49,6 @@ def test_ops_properties_basic(self): assert s.day == 10 pytest.raises(AttributeError, lambda: s.weekday) - def test_astype_object(self): - idx = pd.date_range(start='2013-01-01', periods=4, freq='M', - name='idx') - expected_list = [Timestamp('2013-01-31'), - Timestamp('2013-02-28'), - Timestamp('2013-03-31'), - Timestamp('2013-04-30')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.astype(object) - assert isinstance(result, Index) - - assert result.dtype == object - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert idx.tolist() == expected_list - - idx = pd.date_range(start='2013-01-01', periods=4, freq='M', - name='idx', tz='Asia/Tokyo') - expected_list = [Timestamp('2013-01-31', tz='Asia/Tokyo'), - Timestamp('2013-02-28', tz='Asia/Tokyo'), - Timestamp('2013-03-31', tz='Asia/Tokyo'), - Timestamp('2013-04-30', tz='Asia/Tokyo')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.astype(object) - assert isinstance(result, Index) - assert result.dtype == object - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert idx.tolist() == expected_list - - idx = DatetimeIndex([datetime(2013, 1, 1), datetime(2013, 1, 2), - pd.NaT, datetime(2013, 1, 4)], name='idx') - expected_list = [Timestamp('2013-01-01'), - Timestamp('2013-01-02'), pd.NaT, - Timestamp('2013-01-04')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.astype(object) - assert isinstance(result, Index) - assert result.dtype == object - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert idx.tolist() == expected_list - def test_minmax(self): for tz in self.tz: # monotonic @@ -568,19 +523,6 @@ def test_equals(self): assert not idx.equals(pd.Series(idx3)) -@pytest.mark.parametrize('years,months', product([-1, 0, 1], [-2, 0, 2])) -def test_shift_months(years, months): - s = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), - Timestamp('2000-01-31 00:23:00'), - Timestamp('2000-01-01'), - Timestamp('2000-02-29'), - Timestamp('2000-12-31')]) - actual = DatetimeIndex(shift_months(s.asi8, years * 12 + months)) - expected = DatetimeIndex([x + pd.offsets.DateOffset( - years=years, months=months) for x in s]) - tm.assert_index_equal(actual, expected) - - class TestBusinessDatetimeIndex(object): def setup_method(self, method): diff --git 
a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 111f68ba14775..83e7a0cd68d63 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -126,6 +126,27 @@ def test_round(self, tz): ts = '2016-10-17 12:00:00.001501031' DatetimeIndex([ts]).round('1010ns') + @pytest.mark.parametrize('test_input, rounder, freq, expected', [ + (['2117-01-01 00:00:45'], 'floor', '15s', ['2117-01-01 00:00:45']), + (['2117-01-01 00:00:45'], 'ceil', '15s', ['2117-01-01 00:00:45']), + (['2117-01-01 00:00:45.000000012'], 'floor', '10ns', + ['2117-01-01 00:00:45.000000010']), + (['1823-01-01 00:00:01.000000012'], 'ceil', '10ns', + ['1823-01-01 00:00:01.000000020']), + (['1823-01-01 00:00:01'], 'floor', '1s', ['1823-01-01 00:00:01']), + (['1823-01-01 00:00:01'], 'ceil', '1s', ['1823-01-01 00:00:01']), + (('NaT', '1823-01-01 00:00:01'), 'floor', '1s', + ('NaT', '1823-01-01 00:00:01')), + (('NaT', '1823-01-01 00:00:01'), 'ceil', '1s', + ('NaT', '1823-01-01 00:00:01')) + ]) + def test_ceil_floor_edge(self, tz, test_input, rounder, freq, expected): + dt = DatetimeIndex(list(test_input)) + func = getattr(dt, rounder) + result = func(freq) + expected = DatetimeIndex(list(expected)) + assert expected.equals(result) + # ---------------------------------------------------------------- # DatetimeIndex.normalize diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py new file mode 100644 index 0000000000000..075d239df5f7a --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -0,0 +1,1018 @@ +# -*- coding: utf-8 -*- +""" +Tests for DatetimeIndex timezone-related methods +""" +from datetime import datetime, timedelta, tzinfo +from distutils.version import LooseVersion + +import pytest +import pytz +import dateutil +from dateutil.tz import gettz, tzlocal +import numpy as np + +import pandas.util.testing as tm +import pandas.util._test_decorators as td + +import pandas as pd +from pandas._libs import tslib +from pandas._libs.tslibs import timezones +from pandas.compat import lrange, zip +from pandas import (DatetimeIndex, date_range, bdate_range, + Timestamp, isna, to_datetime, Index) + + +class FixedOffset(tzinfo): + """Fixed offset in minutes east from UTC.""" + + def __init__(self, offset, name): + self.__offset = timedelta(minutes=offset) + self.__name = name + + def utcoffset(self, dt): + return self.__offset + + def tzname(self, dt): + return self.__name + + def dst(self, dt): + return timedelta(0) + + +fixed_off = FixedOffset(-420, '-07:00') +fixed_off_no_name = FixedOffset(-330, None) + + +class TestDatetimeIndexTimezones(object): + # ------------------------------------------------------------- + # DatetimeIndex.tz_convert + def test_tz_convert_nat(self): + # GH#5546 + dates = [pd.NaT] + idx = DatetimeIndex(dates) + idx = idx.tz_localize('US/Pacific') + tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Pacific')) + idx = idx.tz_convert('US/Eastern') + tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Eastern')) + idx = idx.tz_convert('UTC') + tm.assert_index_equal(idx, DatetimeIndex(dates, tz='UTC')) + + dates = ['2010-12-01 00:00', '2010-12-02 00:00', pd.NaT] + idx = DatetimeIndex(dates) + idx = idx.tz_localize('US/Pacific') + tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Pacific')) + idx = idx.tz_convert('US/Eastern') + expected = ['2010-12-01 03:00', '2010-12-02 03:00', pd.NaT] + 
tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) + + idx = idx + pd.offsets.Hour(5) + expected = ['2010-12-01 08:00', '2010-12-02 08:00', pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) + idx = idx.tz_convert('US/Pacific') + expected = ['2010-12-01 05:00', '2010-12-02 05:00', pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Pacific')) + + idx = idx + np.timedelta64(3, 'h') + expected = ['2010-12-01 08:00', '2010-12-02 08:00', pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Pacific')) + + idx = idx.tz_convert('US/Eastern') + expected = ['2010-12-01 11:00', '2010-12-02 11:00', pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) + + @pytest.mark.parametrize('prefix', ['', 'dateutil/']) + def test_dti_tz_convert_compat_timestamp(self, prefix): + strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] + idx = DatetimeIndex(strdates, tz=prefix + 'US/Eastern') + + conv = idx[0].tz_convert(prefix + 'US/Pacific') + expected = idx.tz_convert(prefix + 'US/Pacific')[0] + + assert conv == expected + + def test_dti_tz_convert_hour_overflow_dst(self): + # Regression test for: + # https://github.com/pandas-dev/pandas/issues/13306 + + # sorted case US/Eastern -> UTC + ts = ['2008-05-12 09:50:00', + '2008-12-12 09:50:35', + '2009-05-12 09:50:32'] + tt = DatetimeIndex(ts).tz_localize('US/Eastern') + ut = tt.tz_convert('UTC') + expected = Index([13, 14, 13]) + tm.assert_index_equal(ut.hour, expected) + + # sorted case UTC -> US/Eastern + ts = ['2008-05-12 13:50:00', + '2008-12-12 14:50:35', + '2009-05-12 13:50:32'] + tt = DatetimeIndex(ts).tz_localize('UTC') + ut = tt.tz_convert('US/Eastern') + expected = Index([9, 9, 9]) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case US/Eastern -> UTC + ts = ['2008-05-12 09:50:00', + '2008-12-12 09:50:35', + '2008-05-12 09:50:32'] + tt = DatetimeIndex(ts).tz_localize('US/Eastern') + ut = tt.tz_convert('UTC') + expected = Index([13, 14, 13]) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case UTC -> US/Eastern + ts = ['2008-05-12 13:50:00', + '2008-12-12 14:50:35', + '2008-05-12 13:50:32'] + tt = DatetimeIndex(ts).tz_localize('UTC') + ut = tt.tz_convert('US/Eastern') + expected = Index([9, 9, 9]) + tm.assert_index_equal(ut.hour, expected) + + @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + def test_dti_tz_convert_hour_overflow_dst_timestamps(self, tz): + # Regression test for GH#13306 + + # sorted case US/Eastern -> UTC + ts = [Timestamp('2008-05-12 09:50:00', tz=tz), + Timestamp('2008-12-12 09:50:35', tz=tz), + Timestamp('2009-05-12 09:50:32', tz=tz)] + tt = DatetimeIndex(ts) + ut = tt.tz_convert('UTC') + expected = Index([13, 14, 13]) + tm.assert_index_equal(ut.hour, expected) + + # sorted case UTC -> US/Eastern + ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'), + Timestamp('2008-12-12 14:50:35', tz='UTC'), + Timestamp('2009-05-12 13:50:32', tz='UTC')] + tt = DatetimeIndex(ts) + ut = tt.tz_convert('US/Eastern') + expected = Index([9, 9, 9]) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case US/Eastern -> UTC + ts = [Timestamp('2008-05-12 09:50:00', tz=tz), + Timestamp('2008-12-12 09:50:35', tz=tz), + Timestamp('2008-05-12 09:50:32', tz=tz)] + tt = DatetimeIndex(ts) + ut = tt.tz_convert('UTC') + expected = Index([13, 14, 13]) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case UTC -> US/Eastern + ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'), + Timestamp('2008-12-12 14:50:35', 
tz='UTC'), + Timestamp('2008-05-12 13:50:32', tz='UTC')] + tt = DatetimeIndex(ts) + ut = tt.tz_convert('US/Eastern') + expected = Index([9, 9, 9]) + tm.assert_index_equal(ut.hour, expected) + + def test_dti_tz_convert_trans_pos_plus_1__bug(self): + # Regression test for tslib.tz_convert(vals, tz1, tz2). + # See https://github.com/pandas-dev/pandas/issues/4496 for details. + for freq, n in [('H', 1), ('T', 60), ('S', 3600)]: + idx = date_range(datetime(2011, 3, 26, 23), + datetime(2011, 3, 27, 1), freq=freq) + idx = idx.tz_localize('UTC') + idx = idx.tz_convert('Europe/Moscow') + + expected = np.repeat(np.array([3, 4, 5]), np.array([n, n, 1])) + tm.assert_index_equal(idx.hour, Index(expected)) + + def test_dti_tz_convert_dst(self): + for freq, n in [('H', 1), ('T', 60), ('S', 3600)]: + # Start DST + idx = date_range('2014-03-08 23:00', '2014-03-09 09:00', freq=freq, + tz='UTC') + idx = idx.tz_convert('US/Eastern') + expected = np.repeat(np.array([18, 19, 20, 21, 22, 23, + 0, 1, 3, 4, 5]), + np.array([n, n, n, n, n, n, n, n, n, n, 1])) + tm.assert_index_equal(idx.hour, Index(expected)) + + idx = date_range('2014-03-08 18:00', '2014-03-09 05:00', freq=freq, + tz='US/Eastern') + idx = idx.tz_convert('UTC') + expected = np.repeat(np.array([23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + np.array([n, n, n, n, n, n, n, n, n, n, 1])) + tm.assert_index_equal(idx.hour, Index(expected)) + + # End DST + idx = date_range('2014-11-01 23:00', '2014-11-02 09:00', freq=freq, + tz='UTC') + idx = idx.tz_convert('US/Eastern') + expected = np.repeat(np.array([19, 20, 21, 22, 23, + 0, 1, 1, 2, 3, 4]), + np.array([n, n, n, n, n, n, n, n, n, n, 1])) + tm.assert_index_equal(idx.hour, Index(expected)) + + idx = date_range('2014-11-01 18:00', '2014-11-02 05:00', freq=freq, + tz='US/Eastern') + idx = idx.tz_convert('UTC') + expected = np.repeat(np.array([22, 23, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10]), + np.array([n, n, n, n, n, n, n, n, n, + n, n, n, 1])) + tm.assert_index_equal(idx.hour, Index(expected)) + + # daily + # Start DST + idx = date_range('2014-03-08 00:00', '2014-03-09 00:00', freq='D', + tz='UTC') + idx = idx.tz_convert('US/Eastern') + tm.assert_index_equal(idx.hour, Index([19, 19])) + + idx = date_range('2014-03-08 00:00', '2014-03-09 00:00', freq='D', + tz='US/Eastern') + idx = idx.tz_convert('UTC') + tm.assert_index_equal(idx.hour, Index([5, 5])) + + # End DST + idx = date_range('2014-11-01 00:00', '2014-11-02 00:00', freq='D', + tz='UTC') + idx = idx.tz_convert('US/Eastern') + tm.assert_index_equal(idx.hour, Index([20, 20])) + + idx = date_range('2014-11-01 00:00', '2014-11-02 000:00', freq='D', + tz='US/Eastern') + idx = idx.tz_convert('UTC') + tm.assert_index_equal(idx.hour, Index([4, 4])) + + @pytest.mark.parametrize('tz', ['UTC', 'Asia/Tokyo', 'US/Eastern', + 'dateutil/US/Pacific']) + def test_tz_convert_roundtrip(self, tz): + idx1 = date_range(start='2014-01-01', end='2014-12-31', freq='M', + tz='UTC') + exp1 = date_range(start='2014-01-01', end='2014-12-31', freq='M') + + idx2 = date_range(start='2014-01-01', end='2014-12-31', freq='D', + tz='UTC') + exp2 = date_range(start='2014-01-01', end='2014-12-31', freq='D') + + idx3 = date_range(start='2014-01-01', end='2014-03-01', freq='H', + tz='UTC') + exp3 = date_range(start='2014-01-01', end='2014-03-01', freq='H') + + idx4 = date_range(start='2014-08-01', end='2014-10-31', freq='T', + tz='UTC') + exp4 = date_range(start='2014-08-01', end='2014-10-31', freq='T') + + for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3), + (idx4, exp4)]: + 
converted = idx.tz_convert(tz) + reset = converted.tz_convert(None) + tm.assert_index_equal(reset, expected) + assert reset.tzinfo is None + expected = converted.tz_convert('UTC').tz_localize(None) + tm.assert_index_equal(reset, expected) + + def test_dti_tz_convert_tzlocal(self): + # GH#13583 + # tz_convert doesn't affect to internal + dti = date_range(start='2001-01-01', end='2001-03-01', tz='UTC') + dti2 = dti.tz_convert(dateutil.tz.tzlocal()) + tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) + + dti = date_range(start='2001-01-01', end='2001-03-01', + tz=dateutil.tz.tzlocal()) + dti2 = dti.tz_convert(None) + tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) + + @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern', + pytz.timezone('US/Eastern'), + gettz('US/Eastern')]) + def test_dti_tz_convert_utc_to_local_no_modify(self, tz): + rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc') + rng_eastern = rng.tz_convert(tz) + + # Values are unmodified + tm.assert_numpy_array_equal(rng.asi8, rng_eastern.asi8) + + assert timezones.tz_compare(rng_eastern.tz, timezones.maybe_get_tz(tz)) + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_tz_convert_unsorted(self, tzstr): + dr = date_range('2012-03-09', freq='H', periods=100, tz='utc') + dr = dr.tz_convert(tzstr) + + result = dr[::-1].hour + exp = dr.hour[::-1] + tm.assert_almost_equal(result, exp) + + # ------------------------------------------------------------- + # DatetimeIndex.tz_localize + + def test_dti_tz_localize_nonexistent_raise_coerce(self): + # GH#13057 + times = ['2015-03-08 01:00', '2015-03-08 02:00', '2015-03-08 03:00'] + index = DatetimeIndex(times) + tz = 'US/Eastern' + with pytest.raises(pytz.NonExistentTimeError): + index.tz_localize(tz=tz) + + with pytest.raises(pytz.NonExistentTimeError): + index.tz_localize(tz=tz, errors='raise') + + result = index.tz_localize(tz=tz, errors='coerce') + test_times = ['2015-03-08 01:00-05:00', 'NaT', + '2015-03-08 03:00-04:00'] + dti = DatetimeIndex(test_times) + expected = dti.tz_localize('UTC').tz_convert('US/Eastern') + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), + gettz('US/Eastern')]) + def test_dti_tz_localize_ambiguous_infer(self, tz): + # November 6, 2011, fall back, repeat 2 AM hour + # With no repeated hours, we cannot infer the transition + dr = date_range(datetime(2011, 11, 6, 0), periods=5, + freq=pd.offsets.Hour()) + with pytest.raises(pytz.AmbiguousTimeError): + dr.tz_localize(tz) + + # With repeated hours, we can infer the transition + dr = date_range(datetime(2011, 11, 6, 0), periods=5, + freq=pd.offsets.Hour(), tz=tz) + times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', + '11/06/2011 02:00', '11/06/2011 03:00'] + di = DatetimeIndex(times) + localized = di.tz_localize(tz, ambiguous='infer') + tm.assert_index_equal(dr, localized) + with tm.assert_produces_warning(FutureWarning): + localized_old = di.tz_localize(tz, infer_dst=True) + tm.assert_index_equal(dr, localized_old) + tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz, + ambiguous='infer')) + + # When there is no dst transition, nothing special happens + dr = date_range(datetime(2011, 6, 1, 0), periods=10, + freq=pd.offsets.Hour()) + localized = dr.tz_localize(tz) + localized_infer = dr.tz_localize(tz, ambiguous='infer') + tm.assert_index_equal(localized, localized_infer) + with tm.assert_produces_warning(FutureWarning): + localized_infer_old = dr.tz_localize(tz, infer_dst=True) + 
tm.assert_index_equal(localized, localized_infer_old) + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), + gettz('US/Eastern')]) + def test_dti_tz_localize_ambiguous_times(self, tz): + # March 13, 2011, spring forward, skip from 2 AM to 3 AM + dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, + freq=pd.offsets.Hour()) + with pytest.raises(pytz.NonExistentTimeError): + dr.tz_localize(tz) + + # after dst transition, it works + dr = date_range(datetime(2011, 3, 13, 3, 30), periods=3, + freq=pd.offsets.Hour(), tz=tz) + + # November 6, 2011, fall back, repeat 2 AM hour + dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, + freq=pd.offsets.Hour()) + with pytest.raises(pytz.AmbiguousTimeError): + dr.tz_localize(tz) + + # UTC is OK + dr = date_range(datetime(2011, 3, 13), periods=48, + freq=pd.offsets.Minute(30), tz=pytz.utc) + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_dti_tz_localize_pass_dates_to_utc(self, tzstr): + strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] + + idx = DatetimeIndex(strdates) + conv = idx.tz_localize(tzstr) + + fromdates = DatetimeIndex(strdates, tz=tzstr) + + assert conv.tz == fromdates.tz + tm.assert_numpy_array_equal(conv.values, fromdates.values) + + @pytest.mark.parametrize('prefix', ['', 'dateutil/']) + def test_dti_tz_localize(self, prefix): + tzstr = prefix + 'US/Eastern' + dti = DatetimeIndex(start='1/1/2005', end='1/1/2005 0:00:30.256', + freq='L') + dti2 = dti.tz_localize(tzstr) + + dti_utc = DatetimeIndex(start='1/1/2005 05:00', + end='1/1/2005 5:00:30.256', freq='L', tz='utc') + + tm.assert_numpy_array_equal(dti2.values, dti_utc.values) + + dti3 = dti2.tz_convert(prefix + 'US/Pacific') + tm.assert_numpy_array_equal(dti3.values, dti_utc.values) + + dti = DatetimeIndex(start='11/6/2011 1:59', end='11/6/2011 2:00', + freq='L') + with pytest.raises(pytz.AmbiguousTimeError): + dti.tz_localize(tzstr) + + dti = DatetimeIndex(start='3/13/2011 1:59', end='3/13/2011 2:00', + freq='L') + with pytest.raises(pytz.NonExistentTimeError): + dti.tz_localize(tzstr) + + @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern', + pytz.timezone('US/Eastern'), + gettz('US/Eastern')]) + def test_dti_tz_localize_utc_conversion(self, tz): + # Localizing to time zone should: + # 1) check for DST ambiguities + # 2) convert to UTC + + rng = date_range('3/10/2012', '3/11/2012', freq='30T') + + converted = rng.tz_localize(tz) + expected_naive = rng + pd.offsets.Hour(5) + tm.assert_numpy_array_equal(converted.asi8, expected_naive.asi8) + + # DST ambiguity, this should fail + rng = date_range('3/11/2012', '3/12/2012', freq='30T') + # Is this really how it should fail?? 
+ with pytest.raises(pytz.NonExistentTimeError): + rng.tz_localize(tz) + + @pytest.mark.parametrize('tz', ['UTC', 'Asia/Tokyo', 'US/Eastern', + 'dateutil/US/Pacific']) + def test_dti_tz_localize_roundtrip(self, tz): + idx1 = date_range(start='2014-01-01', end='2014-12-31', freq='M') + idx2 = date_range(start='2014-01-01', end='2014-12-31', freq='D') + idx3 = date_range(start='2014-01-01', end='2014-03-01', freq='H') + idx4 = date_range(start='2014-08-01', end='2014-10-31', freq='T') + for idx in [idx1, idx2, idx3, idx4]: + localized = idx.tz_localize(tz) + expected = date_range(start=idx[0], end=idx[-1], freq=idx.freq, + tz=tz) + tm.assert_index_equal(localized, expected) + + with pytest.raises(TypeError): + localized.tz_localize(tz) + + reset = localized.tz_localize(None) + tm.assert_index_equal(reset, idx) + assert reset.tzinfo is None + + def test_dti_tz_localize_naive(self): + rng = date_range('1/1/2011', periods=100, freq='H') + + conv = rng.tz_localize('US/Pacific') + exp = date_range('1/1/2011', periods=100, freq='H', tz='US/Pacific') + + tm.assert_index_equal(conv, exp) + + def test_dti_tz_localize_tzlocal(self): + # GH#13583 + offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1)) + offset = int(offset.total_seconds() * 1000000000) + + dti = date_range(start='2001-01-01', end='2001-03-01') + dti2 = dti.tz_localize(dateutil.tz.tzlocal()) + tm.assert_numpy_array_equal(dti2.asi8 + offset, dti.asi8) + + dti = date_range(start='2001-01-01', end='2001-03-01', + tz=dateutil.tz.tzlocal()) + dti2 = dti.tz_localize(None) + tm.assert_numpy_array_equal(dti2.asi8 - offset, dti.asi8) + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), + gettz('US/Eastern')]) + def test_dti_tz_localize_ambiguous_nat(self, tz): + times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', + '11/06/2011 02:00', '11/06/2011 03:00'] + di = DatetimeIndex(times) + localized = di.tz_localize(tz, ambiguous='NaT') + + times = ['11/06/2011 00:00', np.NaN, np.NaN, '11/06/2011 02:00', + '11/06/2011 03:00'] + di_test = DatetimeIndex(times, tz='US/Eastern') + + # left dtype is datetime64[ns, US/Eastern] + # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')] + tm.assert_numpy_array_equal(di_test.values, localized.values) + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), + gettz('US/Eastern')]) + def test_dti_tz_localize_ambiguous_flags(self, tz): + # November 6, 2011, fall back, repeat 2 AM hour + + # Pass in flags to determine right dst transition + dr = date_range(datetime(2011, 11, 6, 0), periods=5, + freq=pd.offsets.Hour(), tz=tz) + times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', + '11/06/2011 02:00', '11/06/2011 03:00'] + + # Test tz_localize + di = DatetimeIndex(times) + is_dst = [1, 1, 0, 0, 0] + localized = di.tz_localize(tz, ambiguous=is_dst) + tm.assert_index_equal(dr, localized) + tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz, + ambiguous=is_dst)) + + localized = di.tz_localize(tz, ambiguous=np.array(is_dst)) + tm.assert_index_equal(dr, localized) + + localized = di.tz_localize(tz, + ambiguous=np.array(is_dst).astype('bool')) + tm.assert_index_equal(dr, localized) + + # Test constructor + localized = DatetimeIndex(times, tz=tz, ambiguous=is_dst) + tm.assert_index_equal(dr, localized) + + # Test duplicate times where infer_dst fails + times += times + di = DatetimeIndex(times) + + # When the sizes are incompatible, make sure error is raised + with pytest.raises(Exception): + di.tz_localize(tz, ambiguous=is_dst) + + # When sizes 
are compatible and there are repeats ('infer' won't work) + is_dst = np.hstack((is_dst, is_dst)) + localized = di.tz_localize(tz, ambiguous=is_dst) + dr = dr.append(dr) + tm.assert_index_equal(dr, localized) + + # When there is no dst transition, nothing special happens + dr = date_range(datetime(2011, 6, 1, 0), periods=10, + freq=pd.offsets.Hour()) + is_dst = np.array([1] * 10) + localized = dr.tz_localize(tz) + localized_is_dst = dr.tz_localize(tz, ambiguous=is_dst) + tm.assert_index_equal(localized, localized_is_dst) + + # TODO: belongs outside tz_localize tests? + @pytest.mark.parametrize('tz', ['Europe/London', 'dateutil/Europe/London']) + def test_dti_construction_ambiguous_endpoint(self, tz): + # construction with an ambiguous end-point + # GH#11626 + + # FIXME: This next block fails to raise; it was taken from an older + # version of this test that had an indention mistake that caused it + # to not get executed. + # with pytest.raises(pytz.AmbiguousTimeError): + # date_range("2013-10-26 23:00", "2013-10-27 01:00", + # tz="Europe/London", freq="H") + + times = date_range("2013-10-26 23:00", "2013-10-27 01:00", freq="H", + tz=tz, ambiguous='infer') + assert times[0] == Timestamp('2013-10-26 23:00', tz=tz, freq="H") + + if str(tz).startswith('dateutil'): + if LooseVersion(dateutil.__version__) < LooseVersion('2.6.0'): + # see GH#14621 + assert times[-1] == Timestamp('2013-10-27 01:00:00+0000', + tz=tz, freq="H") + elif LooseVersion(dateutil.__version__) > LooseVersion('2.6.0'): + # fixed ambiguous behavior + assert times[-1] == Timestamp('2013-10-27 01:00:00+0100', + tz=tz, freq="H") + else: + assert times[-1] == Timestamp('2013-10-27 01:00:00+0000', + tz=tz, freq="H") + + def test_dti_tz_localize_bdate_range(self): + dr = pd.bdate_range('1/1/2009', '1/1/2010') + dr_utc = pd.bdate_range('1/1/2009', '1/1/2010', tz=pytz.utc) + localized = dr.tz_localize(pytz.utc) + tm.assert_index_equal(dr_utc, localized) + + # ------------------------------------------------------------- + # DatetimeIndex.normalize + + def test_normalize_tz(self): + rng = date_range('1/1/2000 9:30', periods=10, freq='D', + tz='US/Eastern') + + result = rng.normalize() + expected = date_range('1/1/2000', periods=10, freq='D', + tz='US/Eastern') + tm.assert_index_equal(result, expected) + + assert result.is_normalized + assert not rng.is_normalized + + rng = date_range('1/1/2000 9:30', periods=10, freq='D', tz='UTC') + + result = rng.normalize() + expected = date_range('1/1/2000', periods=10, freq='D', tz='UTC') + tm.assert_index_equal(result, expected) + + assert result.is_normalized + assert not rng.is_normalized + + rng = date_range('1/1/2000 9:30', periods=10, freq='D', tz=tzlocal()) + result = rng.normalize() + expected = date_range('1/1/2000', periods=10, freq='D', tz=tzlocal()) + tm.assert_index_equal(result, expected) + + assert result.is_normalized + assert not rng.is_normalized + + @td.skip_if_windows + @pytest.mark.parametrize('timezone', ['US/Pacific', 'US/Eastern', 'UTC', + 'Asia/Kolkata', 'Asia/Shanghai', + 'Australia/Canberra']) + def test_normalize_tz_local(self, timezone): + # GH#13459 + with tm.set_timezone(timezone): + rng = date_range('1/1/2000 9:30', periods=10, freq='D', + tz=tzlocal()) + + result = rng.normalize() + expected = date_range('1/1/2000', periods=10, freq='D', + tz=tzlocal()) + tm.assert_index_equal(result, expected) + + assert result.is_normalized + assert not rng.is_normalized + + # ------------------------------------------------------------ + # DatetimeIndex.__new__ + + 
@pytest.mark.parametrize('prefix', ['', 'dateutil/']) + def test_dti_constructor_static_tzinfo(self, prefix): + # it works! + index = DatetimeIndex([datetime(2012, 1, 1)], tz=prefix + 'EST') + index.hour + index[0] + + def test_dti_constructor_with_fixed_tz(self): + off = FixedOffset(420, '+07:00') + start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) + end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) + rng = date_range(start=start, end=end) + assert off == rng.tz + + rng2 = date_range(start, periods=len(rng), tz=off) + tm.assert_index_equal(rng, rng2) + + rng3 = date_range('3/11/2012 05:00:00+07:00', + '6/11/2012 05:00:00+07:00') + assert (rng.values == rng3.values).all() + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_dti_convert_datetime_list(self, tzstr): + dr = date_range('2012-06-02', periods=10, + tz=tzstr, name='foo') + dr2 = DatetimeIndex(list(dr), name='foo') + tm.assert_index_equal(dr, dr2) + assert dr.tz == dr2.tz + assert dr2.name == 'foo' + + def test_dti_construction_univalent(self): + rng = date_range('03/12/2012 00:00', periods=10, freq='W-FRI', + tz='US/Eastern') + rng2 = DatetimeIndex(data=rng, tz='US/Eastern') + tm.assert_index_equal(rng, rng2) + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), + gettz('US/Eastern')]) + def test_dti_from_tzaware_datetime(self, tz): + d = [datetime(2012, 8, 19, tzinfo=tz)] + + index = DatetimeIndex(d) + assert timezones.tz_compare(index.tz, tz) + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_dti_tz_constructors(self, tzstr): + """ Test different DatetimeIndex constructions with timezone + Follow-up of GH#4229 + """ + + arr = ['11/10/2005 08:00:00', '11/10/2005 09:00:00'] + + idx1 = to_datetime(arr).tz_localize(tzstr) + idx2 = DatetimeIndex(start="2005-11-10 08:00:00", freq='H', periods=2, + tz=tzstr) + idx3 = DatetimeIndex(arr, tz=tzstr) + idx4 = DatetimeIndex(np.array(arr), tz=tzstr) + + for other in [idx2, idx3, idx4]: + tm.assert_index_equal(idx1, other) + + # ------------------------------------------------------------- + # Unsorted + + def test_join_utc_convert(self): + rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + + left = rng.tz_convert('US/Eastern') + right = rng.tz_convert('Europe/Berlin') + + for how in ['inner', 'outer', 'left', 'right']: + result = left.join(left[:-5], how=how) + assert isinstance(result, DatetimeIndex) + assert result.tz == left.tz + + result = left.join(right[:-5], how=how) + assert isinstance(result, DatetimeIndex) + assert result.tz.zone == 'UTC' + + def test_dti_drop_dont_lose_tz(self): + # GH#2621 + ind = date_range("2012-12-01", periods=10, tz="utc") + ind = ind.drop(ind[-1]) + + assert ind.tz is not None + + def test_date_range_localize(self): + rng = date_range('3/11/2012 03:00', periods=15, freq='H', + tz='US/Eastern') + rng2 = DatetimeIndex(['3/11/2012 03:00', '3/11/2012 04:00'], + tz='US/Eastern') + rng3 = date_range('3/11/2012 03:00', periods=15, freq='H') + rng3 = rng3.tz_localize('US/Eastern') + + tm.assert_index_equal(rng, rng3) + + # DST transition time + val = rng[0] + exp = Timestamp('3/11/2012 03:00', tz='US/Eastern') + + assert val.hour == 3 + assert exp.hour == 3 + assert val == exp # same UTC value + tm.assert_index_equal(rng[:2], rng2) + + # Right before the DST transition + rng = date_range('3/11/2012 00:00', periods=2, freq='H', + tz='US/Eastern') + rng2 = DatetimeIndex(['3/11/2012 00:00', '3/11/2012 01:00'], + tz='US/Eastern') + tm.assert_index_equal(rng, rng2) + exp = 
Timestamp('3/11/2012 00:00', tz='US/Eastern') + assert exp.hour == 0 + assert rng[0] == exp + exp = Timestamp('3/11/2012 01:00', tz='US/Eastern') + assert exp.hour == 1 + assert rng[1] == exp + + rng = date_range('3/11/2012 00:00', periods=10, freq='H', + tz='US/Eastern') + assert rng[2].hour == 3 + + def test_timestamp_equality_different_timezones(self): + utc_range = date_range('1/1/2000', periods=20, tz='UTC') + eastern_range = utc_range.tz_convert('US/Eastern') + berlin_range = utc_range.tz_convert('Europe/Berlin') + + for a, b, c in zip(utc_range, eastern_range, berlin_range): + assert a == b + assert b == c + assert a == c + + assert (utc_range == eastern_range).all() + assert (utc_range == berlin_range).all() + assert (berlin_range == eastern_range).all() + + def test_dti_intersection(self): + rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + + left = rng[10:90][::-1] + right = rng[20:80][::-1] + + assert left.tz == rng.tz + result = left.intersection(right) + assert result.tz == left.tz + + def test_dti_equals_with_tz(self): + left = date_range('1/1/2011', periods=100, freq='H', tz='utc') + right = date_range('1/1/2011', periods=100, freq='H', tz='US/Eastern') + + assert not left.equals(right) + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_dti_tz_nat(self, tzstr): + idx = DatetimeIndex([Timestamp("2013-1-1", tz=tzstr), pd.NaT]) + + assert isna(idx[1]) + assert idx[0].tzinfo is not None + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_dti_astype_asobject_tzinfos(self, tzstr): + # GH#1345 + + # dates around a dst transition + rng = date_range('2/13/2010', '5/6/2010', tz=tzstr) + + objs = rng.astype(object) + for i, x in enumerate(objs): + exval = rng[i] + assert x == exval + assert x.tzinfo == exval.tzinfo + + objs = rng.astype(object) + for i, x in enumerate(objs): + exval = rng[i] + assert x == exval + assert x.tzinfo == exval.tzinfo + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_dti_with_timezone_repr(self, tzstr): + rng = date_range('4/13/2010', '5/6/2010') + + rng_eastern = rng.tz_localize(tzstr) + + rng_repr = repr(rng_eastern) + assert '2010-04-13 00:00:00' in rng_repr + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_dti_take_dont_lose_meta(self, tzstr): + rng = date_range('1/1/2000', periods=20, tz=tzstr) + + result = rng.take(lrange(5)) + assert result.tz == rng.tz + assert result.freq == rng.freq + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_utc_box_timestamp_and_localize(self, tzstr): + tz = timezones.maybe_get_tz(tzstr) + + rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc') + rng_eastern = rng.tz_convert(tzstr) + + expected = rng[-1].astimezone(tz) + + stamp = rng_eastern[-1] + assert stamp == expected + assert stamp.tzinfo == expected.tzinfo + + # right tzinfo + rng = date_range('3/13/2012', '3/14/2012', freq='H', tz='utc') + rng_eastern = rng.tz_convert(tzstr) + # test not valid for dateutil timezones. 
+ # assert 'EDT' in repr(rng_eastern[0].tzinfo) + assert ('EDT' in repr(rng_eastern[0].tzinfo) or + 'tzfile' in repr(rng_eastern[0].tzinfo)) + + def test_dti_to_pydatetime(self): + dt = dateutil.parser.parse('2012-06-13T01:39:00Z') + dt = dt.replace(tzinfo=tzlocal()) + + arr = np.array([dt], dtype=object) + + result = to_datetime(arr, utc=True) + assert result.tz is pytz.utc + + rng = date_range('2012-11-03 03:00', '2012-11-05 03:00', tz=tzlocal()) + arr = rng.to_pydatetime() + result = to_datetime(arr, utc=True) + assert result.tz is pytz.utc + + def test_dti_to_pydatetime_fixedtz(self): + dates = np.array([datetime(2000, 1, 1, tzinfo=fixed_off), + datetime(2000, 1, 2, tzinfo=fixed_off), + datetime(2000, 1, 3, tzinfo=fixed_off)]) + dti = DatetimeIndex(dates) + + result = dti.to_pydatetime() + tm.assert_numpy_array_equal(dates, result) + + result = dti._mpl_repr() + tm.assert_numpy_array_equal(dates, result) + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Central'), + gettz('US/Central')]) + def test_with_tz(self, tz): + # just want it to work + start = datetime(2011, 3, 12, tzinfo=pytz.utc) + dr = bdate_range(start, periods=50, freq=pd.offsets.Hour()) + assert dr.tz is pytz.utc + + # DateRange with naive datetimes + dr = bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc) + dr = bdate_range('1/1/2005', '1/1/2009', tz=tz) + + # normalized + central = dr.tz_convert(tz) + assert central.tz is tz + naive = central[0].to_pydatetime().replace(tzinfo=None) + comp = tslib._localize_pydatetime(naive, tz).tzinfo + assert central[0].tz is comp + + # compare vs a localized tz + naive = dr[0].to_pydatetime().replace(tzinfo=None) + comp = tslib._localize_pydatetime(naive, tz).tzinfo + assert central[0].tz is comp + + # datetimes with tzinfo set + dr = bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), + datetime(2009, 1, 1, tzinfo=pytz.utc)) + with pytest.raises(Exception): + bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), '1/1/2009', + tz=tz) + + @pytest.mark.parametrize('prefix', ['', 'dateutil/']) + def test_field_access_localize(self, prefix): + strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] + rng = DatetimeIndex(strdates, tz=prefix + 'US/Eastern') + assert (rng.hour == 0).all() + + # a more unusual time zone, #1946 + dr = date_range('2011-10-02 00:00', freq='h', periods=10, + tz=prefix + 'America/Atikokan') + + expected = Index(np.arange(10, dtype=np.int64)) + tm.assert_index_equal(dr.hour, expected) + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), + gettz('US/Eastern')]) + def test_dti_convert_tz_aware_datetime_datetime(self, tz): + # GH#1581 + dates = [datetime(2000, 1, 1), datetime(2000, 1, 2), + datetime(2000, 1, 3)] + + dates_aware = [tslib._localize_pydatetime(x, tz) for x in dates] + result = DatetimeIndex(dates_aware) + assert timezones.tz_compare(result.tz, tz) + + converted = to_datetime(dates_aware, utc=True) + ex_vals = np.array([Timestamp(x).value for x in dates_aware]) + tm.assert_numpy_array_equal(converted.asi8, ex_vals) + assert converted.tz is pytz.utc + + def test_dti_union_aware(self): + # non-overlapping + rng = date_range("2012-11-15 00:00:00", periods=6, freq="H", + tz="US/Central") + + rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="H", + tz="US/Eastern") + + result = rng.union(rng2) + assert result.tz.zone == 'UTC' + + +class TestDateRange(object): + """Tests for date_range with timezones""" + def test_hongkong_tz_convert(self): + # GH#1673 smoke test + dr = date_range('2012-01-01', '2012-01-10', freq='D', tz='Hongkong') + + # it works! 
+ dr.hour + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_date_range_span_dst_transition(self, tzstr): + # GH#1778 + + # Standard -> Daylight Savings Time + dr = date_range('03/06/2012 00:00', periods=200, freq='W-FRI', + tz='US/Eastern') + + assert (dr.hour == 0).all() + + dr = date_range('2012-11-02', periods=10, tz=tzstr) + assert (dr.hour == 0).all() + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_date_range_timezone_str_argument(self, tzstr): + tz = timezones.maybe_get_tz(tzstr) + result = date_range('1/1/2000', periods=10, tz=tzstr) + expected = date_range('1/1/2000', periods=10, tz=tz) + + tm.assert_index_equal(result, expected) + + def test_date_range_with_fixedoffset_noname(self): + off = fixed_off_no_name + start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) + end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) + rng = date_range(start=start, end=end) + assert off == rng.tz + + idx = Index([start, end]) + assert off == idx.tz + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_date_range_with_tz(self, tzstr): + stamp = Timestamp('3/11/2012 05:00', tz=tzstr) + assert stamp.hour == 5 + + rng = date_range('3/11/2012 04:00', periods=10, freq='H', + tz=tzstr) + + assert stamp == rng[1] + + +class TestToDatetime(object): + """Tests for the to_datetime constructor with timezones""" + def test_to_datetime_utc(self): + arr = np.array([dateutil.parser.parse('2012-06-13T01:39:00Z')], + dtype=object) + + result = to_datetime(arr, utc=True) + assert result.tz is pytz.utc + + def test_to_datetime_fixed_offset(self): + dates = [datetime(2000, 1, 1, tzinfo=fixed_off), + datetime(2000, 1, 2, tzinfo=fixed_off), + datetime(2000, 1, 3, tzinfo=fixed_off)] + result = to_datetime(dates) + assert result.tz == fixed_off diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 44f3c21d23e62..b5926933544e8 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -17,8 +17,8 @@ from pandas._libs.tslibs import parsing from pandas.core.tools import datetimes as tools -from pandas.compat import lmap -from pandas.compat.numpy import np_array_datetime64_compat +from pandas.errors import OutOfBoundsDatetime +from pandas.compat import lmap, PY3 from pandas.core.dtypes.common import is_datetime64_ns_dtype from pandas.util import testing as tm import pandas.util._test_decorators as td @@ -186,6 +186,18 @@ def test_to_datetime_format_weeks(self, cache): class TestToDatetime(object): + def test_to_datetime_pydatetime(self): + actual = pd.to_datetime(datetime(2008, 1, 15)) + assert actual == datetime(2008, 1, 15) + + def test_to_datetime_YYYYMMDD(self): + actual = pd.to_datetime('20080115') + assert actual == datetime(2008, 1, 15) + + def test_to_datetime_unparseable_ignore(self): + # unparseable + s = 'Month 1, 1999' + assert pd.to_datetime(s, errors='ignore') == s @td.skip_if_windows # `tm.set_timezone` does not work in windows def test_to_datetime_now(self): @@ -237,6 +249,13 @@ def test_to_datetime_today(self): assert pdtoday.tzinfo is None assert pdtoday2.tzinfo is None + def test_to_datetime_today_now_unicode_bytes(self): + to_datetime([u'now']) + to_datetime([u'today']) + if not PY3: + to_datetime(['now']) + to_datetime(['today']) + @pytest.mark.parametrize('cache', [True, False]) def test_to_datetime_dt64s(self, cache): in_bound_dts = [ @@ -783,6 +802,14 @@ def test_dataframe_dtypes(self, 
cache): class TestToDatetimeMisc(object): + def test_to_datetime_barely_out_of_bounds(self): + # GH#19529 + # GH#19382 close enough to bounds that dropping nanos would result + # in an in-bounds datetime + arr = np.array(['2262-04-11 23:47:16.854775808'], dtype=object) + + with pytest.raises(OutOfBoundsDatetime): + to_datetime(arr) @pytest.mark.parametrize('cache', [True, False]) def test_to_datetime_iso8601(self, cache): @@ -1445,157 +1472,6 @@ def test_parsers_timezone_minute_offsets_roundtrip(self, cache): converted_time = dt_time.tz_localize('UTC').tz_convert(tz) assert dt_string_repr == repr(converted_time) - def test_parsers_iso8601(self): - # GH 12060 - # test only the iso parser - flexibility to different - # separators and leadings 0s - # Timestamp construction falls back to dateutil - cases = {'2011-01-02': datetime(2011, 1, 2), - '2011-1-2': datetime(2011, 1, 2), - '2011-01': datetime(2011, 1, 1), - '2011-1': datetime(2011, 1, 1), - '2011 01 02': datetime(2011, 1, 2), - '2011.01.02': datetime(2011, 1, 2), - '2011/01/02': datetime(2011, 1, 2), - '2011\\01\\02': datetime(2011, 1, 2), - '2013-01-01 05:30:00': datetime(2013, 1, 1, 5, 30), - '2013-1-1 5:30:00': datetime(2013, 1, 1, 5, 30)} - for date_str, exp in compat.iteritems(cases): - actual = tslib._test_parse_iso8601(date_str) - assert actual == exp - - # separators must all match - YYYYMM not valid - invalid_cases = ['2011-01/02', '2011^11^11', - '201401', '201111', '200101', - # mixed separated and unseparated - '2005-0101', '200501-01', - '20010101 12:3456', '20010101 1234:56', - # HHMMSS must have two digits in each component - # if unseparated - '20010101 1', '20010101 123', '20010101 12345', - '20010101 12345Z', - # wrong separator for HHMMSS - '2001-01-01 12-34-56'] - for date_str in invalid_cases: - with pytest.raises(ValueError): - tslib._test_parse_iso8601(date_str) - # If no ValueError raised, let me know which case failed. 
- raise Exception(date_str) - - -class TestArrayToDatetime(object): - def test_parsing_valid_dates(self): - arr = np.array(['01-01-2013', '01-02-2013'], dtype=object) - tm.assert_numpy_array_equal( - tslib.array_to_datetime(arr), - np_array_datetime64_compat( - [ - '2013-01-01T00:00:00.000000000-0000', - '2013-01-02T00:00:00.000000000-0000' - ], - dtype='M8[ns]' - ) - ) - - arr = np.array(['Mon Sep 16 2013', 'Tue Sep 17 2013'], dtype=object) - tm.assert_numpy_array_equal( - tslib.array_to_datetime(arr), - np_array_datetime64_compat( - [ - '2013-09-16T00:00:00.000000000-0000', - '2013-09-17T00:00:00.000000000-0000' - ], - dtype='M8[ns]' - ) - ) - - def test_parsing_timezone_offsets(self): - # All of these datetime strings with offsets are equivalent - # to the same datetime after the timezone offset is added - dt_strings = [ - '01-01-2013 08:00:00+08:00', - '2013-01-01T08:00:00.000000000+0800', - '2012-12-31T16:00:00.000000000-0800', - '12-31-2012 23:00:00-01:00' - ] - - expected_output = tslib.array_to_datetime(np.array( - ['01-01-2013 00:00:00'], dtype=object)) - - for dt_string in dt_strings: - tm.assert_numpy_array_equal( - tslib.array_to_datetime( - np.array([dt_string], dtype=object) - ), - expected_output - ) - - def test_number_looking_strings_not_into_datetime(self): - # #4601 - # These strings don't look like datetimes so they shouldn't be - # attempted to be converted - arr = np.array(['-352.737091', '183.575577'], dtype=object) - tm.assert_numpy_array_equal( - tslib.array_to_datetime(arr, errors='ignore'), arr) - - arr = np.array(['1', '2', '3', '4', '5'], dtype=object) - tm.assert_numpy_array_equal( - tslib.array_to_datetime(arr, errors='ignore'), arr) - - def test_coercing_dates_outside_of_datetime64_ns_bounds(self): - invalid_dates = [ - date(1000, 1, 1), - datetime(1000, 1, 1), - '1000-01-01', - 'Jan 1, 1000', - np.datetime64('1000-01-01'), - ] - - for invalid_date in invalid_dates: - pytest.raises(ValueError, - tslib.array_to_datetime, - np.array([invalid_date], dtype='object'), - errors='raise', ) - tm.assert_numpy_array_equal( - tslib.array_to_datetime( - np.array([invalid_date], dtype='object'), - errors='coerce'), - np.array([tslib.iNaT], dtype='M8[ns]') - ) - - arr = np.array(['1/1/1000', '1/1/2000'], dtype=object) - tm.assert_numpy_array_equal( - tslib.array_to_datetime(arr, errors='coerce'), - np_array_datetime64_compat( - [ - tslib.iNaT, - '2000-01-01T00:00:00.000000000-0000' - ], - dtype='M8[ns]' - ) - ) - - def test_coerce_of_invalid_datetimes(self): - arr = np.array(['01-01-2013', 'not_a_date', '1'], dtype=object) - - # Without coercing, the presence of any invalid dates prevents - # any values from being converted - tm.assert_numpy_array_equal( - tslib.array_to_datetime(arr, errors='ignore'), arr) - - # With coercing, the invalid dates becomes iNaT - tm.assert_numpy_array_equal( - tslib.array_to_datetime(arr, errors='coerce'), - np_array_datetime64_compat( - [ - '2013-01-01T00:00:00.000000000-0000', - tslib.iNaT, - tslib.iNaT - ], - dtype='M8[ns]' - ) - ) - def test_normalize_date(): value = date(2012, 9, 7) diff --git a/pandas/tests/indexes/period/test_arithmetic.py b/pandas/tests/indexes/period/test_arithmetic.py index 356ea5fc656de..81171920f635f 100644 --- a/pandas/tests/indexes/period/test_arithmetic.py +++ b/pandas/tests/indexes/period/test_arithmetic.py @@ -11,6 +11,171 @@ import pandas.core.indexes.period as period +class TestPeriodIndexComparisons(object): + @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + def test_pi_cmp_pi(self, freq): + base = 
PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], + freq=freq) + per = Period('2011-02', freq=freq) + + exp = np.array([False, True, False, False]) + tm.assert_numpy_array_equal(base == per, exp) + tm.assert_numpy_array_equal(per == base, exp) + + exp = np.array([True, False, True, True]) + tm.assert_numpy_array_equal(base != per, exp) + tm.assert_numpy_array_equal(per != base, exp) + + exp = np.array([False, False, True, True]) + tm.assert_numpy_array_equal(base > per, exp) + tm.assert_numpy_array_equal(per < base, exp) + + exp = np.array([True, False, False, False]) + tm.assert_numpy_array_equal(base < per, exp) + tm.assert_numpy_array_equal(per > base, exp) + + exp = np.array([False, True, True, True]) + tm.assert_numpy_array_equal(base >= per, exp) + tm.assert_numpy_array_equal(per <= base, exp) + + exp = np.array([True, True, False, False]) + tm.assert_numpy_array_equal(base <= per, exp) + tm.assert_numpy_array_equal(per >= base, exp) + + idx = PeriodIndex(['2011-02', '2011-01', '2011-03', '2011-05'], + freq=freq) + + exp = np.array([False, False, True, False]) + tm.assert_numpy_array_equal(base == idx, exp) + + exp = np.array([True, True, False, True]) + tm.assert_numpy_array_equal(base != idx, exp) + + exp = np.array([False, True, False, False]) + tm.assert_numpy_array_equal(base > idx, exp) + + exp = np.array([True, False, False, True]) + tm.assert_numpy_array_equal(base < idx, exp) + + exp = np.array([False, True, True, False]) + tm.assert_numpy_array_equal(base >= idx, exp) + + exp = np.array([True, False, True, True]) + tm.assert_numpy_array_equal(base <= idx, exp) + + @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + def test_pi_cmp_pi_mismatched_freq_raises(self, freq): + # different base freq + base = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], + freq=freq) + + msg = "Input has different freq=A-DEC from PeriodIndex" + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + base <= Period('2011', freq='A') + + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + Period('2011', freq='A') >= base + + idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='A') + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + base <= idx + + # Different frequency + msg = "Input has different freq=4M from PeriodIndex" + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + base <= Period('2011', freq='4M') + + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + Period('2011', freq='4M') >= base + + idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='4M') + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + base <= idx + + @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + def test_pi_cmp_nat(self, freq): + idx1 = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-05'], freq=freq) + + result = idx1 > Period('2011-02', freq=freq) + exp = np.array([False, False, False, True]) + tm.assert_numpy_array_equal(result, exp) + result = Period('2011-02', freq=freq) < idx1 + tm.assert_numpy_array_equal(result, exp) + + result = idx1 == Period('NaT', freq=freq) + exp = np.array([False, False, False, False]) + tm.assert_numpy_array_equal(result, exp) + result = Period('NaT', freq=freq) == idx1 + tm.assert_numpy_array_equal(result, exp) + + result = idx1 != Period('NaT', freq=freq) + exp = np.array([True, True, True, True]) + tm.assert_numpy_array_equal(result, exp) + result = Period('NaT', freq=freq) != idx1 + tm.assert_numpy_array_equal(result, exp) + + idx2 = PeriodIndex(['2011-02', '2011-01', 
'2011-04', 'NaT'], freq=freq) + result = idx1 < idx2 + exp = np.array([True, False, False, False]) + tm.assert_numpy_array_equal(result, exp) + + result = idx1 == idx2 + exp = np.array([False, False, False, False]) + tm.assert_numpy_array_equal(result, exp) + + result = idx1 != idx2 + exp = np.array([True, True, True, True]) + tm.assert_numpy_array_equal(result, exp) + + result = idx1 == idx1 + exp = np.array([True, True, False, True]) + tm.assert_numpy_array_equal(result, exp) + + result = idx1 != idx1 + exp = np.array([False, False, True, False]) + tm.assert_numpy_array_equal(result, exp) + + @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + def test_pi_cmp_nat_mismatched_freq_raises(self, freq): + idx1 = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-05'], freq=freq) + + diff = PeriodIndex(['2011-02', '2011-01', '2011-04', 'NaT'], freq='4M') + msg = "Input has different freq=4M from PeriodIndex" + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + idx1 > diff + + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + idx1 == diff + + # TODO: De-duplicate with test_pi_cmp_nat + def test_comp_nat(self): + left = pd.PeriodIndex([pd.Period('2011-01-01'), pd.NaT, + pd.Period('2011-01-03')]) + right = pd.PeriodIndex([pd.NaT, pd.NaT, pd.Period('2011-01-03')]) + + for lhs, rhs in [(left, right), + (left.astype(object), right.astype(object))]: + result = lhs == rhs + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = lhs != rhs + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(lhs == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == rhs, expected) + + expected = np.array([True, True, True]) + tm.assert_numpy_array_equal(lhs != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != lhs, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(lhs < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > lhs, expected) + + class TestPeriodIndexArithmetic(object): def test_pi_add_offset_array(self): # GH#18849 @@ -250,6 +415,97 @@ def test_sub_isub(self): rng -= 1 tm.assert_index_equal(rng, expected) + # --------------------------------------------------------------- + # PeriodIndex.shift is used by __add__ and __sub__ + + def test_pi_shift_ndarray(self): + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='M', name='idx') + result = idx.shift(np.array([1, 2, 3, 4])) + expected = PeriodIndex(['2011-02', '2011-04', 'NaT', + '2011-08'], freq='M', name='idx') + tm.assert_index_equal(result, expected) + + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='M', name='idx') + result = idx.shift(np.array([1, -2, 3, -4])) + expected = PeriodIndex(['2011-02', '2010-12', 'NaT', + '2010-12'], freq='M', name='idx') + tm.assert_index_equal(result, expected) + + def test_shift(self): + pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='A', start='1/1/2002', end='12/1/2010') + + tm.assert_index_equal(pi1.shift(0), pi1) + + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(1), pi2) + + pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='A', start='1/1/2000', end='12/1/2008') + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(-1), pi2) + + pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='M', 
start='2/1/2001', end='1/1/2010') + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(1), pi2) + + pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='M', start='12/1/2000', end='11/1/2009') + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(-1), pi2) + + pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='D', start='1/2/2001', end='12/2/2009') + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(1), pi2) + + pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='D', start='12/31/2000', end='11/30/2009') + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(-1), pi2) + + def test_shift_corner_cases(self): + # GH#9903 + idx = pd.PeriodIndex([], name='xxx', freq='H') + + with pytest.raises(TypeError): + # period shift doesn't accept freq + idx.shift(1, freq='H') + + tm.assert_index_equal(idx.shift(0), idx) + tm.assert_index_equal(idx.shift(3), idx) + + idx = pd.PeriodIndex(['2011-01-01 10:00', '2011-01-01 11:00', + '2011-01-01 12:00'], name='xxx', freq='H') + tm.assert_index_equal(idx.shift(0), idx) + exp = pd.PeriodIndex(['2011-01-01 13:00', '2011-01-01 14:00', + '2011-01-01 15:00'], name='xxx', freq='H') + tm.assert_index_equal(idx.shift(3), exp) + exp = pd.PeriodIndex(['2011-01-01 07:00', '2011-01-01 08:00', + '2011-01-01 09:00'], name='xxx', freq='H') + tm.assert_index_equal(idx.shift(-3), exp) + + def test_shift_nat(self): + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='M', name='idx') + result = idx.shift(1) + expected = PeriodIndex(['2011-02', '2011-03', 'NaT', + '2011-05'], freq='M', name='idx') + tm.assert_index_equal(result, expected) + assert result.name == expected.name + + def test_shift_gh8083(self): + # test shift for PeriodIndex + # GH#8083 + drange = pd.period_range('20130101', periods=5, freq='D') + result = drange.shift(1) + expected = PeriodIndex(['2013-01-02', '2013-01-03', '2013-01-04', + '2013-01-05', '2013-01-06'], freq='D') + tm.assert_index_equal(result, expected) + class TestPeriodIndexSeriesMethods(object): """ Test PeriodIndex and Period Series Ops consistency """ diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py index 639a9272c3808..eca80d17b1dc3 100644 --- a/pandas/tests/indexes/period/test_construction.py +++ b/pandas/tests/indexes/period/test_construction.py @@ -119,8 +119,8 @@ def test_constructor_fromarraylike(self): tm.assert_index_equal(PeriodIndex(idx.values), idx) tm.assert_index_equal(PeriodIndex(list(idx.values)), idx) - pytest.raises(ValueError, PeriodIndex, idx._values) - pytest.raises(ValueError, PeriodIndex, list(idx._values)) + pytest.raises(ValueError, PeriodIndex, idx._ndarray_values) + pytest.raises(ValueError, PeriodIndex, list(idx._ndarray_values)) pytest.raises(TypeError, PeriodIndex, data=Period('2007', freq='A')) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 21a9ffdde3444..8745de0c2a7aa 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -1,11 +1,9 @@ -import pytest import numpy as np import pandas as pd import pandas._libs.tslib as tslib import pandas.util.testing as tm -import pandas.core.indexes.period as period from pandas import (DatetimeIndex, PeriodIndex, Series, Period, _np_version_under1p10, Index) @@ -521,25 +519,8 @@ def test_nat_new(self): tm.assert_numpy_array_equal(result, exp) def 
test_shift(self): - # GH 9903 - idx = pd.PeriodIndex([], name='xxx', freq='H') - - with pytest.raises(TypeError): - # period shift doesn't accept freq - idx.shift(1, freq='H') - - tm.assert_index_equal(idx.shift(0), idx) - tm.assert_index_equal(idx.shift(3), idx) - - idx = pd.PeriodIndex(['2011-01-01 10:00', '2011-01-01 11:00' - '2011-01-01 12:00'], name='xxx', freq='H') - tm.assert_index_equal(idx.shift(0), idx) - exp = pd.PeriodIndex(['2011-01-01 13:00', '2011-01-01 14:00' - '2011-01-01 15:00'], name='xxx', freq='H') - tm.assert_index_equal(idx.shift(3), exp) - exp = pd.PeriodIndex(['2011-01-01 07:00', '2011-01-01 08:00' - '2011-01-01 09:00'], name='xxx', freq='H') - tm.assert_index_equal(idx.shift(-3), exp) + # This is tested in test_arithmetic + pass def test_repeat(self): index = pd.period_range('2001-01-01', periods=2, freq='D') @@ -703,172 +684,3 @@ def test_pi_comp_period_nat(self): f = lambda x: tslib.NaT >= x exp = np.array([False, False, False, False], dtype=np.bool) self._check(idx, f, exp) - - -class TestPeriodIndexComparisons(object): - - def test_pi_pi_comp(self): - - for freq in ['M', '2M', '3M']: - base = PeriodIndex(['2011-01', '2011-02', - '2011-03', '2011-04'], freq=freq) - p = Period('2011-02', freq=freq) - - exp = np.array([False, True, False, False]) - tm.assert_numpy_array_equal(base == p, exp) - tm.assert_numpy_array_equal(p == base, exp) - - exp = np.array([True, False, True, True]) - tm.assert_numpy_array_equal(base != p, exp) - tm.assert_numpy_array_equal(p != base, exp) - - exp = np.array([False, False, True, True]) - tm.assert_numpy_array_equal(base > p, exp) - tm.assert_numpy_array_equal(p < base, exp) - - exp = np.array([True, False, False, False]) - tm.assert_numpy_array_equal(base < p, exp) - tm.assert_numpy_array_equal(p > base, exp) - - exp = np.array([False, True, True, True]) - tm.assert_numpy_array_equal(base >= p, exp) - tm.assert_numpy_array_equal(p <= base, exp) - - exp = np.array([True, True, False, False]) - tm.assert_numpy_array_equal(base <= p, exp) - tm.assert_numpy_array_equal(p >= base, exp) - - idx = PeriodIndex(['2011-02', '2011-01', '2011-03', - '2011-05'], freq=freq) - - exp = np.array([False, False, True, False]) - tm.assert_numpy_array_equal(base == idx, exp) - - exp = np.array([True, True, False, True]) - tm.assert_numpy_array_equal(base != idx, exp) - - exp = np.array([False, True, False, False]) - tm.assert_numpy_array_equal(base > idx, exp) - - exp = np.array([True, False, False, True]) - tm.assert_numpy_array_equal(base < idx, exp) - - exp = np.array([False, True, True, False]) - tm.assert_numpy_array_equal(base >= idx, exp) - - exp = np.array([True, False, True, True]) - tm.assert_numpy_array_equal(base <= idx, exp) - - # different base freq - msg = "Input has different freq=A-DEC from PeriodIndex" - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - base <= Period('2011', freq='A') - - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - Period('2011', freq='A') >= base - - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='A') - base <= idx - - # Different frequency - msg = "Input has different freq=4M from PeriodIndex" - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - base <= Period('2011', freq='4M') - - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - Period('2011', freq='4M') >= base - - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - idx = 
PeriodIndex(['2011', '2012', '2013', '2014'], freq='4M') - base <= idx - - def test_pi_nat_comp(self): - for freq in ['M', '2M', '3M']: - idx1 = PeriodIndex( - ['2011-01', '2011-02', 'NaT', '2011-05'], freq=freq) - - result = idx1 > Period('2011-02', freq=freq) - exp = np.array([False, False, False, True]) - tm.assert_numpy_array_equal(result, exp) - result = Period('2011-02', freq=freq) < idx1 - tm.assert_numpy_array_equal(result, exp) - - result = idx1 == Period('NaT', freq=freq) - exp = np.array([False, False, False, False]) - tm.assert_numpy_array_equal(result, exp) - result = Period('NaT', freq=freq) == idx1 - tm.assert_numpy_array_equal(result, exp) - - result = idx1 != Period('NaT', freq=freq) - exp = np.array([True, True, True, True]) - tm.assert_numpy_array_equal(result, exp) - result = Period('NaT', freq=freq) != idx1 - tm.assert_numpy_array_equal(result, exp) - - idx2 = PeriodIndex(['2011-02', '2011-01', '2011-04', - 'NaT'], freq=freq) - result = idx1 < idx2 - exp = np.array([True, False, False, False]) - tm.assert_numpy_array_equal(result, exp) - - result = idx1 == idx2 - exp = np.array([False, False, False, False]) - tm.assert_numpy_array_equal(result, exp) - - result = idx1 != idx2 - exp = np.array([True, True, True, True]) - tm.assert_numpy_array_equal(result, exp) - - result = idx1 == idx1 - exp = np.array([True, True, False, True]) - tm.assert_numpy_array_equal(result, exp) - - result = idx1 != idx1 - exp = np.array([False, False, True, False]) - tm.assert_numpy_array_equal(result, exp) - - diff = PeriodIndex(['2011-02', '2011-01', '2011-04', - 'NaT'], freq='4M') - msg = "Input has different freq=4M from PeriodIndex" - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - idx1 > diff - - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - idx1 == diff - - # TODO: De-duplicate with test_pi_nat_comp - def test_comp_nat(self): - left = pd.PeriodIndex([pd.Period('2011-01-01'), pd.NaT, - pd.Period('2011-01-03')]) - right = pd.PeriodIndex([pd.NaT, pd.NaT, pd.Period('2011-01-03')]) - - for lhs, rhs in [(left, right), - (left.astype(object), right.astype(object))]: - result = lhs == rhs - expected = np.array([False, False, True]) - tm.assert_numpy_array_equal(result, expected) - - result = lhs != rhs - expected = np.array([True, True, False]) - tm.assert_numpy_array_equal(result, expected) - - expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(lhs == pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT == rhs, expected) - - expected = np.array([True, True, True]) - tm.assert_numpy_array_equal(lhs != pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT != lhs, expected) - - expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(lhs < pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT > lhs, expected) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index ab341b70dfe91..b3f059018493c 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -14,7 +14,6 @@ class TestPeriodIndex(DatetimeLike): _holder = PeriodIndex - _multiprocess_can_split_ = True def setup_method(self, method): self.indices = dict(index=tm.makePeriodIndex(10), @@ -206,7 +205,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.get_values(), exp) exp = np.array([], dtype=np.int64) - tm.assert_numpy_array_equal(idx._values, exp) + tm.assert_numpy_array_equal(idx._ndarray_values, exp) idx 
= pd.PeriodIndex(['2011-01', pd.NaT], freq='M') @@ -214,7 +213,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.get_values(), exp) exp = np.array([492, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx._values, exp) + tm.assert_numpy_array_equal(idx._ndarray_values, exp) idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') @@ -223,7 +222,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.get_values(), exp) exp = np.array([14975, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx._values, exp) + tm.assert_numpy_array_equal(idx._ndarray_values, exp) def test_period_index_length(self): pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') @@ -454,16 +453,6 @@ def test_periods_number_check(self): with pytest.raises(ValueError): period_range('2011-1-1', '2012-1-1', 'B') - def test_start_time(self): - index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') - expected_index = date_range('2016-01-01', end='2016-05-31', freq='MS') - tm.assert_index_equal(index.start_time, expected_index) - - def test_end_time(self): - index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') - expected_index = date_range('2016-01-01', end='2016-05-31', freq='M') - tm.assert_index_equal(index.end_time, expected_index) - def test_index_duplicate_periods(self): # monotonic idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN') @@ -496,78 +485,14 @@ def test_index_unique(self): tm.assert_index_equal(idx.unique(), expected) assert idx.nunique() == 3 - def test_shift_gh8083(self): - - # test shift for PeriodIndex - # GH8083 - drange = self.create_index() - result = drange.shift(1) - expected = PeriodIndex(['2013-01-02', '2013-01-03', '2013-01-04', - '2013-01-05', '2013-01-06'], freq='D') - tm.assert_index_equal(result, expected) - def test_shift(self): - pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - pi2 = PeriodIndex(freq='A', start='1/1/2002', end='12/1/2010') - - tm.assert_index_equal(pi1.shift(0), pi1) - - assert len(pi1) == len(pi2) - tm.assert_index_equal(pi1.shift(1), pi2) - - pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - pi2 = PeriodIndex(freq='A', start='1/1/2000', end='12/1/2008') - assert len(pi1) == len(pi2) - tm.assert_index_equal(pi1.shift(-1), pi2) - - pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') - pi2 = PeriodIndex(freq='M', start='2/1/2001', end='1/1/2010') - assert len(pi1) == len(pi2) - tm.assert_index_equal(pi1.shift(1), pi2) - - pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') - pi2 = PeriodIndex(freq='M', start='12/1/2000', end='11/1/2009') - assert len(pi1) == len(pi2) - tm.assert_index_equal(pi1.shift(-1), pi2) - - pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') - pi2 = PeriodIndex(freq='D', start='1/2/2001', end='12/2/2009') - assert len(pi1) == len(pi2) - tm.assert_index_equal(pi1.shift(1), pi2) - - pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') - pi2 = PeriodIndex(freq='D', start='12/31/2000', end='11/30/2009') - assert len(pi1) == len(pi2) - tm.assert_index_equal(pi1.shift(-1), pi2) - - def test_shift_nat(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='M', name='idx') - result = idx.shift(1) - expected = PeriodIndex(['2011-02', '2011-03', 'NaT', - '2011-05'], freq='M', name='idx') - tm.assert_index_equal(result, expected) - assert result.name == expected.name + # This is 
tested in test_arithmetic + pass @td.skip_if_32bit def test_ndarray_compat_properties(self): super(TestPeriodIndex, self).test_ndarray_compat_properties() - def test_shift_ndarray(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='M', name='idx') - result = idx.shift(np.array([1, 2, 3, 4])) - expected = PeriodIndex(['2011-02', '2011-04', 'NaT', - '2011-08'], freq='M', name='idx') - tm.assert_index_equal(result, expected) - - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='M', name='idx') - result = idx.shift(np.array([1, -2, 3, -4])) - expected = PeriodIndex(['2011-02', '2010-12', 'NaT', - '2010-12'], freq='M', name='idx') - tm.assert_index_equal(result, expected) - def test_negative_ordinals(self): Period(ordinal=-1000, freq='A') Period(ordinal=0, freq='A') diff --git a/pandas/tests/indexes/period/test_scalar_compat.py b/pandas/tests/indexes/period/test_scalar_compat.py new file mode 100644 index 0000000000000..56bd2adf58719 --- /dev/null +++ b/pandas/tests/indexes/period/test_scalar_compat.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +"""Tests for PeriodIndex behaving like a vectorized Period scalar""" + +from pandas import PeriodIndex, date_range +import pandas.util.testing as tm + + +class TestPeriodIndexOps(object): + def test_start_time(self): + index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') + expected_index = date_range('2016-01-01', end='2016-05-31', freq='MS') + tm.assert_index_equal(index.start_time, expected_index) + + def test_end_time(self): + index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') + expected_index = date_range('2016-01-01', end='2016-05-31', freq='M') + tm.assert_index_equal(index.end_time, expected_index) diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index 0e72cadb5d494..97500f2f5ed95 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -6,8 +6,6 @@ import pandas.core.indexes.period as period from pandas.compat import lrange -from pandas._libs.tslibs.frequencies import get_freq -from pandas._libs.tslibs.period import period_ordinal, period_asfreq from pandas._libs.tslibs.ccalendar import MONTHS from pandas import (PeriodIndex, Period, DatetimeIndex, Timestamp, Series, @@ -22,7 +20,7 @@ class TestPeriodRepresentation(object): def _check_freq(self, freq, base_date): rng = PeriodIndex(start=base_date, periods=10, freq=freq) exp = np.arange(10, dtype=np.int64) - tm.assert_numpy_array_equal(rng._values, exp) + tm.assert_numpy_array_equal(rng.asi8, exp) def test_annual(self): @@ -76,83 +74,6 @@ def test_negone_ordinals(self): repr(period) -class TestTslib(object): - def test_intraday_conversion_factors(self): - assert period_asfreq(1, get_freq('D'), get_freq('H'), False) == 24 - assert period_asfreq(1, get_freq('D'), get_freq('T'), False) == 1440 - assert period_asfreq(1, get_freq('D'), get_freq('S'), False) == 86400 - assert period_asfreq(1, get_freq('D'), - get_freq('L'), False) == 86400000 - assert period_asfreq(1, get_freq('D'), - get_freq('U'), False) == 86400000000 - assert period_asfreq(1, get_freq('D'), - get_freq('N'), False) == 86400000000000 - - assert period_asfreq(1, get_freq('H'), get_freq('T'), False) == 60 - assert period_asfreq(1, get_freq('H'), get_freq('S'), False) == 3600 - assert period_asfreq(1, get_freq('H'), - get_freq('L'), False) == 3600000 - assert period_asfreq(1, get_freq('H'), - get_freq('U'), False) == 3600000000 - assert period_asfreq(1, 
get_freq('H'), - get_freq('N'), False) == 3600000000000 - - assert period_asfreq(1, get_freq('T'), get_freq('S'), False) == 60 - assert period_asfreq(1, get_freq('T'), get_freq('L'), False) == 60000 - assert period_asfreq(1, get_freq('T'), - get_freq('U'), False) == 60000000 - assert period_asfreq(1, get_freq('T'), - get_freq('N'), False) == 60000000000 - - assert period_asfreq(1, get_freq('S'), get_freq('L'), False) == 1000 - assert period_asfreq(1, get_freq('S'), - get_freq('U'), False) == 1000000 - assert period_asfreq(1, get_freq('S'), - get_freq('N'), False) == 1000000000 - - assert period_asfreq(1, get_freq('L'), get_freq('U'), False) == 1000 - assert period_asfreq(1, get_freq('L'), - get_freq('N'), False) == 1000000 - - assert period_asfreq(1, get_freq('U'), get_freq('N'), False) == 1000 - - def test_period_ordinal_start_values(self): - # information for 1.1.1970 - assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('A')) == 0 - assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('M')) == 0 - assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('W')) == 1 - assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('D')) == 0 - assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('B')) == 0 - - def test_period_ordinal_week(self): - assert period_ordinal(1970, 1, 4, 0, 0, 0, 0, 0, get_freq('W')) == 1 - assert period_ordinal(1970, 1, 5, 0, 0, 0, 0, 0, get_freq('W')) == 2 - assert period_ordinal(2013, 10, 6, 0, - 0, 0, 0, 0, get_freq('W')) == 2284 - assert period_ordinal(2013, 10, 7, 0, - 0, 0, 0, 0, get_freq('W')) == 2285 - - def test_period_ordinal_business_day(self): - # Thursday - assert period_ordinal(2013, 10, 3, 0, - 0, 0, 0, 0, get_freq('B')) == 11415 - # Friday - assert period_ordinal(2013, 10, 4, 0, - 0, 0, 0, 0, get_freq('B')) == 11416 - # Saturday - assert period_ordinal(2013, 10, 5, 0, - 0, 0, 0, 0, get_freq('B')) == 11417 - # Sunday - assert period_ordinal(2013, 10, 6, 0, - 0, 0, 0, 0, get_freq('B')) == 11417 - # Monday - assert period_ordinal(2013, 10, 7, 0, - 0, 0, 0, 0, get_freq('B')) == 11417 - # Tuesday - assert period_ordinal(2013, 10, 8, 0, - 0, 0, 0, 0, get_freq('B')) == 11418 - - class TestPeriodIndex(object): def setup_method(self, method): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 974099f1fbbe9..90edcb526bb2e 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -830,15 +830,16 @@ def test_map_with_tuples(self): # Test that returning a single tuple from an Index # returns an Index. - boolean_index = tm.makeIntIndex(3).map(lambda x: (x,)) - expected = Index([(0,), (1,), (2,)]) - tm.assert_index_equal(boolean_index, expected) + idx = tm.makeIntIndex(3) + result = tm.makeIntIndex(3).map(lambda x: (x,)) + expected = Index([(i,) for i in idx]) + tm.assert_index_equal(result, expected) # Test that returning a tuple from a map of a single index # returns a MultiIndex object. - boolean_index = tm.makeIntIndex(3).map(lambda x: (x, x == 1)) - expected = MultiIndex.from_tuples([(0, False), (1, True), (2, False)]) - tm.assert_index_equal(boolean_index, expected) + result = idx.map(lambda x: (x, x == 1)) + expected = MultiIndex.from_tuples([(i, i == 1) for i in idx]) + tm.assert_index_equal(result, expected) # Test that returning a single object from a MultiIndex # returns an Index. 
@@ -870,7 +871,8 @@ def test_map_tseries_indices_return_index(self): def test_map_dictlike(self, mapper): # GH 12756 expected = Index(['foo', 'bar', 'baz']) - result = tm.makeIntIndex(3).map(mapper(expected.values, [0, 1, 2])) + idx = tm.makeIntIndex(3) + result = idx.map(mapper(expected.values, idx)) tm.assert_index_equal(result, expected) for name in self.indices.keys(): diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index c2e40c79f8914..e9fddfde90348 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -353,6 +353,14 @@ def test_append(self): expected = Index(list('caaabbca')) tm.assert_index_equal(result, expected, exact=True) + def test_append_to_another(self): + # hits _concat_index_asobject + fst = Index(['a', 'b']) + snd = CategoricalIndex(['d', 'e']) + result = fst.append(snd) + expected = Index(['a', 'b', 'd', 'e']) + tm.assert_index_equal(result, expected) + def test_insert(self): ci = self.create_index() diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index e59456b8a2d5e..cd6a5c761d0c2 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -962,6 +962,53 @@ def test_values_boxed(self): # Check that code branches for boxed values produce identical results tm.assert_numpy_array_equal(result.values[:4], result[:4].values) + def test_values_multiindex_datetimeindex(self): + # Test to ensure we hit the boxing / nobox part of MI.values + ints = np.arange(10**18, 10**18 + 5) + naive = pd.DatetimeIndex(ints) + aware = pd.DatetimeIndex(ints, tz='US/Central') + + idx = pd.MultiIndex.from_arrays([naive, aware]) + result = idx.values + + outer = pd.DatetimeIndex([x[0] for x in result]) + tm.assert_index_equal(outer, naive) + + inner = pd.DatetimeIndex([x[1] for x in result]) + tm.assert_index_equal(inner, aware) + + # n_lev > n_lab + result = idx[:2].values + + outer = pd.DatetimeIndex([x[0] for x in result]) + tm.assert_index_equal(outer, naive[:2]) + + inner = pd.DatetimeIndex([x[1] for x in result]) + tm.assert_index_equal(inner, aware[:2]) + + def test_values_multiindex_periodindex(self): + # Test to ensure we hit the boxing / nobox part of MI.values + ints = np.arange(2007, 2012) + pidx = pd.PeriodIndex(ints, freq='D') + + idx = pd.MultiIndex.from_arrays([ints, pidx]) + result = idx.values + + outer = pd.Int64Index([x[0] for x in result]) + tm.assert_index_equal(outer, pd.Int64Index(ints)) + + inner = pd.PeriodIndex([x[1] for x in result]) + tm.assert_index_equal(inner, pidx) + + # n_lev > n_lab + result = idx[:2].values + + outer = pd.Int64Index([x[0] for x in result]) + tm.assert_index_equal(outer, pd.Int64Index(ints[:2])) + + inner = pd.PeriodIndex([x[1] for x in result]) + tm.assert_index_equal(inner, pidx[:2]) + def test_append(self): result = self.index[:3].append(self.index[3:]) assert result.equals(self.index) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 0c1bec7a6f1a9..c6883df7ee91a 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -157,6 +157,48 @@ def test_divmod_series(self): for r, e in zip(result, expected): tm.assert_series_equal(r, e) + def test_div_zero(self, zero): + idx = self.create_index() + + expected = Index([np.nan, np.inf, np.inf, np.inf, np.inf], + dtype=np.float64) + result = idx / zero + tm.assert_index_equal(result, expected) + ser_compat = Series(idx).astype('i8') / np.array(zero).astype('i8') + 
tm.assert_series_equal(ser_compat, Series(result)) + + def test_floordiv_zero(self, zero): + idx = self.create_index() + expected = Index([np.nan, np.inf, np.inf, np.inf, np.inf], + dtype=np.float64) + + result = idx // zero + tm.assert_index_equal(result, expected) + ser_compat = Series(idx).astype('i8') // np.array(zero).astype('i8') + tm.assert_series_equal(ser_compat, Series(result)) + + def test_mod_zero(self, zero): + idx = self.create_index() + + expected = Index([np.nan, np.nan, np.nan, np.nan, np.nan], + dtype=np.float64) + result = idx % zero + tm.assert_index_equal(result, expected) + ser_compat = Series(idx).astype('i8') % np.array(zero).astype('i8') + tm.assert_series_equal(ser_compat, Series(result)) + + def test_divmod_zero(self, zero): + idx = self.create_index() + + exleft = Index([np.nan, np.inf, np.inf, np.inf, np.inf], + dtype=np.float64) + exright = Index([np.nan, np.nan, np.nan, np.nan, np.nan], + dtype=np.float64) + + result = divmod(idx, zero) + tm.assert_index_equal(result[0], exleft) + tm.assert_index_equal(result[1], exright) + def test_explicit_conversions(self): # GH 8608 diff --git a/pandas/tests/indexes/timedeltas/test_astype.py b/pandas/tests/indexes/timedeltas/test_astype.py index c3bd857036efc..329f0c2467e8b 100644 --- a/pandas/tests/indexes/timedeltas/test_astype.py +++ b/pandas/tests/indexes/timedeltas/test_astype.py @@ -1,3 +1,5 @@ +from datetime import timedelta + import pytest import numpy as np @@ -8,7 +10,24 @@ class TestTimedeltaIndex(object): - _multiprocess_can_split_ = True + def test_astype_object(self): + idx = timedelta_range(start='1 days', periods=4, freq='D', name='idx') + expected_list = [Timedelta('1 days'), Timedelta('2 days'), + Timedelta('3 days'), Timedelta('4 days')] + result = idx.astype(object) + expected = Index(expected_list, dtype=object, name='idx') + tm.assert_index_equal(result, expected) + assert idx.tolist() == expected_list + + def test_astype_object_with_nat(self): + idx = TimedeltaIndex([timedelta(days=1), timedelta(days=2), NaT, + timedelta(days=4)], name='idx') + expected_list = [Timedelta('1 days'), Timedelta('2 days'), NaT, + Timedelta('4 days')] + result = idx.astype(object) + expected = Index(expected_list, dtype=object, name='idx') + tm.assert_index_equal(result, expected) + assert idx.tolist() == expected_list def test_astype(self): # GH 13149, GH 13209 diff --git a/pandas/tests/indexes/timedeltas/test_construction.py b/pandas/tests/indexes/timedeltas/test_construction.py index 70aadd9f57174..68dc0003e2312 100644 --- a/pandas/tests/indexes/timedeltas/test_construction.py +++ b/pandas/tests/indexes/timedeltas/test_construction.py @@ -9,7 +9,6 @@ class TestTimedeltaIndex(object): - _multiprocess_can_split_ = True def test_construction_base_constructor(self): arr = [pd.Timedelta('1 days'), pd.NaT, pd.Timedelta('3 days')] diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index e64c4e6ac54a5..59e38c2e738b0 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -9,7 +9,6 @@ class TestTimedeltaIndex(object): - _multiprocess_can_split_ = True def test_insert(self): diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index e944aad13f8d5..d154aa2172ef7 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -8,7 +8,7 @@ from pandas import to_timedelta from pandas import (Series, Timedelta, 
Timestamp, TimedeltaIndex, timedelta_range, - _np_version_under1p10, Index) + _np_version_under1p10) from pandas._libs.tslib import iNaT from pandas.tests.test_base import Ops @@ -25,31 +25,6 @@ def test_ops_properties(self): self.check_ops_properties(TimedeltaIndex._field_ops, f) self.check_ops_properties(TimedeltaIndex._object_ops, f) - def test_astype_object(self): - idx = timedelta_range(start='1 days', periods=4, freq='D', name='idx') - expected_list = [Timedelta('1 days'), Timedelta('2 days'), - Timedelta('3 days'), Timedelta('4 days')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.astype(object) - assert isinstance(result, Index) - - assert result.dtype == object - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert idx.tolist() == expected_list - - idx = TimedeltaIndex([timedelta(days=1), timedelta(days=2), pd.NaT, - timedelta(days=4)], name='idx') - expected_list = [Timedelta('1 days'), Timedelta('2 days'), pd.NaT, - Timedelta('4 days')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.astype(object) - assert isinstance(result, Index) - assert result.dtype == object - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert idx.tolist() == expected_list - def test_minmax(self): # monotonic @@ -420,7 +395,6 @@ def test_equals(self): class TestTimedeltas(object): - _multiprocess_can_split_ = True def test_timedelta_ops(self): # GH4984 diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index 22546d25273a7..020e9079b3436 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -6,7 +6,6 @@ class TestTimedeltaIndex(object): - _multiprocess_can_split_ = True def test_union(self): diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 32157a9a44e04..ce0f3b89b753e 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -18,7 +18,6 @@ class TestTimedeltaIndex(DatetimeLike): _holder = TimedeltaIndex - _multiprocess_can_split_ = True def setup_method(self, method): self.indices = dict(index=tm.makeTimedeltaIndex(10)) @@ -300,7 +299,6 @@ def test_freq_conversion(self): class TestTimeSeries(object): - _multiprocess_can_split_ = True def test_series_box_timedelta(self): rng = timedelta_range('1 day 1 s', periods=5, freq='h') diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index 7624e1f79af15..784ef845fea10 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -7,7 +7,6 @@ class TestTimedeltas(object): - _multiprocess_can_split_ = True def test_timedelta_range(self): diff --git a/pandas/tests/indexes/timedeltas/test_tools.py b/pandas/tests/indexes/timedeltas/test_tools.py index b4ad28eeacb69..daa9739132d9e 100644 --- a/pandas/tests/indexes/timedeltas/test_tools.py +++ b/pandas/tests/indexes/timedeltas/test_tools.py @@ -11,7 +11,6 @@ class TestTimedeltas(object): - _multiprocess_can_split_ = True def test_to_timedelta(self): def conv(v): diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index f2182687d047f..634ad0d8160ed 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ 
-432,6 +432,23 @@ def test_get_indexer_array(self): expected = np.array([0, 1], dtype='intp') tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_same_categories_same_order(self): + ci = CategoricalIndex(['a', 'b'], categories=['a', 'b']) + + result = ci.get_indexer(CategoricalIndex(['b', 'b'], + categories=['a', 'b'])) + expected = np.array([1, 1], dtype='intp') + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_same_categories_different_order(self): + # https://github.com/pandas-dev/pandas/issues/19551 + ci = CategoricalIndex(['a', 'b'], categories=['a', 'b']) + + result = ci.get_indexer(CategoricalIndex(['b', 'b'], + categories=['b', 'a'])) + expected = np.array([1, 1], dtype='intp') + tm.assert_numpy_array_equal(result, expected) + def test_getitem_with_listlike(self): # GH 16115 cats = Categorical([Timestamp('12-31-1999'), diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index d2692c7dc302e..e3f93924aca0d 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -4,7 +4,8 @@ from warnings import catch_warnings import numpy as np -from pandas import Series, DataFrame, Index, Float64Index +from pandas import (Series, DataFrame, Index, Float64Index, Int64Index, + RangeIndex) from pandas.util.testing import assert_series_equal, assert_almost_equal import pandas.util.testing as tm @@ -206,9 +207,8 @@ def test_scalar_integer(self): # test how scalar float indexers work on int indexes # integer index - for index in [tm.makeIntIndex, tm.makeRangeIndex]: + for i in [Int64Index(range(5)), RangeIndex(5)]: - i = index(5) for s in [Series(np.arange(len(i))), DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i)]: @@ -362,9 +362,9 @@ def test_slice_integer(self): # these coerce to a like integer # oob indicates if we are out of bounds # of positional indexing - for index, oob in [(tm.makeIntIndex(5), False), - (tm.makeRangeIndex(5), False), - (tm.makeIntIndex(5) + 10, True)]: + for index, oob in [(Int64Index(range(5)), False), + (RangeIndex(5), False), + (Int64Index(range(5)) + 10, True)]: # s is an in-range index s = Series(range(5), index=index) @@ -486,9 +486,8 @@ def f(): def test_slice_integer_frame_getitem(self): # similar to above, but on the getitem dim (of a DataFrame) - for index in [tm.makeIntIndex, tm.makeRangeIndex]: + for index in [Int64Index(range(5)), RangeIndex(5)]: - index = index(5) s = DataFrame(np.random.randn(5, 2), index=index) def f(idxr): diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 57e72da2fd3f4..8deb51e190bab 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -2,30 +2,34 @@ import pytest from pandas.io.parsers import read_table +from pandas.util import testing as tm -HERE = os.path.dirname(__file__) +@pytest.fixture +def parser_data(request): + return os.path.join(tm.get_data_path(), '..', 'parser', 'data') -@pytest.fixture(scope='module') -def tips_file(): + +@pytest.fixture +def tips_file(parser_data): """Path to the tips dataset""" - return os.path.join(HERE, 'parser', 'data', 'tips.csv') + return os.path.join(parser_data, 'tips.csv') -@pytest.fixture(scope='module') -def jsonl_file(): +@pytest.fixture +def jsonl_file(parser_data): """Path a JSONL dataset""" - return os.path.join(HERE, 'parser', 'data', 'items.jsonl') + return os.path.join(parser_data, 'items.jsonl') -@pytest.fixture(scope='module') -def salaries_table(): +@pytest.fixture +def salaries_table(parser_data): 
"""DataFrame with the salaries dataset""" - path = os.path.join(HERE, 'parser', 'data', 'salaries.csv') + path = os.path.join(parser_data, 'salaries.csv') return read_table(path) -@pytest.fixture(scope='module') +@pytest.fixture def s3_resource(tips_file, jsonl_file): """Fixture for mocking S3 interaction. @@ -41,8 +45,8 @@ def s3_resource(tips_file, jsonl_file): is yielded by the fixture. """ pytest.importorskip('s3fs') + boto3 = pytest.importorskip('boto3') moto = pytest.importorskip('moto') - moto.mock_s3().start() test_s3_files = [ ('tips.csv', tips_file), @@ -58,17 +62,22 @@ def add_tips_files(bucket_name): Key=s3_key, Body=f) - boto3 = pytest.importorskip('boto3') - # see gh-16135 - bucket = 'pandas-test' + try: - conn = boto3.resource("s3", region_name="us-east-1") - conn.create_bucket(Bucket=bucket) - add_tips_files(bucket) + s3 = moto.mock_s3() + s3.start() - conn.create_bucket(Bucket='cant_get_it', ACL='private') - add_tips_files('cant_get_it') + # see gh-16135 + bucket = 'pandas-test' + conn = boto3.resource("s3", region_name="us-east-1") - yield conn + conn.create_bucket(Bucket=bucket) + add_tips_files(bucket) - moto.mock_s3().stop() + conn.create_bucket(Bucket='cant_get_it', ACL='private') + add_tips_files('cant_get_it') + yield conn + except: # noqa: flake8 + pytest.skip("failure to use s3 resource") + finally: + s3.stop() diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index e0ce27de5c31f..dddba5b425c3b 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1492,7 +1492,7 @@ def test_repr_html_float(self): 'B': np.arange(41, 41 + h)}).set_index('idx') reg_repr = df._repr_html_() assert '..' not in reg_repr - assert str(40 + h) in reg_repr + assert ''.format(val=str(40 + h)) in reg_repr h = max_rows + 1 df = DataFrame({'idx': np.linspace(-10, 10, h), @@ -1500,7 +1500,7 @@ def test_repr_html_float(self): 'B': np.arange(41, 41 + h)}).set_index('idx') long_repr = df._repr_html_() assert '..' in long_repr - assert '31' not in long_repr + assert ''.format(val='31') not in long_repr assert u('{h} rows ').format(h=h) in long_repr assert u('2 columns') in long_repr diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index e12a7196dce6b..dfa3751bff57a 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +import sys import numpy as np import pandas as pd import pytest @@ -9,6 +10,37 @@ class TestToCSV(object): + @pytest.mark.xfail((3, 6, 5) > sys.version_info >= (3, 5), + reason=("Python csv library bug " + "(see https://bugs.python.org/issue32255)")) + def test_to_csv_with_single_column(self): + # see gh-18676, https://bugs.python.org/issue32255 + # + # Python's CSV library adds an extraneous '""' + # before the newline when the NaN-value is in + # the first row. Otherwise, only the newline + # character is added. This behavior is inconsistent + # and was patched in https://bugs.python.org/pull_request4672. 
+ df1 = DataFrame([None, 1]) + expected1 = """\ +"" +1.0 +""" + with tm.ensure_clean('test.csv') as path: + df1.to_csv(path, header=None, index=None) + with open(path, 'r') as f: + assert f.read() == expected1 + + df2 = DataFrame([1, None]) + expected2 = """\ +1.0 +"" +""" + with tm.ensure_clean('test.csv') as path: + df2.to_csv(path, header=None, index=None) + with open(path, 'r') as f: + assert f.read() == expected2 + def test_to_csv_defualt_encoding(self): # GH17097 df = DataFrame({'col': [u"AAAAA", u"ÄÄÄÄÄ", u"ßßßßß", u"聞聞聞聞聞"]}) diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 9e063c2d176e1..f69cac62513d4 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -1864,3 +1864,10 @@ def test_to_html_with_index_names_false(self): name='myindexname')) result = df.to_html(index_names=False) assert 'myindexname' not in result + + def test_to_html_with_id(self): + # gh-8496 + df = pd.DataFrame({"A": [1, 2]}, index=pd.Index(['a', 'b'], + name='myindexname')) + result = df.to_html(index_names=False, table_id="TEST_ID") + assert ' id="TEST_ID"' in result diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 10139eb07a925..a72744e08fa7c 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1039,7 +1039,6 @@ def test_read_inline_jsonl(self): assert_frame_equal(result, expected) def test_read_s3_jsonl(self, s3_resource): - pytest.importorskip('s3fs') # GH17200 result = read_json('s3n://pandas-test/items.jsonl', lines=True) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 10f6cef04b593..f16338fda6245 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -46,6 +46,7 @@ def check_compressed_urls(salaries_table, compression, extension, mode, class TestS3(object): + @tm.network def test_parse_public_s3_bucket(self): pytest.importorskip('s3fs') @@ -65,7 +66,8 @@ def test_parse_public_s3_bucket(self): assert not df.empty tm.assert_frame_equal(read_csv(tm.get_data_path('tips.csv')), df) - def test_parse_public_s3n_bucket(self, s3_resource): + @tm.network + def test_parse_public_s3n_bucket(self): # Read from AWS s3 as "s3n" URL df = read_csv('s3n://pandas-test/tips.csv', nrows=10) @@ -74,7 +76,8 @@ def test_parse_public_s3n_bucket(self, s3_resource): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')).iloc[:10], df) - def test_parse_public_s3a_bucket(self, s3_resource): + @tm.network + def test_parse_public_s3a_bucket(self): # Read from AWS s3 as "s3a" URL df = read_csv('s3a://pandas-test/tips.csv', nrows=10) assert isinstance(df, DataFrame) @@ -82,7 +85,8 @@ def test_parse_public_s3a_bucket(self, s3_resource): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')).iloc[:10], df) - def test_parse_public_s3_bucket_nrows(self, s3_resource): + @tm.network + def test_parse_public_s3_bucket_nrows(self): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, nrows=10, compression=comp) @@ -91,7 +95,8 @@ def test_parse_public_s3_bucket_nrows(self, s3_resource): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')).iloc[:10], df) - def test_parse_public_s3_bucket_chunked(self, s3_resource): + @tm.network + def test_parse_public_s3_bucket_chunked(self): # Read with a chunksize chunksize = 5 local_tips = read_csv(tm.get_data_path('tips.csv')) @@ -109,7 +114,8 @@ def 
test_parse_public_s3_bucket_chunked(self, s3_resource): chunksize * i_chunk: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_chunked_python(self, s3_resource): + @tm.network + def test_parse_public_s3_bucket_chunked_python(self): # Read with a chunksize using the Python parser chunksize = 5 local_tips = read_csv(tm.get_data_path('tips.csv')) @@ -127,7 +133,8 @@ def test_parse_public_s3_bucket_chunked_python(self, s3_resource): chunksize * i_chunk: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_python(self, s3_resource): + @tm.network + def test_parse_public_s3_bucket_python(self): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', compression=comp) @@ -136,7 +143,8 @@ def test_parse_public_s3_bucket_python(self, s3_resource): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')), df) - def test_infer_s3_compression(self, s3_resource): + @tm.network + def test_infer_s3_compression(self): for ext in ['', '.gz', '.bz2']: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', compression='infer') @@ -145,7 +153,8 @@ def test_infer_s3_compression(self, s3_resource): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')), df) - def test_parse_public_s3_bucket_nrows_python(self, s3_resource): + @tm.network + def test_parse_public_s3_bucket_nrows_python(self): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', nrows=10, compression=comp) @@ -154,7 +163,8 @@ def test_parse_public_s3_bucket_nrows_python(self, s3_resource): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')).iloc[:10], df) - def test_s3_fails(self, s3_resource): + @tm.network + def test_s3_fails(self): with pytest.raises(IOError): read_csv('s3://nyqpug/asdf.csv') diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index a0070dce6a7f1..a89156db38ae3 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -102,15 +102,19 @@ def test_infer_compression_from_path(self, extension, expected, path_type): def test_get_filepath_or_buffer_with_path(self): filename = '~/sometest' - filepath_or_buffer, _, _ = common.get_filepath_or_buffer(filename) + filepath_or_buffer, _, _, should_close = common.get_filepath_or_buffer( + filename) assert filepath_or_buffer != filename assert isabs(filepath_or_buffer) assert os.path.expanduser(filename) == filepath_or_buffer + assert not should_close def test_get_filepath_or_buffer_with_buffer(self): input_buffer = StringIO() - filepath_or_buffer, _, _ = common.get_filepath_or_buffer(input_buffer) + filepath_or_buffer, _, _, should_close = common.get_filepath_or_buffer( + input_buffer) assert filepath_or_buffer == input_buffer + assert not should_close def test_iterator(self): reader = read_csv(StringIO(self.data1), chunksize=1) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 0cc4101cd6304..f3ab74d37a2bc 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1731,13 +1731,16 @@ class _TestMySQLAlchemy(object): @classmethod def connect(cls): url = 'mysql+{driver}://root@localhost/pandas_nosetest' - return sqlalchemy.create_engine(url.format(driver=cls.driver)) + return sqlalchemy.create_engine(url.format(driver=cls.driver), + connect_args=cls.connect_args) @classmethod def setup_driver(cls): try: import pymysql # noqa 
cls.driver = 'pymysql' + from pymysql.constants import CLIENT + cls.connect_args = {'client_flag': CLIENT.MULTI_STATEMENTS} except ImportError: pytest.skip('pymysql not installed') diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 89d76061329a3..4e259d0994bdb 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -589,6 +589,16 @@ def test_105(self): df0['psch_dis'] = df0["psch_dis"].astype(np.float32) tm.assert_frame_equal(df.head(3), df0) + def test_value_labels_old_format(self): + # GH 19417 + # + # Test that value_labels() returns an empty dict if the file format + # predates supporting value labels. + dpath = os.path.join(self.dirpath, 'S4_EDUC1.dta') + reader = StataReader(dpath) + assert reader.value_labels() == {} + reader.close() + def test_date_export_formats(self): columns = ['tc', 'td', 'tw', 'tm', 'tq', 'th', 'ty'] conversions = {c: c for c in columns} diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f63c206c0c407..101d34ebdb89f 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -270,6 +270,14 @@ def test_no_overlap_more_informative_error(self): df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt]) pytest.raises(MergeError, merge, df1, df2) + msg = ('No common columns to perform merge on. ' + 'Merge options: left_on={lon}, right_on={ron}, ' + 'left_index={lidx}, right_index={ridx}' + .format(lon=None, ron=None, lidx=False, ridx=False)) + + with tm.assert_raises_regex(MergeError, msg): + merge(df1, df2) + def test_merge_non_unique_indexes(self): dt = datetime(2012, 5, 1) @@ -1635,6 +1643,25 @@ def test_merge_categorical(self): result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c') tm.assert_frame_equal(result, expected) + def tests_merge_categorical_unordered_equal(self): + # GH-19551 + df1 = DataFrame({ + 'Foo': Categorical(['A', 'B', 'C'], categories=['A', 'B', 'C']), + 'Left': ['A0', 'B0', 'C0'], + }) + + df2 = DataFrame({ + 'Foo': Categorical(['C', 'B', 'A'], categories=['C', 'B', 'A']), + 'Right': ['C1', 'B1', 'A1'], + }) + result = pd.merge(df1, df2, on=['Foo']) + expected = DataFrame({ + 'Foo': pd.Categorical(['A', 'B', 'C']), + 'Left': ['A0', 'B0', 'C0'], + 'Right': ['A1', 'B1', 'C1'], + }) + assert_frame_equal(result, expected) + def test_other_columns(self, left, right): # non-merge columns should preserve if possible right = right.assign(Z=right.Z.astype('category')) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index c9d079421532f..a57c3c41b3637 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -3,6 +3,7 @@ from warnings import catch_warnings import pytest +from collections import OrderedDict from pandas import DataFrame, Series import pandas as pd @@ -457,7 +458,8 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype): @pytest.mark.parametrize('sparse', [True, False]) def test_get_dummies_dont_sparsify_all_columns(self, sparse): # GH18914 - df = DataFrame.from_items([('GDP', [1, 2]), ('Nation', ['AB', 'CD'])]) + df = DataFrame.from_dict(OrderedDict([('GDP', [1, 2]), + ('Nation', ['AB', 'CD'])])) df = get_dummies(df, columns=['Nation'], sparse=sparse) df2 = df.reindex(columns=['GDP']) diff --git a/pandas/tests/scalar/test_timedelta.py b/pandas/tests/scalar/test_timedelta.py index 64d4940082978..667266be2a89b 100644 --- a/pandas/tests/scalar/test_timedelta.py +++ 
b/pandas/tests/scalar/test_timedelta.py @@ -13,7 +13,6 @@ class TestTimedeltaArithmetic(object): - _multiprocess_can_split_ = True def test_arithmetic_overflow(self): with pytest.raises(OverflowError): @@ -286,7 +285,6 @@ def test_compare_timedelta_ndarray(self): class TestTimedeltas(object): - _multiprocess_can_split_ = True def setup_method(self, method): pass diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index 301f6da140866..7695c94409232 100644 --- a/pandas/tests/scalar/test_timestamp.py +++ b/pandas/tests/scalar/test_timestamp.py @@ -18,6 +18,7 @@ from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import get_timezone, dateutil_gettz as gettz +from pandas.errors import OutOfBoundsDatetime from pandas.compat import long, PY3 from pandas.compat.numpy import np_datetime64_compat from pandas import Timestamp, Period, Timedelta @@ -410,6 +411,13 @@ def test_out_of_bounds_string(self): with pytest.raises(ValueError): Timestamp('2263-01-01') + def test_barely_out_of_bounds(self): + # GH#19529 + # GH#19382 close enough to bounds that dropping nanos would result + # in an in-bounds datetime + with pytest.raises(OutOfBoundsDatetime): + Timestamp('2262-04-11 23:47:16.854775808') + def test_bounds_with_different_units(self): out_of_bounds_dates = ('1677-09-21', '2262-04-12') diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index 7a5c6feb8b651..f43651dc6f0db 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -15,12 +15,29 @@ import pandas.util._test_decorators as td from pandas import Timestamp, NaT +from pandas.errors import OutOfBoundsDatetime class TestTimestampTZOperations(object): # -------------------------------------------------------------- # Timestamp.tz_localize + def test_tz_localize_pushes_out_of_bounds(self): + # GH#12677 + # tz_localize that pushes away from the boundary is OK + pac = Timestamp.min.tz_localize('US/Pacific') + assert pac.value > Timestamp.min.value + pac.tz_convert('Asia/Tokyo') # tz_convert doesn't change value + with pytest.raises(OutOfBoundsDatetime): + Timestamp.min.tz_localize('Asia/Tokyo') + + # tz_localize that pushes away from the boundary is OK + tokyo = Timestamp.max.tz_localize('Asia/Tokyo') + assert tokyo.value < Timestamp.max.value + tokyo.tz_convert('US/Pacific') # tz_convert doesn't change value + with pytest.raises(OutOfBoundsDatetime): + Timestamp.max.tz_localize('US/Pacific') + def test_tz_localize_ambiguous_bool(self): # make sure that we are correctly accepting bool values as ambiguous # GH#14402 diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index 70c7308dd3991..8a6989c909cb2 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -10,7 +10,7 @@ from pandas.compat import PY3 from pandas._libs.tslibs.frequencies import _INVALID_FREQ_ERROR -from pandas import Timestamp +from pandas import Timestamp, NaT class TestTimestampUnaryOps(object): @@ -93,6 +93,29 @@ def test_round_frequencies(self, freq, expected): result = stamp.round(freq=freq) assert result == expected + @pytest.mark.parametrize('test_input, rounder, freq, expected', [ + ('2117-01-01 00:00:45', 'floor', '15s', '2117-01-01 00:00:45'), + ('2117-01-01 00:00:45', 'ceil', '15s', '2117-01-01 00:00:45'), + ('2117-01-01 00:00:45.000000012', 'floor', '10ns', + '2117-01-01 
00:00:45.000000010'), + ('1823-01-01 00:00:01.000000012', 'ceil', '10ns', + '1823-01-01 00:00:01.000000020'), + ('1823-01-01 00:00:01', 'floor', '1s', '1823-01-01 00:00:01'), + ('1823-01-01 00:00:01', 'ceil', '1s', '1823-01-01 00:00:01'), + ('NaT', 'floor', '1s', 'NaT'), + ('NaT', 'ceil', '1s', 'NaT') + ]) + def test_ceil_floor_edge(self, test_input, rounder, freq, expected): + dt = Timestamp(test_input) + func = getattr(dt, rounder) + result = func(freq) + + if dt is NaT: + assert result is NaT + else: + expected = Timestamp(expected) + assert result == expected + def test_ceil(self): dt = Timestamp('20130101 09:10:11') result = dt.ceil('D') diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 3822ecd0a1b0e..0780c846a6c19 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -164,8 +164,6 @@ def test_apply_dict_depr(self): class TestSeriesAggregate(TestData): - _multiprocess_can_split_ = True - def test_transform(self): # transforming functions diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index ed12aa8c44a82..7dcd0ffc72eb4 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -344,7 +344,7 @@ def test_ops_series_period(self): # dtype will be object because of original dtype expected = pd.Series([9, 8], name='xxx', dtype=object) tm.assert_series_equal(per - ser, expected) - tm.assert_series_equal(ser - per, -expected) + tm.assert_series_equal(ser - per, -1 * expected) s2 = pd.Series([pd.Period('2015-01-05', freq='D'), pd.Period('2015-01-04', freq='D')], name='xxx') @@ -352,7 +352,7 @@ def test_ops_series_period(self): expected = pd.Series([4, 2], name='xxx', dtype=object) tm.assert_series_equal(s2 - ser, expected) - tm.assert_series_equal(ser - s2, -expected) + tm.assert_series_equal(ser - s2, -1 * expected) class TestTimestampSeriesArithmetic(object): diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 05ccb25960b1f..554b3e15d8f10 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -596,77 +596,81 @@ def test_divide_decimal(self): assert_series_equal(expected, s) - def test_div(self): + @pytest.mark.parametrize( + 'dtype2', + [ + np.int64, np.int32, np.int16, np.int8, + np.float64, np.float32, np.float16, + np.uint64, np.uint32, + np.uint16, np.uint8 + ]) + @pytest.mark.parametrize('dtype1', [np.int64, np.float64, np.uint64]) + def test_ser_div_ser(self, dtype1, dtype2): + # no longer do integer div for any ops, but deal with the 0's + first = Series([3, 4, 5, 8], name='first').astype(dtype1) + second = Series([0, 0, 0, 3], name='second').astype(dtype2) + with np.errstate(all='ignore'): - # no longer do integer div for any ops, but deal with the 0's - p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) - result = p['first'] / p['second'] - expected = Series( - p['first'].values.astype(float) / p['second'].values, - dtype='float64') - expected.iloc[0:3] = np.inf - assert_series_equal(result, expected) + expected = Series(first.values.astype(np.float64) / second.values, + dtype='float64', name=None) + expected.iloc[0:3] = np.inf - result = p['first'] / 0 - expected = Series(np.inf, index=p.index, name='first') - assert_series_equal(result, expected) + result = first / second + assert_series_equal(result, expected) + assert not result.equals(second / first) - p = p.astype('float64') - result = p['first'] / p['second'] - expected 
= Series(p['first'].values / p['second'].values) - assert_series_equal(result, expected) + def test_div_equiv_binop(self): + # Test Series.div as well as Series.__div__ + # float/integer issue + # GH#7785 + first = pd.Series([1, 0], name='first') + second = pd.Series([-0.01, -0.02], name='second') + expected = Series([-0.01, -np.inf]) - p = DataFrame({'first': [3, 4, 5, 8], 'second': [1, 1, 1, 1]}) - result = p['first'] / p['second'] - assert_series_equal(result, p['first'].astype('float64'), - check_names=False) - assert result.name is None - assert not result.equals(p['second'] / p['first']) - - # inf signing - s = Series([np.nan, 1., -1.]) - result = s / 0 - expected = Series([np.nan, np.inf, -np.inf]) - assert_series_equal(result, expected) + result = second.div(first) + assert_series_equal(result, expected, check_names=False) - # float/integer issue - # GH 7785 - p = DataFrame({'first': (1, 0), 'second': (-0.01, -0.02)}) - expected = Series([-0.01, -np.inf]) + result = second / first + assert_series_equal(result, expected) - result = p['second'].div(p['first']) - assert_series_equal(result, expected, check_names=False) + def test_rdiv_zero_compat(self): + # GH#8674 + zero_array = np.array([0] * 5) + data = np.random.randn(5) + expected = pd.Series([0.] * 5) - result = p['second'] / p['first'] - assert_series_equal(result, expected) + result = zero_array / pd.Series(data) + assert_series_equal(result, expected) - # GH 9144 - s = Series([-1, 0, 1]) + result = pd.Series(zero_array) / data + assert_series_equal(result, expected) - result = 0 / s - expected = Series([0.0, nan, 0.0]) - assert_series_equal(result, expected) + result = pd.Series(zero_array) / pd.Series(data) + assert_series_equal(result, expected) - result = s / 0 - expected = Series([-inf, nan, inf]) - assert_series_equal(result, expected) + def test_div_zero_inf_signs(self): + # GH#9144, inf signing + ser = Series([-1, 0, 1], name='first') + expected = Series([-np.inf, np.nan, np.inf], name='first') - result = s // 0 - expected = Series([-inf, nan, inf]) - assert_series_equal(result, expected) + result = ser / 0 + assert_series_equal(result, expected) - # GH 8674 - zero_array = np.array([0] * 5) - data = np.random.randn(5) - expected = pd.Series([0.] 
* 5) - result = zero_array / pd.Series(data) - assert_series_equal(result, expected) + def test_rdiv_zero(self): + # GH#9144 + ser = Series([-1, 0, 1], name='first') + expected = Series([0.0, np.nan, 0.0], name='first') - result = pd.Series(zero_array) / data - assert_series_equal(result, expected) + result = 0 / ser + assert_series_equal(result, expected) - result = pd.Series(zero_array) / pd.Series(data) - assert_series_equal(result, expected) + def test_floordiv_div(self): + # GH#9144 + ser = Series([-1, 0, 1], name='first') + + result = ser // 0 + expected = Series([-inf, nan, inf], name='first') + assert_series_equal(result, expected) class TestTimedeltaSeriesArithmeticWithIntegers(object): @@ -1576,33 +1580,42 @@ def test_dt64_series_add_intlike(self, tz): class TestSeriesOperators(TestData): - def test_op_method(self): - def check(series, other, check_reverse=False): - simple_ops = ['add', 'sub', 'mul', 'floordiv', 'truediv', 'pow'] - if not compat.PY3: - simple_ops.append('div') - - for opname in simple_ops: - op = getattr(Series, opname) - - if op == 'div': - alt = operator.truediv - else: - alt = getattr(operator, opname) - - result = op(series, other) - expected = alt(series, other) - assert_almost_equal(result, expected) - if check_reverse: - rop = getattr(Series, "r" + opname) - result = rop(series, other) - expected = alt(other, series) - assert_almost_equal(result, expected) + @pytest.mark.parametrize( + 'ts', + [ + (lambda x: x, lambda x: x * 2, False), + (lambda x: x, lambda x: x[::2], False), + (lambda x: x, lambda x: 5, True), + (lambda x: tm.makeFloatSeries(), + lambda x: tm.makeFloatSeries(), + True) + ]) + @pytest.mark.parametrize('opname', ['add', 'sub', 'mul', 'floordiv', + 'truediv', 'div', 'pow']) + def test_op_method(self, opname, ts): + # check that Series.{opname} behaves like Series.__{opname}__, + series = ts[0](self.ts) + other = ts[1](self.ts) + check_reverse = ts[2] + + if opname == 'div' and compat.PY3: + pytest.skip('div test only for Py3') + + op = getattr(Series, opname) + + if op == 'div': + alt = operator.truediv + else: + alt = getattr(operator, opname) - check(self.ts, self.ts * 2) - check(self.ts, self.ts[::2]) - check(self.ts, 5, check_reverse=True) - check(tm.makeFloatSeries(), tm.makeFloatSeries(), check_reverse=True) + result = op(series, other) + expected = alt(series, other) + assert_almost_equal(result, expected) + if check_reverse: + rop = getattr(Series, "r" + opname) + result = rop(series, other) + expected = alt(other, series) + assert_almost_equal(result, expected) def test_neg(self): assert_series_equal(-self.series, -1 * self.series) @@ -1971,20 +1984,15 @@ def test_operators_corner(self): index=self.ts.index[:-5], name='ts') tm.assert_series_equal(added[:-5], expected) - def test_operators_reverse_object(self): + @pytest.mark.parametrize('op', [operator.add, operator.sub, operator.mul, + operator.truediv, operator.floordiv]) + def test_operators_reverse_object(self, op): # GH 56 arr = Series(np.random.randn(10), index=np.arange(10), dtype=object) - def _check_op(arr, op): - result = op(1., arr) - expected = op(1., arr.astype(float)) - assert_series_equal(result.astype(float), expected) - - _check_op(arr, operator.add) - _check_op(arr, operator.sub) - _check_op(arr, operator.mul) - _check_op(arr, operator.truediv) - _check_op(arr, operator.floordiv) + result = op(1., arr) + expected = op(1., arr.astype(float)) + assert_series_equal(result.astype(float), expected) def test_arith_ops_df_compat(self): # GH 1134 diff --git 
a/pandas/tests/series/test_timezones.py b/pandas/tests/series/test_timezones.py new file mode 100644 index 0000000000000..2e15c964e4e93 --- /dev/null +++ b/pandas/tests/series/test_timezones.py @@ -0,0 +1,293 @@ +# -*- coding: utf-8 -*- +""" +Tests for Series timezone-related methods +""" +from datetime import datetime + +import pytest +import pytz +import numpy as np +from dateutil.tz import tzoffset + +import pandas.util.testing as tm +from pandas._libs import tslib +from pandas._libs.tslibs import timezones +from pandas.compat import lrange +from pandas.core.indexes.datetimes import date_range +from pandas import Series, Timestamp, DatetimeIndex, Index + + +class TestSeriesTimezones(object): + # ----------------------------------------------------------------- + # Series.tz_localize + def test_series_tz_localize(self): + + rng = date_range('1/1/2011', periods=100, freq='H') + ts = Series(1, index=rng) + + result = ts.tz_localize('utc') + assert result.index.tz.zone == 'UTC' + + # Can't localize if already tz-aware + rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + ts = Series(1, index=rng) + tm.assert_raises_regex(TypeError, 'Already tz-aware', + ts.tz_localize, 'US/Eastern') + + def test_series_tz_localize_ambiguous_bool(self): + # make sure that we are correctly accepting bool values as ambiguous + + # GH#14402 + ts = Timestamp('2015-11-01 01:00:03') + expected0 = Timestamp('2015-11-01 01:00:03-0500', tz='US/Central') + expected1 = Timestamp('2015-11-01 01:00:03-0600', tz='US/Central') + + ser = Series([ts]) + expected0 = Series([expected0]) + expected1 = Series([expected1]) + + with pytest.raises(pytz.AmbiguousTimeError): + ser.dt.tz_localize('US/Central') + + result = ser.dt.tz_localize('US/Central', ambiguous=True) + tm.assert_series_equal(result, expected0) + + result = ser.dt.tz_localize('US/Central', ambiguous=[True]) + tm.assert_series_equal(result, expected0) + + result = ser.dt.tz_localize('US/Central', ambiguous=False) + tm.assert_series_equal(result, expected1) + + result = ser.dt.tz_localize('US/Central', ambiguous=[False]) + tm.assert_series_equal(result, expected1) + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_series_tz_localize_empty(self, tzstr): + # GH#2248 + ser = Series() + + ser2 = ser.tz_localize('utc') + assert ser2.index.tz == pytz.utc + + ser2 = ser.tz_localize(tzstr) + timezones.tz_compare(ser2.index.tz, timezones.maybe_get_tz(tzstr)) + + # ----------------------------------------------------------------- + # Series.tz_convert + + def test_series_tz_convert(self): + rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern') + ts = Series(1, index=rng) + + result = ts.tz_convert('Europe/Berlin') + assert result.index.tz.zone == 'Europe/Berlin' + + # can't convert tz-naive + rng = date_range('1/1/2011', periods=200, freq='D') + ts = Series(1, index=rng) + tm.assert_raises_regex(TypeError, "Cannot convert tz-naive", + ts.tz_convert, 'US/Eastern') + + # ----------------------------------------------------------------- + # Series.append + + def test_series_append_aware(self): + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', + tz='US/Eastern') + rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', + tz='US/Eastern') + ser1 = Series([1], index=rng1) + ser2 = Series([2], index=rng2) + ts_result = ser1.append(ser2) + + exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], + tz='US/Eastern') + exp = Series([1, 2], index=exp_index) + tm.assert_series_equal(ts_result, exp) + 
assert ts_result.index.tz == rng1.tz + + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='UTC') + rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='UTC') + ser1 = Series([1], index=rng1) + ser2 = Series([2], index=rng2) + ts_result = ser1.append(ser2) + + exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], + tz='UTC') + exp = Series([1, 2], index=exp_index) + tm.assert_series_equal(ts_result, exp) + utc = rng1.tz + assert utc == ts_result.index.tz + + # GH#7795 + # different tz coerces to object dtype, not UTC + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', + tz='US/Eastern') + rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', + tz='US/Central') + ser1 = Series([1], index=rng1) + ser2 = Series([2], index=rng2) + ts_result = ser1.append(ser2) + exp_index = Index([Timestamp('1/1/2011 01:00', tz='US/Eastern'), + Timestamp('1/1/2011 02:00', tz='US/Central')]) + exp = Series([1, 2], index=exp_index) + tm.assert_series_equal(ts_result, exp) + + def test_series_append_aware_naive(self): + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') + rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', + tz='US/Eastern') + ser1 = Series(np.random.randn(len(rng1)), index=rng1) + ser2 = Series(np.random.randn(len(rng2)), index=rng2) + ts_result = ser1.append(ser2) + + expected = ser1.index.astype(object).append(ser2.index.astype(object)) + assert ts_result.index.equals(expected) + + # mixed + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') + rng2 = lrange(100) + ser1 = Series(np.random.randn(len(rng1)), index=rng1) + ser2 = Series(np.random.randn(len(rng2)), index=rng2) + ts_result = ser1.append(ser2) + + expected = ser1.index.astype(object).append(ser2.index) + assert ts_result.index.equals(expected) + + def test_series_append_dst(self): + rng1 = date_range('1/1/2016 01:00', periods=3, freq='H', + tz='US/Eastern') + rng2 = date_range('8/1/2016 01:00', periods=3, freq='H', + tz='US/Eastern') + ser1 = Series([1, 2, 3], index=rng1) + ser2 = Series([10, 11, 12], index=rng2) + ts_result = ser1.append(ser2) + + exp_index = DatetimeIndex(['2016-01-01 01:00', '2016-01-01 02:00', + '2016-01-01 03:00', '2016-08-01 01:00', + '2016-08-01 02:00', '2016-08-01 03:00'], + tz='US/Eastern') + exp = Series([1, 2, 3, 10, 11, 12], index=exp_index) + tm.assert_series_equal(ts_result, exp) + assert ts_result.index.tz == rng1.tz + + # ----------------------------------------------------------------- + + def test_dateutil_tzoffset_support(self): + values = [188.5, 328.25] + tzinfo = tzoffset(None, 7200) + index = [datetime(2012, 5, 11, 11, tzinfo=tzinfo), + datetime(2012, 5, 11, 12, tzinfo=tzinfo)] + series = Series(data=values, index=index) + + assert series.index.tz == tzinfo + + # it works! #2443 + repr(series.index[0]) + + @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + def test_tz_aware_asfreq(self, tz): + dr = date_range('2011-12-01', '2012-07-20', freq='D', tz=tz) + + ser = Series(np.random.randn(len(dr)), index=dr) + + # it works! 
+ ser.asfreq('T') + + @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + def test_string_index_alias_tz_aware(self, tz): + rng = date_range('1/1/2000', periods=10, tz=tz) + ser = Series(np.random.randn(len(rng)), index=rng) + + result = ser['1/3/2000'] + tm.assert_almost_equal(result, ser[2]) + + # TODO: De-duplicate with test below + def test_series_add_tz_mismatch_converts_to_utc_duplicate(self): + rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern') + ser = Series(np.random.randn(len(rng)), index=rng) + + ts_moscow = ser.tz_convert('Europe/Moscow') + + result = ser + ts_moscow + assert result.index.tz is pytz.utc + + result = ts_moscow + ser + assert result.index.tz is pytz.utc + + def test_series_add_tz_mismatch_converts_to_utc(self): + rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + + perm = np.random.permutation(100)[:90] + ser1 = Series(np.random.randn(90), + index=rng.take(perm).tz_convert('US/Eastern')) + + perm = np.random.permutation(100)[:90] + ser2 = Series(np.random.randn(90), + index=rng.take(perm).tz_convert('Europe/Berlin')) + + result = ser1 + ser2 + + uts1 = ser1.tz_convert('utc') + uts2 = ser2.tz_convert('utc') + expected = uts1 + uts2 + + assert result.index.tz == pytz.UTC + tm.assert_series_equal(result, expected) + + def test_series_add_aware_naive_raises(self): + rng = date_range('1/1/2011', periods=10, freq='H') + ser = Series(np.random.randn(len(rng)), index=rng) + + ser_utc = ser.tz_localize('utc') + + with pytest.raises(Exception): + ser + ser_utc + + with pytest.raises(Exception): + ser_utc + ser + + def test_series_align_aware(self): + idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern') + ser = Series(np.random.randn(len(idx1)), index=idx1) + ser_central = ser.tz_convert('US/Central') + # # different timezones convert to UTC + + new1, new2 = ser.align(ser_central) + assert new1.index.tz == pytz.UTC + assert new2.index.tz == pytz.UTC + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_localized_at_time_between_time(self, tzstr): + from datetime import time + tz = timezones.maybe_get_tz(tzstr) + + rng = date_range('4/16/2012', '5/1/2012', freq='H') + ts = Series(np.random.randn(len(rng)), index=rng) + + ts_local = ts.tz_localize(tzstr) + + result = ts_local.at_time(time(10, 0)) + expected = ts.at_time(time(10, 0)).tz_localize(tzstr) + tm.assert_series_equal(result, expected) + assert timezones.tz_compare(result.index.tz, tz) + + t1, t2 = time(10, 0), time(11, 0) + result = ts_local.between_time(t1, t2) + expected = ts.between_time(t1, t2).tz_localize(tzstr) + tm.assert_series_equal(result, expected) + assert timezones.tz_compare(result.index.tz, tz) + + @pytest.mark.parametrize('tzstr', ['Europe/Berlin', + 'dateutil/Europe/Berlin']) + def test_getitem_pydatetime_tz(self, tzstr): + tz = timezones.maybe_get_tz(tzstr) + + index = date_range(start='2012-12-24 16:00', end='2012-12-24 18:00', + freq='H', tz=tzstr) + ts = Series(index=index, data=index.hour) + time_pandas = Timestamp('2012-12-24 17:00', tz=tzstr) + + dt = datetime(2012, 12, 24, 17, 0) + time_datetime = tslib._localize_pydatetime(dt, tz) + assert ts[time_pandas] == ts[time_datetime] diff --git a/pandas/tests/sparse/frame/test_apply.py b/pandas/tests/sparse/frame/test_apply.py new file mode 100644 index 0000000000000..07e4b1bf7c913 --- /dev/null +++ b/pandas/tests/sparse/frame/test_apply.py @@ -0,0 +1,92 @@ +import pytest +import numpy as np +from pandas import SparseDataFrame, DataFrame, Series, 
bdate_range +from pandas.core import nanops +from pandas.util import testing as tm + + +@pytest.fixture +def dates(): + return bdate_range('1/1/2011', periods=10) + + +@pytest.fixture +def empty(): + return SparseDataFrame() + + +@pytest.fixture +def frame(dates): + data = {'A': [np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6], + 'B': [0, 1, 2, np.nan, np.nan, np.nan, 3, 4, 5, 6], + 'C': np.arange(10, dtype=np.float64), + 'D': [0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan]} + + return SparseDataFrame(data, index=dates) + + +@pytest.fixture +def fill_frame(frame): + values = frame.values.copy() + values[np.isnan(values)] = 2 + + return SparseDataFrame(values, columns=['A', 'B', 'C', 'D'], + default_fill_value=2, + index=frame.index) + + +def test_apply(frame): + applied = frame.apply(np.sqrt) + assert isinstance(applied, SparseDataFrame) + tm.assert_almost_equal(applied.values, np.sqrt(frame.values)) + + # agg / broadcast + with tm.assert_produces_warning(FutureWarning): + broadcasted = frame.apply(np.sum, broadcast=True) + assert isinstance(broadcasted, SparseDataFrame) + + with tm.assert_produces_warning(FutureWarning): + exp = frame.to_dense().apply(np.sum, broadcast=True) + tm.assert_frame_equal(broadcasted.to_dense(), exp) + + applied = frame.apply(np.sum) + tm.assert_series_equal(applied, + frame.to_dense().apply(nanops.nansum)) + + +def test_apply_fill(fill_frame): + applied = fill_frame.apply(np.sqrt) + assert applied['A'].fill_value == np.sqrt(2) + + +def test_apply_empty(empty): + assert empty.apply(np.sqrt) is empty + + +def test_apply_nonuq(): + orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=['a', 'a', 'c']) + sparse = orig.to_sparse() + res = sparse.apply(lambda s: s[0], axis=1) + exp = orig.apply(lambda s: s[0], axis=1) + + # dtype must be kept + assert res.dtype == np.int64 + + # ToDo: apply must return subclassed dtype + assert isinstance(res, Series) + tm.assert_series_equal(res.to_dense(), exp) + + # df.T breaks + sparse = orig.T.to_sparse() + res = sparse.apply(lambda s: s[0], axis=0) # noqa + exp = orig.T.apply(lambda s: s[0], axis=0) + + # TODO: no non-unique columns supported in sparse yet + # tm.assert_series_equal(res.to_dense(), exp) + + +def test_applymap(frame): + # just test that it works + result = frame.applymap(lambda x: x * 2) + assert isinstance(result, SparseDataFrame) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 54f567bcd2a8c..0e8b2161cafc4 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -621,52 +621,6 @@ def test_append(self): tm.assert_sp_frame_equal(appended.iloc[:, :3], self.frame.iloc[:, :3], exact_indices=False) - def test_apply(self): - applied = self.frame.apply(np.sqrt) - assert isinstance(applied, SparseDataFrame) - tm.assert_almost_equal(applied.values, np.sqrt(self.frame.values)) - - applied = self.fill_frame.apply(np.sqrt) - assert applied['A'].fill_value == np.sqrt(2) - - # agg / broadcast - broadcasted = self.frame.apply(np.sum, broadcast=True) - assert isinstance(broadcasted, SparseDataFrame) - - exp = self.frame.to_dense().apply(np.sum, broadcast=True) - tm.assert_frame_equal(broadcasted.to_dense(), exp) - - assert self.empty.apply(np.sqrt) is self.empty - - from pandas.core import nanops - applied = self.frame.apply(np.sum) - tm.assert_series_equal(applied, - self.frame.to_dense().apply(nanops.nansum)) - - def test_apply_nonuq(self): - orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=['a', 'a', 'c']) - 
sparse = orig.to_sparse() - res = sparse.apply(lambda s: s[0], axis=1) - exp = orig.apply(lambda s: s[0], axis=1) - # dtype must be kept - assert res.dtype == np.int64 - # ToDo: apply must return subclassed dtype - assert isinstance(res, pd.Series) - tm.assert_series_equal(res.to_dense(), exp) - - # df.T breaks - sparse = orig.T.to_sparse() - res = sparse.apply(lambda s: s[0], axis=0) # noqa - exp = orig.T.apply(lambda s: s[0], axis=0) - # TODO: no non-unique columns supported in sparse yet - # tm.assert_series_equal(res.to_dense(), exp) - - def test_applymap(self): - # just test that it works - result = self.frame.applymap(lambda x: x * 2) - assert isinstance(result, SparseDataFrame) - def test_astype(self): sparse = pd.SparseDataFrame({'A': SparseArray([1, 2, 3, 4], dtype=np.int64), @@ -1303,3 +1257,14 @@ def test_quantile_multi(self): tm.assert_frame_equal(result, dense_expected) tm.assert_sp_frame_equal(result, sparse_expected) + + def test_assign_with_sparse_frame(self): + # GH 19163 + df = pd.DataFrame({"a": [1, 2, 3]}) + res = df.to_sparse(fill_value=False).assign(newcol=False) + exp = df.assign(newcol=False).to_sparse(fill_value=False) + + tm.assert_sp_frame_equal(res, exp) + + for column in res.columns: + assert type(res[column]) is SparseSeries diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 2ea1e63433520..3f5d5a59cc540 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -23,6 +23,8 @@ from pandas.core.sparse.api import SparseSeries from pandas.tests.series.test_api import SharedWithSparse +from itertools import product + def _test_data1(): # nan-based @@ -971,6 +973,17 @@ def test_combine_first(self): tm.assert_sp_series_equal(result, result2) tm.assert_sp_series_equal(result, expected) + @pytest.mark.parametrize('deep,fill_values', [([True, False], + [0, 1, np.nan, None])]) + def test_memory_usage_deep(self, deep, fill_values): + for deep, fill_value in product(deep, fill_values): + sparse_series = SparseSeries(fill_values, fill_value=fill_value) + dense_series = Series(fill_values) + sparse_usage = sparse_series.memory_usage(deep=deep) + dense_usage = dense_series.memory_usage(deep=deep) + + assert sparse_usage < dense_usage + class TestSparseHandlingMultiIndexes(object): diff --git a/pandas/tests/sparse/test_array.py b/pandas/tests/sparse/test_array.py index 8de93ff320961..6c0c83cf65ff7 100644 --- a/pandas/tests/sparse/test_array.py +++ b/pandas/tests/sparse/test_array.py @@ -113,6 +113,21 @@ def test_constructor_spindex_dtype(self): assert arr.dtype == np.int64 assert arr.fill_value == 0 + @pytest.mark.parametrize('scalar,dtype', [ + (False, bool), + (0.0, 'float64'), + (1, 'int64'), + ('z', 'object')]) + def test_scalar_with_index_infer_dtype(self, scalar, dtype): + # GH 19163 + arr = SparseArray(scalar, index=[1, 2, 3], fill_value=scalar) + exp = SparseArray([scalar, scalar, scalar], fill_value=scalar) + + tm.assert_sp_array_equal(arr, exp) + + assert arr.dtype == dtype + assert exp.dtype == dtype + def test_sparseseries_roundtrip(self): # GH 13999 for kind in ['integer', 'block']: diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 118e07e37fb47..61f9619752e3d 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -346,8 +346,9 @@ def test_ops(self): if not isinstance(o, PeriodIndex): expected = getattr(o.values, op)() else: - expected = pd.Period(ordinal=getattr(o._values, op)(), - freq=o.freq) + expected = pd.Period( + 
ordinal=getattr(o._ndarray_values, op)(), + freq=o.freq) try: assert result == expected except TypeError: @@ -458,7 +459,7 @@ def test_value_counts_unique_nunique_null(self): for orig in self.objs: o = orig.copy() klass = type(o) - values = o._values + values = o._ndarray_values if not self._allow_na_ops(o): continue @@ -1183,3 +1184,54 @@ def test_iter_box(self): assert isinstance(res, pd.Period) assert res.freq == 'M' assert res == exp + + +@pytest.mark.parametrize('array, expected_type, dtype', [ + (np.array([0, 1], dtype=np.int64), np.ndarray, 'int64'), + (np.array(['a', 'b']), np.ndarray, 'object'), + (pd.Categorical(['a', 'b']), pd.Categorical, 'category'), + (pd.DatetimeIndex(['2017', '2018']), np.ndarray, 'datetime64[ns]'), + (pd.DatetimeIndex(['2017', '2018'], tz="US/Central"), pd.DatetimeIndex, + 'datetime64[ns, US/Central]'), + (pd.TimedeltaIndex([10**10]), np.ndarray, 'm8[ns]'), + (pd.PeriodIndex([2018, 2019], freq='A'), np.ndarray, 'object'), + (pd.IntervalIndex.from_breaks([0, 1, 2]), np.ndarray, 'object'), +]) +def test_values_consistent(array, expected_type, dtype): + l_values = pd.Series(array)._values + r_values = pd.Index(array)._values + assert type(l_values) is expected_type + assert type(l_values) is type(r_values) + + if isinstance(l_values, np.ndarray): + tm.assert_numpy_array_equal(l_values, r_values) + elif isinstance(l_values, pd.Index): + tm.assert_index_equal(l_values, r_values) + elif pd.api.types.is_categorical(l_values): + tm.assert_categorical_equal(l_values, r_values) + else: + raise TypeError("Unexpected type {}".format(type(l_values))) + + assert l_values.dtype == dtype + assert r_values.dtype == dtype + + +@pytest.mark.parametrize('array, expected', [ + (np.array([0, 1], dtype=np.int64), np.array([0, 1], dtype=np.int64)), + (np.array(['0', '1']), np.array(['0', '1'], dtype=object)), + (pd.Categorical(['a', 'a']), np.array([0, 0], dtype='int8')), + (pd.DatetimeIndex(['2017-01-01T00:00:00']), + np.array(['2017-01-01T00:00:00'], dtype='M8[ns]')), + (pd.DatetimeIndex(['2017-01-01T00:00:00'], tz="US/Eastern"), + np.array(['2017-01-01T05:00:00'], dtype='M8[ns]')), + (pd.TimedeltaIndex([10**10]), np.array([10**10], dtype='m8[ns]')), + pytest.mark.xfail(reason='PeriodArray not implemented')(( + pd.PeriodIndex(['2017', '2018'], freq='D'), + np.array([17167, 17532]), + )), +]) +def test_ndarray_values(array, expected): + l_values = pd.Series(array)._ndarray_values + r_values = pd.Index(array)._ndarray_values + tm.assert_numpy_array_equal(l_values, r_values) + tm.assert_numpy_array_equal(l_values, expected) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index a5aaa328a8e06..23cc18de34778 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -20,7 +20,6 @@ from pandas.core.dtypes.generic import ABCSeries, ABCDataFrame from pandas.compat import range, lrange, zip, product, OrderedDict -from pandas.core.base import SpecificationError from pandas.errors import UnsupportedFunctionCall from pandas.core.groupby import DataError import pandas.core.common as com @@ -614,7 +613,7 @@ def f(): t[['A']].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) - pytest.raises(SpecificationError, f) + pytest.raises(KeyError, f) def test_agg_nested_dicts(self): @@ -659,6 +658,21 @@ def f(): 'B': {'rb': ['mean', 'std']}}) assert_frame_equal(result, expected, check_like=True) + def test_try_aggregate_non_existing_column(self): + # GH 16766 + data = [ + {'dt': datetime(2017, 6, 1, 0), 'x': 1.0, 'y': 2.0}, + {'dt': datetime(2017, 6, 1, 
1), 'x': 2.0, 'y': 2.0}, + {'dt': datetime(2017, 6, 1, 2), 'x': 3.0, 'y': 1.5} + ] + df = DataFrame(data).set_index('dt') + + # Error as we don't have 'z' column + with pytest.raises(KeyError): + df.resample('30T').agg({'x': ['mean'], + 'y': ['median'], + 'z': ['sum']}) + def test_selection_api_validation(self): # GH 13500 index = date_range(datetime(2005, 1, 1), @@ -3077,6 +3091,15 @@ def test_getitem_multiple(self): result = r['buyer'].count() assert_series_equal(result, expected) + def test_groupby_resample_on_api_with_getitem(self): + # GH 17813 + df = pd.DataFrame({'id': list('aabbb'), + 'date': pd.date_range('1-1-2016', periods=5), + 'data': 1}) + exp = df.set_index('date').groupby('id').resample('2D')['data'].sum() + result = df.groupby('id').resample('2D', on='date')['data'].sum() + assert_series_equal(result, exp) + def test_nearest(self): # GH 17496 diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 973fe74429551..178c5ff655b04 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -612,13 +612,16 @@ def test_match(self): def test_extract_expand_None(self): values = Series(['fooBAD__barBAD', NA, 'foo']) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_raises_regex(ValueError, + 'expand must be True or False'): values.str.extract('.*(BAD[_]+).*(BAD)', expand=None) def test_extract_expand_unspecified(self): values = Series(['fooBAD__barBAD', NA, 'foo']) - with tm.assert_produces_warning(FutureWarning): - values.str.extract('.*(BAD[_]+).*(BAD)') + result_unspecified = values.str.extract('.*(BAD[_]+).*') + assert isinstance(result_unspecified, DataFrame) + result_true = values.str.extract('.*(BAD[_]+).*', expand=True) + tm.assert_frame_equal(result_unspecified, result_true) def test_extract_expand_False(self): # Contains tests like those in test_match and some others. diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index 2630984a70807..565e735c14c80 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -5,43 +5,13 @@ import dateutil import numpy as np -from dateutil.parser import parse -from pytz import NonExistentTimeError -from distutils.version import LooseVersion -from dateutil.tz import tzlocal, tzoffset -from datetime import datetime, timedelta, tzinfo +from datetime import datetime import pandas.util.testing as tm -import pandas.util._test_decorators as td -import pandas.tseries.offsets as offsets -from pandas.compat import lrange, zip -from pandas.core.indexes.datetimes import bdate_range, date_range +from pandas.core.indexes.datetimes import date_range from pandas._libs import tslib from pandas._libs.tslibs import timezones, conversion -from pandas import (Index, Series, isna, Timestamp, NaT, - DatetimeIndex, to_datetime) -from pandas.util.testing import assert_series_equal, set_timezone - - -class FixedOffset(tzinfo): - """Fixed offset in minutes east from UTC.""" - - def __init__(self, offset, name): - self.__offset = timedelta(minutes=offset) - self.__name = name - - def utcoffset(self, dt): - return self.__offset - - def tzname(self, dt): - return self.__name - - def dst(self, dt): - return timedelta(0) - - -fixed_off = FixedOffset(-420, '-07:00') -fixed_off_no_name = FixedOffset(-330, None) +from pandas import Timestamp class TestTimeZoneSupportPytz(object): @@ -68,438 +38,6 @@ def cmptz(self, tz1, tz2): # tests. 
return tz1.zone == tz2.zone - def test_utc_to_local_no_modify(self): - rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc') - rng_eastern = rng.tz_convert(self.tzstr('US/Eastern')) - - # Values are unmodified - tm.assert_numpy_array_equal(rng.asi8, rng_eastern.asi8) - - assert self.cmptz(rng_eastern.tz, self.tz('US/Eastern')) - - def test_utc_to_local_no_modify_explicit(self): - rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc') - rng_eastern = rng.tz_convert(self.tz('US/Eastern')) - - # Values are unmodified - tm.assert_numpy_array_equal(rng.asi8, rng_eastern.asi8) - - assert rng_eastern.tz == self.tz('US/Eastern') - - def test_localize_utc_conversion(self): - # Localizing to time zone should: - # 1) check for DST ambiguities - # 2) convert to UTC - - rng = date_range('3/10/2012', '3/11/2012', freq='30T') - - converted = rng.tz_localize(self.tzstr('US/Eastern')) - expected_naive = rng + offsets.Hour(5) - tm.assert_numpy_array_equal(converted.asi8, expected_naive.asi8) - - # DST ambiguity, this should fail - rng = date_range('3/11/2012', '3/12/2012', freq='30T') - # Is this really how it should fail?? - pytest.raises(NonExistentTimeError, rng.tz_localize, - self.tzstr('US/Eastern')) - - def test_localize_utc_conversion_explicit(self): - # Localizing to time zone should: - # 1) check for DST ambiguities - # 2) convert to UTC - - rng = date_range('3/10/2012', '3/11/2012', freq='30T') - converted = rng.tz_localize(self.tz('US/Eastern')) - expected_naive = rng + offsets.Hour(5) - tm.assert_numpy_array_equal(converted.asi8, expected_naive.asi8) - - # DST ambiguity, this should fail - rng = date_range('3/11/2012', '3/12/2012', freq='30T') - # Is this really how it should fail?? - pytest.raises(NonExistentTimeError, rng.tz_localize, - self.tz('US/Eastern')) - - def test_tz_localize_dti(self): - dti = DatetimeIndex(start='1/1/2005', end='1/1/2005 0:00:30.256', - freq='L') - dti2 = dti.tz_localize(self.tzstr('US/Eastern')) - - dti_utc = DatetimeIndex(start='1/1/2005 05:00', - end='1/1/2005 5:00:30.256', freq='L', tz='utc') - - tm.assert_numpy_array_equal(dti2.values, dti_utc.values) - - dti3 = dti2.tz_convert(self.tzstr('US/Pacific')) - tm.assert_numpy_array_equal(dti3.values, dti_utc.values) - - dti = DatetimeIndex(start='11/6/2011 1:59', end='11/6/2011 2:00', - freq='L') - pytest.raises(pytz.AmbiguousTimeError, dti.tz_localize, - self.tzstr('US/Eastern')) - - dti = DatetimeIndex(start='3/13/2011 1:59', end='3/13/2011 2:00', - freq='L') - pytest.raises(pytz.NonExistentTimeError, dti.tz_localize, - self.tzstr('US/Eastern')) - - def test_tz_localize_empty_series(self): - # #2248 - - ts = Series() - - ts2 = ts.tz_localize('utc') - assert ts2.index.tz == pytz.utc - - ts2 = ts.tz_localize(self.tzstr('US/Eastern')) - assert self.cmptz(ts2.index.tz, self.tz('US/Eastern')) - - def test_create_with_tz(self): - stamp = Timestamp('3/11/2012 05:00', tz=self.tzstr('US/Eastern')) - assert stamp.hour == 5 - - rng = date_range('3/11/2012 04:00', periods=10, freq='H', - tz=self.tzstr('US/Eastern')) - - assert stamp == rng[1] - - def test_create_with_fixed_tz(self): - off = FixedOffset(420, '+07:00') - start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) - end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) - rng = date_range(start=start, end=end) - assert off == rng.tz - - rng2 = date_range(start, periods=len(rng), tz=off) - tm.assert_index_equal(rng, rng2) - - rng3 = date_range('3/11/2012 05:00:00+07:00', - '6/11/2012 05:00:00+07:00') - assert (rng.values == rng3.values).all() - - def 
test_create_with_fixedoffset_noname(self): - off = fixed_off_no_name - start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) - end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) - rng = date_range(start=start, end=end) - assert off == rng.tz - - idx = Index([start, end]) - assert off == idx.tz - - def test_date_range_localize(self): - rng = date_range('3/11/2012 03:00', periods=15, freq='H', - tz='US/Eastern') - rng2 = DatetimeIndex(['3/11/2012 03:00', '3/11/2012 04:00'], - tz='US/Eastern') - rng3 = date_range('3/11/2012 03:00', periods=15, freq='H') - rng3 = rng3.tz_localize('US/Eastern') - - tm.assert_index_equal(rng, rng3) - - # DST transition time - val = rng[0] - exp = Timestamp('3/11/2012 03:00', tz='US/Eastern') - - assert val.hour == 3 - assert exp.hour == 3 - assert val == exp # same UTC value - tm.assert_index_equal(rng[:2], rng2) - - # Right before the DST transition - rng = date_range('3/11/2012 00:00', periods=2, freq='H', - tz='US/Eastern') - rng2 = DatetimeIndex(['3/11/2012 00:00', '3/11/2012 01:00'], - tz='US/Eastern') - tm.assert_index_equal(rng, rng2) - exp = Timestamp('3/11/2012 00:00', tz='US/Eastern') - assert exp.hour == 0 - assert rng[0] == exp - exp = Timestamp('3/11/2012 01:00', tz='US/Eastern') - assert exp.hour == 1 - assert rng[1] == exp - - rng = date_range('3/11/2012 00:00', periods=10, freq='H', - tz='US/Eastern') - assert rng[2].hour == 3 - - def test_utc_box_timestamp_and_localize(self): - rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc') - rng_eastern = rng.tz_convert(self.tzstr('US/Eastern')) - - tz = self.tz('US/Eastern') - expected = rng[-1].astimezone(tz) - - stamp = rng_eastern[-1] - assert stamp == expected - assert stamp.tzinfo == expected.tzinfo - - # right tzinfo - rng = date_range('3/13/2012', '3/14/2012', freq='H', tz='utc') - rng_eastern = rng.tz_convert(self.tzstr('US/Eastern')) - # test not valid for dateutil timezones. 
- # assert 'EDT' in repr(rng_eastern[0].tzinfo) - assert ('EDT' in repr(rng_eastern[0].tzinfo) or - 'tzfile' in repr(rng_eastern[0].tzinfo)) - - def test_timestamp_tz_convert(self): - strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] - idx = DatetimeIndex(strdates, tz=self.tzstr('US/Eastern')) - - conv = idx[0].tz_convert(self.tzstr('US/Pacific')) - expected = idx.tz_convert(self.tzstr('US/Pacific'))[0] - - assert conv == expected - - def test_pass_dates_localize_to_utc(self): - strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] - - idx = DatetimeIndex(strdates) - conv = idx.tz_localize(self.tzstr('US/Eastern')) - - fromdates = DatetimeIndex(strdates, tz=self.tzstr('US/Eastern')) - - assert conv.tz == fromdates.tz - tm.assert_numpy_array_equal(conv.values, fromdates.values) - - def test_field_access_localize(self): - strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] - rng = DatetimeIndex(strdates, tz=self.tzstr('US/Eastern')) - assert (rng.hour == 0).all() - - # a more unusual time zone, #1946 - dr = date_range('2011-10-02 00:00', freq='h', periods=10, - tz=self.tzstr('America/Atikokan')) - - expected = Index(np.arange(10, dtype=np.int64)) - tm.assert_index_equal(dr.hour, expected) - - def test_with_tz(self): - tz = self.tz('US/Central') - - # just want it to work - start = datetime(2011, 3, 12, tzinfo=pytz.utc) - dr = bdate_range(start, periods=50, freq=offsets.Hour()) - assert dr.tz is pytz.utc - - # DateRange with naive datetimes - dr = bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc) - dr = bdate_range('1/1/2005', '1/1/2009', tz=tz) - - # normalized - central = dr.tz_convert(tz) - assert central.tz is tz - comp = self.localize(tz, central[0].to_pydatetime().replace( - tzinfo=None)).tzinfo - assert central[0].tz is comp - - # compare vs a localized tz - comp = self.localize(tz, - dr[0].to_pydatetime().replace(tzinfo=None)).tzinfo - assert central[0].tz is comp - - # datetimes with tzinfo set - dr = bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), - datetime(2009, 1, 1, tzinfo=pytz.utc)) - - pytest.raises(Exception, bdate_range, - datetime(2005, 1, 1, tzinfo=pytz.utc), '1/1/2009', - tz=tz) - - def test_tz_localize(self): - dr = bdate_range('1/1/2009', '1/1/2010') - dr_utc = bdate_range('1/1/2009', '1/1/2010', tz=pytz.utc) - localized = dr.tz_localize(pytz.utc) - tm.assert_index_equal(dr_utc, localized) - - def test_with_tz_ambiguous_times(self): - tz = self.tz('US/Eastern') - - # March 13, 2011, spring forward, skip from 2 AM to 3 AM - dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, - freq=offsets.Hour()) - pytest.raises(pytz.NonExistentTimeError, dr.tz_localize, tz) - - # after dst transition, it works - dr = date_range(datetime(2011, 3, 13, 3, 30), periods=3, - freq=offsets.Hour(), tz=tz) - - # November 6, 2011, fall back, repeat 2 AM hour - dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, - freq=offsets.Hour()) - pytest.raises(pytz.AmbiguousTimeError, dr.tz_localize, tz) - - # UTC is OK - dr = date_range(datetime(2011, 3, 13), periods=48, - freq=offsets.Minute(30), tz=pytz.utc) - - def test_ambiguous_infer(self): - # November 6, 2011, fall back, repeat 2 AM hour - # With no repeated hours, we cannot infer the transition - tz = self.tz('US/Eastern') - dr = date_range(datetime(2011, 11, 6, 0), periods=5, - freq=offsets.Hour()) - pytest.raises(pytz.AmbiguousTimeError, dr.tz_localize, tz) - - # With repeated hours, we can infer the transition - dr = date_range(datetime(2011, 11, 6, 0), periods=5, - freq=offsets.Hour(), tz=tz) - times = ['11/06/2011 00:00', '11/06/2011 01:00', 
'11/06/2011 01:00', - '11/06/2011 02:00', '11/06/2011 03:00'] - di = DatetimeIndex(times) - localized = di.tz_localize(tz, ambiguous='infer') - tm.assert_index_equal(dr, localized) - with tm.assert_produces_warning(FutureWarning): - localized_old = di.tz_localize(tz, infer_dst=True) - tm.assert_index_equal(dr, localized_old) - tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz, - ambiguous='infer')) - - # When there is no dst transition, nothing special happens - dr = date_range(datetime(2011, 6, 1, 0), periods=10, - freq=offsets.Hour()) - localized = dr.tz_localize(tz) - localized_infer = dr.tz_localize(tz, ambiguous='infer') - tm.assert_index_equal(localized, localized_infer) - with tm.assert_produces_warning(FutureWarning): - localized_infer_old = dr.tz_localize(tz, infer_dst=True) - tm.assert_index_equal(localized, localized_infer_old) - - def test_ambiguous_flags(self): - # November 6, 2011, fall back, repeat 2 AM hour - tz = self.tz('US/Eastern') - - # Pass in flags to determine right dst transition - dr = date_range(datetime(2011, 11, 6, 0), periods=5, - freq=offsets.Hour(), tz=tz) - times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', - '11/06/2011 02:00', '11/06/2011 03:00'] - - # Test tz_localize - di = DatetimeIndex(times) - is_dst = [1, 1, 0, 0, 0] - localized = di.tz_localize(tz, ambiguous=is_dst) - tm.assert_index_equal(dr, localized) - tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz, - ambiguous=is_dst)) - - localized = di.tz_localize(tz, ambiguous=np.array(is_dst)) - tm.assert_index_equal(dr, localized) - - localized = di.tz_localize(tz, - ambiguous=np.array(is_dst).astype('bool')) - tm.assert_index_equal(dr, localized) - - # Test constructor - localized = DatetimeIndex(times, tz=tz, ambiguous=is_dst) - tm.assert_index_equal(dr, localized) - - # Test duplicate times where infer_dst fails - times += times - di = DatetimeIndex(times) - - # When the sizes are incompatible, make sure error is raised - pytest.raises(Exception, di.tz_localize, tz, ambiguous=is_dst) - - # When sizes are compatible and there are repeats ('infer' won't work) - is_dst = np.hstack((is_dst, is_dst)) - localized = di.tz_localize(tz, ambiguous=is_dst) - dr = dr.append(dr) - tm.assert_index_equal(dr, localized) - - # When there is no dst transition, nothing special happens - dr = date_range(datetime(2011, 6, 1, 0), periods=10, - freq=offsets.Hour()) - is_dst = np.array([1] * 10) - localized = dr.tz_localize(tz) - localized_is_dst = dr.tz_localize(tz, ambiguous=is_dst) - tm.assert_index_equal(localized, localized_is_dst) - - # construction with an ambiguous end-point - # GH 11626 - tz = self.tzstr("Europe/London") - - def f(): - date_range("2013-10-26 23:00", "2013-10-27 01:00", - tz="Europe/London", freq="H") - pytest.raises(pytz.AmbiguousTimeError, f) - - times = date_range("2013-10-26 23:00", "2013-10-27 01:00", freq="H", - tz=tz, ambiguous='infer') - assert times[0] == Timestamp('2013-10-26 23:00', tz=tz, freq="H") - - if str(tz).startswith('dateutil'): - if LooseVersion(dateutil.__version__) < LooseVersion('2.6.0'): - # see gh-14621 - assert times[-1] == Timestamp('2013-10-27 01:00:00+0000', - tz=tz, freq="H") - elif LooseVersion(dateutil.__version__) > LooseVersion('2.6.0'): - # fixed ambiguous behavior - assert times[-1] == Timestamp('2013-10-27 01:00:00+0100', - tz=tz, freq="H") - else: - assert times[-1] == Timestamp('2013-10-27 01:00:00+0000', - tz=tz, freq="H") - - def test_ambiguous_nat(self): - tz = self.tz('US/Eastern') - times = ['11/06/2011 00:00', '11/06/2011 01:00', 
'11/06/2011 01:00', - '11/06/2011 02:00', '11/06/2011 03:00'] - di = DatetimeIndex(times) - localized = di.tz_localize(tz, ambiguous='NaT') - - times = ['11/06/2011 00:00', np.NaN, np.NaN, '11/06/2011 02:00', - '11/06/2011 03:00'] - di_test = DatetimeIndex(times, tz='US/Eastern') - - # left dtype is datetime64[ns, US/Eastern] - # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')] - tm.assert_numpy_array_equal(di_test.values, localized.values) - - def test_ambiguous_bool(self): - # make sure that we are correctly accepting bool values as ambiguous - - # gh-14402 - t = Timestamp('2015-11-01 01:00:03') - expected0 = Timestamp('2015-11-01 01:00:03-0500', tz='US/Central') - expected1 = Timestamp('2015-11-01 01:00:03-0600', tz='US/Central') - - s = Series([t]) - expected0 = Series([expected0]) - expected1 = Series([expected1]) - - def f(): - s.dt.tz_localize('US/Central') - pytest.raises(pytz.AmbiguousTimeError, f) - - result = s.dt.tz_localize('US/Central', ambiguous=True) - assert_series_equal(result, expected0) - - result = s.dt.tz_localize('US/Central', ambiguous=[True]) - assert_series_equal(result, expected0) - - result = s.dt.tz_localize('US/Central', ambiguous=False) - assert_series_equal(result, expected1) - - result = s.dt.tz_localize('US/Central', ambiguous=[False]) - assert_series_equal(result, expected1) - - def test_nonexistent_raise_coerce(self): - # See issue 13057 - from pytz.exceptions import NonExistentTimeError - times = ['2015-03-08 01:00', '2015-03-08 02:00', '2015-03-08 03:00'] - index = DatetimeIndex(times) - tz = 'US/Eastern' - pytest.raises(NonExistentTimeError, - index.tz_localize, tz=tz) - pytest.raises(NonExistentTimeError, - index.tz_localize, tz=tz, errors='raise') - result = index.tz_localize(tz=tz, errors='coerce') - test_times = ['2015-03-08 01:00-05:00', 'NaT', - '2015-03-08 03:00-04:00'] - expected = DatetimeIndex(test_times)\ - .tz_localize('UTC').tz_convert('US/Eastern') - tm.assert_index_equal(result, expected) - # test utility methods def test_infer_tz(self): eastern = self.tz('US/Eastern') @@ -525,242 +63,6 @@ def test_infer_tz(self): pytest.raises(Exception, timezones.infer_tzinfo, start, end) pytest.raises(Exception, timezones.infer_tzinfo, end, start) - def test_tz_string(self): - result = date_range('1/1/2000', periods=10, - tz=self.tzstr('US/Eastern')) - expected = date_range('1/1/2000', periods=10, tz=self.tz('US/Eastern')) - - tm.assert_index_equal(result, expected) - - def test_take_dont_lose_meta(self): - rng = date_range('1/1/2000', periods=20, tz=self.tzstr('US/Eastern')) - - result = rng.take(lrange(5)) - assert result.tz == rng.tz - assert result.freq == rng.freq - - def test_index_with_timezone_repr(self): - rng = date_range('4/13/2010', '5/6/2010') - - rng_eastern = rng.tz_localize(self.tzstr('US/Eastern')) - - rng_repr = repr(rng_eastern) - assert '2010-04-13 00:00:00' in rng_repr - - def test_index_astype_asobject_tzinfos(self): - # #1345 - - # dates around a dst transition - rng = date_range('2/13/2010', '5/6/2010', tz=self.tzstr('US/Eastern')) - - objs = rng.astype(object) - for i, x in enumerate(objs): - exval = rng[i] - assert x == exval - assert x.tzinfo == exval.tzinfo - - objs = rng.astype(object) - for i, x in enumerate(objs): - exval = rng[i] - assert x == exval - assert x.tzinfo == exval.tzinfo - - def test_localized_at_time_between_time(self): - from datetime import time - - rng = date_range('4/16/2012', '5/1/2012', freq='H') - ts = Series(np.random.randn(len(rng)), index=rng) - - ts_local = 
ts.tz_localize(self.tzstr('US/Eastern')) - - result = ts_local.at_time(time(10, 0)) - expected = ts.at_time(time(10, 0)).tz_localize(self.tzstr( - 'US/Eastern')) - assert_series_equal(result, expected) - assert self.cmptz(result.index.tz, self.tz('US/Eastern')) - - t1, t2 = time(10, 0), time(11, 0) - result = ts_local.between_time(t1, t2) - expected = ts.between_time(t1, - t2).tz_localize(self.tzstr('US/Eastern')) - assert_series_equal(result, expected) - assert self.cmptz(result.index.tz, self.tz('US/Eastern')) - - def test_string_index_alias_tz_aware(self): - rng = date_range('1/1/2000', periods=10, tz=self.tzstr('US/Eastern')) - ts = Series(np.random.randn(len(rng)), index=rng) - - result = ts['1/3/2000'] - tm.assert_almost_equal(result, ts[2]) - - def test_fixed_offset(self): - dates = [datetime(2000, 1, 1, tzinfo=fixed_off), - datetime(2000, 1, 2, tzinfo=fixed_off), - datetime(2000, 1, 3, tzinfo=fixed_off)] - result = to_datetime(dates) - assert result.tz == fixed_off - - def test_fixedtz_topydatetime(self): - dates = np.array([datetime(2000, 1, 1, tzinfo=fixed_off), - datetime(2000, 1, 2, tzinfo=fixed_off), - datetime(2000, 1, 3, tzinfo=fixed_off)]) - result = to_datetime(dates).to_pydatetime() - tm.assert_numpy_array_equal(dates, result) - result = to_datetime(dates)._mpl_repr() - tm.assert_numpy_array_equal(dates, result) - - def test_convert_tz_aware_datetime_datetime(self): - # #1581 - - tz = self.tz('US/Eastern') - - dates = [datetime(2000, 1, 1), datetime(2000, 1, 2), - datetime(2000, 1, 3)] - - dates_aware = [self.localize(tz, x) for x in dates] - result = to_datetime(dates_aware) - assert self.cmptz(result.tz, self.tz('US/Eastern')) - - converted = to_datetime(dates_aware, utc=True) - ex_vals = np.array([Timestamp(x).value for x in dates_aware]) - tm.assert_numpy_array_equal(converted.asi8, ex_vals) - assert converted.tz is pytz.utc - - def test_to_datetime_utc(self): - arr = np.array([parse('2012-06-13T01:39:00Z')], dtype=object) - - result = to_datetime(arr, utc=True) - assert result.tz is pytz.utc - - def test_to_datetime_tzlocal(self): - dt = parse('2012-06-13T01:39:00Z') - dt = dt.replace(tzinfo=tzlocal()) - - arr = np.array([dt], dtype=object) - - result = to_datetime(arr, utc=True) - assert result.tz is pytz.utc - - rng = date_range('2012-11-03 03:00', '2012-11-05 03:00', tz=tzlocal()) - arr = rng.to_pydatetime() - result = to_datetime(arr, utc=True) - assert result.tz is pytz.utc - - def test_hongkong_tz_convert(self): - # #1673 - dr = date_range('2012-01-01', '2012-01-10', freq='D', tz='Hongkong') - - # it works! - dr.hour - - def test_tz_convert_unsorted(self): - dr = date_range('2012-03-09', freq='H', periods=100, tz='utc') - dr = dr.tz_convert(self.tzstr('US/Eastern')) - - result = dr[::-1].hour - exp = dr.hour[::-1] - tm.assert_almost_equal(result, exp) - - def test_shift_localized(self): - dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') - dr_tz = dr.tz_localize(self.tzstr('US/Eastern')) - - result = dr_tz.shift(1, '10T') - assert result.tz == dr_tz.tz - - def test_tz_aware_asfreq(self): - dr = date_range('2011-12-01', '2012-07-20', freq='D', - tz=self.tzstr('US/Eastern')) - - s = Series(np.random.randn(len(dr)), index=dr) - - # it works! - s.asfreq('T') - - def test_static_tzinfo(self): - # it works! 
- index = DatetimeIndex([datetime(2012, 1, 1)], tz=self.tzstr('EST')) - index.hour - index[0] - - def test_tzaware_datetime_to_index(self): - d = [datetime(2012, 8, 19, tzinfo=self.tz('US/Eastern'))] - - index = DatetimeIndex(d) - assert self.cmptz(index.tz, self.tz('US/Eastern')) - - def test_date_range_span_dst_transition(self): - # #1778 - - # Standard -> Daylight Savings Time - dr = date_range('03/06/2012 00:00', periods=200, freq='W-FRI', - tz='US/Eastern') - - assert (dr.hour == 0).all() - - dr = date_range('2012-11-02', periods=10, tz=self.tzstr('US/Eastern')) - assert (dr.hour == 0).all() - - def test_convert_datetime_list(self): - dr = date_range('2012-06-02', periods=10, - tz=self.tzstr('US/Eastern'), name='foo') - dr2 = DatetimeIndex(list(dr), name='foo') - tm.assert_index_equal(dr, dr2) - assert dr.tz == dr2.tz - assert dr2.name == 'foo' - - def test_dateutil_tzoffset_support(self): - values = [188.5, 328.25] - tzinfo = tzoffset(None, 7200) - index = [datetime(2012, 5, 11, 11, tzinfo=tzinfo), - datetime(2012, 5, 11, 12, tzinfo=tzinfo)] - series = Series(data=values, index=index) - - assert series.index.tz == tzinfo - - # it works! #2443 - repr(series.index[0]) - - def test_getitem_pydatetime_tz(self): - index = date_range(start='2012-12-24 16:00', end='2012-12-24 18:00', - freq='H', tz=self.tzstr('Europe/Berlin')) - ts = Series(index=index, data=index.hour) - time_pandas = Timestamp('2012-12-24 17:00', - tz=self.tzstr('Europe/Berlin')) - time_datetime = self.localize( - self.tz('Europe/Berlin'), datetime(2012, 12, 24, 17, 0)) - assert ts[time_pandas] == ts[time_datetime] - - def test_index_drop_dont_lose_tz(self): - # #2621 - ind = date_range("2012-12-01", periods=10, tz="utc") - ind = ind.drop(ind[-1]) - - assert ind.tz is not None - - def test_datetimeindex_tz(self): - """ Test different DatetimeIndex constructions with timezone - Follow-up of #4229 - """ - - arr = ['11/10/2005 08:00:00', '11/10/2005 09:00:00'] - - idx1 = to_datetime(arr).tz_localize(self.tzstr('US/Eastern')) - idx2 = DatetimeIndex(start="2005-11-10 08:00:00", freq='H', periods=2, - tz=self.tzstr('US/Eastern')) - idx3 = DatetimeIndex(arr, tz=self.tzstr('US/Eastern')) - idx4 = DatetimeIndex(np.array(arr), tz=self.tzstr('US/Eastern')) - - for other in [idx2, idx3, idx4]: - tm.assert_index_equal(idx1, other) - - def test_datetimeindex_tz_nat(self): - idx = to_datetime([Timestamp("2013-1-1", tz=self.tzstr('US/Eastern')), - NaT]) - - assert isna(idx[1]) - assert idx[0].tzinfo is not None - def test_replace_across_dst(self): # GH#18319 check that 1) timezone is correctly normalized and # 2) that hour is not incorrectly changed by this normalization @@ -810,159 +112,6 @@ def normalize(self, ts): # no-op for dateutil return ts - def test_tz_convert_hour_overflow_dst(self): - # Regression test for: - # https://github.com/pandas-dev/pandas/issues/13306 - - # sorted case US/Eastern -> UTC - ts = ['2008-05-12 09:50:00', - '2008-12-12 09:50:35', - '2009-05-12 09:50:32'] - tt = to_datetime(ts).tz_localize('US/Eastern') - ut = tt.tz_convert('UTC') - expected = Index([13, 14, 13]) - tm.assert_index_equal(ut.hour, expected) - - # sorted case UTC -> US/Eastern - ts = ['2008-05-12 13:50:00', - '2008-12-12 14:50:35', - '2009-05-12 13:50:32'] - tt = to_datetime(ts).tz_localize('UTC') - ut = tt.tz_convert('US/Eastern') - expected = Index([9, 9, 9]) - tm.assert_index_equal(ut.hour, expected) - - # unsorted case US/Eastern -> UTC - ts = ['2008-05-12 09:50:00', - '2008-12-12 09:50:35', - '2008-05-12 09:50:32'] - tt = 
to_datetime(ts).tz_localize('US/Eastern') - ut = tt.tz_convert('UTC') - expected = Index([13, 14, 13]) - tm.assert_index_equal(ut.hour, expected) - - # unsorted case UTC -> US/Eastern - ts = ['2008-05-12 13:50:00', - '2008-12-12 14:50:35', - '2008-05-12 13:50:32'] - tt = to_datetime(ts).tz_localize('UTC') - ut = tt.tz_convert('US/Eastern') - expected = Index([9, 9, 9]) - tm.assert_index_equal(ut.hour, expected) - - def test_tz_convert_hour_overflow_dst_timestamps(self): - # Regression test for: - # https://github.com/pandas-dev/pandas/issues/13306 - - tz = self.tzstr('US/Eastern') - - # sorted case US/Eastern -> UTC - ts = [Timestamp('2008-05-12 09:50:00', tz=tz), - Timestamp('2008-12-12 09:50:35', tz=tz), - Timestamp('2009-05-12 09:50:32', tz=tz)] - tt = to_datetime(ts) - ut = tt.tz_convert('UTC') - expected = Index([13, 14, 13]) - tm.assert_index_equal(ut.hour, expected) - - # sorted case UTC -> US/Eastern - ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'), - Timestamp('2008-12-12 14:50:35', tz='UTC'), - Timestamp('2009-05-12 13:50:32', tz='UTC')] - tt = to_datetime(ts) - ut = tt.tz_convert('US/Eastern') - expected = Index([9, 9, 9]) - tm.assert_index_equal(ut.hour, expected) - - # unsorted case US/Eastern -> UTC - ts = [Timestamp('2008-05-12 09:50:00', tz=tz), - Timestamp('2008-12-12 09:50:35', tz=tz), - Timestamp('2008-05-12 09:50:32', tz=tz)] - tt = to_datetime(ts) - ut = tt.tz_convert('UTC') - expected = Index([13, 14, 13]) - tm.assert_index_equal(ut.hour, expected) - - # unsorted case UTC -> US/Eastern - ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'), - Timestamp('2008-12-12 14:50:35', tz='UTC'), - Timestamp('2008-05-12 13:50:32', tz='UTC')] - tt = to_datetime(ts) - ut = tt.tz_convert('US/Eastern') - expected = Index([9, 9, 9]) - tm.assert_index_equal(ut.hour, expected) - - def test_tslib_tz_convert_trans_pos_plus_1__bug(self): - # Regression test for tslib.tz_convert(vals, tz1, tz2). - # See https://github.com/pandas-dev/pandas/issues/4496 for details. 
- for freq, n in [('H', 1), ('T', 60), ('S', 3600)]: - idx = date_range(datetime(2011, 3, 26, 23), - datetime(2011, 3, 27, 1), freq=freq) - idx = idx.tz_localize('UTC') - idx = idx.tz_convert('Europe/Moscow') - - expected = np.repeat(np.array([3, 4, 5]), np.array([n, n, 1])) - tm.assert_index_equal(idx.hour, Index(expected)) - - def test_tslib_tz_convert_dst(self): - for freq, n in [('H', 1), ('T', 60), ('S', 3600)]: - # Start DST - idx = date_range('2014-03-08 23:00', '2014-03-09 09:00', freq=freq, - tz='UTC') - idx = idx.tz_convert('US/Eastern') - expected = np.repeat(np.array([18, 19, 20, 21, 22, 23, - 0, 1, 3, 4, 5]), - np.array([n, n, n, n, n, n, n, n, n, n, 1])) - tm.assert_index_equal(idx.hour, Index(expected)) - - idx = date_range('2014-03-08 18:00', '2014-03-09 05:00', freq=freq, - tz='US/Eastern') - idx = idx.tz_convert('UTC') - expected = np.repeat(np.array([23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - np.array([n, n, n, n, n, n, n, n, n, n, 1])) - tm.assert_index_equal(idx.hour, Index(expected)) - - # End DST - idx = date_range('2014-11-01 23:00', '2014-11-02 09:00', freq=freq, - tz='UTC') - idx = idx.tz_convert('US/Eastern') - expected = np.repeat(np.array([19, 20, 21, 22, 23, - 0, 1, 1, 2, 3, 4]), - np.array([n, n, n, n, n, n, n, n, n, n, 1])) - tm.assert_index_equal(idx.hour, Index(expected)) - - idx = date_range('2014-11-01 18:00', '2014-11-02 05:00', freq=freq, - tz='US/Eastern') - idx = idx.tz_convert('UTC') - expected = np.repeat(np.array([22, 23, 0, 1, 2, 3, 4, 5, 6, - 7, 8, 9, 10]), - np.array([n, n, n, n, n, n, n, n, n, - n, n, n, 1])) - tm.assert_index_equal(idx.hour, Index(expected)) - - # daily - # Start DST - idx = date_range('2014-03-08 00:00', '2014-03-09 00:00', freq='D', - tz='UTC') - idx = idx.tz_convert('US/Eastern') - tm.assert_index_equal(idx.hour, Index([19, 19])) - - idx = date_range('2014-03-08 00:00', '2014-03-09 00:00', freq='D', - tz='US/Eastern') - idx = idx.tz_convert('UTC') - tm.assert_index_equal(idx.hour, Index([5, 5])) - - # End DST - idx = date_range('2014-11-01 00:00', '2014-11-02 00:00', freq='D', - tz='UTC') - idx = idx.tz_convert('US/Eastern') - tm.assert_index_equal(idx.hour, Index([20, 20])) - - idx = date_range('2014-11-01 00:00', '2014-11-02 000:00', freq='D', - tz='US/Eastern') - idx = idx.tz_convert('UTC') - tm.assert_index_equal(idx.hour, Index([4, 4])) - def test_tzlocal(self): # GH 13583 ts = Timestamp('2011-01-01', tz=dateutil.tz.tzlocal()) @@ -977,32 +126,6 @@ def test_tzlocal(self): offset = offset.total_seconds() * 1000000000 assert ts.value + offset == Timestamp('2011-01-01').value - def test_tz_localize_tzlocal(self): - # GH 13583 - offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1)) - offset = int(offset.total_seconds() * 1000000000) - - dti = date_range(start='2001-01-01', end='2001-03-01') - dti2 = dti.tz_localize(dateutil.tz.tzlocal()) - tm.assert_numpy_array_equal(dti2.asi8 + offset, dti.asi8) - - dti = date_range(start='2001-01-01', end='2001-03-01', - tz=dateutil.tz.tzlocal()) - dti2 = dti.tz_localize(None) - tm.assert_numpy_array_equal(dti2.asi8 - offset, dti.asi8) - - def test_tz_convert_tzlocal(self): - # GH 13583 - # tz_convert doesn't affect to internal - dti = date_range(start='2001-01-01', end='2001-03-01', tz='UTC') - dti2 = dti.tz_convert(dateutil.tz.tzlocal()) - tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) - - dti = date_range(start='2001-01-01', end='2001-03-01', - tz=dateutil.tz.tzlocal()) - dti2 = dti.tz_convert(None) - tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) - class 
TestTimeZoneCacheKey(object): @@ -1020,382 +143,6 @@ def test_cache_keys_are_distinct_for_pytz_vs_dateutil(self, tz_name): timezones._p_tz_cache_key(tz_d)) -class TestTimeZones(object): - timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific'] - - def test_index_equals_with_tz(self): - left = date_range('1/1/2011', periods=100, freq='H', tz='utc') - right = date_range('1/1/2011', periods=100, freq='H', tz='US/Eastern') - - assert not left.equals(right) - - def test_tz_localize_naive(self): - rng = date_range('1/1/2011', periods=100, freq='H') - - conv = rng.tz_localize('US/Pacific') - exp = date_range('1/1/2011', periods=100, freq='H', tz='US/Pacific') - - tm.assert_index_equal(conv, exp) - - def test_tz_localize_roundtrip(self): - for tz in self.timezones: - idx1 = date_range(start='2014-01-01', end='2014-12-31', freq='M') - idx2 = date_range(start='2014-01-01', end='2014-12-31', freq='D') - idx3 = date_range(start='2014-01-01', end='2014-03-01', freq='H') - idx4 = date_range(start='2014-08-01', end='2014-10-31', freq='T') - for idx in [idx1, idx2, idx3, idx4]: - localized = idx.tz_localize(tz) - expected = date_range(start=idx[0], end=idx[-1], freq=idx.freq, - tz=tz) - tm.assert_index_equal(localized, expected) - - with pytest.raises(TypeError): - localized.tz_localize(tz) - - reset = localized.tz_localize(None) - tm.assert_index_equal(reset, idx) - assert reset.tzinfo is None - - def test_series_tz_localize(self): - - rng = date_range('1/1/2011', periods=100, freq='H') - ts = Series(1, index=rng) - - result = ts.tz_localize('utc') - assert result.index.tz.zone == 'UTC' - - # Can't localize if already tz-aware - rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') - ts = Series(1, index=rng) - tm.assert_raises_regex(TypeError, 'Already tz-aware', - ts.tz_localize, 'US/Eastern') - - def test_series_tz_convert(self): - rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern') - ts = Series(1, index=rng) - - result = ts.tz_convert('Europe/Berlin') - assert result.index.tz.zone == 'Europe/Berlin' - - # can't convert tz-naive - rng = date_range('1/1/2011', periods=200, freq='D') - ts = Series(1, index=rng) - tm.assert_raises_regex(TypeError, "Cannot convert tz-naive", - ts.tz_convert, 'US/Eastern') - - def test_tz_convert_roundtrip(self): - for tz in self.timezones: - idx1 = date_range(start='2014-01-01', end='2014-12-31', freq='M', - tz='UTC') - exp1 = date_range(start='2014-01-01', end='2014-12-31', freq='M') - - idx2 = date_range(start='2014-01-01', end='2014-12-31', freq='D', - tz='UTC') - exp2 = date_range(start='2014-01-01', end='2014-12-31', freq='D') - - idx3 = date_range(start='2014-01-01', end='2014-03-01', freq='H', - tz='UTC') - exp3 = date_range(start='2014-01-01', end='2014-03-01', freq='H') - - idx4 = date_range(start='2014-08-01', end='2014-10-31', freq='T', - tz='UTC') - exp4 = date_range(start='2014-08-01', end='2014-10-31', freq='T') - - for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3), - (idx4, exp4)]: - converted = idx.tz_convert(tz) - reset = converted.tz_convert(None) - tm.assert_index_equal(reset, expected) - assert reset.tzinfo is None - tm.assert_index_equal(reset, converted.tz_convert( - 'UTC').tz_localize(None)) - - def test_join_utc_convert(self): - rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') - - left = rng.tz_convert('US/Eastern') - right = rng.tz_convert('Europe/Berlin') - - for how in ['inner', 'outer', 'left', 'right']: - result = left.join(left[:-5], how=how) - assert isinstance(result, 
DatetimeIndex) - assert result.tz == left.tz - - result = left.join(right[:-5], how=how) - assert isinstance(result, DatetimeIndex) - assert result.tz.zone == 'UTC' - - def test_join_aware(self): - rng = date_range('1/1/2011', periods=10, freq='H') - ts = Series(np.random.randn(len(rng)), index=rng) - - ts_utc = ts.tz_localize('utc') - - pytest.raises(Exception, ts.__add__, ts_utc) - pytest.raises(Exception, ts_utc.__add__, ts) - - # non-overlapping - rng = date_range("2012-11-15 00:00:00", periods=6, freq="H", - tz="US/Central") - - rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="H", - tz="US/Eastern") - - result = rng.union(rng2) - assert result.tz.zone == 'UTC' - - def test_series_align_aware(self): - idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern') - ser = Series(np.random.randn(len(idx1)), index=idx1) - ser_central = ser.tz_convert('US/Central') - # # different timezones convert to UTC - - new1, new2 = ser.align(ser_central) - assert new1.index.tz == pytz.UTC - assert new2.index.tz == pytz.UTC - - def test_append_aware(self): - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', - tz='US/Eastern') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', - tz='US/Eastern') - ts1 = Series([1], index=rng1) - ts2 = Series([2], index=rng2) - ts_result = ts1.append(ts2) - - exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], - tz='US/Eastern') - exp = Series([1, 2], index=exp_index) - assert_series_equal(ts_result, exp) - assert ts_result.index.tz == rng1.tz - - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='UTC') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='UTC') - ts1 = Series([1], index=rng1) - ts2 = Series([2], index=rng2) - ts_result = ts1.append(ts2) - - exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], - tz='UTC') - exp = Series([1, 2], index=exp_index) - assert_series_equal(ts_result, exp) - utc = rng1.tz - assert utc == ts_result.index.tz - - # GH 7795 - # different tz coerces to object dtype, not UTC - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', - tz='US/Eastern') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', - tz='US/Central') - ts1 = Series([1], index=rng1) - ts2 = Series([2], index=rng2) - ts_result = ts1.append(ts2) - exp_index = Index([Timestamp('1/1/2011 01:00', tz='US/Eastern'), - Timestamp('1/1/2011 02:00', tz='US/Central')]) - exp = Series([1, 2], index=exp_index) - assert_series_equal(ts_result, exp) - - def test_append_dst(self): - rng1 = date_range('1/1/2016 01:00', periods=3, freq='H', - tz='US/Eastern') - rng2 = date_range('8/1/2016 01:00', periods=3, freq='H', - tz='US/Eastern') - ts1 = Series([1, 2, 3], index=rng1) - ts2 = Series([10, 11, 12], index=rng2) - ts_result = ts1.append(ts2) - - exp_index = DatetimeIndex(['2016-01-01 01:00', '2016-01-01 02:00', - '2016-01-01 03:00', '2016-08-01 01:00', - '2016-08-01 02:00', '2016-08-01 03:00'], - tz='US/Eastern') - exp = Series([1, 2, 3, 10, 11, 12], index=exp_index) - assert_series_equal(ts_result, exp) - assert ts_result.index.tz == rng1.tz - - def test_append_aware_naive(self): - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', - tz='US/Eastern') - ts1 = Series(np.random.randn(len(rng1)), index=rng1) - ts2 = Series(np.random.randn(len(rng2)), index=rng2) - ts_result = ts1.append(ts2) - - assert ts_result.index.equals(ts1.index.astype(object).append( - ts2.index.astype(object))) - - # mixed - rng1 = date_range('1/1/2011 01:00', 
periods=1, freq='H') - rng2 = lrange(100) - ts1 = Series(np.random.randn(len(rng1)), index=rng1) - ts2 = Series(np.random.randn(len(rng2)), index=rng2) - ts_result = ts1.append(ts2) - assert ts_result.index.equals(ts1.index.astype(object).append( - ts2.index)) - - def test_series_add_tz_mismatch_converts_to_utc(self): - rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern') - ts = Series(np.random.randn(len(rng)), index=rng) - - ts_moscow = ts.tz_convert('Europe/Moscow') - - result = ts + ts_moscow - assert result.index.tz is pytz.utc - - result = ts_moscow + ts - assert result.index.tz is pytz.utc - - def test_arith_utc_convert(self): - rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') - - perm = np.random.permutation(100)[:90] - ts1 = Series(np.random.randn(90), - index=rng.take(perm).tz_convert('US/Eastern')) - - perm = np.random.permutation(100)[:90] - ts2 = Series(np.random.randn(90), - index=rng.take(perm).tz_convert('Europe/Berlin')) - - result = ts1 + ts2 - - uts1 = ts1.tz_convert('utc') - uts2 = ts2.tz_convert('utc') - expected = uts1 + uts2 - - assert result.index.tz == pytz.UTC - assert_series_equal(result, expected) - - def test_intersection(self): - rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') - - left = rng[10:90][::-1] - right = rng[20:80][::-1] - - assert left.tz == rng.tz - result = left.intersection(right) - assert result.tz == left.tz - - def test_timestamp_equality_different_timezones(self): - utc_range = date_range('1/1/2000', periods=20, tz='UTC') - eastern_range = utc_range.tz_convert('US/Eastern') - berlin_range = utc_range.tz_convert('Europe/Berlin') - - for a, b, c in zip(utc_range, eastern_range, berlin_range): - assert a == b - assert b == c - assert a == c - - assert (utc_range == eastern_range).all() - assert (utc_range == berlin_range).all() - assert (berlin_range == eastern_range).all() - - def test_datetimeindex_tz(self): - rng = date_range('03/12/2012 00:00', periods=10, freq='W-FRI', - tz='US/Eastern') - rng2 = DatetimeIndex(data=rng, tz='US/Eastern') - tm.assert_index_equal(rng, rng2) - - def test_normalize_tz(self): - rng = date_range('1/1/2000 9:30', periods=10, freq='D', - tz='US/Eastern') - - result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D', - tz='US/Eastern') - tm.assert_index_equal(result, expected) - - assert result.is_normalized - assert not rng.is_normalized - - rng = date_range('1/1/2000 9:30', periods=10, freq='D', tz='UTC') - - result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D', tz='UTC') - tm.assert_index_equal(result, expected) - - assert result.is_normalized - assert not rng.is_normalized - - rng = date_range('1/1/2000 9:30', periods=10, freq='D', tz=tzlocal()) - result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D', tz=tzlocal()) - tm.assert_index_equal(result, expected) - - assert result.is_normalized - assert not rng.is_normalized - - @td.skip_if_windows - def test_normalize_tz_local(self): - # see gh-13459 - timezones = ['US/Pacific', 'US/Eastern', 'UTC', 'Asia/Kolkata', - 'Asia/Shanghai', 'Australia/Canberra'] - - for timezone in timezones: - with set_timezone(timezone): - rng = date_range('1/1/2000 9:30', periods=10, freq='D', - tz=tzlocal()) - - result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D', - tz=tzlocal()) - tm.assert_index_equal(result, expected) - - assert result.is_normalized - assert not rng.is_normalized - - def test_tzaware_offset(self): - dates = 
date_range('2012-11-01', periods=3, tz='US/Pacific') - offset = dates + offsets.Hour(5) - assert dates[0] + offsets.Hour(5) == offset[0] - - # GH 6818 - for tz in ['UTC', 'US/Pacific', 'Asia/Tokyo']: - dates = date_range('2010-11-01 00:00', periods=3, tz=tz, freq='H') - expected = DatetimeIndex(['2010-11-01 05:00', '2010-11-01 06:00', - '2010-11-01 07:00'], freq='H', tz=tz) - - offset = dates + offsets.Hour(5) - tm.assert_index_equal(offset, expected) - offset = dates + np.timedelta64(5, 'h') - tm.assert_index_equal(offset, expected) - offset = dates + timedelta(hours=5) - tm.assert_index_equal(offset, expected) - - def test_nat(self): - # GH 5546 - dates = [NaT] - idx = DatetimeIndex(dates) - idx = idx.tz_localize('US/Pacific') - tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Pacific')) - idx = idx.tz_convert('US/Eastern') - tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Eastern')) - idx = idx.tz_convert('UTC') - tm.assert_index_equal(idx, DatetimeIndex(dates, tz='UTC')) - - dates = ['2010-12-01 00:00', '2010-12-02 00:00', NaT] - idx = DatetimeIndex(dates) - idx = idx.tz_localize('US/Pacific') - tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Pacific')) - idx = idx.tz_convert('US/Eastern') - expected = ['2010-12-01 03:00', '2010-12-02 03:00', NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) - - idx = idx + offsets.Hour(5) - expected = ['2010-12-01 08:00', '2010-12-02 08:00', NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) - idx = idx.tz_convert('US/Pacific') - expected = ['2010-12-01 05:00', '2010-12-02 05:00', NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Pacific')) - - idx = idx + np.timedelta64(3, 'h') - expected = ['2010-12-01 08:00', '2010-12-02 08:00', NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Pacific')) - - idx = idx.tz_convert('US/Eastern') - expected = ['2010-12-01 11:00', '2010-12-02 11:00', NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) - - class TestTslib(object): def test_tslib_tz_convert(self): diff --git a/pandas/tests/tslibs/__init__.py b/pandas/tests/tslibs/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py new file mode 100644 index 0000000000000..eb77e52e7c91d --- /dev/null +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- +from datetime import datetime, date + +import numpy as np +import pytest + +from pandas._libs import tslib +from pandas.compat.numpy import np_array_datetime64_compat +import pandas.util.testing as tm + + +class TestParseISO8601(object): + @pytest.mark.parametrize('date_str, exp', [ + ('2011-01-02', datetime(2011, 1, 2)), + ('2011-1-2', datetime(2011, 1, 2)), + ('2011-01', datetime(2011, 1, 1)), + ('2011-1', datetime(2011, 1, 1)), + ('2011 01 02', datetime(2011, 1, 2)), + ('2011.01.02', datetime(2011, 1, 2)), + ('2011/01/02', datetime(2011, 1, 2)), + ('2011\\01\\02', datetime(2011, 1, 2)), + ('2013-01-01 05:30:00', datetime(2013, 1, 1, 5, 30)), + ('2013-1-1 5:30:00', datetime(2013, 1, 1, 5, 30))]) + def test_parsers_iso8601(self, date_str, exp): + # GH#12060 + # test only the iso parser - flexibility to different + # separators and leadings 0s + # Timestamp construction falls back to dateutil + actual = tslib._test_parse_iso8601(date_str) + assert actual == exp + + @pytest.mark.parametrize( + 'date_str', + ['2011-01/02', '2011^11^11', + 
'201401', '201111', '200101', + # mixed separated and unseparated + '2005-0101', '200501-01', + '20010101 12:3456', + '20010101 1234:56', + # HHMMSS must have two digits in + # each component if unseparated + '20010101 1', '20010101 123', + '20010101 12345', '20010101 12345Z', + # wrong separator for HHMMSS + '2001-01-01 12-34-56']) + def test_parsers_iso8601_invalid(self, date_str): + # separators must all match - YYYYMM not valid + with pytest.raises(ValueError): + tslib._test_parse_iso8601(date_str) + + +class TestArrayToDatetime(object): + def test_parsing_valid_dates(self): + arr = np.array(['01-01-2013', '01-02-2013'], dtype=object) + result = tslib.array_to_datetime(arr) + expected = ['2013-01-01T00:00:00.000000000-0000', + '2013-01-02T00:00:00.000000000-0000'] + tm.assert_numpy_array_equal( + result, + np_array_datetime64_compat(expected, dtype='M8[ns]')) + + arr = np.array(['Mon Sep 16 2013', 'Tue Sep 17 2013'], dtype=object) + result = tslib.array_to_datetime(arr) + expected = ['2013-09-16T00:00:00.000000000-0000', + '2013-09-17T00:00:00.000000000-0000'] + tm.assert_numpy_array_equal( + result, + np_array_datetime64_compat(expected, dtype='M8[ns]')) + + @pytest.mark.parametrize('dt_string', [ + '01-01-2013 08:00:00+08:00', + '2013-01-01T08:00:00.000000000+0800', + '2012-12-31T16:00:00.000000000-0800', + '12-31-2012 23:00:00-01:00']) + def test_parsing_timezone_offsets(self, dt_string): + # All of these datetime strings with offsets are equivalent + # to the same datetime after the timezone offset is added + arr = np.array(['01-01-2013 00:00:00'], dtype=object) + expected = tslib.array_to_datetime(arr) + + arr = np.array([dt_string], dtype=object) + result = tslib.array_to_datetime(arr) + tm.assert_numpy_array_equal(result, expected) + + def test_number_looking_strings_not_into_datetime(self): + # GH#4601 + # These strings don't look like datetimes so they shouldn't be + # attempted to be converted + arr = np.array(['-352.737091', '183.575577'], dtype=object) + result = tslib.array_to_datetime(arr, errors='ignore') + tm.assert_numpy_array_equal(result, arr) + + arr = np.array(['1', '2', '3', '4', '5'], dtype=object) + result = tslib.array_to_datetime(arr, errors='ignore') + tm.assert_numpy_array_equal(result, arr) + + @pytest.mark.parametrize('invalid_date', [ + date(1000, 1, 1), + datetime(1000, 1, 1), + '1000-01-01', + 'Jan 1, 1000', + np.datetime64('1000-01-01')]) + def test_coerce_outside_ns_bounds(self, invalid_date): + arr = np.array([invalid_date], dtype='object') + with pytest.raises(ValueError): + tslib.array_to_datetime(arr, errors='raise') + + result = tslib.array_to_datetime(arr, errors='coerce') + expected = np.array([tslib.iNaT], dtype='M8[ns]') + tm.assert_numpy_array_equal(result, expected) + + def test_coerce_outside_ns_bounds_one_valid(self): + arr = np.array(['1/1/1000', '1/1/2000'], dtype=object) + result = tslib.array_to_datetime(arr, errors='coerce') + expected = [tslib.iNaT, + '2000-01-01T00:00:00.000000000-0000'] + tm.assert_numpy_array_equal( + result, + np_array_datetime64_compat(expected, dtype='M8[ns]')) + + def test_coerce_of_invalid_datetimes(self): + arr = np.array(['01-01-2013', 'not_a_date', '1'], dtype=object) + + # Without coercing, the presence of any invalid dates prevents + # any values from being converted + result = tslib.array_to_datetime(arr, errors='ignore') + tm.assert_numpy_array_equal(result, arr) + + # With coercing, the invalid dates becomes iNaT + result = tslib.array_to_datetime(arr, errors='coerce') + expected = 
['2013-01-01T00:00:00.000000000-0000', + tslib.iNaT, + tslib.iNaT] + + tm.assert_numpy_array_equal( + result, + np_array_datetime64_compat(expected, dtype='M8[ns]')) + + def test_to_datetime_barely_out_of_bounds(self): + # GH#19529 + # GH#19382 close enough to bounds that dropping nanos would result + # in an in-bounds datetime + arr = np.array(['2262-04-11 23:47:16.854775808'], dtype=object) + with pytest.raises(tslib.OutOfBoundsDatetime): + tslib.array_to_datetime(arr) diff --git a/pandas/tests/tslibs/test_ccalendar.py b/pandas/tests/tslibs/test_ccalendar.py new file mode 100644 index 0000000000000..b5d562a7b5a9c --- /dev/null +++ b/pandas/tests/tslibs/test_ccalendar.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +from datetime import datetime + +import numpy as np + +from pandas._libs.tslibs import ccalendar + + +def test_get_day_of_year(): + assert ccalendar.get_day_of_year(2001, 3, 1) == 60 + assert ccalendar.get_day_of_year(2004, 3, 1) == 61 + assert ccalendar.get_day_of_year(1907, 12, 31) == 365 + assert ccalendar.get_day_of_year(2004, 12, 31) == 366 + + dt = datetime.fromordinal(1 + np.random.randint(365 * 4000)) + result = ccalendar.get_day_of_year(dt.year, dt.month, dt.day) + expected = (dt - dt.replace(month=1, day=1)).days + 1 + assert result == expected diff --git a/pandas/tests/tseries/test_libfrequencies.py b/pandas/tests/tslibs/test_libfrequencies.py similarity index 100% rename from pandas/tests/tseries/test_libfrequencies.py rename to pandas/tests/tslibs/test_libfrequencies.py diff --git a/pandas/tests/tseries/offsets/test_liboffsets.py b/pandas/tests/tslibs/test_liboffsets.py similarity index 100% rename from pandas/tests/tseries/offsets/test_liboffsets.py rename to pandas/tests/tslibs/test_liboffsets.py diff --git a/pandas/tests/scalar/test_parsing.py b/pandas/tests/tslibs/test_parsing.py similarity index 96% rename from pandas/tests/scalar/test_parsing.py rename to pandas/tests/tslibs/test_parsing.py index bff0de649ac5e..34cce088a8b42 100644 --- a/pandas/tests/scalar/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -7,7 +7,6 @@ import pytest from dateutil.parser import parse -import pandas as pd import pandas.util._test_decorators as td from pandas.conftest import is_dateutil_le_261, is_dateutil_gt_261 from pandas import compat @@ -16,18 +15,6 @@ from pandas._libs.tslibs.parsing import parse_time_string -def test_to_datetime1(): - actual = pd.to_datetime(datetime(2008, 1, 15)) - assert actual == datetime(2008, 1, 15) - - actual = pd.to_datetime('20080115') - assert actual == datetime(2008, 1, 15) - - # unparseable - s = 'Month 1, 1999' - assert pd.to_datetime(s, errors='ignore') == s - - class TestParseQuarters(object): def test_parse_time_string(self): diff --git a/pandas/tests/tslibs/test_period_asfreq.py b/pandas/tests/tslibs/test_period_asfreq.py new file mode 100644 index 0000000000000..98959adf6fda4 --- /dev/null +++ b/pandas/tests/tslibs/test_period_asfreq.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- + +from pandas._libs.tslibs.frequencies import get_freq +from pandas._libs.tslibs.period import period_ordinal, period_asfreq + + +class TestPeriodFreqConversion(object): + def test_intraday_conversion_factors(self): + assert period_asfreq(1, get_freq('D'), get_freq('H'), False) == 24 + assert period_asfreq(1, get_freq('D'), get_freq('T'), False) == 1440 + assert period_asfreq(1, get_freq('D'), get_freq('S'), False) == 86400 + assert period_asfreq(1, get_freq('D'), + get_freq('L'), False) == 86400000 + assert period_asfreq(1, get_freq('D'), + 
get_freq('U'), False) == 86400000000 + assert period_asfreq(1, get_freq('D'), + get_freq('N'), False) == 86400000000000 + + assert period_asfreq(1, get_freq('H'), get_freq('T'), False) == 60 + assert period_asfreq(1, get_freq('H'), get_freq('S'), False) == 3600 + assert period_asfreq(1, get_freq('H'), + get_freq('L'), False) == 3600000 + assert period_asfreq(1, get_freq('H'), + get_freq('U'), False) == 3600000000 + assert period_asfreq(1, get_freq('H'), + get_freq('N'), False) == 3600000000000 + + assert period_asfreq(1, get_freq('T'), get_freq('S'), False) == 60 + assert period_asfreq(1, get_freq('T'), get_freq('L'), False) == 60000 + assert period_asfreq(1, get_freq('T'), + get_freq('U'), False) == 60000000 + assert period_asfreq(1, get_freq('T'), + get_freq('N'), False) == 60000000000 + + assert period_asfreq(1, get_freq('S'), get_freq('L'), False) == 1000 + assert period_asfreq(1, get_freq('S'), + get_freq('U'), False) == 1000000 + assert period_asfreq(1, get_freq('S'), + get_freq('N'), False) == 1000000000 + + assert period_asfreq(1, get_freq('L'), get_freq('U'), False) == 1000 + assert period_asfreq(1, get_freq('L'), + get_freq('N'), False) == 1000000 + + assert period_asfreq(1, get_freq('U'), get_freq('N'), False) == 1000 + + def test_period_ordinal_start_values(self): + # information for 1.1.1970 + assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('A')) == 0 + assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('M')) == 0 + assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('W')) == 1 + assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('D')) == 0 + assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('B')) == 0 + + def test_period_ordinal_week(self): + assert period_ordinal(1970, 1, 4, 0, 0, 0, 0, 0, get_freq('W')) == 1 + assert period_ordinal(1970, 1, 5, 0, 0, 0, 0, 0, get_freq('W')) == 2 + assert period_ordinal(2013, 10, 6, 0, + 0, 0, 0, 0, get_freq('W')) == 2284 + assert period_ordinal(2013, 10, 7, 0, + 0, 0, 0, 0, get_freq('W')) == 2285 + + def test_period_ordinal_business_day(self): + # Thursday + assert period_ordinal(2013, 10, 3, 0, + 0, 0, 0, 0, get_freq('B')) == 11415 + # Friday + assert period_ordinal(2013, 10, 4, 0, + 0, 0, 0, 0, get_freq('B')) == 11416 + # Saturday + assert period_ordinal(2013, 10, 5, 0, + 0, 0, 0, 0, get_freq('B')) == 11417 + # Sunday + assert period_ordinal(2013, 10, 6, 0, + 0, 0, 0, 0, get_freq('B')) == 11417 + # Monday + assert period_ordinal(2013, 10, 7, 0, + 0, 0, 0, 0, get_freq('B')) == 11417 + # Tuesday + assert period_ordinal(2013, 10, 8, 0, + 0, 0, 0, 0, get_freq('B')) == 11418 diff --git a/setup.py b/setup.py index 5397a1b84dc4d..2332503e558ed 100755 --- a/setup.py +++ b/setup.py @@ -515,6 +515,7 @@ def pxd(name): 'pyxfile': '_libs/tslibs/period', 'pxdfiles': ['_libs/src/util', '_libs/missing', + '_libs/tslibs/ccalendar', '_libs/tslibs/timedeltas', '_libs/tslibs/timezones', '_libs/tslibs/nattype'],