diff --git a/.circleci/config.yml b/.circleci/config.yml index e947f30d285cd..5b10036818901 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,39 +1,6 @@ version: 2 jobs: - # -------------------------------------------------------------------------- - # 0. py27_compat - # -------------------------------------------------------------------------- - py27_compat: - docker: - - image: continuumio/miniconda:latest - # databases configuration - - image: circleci/postgres:9.6.5-alpine-ram - environment: - POSTGRES_USER: postgres - POSTGRES_DB: pandas_nosetest - - image: circleci/mysql:8-ram - environment: - MYSQL_USER: "root" - MYSQL_HOST: "localhost" - MYSQL_ALLOW_EMPTY_PASSWORD: "true" - MYSQL_DATABASE: "pandas_nosetest" - environment: - JOB: "2.7_COMPAT" - ENV_FILE: "ci/circle-27-compat.yaml" - LOCALE_OVERRIDE: "it_IT.UTF-8" - MINICONDA_DIR: /home/ubuntu/miniconda3 - steps: - - checkout - - run: - name: build - command: | - ./ci/install_circle.sh - ./ci/show_circle.sh - - run: - name: test - command: ./ci/run_circle.sh --skip-slow --skip-network - # -------------------------------------------------------------------------- # 1. py36_locale # -------------------------------------------------------------------------- @@ -62,86 +29,14 @@ jobs: - run: name: build command: | - ./ci/install_circle.sh - ./ci/show_circle.sh + ./ci/circle/install_circle.sh + ./ci/circle/show_circle.sh - run: name: test - command: ./ci/run_circle.sh --skip-slow --skip-network - - # -------------------------------------------------------------------------- - # 2. py36_locale_slow - # -------------------------------------------------------------------------- - py36_locale_slow: - docker: - - image: continuumio/miniconda:latest - # databases configuration - - image: circleci/postgres:9.6.5-alpine-ram - environment: - POSTGRES_USER: postgres - POSTGRES_DB: pandas_nosetest - - image: circleci/mysql:8-ram - environment: - MYSQL_USER: "root" - MYSQL_HOST: "localhost" - MYSQL_ALLOW_EMPTY_PASSWORD: "true" - MYSQL_DATABASE: "pandas_nosetest" - - environment: - JOB: "3.6_LOCALE_SLOW" - ENV_FILE: "ci/circle-36-locale_slow.yaml" - LOCALE_OVERRIDE: "zh_CN.UTF-8" - MINICONDA_DIR: /home/ubuntu/miniconda3 - steps: - - checkout - - run: - name: build - command: | - ./ci/install_circle.sh - ./ci/show_circle.sh - - run: - name: test - command: ./ci/run_circle.sh --only-slow --skip-network - - # -------------------------------------------------------------------------- - # 3. 
py35_ascii - # -------------------------------------------------------------------------- - py35_ascii: - docker: - - image: continuumio/miniconda:latest - # databases configuration - - image: circleci/postgres:9.6.5-alpine-ram - environment: - POSTGRES_USER: postgres - POSTGRES_DB: pandas_nosetest - - image: circleci/mysql:8-ram - environment: - MYSQL_USER: "root" - MYSQL_HOST: "localhost" - MYSQL_ALLOW_EMPTY_PASSWORD: "true" - MYSQL_DATABASE: "pandas_nosetest" - - environment: - JOB: "3.5_ASCII" - ENV_FILE: "ci/circle-35-ascii.yaml" - LOCALE_OVERRIDE: "C" - MINICONDA_DIR: /home/ubuntu/miniconda3 - steps: - - checkout - - run: - name: build - command: | - ./ci/install_circle.sh - ./ci/show_circle.sh - - run: - name: test - command: ./ci/run_circle.sh --skip-slow --skip-network - + command: ./ci/circle/run_circle.sh --skip-slow --skip-network workflows: version: 2 build_and_test: jobs: - - py27_compat - py36_locale - - py36_locale_slow - - py35_ascii diff --git a/.travis.yml b/.travis.yml index e8f7f3465bfd5..8ac4d827b0820 100644 --- a/.travis.yml +++ b/.travis.yml @@ -116,10 +116,10 @@ after_success: after_script: - echo "after_script start" - source activate pandas && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - - if [ -e /tmp/single.xml ]; then - ci/print_skipped.py /tmp/single.xml; + - if [ -e test-data-single.xml ]; then + ci/print_skipped.py test-data-single.xml; fi - - if [ -e /tmp/multiple.xml ]; then - ci/print_skipped.py /tmp/multiple.xml; + - if [ -e test-data-multiple.xml ]; then + ci/print_skipped.py test-data-multiple.xml; fi - echo "after_script done" diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index 1e9283c7fb92b..f3d063ee31bc8 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -1,18 +1,30 @@ import numpy as np -from pandas._libs.index import (Int64Engine, UInt64Engine, Float64Engine, - ObjectEngine) +from pandas._libs import index as libindex + + +def _get_numeric_engines(): + engine_names = [ + ('Int64Engine', np.int64), ('Int32Engine', np.int32), + ('Int16Engine', np.int16), ('Int8Engine', np.int8), + ('UInt64Engine', np.uint64), ('UInt32Engine', np.uint32), + ('UInt16Engine', np.uint16), ('UInt8Engine', np.uint8), + ('Float64Engine', np.float64), ('Float32Engine', np.float32), + ] + return [(getattr(libindex, engine_name), dtype) + for engine_name, dtype in engine_names + if hasattr(libindex, engine_name)] class NumericEngineIndexing(object): - params = [[Int64Engine, UInt64Engine, Float64Engine], - [np.int64, np.uint64, np.float64], + params = [_get_numeric_engines(), ['monotonic_incr', 'monotonic_decr', 'non_monotonic'], ] - param_names = ['engine', 'dtype', 'index_type'] + param_names = ['engine_and_dtype', 'index_type'] - def setup(self, engine, dtype, index_type): + def setup(self, engine_and_dtype, index_type): + engine, dtype = engine_and_dtype N = 10**5 values = list([1] * N + [2] * N + [3] * N) arr = { @@ -26,7 +38,7 @@ def setup(self, engine, dtype, index_type): # code belows avoids populating the mapping etc. while timing.
self.data.get_loc(2) - def time_get_loc(self, engine, dtype, index_type): + def time_get_loc(self, engine_and_dtype, index_type): self.data.get_loc(2) @@ -44,7 +56,7 @@ def setup(self, index_type): 'non_monotonic': np.array(list('abc') * N, dtype=object), }[index_type] - self.data = ObjectEngine(lambda: arr, len(arr)) + self.data = libindex.ObjectEngine(lambda: arr, len(arr)) # code belows avoids populating the mapping etc. while timing. self.data.get_loc('b') diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5d473bfc5a38c..373c22fdf8e62 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -7,10 +7,10 @@ jobs: parameters: name: macOS vmImage: xcode9-macos10.13 -# - template: ci/azure/linux.yml -# parameters: -# name: Linux -# vmImage: ubuntu-16.04 +- template: ci/azure/linux.yml + parameters: + name: Linux + vmImage: ubuntu-16.04 # Windows Python 2.7 needs VC 9.0 installed, and not sure # how to make that a conditional task, so for now these are diff --git a/ci/circle-27-compat.yaml b/ci/azure-27-compat.yaml similarity index 100% rename from ci/circle-27-compat.yaml rename to ci/azure-27-compat.yaml diff --git a/ci/circle-36-locale_slow.yaml b/ci/azure-36-locale_slow.yaml similarity index 100% rename from ci/circle-36-locale_slow.yaml rename to ci/azure-36-locale_slow.yaml diff --git a/ci/azure-37-locale.yaml b/ci/azure-37-locale.yaml new file mode 100644 index 0000000000000..ef97b85406709 --- /dev/null +++ b/ci/azure-37-locale.yaml @@ -0,0 +1,35 @@ +name: pandas +channels: + - defaults + - conda-forge +dependencies: + - beautifulsoup4 + - cython>=0.28.2 + - html5lib + - ipython + - jinja2 + - lxml + - matplotlib + - nomkl + - numexpr + - numpy + - openpyxl=2.5.5 + - psycopg2 + - pymysql + - pytables + - python-dateutil + - python=3.6* + - pytz + - s3fs + - scipy + - sqlalchemy + - xarray + - xlrd + - xlsxwriter + - xlwt + # universal + - pytest + - pytest-xdist + - moto + - pip: + - hypothesis>=3.58.0 diff --git a/ci/azure/linux.yml b/ci/azure/linux.yml new file mode 100644 index 0000000000000..f34cba69a6195 --- /dev/null +++ b/ci/azure/linux.yml @@ -0,0 +1,56 @@ +parameters: + name: '' + vmImage: '' + +jobs: +- job: ${{ parameters.name }} + pool: + vmImage: ${{ parameters.vmImage }} + strategy: + maxParallel: 11 + matrix: + py27_np_19: + ENV_FILE: ci/azure-27-compat.yaml + CONDA_PY: "27" + CONDA_ENV: pandas + TEST_ARGS: "--skip-slow --skip-network" + + py36_locale: + ENV_FILE: ci/azure-37-locale.yaml + CONDA_PY: "37" + CONDA_ENV: pandas + TEST_ARGS: "--skip-slow --skip-network" + LOCALE_OVERRIDE: "zh_CN.UTF-8" + + py36_locale_slow: + ENV_FILE: ci/azure-36-locale_slow.yaml + CONDA_PY: "36" + CONDA_ENV: pandas + TEST_ARGS: "--only-slow --skip-network" + + steps: + - script: | + if [ "$(uname)" == "Linux" ]; then sudo apt-get install -y libc6-dev-i386; fi + echo "Installing Miniconda" + ci/incremental/install_miniconda.sh + export PATH=$HOME/miniconda3/bin:$PATH + echo "Setting up Conda environment" + ci/incremental/setup_conda_environment.sh + displayName: 'Before Install' + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + ci/incremental/build.sh + displayName: 'Build' + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + ci/script_single.sh + ci/script_multi.sh + echo "[Test done]" + displayName: 'Test' + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - task: PublishTestResults@2 + inputs: + testResultsFiles: 'test-data-*.xml' + testRunTitle: 'Linux' \
No newline at end of file diff --git a/ci/azure/macos.yml b/ci/azure/macos.yml index fb10d89731f26..53ce51c76683c 100644 --- a/ci/azure/macos.yml +++ b/ci/azure/macos.yml @@ -39,5 +39,5 @@ jobs: source activate pandas && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - task: PublishTestResults@2 inputs: - testResultsFiles: '/tmp/*.xml' + testResultsFiles: 'test-data-*.xml' testRunTitle: 'MacOS-35' diff --git a/ci/circle-35-ascii.yaml b/ci/circle-35-ascii.yaml deleted file mode 100644 index 281ed59e2deff..0000000000000 --- a/ci/circle-35-ascii.yaml +++ /dev/null @@ -1,15 +0,0 @@ -name: pandas -channels: - - defaults -dependencies: - - cython>=0.28.2 - - nomkl - - numpy - - python-dateutil - - python=3.5* - - pytz - # universal - - pytest - - pytest-xdist - - pip: - - hypothesis>=3.58.0 diff --git a/ci/install_circle.sh b/ci/circle/install_circle.sh similarity index 100% rename from ci/install_circle.sh rename to ci/circle/install_circle.sh diff --git a/ci/run_circle.sh b/ci/circle/run_circle.sh similarity index 100% rename from ci/run_circle.sh rename to ci/circle/run_circle.sh diff --git a/ci/show_circle.sh b/ci/circle/show_circle.sh similarity index 100% rename from ci/show_circle.sh rename to ci/circle/show_circle.sh diff --git a/ci/incremental/setup_conda_environment.sh b/ci/incremental/setup_conda_environment.sh index c716a39138644..f3ac99d5e7c5a 100755 --- a/ci/incremental/setup_conda_environment.sh +++ b/ci/incremental/setup_conda_environment.sh @@ -27,13 +27,17 @@ set -v # w/o removing anything else echo echo "[removing installed pandas]" -conda remove pandas -y --force -pip uninstall -y pandas +conda remove pandas -y --force || true +pip uninstall -y pandas || true echo echo "[no installed pandas]" conda list pandas +if [ -n "$LOCALE_OVERRIDE" ]; then + sudo locale-gen "$LOCALE_OVERRIDE" +fi + # # Install the compiler toolchain # if [[ $(uname) == Linux ]]; then # if [[ "$CONDA_SUBDIR" == "linux-32" || "$BITS32" == "yes" ]] ; then diff --git a/ci/script_multi.sh b/ci/script_multi.sh index dcc5a14d7b3b4..e076558e8fff3 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -27,17 +27,17 @@ if [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" elif [ "$COVERAGE" ]; then - echo pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas - pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas + echo pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas + pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas elif [ "$SLOW" ]; then TEST_ARGS="--only-slow --skip-network" - echo pytest -m "not single and slow" -v --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas - pytest -m "not single and slow" -v --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas + echo pytest -m "not single and slow" -v --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas + pytest -m "not single and slow" -v --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas else - echo pytest -n 2 -m "not single" --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas - pytest -n 2 -m "not single" --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas # TODO: doctest + echo pytest -n 2 -m "not single" 
--junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas + pytest -n 2 -m "not single" --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas # TODO: doctest fi diff --git a/ci/script_single.sh b/ci/script_single.sh index 09e7446a2d876..42d326e0965ee 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -5,8 +5,9 @@ echo "[script_single]" source activate pandas if [ -n "$LOCALE_OVERRIDE" ]; then + echo "Setting LC_ALL and LANG to $LOCALE_OVERRIDE" export LC_ALL="$LOCALE_OVERRIDE"; - echo "Setting LC_ALL to $LOCALE_OVERRIDE" + export LANG="$LOCALE_OVERRIDE"; pycmd='import pandas; print("pandas detected console encoding: %s" % pandas.get_option("display.encoding"))' python -c "$pycmd" @@ -25,14 +26,13 @@ if [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" elif [ "$COVERAGE" ]; then - echo pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas - pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas - + echo pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=test-data-single.xml $TEST_ARGS pandas + pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=test-data-single.xml $TEST_ARGS pandas echo pytest -s --strict scripts pytest -s --strict scripts else - echo pytest -m "single" --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas - pytest -m "single" --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest + echo pytest -m "single" --junitxml=test-data-single.xml --strict $TEST_ARGS pandas + pytest -m "single" --junitxml=test-data-single.xml --strict $TEST_ARGS pandas fi diff --git a/doc/source/api.rst b/doc/source/api.rst index 1ec2a56dcd094..6e8eb83577c46 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -851,6 +851,22 @@ Sparse SparseSeries.to_coo SparseSeries.from_coo +.. autosummary:: + :toctree: generated/ + :template: autosummary/accessor_attribute.rst + + Series.sparse.npoints + Series.sparse.density + Series.sparse.fill_value + Series.sparse.sp_values + + +.. autosummary:: + :toctree: generated/ + + Series.sparse.from_coo + Series.sparse.to_coo + .. _api.dataframe: DataFrame diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 67b8d287d5d1a..66d545a0de6e9 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -684,7 +684,7 @@ Test-driven development/code writing ------------------------------------ *pandas* is serious about testing and strongly encourages contributors to embrace -`test-driven development (TDD) `_. +`test-driven development (TDD) `_. This development process "relies on the repetition of a very short development cycle: first the developer writes an (initially failing) automated test case that defines a desired improvement or new function, then produces the minimum amount of code to pass that test." diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index 2bb99dd1822b6..884512981e1c9 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -62,6 +62,26 @@ Any sparse object can be converted back to the standard dense form by calling sts.to_dense() +.. _sparse.accessor: + +Sparse Accessor +--------------- + +.. versionadded:: 0.24.0 + +Pandas provides a ``.sparse`` accessor, similar to ``.str`` for string data, ``.cat`` +for categorical data, and ``.dt`` for datetime-like data. 
This namespace provides +attributes and methods that are specific to sparse data. + +.. ipython:: python + + s = pd.Series([0, 0, 1, 2], dtype="Sparse[int]") + s.sparse.density + s.sparse.fill_value + +This accessor is available only on data with ``SparseDtype``, and on the :class:`Series` +class itself for creating a Series with sparse data from a scipy COO matrix. + .. _sparse.array: SparseArray diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 5fa391c3433ea..768868d585721 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -532,7 +532,6 @@ changes were made: - ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer supports combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray. - Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed. - Some new warnings are issued for operations that require or are likely to materialize a large dense array: - A :class:`errors.PerformanceWarning` is issued when using fillna with a ``method``, as a dense array is constructed to create the filled array. Filling with a ``value`` is the efficient way to fill a sparse array. @@ -540,6 +539,13 @@ Some new warnings are issued for operations that require or are likely to materi In addition to these API breaking changes, many :ref:`performance improvements and bug fixes have been made `. +Finally, a ``Series.sparse`` accessor was added to provide sparse-specific methods like :meth:`Series.sparse.from_coo`. + +.. ipython:: python + + s = pd.Series([0, 0, 1, 1, 1], dtype='Sparse[int]') + s.sparse.density + .. _whatsnew_0240.api_breaking.frame_to_dict_index_orient: Raise ValueError in ``DataFrame.to_dict(orient='index')`` @@ -948,9 +954,11 @@ Removal of prior version deprecations/changes Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`, - both when indexing by label (using .loc) and position(.iloc). - Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`) +- Slicing Series and DataFrames with a monotonically increasing :class:`CategoricalIndex` + is now very fast and has speed comparable to slicing with an ``Int64Index``. + The speed increase applies both when indexing by label (using .loc) and by position (.iloc) (:issue:`20395`) + Slicing a monotonically increasing :class:`CategoricalIndex` itself (i.e.
``ci[1000:2000]``) + shows similar speed improvements as above (:issue:`21659`) - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index d2914dc8ac751..3ba4c2375b4e8 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -10,7 +10,8 @@ from libc.math cimport fabs, sqrt import numpy as np cimport numpy as cnp from numpy cimport (ndarray, - NPY_INT64, NPY_UINT64, NPY_INT32, NPY_INT16, NPY_INT8, + NPY_INT64, NPY_INT32, NPY_INT16, NPY_INT8, + NPY_UINT64, NPY_UINT32, NPY_UINT16, NPY_UINT8, NPY_FLOAT32, NPY_FLOAT64, NPY_OBJECT, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, @@ -359,9 +360,13 @@ ctypedef fused algos_t: float64_t float32_t object - int32_t int64_t + int32_t + int16_t + int8_t uint64_t + uint32_t + uint16_t uint8_t @@ -459,7 +464,12 @@ pad_float32 = pad["float32_t"] pad_object = pad["object"] pad_int64 = pad["int64_t"] pad_int32 = pad["int32_t"] +pad_int16 = pad["int16_t"] +pad_int8 = pad["int8_t"] pad_uint64 = pad["uint64_t"] +pad_uint32 = pad["uint32_t"] +pad_uint16 = pad["uint16_t"] +pad_uint8 = pad["uint8_t"] pad_bool = pad["uint8_t"] @@ -653,7 +663,12 @@ backfill_float32 = backfill["float32_t"] backfill_object = backfill["object"] backfill_int64 = backfill["int64_t"] backfill_int32 = backfill["int32_t"] +backfill_int16 = backfill["int16_t"] +backfill_int8 = backfill["int8_t"] backfill_uint64 = backfill["uint64_t"] +backfill_uint32 = backfill["uint32_t"] +backfill_uint16 = backfill["uint16_t"] +backfill_uint8 = backfill["uint8_t"] backfill_bool = backfill["uint8_t"] @@ -866,7 +881,12 @@ is_monotonic_float32 = is_monotonic["float32_t"] is_monotonic_object = is_monotonic["object"] is_monotonic_int64 = is_monotonic["int64_t"] is_monotonic_int32 = is_monotonic["int32_t"] +is_monotonic_int16 = is_monotonic["int16_t"] +is_monotonic_int8 = is_monotonic["int8_t"] is_monotonic_uint64 = is_monotonic["uint64_t"] +is_monotonic_uint32 = is_monotonic["uint32_t"] +is_monotonic_uint16 = is_monotonic["uint16_t"] +is_monotonic_uint8 = is_monotonic["uint8_t"] is_monotonic_bool = is_monotonic["uint8_t"] diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index b39b5eaced8fd..518664d70cf06 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -133,6 +133,9 @@ dtypes = [('float64', 'FLOAT64', 'float64'), ('int16', 'INT16', 'int16'), ('int32', 'INT32', 'int32'), ('int64', 'INT64', 'int64'), + ('uint8', 'UINT8', 'uint8'), + ('uint16', 'UINT16', 'uint16'), + ('uint32', 'UINT32', 'uint32'), ('uint64', 'UINT64', 'uint64'), # ('platform_int', 'INT', 'int_'), # ('object', 'OBJECT', 'object_'), diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 3f76915655f58..d418ac63a4ac8 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -5,8 +5,10 @@ import cython import numpy as np cimport numpy as cnp -from numpy cimport (ndarray, float64_t, int32_t, - int64_t, uint8_t, uint64_t, intp_t, +from numpy cimport (ndarray, intp_t, + float64_t, float32_t, + int64_t, int32_t, int16_t, int8_t, + uint64_t, uint32_t, uint16_t, uint8_t, # Note: NPY_DATETIME, NPY_TIMEDELTA are only available # for cimport in cython>=0.27.3 NPY_DATETIME, 
NPY_TIMEDELTA) diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index 4ea35da0626f3..c19812efaaa35 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -10,14 +10,22 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# name, dtype, ctype -dtypes = [('Float64', 'float64', 'float64_t'), - ('UInt64', 'uint64', 'uint64_t'), - ('Int64', 'int64', 'int64_t'), - ('Object', 'object', 'object')] +# name, dtype, ctype, hashtable_name, hashtable_dtype +dtypes = [('Float64', 'float64', 'float64_t', 'Float64', 'float64'), + ('Float32', 'float32', 'float32_t', 'Float64', 'float64'), + ('Int64', 'int64', 'int64_t', 'Int64', 'int64'), + ('Int32', 'int32', 'int32_t', 'Int64', 'int64'), + ('Int16', 'int16', 'int16_t', 'Int64', 'int64'), + ('Int8', 'int8', 'int8_t', 'Int64', 'int64'), + ('UInt64', 'uint64', 'uint64_t', 'UInt64', 'uint64'), + ('UInt32', 'uint32', 'uint32_t', 'UInt64', 'uint64'), + ('UInt16', 'uint16', 'uint16_t', 'UInt64', 'uint64'), + ('UInt8', 'uint8', 'uint8_t', 'UInt64', 'uint64'), + ('Object', 'object', 'object', 'PyObject', 'object'), + ] }} -{{for name, dtype, ctype in dtypes}} +{{for name, dtype, ctype, hashtable_name, hashtable_dtype in dtypes}} cdef class {{name}}Engine(IndexEngine): @@ -34,13 +42,9 @@ cdef class {{name}}Engine(IndexEngine): other, limit=limit) cdef _make_hash_table(self, n): - {{if name == 'Object'}} - return _hash.PyObjectHashTable(n) - {{else}} - return _hash.{{name}}HashTable(n) - {{endif}} + return _hash.{{hashtable_name}}HashTable(n) - {{if name != 'Float64' and name != 'Object'}} + {{if name not in {'Float64', 'Float32', 'Object'} }} cdef _check_type(self, object val): hash(val) if util.is_bool_object(val): @@ -50,6 +54,11 @@ cdef class {{name}}Engine(IndexEngine): {{endif}} {{if name != 'Object'}} + cpdef _call_map_locations(self, values): + # self.mapping is of type {{hashtable_name}}HashTable, + # so convert dtype of values + self.mapping.map_locations(algos.ensure_{{hashtable_dtype}}(values)) + cdef _get_index_values(self): return algos.ensure_{{dtype}}(self.vgetter()) @@ -60,7 +69,7 @@ cdef class {{name}}Engine(IndexEngine): ndarray[{{ctype}}] values int count = 0 - {{if name != 'Float64'}} + {{if name not in {'Float64', 'Float32'} }} if not util.is_integer_object(val): raise KeyError(val) {{endif}} diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index eab529584d1fb..bc91372e3ac7d 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -113,15 +113,18 @@ def delegate_names(delegate, accessors, typ, overwrite=False): Parameters ---------- - delegate : the class to get methods/properties & doc-strings - acccessors : string list of accessors to add - typ : 'property' or 'method' + delegate : object + the class to get methods/properties & doc-strings + accessors : Sequence[str] + List of accessors to add + typ : {'property', 'method'} overwrite : boolean, default False overwrite the method/property in the target class if it exists Returns ------- - decorator + callable + A class decorator.
Examples -------- diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index e269f2e02ddfd..cd20bcbed2211 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1046,48 +1046,60 @@ def date(self): 'dim', "The number of days in the month") daysinmonth = days_in_month - is_month_start = _field_accessor( - 'is_month_start', - 'is_month_start', - "Logical indicating if first day of month (defined by frequency)") - is_month_end = _field_accessor( - 'is_month_end', - 'is_month_end', - """ - Indicator for whether the date is the last day of the month. + _is_month_doc = """ + Indicates whether the date is the {first_or_last} day of the month. Returns ------- Series or array - For Series, returns a Series with boolean values. For - DatetimeIndex, returns a boolean array. + For Series, returns a Series with boolean values. + For DatetimeIndex, returns a boolean array. See Also -------- - is_month_start : Indicator for whether the date is the first day - of the month. + is_month_start : Return a boolean indicating whether the date + is the first day of the month. + is_month_end : Return a boolean indicating whether the date + is the last day of the month. Examples -------- This method is available on Series with datetime values under the ``.dt`` accessor, and directly on DatetimeIndex. - >>> dates = pd.Series(pd.date_range("2018-02-27", periods=3)) - >>> dates + >>> s = pd.Series(pd.date_range("2018-02-27", periods=3)) + >>> s 0 2018-02-27 1 2018-02-28 2 2018-03-01 dtype: datetime64[ns] - >>> dates.dt.is_month_end + >>> s.dt.is_month_start + 0 False + 1 False + 2 True + dtype: bool + >>> s.dt.is_month_end 0 False 1 True 2 False dtype: bool >>> idx = pd.date_range("2018-02-27", periods=3) + >>> idx.is_month_start + array([False, False, True]) >>> idx.is_month_end - array([False, True, False], dtype=bool) - """) + array([False, True, False]) + """ + is_month_start = _field_accessor( + 'is_month_start', + 'is_month_start', + _is_month_doc.format(first_or_last='first')) + + is_month_end = _field_accessor( + 'is_month_end', + 'is_month_end', + _is_month_doc.format(first_or_last='last')) + is_quarter_start = _field_accessor( 'is_quarter_start', 'is_quarter_start', diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 920a9f8286f0d..08c961935a990 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -2,56 +2,47 @@ SparseArray data structure """ from __future__ import division -# pylint: disable=E1101,E1103,W0231 -import re -import operator import numbers -import numpy as np +import operator +import re import warnings -import pandas as pd -from pandas.core.base import PandasObject +import numpy as np +import pandas._libs.sparse as splib +import pandas.core.algorithms as algos +import pandas.core.common as com +import pandas.io.formats.printing as printing from pandas import compat -from pandas.errors import PerformanceWarning +from pandas._libs import index as libindex, lib +from pandas._libs.sparse import BlockIndex, IntIndex +from pandas._libs.tslibs import NaT from pandas.compat.numpy import function as nv - +from pandas.core.accessor import PandasDelegate, delegate_names from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin -import pandas.core.common as com +from pandas.core.base import PandasObject from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.cast import ( + astype_nansafe, construct_1d_arraylike_from_scalar, find_common_type, + infer_dtype_from_scalar, 
maybe_convert_platform +) +from pandas.core.dtypes.common import ( + is_array_like, is_bool_dtype, is_datetime64_any_dtype, is_dtype_equal, + is_integer, is_list_like, is_object_dtype, is_scalar, is_string_dtype, + pandas_dtype +) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.generic import ( - ABCSparseSeries, ABCSeries, ABCIndexClass + ABCIndexClass, ABCSeries, ABCSparseSeries ) -from pandas.core.dtypes.common import ( - is_datetime64_any_dtype, - is_integer, - is_object_dtype, - is_array_like, - pandas_dtype, - is_bool_dtype, - is_list_like, - is_string_dtype, - is_scalar, is_dtype_equal) -from pandas.core.dtypes.cast import ( - maybe_convert_platform, - astype_nansafe, find_common_type, infer_dtype_from_scalar, - construct_1d_arraylike_from_scalar) -from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype +from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna from pandas.core.missing import interpolate_2d - -import pandas._libs.sparse as splib -from pandas._libs.sparse import BlockIndex, IntIndex -from pandas._libs import index as libindex -from pandas._libs import lib -import pandas.core.algorithms as algos -import pandas.io.formats.printing as printing +from pandas.errors import PerformanceWarning # ---------------------------------------------------------------------------- # Dtype - @register_extension_dtype class SparseDtype(ExtensionDtype): """ @@ -178,6 +169,7 @@ def _is_boolean(self): @property def kind(self): + """The sparse kind. Either 'integer' or 'block'.""" return self.subtype.kind @property @@ -618,7 +610,7 @@ def __array__(self, dtype=None, copy=True): if is_datetime64_any_dtype(self.sp_values.dtype): # However, we *do* special-case the common case of # a datetime64 with pandas NaT. - if fill_value is pd.NaT: + if fill_value is NaT: # Can't put pd.NaT in a datetime64[ns] fill_value = np.datetime64('NaT') try: @@ -648,10 +640,22 @@ def _from_factorized(cls, values, original): # ------------------------------------------------------------------------ @property def sp_index(self): + """ + The SparseIndex containing the location of non- ``fill_value`` points. + """ return self._sparse_index @property def sp_values(self): + """ + An ndarray containing the non- ``fill_value`` values. + + Examples + -------- + >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0) + >>> s.sp_values + array([1, 2]) + """ return self._sparse_values @property @@ -696,7 +700,7 @@ def _null_fill_value(self): def _fill_value_matches(self, fill_value): if self._null_fill_value: - return pd.isna(fill_value) + return isna(fill_value) else: return self.fill_value == fill_value @@ -704,6 +708,31 @@ def nbytes(self): return self.sp_values.nbytes + self.sp_index.nbytes + @property + def density(self): + """The percent of non- ``fill_value`` points, as a decimal. + + Examples + -------- + >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0) + >>> s.density + 0.6 + """ + r = float(self.sp_index.npoints) / float(self.sp_index.length) + return r + + @property + def npoints(self): + """The number of non- ``fill_value`` points.
+ + Examples + -------- + >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0) + >>> s.npoints + 3 + """ + return self.sp_index.npoints + @property def values(self): """ @@ -816,7 +845,7 @@ def _first_fill_value_loc(self): return np.searchsorted(diff, 2) + 1 def unique(self): - uniques = list(pd.unique(self.sp_values)) + uniques = list(algos.unique(self.sp_values)) fill_loc = self._first_fill_value_loc() if fill_loc >= 0: uniques.insert(fill_loc, self.fill_value) @@ -832,8 +861,8 @@ def factorize(self, na_sentinel=-1): # ExtensionArray.factorize -> Tuple[EA, EA] # Given that we have to return a dense array of labels, why bother # implementing an efficient factorize? - labels, uniques = pd.factorize(np.asarray(self), - na_sentinel=na_sentinel) + labels, uniques = algos.factorize(np.asarray(self), + na_sentinel=na_sentinel) uniques = SparseArray(uniques, dtype=self.dtype) return labels, uniques @@ -850,6 +879,8 @@ def value_counts(self, dropna=True): ------- counts : Series """ + from pandas import Index, Series + keys, counts = algos._value_counts_arraylike(self.sp_values, dropna=dropna) fcounts = self.sp_index.ngaps @@ -858,7 +889,7 @@ def value_counts(self, dropna=True): pass else: if self._null_fill_value: - mask = pd.isna(keys) + mask = isna(keys) else: mask = keys == self.fill_value @@ -868,9 +899,9 @@ def value_counts(self, dropna=True): keys = np.insert(keys, 0, self.fill_value) counts = np.insert(counts, 0, fcounts) - if not isinstance(keys, pd.Index): - keys = pd.Index(keys) - result = pd.Series(counts, index=keys) + if not isinstance(keys, ABCIndexClass): + keys = Index(keys) + result = Series(counts, index=keys) return result # -------- @@ -1744,3 +1775,138 @@ def _make_index(length, indices, kind): else: # pragma: no cover raise ValueError('must be block or integer type') return index + + +# ---------------------------------------------------------------------------- +# Accessor + +@delegate_names(SparseArray, ['npoints', 'density', 'fill_value', + 'sp_values'], + typ='property') +class SparseAccessor(PandasDelegate): + def __init__(self, data=None): + self._validate(data) + # Store the Series since we need that for to_coo + self._parent = data + + @staticmethod + def _validate(data): + if not isinstance(data.dtype, SparseDtype): + msg = "Can only use the '.sparse' accessor with Sparse data." + raise AttributeError(msg) + + def _delegate_property_get(self, name, *args, **kwargs): + return getattr(self._parent.values, name) + + def _delegate_method(self, name, *args, **kwargs): + if name == 'from_coo': + return self.from_coo(*args, **kwargs) + elif name == 'to_coo': + return self.to_coo(*args, **kwargs) + else: + raise ValueError + + @classmethod + def from_coo(cls, A, dense_index=False): + """ + Create a SparseSeries from a scipy.sparse.coo_matrix. + + Parameters + ---------- + A : scipy.sparse.coo_matrix + dense_index : bool, default False + If False (default), the SparseSeries index consists of only the + coords of the non-null entries of the original coo_matrix. + If True, the SparseSeries index consists of the full sorted + (row, col) coordinates of the coo_matrix. 
+ + Returns + ------- + s : SparseSeries + + Examples + -------- + >>> from scipy import sparse + >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), + shape=(3, 4)) + >>> A + <3x4 sparse matrix of type '' + with 3 stored elements in COOrdinate format> + >>> A.todense() + matrix([[ 0., 0., 1., 2.], + [ 3., 0., 0., 0.], + [ 0., 0., 0., 0.]]) + >>> ss = pd.SparseSeries.from_coo(A) + >>> ss + 0 2 1 + 3 2 + 1 0 3 + dtype: float64 + BlockIndex + Block locations: array([0], dtype=int32) + Block lengths: array([3], dtype=int32) + """ + from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series + from pandas import Series + + result = _coo_to_sparse_series(A, dense_index=dense_index) + # SparseSeries -> Series[sparse] + result = Series(result.values, index=result.index, copy=False) + + return result + + def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False): + """ + Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex. + + Use row_levels and column_levels to determine the row and column + coordinates respectively. row_levels and column_levels are the names + (labels) or numbers of the levels. {row_levels, column_levels} must be + a partition of the MultiIndex level names (or numbers). + + Parameters + ---------- + row_levels : tuple/list + column_levels : tuple/list + sort_labels : bool, default False + Sort the row and column labels before forming the sparse matrix. + + Returns + ------- + y : scipy.sparse.coo_matrix + rows : list (row labels) + columns : list (column labels) + + Examples + -------- + >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) + >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), + (1, 2, 'a', 1), + (1, 1, 'b', 0), + (1, 1, 'b', 1), + (2, 1, 'b', 0), + (2, 1, 'b', 1)], + names=['A', 'B', 'C', 'D']) + >>> ss = s.to_sparse() + >>> A, rows, columns = ss.to_coo(row_levels=['A', 'B'], + column_levels=['C', 'D'], + sort_labels=True) + >>> A + <3x4 sparse matrix of type '' + with 3 stored elements in COOrdinate format> + >>> A.todense() + matrix([[ 0., 0., 1., 3.], + [ 3., 0., 0., 0.], + [ 0., 0., 0., 0.]]) + >>> rows + [(1, 1), (1, 2), (2, 1)] + >>> columns + [('a', 0), ('a', 1), ('b', 0), ('b', 1)] + """ + from pandas.core.sparse.scipy_sparse import _sparse_series_to_coo + + A, rows, columns = _sparse_series_to_coo(self._parent, + row_levels, + column_levels, + sort_labels=sort_labels) + return A, rows, columns diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 56bf394729773..46c8126f65fec 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -6,7 +6,7 @@ import warnings from pandas._libs import tslib, lib, tslibs -from pandas._libs.tslibs import iNaT, OutOfBoundsDatetime +from pandas._libs.tslibs import iNaT, OutOfBoundsDatetime, Period from pandas.compat import string_types, text_type, PY3 from .common import (ensure_object, is_bool, is_integer, is_float, is_complex, is_datetimetz, is_categorical_dtype, @@ -164,6 +164,12 @@ def trans(x): # noqa result = to_datetime(result).tz_localize('utc') result = result.tz_convert(dtype.tz) + elif dtype.type == Period: + # TODO(DatetimeArray): merge with previous elif + from pandas.core.arrays import PeriodArray + + return PeriodArray(result, freq=dtype.freq) + except Exception: pass diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index ac84971de08d8..9ef30b8fd021f 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -5,8 +5,9 @@ """ import types + +from
pandas.core.dtypes.common import is_list_like, is_scalar from pandas.util._decorators import make_signature -from pandas.core.dtypes.common import is_scalar, is_list_like class GroupByMixin(object): diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index e54045884ea93..3e653704bbace 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -1,7 +1,9 @@ import numpy as np + from pandas.core.algorithms import unique1d from pandas.core.arrays.categorical import ( - _recode_for_categories, CategoricalDtype, Categorical) + Categorical, CategoricalDtype, _recode_for_categories +) def recode_for_groupby(c, sort, observed): diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 63bf67854e5cd..a832eecf87721 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -7,48 +7,40 @@ """ import collections -import warnings import copy -from textwrap import dedent +import warnings from functools import partial +from textwrap import dedent import numpy as np -from pandas._libs import lib, Timestamp -from pandas.util._decorators import Substitution, Appender -from pandas import compat - -import pandas.core.indexes.base as ibase +import pandas.core.algorithms as algorithms import pandas.core.common as com -from pandas.core.panel import Panel +import pandas.core.indexes.base as ibase +from pandas import compat +from pandas._libs import Timestamp, lib from pandas.compat import lzip, map - -from pandas.core.series import Series -from pandas.core.generic import _shared_docs -from pandas.core.groupby.groupby import ( - GroupBy, _apply_docs, _transform_template) -from pandas.core.generic import NDFrame -from pandas.core.groupby import base +from pandas.compat.numpy import _np_version_under1p13 +from pandas.core.arrays import Categorical +from pandas.core.base import DataError, SpecificationError +from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( - is_scalar, - is_bool, - is_datetimelike, - is_numeric_dtype, - is_integer_dtype, - is_interval_dtype, - ensure_platform_int, - ensure_int64) + ensure_int64, ensure_platform_int, is_bool, is_datetimelike, + is_integer_dtype, is_interval_dtype, is_numeric_dtype, is_scalar +) from pandas.core.dtypes.missing import isna, notna -import pandas.core.algorithms as algorithms from pandas.core.frame import DataFrame -from pandas.core.dtypes.cast import maybe_downcast_to_dtype -from pandas.core.base import SpecificationError, DataError -from pandas.core.index import Index, MultiIndex, CategoricalIndex -from pandas.core.arrays import Categorical +from pandas.core.generic import NDFrame, _shared_docs +from pandas.core.groupby import base +from pandas.core.groupby.groupby import ( + GroupBy, _apply_docs, _transform_template +) +from pandas.core.index import CategoricalIndex, Index, MultiIndex from pandas.core.internals import BlockManager, make_block -from pandas.compat.numpy import _np_version_under1p13 - +from pandas.core.panel import Panel +from pandas.core.series import Series from pandas.plotting._core import boxplot_frame_groupby +from pandas.util._decorators import Appender, Substitution class NDFrameGroupBy(GroupBy): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 025be781d9ee8..5acccbf688e30 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -7,42 +7,36 @@ class providing the base-class of operations. 
expose these user-facing objects to provide specific functionailty. """ -import types -from functools import wraps, partial -import datetime import collections +import datetime +import types import warnings from contextlib import contextmanager +from functools import partial, wraps import numpy as np -from pandas._libs import groupby as libgroupby, Timestamp -from pandas.util._validators import validate_kwargs -from pandas.util._decorators import ( - cache_readonly, Substitution, Appender) - +import pandas.core.algorithms as algorithms +import pandas.core.common as com from pandas import compat -from pandas.compat import zip, range, callable, set_function_name +from pandas._libs import Timestamp, groupby as libgroupby +from pandas.compat import callable, range, set_function_name, zip from pandas.compat.numpy import function as nv - -from pandas.core.dtypes.common import ( - is_numeric_dtype, - is_scalar, - ensure_float) +from pandas.core.base import ( + DataError, GroupByError, PandasObject, SelectionMixin, SpecificationError +) +from pandas.core.config import option_context from pandas.core.dtypes.cast import maybe_downcast_to_dtype +from pandas.core.dtypes.common import ensure_float, is_numeric_dtype, is_scalar from pandas.core.dtypes.missing import isna, notna - +from pandas.core.frame import DataFrame +from pandas.core.generic import NDFrame from pandas.core.groupby import base -from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, - DataError, SpecificationError) from pandas.core.index import Index, MultiIndex -from pandas.core.generic import NDFrame -from pandas.core.frame import DataFrame from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter -import pandas.core.common as com -import pandas.core.algorithms as algorithms -from pandas.core.config import option_context +from pandas.util._decorators import Appender, Substitution, cache_readonly +from pandas.util._validators import validate_kwargs _doc_template = """ diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 1c8fe0e6cadad..cbe87040b8117 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -4,31 +4,25 @@ """ import warnings -import numpy as np -from pandas.util._decorators import cache_readonly +import numpy as np +import pandas.core.algorithms as algorithms +import pandas.core.common as com from pandas import compat -from pandas.compat import zip, callable - -from pandas.core.dtypes.generic import ABCSeries -from pandas.core.arrays import ExtensionArray, Categorical -from pandas.core.index import ( - Index, MultiIndex, CategoricalIndex) +from pandas.compat import callable, zip +from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.dtypes.common import ( - ensure_categorical, - is_hashable, - is_list_like, - is_timedelta64_dtype, - is_datetime64_dtype, - is_categorical_dtype, - is_scalar) -from pandas.core.series import Series + ensure_categorical, is_categorical_dtype, is_datetime64_dtype, is_hashable, + is_list_like, is_scalar, is_timedelta64_dtype +) +from pandas.core.dtypes.generic import ABCSeries from pandas.core.frame import DataFrame -import pandas.core.common as com from pandas.core.groupby.ops import BaseGrouper -import pandas.core.algorithms as algorithms +from pandas.core.index import CategoricalIndex, Index, MultiIndex +from pandas.core.series import Series from pandas.io.formats.printing import pprint_thing +from pandas.util._decorators import cache_readonly class 
Grouper(object): diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b199127ac867b..af22744c4feec 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -6,42 +6,33 @@ are contained *in* the SeriesGroupBy and DataFrameGroupBy objects. """ -import copy import collections -import numpy as np - -from pandas._libs import lib, reduction, NaT, iNaT, groupby as libgroupby -from pandas.util._decorators import cache_readonly +import copy -from pandas.compat import zip, range, lzip +import numpy as np +import pandas.core.algorithms as algorithms +import pandas.core.common as com +from pandas._libs import NaT, groupby as libgroupby, iNaT, lib, reduction +from pandas.compat import lzip, range, zip from pandas.core.base import SelectionMixin -from pandas.core.dtypes.missing import isna, _maybe_fill -from pandas.core.index import ( - Index, MultiIndex, ensure_index) from pandas.core.dtypes.common import ( - ensure_float64, - ensure_platform_int, - ensure_int64, - ensure_int64_or_float64, - ensure_object, - needs_i8_conversion, - is_integer_dtype, - is_complex_dtype, - is_bool_dtype, - is_numeric_dtype, - is_timedelta64_dtype, - is_datetime64_any_dtype, - is_categorical_dtype) -from pandas.core.series import Series + ensure_float64, ensure_int64, ensure_int64_or_float64, ensure_object, + ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_complex_dtype, + is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype, + is_timedelta64_dtype, needs_i8_conversion +) +from pandas.core.dtypes.missing import _maybe_fill, isna from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -import pandas.core.common as com from pandas.core.groupby import base -from pandas.core.sorting import (get_group_index_sorter, get_group_index, - compress_group_index, get_flattened_iterator, - decons_obs_group_ids, get_indexer_dict) -import pandas.core.algorithms as algorithms +from pandas.core.index import Index, MultiIndex, ensure_index +from pandas.core.series import Series +from pandas.core.sorting import ( + compress_group_index, decons_obs_group_ids, get_flattened_iterator, + get_group_index, get_group_index_sorter, get_indexer_dict +) +from pandas.util._decorators import cache_readonly def generate_bins_generic(values, binner, closed): diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 35b9799579628..c3b94c297652a 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -1,7 +1,6 @@ """ datetimelike delegation """ - import numpy as np from pandas.core.dtypes.generic import ABCSeries diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index e4250ae790553..278e395d65014 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -84,7 +84,17 @@ class CategoricalIndex(Index, accessor.PandasDelegate): """ _typ = 'categoricalindex' - _engine_type = libindex.Int64Engine + + @property + def _engine_type(self): + # self.codes can have dtype int8, int16, int32 or int64, so we need + # to return the corresponding engine type (libindex.Int8Engine, etc.). 
+ return {np.int8: libindex.Int8Engine, + np.int16: libindex.Int16Engine, + np.int32: libindex.Int32Engine, + np.int64: libindex.Int64Engine, + }[self.codes.dtype.type] + _attributes = ['name'] def __new__(cls, data=None, categories=None, ordered=None, dtype=None, @@ -382,7 +392,7 @@ def argsort(self, *args, **kwargs): def _engine(self): # we are going to look things up with the codes themselves - return self._engine_type(lambda: self.codes.astype('i8'), len(self)) + return self._engine_type(lambda: self.codes, len(self)) # introspection @cache_readonly @@ -450,6 +460,7 @@ def get_loc(self, key, method=None): array([False, True, False, True], dtype=bool) """ code = self.categories.get_loc(key) + code = self.codes.dtype.type(code) try: return self._engine.get_loc(code) except KeyError: diff --git a/pandas/core/series.py b/pandas/core/series.py index d3ea005d3aae7..d03a88ea78f6f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -24,8 +24,9 @@ from pandas.compat.numpy import function as nv from pandas.core import base, generic from pandas.core.accessor import CachedAccessor -from pandas.core.arrays import ExtensionArray, period_array +from pandas.core.arrays import ExtensionArray, SparseArray, period_array from pandas.core.arrays.categorical import Categorical, CategoricalAccessor +from pandas.core.arrays.sparse import SparseAccessor from pandas.core.config import get_option from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na, @@ -142,7 +143,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): Copy input data """ _metadata = ['name'] - _accessors = {'dt', 'cat', 'str'} + _accessors = {'dt', 'cat', 'str', 'sparse'} _deprecations = generic.NDFrame._deprecations | frozenset( ['asobject', 'sortlevel', 'reshape', 'get_value', 'set_value', 'from_csv', 'valid']) @@ -1366,7 +1367,6 @@ def to_sparse(self, kind='block', fill_value=None): """ # TODO: deprecate from pandas.core.sparse.series import SparseSeries - from pandas.core.arrays import SparseArray values = SparseArray(self, kind=kind, fill_value=fill_value) return SparseSeries( @@ -4151,6 +4151,7 @@ def to_period(self, freq=None, copy=True): dt = CachedAccessor("dt", CombinedDatetimelikeProperties) cat = CachedAccessor("cat", CategoricalAccessor) plot = CachedAccessor("plot", gfx.SeriesPlotMethods) + sparse = CachedAccessor("sparse", SparseAccessor) # ---------------------------------------------------------------------- # Add plotting methods to Series diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 5a747c6e4b1d1..ff32712f9056a 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -27,6 +27,7 @@ from pandas.core.arrays import ( SparseArray, ) +from pandas.core.arrays.sparse import SparseAccessor from pandas._libs.sparse import BlockIndex, IntIndex import pandas._libs.sparse as splib @@ -183,7 +184,7 @@ def sp_values(self): @property def npoints(self): - return self.sp_index.npoints + return self.values.npoints @classmethod def from_array(cls, arr, index=None, name=None, copy=False, @@ -452,8 +453,7 @@ def to_dense(self): @property def density(self): - r = float(self.sp_index.npoints) / float(self.sp_index.length) - return r + return self.values.density def copy(self, deep=True): """ @@ -580,99 +580,16 @@ def combine_first(self, other): dense_combined = self.to_dense().combine_first(other) return dense_combined.to_sparse(fill_value=self.fill_value) + @Appender(SparseAccessor.to_coo.__doc__) def to_coo(self, 
row_levels=(0, ), column_levels=(1, ), sort_labels=False): - """ - Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex. - - Use row_levels and column_levels to determine the row and column - coordinates respectively. row_levels and column_levels are the names - (labels) or numbers of the levels. {row_levels, column_levels} must be - a partition of the MultiIndex level names (or numbers). - - Parameters - ---------- - row_levels : tuple/list - column_levels : tuple/list - sort_labels : bool, default False - Sort the row and column labels before forming the sparse matrix. - - Returns - ------- - y : scipy.sparse.coo_matrix - rows : list (row labels) - columns : list (column labels) - - Examples - -------- - >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) - >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), - (1, 2, 'a', 1), - (1, 1, 'b', 0), - (1, 1, 'b', 1), - (2, 1, 'b', 0), - (2, 1, 'b', 1)], - names=['A', 'B', 'C', 'D']) - >>> ss = s.to_sparse() - >>> A, rows, columns = ss.to_coo(row_levels=['A', 'B'], - column_levels=['C', 'D'], - sort_labels=True) - >>> A - <3x4 sparse matrix of type '' - with 3 stored elements in COOrdinate format> - >>> A.todense() - matrix([[ 0., 0., 1., 3.], - [ 3., 0., 0., 0.], - [ 0., 0., 0., 0.]]) - >>> rows - [(1, 1), (1, 2), (2, 1)] - >>> columns - [('a', 0), ('a', 1), ('b', 0), ('b', 1)] - """ A, rows, columns = _sparse_series_to_coo(self, row_levels, column_levels, sort_labels=sort_labels) return A, rows, columns @classmethod + @Appender(SparseAccessor.from_coo.__doc__) def from_coo(cls, A, dense_index=False): - """ - Create a SparseSeries from a scipy.sparse.coo_matrix. - - Parameters - ---------- - A : scipy.sparse.coo_matrix - dense_index : bool, default False - If False (default), the SparseSeries index consists of only the - coords of the non-null entries of the original coo_matrix. - If True, the SparseSeries index consists of the full sorted - (row, col) coordinates of the coo_matrix. 
-
-        Returns
-        -------
-        s : SparseSeries
-
-        Examples
-        ---------
-        >>> from scipy import sparse
-        >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])),
-                               shape=(3, 4))
-        >>> A
-        <3x4 sparse matrix of type '<class 'numpy.float64'>'
-                with 3 stored elements in COOrdinate format>
-        >>> A.todense()
-        matrix([[ 0.,  0.,  1.,  2.],
-                [ 3.,  0.,  0.,  0.],
-                [ 0.,  0.,  0.,  0.]])
-        >>> ss = pd.SparseSeries.from_coo(A)
-        >>> ss
-        0  2    1
-           3    2
-        1  0    3
-        dtype: float64
-        BlockIndex
-        Block locations: array([0], dtype=int32)
-        Block lengths: array([3], dtype=int32)
-        """
         return _coo_to_sparse_series(A, dense_index=dense_index)
diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
index e211b8626b53c..cc9512c0759fc 100644
--- a/pandas/tests/arrays/sparse/test_array.py
+++ b/pandas/tests/arrays/sparse/test_array.py
@@ -996,6 +996,55 @@ def test_asarray_datetime64(self):
         )
         np.asarray(s)
 
+    def test_density(self):
+        arr = SparseArray([0, 1])
+        assert arr.density == 0.5
+
+    def test_npoints(self):
+        arr = SparseArray([0, 1])
+        assert arr.npoints == 1
+
+
+class TestAccessor(object):
+
+    @pytest.mark.parametrize('attr', [
+        'npoints', 'density', 'fill_value', 'sp_values',
+    ])
+    def test_get_attributes(self, attr):
+        arr = SparseArray([0, 1])
+        ser = pd.Series(arr)
+
+        result = getattr(ser.sparse, attr)
+        expected = getattr(arr, attr)
+        assert result == expected
+
+    def test_from_coo(self):
+        sparse = pytest.importorskip("scipy.sparse")
+
+        row = [0, 3, 1, 0]
+        col = [0, 3, 1, 2]
+        data = [4, 5, 7, 9]
+        sp_array = sparse.coo_matrix((data, (row, col)))
+        result = pd.Series.sparse.from_coo(sp_array)
+
+        index = pd.MultiIndex.from_arrays([[0, 0, 1, 3], [0, 2, 1, 3]])
+        expected = pd.Series([4, 9, 7, 5], index=index, dtype='Sparse[int]')
+        tm.assert_series_equal(result, expected)
+
+    def test_to_coo(self):
+        sparse = pytest.importorskip("scipy.sparse")
+        ser = pd.Series([1, 2, 3],
+                        index=pd.MultiIndex.from_product([[0], [1, 2, 3]],
+                                                         names=['a', 'b']),
+                        dtype='Sparse[int]')
+        A, _, _ = ser.sparse.to_coo()
+        assert isinstance(A, sparse.coo.coo_matrix)
+
+    def test_non_sparse_raises(self):
+        ser = pd.Series([1, 2, 3])
+        with tm.assert_raises_regex(AttributeError, '.sparse'):
+            ser.sparse.density
+
 
 def test_setting_fill_value_fillna_still_works():
     # This is why letting users update fill_value / dtype is bad
diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py
index 307543eca2b3e..029a77acb121f 100644
--- a/pandas/tests/extension/base/setitem.py
+++ b/pandas/tests/extension/base/setitem.py
@@ -9,22 +9,25 @@
 
 class BaseSetitemTests(BaseExtensionTests):
-    def test_setitem_scalar_series(self, data):
-        arr = pd.Series(data)
-        arr[0] = data[1]
-        assert arr[0] == data[1]
-
-    def test_setitem_sequence(self, data):
-        arr = pd.Series(data)
+    def test_setitem_scalar_series(self, data, box_in_series):
+        if box_in_series:
+            data = pd.Series(data)
+        data[0] = data[1]
+        assert data[0] == data[1]
+
+    def test_setitem_sequence(self, data, box_in_series):
+        if box_in_series:
+            data = pd.Series(data)
         original = data.copy()
-        arr[[0, 1]] = [data[1], data[0]]
-        assert arr[0] == original[1]
-        assert arr[1] == original[0]
+        data[[0, 1]] = [data[1], data[0]]
+        assert data[0] == original[1]
+        assert data[1] == original[0]
 
     @pytest.mark.parametrize('as_array', [True, False])
     def test_setitem_sequence_mismatched_length_raises(self, data, as_array):
         ser = pd.Series(data)
+        original = ser.copy()
         value = [data[0]]
         if as_array:
             value = data._from_sequence(value)
@@ -32,22 +35,26 @@ def test_setitem_sequence_mismatched_length_raises(self, data, as_array):
         xpr = 'cannot set using a {} indexer with a different length'
         with tm.assert_raises_regex(ValueError, xpr.format('list-like')):
             ser[[0, 1]] = value
+        # Ensure no modifications made before the exception
+        self.assert_series_equal(ser, original)
 
         with tm.assert_raises_regex(ValueError, xpr.format('slice')):
             ser[slice(3)] = value
-
-    def test_setitem_empty_indxer(self, data):
-        ser = pd.Series(data)
-        original = ser.copy()
-        ser[[]] = []
         self.assert_series_equal(ser, original)
 
-    def test_setitem_sequence_broadcasts(self, data):
-        arr = pd.Series(data)
-
-        arr[[0, 1]] = data[2]
-        assert arr[0] == data[2]
-        assert arr[1] == data[2]
+    def test_setitem_empty_indexer(self, data, box_in_series):
+        if box_in_series:
+            data = pd.Series(data)
+        original = data.copy()
+        data[np.array([], dtype=int)] = []
+        self.assert_equal(data, original)
+
+    def test_setitem_sequence_broadcasts(self, data, box_in_series):
+        if box_in_series:
+            data = pd.Series(data)
+        data[[0, 1]] = data[2]
+        assert data[0] == data[2]
+        assert data[1] == data[2]
 
     @pytest.mark.parametrize('setter', ['loc', 'iloc'])
     def test_setitem_scalar(self, data, setter):
diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py
index 8e397d228a5b6..7758bd01840ae 100644
--- a/pandas/tests/extension/conftest.py
+++ b/pandas/tests/extension/conftest.py
@@ -98,3 +98,9 @@ def data_for_grouping():
     Where A < B < C and NA is missing
     """
     raise NotImplementedError
+
+
+@pytest.fixture(params=[True, False])
+def box_in_series(request):
+    """Whether to box the data in a Series"""
+    return request.param
diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py
index 2803db4f496a5..3b8d6e6c55ed1 100644
--- a/pandas/tests/frame/test_combine_concat.py
+++ b/pandas/tests/frame/test_combine_concat.py
@@ -759,7 +759,6 @@ def test_combine_first_timedelta(self):
         tm.assert_frame_equal(res, exp)
         assert res['TD'].dtype == 'timedelta64[ns]'
 
-    @pytest.mark.xfail(reason="GH-23079", strict=True)
     def test_combine_first_period(self):
         data1 = pd.PeriodIndex(['2011-01', 'NaT', '2011-03',
                                 '2011-04'], freq='M')
diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py
index 99058f883a392..d89baa41d33fe 100644
--- a/pandas/tests/indexes/test_category.py
+++ b/pandas/tests/indexes/test_category.py
@@ -1,16 +1,16 @@
 # -*- coding: utf-8 -*-
 
 import pytest
+import numpy as np
 
 import pandas.util.testing as tm
 from pandas.core.indexes.api import Index, CategoricalIndex
 from pandas.core.dtypes.dtypes import CategoricalDtype
+from pandas._libs import index as libindex
 from .common import Base
 
 from pandas.compat import range, PY3
 
-import numpy as np
-
 from pandas import Categorical, IntervalIndex, compat
 from pandas.util.testing import assert_almost_equal
 import pandas.core.config as cf
@@ -1117,3 +1117,23 @@ def test_take_invalid_kwargs(self):
         msg = "the 'mode' parameter is not supported"
         tm.assert_raises_regex(ValueError, msg, idx.take,
                                indices, mode='clip')
+
+    @pytest.mark.parametrize('dtype, engine_type', [
+        (np.int8, libindex.Int8Engine),
+        (np.int16, libindex.Int16Engine),
+        (np.int32, libindex.Int32Engine),
+        (np.int64, libindex.Int64Engine),
+    ])
+    def test_engine_type(self, dtype, engine_type):
+        if dtype != np.int64:
+            # num. of uniques required to push CategoricalIndex.codes to a
+            # dtype (128 categories required for .codes dtype to be int16 etc.)
+            num_uniques = {np.int8: 1, np.int16: 128, np.int32: 32768}[dtype]
+            ci = pd.CategoricalIndex(range(num_uniques))
+        else:
+            # having 2**32 - 2**31 categories would be very memory-intensive,
+            # so we cheat a bit with the dtype
+            ci = pd.CategoricalIndex(range(32768))  # == 2**16 - 2**(16 - 1)
+            ci.values._codes = ci.values._codes.astype('int64')
+        assert np.issubdtype(ci.codes.dtype, dtype)
+        assert isinstance(ci._engine, engine_type)
diff --git a/pandas/tests/indexing/conftest.py b/pandas/tests/indexing/conftest.py
new file mode 100644
index 0000000000000..be1cf4800a2ef
--- /dev/null
+++ b/pandas/tests/indexing/conftest.py
@@ -0,0 +1,20 @@
+import numpy as np
+import pytest
+
+from pandas._libs import index as libindex
+
+
+@pytest.fixture(params=[
+    (libindex.Int64Engine, np.int64),
+    (libindex.Int32Engine, np.int32),
+    (libindex.Int16Engine, np.int16),
+    (libindex.Int8Engine, np.int8),
+    (libindex.UInt64Engine, np.uint64),
+    (libindex.UInt32Engine, np.uint32),
+    (libindex.UInt16Engine, np.uint16),
+    (libindex.UInt8Engine, np.uint8),
+    (libindex.Float64Engine, np.float64),
+    (libindex.Float32Engine, np.float32),
+], ids=lambda x: x[0].__name__)
+def numeric_indexing_engine_type_and_dtype(request):
+    return request.param
diff --git a/pandas/tests/indexing/test_indexing_engines.py b/pandas/tests/indexing/test_indexing_engines.py
new file mode 100644
index 0000000000000..410eba99948ce
--- /dev/null
+++ b/pandas/tests/indexing/test_indexing_engines.py
@@ -0,0 +1,168 @@
+import numpy as np
+
+import pandas.util.testing as tm
+from pandas import compat
+from pandas._libs import algos as libalgos, index as libindex
+
+
+class TestNumericEngine(object):
+    def test_is_monotonic(self, numeric_indexing_engine_type_and_dtype):
+        engine_type, dtype = numeric_indexing_engine_type_and_dtype
+        num = 1000
+        arr = np.array([1] * num + [2] * num + [3] * num, dtype=dtype)
+
+        # monotonic increasing
+        engine = engine_type(lambda: arr, len(arr))
+        assert engine.is_monotonic_increasing is True
+        assert engine.is_monotonic_decreasing is False
+
+        # monotonic decreasing
+        engine = engine_type(lambda: arr[::-1], len(arr))
+        assert engine.is_monotonic_increasing is False
+        assert engine.is_monotonic_decreasing is True
+
+        # neither monotonic increasing nor decreasing
+        arr = np.array([1] * num + [2] * num + [1] * num, dtype=dtype)
+        engine = engine_type(lambda: arr[::-1], len(arr))
+        assert engine.is_monotonic_increasing is False
+        assert engine.is_monotonic_decreasing is False
+
+    def test_is_unique(self, numeric_indexing_engine_type_and_dtype):
+        engine_type, dtype = numeric_indexing_engine_type_and_dtype
+
+        # unique
+        arr = np.array([1, 3, 2], dtype=dtype)
+        engine = engine_type(lambda: arr, len(arr))
+        assert engine.is_unique is True
+
+        # not unique
+        arr = np.array([1, 2, 1], dtype=dtype)
+        engine = engine_type(lambda: arr, len(arr))
+        assert engine.is_unique is False
+
+    def test_get_loc(self, numeric_indexing_engine_type_and_dtype):
+        engine_type, dtype = numeric_indexing_engine_type_and_dtype
+
+        # unique
+        arr = np.array([1, 2, 3], dtype=dtype)
+        engine = engine_type(lambda: arr, len(arr))
+        assert engine.get_loc(2) == 1
+
+        # monotonic
+        num = 1000
+        arr = np.array([1] * num + [2] * num + [3] * num, dtype=dtype)
+        engine = engine_type(lambda: arr, len(arr))
+        assert engine.get_loc(2) == slice(1000, 2000)
+
+        # not monotonic
+        arr = np.array([1, 2, 3] * num, dtype=dtype)
+        engine = engine_type(lambda: arr, len(arr))
+        expected = np.array([False, True, False] * num, dtype=bool)
+        result = engine.get_loc(2)
+        assert (result == expected).all()
+
+    def test_get_backfill_indexer(
+            self, numeric_indexing_engine_type_and_dtype):
+        engine_type, dtype = numeric_indexing_engine_type_and_dtype
+
+        arr = np.array([1, 5, 10], dtype=dtype)
+        engine = engine_type(lambda: arr, len(arr))
+
+        new = np.array(compat.range(12), dtype=dtype)
+        result = engine.get_backfill_indexer(new)
+
+        expected = libalgos.backfill(arr, new)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_get_pad_indexer(
+            self, numeric_indexing_engine_type_and_dtype):
+        engine_type, dtype = numeric_indexing_engine_type_and_dtype
+
+        arr = np.array([1, 5, 10], dtype=dtype)
+        engine = engine_type(lambda: arr, len(arr))
+
+        new = np.array(compat.range(12), dtype=dtype)
+        result = engine.get_pad_indexer(new)
+
+        expected = libalgos.pad(arr, new)
+        tm.assert_numpy_array_equal(result, expected)
+
+
+class TestObjectEngine(object):
+    engine_type = libindex.ObjectEngine
+    dtype = np.object_
+    values = list('abc')
+
+    def test_is_monotonic(self):
+
+        num = 1000
+        arr = np.array(['a'] * num + ['a'] * num + ['c'] * num,
+                       dtype=self.dtype)
+
+        # monotonic increasing
+        engine = self.engine_type(lambda: arr, len(arr))
+        assert engine.is_monotonic_increasing is True
+        assert engine.is_monotonic_decreasing is False
+
+        # monotonic decreasing
+        engine = self.engine_type(lambda: arr[::-1], len(arr))
+        assert engine.is_monotonic_increasing is False
+        assert engine.is_monotonic_decreasing is True
+
+        # neither monotonic increasing nor decreasing
+        arr = np.array(['a'] * num + ['b'] * num + ['a'] * num,
+                       dtype=self.dtype)
+        engine = self.engine_type(lambda: arr[::-1], len(arr))
+        assert engine.is_monotonic_increasing is False
+        assert engine.is_monotonic_decreasing is False
+
+    def test_is_unique(self):
+        # unique
+        arr = np.array(self.values, dtype=self.dtype)
+        engine = self.engine_type(lambda: arr, len(arr))
+        assert engine.is_unique is True
+
+        # not unique
+        arr = np.array(['a', 'b', 'a'], dtype=self.dtype)
+        engine = self.engine_type(lambda: arr, len(arr))
+        assert engine.is_unique is False
+
+    def test_get_loc(self):
+        # unique
+        arr = np.array(self.values, dtype=self.dtype)
+        engine = self.engine_type(lambda: arr, len(arr))
+        assert engine.get_loc('b') == 1
+
+        # monotonic
+        num = 1000
+        arr = np.array(['a'] * num + ['b'] * num + ['c'] * num,
+                       dtype=self.dtype)
+        engine = self.engine_type(lambda: arr, len(arr))
+        assert engine.get_loc('b') == slice(1000, 2000)
+
+        # not monotonic
+        arr = np.array(self.values * num, dtype=self.dtype)
+        engine = self.engine_type(lambda: arr, len(arr))
+        expected = np.array([False, True, False] * num, dtype=bool)
+        result = engine.get_loc('b')
+        assert (result == expected).all()
+
+    def test_get_backfill_indexer(self):
+        arr = np.array(['a', 'e', 'j'], dtype=self.dtype)
+        engine = self.engine_type(lambda: arr, len(arr))
+
+        new = np.array(list('abcdefghij'), dtype=self.dtype)
+        result = engine.get_backfill_indexer(new)
+
+        expected = libalgos.backfill_object(arr, new)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_get_pad_indexer(self):
+        arr = np.array(['a', 'e', 'j'], dtype=self.dtype)
+        engine = self.engine_type(lambda: arr, len(arr))
+
+        new = np.array(list('abcdefghij'), dtype=self.dtype)
+        result = engine.get_pad_indexer(new)
+
+        expected = libalgos.pad_object(arr, new)
+        tm.assert_numpy_array_equal(result, expected)
diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py
index fcae4051dc471..6bf832fb9dc6d 100644
--- a/scripts/tests/test_validate_docstrings.py
+++ b/scripts/tests/test_validate_docstrings.py
@@ -334,6 +334,33 @@ def method(self, foo=None, bar=None):
         pass
 
 
+class BadSeeAlso(object):
+
+    def desc_no_period(self):
+        """
+        Return the first 5 elements of the Series.
+
+        See Also
+        --------
+        Series.tail : Return the last 5 elements of the Series.
+        Series.iloc : Return a slice of the elements in the Series,
+            which can also be used to return the first or last n
+        """
+        pass
+
+    def desc_first_letter_lowercase(self):
+        """
+        Return the first 5 elements of the Series.
+
+        See Also
+        --------
+        Series.tail : return the last 5 elements of the Series.
+        Series.iloc : Return a slice of the elements in the Series,
+            which can also be used to return the first or last n.
+        """
+        pass
+
+
 class BadSummaries(object):
 
     def wrong_line(self):
@@ -608,6 +635,11 @@ def test_bad_generic_functions(self, func):
         assert errors
 
     @pytest.mark.parametrize("klass,func,msgs", [
+        # See Also tests
+        ('BadSeeAlso', 'desc_no_period',
+         ('Missing period at end of description for See Also "Series.iloc"',)),
+        ('BadSeeAlso', 'desc_first_letter_lowercase',
+         ('should be capitalized for See Also "Series.tail"',)),
         # Summary tests
         ('BadSummaries', 'wrong_line',
          ('should start in the line immediately after the opening quotes',)),
diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py
index c571827db70f8..2fef3332de55c 100755
--- a/scripts/validate_docstrings.py
+++ b/scripts/validate_docstrings.py
@@ -505,7 +505,14 @@ def validate_one(func_name):
         wrns.append('See Also section not found')
     else:
         for rel_name, rel_desc in doc.see_also.items():
-            if not rel_desc:
+            if rel_desc:
+                if not rel_desc.endswith('.'):
+                    errs.append('Missing period at end of description for '
+                                'See Also "{}" reference'.format(rel_name))
+                if not rel_desc[0].isupper():
+                    errs.append('Description should be capitalized for '
+                                'See Also "{}" reference'.format(rel_name))
+            else:
                 errs.append('Missing description for '
                             'See Also "{}" reference'.format(rel_name))
 
diff --git a/setup.cfg b/setup.cfg
index 6bedef1af5b4a..bceb517a1a787 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -167,12 +167,6 @@ skip=
     pandas/core/internals/concat.py,
     pandas/core/internals/managers.py,
     pandas/core/internals/blocks.py,
-    pandas/core/groupby/ops.py,
-    pandas/core/groupby/categorical.py,
-    pandas/core/groupby/generic.py,
-    pandas/core/groupby/groupby.py,
-    pandas/core/groupby/grouper.py,
-    pandas/core/groupby/base.py,
     pandas/core/reshape/concat.py,
     pandas/core/reshape/tile.py,
     pandas/core/reshape/melt.py,
@@ -181,8 +175,6 @@ skip=
     pandas/core/reshape/merge.py,
     pandas/core/reshape/reshape.py,
     pandas/core/reshape/pivot.py,
-    pandas/core/sparse/array.py,
-    pandas/core/arrays/sparse.py,
     pandas/core/sparse/api.py,
    pandas/core/sparse/series.py,
    pandas/core/sparse/frame.py,
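
A quick usage sketch of the Series.sparse accessor exercised by the tests above. This is a minimal illustration, assuming a pandas build with this patch applied (0.24 development era) and scipy installed; the variable names (A, ser) are illustrative only:

    import pandas as pd
    from scipy import sparse

    # Round-trip a scipy COO matrix through the accessor.
    A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])),
                          shape=(3, 4))
    ser = pd.Series.sparse.from_coo(A)   # MultiIndex of (row, col) coordinates

    # Attribute pass-throughs covered by TestAccessor.test_get_attributes:
    print(ser.sparse.density)   # fraction of points that are not fill_value
    print(ser.sparse.npoints)   # count of stored (non-fill) values

    # Back to COO; the default row_levels/column_levels split the two
    # levels of the MultiIndex into rows and columns.
    A2, rows, cols = ser.sparse.to_coo()

And the gist of the See Also checks added to scripts/validate_docstrings.py, restated as a self-contained helper for clarity. check_see_also is a hypothetical name; in the script the checks run inline in validate_one() and append to its errs list:

    def check_see_also(see_also):
        # see_also maps reference name -> description string
        errs = []
        for rel_name, rel_desc in see_also.items():
            if rel_desc:
                if not rel_desc.endswith('.'):
                    errs.append('Missing period at end of description for '
                                'See Also "{}" reference'.format(rel_name))
                if not rel_desc[0].isupper():
                    errs.append('Description should be capitalized for '
                                'See Also "{}" reference'.format(rel_name))
            else:
                errs.append('Missing description for '
                            'See Also "{}" reference'.format(rel_name))
        return errs

    # check_see_also({'Series.tail': 'return the last 5 elements'})
    # -> flags both the missing period and the lowercase first letter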