From fb7af6e257d5ca162487ea417eae675e3edbe271 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Wed, 22 Mar 2017 07:58:28 -0400
Subject: [PATCH] CLN: move groupby algos separate cython lib

- separate out groupby algorithms to separate lib
- release GIL on median
- release GIL on is_lexsorted / fix memory leak
- release GIL on nancorr

Author: Jeff Reback <jeff@reback.net>

Closes #15775 from jreback/groupby and squashes the following commits:

4e2bfec [Jeff Reback] release GIL on median; release GIL on is_lexsorted / fix memory leak; release GIL on nancorr
ce28bb5 [Jeff Reback] CLN: separate out groupby algorithms to separate lib
---
 pandas/_libs/algos.pxd                        |  13 +
 pandas/_libs/algos.pyx                        | 530 +++++------------
 pandas/_libs/groupby.pyx                      | 291 ++++++++++
 ...by_helper.pxi.in => groupby_helper.pxi.in} |  18 +-
 pandas/core/groupby.py                        |  10 +-
 pandas/tests/groupby/test_bin_groupby.py      |   5 +-
 pandas/tests/groupby/test_transform.py        |  14 +-
 pandas/tests/test_algos.py                    |   7 +-
 setup.py                                      |   8 +-
 9 files changed, 474 insertions(+), 422 deletions(-)
 create mode 100644 pandas/_libs/algos.pxd
 create mode 100644 pandas/_libs/groupby.pyx
 rename pandas/_libs/{algos_groupby_helper.pxi.in => groupby_helper.pxi.in} (98%)

diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd
new file mode 100644
index 0000000000000..6d80e6f0073eb
--- /dev/null
+++ b/pandas/_libs/algos.pxd
@@ -0,0 +1,13 @@
+from util cimport numeric
+from numpy cimport float64_t, double_t
+
+cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil
+
+cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil:
+    cdef numeric t
+
+    # cython doesn't allow pointer dereference so use array syntax
+    t = a[0]
+    a[0] = b[0]
+    b[0] = t
+    return 0
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 7d3ce3280ec1e..897a60e0c2f21 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -96,22 +96,94 @@ class NegInfinity(object):
     __ge__ = lambda self, other: self is other
 
-cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil except -1:
-    cdef numeric t
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def is_lexsorted(list list_of_arrays):
+    cdef:
+        int i
+        Py_ssize_t n, nlevels
+        int64_t k, cur, pre
+        ndarray arr
+        bint result = True
+
+    nlevels = len(list_of_arrays)
+    n = len(list_of_arrays[0])
+
+    cdef int64_t **vecs = <int64_t**> malloc(nlevels * sizeof(int64_t*))
+    for i in range(nlevels):
+        arr = list_of_arrays[i]
+        vecs[i] = <int64_t*> arr.data
+
+    # Assume uniqueness??
+    with nogil:
+        for i in range(1, n):
+            for k in range(nlevels):
+                cur = vecs[k][i]
+                pre = vecs[k][i - 1]
+                if cur == pre:
+                    continue
+                elif cur > pre:
+                    break
+                else:
+                    result = False
+                    break
+    free(vecs)
+    return result
+
+
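The nogil loop above checks that the label arrays of a MultiIndex are already in lexicographic order, level by level; the result flag plus the unconditional free(vecs) is what fixes the memory leak of the early-return version removed further down. A pure-Python sketch of the same contract (is_lexsorted_reference is a hypothetical helper, not part of the patch):

import numpy as np

def is_lexsorted_reference(list_of_arrays):
    # adjacent label tuples must be non-decreasing when compared
    # level by level, which is exactly what the pointer loop checks
    rows = list(zip(*list_of_arrays))
    return all(prev <= cur for prev, cur in zip(rows, rows[1:]))

assert is_lexsorted_reference([np.array([0, 0, 1]), np.array([0, 1, 0])])
assert not is_lexsorted_reference([np.array([0, 1, 0]), np.array([0, 0, 1])])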
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups):
+    """
+    compute a 1-d indexer that is an ordering of the passed index,
+    ordered by the groups. This is a reverse of the label
+    factorization process.
+
+    Parameters
+    ----------
+    index: int64 ndarray
+        mappings from group -> position
+    ngroups: int64
+        number of groups
+
+    return a tuple of (1-d indexer ordered by groups, group counts)
+    """
+
+    cdef:
+        Py_ssize_t i, loc, label, n
+        ndarray[int64_t] counts, where, result
+
+    counts = np.zeros(ngroups + 1, dtype=np.int64)
+    n = len(index)
+    result = np.zeros(n, dtype=np.int64)
+    where = np.zeros(ngroups + 1, dtype=np.int64)
+
+    with nogil:
+
+        # count group sizes, location 0 for NA
+        for i in range(n):
+            counts[index[i] + 1] += 1
 
-    # cython doesn't allow pointer dereference so use array syntax
-    t = a[0]
-    a[0] = b[0]
-    b[0] = t
-    return 0
+        # mark the start of each contiguous group of like-indexed data
+        for i in range(1, ngroups + 1):
+            where[i] = where[i - 1] + counts[i - 1]
+
+        # this is our indexer
+        for i in range(n):
+            label = index[i] + 1
+            result[where[label]] = i
+            where[label] += 1
+
+    return result, counts
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k):
+cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil:
     cdef:
-        Py_ssize_t i, j, l, m, n = a.size
+        Py_ssize_t i, j, l, m, n = a.shape[0]
         numeric x
 
     with nogil:
         l = 0
         m = n - 1
@@ -132,32 +204,6 @@ cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k):
             if j < k: l = i
             if k < i: m = j
-        return a[k]
-
-
-cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
-    cdef:
-        Py_ssize_t i, j, l, m
-        double_t x, t
-
-    l = 0
-    m = n -1
-    while (l < m):
-        x = a[k]
-        i = l
-        j = m
-
-        while 1:
-            while a[i] < x: i += 1
-            while x < a[j]: j -= 1
-            if i <= j:
-                swap(&a[i], &a[j])
-                i += 1; j -= 1
-
-            if i > j: break
-
-        if j < k: l = i
-        if k < i: m = j
     return a[k]
 
@@ -181,6 +227,8 @@ cpdef numeric median(numeric[:] arr):
 
 # -------------- Min, Max subsequence
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
 def max_subseq(ndarray[double_t] arr):
     cdef:
         Py_ssize_t i=0, s=0, e=0, T, n
@@ -195,21 +243,24 @@ def max_subseq(ndarray[double_t] arr):
         S = m
         T = 0
 
-    for i in range(1, n):
-        # S = max { S + A[i], A[i] )
-        if (S > 0):
-            S = S + arr[i]
-        else:
-            S = arr[i]
-            T = i
-        if S > m:
-            s = T
-            e = i
-            m = S
+    with nogil:
+        for i in range(1, n):
+            # S = max { S + A[i], A[i] )
+            if (S > 0):
+                S = S + arr[i]
+            else:
+                S = arr[i]
+                T = i
+            if S > m:
+                s = T
+                e = i
+                m = S
 
     return (s, e, m)
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
 def min_subseq(ndarray[double_t] arr):
     cdef:
         Py_ssize_t s, e
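groupsort_indexer is a counting sort on the group labels (label -1, i.e. NA, is counted in slot 0), so the returned indexer regroups rows by group without a full argsort. Assuming a built pandas at this revision, a worked example of the contract:

import numpy as np
from pandas._libs import algos

labels = np.array([1, 0, 1, 0, 2], dtype=np.int64)
indexer, counts = algos.groupsort_indexer(labels, 3)
print(indexer)          # [1 3 0 2 4] -- label-0 rows first, then 1, then 2
print(counts)           # [0 2 2 1]   -- slot 0 counts NA labels
print(labels[indexer])  # [0 0 1 1 2]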
@@ -225,9 +276,10 @@ def min_subseq(ndarray[double_t] arr):
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def nancorr(ndarray[float64_t, ndim=2] mat, cov=False, minp=None):
+def nancorr(ndarray[float64_t, ndim=2] mat, bint cov=0, minp=None):
     cdef:
         Py_ssize_t i, j, xi, yi, N, K
+        bint minpv
         ndarray[float64_t, ndim=2] result
         ndarray[uint8_t, ndim=2] mask
         int64_t nobs = 0
@@ -236,46 +288,49 @@ def nancorr(ndarray[float64_t, ndim=2] mat, cov=False, minp=None):
     N, K = (<object> mat).shape
 
     if minp is None:
-        minp = 1
+        minpv = 1
+    else:
+        minpv = minp
 
     result = np.empty((K, K), dtype=np.float64)
     mask = np.isfinite(mat).view(np.uint8)
 
-    for xi in range(K):
-        for yi in range(xi + 1):
-            nobs = sumxx = sumyy = sumx = sumy = 0
-            for i in range(N):
-                if mask[i, xi] and mask[i, yi]:
-                    vx = mat[i, xi]
-                    vy = mat[i, yi]
-                    nobs += 1
-                    sumx += vx
-                    sumy += vy
-
-            if nobs < minp:
-                result[xi, yi] = result[yi, xi] = np.NaN
-            else:
-                meanx = sumx / nobs
-                meany = sumy / nobs
-
-                # now the cov numerator
-                sumx = 0
-
+    with nogil:
+        for xi in range(K):
+            for yi in range(xi + 1):
+                nobs = sumxx = sumyy = sumx = sumy = 0
                 for i in range(N):
                     if mask[i, xi] and mask[i, yi]:
-                        vx = mat[i, xi] - meanx
-                        vy = mat[i, yi] - meany
+                        vx = mat[i, xi]
+                        vy = mat[i, yi]
+                        nobs += 1
+                        sumx += vx
+                        sumy += vy
+
+                if nobs < minpv:
+                    result[xi, yi] = result[yi, xi] = NaN
+                else:
+                    meanx = sumx / nobs
+                    meany = sumy / nobs
 
-                        sumx += vx * vy
-                        sumxx += vx * vx
-                        sumyy += vy * vy
+                    # now the cov numerator
+                    sumx = 0
 
-                divisor = (nobs - 1.0) if cov else sqrt(sumxx * sumyy)
+                    for i in range(N):
+                        if mask[i, xi] and mask[i, yi]:
+                            vx = mat[i, xi] - meanx
+                            vy = mat[i, yi] - meany
 
-                if divisor != 0:
-                    result[xi, yi] = result[yi, xi] = sumx / divisor
-                else:
-                    result[xi, yi] = result[yi, xi] = np.NaN
+                            sumx += vx * vy
+                            sumxx += vx * vx
+                            sumyy += vy * vy
+
+                    divisor = (nobs - 1.0) if cov else sqrt(sumxx * sumyy)
+
+                    if divisor != 0:
+                        result[xi, yi] = result[yi, xi] = sumx / divisor
+                    else:
+                        result[xi, yi] = result[yi, xi] = NaN
 
     return result
 
@@ -308,7 +363,7 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1):
                     nobs += 1
 
             if nobs < minp:
-                result[xi, yi] = result[yi, xi] = np.NaN
+                result[xi, yi] = result[yi, xi] = NaN
             else:
                 maskedx = np.empty(nobs, dtype=np.float64)
                 maskedy = np.empty(nobs, dtype=np.float64)
@@ -339,326 +394,11 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1):
             if divisor != 0:
                 result[xi, yi] = result[yi, xi] = sumx / divisor
             else:
-                result[xi, yi] = result[yi, xi] = np.NaN
+                result[xi, yi] = result[yi, xi] = NaN
 
     return result
 
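nancorr now copies minp into a C variable (minpv) so the whole double loop can run with the GIL released; the semantics are pairwise-complete correlation (or covariance with cov=1): each pair of columns uses only the rows where both entries are finite, and falls back to NaN below minp such rows. A NumPy-level sketch of that contract (illustrative helper, not the patch's code):

import numpy as np

def nancorr_reference(mat, minp=1):
    # pairwise-complete correlation, mirroring the masked loops above
    n_cols = mat.shape[1]
    out = np.full((n_cols, n_cols), np.nan)
    finite = np.isfinite(mat)
    for x in range(n_cols):
        for y in range(x + 1):
            both = finite[:, x] & finite[:, y]
            if both.sum() >= minp:
                out[x, y] = out[y, x] = np.corrcoef(mat[both, x],
                                                    mat[both, y])[0, 1]
    return out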
 
-#----------------------------------------------------------------------
-# group operations
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def is_lexsorted(list list_of_arrays):
-    cdef:
-        int i
-        Py_ssize_t n, nlevels
-        int64_t k, cur, pre
-        ndarray arr
-
-    nlevels = len(list_of_arrays)
-    n = len(list_of_arrays[0])
-
-    cdef int64_t **vecs = <int64_t**> malloc(nlevels * sizeof(int64_t*))
-    for i from 0 <= i < nlevels:
-        arr = list_of_arrays[i]
-        vecs[i] = <int64_t*> arr.data
-
-    # Assume uniqueness??
-    for i from 1 <= i < n:
-        for k from 0 <= k < nlevels:
-            cur = vecs[k][i]
-            pre = vecs[k][i - 1]
-            if cur == pre:
-                continue
-            elif cur > pre:
-                break
-            else:
-                return False
-    free(vecs)
-    return True
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups):
-    """
-    compute a 1-d indexer that is an ordering of the passed index,
-    ordered by the groups. This is a reverse of the label
-    factorization process.
-
-    Parameters
-    ----------
-    index: int64 ndarray
-        mappings from group -> position
-    ngroups: int64
-        number of groups
-
-    return a tuple of (1-d indexer ordered by groups, group counts)
-    """
-
-    cdef:
-        Py_ssize_t i, loc, label, n
-        ndarray[int64_t] counts, where, result
-
-    counts = np.zeros(ngroups + 1, dtype=np.int64)
-    n = len(index)
-    result = np.zeros(n, dtype=np.int64)
-    where = np.zeros(ngroups + 1, dtype=np.int64)
-
-    with nogil:
-
-        # count group sizes, location 0 for NA
-        for i from 0 <= i < n:
-            counts[index[i] + 1] += 1
-
-        # mark the start of each contiguous group of like-indexed data
-        for i from 1 <= i < ngroups + 1:
-            where[i] = where[i - 1] + counts[i - 1]
-
-        # this is our indexer
-        for i from 0 <= i < n:
-            label = index[i] + 1
-            result[where[label]] = i
-            where[label] += 1
-
-    return result, counts
-
-# TODO: aggregate multiple columns in single pass
-#----------------------------------------------------------------------
-# first, nth, last
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_nth_object(ndarray[object, ndim=2] out,
-                     ndarray[int64_t] counts,
-                     ndarray[object, ndim=2] values,
-                     ndarray[int64_t] labels,
-                     int64_t rank):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        object val
-        float64_t count
-        ndarray[int64_t, ndim=2] nobs
-        ndarray[object, ndim=2] resx
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty((<object> out).shape, dtype=object)
-
-    N, K = (<object> values).shape
-
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            if val == val:
-                nobs[lab, j] += 1
-                if nobs[lab, j] == rank:
-                    resx[lab, j] = val
-
-    for i in range(len(counts)):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = resx[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_nth_bin_object(ndarray[object, ndim=2] out,
-                         ndarray[int64_t] counts,
-                         ndarray[object, ndim=2] values,
-                         ndarray[int64_t] bins, int64_t rank):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, ngroups, b
-        object val
-        float64_t count
-        ndarray[object, ndim=2] resx
-        ndarray[float64_t, ndim=2] nobs
-
-    nobs = np.zeros((<object> out).shape, dtype=np.float64)
-    resx = np.empty((<object> out).shape, dtype=object)
-
-    if len(bins) == 0:
-        return
-    if bins[len(bins) - 1] == len(values):
-        ngroups = len(bins)
-    else:
-        ngroups = len(bins) + 1
-
-    N, K = (<object> values).shape
-
-    b = 0
-    for i in range(N):
-        while b < ngroups - 1 and i >= bins[b]:
-            b += 1
-
-        counts[b] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            if val == val:
-                nobs[b, j] += 1
-                if nobs[b, j] == rank:
-                    resx[b, j] = val
-
-    for i in range(ngroups):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = resx[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_last_object(ndarray[object, ndim=2] out,
-                      ndarray[int64_t] counts,
-                      ndarray[object, ndim=2] values,
-                      ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        object val
-        float64_t count
-        ndarray[object, ndim=2] resx
-        ndarray[int64_t, ndim=2] nobs
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty((<object> out).shape, dtype=object)
-
-    N, K = (<object> values).shape
-
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            if val == val:
-                nobs[lab, j] += 1
-                resx[lab, j] = val
-
-    for i in range(len(counts)):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = resx[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_last_bin_object(ndarray[object, ndim=2] out,
-                          ndarray[int64_t] counts,
-                          ndarray[object, ndim=2] values,
-                          ndarray[int64_t] bins):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, ngroups, b
-        object val
-        float64_t count
-        ndarray[object, ndim=2] resx
-        ndarray[float64_t, ndim=2] nobs
-
-    nobs = np.zeros((<object> out).shape, dtype=np.float64)
-    resx = np.empty((<object> out).shape, dtype=object)
-
-    if len(bins) == 0:
-        return
-    if bins[len(bins) - 1] == len(values):
-        ngroups = len(bins)
-    else:
-        ngroups = len(bins) + 1
-
-    N, K = (<object> values).shape
-
-    b = 0
-    for i in range(N):
-        while b < ngroups - 1 and i >= bins[b]:
-            b += 1
-
-        counts[b] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            if val == val:
-                nobs[b, j] += 1
-                resx[b, j] = val
-
-    for i in range(ngroups):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = resx[i, j]
-
-cdef inline float64_t _median_linear(float64_t* a, int n):
-    cdef int i, j, na_count = 0
-    cdef float64_t result
-    cdef float64_t* tmp
-
-    if n == 0:
-        return NaN
-
-    # count NAs
-    for i in range(n):
-        if a[i] != a[i]:
-            na_count += 1
-
-    if na_count:
-        if na_count == n:
-            return NaN
-
-        tmp = <float64_t*> malloc((n - na_count) * sizeof(float64_t))
-
-        j = 0
-        for i in range(n):
-            if a[i] == a[i]:
-                tmp[j] = a[i]
-                j += 1
-
-        a = tmp
-        n -= na_count
-
-    if n % 2:
-        result = kth_smallest_c(<float64_t*> a, n / 2, n)
-    else:
-        result = (kth_smallest_c(a, n / 2, n) +
-                  kth_smallest_c(a, n / 2 - 1, n)) / 2
-
-    if na_count:
-        free(a)
-
-    return result
-
-
 # generated from template
 include "algos_common_helper.pxi"
-include "algos_groupby_helper.pxi"
 include "algos_rank_helper.pxi"
 include "algos_take_helper.pxi"
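Everything deleted above reappears, essentially byte for byte, in the new pandas/_libs/groupby.pyx below; only swap and kth_smallest stay behind in algos and are shared through the new algos.pxd cimport interface. After a build of this revision the split is observable from Python (illustrative check, not part of the patch):

from pandas._libs import algos, groupby

# object-dtype group kernels now live in the groupby extension ...
assert hasattr(groupby, 'group_nth_object')
assert not hasattr(algos, 'group_nth_object')
# ... while groupsort_indexer stays in algos (groupby.pyx imports it)
assert hasattr(algos, 'groupsort_indexer')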
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
new file mode 100644
index 0000000000000..c6ff602cfef1c
--- /dev/null
+++ b/pandas/_libs/groupby.pyx
@@ -0,0 +1,291 @@
+# cython: profile=False
+
+from numpy cimport *
+cimport numpy as np
+import numpy as np
+
+cimport cython
+
+import_array()
+
+cimport util
+
+from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
+                    uint32_t, uint64_t, float16_t, float32_t, float64_t)
+
+from libc.stdlib cimport malloc, free
+
+from util cimport numeric, get_nat
+from algos cimport swap
+from algos import take_2d_axis1_float64_float64, groupsort_indexer
+
+cdef int64_t iNaT = get_nat()
+
+cdef double NaN = <double> np.NaN
+cdef double nan = NaN
+
+
+# TODO: aggregate multiple columns in single pass
+#----------------------------------------------------------------------
+# first, nth, last
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_nth_object(ndarray[object, ndim=2] out,
+                     ndarray[int64_t] counts,
+                     ndarray[object, ndim=2] values,
+                     ndarray[int64_t] labels,
+                     int64_t rank):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        object val
+        float64_t count
+        ndarray[int64_t, ndim=2] nobs
+        ndarray[object, ndim=2] resx
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty((<object> out).shape, dtype=object)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[lab, j] += 1
+                if nobs[lab, j] == rank:
+                    resx[lab, j] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_nth_bin_object(ndarray[object, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[object, ndim=2] values,
+                         ndarray[int64_t] bins, int64_t rank):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        object val
+        float64_t count
+        ndarray[object, ndim=2] resx
+        ndarray[float64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.float64)
+    resx = np.empty((<object> out).shape, dtype=object)
+
+    if len(bins) == 0:
+        return
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[b, j] += 1
+                if nobs[b, j] == rank:
+                    resx[b, j] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
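group_nth_object keeps, per (group, column) cell, the rank-th non-null value seen; rank=1 is what backs first()-style reductions for object dtype. At the user level (real pandas API, small illustration):

import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [None, 'x', 'y']})
# rank=1 corresponds to the first non-null value in each group
print(df.groupby('key')['val'].first())   # a -> 'x', b -> 'y'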
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_last_object(ndarray[object, ndim=2] out,
+                      ndarray[int64_t] counts,
+                      ndarray[object, ndim=2] values,
+                      ndarray[int64_t] labels):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        object val
+        float64_t count
+        ndarray[object, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty((<object> out).shape, dtype=object)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[lab, j] += 1
+                resx[lab, j] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_last_bin_object(ndarray[object, ndim=2] out,
+                          ndarray[int64_t] counts,
+                          ndarray[object, ndim=2] values,
+                          ndarray[int64_t] bins):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        object val
+        float64_t count
+        ndarray[object, ndim=2] resx
+        ndarray[float64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.float64)
+    resx = np.empty((<object> out).shape, dtype=object)
+
+    if len(bins) == 0:
+        return
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[b, j] += 1
+                resx[b, j] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+
+cdef inline float64_t _median_linear(float64_t* a, int n) nogil:
+    cdef int i, j, na_count = 0
+    cdef float64_t result
+    cdef float64_t* tmp
+
+    if n == 0:
+        return NaN
+
+    # count NAs
+    for i in range(n):
+        if a[i] != a[i]:
+            na_count += 1
+
+    if na_count:
+        if na_count == n:
+            return NaN
+
+        tmp = <float64_t*> malloc((n - na_count) * sizeof(float64_t))
+
+        j = 0
+        for i in range(n):
+            if a[i] == a[i]:
+                tmp[j] = a[i]
+                j += 1
+
+        a = tmp
+        n -= na_count
+
+    if n % 2:
+        result = kth_smallest_c(<float64_t*> a, n / 2, n)
+    else:
+        result = (kth_smallest_c(a, n / 2, n) +
+                  kth_smallest_c(a, n / 2 - 1, n)) / 2
+
+    if na_count:
+        free(a)
+
+    return result
+
+
+cdef inline float64_t kth_smallest_c(float64_t* a,
+                                     Py_ssize_t k,
+                                     Py_ssize_t n) nogil:
+    cdef:
+        Py_ssize_t i, j, l, m
+        double_t x, t
+
+    l = 0
+    m = n -1
+    while (l < m):
+        x = a[k]
+        i = l
+        j = m
+
+        while 1:
+            while a[i] < x: i += 1
+            while x < a[j]: j -= 1
+            if i <= j:
+                swap(&a[i], &a[j])
+                i += 1; j -= 1
+
+            if i > j: break
+
+        if j < k: l = i
+        if k < i: m = j
+    return a[k]
+
+
+# generated from template
+include "groupby_helper.pxi"
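_median_linear compacts away NaNs and then takes the middle order statistic (or the mean of the two middle ones) via kth_smallest_c, a Hoare-style quickselect that now runs with the GIL released. The selection rule in plain Python (sketch; sorted() stands in for the O(n) select):

def median_reference(values):
    vals = sorted(v for v in values if v == v)  # v == v drops NaNs
    if not vals:
        return float('nan')
    n = len(vals)
    if n % 2:
        return vals[n // 2]
    return (vals[n // 2] + vals[n // 2 - 1]) / 2

assert median_reference([1.0, 4.0, 2.0, float('nan')]) == 2.0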
diff --git a/pandas/_libs/algos_groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
similarity index 98%
rename from pandas/_libs/algos_groupby_helper.pxi.in
rename to pandas/_libs/groupby_helper.pxi.in
index e2c263f49b110..d38b677df321c 100644
--- a/pandas/_libs/algos_groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -681,6 +681,8 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 #----------------------------------------------------------------------
 
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
 def group_median_float64(ndarray[float64_t, ndim=2] out,
                          ndarray[int64_t] counts,
                          ndarray[float64_t, ndim=2] values,
@@ -704,13 +706,15 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
 
     take_2d_axis1_float64_float64(values.T, indexer, out=data)
 
-    for i in range(K):
-        # exclude NA group
-        ptr += _counts[0]
-        for j in range(ngroups):
-            size = _counts[j + 1]
-            out[j, i] = _median_linear(ptr, size)
-            ptr += size
+    with nogil:
+
+        for i in range(K):
+            # exclude NA group
+            ptr += _counts[0]
+            for j in range(ngroups):
+                size = _counts[j + 1]
+                out[j, i] = _median_linear(ptr, size)
+                ptr += size
 
 
 @cython.boundscheck(False)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 0a63981290df3..727af8b8cd3eb 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -60,7 +60,7 @@
 import pandas.core.common as com
 from pandas.core.config import option_context
 
-from pandas._libs import lib, algos as libalgos, Timestamp, NaT, iNaT
+from pandas._libs import lib, groupby as libgroupby, Timestamp, NaT, iNaT
 from pandas._libs.lib import count_level_2d
 
 _doc_template = """
@@ -1474,7 +1474,7 @@ def shift(self, periods=1, freq=None, axis=0):
 
         # filled in by Cython
         indexer = np.zeros_like(labels)
-        libalgos.group_shift_indexer(indexer, labels, ngroups, periods)
+        libgroupby.group_shift_indexer(indexer, labels, ngroups, periods)
 
         output = {}
         for name, obj in self._iterate_slices():
@@ -1815,13 +1815,13 @@ def _get_cython_function(self, kind, how, values, is_numeric):
 
         def get_func(fname):
             # see if there is a fused-type version of function
             # only valid for numeric
-            f = getattr(libalgos, fname, None)
+            f = getattr(libgroupby, fname, None)
             if f is not None and is_numeric:
                 return f
 
             # otherwise find dtype-specific version, falling back to object
             for dt in [dtype_str, 'object']:
-                f = getattr(libalgos, "%s_%s" % (fname, dtype_str), None)
+                f = getattr(libgroupby, "%s_%s" % (fname, dtype_str), None)
                 if f is not None:
                     return f
 
@@ -3118,7 +3118,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
             out = _ensure_int64(out)
             return Series(out, index=mi, name=self.name)
 
-        # for compat. with libalgos.value_counts need to ensure every
+        # for compat. with libgroupby.value_counts need to ensure every
         # bin is present at every index level, null filled with zeros
         diff = np.zeros(len(out), dtype='bool')
         for lab in labels[:-1]:
diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py
index 77c5bde332cff..02c7933e020ea 100644
--- a/pandas/tests/groupby/test_bin_groupby.py
+++ b/pandas/tests/groupby/test_bin_groupby.py
@@ -7,8 +7,7 @@
 from pandas import Index, isnull
 from pandas.util.testing import assert_almost_equal
 import pandas.util.testing as tm
-import pandas._libs.lib as lib
-import pandas._libs.algos as algos
+from pandas._libs import lib, groupby
 
 
 def test_series_grouper():
@@ -92,7 +91,7 @@ def _check(dtype):
         labels = _ensure_int64(np.repeat(np.arange(3),
                                          np.diff(np.r_[0, bins])))
 
-        func = getattr(algos, 'group_ohlc_%s' % dtype)
+        func = getattr(groupby, 'group_ohlc_%s' % dtype)
         func(out, counts, obj[:, None], labels)
 
         def _ohlc(group):
diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py
index 4acf9dd4755f4..3b85fadda6cfe 100644
--- a/pandas/tests/groupby/test_transform.py
+++ b/pandas/tests/groupby/test_transform.py
@@ -6,7 +6,7 @@
 from pandas import Series, DataFrame, Timestamp, MultiIndex, concat, date_range
 from pandas.types.common import _ensure_platform_int, is_timedelta64_dtype
 from pandas.compat import StringIO
-from pandas._libs import algos
+from pandas._libs import groupby
 from .common import MixIn, assert_fp_equal
 
 from pandas.util.testing import assert_frame_equal, assert_series_equal
@@ -418,8 +418,8 @@ def test_cython_group_transform_algos(self):
         dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8,
                   np.uint32, np.uint64, np.float32, np.float64]
 
-        ops = [(algos.group_cumprod_float64, np.cumproduct, [np.float64]),
-               (algos.group_cumsum, np.cumsum, dtypes)]
+        ops = [(groupby.group_cumprod_float64, np.cumproduct, [np.float64]),
+               (groupby.group_cumsum, np.cumsum, dtypes)]
 
         is_datetimelike = False
         for pd_op, np_op, dtypes in ops:
@@ -437,13 +437,13 @@ def test_cython_group_transform_algos(self):
         data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
         actual = np.zeros_like(data)
         actual.fill(np.nan)
-        algos.group_cumprod_float64(actual, data, labels, is_datetimelike)
+        groupby.group_cumprod_float64(actual, data, labels, is_datetimelike)
         expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
         self.assert_numpy_array_equal(actual[:, 0], expected)
 
         actual = np.zeros_like(data)
         actual.fill(np.nan)
-        algos.group_cumsum(actual, data, labels, is_datetimelike)
+        groupby.group_cumsum(actual, data, labels, is_datetimelike)
         expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
         self.assert_numpy_array_equal(actual[:, 0], expected)
 
@@ -451,8 +451,8 @@ def test_cython_group_transform_algos(self):
         is_datetimelike = True
         data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
         actual = np.zeros_like(data, dtype='int64')
-        algos.group_cumsum(actual, data.view('int64'), labels,
-                           is_datetimelike)
+        groupby.group_cumsum(actual, data.view('int64'), labels,
+                             is_datetimelike)
         expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
             2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
             np.timedelta64(5, 'ns')])
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index ce925f756edb7..f8eac7a8911ad 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -10,7 +10,8 @@
 import pandas as pd
 
 from pandas import compat
-from pandas._libs import algos as libalgos, hashtable
+from pandas._libs import (groupby as libgroupby, algos as libalgos,
+                          hashtable)
 from pandas._libs.hashtable import unique_label_indices
 from pandas.compat import lrange
 import pandas.core.algorithms as algos
@@ -891,7 +892,7 @@ def test_group_var_constant(self):
 class TestGroupVarFloat64(tm.TestCase, GroupVarTestMixin):
     __test__ = True
 
-    algo = algos.algos.group_var_float64
+    algo = libgroupby.group_var_float64
     dtype = np.float64
     rtol = 1e-5
 
@@ -914,7 +915,7 @@ def test_group_var_large_inputs(self):
 class TestGroupVarFloat32(tm.TestCase, GroupVarTestMixin):
     __test__ = True
 
-    algo = algos.algos.group_var_float32
+    algo = libgroupby.group_var_float32
     dtype = np.float32
     rtol = 1e-2
 
diff --git a/setup.py b/setup.py
index 3e0a6b41152dc..8e690f05b818c 100755
--- a/setup.py
+++ b/setup.py
@@ -110,8 +110,9 @@ def is_platform_mac():
 
 
 _pxi_dep_template = {
-    'algos': ['_libs/algos_common_helper.pxi.in', '_libs/algos_groupby_helper.pxi.in',
+    'algos': ['_libs/algos_common_helper.pxi.in',
               '_libs/algos_take_helper.pxi.in', '_libs/algos_rank_helper.pxi.in'],
+    'groupby': ['_libs/groupby_helper.pxi.in'],
     'join': ['_libs/join_helper.pxi.in', '_libs/join_func_helper.pxi.in'],
     'reshape': ['_libs/reshape_helper.pxi.in'],
     'hashtable': ['_libs/hashtable_class_helper.pxi.in',
@@ -496,8 +497,11 @@ def pxd(name):
                          'pxdfiles': ['_libs/src/util', '_libs/hashtable'],
                          'depends': _pxi_dep['index']},
     '_libs.algos': {'pyxfile': '_libs/algos',
-                    'pxdfiles': ['_libs/src/util', '_libs/hashtable'],
+                    'pxdfiles': ['_libs/src/util', '_libs/algos', '_libs/hashtable'],
                     'depends': _pxi_dep['algos']},
+    '_libs.groupby': {'pyxfile': '_libs/groupby',
+                      'pxdfiles': ['_libs/src/util', '_libs/algos'],
+                      'depends': _pxi_dep['groupby']},
     '_libs.join': {'pyxfile': '_libs/join',
                    'pxdfiles': ['_libs/src/util', '_libs/hashtable'],
                    'depends': _pxi_dep['join']},
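The setup.py changes wire the renamed template to the new extension, so _libs.groupby only rebuilds when _libs/groupby_helper.pxi.in changes, and list the algos pxd as a dependency so the cimports resolve. A quick smoke test that the split module imports and aggregates, assuming a built checkout of this revision:

import numpy as np
from pandas._libs import groupby as libgroupby

out = np.zeros((2, 1), dtype=np.float64)
counts = np.zeros(2, dtype=np.int64)
values = np.array([[1.0], [2.0], [4.0]])
labels = np.array([0, 0, 1], dtype=np.int64)

libgroupby.group_median_float64(out, counts, values, labels)
print(out[:, 0])  # [1.5, 4.0] -- per-group medians
print(counts)     # [2, 1]     -- per-group row counts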