From fb7af6e257d5ca162487ea417eae675e3edbe271 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Wed, 22 Mar 2017 07:58:28 -0400
Subject: [PATCH] CLN: move groupby algos separate cython lib

- separate out groupby algorithms to separate lib
- release GIL on median
- release GIL on is_lexsorted / fix memory leak
- release GIL on nancorr

Author: Jeff Reback <jeff@reback.net>

Closes #15775 from jreback/groupby and squashes the following commits:

4e2bfec [Jeff Reback] release GIL on median; release GIL on is_lexsorted / fix memory leak; release GIL on nancorr
ce28bb5 [Jeff Reback] CLN: separate out groupby algorithms to separate lib
---
 pandas/_libs/algos.pxd                        |  13 +
 pandas/_libs/algos.pyx                        | 530 +++++------------
 pandas/_libs/groupby.pyx                      | 291 ++++++++++
 ...by_helper.pxi.in => groupby_helper.pxi.in} |  18 +-
 pandas/core/groupby.py                        |  10 +-
 pandas/tests/groupby/test_bin_groupby.py      |   5 +-
 pandas/tests/groupby/test_transform.py        |  14 +-
 pandas/tests/test_algos.py                    |   7 +-
 setup.py                                      |   8 +-
 9 files changed, 474 insertions(+), 422 deletions(-)
 create mode 100644 pandas/_libs/algos.pxd
 create mode 100644 pandas/_libs/groupby.pyx
 rename pandas/_libs/{algos_groupby_helper.pxi.in => groupby_helper.pxi.in} (98%)

diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd
new file mode 100644
index 0000000000000..6d80e6f0073eb
--- /dev/null
+++ b/pandas/_libs/algos.pxd
@@ -0,0 +1,13 @@
+from util cimport numeric
+from numpy cimport float64_t, double_t
+
+cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil
+
+cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil:
+    cdef numeric t
+
+    # cython doesn't allow pointer dereference so use array syntax
+    t = a[0]
+    a[0] = b[0]
+    b[0] = t
+    return 0
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 7d3ce3280ec1e..897a60e0c2f21 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -96,22 +96,94 @@ class NegInfinity(object):
     __ge__ = lambda self, other: self is other
 
-cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil except -1:
-    cdef numeric t
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def is_lexsorted(list list_of_arrays):
+    cdef:
+        int i
+        Py_ssize_t n, nlevels
+        int64_t k, cur, pre
+        ndarray arr
+        bint result = True
+
+    nlevels = len(list_of_arrays)
+    n = len(list_of_arrays[0])
+
+    cdef int64_t **vecs = <int64_t**> malloc(nlevels * sizeof(int64_t*))
+    for i in range(nlevels):
+        arr = list_of_arrays[i]
+        vecs[i] = <int64_t*> arr.data
+
+    # Assume uniqueness??
+    with nogil:
+        for i in range(1, n):
+            for k in range(nlevels):
+                cur = vecs[k][i]
+                pre = vecs[k][i - 1]
+                if cur == pre:
+                    continue
+                elif cur > pre:
+                    break
+                else:
+                    result = False
+                    break
+    free(vecs)
+    return result
+
+
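The nogil loop above checks that the label arrays of a MultiIndex are already in lexicographic order, level by level; the result flag plus the unconditional free(vecs) is what fixes the memory leak of the early-return version removed further down. A pure-Python sketch of the same contract (is_lexsorted_reference is a hypothetical helper, not part of the patch):

import numpy as np

def is_lexsorted_reference(list_of_arrays):
    # adjacent label tuples must be non-decreasing when compared
    # level by level, which is exactly what the pointer loop checks
    rows = list(zip(*list_of_arrays))
    return all(prev <= cur for prev, cur in zip(rows, rows[1:]))

assert is_lexsorted_reference([np.array([0, 0, 1]), np.array([0, 1, 0])])
assert not is_lexsorted_reference([np.array([0, 1, 0]), np.array([0, 0, 1])])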
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups):
+    """
+    compute a 1-d indexer that is an ordering of the passed index,
+    ordered by the groups. This is a reverse of the label
+    factorization process.
+
+    Parameters
+    ----------
+    index: int64 ndarray
+        mappings from group -> position
+    ngroups: int64
+        number of groups
+
+    return a tuple of (1-d indexer ordered by groups, group counts)
+    """
+
+    cdef:
+        Py_ssize_t i, loc, label, n
+        ndarray[int64_t] counts, where, result
+
+    counts = np.zeros(ngroups + 1, dtype=np.int64)
+    n = len(index)
+    result = np.zeros(n, dtype=np.int64)
+    where = np.zeros(ngroups + 1, dtype=np.int64)
+
+    with nogil:
+
+        # count group sizes, location 0 for NA
+        for i in range(n):
+            counts[index[i] + 1] += 1
 
-    # cython doesn't allow pointer dereference so use array syntax
-    t = a[0]
-    a[0] = b[0]
-    b[0] = t
-    return 0
+        # mark the start of each contiguous group of like-indexed data
+        for i in range(1, ngroups + 1):
+            where[i] = where[i - 1] + counts[i - 1]
+
+        # this is our indexer
+        for i in range(n):
+            label = index[i] + 1
+            result[where[label]] = i
+            where[label] += 1
+
+    return result, counts
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k):
+cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil:
     cdef:
-        Py_ssize_t i, j, l, m, n = a.size
+        Py_ssize_t i, j, l, m, n = a.shape[0]
         numeric x
 
     with nogil:
         l = 0
         m = n - 1
@@ -132,32 +204,6 @@ cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k):
             if j < k: l = i
             if k < i: m = j
-        return a[k]
-
-
-cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
-    cdef:
-        Py_ssize_t i, j, l, m
-        double_t x, t
-
-    l = 0
-    m = n -1
-    while (l < m):
-        x = a[k]
-        i = l
-        j = m
-
-        while 1:
-            while a[i] < x: i += 1
-            while x < a[j]: j -= 1
-            if i <= j:
-                swap(&a[i], &a[j])
-                i += 1; j -= 1
-
-            if i > j: break
-
-        if j < k: l = i
-        if k < i: m = j
     return a[k]
 
@@ -181,6 +227,8 @@ cpdef numeric median(numeric[:] arr):
 
 # -------------- Min, Max subsequence
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
 def max_subseq(ndarray[double_t] arr):
     cdef:
         Py_ssize_t i=0, s=0, e=0, T, n
@@ -195,21 +243,24 @@ def max_subseq(ndarray[double_t] arr):
         S = m
         T = 0
 
-    for i in range(1, n):
-        # S = max { S + A[i], A[i] )
-        if (S > 0):
-            S = S + arr[i]
-        else:
-            S = arr[i]
-            T = i
-        if S > m:
-            s = T
-            e = i
-            m = S
+    with nogil:
+        for i in range(1, n):
+            # S = max { S + A[i], A[i] )
+            if (S > 0):
+                S = S + arr[i]
+            else:
+                S = arr[i]
+                T = i
+            if S > m:
+                s = T
+                e = i
+                m = S
 
     return (s, e, m)
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
 def min_subseq(ndarray[double_t] arr):
     cdef:
         Py_ssize_t s, e
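groupsort_indexer is a counting sort on the group labels (label -1, i.e. NA, is counted in slot 0), so the returned indexer regroups rows by group without a full argsort. Assuming a built pandas at this revision, a worked example of the contract:

import numpy as np
from pandas._libs import algos

labels = np.array([1, 0, 1, 0, 2], dtype=np.int64)
indexer, counts = algos.groupsort_indexer(labels, 3)
print(indexer)          # [1 3 0 2 4] -- label-0 rows first, then 1, then 2
print(counts)           # [0 2 2 1]   -- slot 0 counts NA labels
print(labels[indexer])  # [0 0 1 1 2]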
@@ -225,9 +276,10 @@ def min_subseq(ndarray[double_t] arr):
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def nancorr(ndarray[float64_t, ndim=2] mat, cov=False, minp=None):
+def nancorr(ndarray[float64_t, ndim=2] mat, bint cov=0, minp=None):
     cdef:
         Py_ssize_t i, j, xi, yi, N, K
+        bint minpv
         ndarray[float64_t, ndim=2] result
         ndarray[uint8_t, ndim=2] mask
         int64_t nobs = 0
@@ -236,46 +288,49 @@ def nancorr(ndarray[float64_t, ndim=2] mat, cov=False, minp=None):
     N, K = (<object> mat).shape
 
     if minp is None:
-        minp = 1
+        minpv = 1
+    else:
+        minpv = minp
 
     result = np.empty((K, K), dtype=np.float64)
     mask = np.isfinite(mat).view(np.uint8)
 
-    for xi in range(K):
-        for yi in range(xi + 1):
-            nobs = sumxx = sumyy = sumx = sumy = 0
-            for i in range(N):
-                if mask[i, xi] and mask[i, yi]:
-                    vx = mat[i, xi]
-                    vy = mat[i, yi]
-                    nobs += 1
-                    sumx += vx
-                    sumy += vy
-
-            if nobs < minp:
-                result[xi, yi] = result[yi, xi] = np.NaN
-            else:
-                meanx = sumx / nobs
-                meany = sumy / nobs
-
-                # now the cov numerator
-                sumx = 0
-
+    with nogil:
+        for xi in range(K):
+            for yi in range(xi + 1):
+                nobs = sumxx = sumyy = sumx = sumy = 0
                 for i in range(N):
                     if mask[i, xi] and mask[i, yi]:
-                        vx = mat[i, xi] - meanx
-                        vy = mat[i, yi] - meany
+                        vx = mat[i, xi]
+                        vy = mat[i, yi]
+                        nobs += 1
+                        sumx += vx
+                        sumy += vy
+
+                if nobs < minpv:
+                    result[xi, yi] = result[yi, xi] = NaN
+                else:
+                    meanx = sumx / nobs
+                    meany = sumy / nobs
 
-                        sumx += vx * vy
-                        sumxx += vx * vx
-                        sumyy += vy * vy
+                    # now the cov numerator
+                    sumx = 0
 
-                divisor = (nobs - 1.0) if cov else sqrt(sumxx * sumyy)
+                    for i in range(N):
+                        if mask[i, xi] and mask[i, yi]:
+                            vx = mat[i, xi] - meanx
+                            vy = mat[i, yi] - meany
 
-                if divisor != 0:
-                    result[xi, yi] = result[yi, xi] = sumx / divisor
-                else:
-                    result[xi, yi] = result[yi, xi] = np.NaN
+                            sumx += vx * vy
+                            sumxx += vx * vx
+                            sumyy += vy * vy
+
+                    divisor = (nobs - 1.0) if cov else sqrt(sumxx * sumyy)
+
+                    if divisor != 0:
+                        result[xi, yi] = result[yi, xi] = sumx / divisor
+                    else:
+                        result[xi, yi] = result[yi, xi] = NaN
 
     return result
 
@@ -308,7 +363,7 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1):
                     nobs += 1
 
             if nobs < minp:
-                result[xi, yi] = result[yi, xi] = np.NaN
+                result[xi, yi] = result[yi, xi] = NaN
             else:
                 maskedx = np.empty(nobs, dtype=np.float64)
                 maskedy = np.empty(nobs, dtype=np.float64)
@@ -339,326 +394,11 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1):
             if divisor != 0:
                 result[xi, yi] = result[yi, xi] = sumx / divisor
             else:
-                result[xi, yi] = result[yi, xi] = np.NaN
+                result[xi, yi] = result[yi, xi] = NaN
 
     return result
 
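nancorr now copies minp into a C variable (minpv) so the whole double loop can run with the GIL released; the semantics are pairwise-complete correlation (or covariance with cov=1): each pair of columns uses only the rows where both entries are finite, and falls back to NaN below minp such rows. A NumPy-level sketch of that contract (illustrative helper, not the patch's code):

import numpy as np

def nancorr_reference(mat, minp=1):
    # pairwise-complete correlation, mirroring the masked loops above
    n_cols = mat.shape[1]
    out = np.full((n_cols, n_cols), np.nan)
    finite = np.isfinite(mat)
    for x in range(n_cols):
        for y in range(x + 1):
            both = finite[:, x] & finite[:, y]
            if both.sum() >= minp:
                out[x, y] = out[y, x] = np.corrcoef(mat[both, x],
                                                    mat[both, y])[0, 1]
    return out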
 
-#----------------------------------------------------------------------
-# group operations
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def is_lexsorted(list list_of_arrays):
-    cdef:
-        int i
-        Py_ssize_t n, nlevels
-        int64_t k, cur, pre
-        ndarray arr
-
-    nlevels = len(list_of_arrays)
-    n = len(list_of_arrays[0])
-
-    cdef int64_t **vecs = <int64_t**> malloc(nlevels * sizeof(int64_t*))
-    for i from 0 <= i < nlevels:
-        arr = list_of_arrays[i]
-        vecs[i] = <int64_t*> arr.data
-
-    # Assume uniqueness??
-    for i from 1 <= i < n:
-        for k from 0 <= k < nlevels:
-            cur = vecs[k][i]
-            pre = vecs[k][i - 1]
-            if cur == pre:
-                continue
-            elif cur > pre:
-                break
-            else:
-                return False
-    free(vecs)
-    return True
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups):
-    """
-    compute a 1-d indexer that is an ordering of the passed index,
-    ordered by the groups. This is a reverse of the label
-    factorization process.
-
-    Parameters
-    ----------
-    index: int64 ndarray
-        mappings from group -> position
-    ngroups: int64
-        number of groups
-
-    return a tuple of (1-d indexer ordered by groups, group counts)
-    """
-
-    cdef:
-        Py_ssize_t i, loc, label, n
-        ndarray[int64_t] counts, where, result
-
-    counts = np.zeros(ngroups + 1, dtype=np.int64)
-    n = len(index)
-    result = np.zeros(n, dtype=np.int64)
-    where = np.zeros(ngroups + 1, dtype=np.int64)
-
-    with nogil:
-
-        # count group sizes, location 0 for NA
-        for i from 0 <= i < n:
-            counts[index[i] + 1] += 1
-
-        # mark the start of each contiguous group of like-indexed data
-        for i from 1 <= i < ngroups + 1:
-            where[i] = where[i - 1] + counts[i - 1]
-
-        # this is our indexer
-        for i from 0 <= i < n:
-            label = index[i] + 1
-            result[where[label]] = i
-            where[label] += 1
-
-    return result, counts
-
-# TODO: aggregate multiple columns in single pass
-#----------------------------------------------------------------------
-# first, nth, last
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_nth_object(ndarray[object, ndim=2] out,
-                     ndarray[int64_t] counts,
-                     ndarray[object, ndim=2] values,
-                     ndarray[int64_t] labels,
-                     int64_t rank):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        object val
-        float64_t count
-        ndarray[int64_t, ndim=2] nobs
-        ndarray[object, ndim=2] resx
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty((<object> out).shape, dtype=object)
-
-    N, K = (<object> values).shape
-
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            if val == val:
-                nobs[lab, j] += 1
-                if nobs[lab, j] == rank:
-                    resx[lab, j] = val
-
-    for i in range(len(counts)):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = resx[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_nth_bin_object(ndarray[object, ndim=2] out,
-                         ndarray[int64_t] counts,
-                         ndarray[object, ndim=2] values,
-                         ndarray[int64_t] bins, int64_t rank):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, ngroups, b
-        object val
-        float64_t count
-        ndarray[object, ndim=2] resx
-        ndarray[float64_t, ndim=2] nobs
-
-    nobs = np.zeros((<object> out).shape, dtype=np.float64)
-    resx = np.empty((<object> out).shape, dtype=object)
-
-    if len(bins) == 0:
-        return
-    if bins[len(bins) - 1] == len(values):
-        ngroups = len(bins)
-    else:
-        ngroups = len(bins) + 1
-
-    N, K = (<object> values).shape
-
-    b = 0
-    for i in range(N):
-        while b < ngroups - 1 and i >= bins[b]:
-            b += 1
-
-        counts[b] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            if val == val:
-                nobs[b, j] += 1
-                if nobs[b, j] == rank:
-                    resx[b, j] = val
-
-    for i in range(ngroups):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = resx[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_last_object(ndarray[object, ndim=2] out,
-                      ndarray[int64_t] counts,
-                      ndarray[object, ndim=2] values,
-                      ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        object val
-        float64_t count
-        ndarray[object, ndim=2] resx
-        ndarray[int64_t, ndim=2] nobs
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty((<object> out).shape, dtype=object)
-
-    N, K = (<object> values).shape
-
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            if val == val:
-                nobs[lab, j] += 1
-                resx[lab, j] = val
-
-    for i in range(len(counts)):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = resx[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_last_bin_object(ndarray[object, ndim=2] out,
-                          ndarray[int64_t] counts,
-                          ndarray[object, ndim=2] values,
-                          ndarray[int64_t] bins):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, ngroups, b
-        object val
-        float64_t count
-        ndarray[object, ndim=2] resx
-        ndarray[float64_t, ndim=2] nobs
-
-    nobs = np.zeros((<object> out).shape, dtype=np.float64)
-    resx = np.empty((<object> out).shape, dtype=object)
-
-    if len(bins) == 0:
-        return
-    if bins[len(bins) - 1] == len(values):
-        ngroups = len(bins)
-    else:
-        ngroups = len(bins) + 1
-
-    N, K = (<object> values).shape
-
-    b = 0
-    for i in range(N):
-        while b < ngroups - 1 and i >= bins[b]:
-            b += 1
-
-        counts[b] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            if val == val:
-                nobs[b, j] += 1
-                resx[b, j] = val
-
-    for i in range(ngroups):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = resx[i, j]
-
-cdef inline float64_t _median_linear(float64_t* a, int n):
-    cdef int i, j, na_count = 0
-    cdef float64_t result
-    cdef float64_t* tmp
-
-    if n == 0:
-        return NaN
-
-    # count NAs
-    for i in range(n):
-        if a[i] != a[i]:
-            na_count += 1
-
-    if na_count:
-        if na_count == n:
-            return NaN
-
-        tmp = <float64_t*> malloc((n - na_count) * sizeof(float64_t))
-
-        j = 0
-        for i in range(n):
-            if a[i] == a[i]:
-                tmp[j] = a[i]
-                j += 1
-
-        a = tmp
-        n -= na_count
-
-    if n % 2:
-        result = kth_smallest_c(<float64_t*> a, n / 2, n)
-    else:
-        result = (kth_smallest_c(a, n / 2, n) +
-                  kth_smallest_c(a, n / 2 - 1, n)) / 2
-
-    if na_count:
-        free(a)
-
-    return result
-
-
 # generated from template
 include "algos_common_helper.pxi"
-include "algos_groupby_helper.pxi"
 include "algos_rank_helper.pxi"
 include "algos_take_helper.pxi"
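Everything deleted above reappears, essentially byte for byte, in the new pandas/_libs/groupby.pyx below; only swap and kth_smallest stay behind in algos and are shared through the new algos.pxd cimport interface. After a build of this revision the split is observable from Python (illustrative check, not part of the patch):

from pandas._libs import algos, groupby

# object-dtype group kernels now live in the groupby extension ...
assert hasattr(groupby, 'group_nth_object')
assert not hasattr(algos, 'group_nth_object')
# ... while groupsort_indexer stays in algos (groupby.pyx imports it)
assert hasattr(algos, 'groupsort_indexer')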
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
new file mode 100644
index 0000000000000..c6ff602cfef1c
--- /dev/null
+++ b/pandas/_libs/groupby.pyx
@@ -0,0 +1,291 @@
+# cython: profile=False
+
+from numpy cimport *
+cimport numpy as np
+import numpy as np
+
+cimport cython
+
+import_array()
+
+cimport util
+
+from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
+                    uint32_t, uint64_t, float16_t, float32_t, float64_t)
+
+from libc.stdlib cimport malloc, free
+
+from util cimport numeric, get_nat
+from algos cimport swap
+from algos import take_2d_axis1_float64_float64, groupsort_indexer
+
+cdef int64_t iNaT = get_nat()
+
+cdef double NaN = <double> np.NaN
+cdef double nan = NaN
+
+
+# TODO: aggregate multiple columns in single pass
+#----------------------------------------------------------------------
+# first, nth, last
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_nth_object(ndarray[object, ndim=2] out,
+                     ndarray[int64_t] counts,
+                     ndarray[object, ndim=2] values,
+                     ndarray[int64_t] labels,
+                     int64_t rank):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        object val
+        float64_t count
+        ndarray[int64_t, ndim=2] nobs
+        ndarray[object, ndim=2] resx
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty((<object> out).shape, dtype=object)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[lab, j] += 1
+                if nobs[lab, j] == rank:
+                    resx[lab, j] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_nth_bin_object(ndarray[object, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[object, ndim=2] values,
+                         ndarray[int64_t] bins, int64_t rank):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        object val
+        float64_t count
+        ndarray[object, ndim=2] resx
+        ndarray[float64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.float64)
+    resx = np.empty((<object> out).shape, dtype=object)
+
+    if len(bins) == 0:
+        return
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[b, j] += 1
+                if nobs[b, j] == rank:
+                    resx[b, j] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
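group_nth_object keeps, per (group, column) cell, the rank-th non-null value seen; rank=1 is what backs first()-style reductions for object dtype. At the user level (real pandas API, small illustration):

import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [None, 'x', 'y']})
# rank=1 corresponds to the first non-null value in each group
print(df.groupby('key')['val'].first())   # a -> 'x', b -> 'y'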
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_last_object(ndarray[object, ndim=2] out,
+                      ndarray[int64_t] counts,
+                      ndarray[object, ndim=2] values,
+                      ndarray[int64_t] labels):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        object val
+        float64_t count
+        ndarray[object, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty((<object> out).shape, dtype=object)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[lab, j] += 1
+                resx[lab, j] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_last_bin_object(ndarray[object, ndim=2] out,
+                          ndarray[int64_t] counts,
+                          ndarray[object, ndim=2] values,
+                          ndarray[int64_t] bins):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        object val
+        float64_t count
+        ndarray[object, ndim=2] resx
+        ndarray[float64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.float64)
+    resx = np.empty((<object> out).shape, dtype=object)
+
+    if len(bins) == 0:
+        return
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[b, j] += 1
+                resx[b, j] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+
+cdef inline float64_t _median_linear(float64_t* a, int n) nogil:
+    cdef int i, j, na_count = 0
+    cdef float64_t result
+    cdef float64_t* tmp
+
+    if n == 0:
+        return NaN
+
+    # count NAs
+    for i in range(n):
+        if a[i] != a[i]:
+            na_count += 1
+
+    if na_count:
+        if na_count == n:
+            return NaN
+
+        tmp = <float64_t*> malloc((n - na_count) * sizeof(float64_t))
+
+        j = 0
+        for i in range(n):
+            if a[i] == a[i]:
+                tmp[j] = a[i]
+                j += 1
+
+        a = tmp
+        n -= na_count
+
+    if n % 2:
+        result = kth_smallest_c(<float64_t*> a, n / 2, n)
+    else:
+        result = (kth_smallest_c(a, n / 2, n) +
+                  kth_smallest_c(a, n / 2 - 1, n)) / 2
+
+    if na_count:
+        free(a)
+
+    return result
+
+
+cdef inline float64_t kth_smallest_c(float64_t* a,
+                                     Py_ssize_t k,
+                                     Py_ssize_t n) nogil:
+    cdef:
+        Py_ssize_t i, j, l, m
+        double_t x, t
+
+    l = 0
+    m = n -1
+    while (l < m):
+        x = a[k]
+        i = l
+        j = m
+
+        while 1:
+            while a[i] < x: i += 1
+            while x < a[j]: j -= 1
+            if i <= j:
+                swap(&a[i], &a[j])
+                i += 1; j -= 1
+
+            if i > j: break
+
+        if j < k: l = i
+        if k < i: m = j
+    return a[k]
+
+
+# generated from template
+include "groupby_helper.pxi"
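_median_linear compacts away NaNs and then takes the middle order statistic (or the mean of the two middle ones) via kth_smallest_c, a Hoare-style quickselect that now runs with the GIL released. The selection rule in plain Python (sketch; sorted() stands in for the O(n) select):

def median_reference(values):
    vals = sorted(v for v in values if v == v)  # v == v drops NaNs
    if not vals:
        return float('nan')
    n = len(vals)
    if n % 2:
        return vals[n // 2]
    return (vals[n // 2] + vals[n // 2 - 1]) / 2

assert median_reference([1.0, 4.0, 2.0, float('nan')]) == 2.0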
diff --git a/pandas/_libs/algos_groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
similarity index 98%
rename from pandas/_libs/algos_groupby_helper.pxi.in
rename to pandas/_libs/groupby_helper.pxi.in
index e2c263f49b110..d38b677df321c 100644
--- a/pandas/_libs/algos_groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -681,6 +681,8 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 #----------------------------------------------------------------------
 
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
 def group_median_float64(ndarray[float64_t, ndim=2] out,
                          ndarray[int64_t] counts,
                          ndarray[float64_t, ndim=2] values,
@@ -704,13 +706,15 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
 
     take_2d_axis1_float64_float64(values.T, indexer, out=data)
 
-    for i in range(K):
-        # exclude NA group
-        ptr += _counts[0]
-        for j in range(ngroups):
-            size = _counts[j + 1]
-            out[j, i] = _median_linear(ptr, size)
-            ptr += size
+    with nogil:
+
+        for i in range(K):
+            # exclude NA group
+            ptr += _counts[0]
+            for j in range(ngroups):
+                size = _counts[j + 1]
+                out[j, i] = _median_linear(ptr, size)
+                ptr += size
 
 
 @cython.boundscheck(False)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 0a63981290df3..727af8b8cd3eb 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -60,7 +60,7 @@
 import pandas.core.common as com
 from pandas.core.config import option_context
 
-from pandas._libs import lib, algos as libalgos, Timestamp, NaT, iNaT
+from pandas._libs import lib, groupby as libgroupby, Timestamp, NaT, iNaT
 from pandas._libs.lib import count_level_2d
 
 _doc_template = """
@@ -1474,7 +1474,7 @@ def shift(self, periods=1, freq=None, axis=0):
 
         # filled in by Cython
         indexer = np.zeros_like(labels)
-        libalgos.group_shift_indexer(indexer, labels, ngroups, periods)
+        libgroupby.group_shift_indexer(indexer, labels, ngroups, periods)
 
         output = {}
         for name, obj in self._iterate_slices():
@@ -1815,13 +1815,13 @@ def _get_cython_function(self, kind, how, values, is_numeric):
 
         def get_func(fname):
             # see if there is a fused-type version of function
             # only valid for numeric
-            f = getattr(libalgos, fname, None)
+            f = getattr(libgroupby, fname, None)
             if f is not None and is_numeric:
                 return f
 
             # otherwise find dtype-specific version, falling back to object
             for dt in [dtype_str, 'object']:
-                f = getattr(libalgos, "%s_%s" % (fname, dtype_str), None)
+                f = getattr(libgroupby, "%s_%s" % (fname, dtype_str), None)
                 if f is not None:
                     return f
 
@@ -3118,7 +3118,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
             out = _ensure_int64(out)
             return Series(out, index=mi, name=self.name)
 
-        # for compat. with libalgos.value_counts need to ensure every
+        # for compat. with libgroupby.value_counts need to ensure every
         # bin is present at every index level, null filled with zeros
         diff = np.zeros(len(out), dtype='bool')
         for lab in labels[:-1]:
diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py
index 77c5bde332cff..02c7933e020ea 100644
--- a/pandas/tests/groupby/test_bin_groupby.py
+++ b/pandas/tests/groupby/test_bin_groupby.py
@@ -7,8 +7,7 @@
 from pandas import Index, isnull
 from pandas.util.testing import assert_almost_equal
 import pandas.util.testing as tm
-import pandas._libs.lib as lib
-import pandas._libs.algos as algos
+from pandas._libs import lib, groupby
 
 
 def test_series_grouper():
@@ -92,7 +91,7 @@ def _check(dtype):
         labels = _ensure_int64(np.repeat(np.arange(3),
                                          np.diff(np.r_[0, bins])))
 
-        func = getattr(algos, 'group_ohlc_%s' % dtype)
+        func = getattr(groupby, 'group_ohlc_%s' % dtype)
         func(out, counts, obj[:, None], labels)
 
         def _ohlc(group):
diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py
index 4acf9dd4755f4..3b85fadda6cfe 100644
--- a/pandas/tests/groupby/test_transform.py
+++ b/pandas/tests/groupby/test_transform.py
@@ -6,7 +6,7 @@
 from pandas import Series, DataFrame, Timestamp, MultiIndex, concat, date_range
 from pandas.types.common import _ensure_platform_int, is_timedelta64_dtype
 from pandas.compat import StringIO
-from pandas._libs import algos
+from pandas._libs import groupby
 from .common import MixIn, assert_fp_equal
 
 from pandas.util.testing import assert_frame_equal, assert_series_equal
@@ -418,8 +418,8 @@ def test_cython_group_transform_algos(self):
         dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8,
                   np.uint32, np.uint64, np.float32, np.float64]
 
-        ops = [(algos.group_cumprod_float64, np.cumproduct, [np.float64]),
-               (algos.group_cumsum, np.cumsum, dtypes)]
+        ops = [(groupby.group_cumprod_float64, np.cumproduct, [np.float64]),
+               (groupby.group_cumsum, np.cumsum, dtypes)]
 
         is_datetimelike = False
         for pd_op, np_op, dtypes in ops:
@@ -437,13 +437,13 @@ def test_cython_group_transform_algos(self):
         data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
         actual = np.zeros_like(data)
         actual.fill(np.nan)
-        algos.group_cumprod_float64(actual, data, labels, is_datetimelike)
+        groupby.group_cumprod_float64(actual, data, labels, is_datetimelike)
         expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
         self.assert_numpy_array_equal(actual[:, 0], expected)
 
         actual = np.zeros_like(data)
         actual.fill(np.nan)
-        algos.group_cumsum(actual, data, labels, is_datetimelike)
+        groupby.group_cumsum(actual, data, labels, is_datetimelike)
         expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
         self.assert_numpy_array_equal(actual[:, 0], expected)
 
@@ -451,8 +451,8 @@ def test_cython_group_transform_algos(self):
         is_datetimelike = True
         data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
         actual = np.zeros_like(data, dtype='int64')
-        algos.group_cumsum(actual, data.view('int64'), labels,
-                           is_datetimelike)
+        groupby.group_cumsum(actual, data.view('int64'), labels,
+                             is_datetimelike)
         expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
             2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
             np.timedelta64(5, 'ns')])
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index ce925f756edb7..f8eac7a8911ad 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -10,7 +10,8 @@
 import pandas as pd
 
 from pandas import compat
-from pandas._libs import algos as libalgos, hashtable
+from pandas._libs import (groupby as libgroupby, algos as libalgos,
+                          hashtable)
 from pandas._libs.hashtable import unique_label_indices
 from pandas.compat import lrange
 import pandas.core.algorithms as algos
@@ -891,7 +892,7 @@ def test_group_var_constant(self):
 class TestGroupVarFloat64(tm.TestCase, GroupVarTestMixin):
     __test__ = True
 
-    algo = algos.algos.group_var_float64
+    algo = libgroupby.group_var_float64
     dtype = np.float64
     rtol = 1e-5
 
@@ -914,7 +915,7 @@ def test_group_var_large_inputs(self):
 class TestGroupVarFloat32(tm.TestCase, GroupVarTestMixin):
     __test__ = True
 
-    algo = algos.algos.group_var_float32
+    algo = libgroupby.group_var_float32
     dtype = np.float32
     rtol = 1e-2
 
diff --git a/setup.py b/setup.py
index 3e0a6b41152dc..8e690f05b818c 100755
--- a/setup.py
+++ b/setup.py
@@ -110,8 +110,9 @@ def is_platform_mac():
 
 
 _pxi_dep_template = {
-    'algos': ['_libs/algos_common_helper.pxi.in', '_libs/algos_groupby_helper.pxi.in',
+    'algos': ['_libs/algos_common_helper.pxi.in',
               '_libs/algos_take_helper.pxi.in', '_libs/algos_rank_helper.pxi.in'],
+    'groupby': ['_libs/groupby_helper.pxi.in'],
     'join': ['_libs/join_helper.pxi.in', '_libs/join_func_helper.pxi.in'],
     'reshape': ['_libs/reshape_helper.pxi.in'],
     'hashtable': ['_libs/hashtable_class_helper.pxi.in',
@@ -496,8 +497,11 @@ def pxd(name):
                          'pxdfiles': ['_libs/src/util', '_libs/hashtable'],
                          'depends': _pxi_dep['index']},
     '_libs.algos': {'pyxfile': '_libs/algos',
-                    'pxdfiles': ['_libs/src/util', '_libs/hashtable'],
+                    'pxdfiles': ['_libs/src/util', '_libs/algos', '_libs/hashtable'],
                     'depends': _pxi_dep['algos']},
+    '_libs.groupby': {'pyxfile': '_libs/groupby',
+                      'pxdfiles': ['_libs/src/util', '_libs/algos'],
+                      'depends': _pxi_dep['groupby']},
     '_libs.join': {'pyxfile': '_libs/join',
                    'pxdfiles': ['_libs/src/util', '_libs/hashtable'],
                    'depends': _pxi_dep['join']},
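The setup.py changes wire the renamed template to the new extension, so _libs.groupby only rebuilds when _libs/groupby_helper.pxi.in changes, and list the algos pxd as a dependency so the cimports resolve. A quick smoke test that the split module imports and aggregates, assuming a built checkout of this revision:

import numpy as np
from pandas._libs import groupby as libgroupby

out = np.zeros((2, 1), dtype=np.float64)
counts = np.zeros(2, dtype=np.int64)
values = np.array([[1.0], [2.0], [4.0]])
labels = np.array([0, 0, 1], dtype=np.int64)

libgroupby.group_median_float64(out, counts, values, labels)
print(out[:, 0])  # [1.5, 4.0] -- per-group medians
print(counts)     # [2, 1]     -- per-group row counts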