biocore · wasade · Jul 28, 2023 · Jun 9, 2023 · Jun 13, 2023 · Jun 13, 2023
diff --git a/biom/_subsample.pyx b/biom/_subsample.pyx
@@ -6,22 +6,18 @@
 # The full license is in the file COPYING.txt, distributed with this software.
 # -----------------------------------------------------------------------------
 
-
 import numpy as np
 cimport numpy as cnp
 
-
-def _subsample(arr, n, with_replacement, rng):
-    """Subsample non-zero values of a sparse array
+def _subsample_with_replacement(arr, n, rng):
+    """Subsample non-zero values of a sparse array with replacement
 
     Parameters
     ----------
     arr : {csr_matrix, csc_matrix}
         A 1xM sparse vector
     n : int
         Number of items to subsample from `arr`
-    with_replacement : bool
-        Whether to permute or use multinomial sampling
     rng : Generator instance
         A random generator. This will likely be an instance returned 
         by np.random.default_rng
@@ -39,33 +35,110 @@ def _subsample(arr, n, with_replacement, rng):
     cdef:
         cnp.int64_t counts_sum
         cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data
-        cnp.ndarray[cnp.int64_t, ndim=1] data_i = arr.data.astype(np.int64)
+        cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr
+        Py_ssize_t i, length
+
+    for i in range(indptr.shape[0] - 1):
+        start, end = indptr[i], indptr[i+1]
+        length = end - start
+        counts_sum = data[start:end].sum()
+
+        pvals = data[start:end] / counts_sum
+        data[start:end] = rng.multinomial(n, pvals)
+
+
+def _subsample_without_replacement(arr, n, rng):
+    """Subsample non-zero values of a sparse array w/out replacement
+
+    Parameters
+    ----------
+    arr : {csr_matrix, csc_matrix}
+        A 1xM sparse vector
+    n : int
+        Number of items to subsample from `arr`
+    rng : Generator instance
+        A random generator. This will likely be an instance returned 
+        by np.random.default_rng
+
+    Returns
+    -------
+    ndarray
+        Subsampled data
+
+    Notes
+    -----
+    This code was adapted from scikit-bio (`skbio.math._subsample`)
+
+    """
+    cdef:
+        cnp.int64_t counts_sum,idx,count_el, perm_count_ela
+        cnp.int64_t count_rem
+        cnp.int64_t cn = n
+        cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data
         cnp.ndarray[cnp.float64_t, ndim=1] result
-        cnp.ndarray[cnp.int32_t, ndim=1] indices = arr.indices
         cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr
-        cnp.ndarray[cnp.int32_t, ndim=1] permuted, unpacked, r
-        cnp.float64_t cnt
-        Py_ssize_t i, j, length
+        cnp.ndarray[cnp.int64_t, ndim=1] permuted
+        Py_ssize_t i
+        cnp.int32_t length,el
 
     for i in range(indptr.shape[0] - 1):
         start, end = indptr[i], indptr[i+1]
         length = end - start
         counts_sum = data[start:end].sum()
 
-        if with_replacement:
-            pvals = data[start:end] / counts_sum
-            data[start:end] = rng.multinomial(n, pvals)
-        else:
-            if counts_sum < n:
-                data[start:end] = 0
-                continue
-
-            r = np.arange(length, dtype=np.int32)
-            unpacked = np.repeat(r, data_i[start:end])
-            permuted = rng.permutation(unpacked)[:n]
-
-            result = np.zeros(length, dtype=np.float64)
-            for idx in range(permuted.shape[0]):
-                result[permuted[idx]] += 1
-
-            data[start:end] = result
+        if counts_sum < cn:
+            data[start:end] = 0
+            continue
+
+        permuted = rng.choice(counts_sum,cn,replace=False,shuffle=False)
+        permuted.sort()
+
+        # now need to do reverse mapping
+        result = np.zeros(length, dtype=np.float64)
+        el=0         # index in result/data
+        count_el =0  # index in permutted
+        count_rem=data[start]  # since each data has multiple els, sub count there
+        for idx in range(cn):
+            perm_count_el = permuted[idx]
+            # the array is sorted, so just jump ahead
+            while (perm_count_el-count_el) >= count_rem:
+               count_el += count_rem
+               el += 1
+               count_rem = data[start+el]
+            count_rem -= (perm_count_el-count_el)
+            count_el = perm_count_el
+
+            result[el] += 1
+
+        data[start:end] = result
+
+
+def _subsample(arr, n, with_replacement, rng):
+    """Subsample non-zero values of a sparse array
+
+    Parameters
+    ----------
+    arr : {csr_matrix, csc_matrix}
+        A 1xM sparse vector
+    n : int
+        Number of items to subsample from `arr`
+    with_replacement : bool
+        Whether to permute or use multinomial sampling
+    rng : Generator instance
+        A random generator. This will likely be an instance returned 
+        by np.random.default_rng
+
+    Returns
+    -------
+    ndarray
+        Subsampled data
+
+    Notes
+    -----
+    This code was adapted from scikit-bio (`skbio.math._subsample`)
+
+    """
+    if (with_replacement):
+       return _subsample_with_replacement(arr, n, rng)
+    else:
+       return _subsample_without_replacement(arr, n, rng)