Use rng.choice without unpacked in subsample_without_replacement with…

… 64-bit support (#935) * Split _subsample in w and wo replacement * Use rng.choice and reverse lookup... slower than original * Optimize reverse lookup * Improve spacing Co-authored-by: Daniel McDonald <d3mcdonald@eng.ucsd.edu> * Add spacing Co-authored-by: Daniel McDonald <d3mcdonald@eng.ucsd.edu> * Add spaces Co-authored-by: Daniel McDonald <d3mcdonald@eng.ucsd.edu> * Add space Co-authored-by: Daniel McDonald <d3mcdonald@eng.ucsd.edu> * Add space Co-authored-by: Daniel McDonald <d3mcdonald@eng.ucsd.edu> * Fix typo * Add comment * Declare idx as Py_ssize_t since it is used in a range loop * Use cdef * Remove tmp buffer result * Update comment Co-authored-by: Daniel McDonald <d3mcdonald@eng.ucsd.edu> * Document PR 935 --------- Co-authored-by: Daniel McDonald <d3mcdonald@eng.ucsd.edu>
biocore · Jul 28, 2023 · d3f82fe · d3f82fe
1 parent 941973a
commit d3f82fe
Show file tree

Hide file tree

Showing 2 changed files with 127 additions and 33 deletions.
diff --git a/ChangeLog.md b/ChangeLog.md
@@ -7,6 +7,7 @@ biom 2.1.15-dev
 Performance improvements:
 
 * Revise `Table._fast_merge` to use COO directly. For very large tables, this reduces runtime by ~50x and memory by ~5x. See PR [#913](https://github.com/biocore/biom-format/pull/933).
+* Drastically reduce the memory needs of subsampling when sums are large. Also adds 64-bit support. See PR [#935](https://github.com/biocore/biom-format/pull/935).
 
 biom 2.1.15
 -----------

diff --git a/biom/_subsample.pyx b/biom/_subsample.pyx
@@ -6,22 +6,23 @@
 # The full license is in the file COPYING.txt, distributed with this software.
 # -----------------------------------------------------------------------------
 
-
 import numpy as np
 cimport numpy as cnp
 
-
-def _subsample(arr, n, with_replacement, rng):
-    """Subsample non-zero values of a sparse array
+cdef _subsample_with_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
+                                 cnp.ndarray[cnp.int32_t, ndim=1] indptr,
+                                 cnp.int64_t n,
+                                 object rng):
+    """Subsample non-zero values of a sparse array with replacement
 
     Parameters
     ----------
-    arr : {csr_matrix, csc_matrix}
-        A 1xM sparse vector
+    data : {csr_matrix, csc_matrix}.data
+        A 1xM sparse vector data
+    indptr : {csr_matrix, csc_matrix}.indptr
+        A 1xM sparse vector indptr
     n : int
         Number of items to subsample from `arr`
-    with_replacement : bool
-        Whether to permute or use multinomial sampling
     rng : Generator instance
         A random generator. This will likely be an instance returned 
         by np.random.default_rng
@@ -38,34 +39,126 @@ def _subsample(arr, n, with_replacement, rng):
     """
     cdef:
         cnp.int64_t counts_sum
-        cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data
-        cnp.ndarray[cnp.int64_t, ndim=1] data_i = arr.data.astype(np.int64)
-        cnp.ndarray[cnp.float64_t, ndim=1] result
-        cnp.ndarray[cnp.int32_t, ndim=1] indices = arr.indices
-        cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr
-        cnp.ndarray[cnp.int32_t, ndim=1] permuted, unpacked, r
-        cnp.float64_t cnt
-        Py_ssize_t i, j, length
+        cnp.int32_t start,end,length
+        Py_ssize_t i
+        cnp.ndarray[cnp.float64_t, ndim=1] pvals
 
     for i in range(indptr.shape[0] - 1):
         start, end = indptr[i], indptr[i+1]
         length = end - start
         counts_sum = data[start:end].sum()
 
-        if with_replacement:
-            pvals = data[start:end] / counts_sum
-            data[start:end] = rng.multinomial(n, pvals)
-        else:
-            if counts_sum < n:
-                data[start:end] = 0
-                continue
-
-            r = np.arange(length, dtype=np.int32)
-            unpacked = np.repeat(r, data_i[start:end])
-            permuted = rng.permutation(unpacked)[:n]
-
-            result = np.zeros(length, dtype=np.float64)
-            for idx in range(permuted.shape[0]):
-                result[permuted[idx]] += 1
-
-            data[start:end] = result
+        pvals = data[start:end] / counts_sum
+        data[start:end] = rng.multinomial(n, pvals)
+
+
+cdef _subsample_without_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
+                                    cnp.ndarray[cnp.int32_t, ndim=1] indptr,
+                                    cnp.int64_t n,
+                                    object rng):
+    """Subsample non-zero values of a sparse array w/out replacement
+
+    Parameters
+    ----------
+    data : {csr_matrix, csc_matrix}.data
+        A 1xM sparse vector data
+    indptr : {csr_matrix, csc_matrix}.indptr
+        A 1xM sparse vector indptr
+    n : int
+        Number of items to subsample from `arr`
+    rng : Generator instance
+        A random generator. This will likely be an instance returned 
+        by np.random.default_rng
+
+    Returns
+    -------
+    ndarray
+        Subsampled data
+
+    Notes
+    -----
+    This code was adapted from scikit-bio (`skbio.math._subsample`)
+
+    """
+    cdef:
+        cnp.int64_t counts_sum, count_el, perm_count_el
+        cnp.int64_t count_rem
+        cnp.ndarray[cnp.int64_t, ndim=1] permuted
+        Py_ssize_t i, idx
+        cnp.int32_t length,el,start,end
+
+    for i in range(indptr.shape[0] - 1):
+        start, end = indptr[i], indptr[i+1]
+        length = end - start
+        counts_sum = data[start:end].sum()
+
+        if counts_sum < n:
+            data[start:end] = 0
+            continue
+
+        permuted = rng.choice(counts_sum, n, replace=False, shuffle=False)
+        permuted.sort()
+
+        # now need to do reverse mapping
+        # since I am not using np.repeat anymore
+        # reminder, old logic was
+        #   r = np.arange(length)
+        #   unpacked = np.repeat(r, data_i[start:end])
+        #   permuted_unpacked = rng.choice(unpacked, n, replace=False, shuffle=False)
+        # 
+        # specifically, what we're going to do here is randomly pick what elements within
+        # each sample to keep. this is analogous issuing the prior np.repeat call, and obtaining
+        # a random set of index positions for that resulting array. however, we do not need to 
+        # perform the np.repeat call as we know the length of that resulting vector already,
+        # and additionally, we can compute the sample associated with an index in that array
+        # without constructing it.
+
+        el = 0         # index in result/data
+        count_el = 0  # index in permutted
+        count_rem = long(data[start])  # since each data has multiple els, sub count there
+        data[start] = 0.0
+        for idx in range(n):
+            perm_count_el = permuted[idx]
+            # the array is sorted, so just jump ahead
+            while (perm_count_el - count_el) >= count_rem:
+               count_el += count_rem
+               el += 1
+               count_rem = long(data[start+el])
+               data[start+el] = 0.0
+            count_rem -= (perm_count_el - count_el)
+            count_el = perm_count_el
+
+            data[start+el] += 1
+        # clean up tail elements
+        data[start+el+1:end] = 0.0
+
+
+def _subsample(arr, n, with_replacement, rng):
+    """Subsample non-zero values of a sparse array
+
+    Parameters
+    ----------
+    arr : {csr_matrix, csc_matrix}
+        A 1xM sparse vector
+    n : int
+        Number of items to subsample from `arr`
+    with_replacement : bool
+        Whether to permute or use multinomial sampling
+    rng : Generator instance
+        A random generator. This will likely be an instance returned 
+        by np.random.default_rng
+
+    Returns
+    -------
+    ndarray
+        Subsampled data
+
+    Notes
+    -----
+    This code was adapted from scikit-bio (`skbio.math._subsample`)
+
+    """
+    if (with_replacement):
+       return _subsample_with_replacement(arr.data, arr.indptr, n, rng)
+    else:
+       return _subsample_without_replacement(arr.data, arr.indptr, n, rng)