Skip to content

Commit

Permalink
Use rng.choice without unpacked in subsample_without_replacement with…
Browse files Browse the repository at this point in the history
… 64-bit support (#935)

* Split _subsample in w and wo replacement

* Use rng.choice and reverse lookup... slower than original

* Optimize reverse lookup

* Improve spacing

Co-authored-by: Daniel McDonald <d3mcdonald@eng.ucsd.edu>

* Add spacing

Co-authored-by: Daniel McDonald <d3mcdonald@eng.ucsd.edu>

* Add spaces

Co-authored-by: Daniel McDonald <d3mcdonald@eng.ucsd.edu>

* Add space

Co-authored-by: Daniel McDonald <d3mcdonald@eng.ucsd.edu>

* Add space

Co-authored-by: Daniel McDonald <d3mcdonald@eng.ucsd.edu>

* Fix typo

* Add comment

* Declare idx as Py_ssize_t since it is used in a range loop

* Use cdef

* Remove tmp buffer result

* Update comment

Co-authored-by: Daniel McDonald <d3mcdonald@eng.ucsd.edu>

* Document PR 935

---------

Co-authored-by: Daniel McDonald <d3mcdonald@eng.ucsd.edu>
  • Loading branch information
sfiligoi and wasade authored Jul 28, 2023
1 parent 941973a commit d3f82fe
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 33 deletions.
1 change: 1 addition & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ biom 2.1.15-dev
Performance improvements:

* Revise `Table._fast_merge` to use COO directly. For very large tables, this reduces runtime by ~50x and memory by ~5x. See PR [#913](https://github.com/biocore/biom-format/pull/933).
* Drastically reduce the memory needs of subsampling when sums are large. Also adds 64-bit support. See PR [#935](https://github.com/biocore/biom-format/pull/935).

biom 2.1.15
-----------
Expand Down
159 changes: 126 additions & 33 deletions biom/_subsample.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,23 @@
# The full license is in the file COPYING.txt, distributed with this software.
# -----------------------------------------------------------------------------


import numpy as np
cimport numpy as cnp


def _subsample(arr, n, with_replacement, rng):
"""Subsample non-zero values of a sparse array
cdef _subsample_with_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
cnp.ndarray[cnp.int32_t, ndim=1] indptr,
cnp.int64_t n,
object rng):
"""Subsample non-zero values of a sparse array with replacement
Parameters
----------
arr : {csr_matrix, csc_matrix}
A 1xM sparse vector
data : {csr_matrix, csc_matrix}.data
A 1xM sparse vector data
indptr : {csr_matrix, csc_matrix}.indptr
A 1xM sparse vector indptr
n : int
Number of items to subsample from `arr`
with_replacement : bool
Whether to permute or use multinomial sampling
rng : Generator instance
A random generator. This will likely be an instance returned
by np.random.default_rng
Expand All @@ -38,34 +39,126 @@ def _subsample(arr, n, with_replacement, rng):
"""
cdef:
cnp.int64_t counts_sum
cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data
cnp.ndarray[cnp.int64_t, ndim=1] data_i = arr.data.astype(np.int64)
cnp.ndarray[cnp.float64_t, ndim=1] result
cnp.ndarray[cnp.int32_t, ndim=1] indices = arr.indices
cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr
cnp.ndarray[cnp.int32_t, ndim=1] permuted, unpacked, r
cnp.float64_t cnt
Py_ssize_t i, j, length
cnp.int32_t start,end,length
Py_ssize_t i
cnp.ndarray[cnp.float64_t, ndim=1] pvals

for i in range(indptr.shape[0] - 1):
start, end = indptr[i], indptr[i+1]
length = end - start
counts_sum = data[start:end].sum()

if with_replacement:
pvals = data[start:end] / counts_sum
data[start:end] = rng.multinomial(n, pvals)
else:
if counts_sum < n:
data[start:end] = 0
continue

r = np.arange(length, dtype=np.int32)
unpacked = np.repeat(r, data_i[start:end])
permuted = rng.permutation(unpacked)[:n]

result = np.zeros(length, dtype=np.float64)
for idx in range(permuted.shape[0]):
result[permuted[idx]] += 1

data[start:end] = result
pvals = data[start:end] / counts_sum
data[start:end] = rng.multinomial(n, pvals)


cdef _subsample_without_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
cnp.ndarray[cnp.int32_t, ndim=1] indptr,
cnp.int64_t n,
object rng):
"""Subsample non-zero values of a sparse array w/out replacement
Parameters
----------
data : {csr_matrix, csc_matrix}.data
A 1xM sparse vector data
indptr : {csr_matrix, csc_matrix}.indptr
A 1xM sparse vector indptr
n : int
Number of items to subsample from `arr`
rng : Generator instance
A random generator. This will likely be an instance returned
by np.random.default_rng
Returns
-------
ndarray
Subsampled data
Notes
-----
This code was adapted from scikit-bio (`skbio.math._subsample`)
"""
cdef:
cnp.int64_t counts_sum, count_el, perm_count_el
cnp.int64_t count_rem
cnp.ndarray[cnp.int64_t, ndim=1] permuted
Py_ssize_t i, idx
cnp.int32_t length,el,start,end

for i in range(indptr.shape[0] - 1):
start, end = indptr[i], indptr[i+1]
length = end - start
counts_sum = data[start:end].sum()

if counts_sum < n:
data[start:end] = 0
continue

permuted = rng.choice(counts_sum, n, replace=False, shuffle=False)
permuted.sort()

# now need to do reverse mapping
# since I am not using np.repeat anymore
# reminder, old logic was
# r = np.arange(length)
# unpacked = np.repeat(r, data_i[start:end])
# permuted_unpacked = rng.choice(unpacked, n, replace=False, shuffle=False)
#
# specifically, what we're going to do here is randomly pick what elements within
# each sample to keep. this is analogous issuing the prior np.repeat call, and obtaining
# a random set of index positions for that resulting array. however, we do not need to
# perform the np.repeat call as we know the length of that resulting vector already,
# and additionally, we can compute the sample associated with an index in that array
# without constructing it.

el = 0 # index in result/data
count_el = 0 # index in permutted
count_rem = long(data[start]) # since each data has multiple els, sub count there
data[start] = 0.0
for idx in range(n):
perm_count_el = permuted[idx]
# the array is sorted, so just jump ahead
while (perm_count_el - count_el) >= count_rem:
count_el += count_rem
el += 1
count_rem = long(data[start+el])
data[start+el] = 0.0
count_rem -= (perm_count_el - count_el)
count_el = perm_count_el

data[start+el] += 1
# clean up tail elements
data[start+el+1:end] = 0.0


def _subsample(arr, n, with_replacement, rng):
"""Subsample non-zero values of a sparse array
Parameters
----------
arr : {csr_matrix, csc_matrix}
A 1xM sparse vector
n : int
Number of items to subsample from `arr`
with_replacement : bool
Whether to permute or use multinomial sampling
rng : Generator instance
A random generator. This will likely be an instance returned
by np.random.default_rng
Returns
-------
ndarray
Subsampled data
Notes
-----
This code was adapted from scikit-bio (`skbio.math._subsample`)
"""
if (with_replacement):
return _subsample_with_replacement(arr.data, arr.indptr, n, rng)
else:
return _subsample_without_replacement(arr.data, arr.indptr, n, rng)

0 comments on commit d3f82fe

Please sign in to comment.