Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use rng.choice without unpacked in subsample_without_replacement with 64-bit support #935

Merged
merged 15 commits into from
Jul 28, 2023
152 changes: 119 additions & 33 deletions biom/_subsample.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,23 @@
# The full license is in the file COPYING.txt, distributed with this software.
# -----------------------------------------------------------------------------


import numpy as np
cimport numpy as cnp


def _subsample(arr, n, with_replacement, rng):
"""Subsample non-zero values of a sparse array
cdef _subsample_with_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
cnp.ndarray[cnp.int32_t, ndim=1] indptr,
cnp.int64_t n,
object rng):
"""Subsample non-zero values of a sparse array with replacement

Parameters
----------
arr : {csr_matrix, csc_matrix}
A 1xM sparse vector
data : {csr_matrix, csc_matrix}.data
A 1xM sparse vector data
indptr : {csr_matrix, csc_matrix}.indptr
A 1xM sparse vector indptr
n : int
Number of items to subsample from `arr`
with_replacement : bool
Whether to permute or use multinomial sampling
rng : Generator instance
A random generator. This will likely be an instance returned
by np.random.default_rng
Expand All @@ -38,34 +39,119 @@ def _subsample(arr, n, with_replacement, rng):
"""
cdef:
cnp.int64_t counts_sum
cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data
cnp.ndarray[cnp.int64_t, ndim=1] data_i = arr.data.astype(np.int64)
cnp.ndarray[cnp.float64_t, ndim=1] result
cnp.ndarray[cnp.int32_t, ndim=1] indices = arr.indices
cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr
cnp.ndarray[cnp.int32_t, ndim=1] permuted, unpacked, r
cnp.float64_t cnt
Py_ssize_t i, j, length
cnp.int32_t start,end,length
Py_ssize_t i
cnp.ndarray[cnp.float64_t, ndim=1] pvals

for i in range(indptr.shape[0] - 1):
start, end = indptr[i], indptr[i+1]
length = end - start
counts_sum = data[start:end].sum()

if with_replacement:
pvals = data[start:end] / counts_sum
data[start:end] = rng.multinomial(n, pvals)
else:
if counts_sum < n:
data[start:end] = 0
continue

r = np.arange(length, dtype=np.int32)
unpacked = np.repeat(r, data_i[start:end])
permuted = rng.permutation(unpacked)[:n]

result = np.zeros(length, dtype=np.float64)
for idx in range(permuted.shape[0]):
result[permuted[idx]] += 1

data[start:end] = result
pvals = data[start:end] / counts_sum
data[start:end] = rng.multinomial(n, pvals)


cdef _subsample_without_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
cnp.ndarray[cnp.int32_t, ndim=1] indptr,
cnp.int64_t n,
object rng):
"""Subsample non-zero values of a sparse array w/out replacement

Parameters
----------
data : {csr_matrix, csc_matrix}.data
A 1xM sparse vector data
indptr : {csr_matrix, csc_matrix}.indptr
A 1xM sparse vector indptr
n : int
Number of items to subsample from `arr`
rng : Generator instance
A random generator. This will likely be an instance returned
by np.random.default_rng

Returns
-------
ndarray
Subsampled data

Notes
-----
This code was adapted from scikit-bio (`skbio.math._subsample`)

"""
cdef:
cnp.int64_t counts_sum, count_el, perm_count_el
cnp.int64_t count_rem
cnp.ndarray[cnp.int64_t, ndim=1] permuted
Py_ssize_t i, idx
cnp.int32_t length,el,start,end

for i in range(indptr.shape[0] - 1):
start, end = indptr[i], indptr[i+1]
length = end - start
counts_sum = data[start:end].sum()

if counts_sum < n:
data[start:end] = 0
continue

permuted = rng.choice(counts_sum, n, replace=False, shuffle=False)
permuted.sort()

# now need to do reverse mapping
# since I am not using np.repeat anymore
# reminder, old logic was
# r = np.arange(length)
# unpacked = np.repeat(r, data_i[start:end])
# permuted_unpacked = rng.choice(unpacked, n, replace=False, shuffle=False)
sfiligoi marked this conversation as resolved.
Show resolved Hide resolved

el = 0 # index in result/data
count_el = 0 # index in permutted
count_rem = long(data[start]) # since each data has multiple els, sub count there
data[start] = 0.0
for idx in range(n):
perm_count_el = permuted[idx]
sfiligoi marked this conversation as resolved.
Show resolved Hide resolved
# the array is sorted, so just jump ahead
while (perm_count_el - count_el) >= count_rem:
count_el += count_rem
el += 1
count_rem = long(data[start+el])
data[start+el] = 0.0
count_rem -= (perm_count_el - count_el)
count_el = perm_count_el

data[start+el] += 1
# clean up tail elements
data[start+el+1:end] = 0.0


def _subsample(arr, n, with_replacement, rng):
"""Subsample non-zero values of a sparse array

Parameters
----------
arr : {csr_matrix, csc_matrix}
A 1xM sparse vector
n : int
Number of items to subsample from `arr`
with_replacement : bool
Whether to permute or use multinomial sampling
rng : Generator instance
A random generator. This will likely be an instance returned
by np.random.default_rng

Returns
-------
ndarray
Subsampled data

Notes
-----
This code was adapted from scikit-bio (`skbio.math._subsample`)

"""
if (with_replacement):
return _subsample_with_replacement(arr.data, arr.indptr, n, rng)
else:
return _subsample_without_replacement(arr.data, arr.indptr, n, rng)