Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use rng.choice without unpacked in subsample_without_replacement with 64-bit support #935

Merged
merged 15 commits into from
Jul 28, 2023
129 changes: 101 additions & 28 deletions biom/_subsample.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,18 @@
# The full license is in the file COPYING.txt, distributed with this software.
# -----------------------------------------------------------------------------


import numpy as np
cimport numpy as cnp


def _subsample(arr, n, with_replacement, rng):
"""Subsample non-zero values of a sparse array
def _subsample_with_replacement(arr, n, rng):
sfiligoi marked this conversation as resolved.
Show resolved Hide resolved
"""Subsample non-zero values of a sparse array with replacement

Parameters
----------
arr : {csr_matrix, csc_matrix}
A 1xM sparse vector
n : int
Number of items to subsample from `arr`
with_replacement : bool
Whether to permute or use multinomial sampling
rng : Generator instance
A random generator. This will likely be an instance returned
by np.random.default_rng
Expand All @@ -39,33 +35,110 @@ def _subsample(arr, n, with_replacement, rng):
cdef:
cnp.int64_t counts_sum
cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data
cnp.ndarray[cnp.int64_t, ndim=1] data_i = arr.data.astype(np.int64)
cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr
Py_ssize_t i, length

for i in range(indptr.shape[0] - 1):
start, end = indptr[i], indptr[i+1]
length = end - start
counts_sum = data[start:end].sum()

pvals = data[start:end] / counts_sum
data[start:end] = rng.multinomial(n, pvals)


def _subsample_without_replacement(arr, n, rng):
"""Subsample non-zero values of a sparse array w/out replacement

Parameters
----------
arr : {csr_matrix, csc_matrix}
A 1xM sparse vector
n : int
Number of items to subsample from `arr`
rng : Generator instance
A random generator. This will likely be an instance returned
by np.random.default_rng

Returns
-------
ndarray
Subsampled data

Notes
-----
This code was adapted from scikit-bio (`skbio.math._subsample`)

"""
cdef:
cnp.int64_t counts_sum,idx,count_el, perm_count_ela
sfiligoi marked this conversation as resolved.
Show resolved Hide resolved
cnp.int64_t count_rem
cnp.int64_t cn = n
sfiligoi marked this conversation as resolved.
Show resolved Hide resolved
cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data
cnp.ndarray[cnp.float64_t, ndim=1] result
cnp.ndarray[cnp.int32_t, ndim=1] indices = arr.indices
cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr
cnp.ndarray[cnp.int32_t, ndim=1] permuted, unpacked, r
cnp.float64_t cnt
Py_ssize_t i, j, length
cnp.ndarray[cnp.int64_t, ndim=1] permuted
Py_ssize_t i
cnp.int32_t length,el

for i in range(indptr.shape[0] - 1):
start, end = indptr[i], indptr[i+1]
length = end - start
counts_sum = data[start:end].sum()

if with_replacement:
pvals = data[start:end] / counts_sum
data[start:end] = rng.multinomial(n, pvals)
else:
if counts_sum < n:
data[start:end] = 0
continue

r = np.arange(length, dtype=np.int32)
unpacked = np.repeat(r, data_i[start:end])
permuted = rng.permutation(unpacked)[:n]

result = np.zeros(length, dtype=np.float64)
for idx in range(permuted.shape[0]):
result[permuted[idx]] += 1

data[start:end] = result
if counts_sum < cn:
data[start:end] = 0
continue

permuted = rng.choice(counts_sum,cn,replace=False,shuffle=False)
sfiligoi marked this conversation as resolved.
Show resolved Hide resolved
permuted.sort()

# now need to do reverse mapping
result = np.zeros(length, dtype=np.float64)
el=0 # index in result/data
count_el =0 # index in permutted
count_rem=data[start] # since each data has multiple els, sub count there
sfiligoi marked this conversation as resolved.
Show resolved Hide resolved
for idx in range(cn):
sfiligoi marked this conversation as resolved.
Show resolved Hide resolved
perm_count_el = permuted[idx]
sfiligoi marked this conversation as resolved.
Show resolved Hide resolved
# the array is sorted, so just jump ahead
while (perm_count_el-count_el) >= count_rem:
count_el += count_rem
el += 1
count_rem = data[start+el]
sfiligoi marked this conversation as resolved.
Show resolved Hide resolved
count_rem -= (perm_count_el-count_el)
sfiligoi marked this conversation as resolved.
Show resolved Hide resolved
count_el = perm_count_el

result[el] += 1

data[start:end] = result


def _subsample(arr, n, with_replacement, rng):
"""Subsample non-zero values of a sparse array

Parameters
----------
arr : {csr_matrix, csc_matrix}
A 1xM sparse vector
n : int
Number of items to subsample from `arr`
with_replacement : bool
Whether to permute or use multinomial sampling
rng : Generator instance
A random generator. This will likely be an instance returned
by np.random.default_rng

Returns
-------
ndarray
Subsampled data

Notes
-----
This code was adapted from scikit-bio (`skbio.math._subsample`)

"""
if (with_replacement):
return _subsample_with_replacement(arr, n, rng)
else:
return _subsample_without_replacement(arr, n, rng)