Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

In memory one off #2

Merged
merged 31 commits into from
Mar 21, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
0774b7e
expose an in-mem one_off
wasade Mar 11, 2022
3efe505
load from scipy.sparse like data
wasade Mar 12, 2022
114591c
comment on from scipy.sparse
wasade Mar 12, 2022
8663ed6
use biom not biom_interface
wasade Mar 12, 2022
e9e5b36
limit one_off_inmem to c++
wasade Mar 14, 2022
cebd37b
do not extern inmem c-style
wasade Mar 14, 2022
6fcfb16
Bring over a few additional headers
wasade Mar 14, 2022
26aa816
add nullary constructors
wasade Mar 15, 2022
4def03f
more conservative destructor, use pointers
wasade Mar 15, 2022
1ca1215
Actually copy the ids
wasade Mar 15, 2022
c5056b1
address many of @sfiligoi's comments
wasade Mar 16, 2022
47c579e
nearly there
wasade Mar 16, 2022
919956e
close...
wasade Mar 16, 2022
de83beb
use reserve rather than resize
wasade Mar 16, 2022
66a09e3
kick ci
wasade Mar 16, 2022
e8a8fa0
move status enums to their own header
wasade Mar 16, 2022
e192e9a
define in situ
wasade Mar 16, 2022
01f1d67
remove unnecessary includes
wasade Mar 16, 2022
f35eac2
Address @sfiligoi's comments
wasade Mar 16, 2022
3059f2a
remove commted out function
wasade Mar 16, 2022
d103920
EXTERN on a few support methods
wasade Mar 16, 2022
88bc35c
sync declaration
wasade Mar 16, 2022
aa9523d
omp pragmas
wasade Mar 17, 2022
044fe19
parallelize get_sample_counts
wasade Mar 17, 2022
d53283a
remove much of one level of copying
wasade Mar 18, 2022
f2826d1
inmem now full matrix, and fix input types for biom
wasade Mar 18, 2022
f37912b
minor change to resize and some parallel w/ bptree
wasade Mar 18, 2022
0f98a90
remove unneeded parallels
wasade Mar 19, 2022
0252938
Allow the caller to allocate results
wasade Mar 21, 2022
4f1ab8a
partial maybe is sensitive
wasade Mar 21, 2022
80aeab0
Addressing @sfiligoi's comments
wasade Mar 21, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 34 additions & 25 deletions src/api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,22 +48,25 @@
return err; \
}

#define PARSE_SYNC_TREE_TABLE(tree_filename, table_filename) std::ifstream ifs(tree_filename); \
std::string content = std::string(std::istreambuf_iterator<char>(ifs), \
std::istreambuf_iterator<char>()); \
su::BPTree tree = su::BPTree(content); \
su::biom table = su::biom(biom_filename); \
if(table.n_samples <= 0 | table.n_obs <= 0) { \
return table_empty; \
} \
std::string bad_id = su::test_table_ids_are_subset_of_tree(table, tree); \
if(bad_id != "") { \
return table_and_tree_do_not_overlap; \
} \
std::unordered_set<std::string> to_keep(table.obs_ids.begin(), \
table.obs_ids.end()); \
su::BPTree tree_sheared = tree.shear(to_keep).collapse();

#define SYNC_TREE_TABLE(tree, table) std::unordered_set<std::string> to_keep(table.obs_ids.begin(), \
table.obs_ids.end()); \
su::BPTree tree_sheared = tree.shear(to_keep).collapse();

#define PARSE_TREE_TABLE(tree_filename, table_filename) std::ifstream ifs(tree_filename); \
std::string content = std::string(std::istreambuf_iterator<char>(ifs), \
std::istreambuf_iterator<char>()); \
su::BPTree tree = su::BPTree(content); \
su::biom table = su::biom(biom_filename); \
if(table.n_samples <= 0 | table.n_obs <= 0) { \
return table_empty; \
} \
std::string bad_id = su::test_table_ids_are_subset_of_tree(table, tree); \
if(bad_id != "") { \
return table_and_tree_do_not_overlap; \
}

#define PARSE_SYNC_TREE_TABLE(tree_filename, table_filename) PARSE_TREE_TABLE(tree_filename, table_filename) \
SYNC_TREE_TABLE(tree, table)

using namespace su;
using namespace std;
Expand Down Expand Up @@ -98,7 +101,7 @@ void destroy_stripes(vector<double*> &dm_stripes, vector<double*> &dm_stripes_to
}


void initialize_mat(mat_t* &result, biom &table, bool is_upper_triangle) {
void initialize_mat(mat_t* &result, biom_interface &table, bool is_upper_triangle) {
wasade marked this conversation as resolved.
Show resolved Hide resolved
result = (mat_t*)malloc(sizeof(mat));
result->n_samples = table.n_samples;

Expand All @@ -115,7 +118,7 @@ void initialize_mat(mat_t* &result, biom &table, bool is_upper_triangle) {
}
}

void initialize_results_vec(r_vec* &result, biom& table){
void initialize_results_vec(r_vec* &result, biom &table){
// Stores results for Faith PD
result = (r_vec*)malloc(sizeof(results_vec));
result->n_samples = table.n_samples;
Expand Down Expand Up @@ -401,14 +404,11 @@ compute_status faith_pd_one_off(const char* biom_filename, const char* tree_file
return okay;
}

compute_status one_off(const char* biom_filename, const char* tree_filename,
const char* unifrac_method, bool variance_adjust, double alpha,
bool bypass_tips, unsigned int nthreads, mat_t** result) {

CHECK_FILE(biom_filename, table_missing)
CHECK_FILE(tree_filename, tree_missing)
compute_status one_off_inmem(su::biom_interface &table, su::BPTree &tree,
const char* unifrac_method, bool variance_adjust, double alpha,
bool bypass_tips, unsigned int nthreads, mat_t** result) {
wasade marked this conversation as resolved.
Show resolved Hide resolved
SET_METHOD(unifrac_method, unknown_method)
PARSE_SYNC_TREE_TABLE(tree_filename, table_filename)
SYNC_TREE_TABLE(tree, table)

const unsigned int stripe_stop = (table.n_samples + 1) / 2;
std::vector<double*> dm_stripes(stripe_stop);
Expand All @@ -435,6 +435,15 @@ compute_status one_off(const char* biom_filename, const char* tree_filename,
return okay;
}

compute_status one_off(const char* biom_filename, const char* tree_filename,
const char* unifrac_method, bool variance_adjust, double alpha,
bool bypass_tips, unsigned int nthreads, mat_t** result) {
CHECK_FILE(biom_filename, table_missing)
CHECK_FILE(tree_filename, tree_missing)
PARSE_TREE_TABLE(tree_filename, table_filename)
return one_off_inmem(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result);
}

// TMat mat_full_fp32_t
template<class TReal, class TMat>
compute_status one_off_matrix_T(const char* biom_filename, const char* tree_filename,
Expand Down
23 changes: 23 additions & 0 deletions src/api.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#include "task_parameters.hpp"
#include "biom_interface.hpp"
#include "tree.hpp"
wasade marked this conversation as resolved.
Show resolved Hide resolved

#ifdef __cplusplus
#include <vector>
Expand Down Expand Up @@ -154,6 +156,27 @@ EXTERN ComputeStatus one_off(const char* biom_filename, const char* tree_filenam
const char* unifrac_method, bool variance_adjust, double alpha,
bool bypass_tips, unsigned int threads, mat_t** result);

/* Compute UniFrac - condensed form, in-memory
*
* table <biom> a constructed BIOM object
* tree <BPTree> a constructed BPTree object
* unifrac_method <const char*> the requested unifrac method.
* variance_adjust <bool> whether to apply variance adjustment.
* alpha <double> GUniFrac alpha, only relevant if method == generalized.
* bypass_tips <bool> disregard tips, reduces compute by about 50%
* threads <uint> the number of threads to use.
* result <mat_t**> the resulting distance matrix in condensed form, this is initialized within the method so using **
*
* one_off_inmem returns the following error codes:
*
* okay : no problems encountered
* unknown_method : the requested method is unknown.
* table_empty : the table does not have any entries
*/
EXTERN ComputeStatus one_off_inmem(su::biom_interface &table, su::BPTree &tree,
wasade marked this conversation as resolved.
Show resolved Hide resolved
const char* unifrac_method, bool variance_adjust, double alpha,
bool bypass_tips, unsigned int threads, mat_t** result);

/* Compute UniFrac - matrix form
*
* biom_filename <const char*> the filename to the biom table.
Expand Down
122 changes: 111 additions & 11 deletions src/biom.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ const std::string SAMPLE_DATA = std::string("/sample/matrix/data");
const std::string SAMPLE_IDS = std::string("/sample/ids");

biom::biom(std::string filename) {
has_hdf5_backing = true;
wasade marked this conversation as resolved.
Show resolved Hide resolved

file = H5File(filename.c_str(), H5F_ACC_RDONLY);

/* establish the datasets */
Expand Down Expand Up @@ -58,6 +60,31 @@ biom::biom(std::string filename) {
create_id_index(obs_ids, obs_id_index);
create_id_index(sample_ids, sample_id_index);

malloc_resident(n_obs);

uint32_t *current_indices = NULL;
double *current_data = NULL;
for(unsigned int i = 0; i < obs_ids.size(); i++) {
std::string id_ = obs_ids[i];
unsigned int n = get_obs_data_direct(id_, current_indices, current_data);
obs_counts_resident[i] = n;
obs_indices_resident[i] = current_indices;
obs_data_resident[i] = current_data;
}
sample_counts = get_sample_counts();
}

biom::~biom() {
for(unsigned int i = 0; i < n_obs; i++) {
free(obs_indices_resident[i]);
free(obs_data_resident[i]);
}
free(obs_indices_resident);
free(obs_data_resident);
free(obs_counts_resident);
}

void biom::malloc_resident(uint32_t n_obs) {
/* load obs sparse data */
obs_indices_resident = (uint32_t**)malloc(sizeof(uint32_t**) * n_obs);
if(obs_indices_resident == NULL) {
Expand All @@ -77,30 +104,46 @@ biom::biom(std::string filename) {
sizeof(unsigned int) * n_obs, __FILE__, __LINE__);
exit(EXIT_FAILURE);
}
}


biom::biom(const std::vector<std::string> &obs_ids,
const std::vector<std::string> &samp_ids,
const std::vector<uint32_t> &index,
const std::vector<uint32_t> &indptr,
const std::vector<double> &data) {
wasade marked this conversation as resolved.
Show resolved Hide resolved
nnz = data.size();
n_samples = samp_ids.size();
n_obs = obs_ids.size();

/* define a mapping between an ID and its corresponding offset */
obs_id_index = std::unordered_map<std::string, uint32_t>();
sample_id_index = std::unordered_map<std::string, uint32_t>();

create_id_index(obs_ids, obs_id_index);
create_id_index(samp_ids, sample_id_index);

malloc_resident(n_obs);

uint32_t *current_indices = NULL;
double *current_data = NULL;
for(unsigned int i = 0; i < obs_ids.size(); i++) {
std::string id_ = obs_ids[i];
unsigned int n = get_obs_data_direct(id_, current_indices, current_data);
unsigned int n = get_obs_data_from_sparse(id_, index, indptr, data, current_indices, current_data);
obs_counts_resident[i] = n;
obs_indices_resident[i] = current_indices;
obs_data_resident[i] = current_data;
}
sample_counts = get_sample_counts();
}

biom::~biom() {
for(unsigned int i = 0; i < n_obs; i++) {
free(obs_indices_resident[i]);
free(obs_data_resident[i]);
void biom::set_nnz() {
if(!has_hdf5_backing) {
fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
__FILE__, __LINE__);
exit(EXIT_FAILURE);
}
free(obs_indices_resident);
free(obs_data_resident);
free(obs_counts_resident);
}

void biom::set_nnz() {
// should these be cached?
DataType dtype = obs_data.getDataType();
DataSpace dataspace = obs_data.getSpace();
Expand All @@ -111,6 +154,12 @@ void biom::set_nnz() {
}

void biom::load_ids(const char *path, std::vector<std::string> &ids) {
if(!has_hdf5_backing) {
fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
__FILE__, __LINE__);
exit(EXIT_FAILURE);
}

DataSet ds_ids = file.openDataSet(path);
DataType dtype = ds_ids.getDataType();
DataSpace dataspace = ds_ids.getSpace();
Expand Down Expand Up @@ -138,6 +187,12 @@ void biom::load_ids(const char *path, std::vector<std::string> &ids) {
}

void biom::load_indptr(const char *path, std::vector<uint32_t> &indptr) {
if(!has_hdf5_backing) {
fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
__FILE__, __LINE__);
exit(EXIT_FAILURE);
}

DataSet ds = file.openDataSet(path);
DataType dtype = ds.getDataType();
DataSpace dataspace = ds.getSpace();
Expand All @@ -159,7 +214,7 @@ void biom::load_indptr(const char *path, std::vector<uint32_t> &indptr) {
free(dataout);
}

void biom::create_id_index(std::vector<std::string> &ids,
void biom::create_id_index(const std::vector<std::string> &ids,
std::unordered_map<std::string, uint32_t> &map) {
uint32_t count = 0;
map.reserve(ids.size());
Expand All @@ -168,7 +223,46 @@ void biom::create_id_index(std::vector<std::string> &ids,
}
}

unsigned int biom::get_obs_data_from_sparse(const std::string &id_,
const std::vector<uint32_t> &index,
const std::vector<uint32_t> &indptr,
const std::vector<double> &data,
uint32_t *& current_indices_out,
double *& current_data_out) {
uint32_t idx = obs_id_index.at(id_);
uint32_t start = indptr[idx];
uint32_t end = indptr[idx + 1];
unsigned int count = end - start;

current_indices_out = (uint32_t*)malloc(sizeof(uint32_t) * count);
if(current_indices_out == NULL) {
fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n",
sizeof(uint32_t) * count, __FILE__, __LINE__);
exit(EXIT_FAILURE);
}

current_data_out = (double*)malloc(sizeof(double) * count);
if(current_data_out == NULL) {
fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n",
sizeof(double) * count, __FILE__, __LINE__);
exit(EXIT_FAILURE);
}

for(int i = 0, offset = start ; offset < end; i++, offset++) {
current_indices_out[i] = index[offset];
current_data_out[i] = data[offset];
}

return count;
}

unsigned int biom::get_obs_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out) {
if(!has_hdf5_backing) {
fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
__FILE__, __LINE__);
exit(EXIT_FAILURE);
}

uint32_t idx = obs_id_index.at(id);
uint32_t start = obs_indptr[idx];
uint32_t end = obs_indptr[idx + 1];
Expand Down Expand Up @@ -270,6 +364,12 @@ void biom::get_obs_data_range(const std::string &id, unsigned int start, unsigne
}

unsigned int biom::get_sample_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out) {
if(!has_hdf5_backing) {
fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
__FILE__, __LINE__);
exit(EXIT_FAILURE);
}

uint32_t idx = sample_id_index.at(id);
uint32_t start = sample_indptr[idx];
uint32_t end = sample_indptr[idx + 1];
Expand Down
30 changes: 29 additions & 1 deletion src/biom.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,20 @@ namespace su {
*/
biom(std::string filename);

/* constructor from compress sparse data
*
* @param obs_ids vector of observation identifiers
* @param samp_ids vector of sample identifiers
* @param index vector of index positions
* @param indptr vector of indptr positions
* @param data vector of observation counts
*/
biom(const std::vector<std::string> &obs_ids,
const std::vector<std::string> &samp_ids,
const std::vector<uint32_t> &index,
const std::vector<uint32_t> &indptr,
const std::vector<double> &data);

/* default destructor
*
* Temporary arrays are freed
Expand Down Expand Up @@ -57,6 +71,8 @@ namespace su {
void get_obs_data_range(const std::string &id, unsigned int start, unsigned int end, bool normalize, float* out) const;

private:
bool has_hdf5_backing = false;

/* retain DataSet handles within the HDF5 file */
H5::DataSet obs_indices;
H5::DataSet sample_indices;
Expand All @@ -66,11 +82,23 @@ namespace su {
uint32_t **obs_indices_resident;
double **obs_data_resident;
unsigned int *obs_counts_resident;

void malloc_resident(uint32_t n_obs);

/* allow load from scipy.sparse like data
* WARNING: assumes data are CSR, with observations as rows
wasade marked this conversation as resolved.
Show resolved Hide resolved
*/
unsigned int get_obs_data_from_sparse(const std::string &id_,
const std::vector<uint32_t> &index,
const std::vector<uint32_t> &indptr,
const std::vector<double> &data,
uint32_t *& current_indices_out,
double *& current_data_out);
unsigned int get_obs_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out);
unsigned int get_sample_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out);
double* get_sample_counts();


/* At construction, lookups mapping IDs -> index position within an
* axis are defined
*/
Expand Down Expand Up @@ -100,7 +128,7 @@ namespace su {
* @param ids A vector of IDs to index
* @param map A hash table to populate
*/
void create_id_index(std::vector<std::string> &ids,
void create_id_index(const std::vector<std::string> &ids,
std::unordered_map<std::string, uint32_t> &map);


Expand Down
Loading