From 0774b7e44d8dd1d790432db21ed3f0f80ddcd2b4 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 11 Mar 2022 15:20:20 -0800 Subject: [PATCH 01/31] expose an in-mem one_off --- src/api.cpp | 59 ++++++++++++++++++++++++++++++----------------------- src/api.hpp | 23 +++++++++++++++++++++ 2 files changed, 57 insertions(+), 25 deletions(-) diff --git a/src/api.cpp b/src/api.cpp index 4566ca7..9ca718b 100644 --- a/src/api.cpp +++ b/src/api.cpp @@ -48,22 +48,25 @@ return err; \ } -#define PARSE_SYNC_TREE_TABLE(tree_filename, table_filename) std::ifstream ifs(tree_filename); \ - std::string content = std::string(std::istreambuf_iterator(ifs), \ - std::istreambuf_iterator()); \ - su::BPTree tree = su::BPTree(content); \ - su::biom table = su::biom(biom_filename); \ - if(table.n_samples <= 0 | table.n_obs <= 0) { \ - return table_empty; \ - } \ - std::string bad_id = su::test_table_ids_are_subset_of_tree(table, tree); \ - if(bad_id != "") { \ - return table_and_tree_do_not_overlap; \ - } \ - std::unordered_set to_keep(table.obs_ids.begin(), \ - table.obs_ids.end()); \ - su::BPTree tree_sheared = tree.shear(to_keep).collapse(); - +#define SYNC_TREE_TABLE(tree, table) std::unordered_set to_keep(table.obs_ids.begin(), \ + table.obs_ids.end()); \ + su::BPTree tree_sheared = tree.shear(to_keep).collapse(); + +#define PARSE_TREE_TABLE(tree_filename, table_filename) std::ifstream ifs(tree_filename); \ + std::string content = std::string(std::istreambuf_iterator(ifs), \ + std::istreambuf_iterator()); \ + su::BPTree tree = su::BPTree(content); \ + su::biom table = su::biom(biom_filename); \ + if(table.n_samples <= 0 | table.n_obs <= 0) { \ + return table_empty; \ + } \ + std::string bad_id = su::test_table_ids_are_subset_of_tree(table, tree); \ + if(bad_id != "") { \ + return table_and_tree_do_not_overlap; \ + } + +#define PARSE_SYNC_TREE_TABLE(tree_filename, table_filename) PARSE_TREE_TABLE(tree_filename, table_filename) \ + SYNC_TREE_TABLE(tree, table) using namespace su; using namespace std; @@ -98,7 +101,7 @@ void destroy_stripes(vector &dm_stripes, vector &dm_stripes_to } -void initialize_mat(mat_t* &result, biom &table, bool is_upper_triangle) { +void initialize_mat(mat_t* &result, biom_interface &table, bool is_upper_triangle) { result = (mat_t*)malloc(sizeof(mat)); result->n_samples = table.n_samples; @@ -115,7 +118,7 @@ void initialize_mat(mat_t* &result, biom &table, bool is_upper_triangle) { } } -void initialize_results_vec(r_vec* &result, biom& table){ +void initialize_results_vec(r_vec* &result, biom &table){ // Stores results for Faith PD result = (r_vec*)malloc(sizeof(results_vec)); result->n_samples = table.n_samples; @@ -401,14 +404,11 @@ compute_status faith_pd_one_off(const char* biom_filename, const char* tree_file return okay; } -compute_status one_off(const char* biom_filename, const char* tree_filename, - const char* unifrac_method, bool variance_adjust, double alpha, - bool bypass_tips, unsigned int nthreads, mat_t** result) { - - CHECK_FILE(biom_filename, table_missing) - CHECK_FILE(tree_filename, tree_missing) +compute_status one_off_inmem(su::biom_interface &table, su::BPTree &tree, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int nthreads, mat_t** result) { SET_METHOD(unifrac_method, unknown_method) - PARSE_SYNC_TREE_TABLE(tree_filename, table_filename) + SYNC_TREE_TABLE(tree, table) const unsigned int stripe_stop = (table.n_samples + 1) / 2; std::vector dm_stripes(stripe_stop); @@ -435,6 +435,15 @@ compute_status one_off(const char* biom_filename, const char* tree_filename, return okay; } +compute_status one_off(const char* biom_filename, const char* tree_filename, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int nthreads, mat_t** result) { + CHECK_FILE(biom_filename, table_missing) + CHECK_FILE(tree_filename, tree_missing) + PARSE_TREE_TABLE(tree_filename, table_filename) + return one_off_inmem(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result); +} + // TMat mat_full_fp32_t template compute_status one_off_matrix_T(const char* biom_filename, const char* tree_filename, diff --git a/src/api.hpp b/src/api.hpp index be3bd63..8f05773 100644 --- a/src/api.hpp +++ b/src/api.hpp @@ -1,4 +1,6 @@ #include "task_parameters.hpp" +#include "biom_interface.hpp" +#include "tree.hpp" #ifdef __cplusplus #include @@ -154,6 +156,27 @@ EXTERN ComputeStatus one_off(const char* biom_filename, const char* tree_filenam const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int threads, mat_t** result); +/* Compute UniFrac - condensed form, in-memory + * + * table a constructed BIOM object + * tree a constructed BPTree object + * unifrac_method the requested unifrac method. + * variance_adjust whether to apply variance adjustment. + * alpha GUniFrac alpha, only relevant if method == generalized. + * bypass_tips disregard tips, reduces compute by about 50% + * threads the number of threads to use. + * result the resulting distance matrix in condensed form, this is initialized within the method so using ** + * + * one_off_inmem returns the following error codes: + * + * okay : no problems encountered + * unknown_method : the requested method is unknown. + * table_empty : the table does not have any entries + */ +EXTERN ComputeStatus one_off_inmem(su::biom_interface &table, su::BPTree &tree, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int threads, mat_t** result); + /* Compute UniFrac - matrix form * * biom_filename the filename to the biom table. From 3efe505742bd2c8d4530e42ab9a870d6100f5712 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 11 Mar 2022 16:44:13 -0800 Subject: [PATCH 02/31] load from scipy.sparse like data --- src/biom.cpp | 122 +++++++++++++++++++++++++++++++++++++++++++----- src/biom.hpp | 29 +++++++++++- src/test_su.cpp | 27 +++++++++-- 3 files changed, 161 insertions(+), 17 deletions(-) diff --git a/src/biom.cpp b/src/biom.cpp index a62a9e2..f4e8a76 100644 --- a/src/biom.cpp +++ b/src/biom.cpp @@ -27,6 +27,8 @@ const std::string SAMPLE_DATA = std::string("/sample/matrix/data"); const std::string SAMPLE_IDS = std::string("/sample/ids"); biom::biom(std::string filename) { + has_hdf5_backing = true; + file = H5File(filename.c_str(), H5F_ACC_RDONLY); /* establish the datasets */ @@ -58,6 +60,31 @@ biom::biom(std::string filename) { create_id_index(obs_ids, obs_id_index); create_id_index(sample_ids, sample_id_index); + malloc_resident(n_obs); + + uint32_t *current_indices = NULL; + double *current_data = NULL; + for(unsigned int i = 0; i < obs_ids.size(); i++) { + std::string id_ = obs_ids[i]; + unsigned int n = get_obs_data_direct(id_, current_indices, current_data); + obs_counts_resident[i] = n; + obs_indices_resident[i] = current_indices; + obs_data_resident[i] = current_data; + } + sample_counts = get_sample_counts(); +} + +biom::~biom() { + for(unsigned int i = 0; i < n_obs; i++) { + free(obs_indices_resident[i]); + free(obs_data_resident[i]); + } + free(obs_indices_resident); + free(obs_data_resident); + free(obs_counts_resident); +} + +void biom::malloc_resident(uint32_t n_obs) { /* load obs sparse data */ obs_indices_resident = (uint32_t**)malloc(sizeof(uint32_t**) * n_obs); if(obs_indices_resident == NULL) { @@ -77,12 +104,32 @@ biom::biom(std::string filename) { sizeof(unsigned int) * n_obs, __FILE__, __LINE__); exit(EXIT_FAILURE); } +} + + +biom::biom(const std::vector &obs_ids, + const std::vector &samp_ids, + const std::vector &index, + const std::vector &indptr, + const std::vector &data) { + nnz = data.size(); + n_samples = samp_ids.size(); + n_obs = obs_ids.size(); + + /* define a mapping between an ID and its corresponding offset */ + obs_id_index = std::unordered_map(); + sample_id_index = std::unordered_map(); + + create_id_index(obs_ids, obs_id_index); + create_id_index(samp_ids, sample_id_index); + + malloc_resident(n_obs); uint32_t *current_indices = NULL; double *current_data = NULL; for(unsigned int i = 0; i < obs_ids.size(); i++) { std::string id_ = obs_ids[i]; - unsigned int n = get_obs_data_direct(id_, current_indices, current_data); + unsigned int n = get_obs_data_from_sparse(id_, index, indptr, data, current_indices, current_data); obs_counts_resident[i] = n; obs_indices_resident[i] = current_indices; obs_data_resident[i] = current_data; @@ -90,17 +137,13 @@ biom::biom(std::string filename) { sample_counts = get_sample_counts(); } -biom::~biom() { - for(unsigned int i = 0; i < n_obs; i++) { - free(obs_indices_resident[i]); - free(obs_data_resident[i]); +void biom::set_nnz() { + if(!has_hdf5_backing) { + fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n", + __FILE__, __LINE__); + exit(EXIT_FAILURE); } - free(obs_indices_resident); - free(obs_data_resident); - free(obs_counts_resident); -} -void biom::set_nnz() { // should these be cached? DataType dtype = obs_data.getDataType(); DataSpace dataspace = obs_data.getSpace(); @@ -111,6 +154,12 @@ void biom::set_nnz() { } void biom::load_ids(const char *path, std::vector &ids) { + if(!has_hdf5_backing) { + fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n", + __FILE__, __LINE__); + exit(EXIT_FAILURE); + } + DataSet ds_ids = file.openDataSet(path); DataType dtype = ds_ids.getDataType(); DataSpace dataspace = ds_ids.getSpace(); @@ -138,6 +187,12 @@ void biom::load_ids(const char *path, std::vector &ids) { } void biom::load_indptr(const char *path, std::vector &indptr) { + if(!has_hdf5_backing) { + fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n", + __FILE__, __LINE__); + exit(EXIT_FAILURE); + } + DataSet ds = file.openDataSet(path); DataType dtype = ds.getDataType(); DataSpace dataspace = ds.getSpace(); @@ -159,7 +214,7 @@ void biom::load_indptr(const char *path, std::vector &indptr) { free(dataout); } -void biom::create_id_index(std::vector &ids, +void biom::create_id_index(const std::vector &ids, std::unordered_map &map) { uint32_t count = 0; map.reserve(ids.size()); @@ -168,7 +223,46 @@ void biom::create_id_index(std::vector &ids, } } +unsigned int biom::get_obs_data_from_sparse(const std::string &id_, + const std::vector &index, + const std::vector &indptr, + const std::vector &data, + uint32_t *& current_indices_out, + double *& current_data_out) { + uint32_t idx = obs_id_index.at(id_); + uint32_t start = indptr[idx]; + uint32_t end = indptr[idx + 1]; + unsigned int count = end - start; + + current_indices_out = (uint32_t*)malloc(sizeof(uint32_t) * count); + if(current_indices_out == NULL) { + fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", + sizeof(uint32_t) * count, __FILE__, __LINE__); + exit(EXIT_FAILURE); + } + + current_data_out = (double*)malloc(sizeof(double) * count); + if(current_data_out == NULL) { + fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", + sizeof(double) * count, __FILE__, __LINE__); + exit(EXIT_FAILURE); + } + + for(int i = 0, offset = start ; offset < end; i++, offset++) { + current_indices_out[i] = index[offset]; + current_data_out[i] = data[offset]; + } + + return count; +} + unsigned int biom::get_obs_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out) { + if(!has_hdf5_backing) { + fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n", + __FILE__, __LINE__); + exit(EXIT_FAILURE); + } + uint32_t idx = obs_id_index.at(id); uint32_t start = obs_indptr[idx]; uint32_t end = obs_indptr[idx + 1]; @@ -270,6 +364,12 @@ void biom::get_obs_data_range(const std::string &id, unsigned int start, unsigne } unsigned int biom::get_sample_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out) { + if(!has_hdf5_backing) { + fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n", + __FILE__, __LINE__); + exit(EXIT_FAILURE); + } + uint32_t idx = sample_id_index.at(id); uint32_t start = sample_indptr[idx]; uint32_t end = sample_indptr[idx + 1]; diff --git a/src/biom.hpp b/src/biom.hpp index 7e3fdb1..7e88773 100644 --- a/src/biom.hpp +++ b/src/biom.hpp @@ -27,6 +27,20 @@ namespace su { */ biom(std::string filename); + /* constructor from compress sparse data + * + * @param obs_ids vector of observation identifiers + * @param samp_ids vector of sample identifiers + * @param index vector of index positions + * @param indptr vector of indptr positions + * @param data vector of observation counts + */ + biom(const std::vector &obs_ids, + const std::vector &samp_ids, + const std::vector &index, + const std::vector &indptr, + const std::vector &data); + /* default destructor * * Temporary arrays are freed @@ -57,6 +71,8 @@ namespace su { void get_obs_data_range(const std::string &id, unsigned int start, unsigned int end, bool normalize, float* out) const; private: + bool has_hdf5_backing = false; + /* retain DataSet handles within the HDF5 file */ H5::DataSet obs_indices; H5::DataSet sample_indices; @@ -66,11 +82,20 @@ namespace su { uint32_t **obs_indices_resident; double **obs_data_resident; unsigned int *obs_counts_resident; - + + void malloc_resident(uint32_t n_obs); + + unsigned int get_obs_data_from_sparse(const std::string &id_, + const std::vector &index, + const std::vector &indptr, + const std::vector &data, + uint32_t *& current_indices_out, + double *& current_data_out); unsigned int get_obs_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out); unsigned int get_sample_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out); double* get_sample_counts(); + /* At construction, lookups mapping IDs -> index position within an * axis are defined */ @@ -100,7 +125,7 @@ namespace su { * @param ids A vector of IDs to index * @param map A hash table to populate */ - void create_id_index(std::vector &ids, + void create_id_index(const std::vector &ids, std::unordered_map &map); diff --git a/src/test_su.cpp b/src/test_su.cpp index 9ef3d33..d8f6dde 100644 --- a/src/test_su.cpp +++ b/src/test_su.cpp @@ -467,10 +467,7 @@ void test_biom_constructor() { SUITE_END(); } -void test_biom_get_obs_data() { - SUITE_START("biom get obs data"); - - su::biom table = su::biom("test.biom"); +void _exercise_get_obs_data(su::biom &table) { double exp0[] = {0.0, 0.0, 1.0, 0.0, 0.0, 0.0}; std::vector exp0_vec = _double_array_to_vector(exp0, 6); double exp1[] = {5.0, 1.0, 0.0, 2.0, 3.0, 1.0}; @@ -506,9 +503,30 @@ void test_biom_get_obs_data() { ASSERT(vec_almost_equal(obs_vec, exp4_vec)); free(out); +} + +void test_biom_constructor_from_sparse() { + SUITE_START("biom from sparse constructor"); + std::vector index = {2, 0, 1, 3, 4, 5, 2, 3, 5, 0, 1, 2, 5, 1, 2}; + std::vector indptr = {0, 1, 6, 9, 13, 15}; + std::vector data = {1., 5., 1., 2., 3., 1., 1., 4., 2., 2., 1., 1., 1., 1., 1.}; + std::vector obs_ids = {"GG_OTU_1", "GG_OTU_2", "GG_OTU_3", "GG_OTU_4", "GG_OTU_5"}; + std::vector samp_ids = {"Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6"}; + + su::biom table = su::biom(obs_ids, samp_ids, index, indptr, data); + _exercise_get_obs_data(table); SUITE_END(); } +void test_biom_get_obs_data() { + SUITE_START("biom get obs data"); + + su::biom table = su::biom("test.biom"); + _exercise_get_obs_data(table); + SUITE_END(); +} + + void test_bptree_leftchild() { SUITE_START("test bptree left child"); su::BPTree tree = su::BPTree("((3,4,(6)5)2,7,((10,100)9)8)1;"); @@ -1832,6 +1850,7 @@ int main(int argc, char** argv) { test_bptree_collapse_edge(); test_biom_constructor(); + test_biom_constructor_from_sparse(); test_biom_get_obs_data(); test_propstack_constructor(); From 114591c7f6ceaa6db4d5cf9cb70fd00b8131f445 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 11 Mar 2022 16:47:02 -0800 Subject: [PATCH 03/31] comment on from scipy.sparse --- src/biom.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/biom.hpp b/src/biom.hpp index 7e88773..8d0669a 100644 --- a/src/biom.hpp +++ b/src/biom.hpp @@ -85,6 +85,9 @@ namespace su { void malloc_resident(uint32_t n_obs); + /* allow load from scipy.sparse like data + * WARNING: assumes data are CSR, with observations as rows + */ unsigned int get_obs_data_from_sparse(const std::string &id_, const std::vector &index, const std::vector &indptr, From 8663ed61c1cf449e8c6630f3116d1d74e26755e9 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Sat, 12 Mar 2022 09:50:48 -0800 Subject: [PATCH 04/31] use biom not biom_interface --- src/api.cpp | 4 ++-- src/api.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/api.cpp b/src/api.cpp index 9ca718b..a6cb04d 100644 --- a/src/api.cpp +++ b/src/api.cpp @@ -101,7 +101,7 @@ void destroy_stripes(vector &dm_stripes, vector &dm_stripes_to } -void initialize_mat(mat_t* &result, biom_interface &table, bool is_upper_triangle) { +void initialize_mat(mat_t* &result, biom &table, bool is_upper_triangle) { result = (mat_t*)malloc(sizeof(mat)); result->n_samples = table.n_samples; @@ -404,7 +404,7 @@ compute_status faith_pd_one_off(const char* biom_filename, const char* tree_file return okay; } -compute_status one_off_inmem(su::biom_interface &table, su::BPTree &tree, +compute_status one_off_inmem(su::biom &table, su::BPTree &tree, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int nthreads, mat_t** result) { SET_METHOD(unifrac_method, unknown_method) diff --git a/src/api.hpp b/src/api.hpp index 8f05773..45185c9 100644 --- a/src/api.hpp +++ b/src/api.hpp @@ -1,6 +1,6 @@ #include "task_parameters.hpp" -#include "biom_interface.hpp" #include "tree.hpp" +#include "biom.hpp" #ifdef __cplusplus #include @@ -173,7 +173,7 @@ EXTERN ComputeStatus one_off(const char* biom_filename, const char* tree_filenam * unknown_method : the requested method is unknown. * table_empty : the table does not have any entries */ -EXTERN ComputeStatus one_off_inmem(su::biom_interface &table, su::BPTree &tree, +EXTERN ComputeStatus one_off_inmem(su::biom &table, su::BPTree &tree, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int threads, mat_t** result); From e9e5b36c516ba67b833d8f4096ecdc7f8f799c8c Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 14 Mar 2022 08:52:10 -0700 Subject: [PATCH 05/31] limit one_off_inmem to c++ --- src/api.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/api.hpp b/src/api.hpp index 45185c9..189c4b0 100644 --- a/src/api.hpp +++ b/src/api.hpp @@ -1,9 +1,9 @@ #include "task_parameters.hpp" -#include "tree.hpp" -#include "biom.hpp" #ifdef __cplusplus #include +#include "tree.hpp" +#include "biom.hpp" #define EXTERN extern "C" @@ -156,6 +156,7 @@ EXTERN ComputeStatus one_off(const char* biom_filename, const char* tree_filenam const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int threads, mat_t** result); + /* Compute UniFrac - condensed form, in-memory * * table a constructed BIOM object @@ -173,9 +174,11 @@ EXTERN ComputeStatus one_off(const char* biom_filename, const char* tree_filenam * unknown_method : the requested method is unknown. * table_empty : the table does not have any entries */ +#ifdef __cplusplus EXTERN ComputeStatus one_off_inmem(su::biom &table, su::BPTree &tree, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int threads, mat_t** result); +#endif /* Compute UniFrac - matrix form * From cebd37bbf6442a8b9b0a52f535963003ce15e5fd Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 14 Mar 2022 08:55:03 -0700 Subject: [PATCH 06/31] do not extern inmem c-style --- src/api.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/api.hpp b/src/api.hpp index 189c4b0..6040856 100644 --- a/src/api.hpp +++ b/src/api.hpp @@ -175,9 +175,9 @@ EXTERN ComputeStatus one_off(const char* biom_filename, const char* tree_filenam * table_empty : the table does not have any entries */ #ifdef __cplusplus -EXTERN ComputeStatus one_off_inmem(su::biom &table, su::BPTree &tree, - const char* unifrac_method, bool variance_adjust, double alpha, - bool bypass_tips, unsigned int threads, mat_t** result); +ComputeStatus one_off_inmem(su::biom &table, su::BPTree &tree, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int threads, mat_t** result); #endif /* Compute UniFrac - matrix form From 6fcfb1609fc904a1044015c719c5c4df4b0df616 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 14 Mar 2022 10:33:17 -0700 Subject: [PATCH 07/31] Bring over a few additional headers --- src/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Makefile b/src/Makefile index 2b97c05..4ee100e 100644 --- a/src/Makefile +++ b/src/Makefile @@ -135,6 +135,9 @@ install: libssu.so ssu faithpd mkdir -p ${PREFIX}/include/unifrac cp task_parameters.hpp ${PREFIX}/include/unifrac/ cp api.hpp ${PREFIX}/include/unifrac/ + cp biom.hpp ${PREFIX}/include/unifrac/ + cp tree.hpp ${PREFIX}/include/unifrac/ + cp biom_interface.hpp ${PREFIX}/include/unifrac/ rapi_test: main mkdir -p ~/.R From 26aa816344bc4e626d0876c3d61fa3659dfb9505 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 14 Mar 2022 17:56:15 -0700 Subject: [PATCH 08/31] add nullary constructors --- src/biom.cpp | 3 +++ src/biom.hpp | 3 +++ src/tree.cpp | 2 ++ src/tree.hpp | 3 +++ 4 files changed, 11 insertions(+) diff --git a/src/biom.cpp b/src/biom.cpp index f4e8a76..681b6e1 100644 --- a/src/biom.cpp +++ b/src/biom.cpp @@ -106,6 +106,9 @@ void biom::malloc_resident(uint32_t n_obs) { } } +biom::biom() { + malloc_resident(0); +} biom::biom(const std::vector &obs_ids, const std::vector &samp_ids, diff --git a/src/biom.hpp b/src/biom.hpp index 8d0669a..46b1bfb 100644 --- a/src/biom.hpp +++ b/src/biom.hpp @@ -21,6 +21,9 @@ namespace su { class biom : public biom_interface { public: + /* nullary constructor */ + biom(); + /* default constructor * * @param filename The path to the BIOM table to read diff --git a/src/tree.cpp b/src/tree.cpp index 3dc6f98..8b4a793 100644 --- a/src/tree.cpp +++ b/src/tree.cpp @@ -4,6 +4,8 @@ using namespace su; +BPTree::BPTree() { } + BPTree::BPTree(std::string newick) { openclose = std::vector(); lengths = std::vector(); diff --git a/src/tree.hpp b/src/tree.hpp index 99f61df..eb3ee60 100644 --- a/src/tree.hpp +++ b/src/tree.hpp @@ -19,6 +19,9 @@ namespace su { class BPTree { public: + /* nullary constructor */ + BPTree(); + /* tracked attributes */ std::vector lengths; std::vector names; From 4def03f0ec56bcc64716bf47f17800b37dfde07d Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 15 Mar 2022 09:51:02 -0700 Subject: [PATCH 09/31] more conservative destructor, use pointers --- src/api.cpp | 18 +++++++++--------- src/api.hpp | 2 +- src/biom.cpp | 21 +++++++++++++++------ src/test_su.cpp | 16 +++++++++++++++- 4 files changed, 40 insertions(+), 17 deletions(-) diff --git a/src/api.cpp b/src/api.cpp index a6cb04d..ffde792 100644 --- a/src/api.cpp +++ b/src/api.cpp @@ -404,13 +404,13 @@ compute_status faith_pd_one_off(const char* biom_filename, const char* tree_file return okay; } -compute_status one_off_inmem(su::biom &table, su::BPTree &tree, +compute_status one_off_inmem(su::biom *table, su::BPTree *tree, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int nthreads, mat_t** result) { SET_METHOD(unifrac_method, unknown_method) - SYNC_TREE_TABLE(tree, table) + SYNC_TREE_TABLE((*tree), (*table)) - const unsigned int stripe_stop = (table.n_samples + 1) / 2; + const unsigned int stripe_stop = (table->n_samples + 1) / 2; std::vector dm_stripes(stripe_stop); std::vector dm_stripes_total(stripe_stop); @@ -422,15 +422,15 @@ compute_status one_off_inmem(su::biom &table, su::BPTree &tree, std::vector tasks(nthreads); std::vector threads(nthreads); - set_tasks(tasks, alpha, table.n_samples, 0, stripe_stop, bypass_tips, nthreads); - su::process_stripes(table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks); + set_tasks(tasks, alpha, table->n_samples, 0, stripe_stop, bypass_tips, nthreads); + su::process_stripes(*table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks); - initialize_mat(*result, table, true); // true -> is_upper_triangle + initialize_mat(*result, *table, true); // true -> is_upper_triangle for(unsigned int tid = 0; tid < threads.size(); tid++) { - su::stripes_to_condensed_form(dm_stripes,table.n_samples,(*result)->condensed_form,tasks[tid].start,tasks[tid].stop); + su::stripes_to_condensed_form(dm_stripes,table->n_samples,(*result)->condensed_form,tasks[tid].start,tasks[tid].stop); } - destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, 0, 0); + destroy_stripes(dm_stripes, dm_stripes_total, table->n_samples, 0, 0); return okay; } @@ -441,7 +441,7 @@ compute_status one_off(const char* biom_filename, const char* tree_filename, CHECK_FILE(biom_filename, table_missing) CHECK_FILE(tree_filename, tree_missing) PARSE_TREE_TABLE(tree_filename, table_filename) - return one_off_inmem(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result); + return one_off_inmem(&table, &tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result); } // TMat mat_full_fp32_t diff --git a/src/api.hpp b/src/api.hpp index 6040856..ba03cc4 100644 --- a/src/api.hpp +++ b/src/api.hpp @@ -175,7 +175,7 @@ EXTERN ComputeStatus one_off(const char* biom_filename, const char* tree_filenam * table_empty : the table does not have any entries */ #ifdef __cplusplus -ComputeStatus one_off_inmem(su::biom &table, su::BPTree &tree, +ComputeStatus one_off_inmem(su::biom *table, su::BPTree *tree, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int threads, mat_t** result); #endif diff --git a/src/biom.cpp b/src/biom.cpp index 681b6e1..adfa1cc 100644 --- a/src/biom.cpp +++ b/src/biom.cpp @@ -75,13 +75,21 @@ biom::biom(std::string filename) { } biom::~biom() { - for(unsigned int i = 0; i < n_obs; i++) { - free(obs_indices_resident[i]); - free(obs_data_resident[i]); + if(obs_indices_resident != NULL && obs_data_resident != NULL) { + for(unsigned int i = 0; i < n_obs; i++) { + if(obs_indices_resident[i] != NULL) + free(obs_indices_resident[i]); + if(obs_data_resident[i] != NULL) + free(obs_data_resident[i]); + } } - free(obs_indices_resident); - free(obs_data_resident); - free(obs_counts_resident); + + if(obs_indices_resident != NULL) + free(obs_indices_resident); + if(obs_data_resident != NULL) + free(obs_data_resident); + if(obs_counts_resident != NULL) + free(obs_counts_resident); } void biom::malloc_resident(uint32_t n_obs) { @@ -107,6 +115,7 @@ void biom::malloc_resident(uint32_t n_obs) { } biom::biom() { + n_obs = 0; malloc_resident(0); } diff --git a/src/test_su.cpp b/src/test_su.cpp index d8f6dde..7cb2219 100644 --- a/src/test_su.cpp +++ b/src/test_su.cpp @@ -463,7 +463,7 @@ void test_biom_constructor() { ASSERT(table.obs_ids == exp_oids); ASSERT(table.sample_indptr == exp_s_indptr); ASSERT(table.obs_indptr == exp_o_indptr); - + SUITE_END(); } @@ -518,6 +518,18 @@ void test_biom_constructor_from_sparse() { SUITE_END(); } +void test_biom_nullary() { + SUITE_START("biom nullary"); + su::biom table = su::biom(); + SUITE_END(); +} + +void test_bptree_nullary() { + SUITE_START("bptree nullary"); + su::BPTree tree = su::BPTree(); + SUITE_END(); +} + void test_biom_get_obs_data() { SUITE_START("biom get obs data"); @@ -1836,6 +1848,7 @@ int main(int argc, char** argv) { test_bptree_constructor_edgecases(); test_bptree_constructor_quoted_comma(); test_bptree_constructor_quoted_parens(); + test_bptree_nullary(); test_bptree_postorder(); test_bptree_preorder(); test_bptree_parent(); @@ -1851,6 +1864,7 @@ int main(int argc, char** argv) { test_biom_constructor(); test_biom_constructor_from_sparse(); + test_biom_nullary(); test_biom_get_obs_data(); test_propstack_constructor(); From 1ca12158f2cc3a4229a1967b2b6167f191c5cdab Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 15 Mar 2022 10:56:02 -0700 Subject: [PATCH 10/31] Actually copy the ids --- src/biom.cpp | 15 ++++++++++----- src/biom.hpp | 3 +-- src/tree.cpp | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/biom.cpp b/src/biom.cpp index adfa1cc..dea77b1 100644 --- a/src/biom.cpp +++ b/src/biom.cpp @@ -119,21 +119,26 @@ biom::biom() { malloc_resident(0); } -biom::biom(const std::vector &obs_ids, - const std::vector &samp_ids, +biom::biom(const std::vector &obs_ids_in, + const std::vector &samp_ids_in, const std::vector &index, const std::vector &indptr, const std::vector &data) { nnz = data.size(); - n_samples = samp_ids.size(); - n_obs = obs_ids.size(); + n_samples = samp_ids_in.size(); + n_obs = obs_ids_in.size(); + + sample_ids = std::vector(); + obs_ids = std::vector(); + sample_ids.assign(samp_ids_in.begin(), samp_ids_in.end()); + obs_ids.assign(obs_ids_in.begin(), obs_ids_in.end()); /* define a mapping between an ID and its corresponding offset */ obs_id_index = std::unordered_map(); sample_id_index = std::unordered_map(); create_id_index(obs_ids, obs_id_index); - create_id_index(samp_ids, sample_id_index); + create_id_index(sample_ids, sample_id_index); malloc_resident(n_obs); diff --git a/src/biom.hpp b/src/biom.hpp index 46b1bfb..6f10116 100644 --- a/src/biom.hpp +++ b/src/biom.hpp @@ -72,10 +72,9 @@ namespace su { */ void get_obs_data_range(const std::string &id, unsigned int start, unsigned int end, bool normalize, double* out) const; void get_obs_data_range(const std::string &id, unsigned int start, unsigned int end, bool normalize, float* out) const; - private: bool has_hdf5_backing = false; - + /* retain DataSet handles within the HDF5 file */ H5::DataSet obs_indices; H5::DataSet sample_indices; diff --git a/src/tree.cpp b/src/tree.cpp index 8b4a793..a0fa371 100644 --- a/src/tree.cpp +++ b/src/tree.cpp @@ -199,7 +199,7 @@ void BPTree::index_and_cache() { } } -uint32_t BPTree::postorderselect(uint32_t k) const { +uint32_t BPTree::postorderselect(uint32_t k) const { return open(select_0_index[k]); } From c5056b13720c26de845efb3056d22908a18941a9 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 15 Mar 2022 18:16:46 -0700 Subject: [PATCH 11/31] address many of @sfiligoi's comments --- src/api.cpp | 29 ++++++++++++++++++++++------- src/api.hpp | 43 ++++++++++++++++++++++++++++++++++++++----- src/biom.cpp | 47 ++++++++++++++++++++++++++++------------------- src/biom.hpp | 20 ++++++++++++-------- src/test_su.cpp | 19 +++++++++++++------ src/tree.cpp | 29 +++++++++++++++++++++++++++++ src/tree.hpp | 9 +++++++++ 7 files changed, 151 insertions(+), 45 deletions(-) diff --git a/src/api.cpp b/src/api.cpp index ffde792..c1222ce 100644 --- a/src/api.cpp +++ b/src/api.cpp @@ -404,13 +404,28 @@ compute_status faith_pd_one_off(const char* biom_filename, const char* tree_file return okay; } -compute_status one_off_inmem(su::biom *table, su::BPTree *tree, +compute_status one_off_inmem(const support_biom_t *table_data, const support_bptree_t *tree_data, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int nthreads, mat_t** result) { SET_METHOD(unifrac_method, unknown_method) - SYNC_TREE_TABLE((*tree), (*table)) - const unsigned int stripe_stop = (table->n_samples + 1) / 2; + table = su::biom(table_data->obs_ids, + table_data->sample_ids, + table_data->indices, + table_data->indptr, + table_data->data + table_data->n_obs, + table_data->n_samples, + table_data->nnz); + + tree = su::BPTree(tree_data->structure, + tree_data->lengths, + tree_data->names, + tree_data->n_parens); + + SYNC_TREE_TABLE(tree, table) + + const unsigned int stripe_stop = (table.n_samples + 1) / 2; std::vector dm_stripes(stripe_stop); std::vector dm_stripes_total(stripe_stop); @@ -422,15 +437,15 @@ compute_status one_off_inmem(su::biom *table, su::BPTree *tree, std::vector tasks(nthreads); std::vector threads(nthreads); - set_tasks(tasks, alpha, table->n_samples, 0, stripe_stop, bypass_tips, nthreads); + set_tasks(tasks, alpha, table.n_samples, 0, stripe_stop, bypass_tips, nthreads); su::process_stripes(*table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks); - initialize_mat(*result, *table, true); // true -> is_upper_triangle + initialize_mat(*result, table, true); // true -> is_upper_triangle for(unsigned int tid = 0; tid < threads.size(); tid++) { - su::stripes_to_condensed_form(dm_stripes,table->n_samples,(*result)->condensed_form,tasks[tid].start,tasks[tid].stop); + su::stripes_to_condensed_form(dm_stripes,tablen_samples,(*result)->condensed_form,tasks[tid].start,tasks[tid].stop); } - destroy_stripes(dm_stripes, dm_stripes_total, table->n_samples, 0, 0); + destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, 0, 0); return okay; } diff --git a/src/api.hpp b/src/api.hpp index ba03cc4..63d65a8 100644 --- a/src/api.hpp +++ b/src/api.hpp @@ -124,6 +124,41 @@ typedef struct partial_dyn_mat { char* filename; } partial_dyn_mat_t; +/* support structure to carry in biom table information + * + * obs_ids the observation IDs + * sample_ids the sample IDs + * indices the indices of the data values + * indptr the row offset of the data values + * data the actual matrix values + * n_obs the number of observations, corresponding to length of obs_ids + * n_samples the number of samples, corresponding to the length of sample_ids + * nnz the number of nonzero values, corresponding to the length of data and indices + */ +typedef struct support_biom { + char** obs_ids; + char** sample_ids; + int32_t* indices; + int32_t* indptr; + double* data; + int n_obs; + int n_samples; + int nnz; +} support_biom_t; + +/* support structure to carry in bptree information + * + * structure the topology of the tree + * lengths the branch lengths of the tree + * names the names of the tips and internal nodes of hte tree + * n_parens the length of the structure array + */ +typedef struct support_bptree { + bool* structure; + double* lengths; + char** names; + int n_parens; +} support_bptree_t; void destroy_mat(mat_t** result); @@ -174,11 +209,9 @@ EXTERN ComputeStatus one_off(const char* biom_filename, const char* tree_filenam * unknown_method : the requested method is unknown. * table_empty : the table does not have any entries */ -#ifdef __cplusplus -ComputeStatus one_off_inmem(su::biom *table, su::BPTree *tree, - const char* unifrac_method, bool variance_adjust, double alpha, - bool bypass_tips, unsigned int threads, mat_t** result); -#endif +EXTERN ComputeStatus one_off_inmem(support_biom_t *table_data, support_bptree_t *tree_data, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int threads, mat_t** result); /* Compute UniFrac - matrix form * diff --git a/src/biom.cpp b/src/biom.cpp index dea77b1..769ba07 100644 --- a/src/biom.cpp +++ b/src/biom.cpp @@ -26,8 +26,7 @@ const std::string SAMPLE_INDICES = std::string("/sample/matrix/indices"); const std::string SAMPLE_DATA = std::string("/sample/matrix/data"); const std::string SAMPLE_IDS = std::string("/sample/ids"); -biom::biom(std::string filename) { - has_hdf5_backing = true; +biom::biom(std::string filename) : has_hdf5_backing = true { file = H5File(filename.c_str(), H5F_ACC_RDONLY); @@ -114,24 +113,34 @@ void biom::malloc_resident(uint32_t n_obs) { } } -biom::biom() { +biom::biom() : has_hdf5_backing = false { n_obs = 0; malloc_resident(0); } -biom::biom(const std::vector &obs_ids_in, - const std::vector &samp_ids_in, - const std::vector &index, - const std::vector &indptr, - const std::vector &data) { - nnz = data.size(); - n_samples = samp_ids_in.size(); - n_obs = obs_ids_in.size(); +biom::biom(const char** obs_ids_in, + const char** samp_ids_in, + const int32_t* indices + const int32_t* indptr, + const double* data, + const int n_obs, + const int n_samples, + const int nnz) : has_hdf5_backing = false { + this->nnz = nnz; + this->n_samples = n_samples; + this->n_obs = n_obs; sample_ids = std::vector(); + sample_ids.reserve(n_samples); obs_ids = std::vector(); - sample_ids.assign(samp_ids_in.begin(), samp_ids_in.end()); - obs_ids.assign(obs_ids_in.begin(), obs_ids_in.end()); + obs_ids.reserve(n_obs); + + for(int i = 0; i < n_obs; i++) { + obs_ids.push_back(std::string(obs_ids_in[i])); + } + for(int i = 0; i < n_samples; i++) { + sample_ids.push_back(std::string(samp_ids_in[i])); + } /* define a mapping between an ID and its corresponding offset */ obs_id_index = std::unordered_map(); @@ -144,7 +153,7 @@ biom::biom(const std::vector &obs_ids_in, uint32_t *current_indices = NULL; double *current_data = NULL; - for(unsigned int i = 0; i < obs_ids.size(); i++) { + for(unsigned int i = 0; i < n_obs; i++) { std::string id_ = obs_ids[i]; unsigned int n = get_obs_data_from_sparse(id_, index, indptr, data, current_indices, current_data); obs_counts_resident[i] = n; @@ -241,14 +250,14 @@ void biom::create_id_index(const std::vector &ids, } unsigned int biom::get_obs_data_from_sparse(const std::string &id_, - const std::vector &index, - const std::vector &indptr, - const std::vector &data, + const int32_t* index, + const int32_t* indptr, + const double* data, uint32_t *& current_indices_out, double *& current_data_out) { uint32_t idx = obs_id_index.at(id_); - uint32_t start = indptr[idx]; - uint32_t end = indptr[idx + 1]; + int32_t start = indptr[idx]; + int32_t end = indptr[idx + 1]; unsigned int count = end - start; current_indices_out = (uint32_t*)malloc(sizeof(uint32_t) * count); diff --git a/src/biom.hpp b/src/biom.hpp index 6f10116..4268b8a 100644 --- a/src/biom.hpp +++ b/src/biom.hpp @@ -37,12 +37,16 @@ namespace su { * @param index vector of index positions * @param indptr vector of indptr positions * @param data vector of observation counts + * @param n_obs number of observations + * @param n_samples number of samples */ - biom(const std::vector &obs_ids, - const std::vector &samp_ids, - const std::vector &index, - const std::vector &indptr, - const std::vector &data); + biom(const char** obs_ids, + const char** samp_ids, + const int32_t* index, + const int32_t* indptr, + const double* data, + const int n_obs, + const int n_samples); /* default destructor * @@ -91,9 +95,9 @@ namespace su { * WARNING: assumes data are CSR, with observations as rows */ unsigned int get_obs_data_from_sparse(const std::string &id_, - const std::vector &index, - const std::vector &indptr, - const std::vector &data, + const int32_t* index, + const int32_t* indptr, + const double* data, uint32_t *& current_indices_out, double *& current_data_out); unsigned int get_obs_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out); diff --git a/src/test_su.cpp b/src/test_su.cpp index 7cb2219..ff472cb 100644 --- a/src/test_su.cpp +++ b/src/test_su.cpp @@ -507,14 +507,15 @@ void _exercise_get_obs_data(su::biom &table) { void test_biom_constructor_from_sparse() { SUITE_START("biom from sparse constructor"); - std::vector index = {2, 0, 1, 3, 4, 5, 2, 3, 5, 0, 1, 2, 5, 1, 2}; - std::vector indptr = {0, 1, 6, 9, 13, 15}; - std::vector data = {1., 5., 1., 2., 3., 1., 1., 4., 2., 2., 1., 1., 1., 1., 1.}; - std::vector obs_ids = {"GG_OTU_1", "GG_OTU_2", "GG_OTU_3", "GG_OTU_4", "GG_OTU_5"}; - std::vector samp_ids = {"Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6"}; + int32_t index[] = {2, 0, 1, 3, 4, 5, 2, 3, 5, 0, 1, 2, 5, 1, 2}; + int32_t indptr[] = {0, 1, 6, 9, 13, 15}; + double data[] = {1., 5., 1., 2., 3., 1., 1., 4., 2., 2., 1., 1., 1., 1., 1.}; + char* obs_ids[] = {"GG_OTU_1", "GG_OTU_2", "GG_OTU_3", "GG_OTU_4", "GG_OTU_5"}; + char* samp_ids[] = {"Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6"}; - su::biom table = su::biom(obs_ids, samp_ids, index, indptr, data); + su::biom table = su::biom(obs_ids, samp_ids, index, indptr, data, 5, 6); _exercise_get_obs_data(table); + ASSERT(1 != 1); SUITE_END(); } @@ -1832,6 +1833,12 @@ void test_set_tasks() { SUITE_END(); } +void test_bptree_cstyle_constructor() { + SUITE_START("test bptree constructor from c-style data"); + ASSERT(1 != 1); + SUITE_END(); +} + void test_bptree_constructor_newline_bug() { SUITE_START("test bptree constructor newline bug"); su::BPTree tree = su::BPTree("((362be41f31fd26be95ae43a8769b91c0:0.116350803,(a16679d5a10caa9753f171977552d920:0.105836235,((a7acc2abb505c3ee177a12e514d3b994:0.008268754,(4e22aa3508b98813f52e1a12ffdb74ad:0.03144211,8139c4ac825dae48454fb4800fb87896:0.043622957)0.923:0.046588301)0.997:0.120902074,((2d3df7387323e2edcbbfcb6e56a02710:0.031543994,3f6752aabcc291b67a063fb6492fd107:0.091571442)0.759:0.016335166,((d599ebe277afb0dfd4ad3c2176afc50e:5e-09,84d0affc7243c7d6261f3a7d680b873f:0.010245188)0.883:0.048993011,51121722488d0c3da1388d1b117cd239:0.119447926)0.763:0.035660204)0.921:0.058191474)0.776:0.02854575)0.657:0.052060833)0.658:0.032547569,(99647b51f775c8ddde8ed36a7d60dbcd:0.173334268,(f18a9c8112372e2916a66a9778f3741b:0.194813398,(5833416522de0cca717a1abf720079ac:5e-09,(2bf1067d2cd4f09671e3ebe5500205ca:0.031692682,(b32621bcd86cb99e846d8f6fee7c9ab8:0.031330707,1016319c25196d73bdb3096d86a9df2f:5e-09)0.058:0.01028612)0.849:0.010284866)0.791:0.041353384)0.922:0.109470534):0.022169824000000005)root;\n\n"); diff --git a/src/tree.cpp b/src/tree.cpp index a0fa371..d8656eb 100644 --- a/src/tree.cpp +++ b/src/tree.cpp @@ -52,6 +52,35 @@ BPTree::BPTree(std::vector input_structure, std::vector input_leng index_and_cache(); } +BPTree::BPTree(const bool* input_structure, const double* input_lengths, const char** input_names, const int n_parens) { + structure = std::vector(); + lengths = std::vector(); + names = std::vector(); + + structure.resize(n_parens); + lengths.resize(n_parens); + names.resize(n_parens); + + nparens = n_parens; + + for(int i = 0; i < n_parens; i++) { + structure.push_back(input_structure[i]); + lengths.push_back(input_lengths[i]); + names.push_back(std::string(input_names[i])); + } + + openclose = std::vector(); + select_0_index = std::vector(); + select_1_index = std::vector(); + openclose.resize(nparens); + select_0_index.resize(nparens / 2); + select_1_index.resize(nparens / 2); + excess.resize(nparens); + + structure_to_openclose(); + index_and_cache(); +} + BPTree BPTree::mask(std::vector topology_mask, std::vector in_lengths) { std::vector new_structure = std::vector(); diff --git a/src/tree.hpp b/src/tree.hpp index eb3ee60..4eda901 100644 --- a/src/tree.hpp +++ b/src/tree.hpp @@ -44,6 +44,15 @@ namespace su { BPTree(std::vector input_structure, std::vector input_lengths, std::vector input_names); ~BPTree(); + /* constructor from a defined topology using c-types + * + * @param input_structure A boolean array defining the topology + * @param input_lengths A double array of the branch lengths + * @param input_names A char* array of the names + * @param n_parens The length of the topology + */ + BPTree(const bool* input_structure, const double* input_lengths, const char** input_names, const int n_parens); + /* postorder tree traversal * * Get the index position of the ith node in a postorder tree From 47c579e8f84c897d507ec1f29e067603c2fc9ae6 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 15 Mar 2022 19:23:58 -0700 Subject: [PATCH 12/31] nearly there --- src/api.cpp | 40 +++++++++++++++++++++++----------------- src/api.hpp | 5 +++++ src/biom.cpp | 15 +++++++-------- src/biom.hpp | 8 +++++--- src/test_su.cpp | 2 +- src/tree.cpp | 2 +- src/tree.hpp | 2 +- 7 files changed, 43 insertions(+), 31 deletions(-) diff --git a/src/api.cpp b/src/api.cpp index c1222ce..d79b03f 100644 --- a/src/api.cpp +++ b/src/api.cpp @@ -407,23 +407,29 @@ compute_status faith_pd_one_off(const char* biom_filename, const char* tree_file compute_status one_off_inmem(const support_biom_t *table_data, const support_bptree_t *tree_data, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int nthreads, mat_t** result) { - SET_METHOD(unifrac_method, unknown_method) - - table = su::biom(table_data->obs_ids, - table_data->sample_ids, - table_data->indices, - table_data->indptr, - table_data->data - table_data->n_obs, - table_data->n_samples, - table_data->nnz); + su::biom table(table_data->obs_ids, + table_data->sample_ids, + table_data->indices, + table_data->indptr, + table_data->data, + table_data->n_obs, + table_data->n_samples, + table_data->nnz); + + su::BPTree tree(tree_data->structure, + tree_data->lengths, + tree_data->names, + tree_data->n_parens); + + return one_off_inmem_cpp(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result); +} - tree = su::BPTree(tree_data->structure, - tree_data->lengths, - tree_data->names, - tree_data->n_parens); +compute_status one_off_inmem_cpp(su::biom &table, su::BPTree &tree, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int nthreads, mat_t** result) { SYNC_TREE_TABLE(tree, table) + SET_METHOD(unifrac_method, unknown_method) const unsigned int stripe_stop = (table.n_samples + 1) / 2; std::vector dm_stripes(stripe_stop); @@ -438,11 +444,11 @@ compute_status one_off_inmem(const support_biom_t *table_data, const support_bpt std::vector threads(nthreads); set_tasks(tasks, alpha, table.n_samples, 0, stripe_stop, bypass_tips, nthreads); - su::process_stripes(*table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks); + su::process_stripes(table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks); initialize_mat(*result, table, true); // true -> is_upper_triangle for(unsigned int tid = 0; tid < threads.size(); tid++) { - su::stripes_to_condensed_form(dm_stripes,tablen_samples,(*result)->condensed_form,tasks[tid].start,tasks[tid].stop); + su::stripes_to_condensed_form(dm_stripes,table.n_samples,(*result)->condensed_form,tasks[tid].start,tasks[tid].stop); } destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, 0, 0); @@ -456,7 +462,7 @@ compute_status one_off(const char* biom_filename, const char* tree_filename, CHECK_FILE(biom_filename, table_missing) CHECK_FILE(tree_filename, tree_missing) PARSE_TREE_TABLE(tree_filename, table_filename) - return one_off_inmem(&table, &tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result); + return one_off_inmem_cpp(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result); } // TMat mat_full_fp32_t diff --git a/src/api.hpp b/src/api.hpp index 63d65a8..f9c2dd4 100644 --- a/src/api.hpp +++ b/src/api.hpp @@ -213,6 +213,11 @@ EXTERN ComputeStatus one_off_inmem(support_biom_t *table_data, support_bptree_t const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int threads, mat_t** result); +/* define this thing... */ +compute_status one_off_inmem_cpp(su::biom &table, su::BPTree &tree, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int nthreads, mat_t** result); + /* Compute UniFrac - matrix form * * biom_filename the filename to the biom table. diff --git a/src/biom.cpp b/src/biom.cpp index 769ba07..7262a5c 100644 --- a/src/biom.cpp +++ b/src/biom.cpp @@ -26,8 +26,7 @@ const std::string SAMPLE_INDICES = std::string("/sample/matrix/indices"); const std::string SAMPLE_DATA = std::string("/sample/matrix/data"); const std::string SAMPLE_IDS = std::string("/sample/ids"); -biom::biom(std::string filename) : has_hdf5_backing = true { - +biom::biom(std::string filename) : has_hdf5_backing(true) { file = H5File(filename.c_str(), H5F_ACC_RDONLY); /* establish the datasets */ @@ -113,19 +112,19 @@ void biom::malloc_resident(uint32_t n_obs) { } } -biom::biom() : has_hdf5_backing = false { +biom::biom() : has_hdf5_backing(false) { n_obs = 0; malloc_resident(0); } -biom::biom(const char** obs_ids_in, - const char** samp_ids_in, - const int32_t* indices +biom::biom(char** obs_ids_in, + char** samp_ids_in, + const int32_t* indices, const int32_t* indptr, const double* data, const int n_obs, const int n_samples, - const int nnz) : has_hdf5_backing = false { + const int nnz) : has_hdf5_backing(false) { this->nnz = nnz; this->n_samples = n_samples; this->n_obs = n_obs; @@ -155,7 +154,7 @@ biom::biom(const char** obs_ids_in, double *current_data = NULL; for(unsigned int i = 0; i < n_obs; i++) { std::string id_ = obs_ids[i]; - unsigned int n = get_obs_data_from_sparse(id_, index, indptr, data, current_indices, current_data); + unsigned int n = get_obs_data_from_sparse(id_, indices, indptr, data, current_indices, current_data); obs_counts_resident[i] = n; obs_indices_resident[i] = current_indices; obs_data_resident[i] = current_data; diff --git a/src/biom.hpp b/src/biom.hpp index 4268b8a..4c514c0 100644 --- a/src/biom.hpp +++ b/src/biom.hpp @@ -39,14 +39,16 @@ namespace su { * @param data vector of observation counts * @param n_obs number of observations * @param n_samples number of samples + * @param nnz number of data points */ - biom(const char** obs_ids, - const char** samp_ids, + biom(char** obs_ids, + char** samp_ids, const int32_t* index, const int32_t* indptr, const double* data, const int n_obs, - const int n_samples); + const int n_samples, + const int nnz); /* default destructor * diff --git a/src/test_su.cpp b/src/test_su.cpp index ff472cb..a68ec9d 100644 --- a/src/test_su.cpp +++ b/src/test_su.cpp @@ -513,7 +513,7 @@ void test_biom_constructor_from_sparse() { char* obs_ids[] = {"GG_OTU_1", "GG_OTU_2", "GG_OTU_3", "GG_OTU_4", "GG_OTU_5"}; char* samp_ids[] = {"Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6"}; - su::biom table = su::biom(obs_ids, samp_ids, index, indptr, data, 5, 6); + su::biom table = su::biom(obs_ids, samp_ids, index, indptr, data, 5, 6, 15); _exercise_get_obs_data(table); ASSERT(1 != 1); SUITE_END(); diff --git a/src/tree.cpp b/src/tree.cpp index d8656eb..1b93f08 100644 --- a/src/tree.cpp +++ b/src/tree.cpp @@ -52,7 +52,7 @@ BPTree::BPTree(std::vector input_structure, std::vector input_leng index_and_cache(); } -BPTree::BPTree(const bool* input_structure, const double* input_lengths, const char** input_names, const int n_parens) { +BPTree::BPTree(const bool* input_structure, const double* input_lengths, char** input_names, const int n_parens) { structure = std::vector(); lengths = std::vector(); names = std::vector(); diff --git a/src/tree.hpp b/src/tree.hpp index 4eda901..cee4b4c 100644 --- a/src/tree.hpp +++ b/src/tree.hpp @@ -51,7 +51,7 @@ namespace su { * @param input_names A char* array of the names * @param n_parens The length of the topology */ - BPTree(const bool* input_structure, const double* input_lengths, const char** input_names, const int n_parens); + BPTree(const bool* input_structure, const double* input_lengths, char** input_names, const int n_parens); /* postorder tree traversal * From 919956e60304b0308c376c4e89f6485c42405ac7 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 15 Mar 2022 19:31:38 -0700 Subject: [PATCH 13/31] close... --- src/test_su.cpp | 60 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/src/test_su.cpp b/src/test_su.cpp index a68ec9d..9d14d6c 100644 --- a/src/test_su.cpp +++ b/src/test_su.cpp @@ -515,7 +515,7 @@ void test_biom_constructor_from_sparse() { su::biom table = su::biom(obs_ids, samp_ids, index, indptr, data, 5, 6, 15); _exercise_get_obs_data(table); - ASSERT(1 != 1); + SUITE_END(); } @@ -1835,7 +1835,62 @@ void test_set_tasks() { void test_bptree_cstyle_constructor() { SUITE_START("test bptree constructor from c-style data"); - ASSERT(1 != 1); + //01234567 + //11101000 + su::BPTree existing = su::BPTree("(('123:foo; bar':1,b:2)c);"); + bool structure[] = {true, true, true, false, true, false, false, false}; + double lengths[] = {0, 0, 1, 0, 2, 0, 0, 0}; + char* names[] = {"", "c", "123:foo; bar", "", "b", "", "", ""}; + su::BPTree tree = su::BPTree(structure, lengths, names, 8); + + unsigned int exp_nparens = 8; + + std::vector exp_structure; + exp_structure.push_back(true); + exp_structure.push_back(true); + exp_structure.push_back(true); + exp_structure.push_back(false); + exp_structure.push_back(true); + exp_structure.push_back(false); + exp_structure.push_back(false); + exp_structure.push_back(false); + + std::vector exp_openclose; + exp_openclose.push_back(7); + exp_openclose.push_back(6); + exp_openclose.push_back(3); + exp_openclose.push_back(2); + exp_openclose.push_back(5); + exp_openclose.push_back(4); + exp_openclose.push_back(1); + exp_openclose.push_back(0); + + std::vector exp_names; + exp_names.push_back(std::string()); + exp_names.push_back(std::string("c")); + exp_names.push_back(std::string("123:foo; bar")); + exp_names.push_back(std::string()); + exp_names.push_back(std::string("b")); + exp_names.push_back(std::string()); + exp_names.push_back(std::string()); + exp_names.push_back(std::string()); + + std::vector exp_lengths; + exp_lengths.push_back(0.0); + exp_lengths.push_back(0.0); + exp_lengths.push_back(1.0); + exp_lengths.push_back(0.0); + exp_lengths.push_back(2.0); + exp_lengths.push_back(0.0); + exp_lengths.push_back(0.0); + exp_lengths.push_back(0.0); + + ASSERT(tree.nparens == exp_nparens); + ASSERT(tree.get_structure() == exp_structure); + ASSERT(tree.get_openclose() == exp_openclose); + ASSERT(tree.lengths == exp_lengths); + ASSERT(tree.names == exp_names); + SUITE_END(); } @@ -1855,6 +1910,7 @@ int main(int argc, char** argv) { test_bptree_constructor_edgecases(); test_bptree_constructor_quoted_comma(); test_bptree_constructor_quoted_parens(); + test_bptree_cstyle_constructor(); test_bptree_nullary(); test_bptree_postorder(); test_bptree_preorder(); From de83bebca3f5b8bfc47292cede63311d9e78f37b Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 16 Mar 2022 11:30:48 -0700 Subject: [PATCH 14/31] use reserve rather than resize --- src/test_su.cpp | 2 -- src/tree.cpp | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/test_su.cpp b/src/test_su.cpp index 9d14d6c..d8198d5 100644 --- a/src/test_su.cpp +++ b/src/test_su.cpp @@ -153,7 +153,6 @@ void test_bptree_constructor_from_existing() { //11101000 su::BPTree existing = su::BPTree("(('123:foo; bar':1,b:2)c);"); su::BPTree tree = su::BPTree(existing.get_structure(), existing.lengths, existing.names); - unsigned int exp_nparens = 8; std::vector exp_structure; @@ -1837,7 +1836,6 @@ void test_bptree_cstyle_constructor() { SUITE_START("test bptree constructor from c-style data"); //01234567 //11101000 - su::BPTree existing = su::BPTree("(('123:foo; bar':1,b:2)c);"); bool structure[] = {true, true, true, false, true, false, false, false}; double lengths[] = {0, 0, 1, 0, 2, 0, 0, 0}; char* names[] = {"", "c", "123:foo; bar", "", "b", "", "", ""}; diff --git a/src/tree.cpp b/src/tree.cpp index 1b93f08..500526e 100644 --- a/src/tree.cpp +++ b/src/tree.cpp @@ -57,9 +57,9 @@ BPTree::BPTree(const bool* input_structure, const double* input_lengths, char** lengths = std::vector(); names = std::vector(); - structure.resize(n_parens); - lengths.resize(n_parens); - names.resize(n_parens); + structure.reserve(n_parens); + lengths.reserve(n_parens); + names.reserve(n_parens); nparens = n_parens; From 66a09e306509f5d2cfcaa5f25e0a37163e8f2d03 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 16 Mar 2022 12:36:11 -0700 Subject: [PATCH 15/31] kick ci --- src/test_su.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test_su.cpp b/src/test_su.cpp index d8198d5..bf6c296 100644 --- a/src/test_su.cpp +++ b/src/test_su.cpp @@ -153,8 +153,8 @@ void test_bptree_constructor_from_existing() { //11101000 su::BPTree existing = su::BPTree("(('123:foo; bar':1,b:2)c);"); su::BPTree tree = su::BPTree(existing.get_structure(), existing.lengths, existing.names); + unsigned int exp_nparens = 8; - std::vector exp_structure; exp_structure.push_back(true); exp_structure.push_back(true); From e8a8fa0d63d376f7dabf596bbb374860c1641833 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 16 Mar 2022 13:00:10 -0700 Subject: [PATCH 16/31] move status enums to their own header --- src/api.hpp | 8 +++----- src/status_enum.hpp | 8 ++++++++ 2 files changed, 11 insertions(+), 5 deletions(-) create mode 100644 src/status_enum.hpp diff --git a/src/api.hpp b/src/api.hpp index f9c2dd4..513fdf2 100644 --- a/src/api.hpp +++ b/src/api.hpp @@ -1,9 +1,10 @@ #include "task_parameters.hpp" +#include "status_enum.hpp" #ifdef __cplusplus #include -#include "tree.hpp" #include "biom.hpp" +#include "tree.hpp" #define EXTERN extern "C" @@ -16,10 +17,6 @@ #define PARTIAL_MAGIC_V2 0x088ABA02 -typedef enum compute_status {okay=0, tree_missing, table_missing, table_empty, unknown_method, table_and_tree_do_not_overlap, output_error} ComputeStatus; -typedef enum io_status {read_okay=0, write_okay, open_error, read_error, magic_incompatible, bad_header, unexpected_end, write_error} IOStatus; -typedef enum merge_status {merge_okay=0, incomplete_stripe_set, sample_id_consistency, square_mismatch, partials_mismatch, stripes_overlap} MergeStatus; - /* a result matrix * * n_samples the number of samples. @@ -214,6 +211,7 @@ EXTERN ComputeStatus one_off_inmem(support_biom_t *table_data, support_bptree_t bool bypass_tips, unsigned int threads, mat_t** result); /* define this thing... */ + compute_status one_off_inmem_cpp(su::biom &table, su::BPTree &tree, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int nthreads, mat_t** result); diff --git a/src/status_enum.hpp b/src/status_enum.hpp new file mode 100644 index 0000000..c88bb69 --- /dev/null +++ b/src/status_enum.hpp @@ -0,0 +1,8 @@ +#ifndef _UNIFRAC_STATUS_H +#define _UNIFRAC_STATUS_H + +typedef enum compute_status {okay=0, tree_missing, table_missing, table_empty, unknown_method, table_and_tree_do_not_overlap, output_error} ComputeStatus; +typedef enum io_status {read_okay=0, write_okay, open_error, read_error, magic_incompatible, bad_header, unexpected_end, write_error} IOStatus; +typedef enum merge_status {merge_okay=0, incomplete_stripe_set, sample_id_consistency, square_mismatch, partials_mismatch, stripes_overlap} MergeStatus; + +#endif From e192e9a32e2b95d223c1a6d1a277002b4fb49eb0 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 16 Mar 2022 13:06:32 -0700 Subject: [PATCH 17/31] define in situ --- src/api.cpp | 63 ++++++++++++++++++++++++++--------------------------- src/api.hpp | 6 ++--- 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/src/api.cpp b/src/api.cpp index d79b03f..1a291ce 100644 --- a/src/api.cpp +++ b/src/api.cpp @@ -351,6 +351,37 @@ void set_tasks(std::vector &tasks, } } +compute_status one_off_inmem_cpp(su::biom &table, su::BPTree &tree, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int nthreads, mat_t** result) { + SYNC_TREE_TABLE(tree, table) + SET_METHOD(unifrac_method, unknown_method) + + const unsigned int stripe_stop = (table.n_samples + 1) / 2; + std::vector dm_stripes(stripe_stop); + std::vector dm_stripes_total(stripe_stop); + + if(nthreads > dm_stripes.size()) { + fprintf(stderr, "More threads were requested than stripes. Using %d threads.\n", dm_stripes.size()); + nthreads = dm_stripes.size(); + } + + std::vector tasks(nthreads); + std::vector threads(nthreads); + + set_tasks(tasks, alpha, table.n_samples, 0, stripe_stop, bypass_tips, nthreads); + su::process_stripes(table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks); + + initialize_mat(*result, table, true); // true -> is_upper_triangle + for(unsigned int tid = 0; tid < threads.size(); tid++) { + su::stripes_to_condensed_form(dm_stripes,table.n_samples,(*result)->condensed_form,tasks[tid].start,tasks[tid].stop); + } + + destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, 0, 0); + + return okay; +} + compute_status partial(const char* biom_filename, const char* tree_filename, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int nthreads, unsigned int stripe_start, unsigned int stripe_stop, @@ -424,38 +455,6 @@ compute_status one_off_inmem(const support_biom_t *table_data, const support_bpt return one_off_inmem_cpp(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result); } - -compute_status one_off_inmem_cpp(su::biom &table, su::BPTree &tree, - const char* unifrac_method, bool variance_adjust, double alpha, - bool bypass_tips, unsigned int nthreads, mat_t** result) { - SYNC_TREE_TABLE(tree, table) - SET_METHOD(unifrac_method, unknown_method) - - const unsigned int stripe_stop = (table.n_samples + 1) / 2; - std::vector dm_stripes(stripe_stop); - std::vector dm_stripes_total(stripe_stop); - - if(nthreads > dm_stripes.size()) { - fprintf(stderr, "More threads were requested than stripes. Using %d threads.\n", dm_stripes.size()); - nthreads = dm_stripes.size(); - } - - std::vector tasks(nthreads); - std::vector threads(nthreads); - - set_tasks(tasks, alpha, table.n_samples, 0, stripe_stop, bypass_tips, nthreads); - su::process_stripes(table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks); - - initialize_mat(*result, table, true); // true -> is_upper_triangle - for(unsigned int tid = 0; tid < threads.size(); tid++) { - su::stripes_to_condensed_form(dm_stripes,table.n_samples,(*result)->condensed_form,tasks[tid].start,tasks[tid].stop); - } - - destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, 0, 0); - - return okay; -} - compute_status one_off(const char* biom_filename, const char* tree_filename, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int nthreads, mat_t** result) { diff --git a/src/api.hpp b/src/api.hpp index 513fdf2..d7f7749 100644 --- a/src/api.hpp +++ b/src/api.hpp @@ -212,9 +212,9 @@ EXTERN ComputeStatus one_off_inmem(support_biom_t *table_data, support_bptree_t /* define this thing... */ -compute_status one_off_inmem_cpp(su::biom &table, su::BPTree &tree, - const char* unifrac_method, bool variance_adjust, double alpha, - bool bypass_tips, unsigned int nthreads, mat_t** result); +//compute_status one_off_inmem_cpp(su::biom &table, su::BPTree &tree, +// const char* unifrac_method, bool variance_adjust, double alpha, +// bool bypass_tips, unsigned int nthreads, mat_t** result); /* Compute UniFrac - matrix form * From 01f1d676072546c5417dcd96eebdf045b2efa60f Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 16 Mar 2022 14:04:24 -0700 Subject: [PATCH 18/31] remove unnecessary includes --- src/api.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/api.hpp b/src/api.hpp index d7f7749..924d234 100644 --- a/src/api.hpp +++ b/src/api.hpp @@ -3,8 +3,6 @@ #ifdef __cplusplus #include -#include "biom.hpp" -#include "tree.hpp" #define EXTERN extern "C" From f35eac251d5d930dc62664ba1765ef32440ee3b5 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 16 Mar 2022 14:10:14 -0700 Subject: [PATCH 19/31] Address @sfiligoi's comments --- src/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Makefile b/src/Makefile index 4ee100e..ed050ab 100644 --- a/src/Makefile +++ b/src/Makefile @@ -135,9 +135,7 @@ install: libssu.so ssu faithpd mkdir -p ${PREFIX}/include/unifrac cp task_parameters.hpp ${PREFIX}/include/unifrac/ cp api.hpp ${PREFIX}/include/unifrac/ - cp biom.hpp ${PREFIX}/include/unifrac/ - cp tree.hpp ${PREFIX}/include/unifrac/ - cp biom_interface.hpp ${PREFIX}/include/unifrac/ + cp status_enum.hpp ${PREFIX}/include/unifrac/ rapi_test: main mkdir -p ~/.R From 3059f2a80474fffc8169cf559d1cfa264c466f20 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 16 Mar 2022 15:32:07 -0700 Subject: [PATCH 20/31] remove commted out function --- src/api.hpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/api.hpp b/src/api.hpp index 924d234..8c8668c 100644 --- a/src/api.hpp +++ b/src/api.hpp @@ -208,12 +208,6 @@ EXTERN ComputeStatus one_off_inmem(support_biom_t *table_data, support_bptree_t const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int threads, mat_t** result); -/* define this thing... */ - -//compute_status one_off_inmem_cpp(su::biom &table, su::BPTree &tree, -// const char* unifrac_method, bool variance_adjust, double alpha, -// bool bypass_tips, unsigned int nthreads, mat_t** result); - /* Compute UniFrac - matrix form * * biom_filename the filename to the biom table. From d103920cd0f36a18d6b870481341d3783033663f Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 16 Mar 2022 15:36:01 -0700 Subject: [PATCH 21/31] EXTERN on a few support methods --- src/api.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/api.hpp b/src/api.hpp index 8c8668c..304b82d 100644 --- a/src/api.hpp +++ b/src/api.hpp @@ -156,12 +156,12 @@ typedef struct support_bptree { } support_bptree_t; -void destroy_mat(mat_t** result); -void destroy_mat_full_fp64(mat_full_fp64_t** result); -void destroy_mat_full_fp32(mat_full_fp32_t** result); -void destroy_partial_mat(partial_mat_t** result); -void destroy_partial_dyn_mat(partial_dyn_mat_t** result); -void destroy_results_vec(r_vec** result); +EXTERN void destroy_mat(mat_t** result); +EXTERN void destroy_mat_full_fp64(mat_full_fp64_t** result); +EXTERN void destroy_mat_full_fp32(mat_full_fp32_t** result); +EXTERN void destroy_partial_mat(partial_mat_t** result); +EXTERN void destroy_partial_dyn_mat(partial_dyn_mat_t** result); +EXTERN void destroy_results_vec(r_vec** result); /* Compute UniFrac - condensed form * From 88bc35c67ffe5e4b655e8692f4b8bf3421adb532 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 16 Mar 2022 15:58:52 -0700 Subject: [PATCH 22/31] sync declaration --- src/api.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/api.hpp b/src/api.hpp index 304b82d..dfce358 100644 --- a/src/api.hpp +++ b/src/api.hpp @@ -204,7 +204,7 @@ EXTERN ComputeStatus one_off(const char* biom_filename, const char* tree_filenam * unknown_method : the requested method is unknown. * table_empty : the table does not have any entries */ -EXTERN ComputeStatus one_off_inmem(support_biom_t *table_data, support_bptree_t *tree_data, +EXTERN ComputeStatus one_off_inmem(const support_biom_t *table_data, const support_bptree_t *tree_data, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int threads, mat_t** result); From aa9523db9bf87370f8e95410b1a4266480f32f83 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Thu, 17 Mar 2022 16:05:24 -0700 Subject: [PATCH 23/31] omp pragmas --- src/biom.cpp | 47 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/src/biom.cpp b/src/biom.cpp index 7262a5c..062ab81 100644 --- a/src/biom.cpp +++ b/src/biom.cpp @@ -55,10 +55,15 @@ biom::biom(std::string filename) : has_hdf5_backing(true) { obs_id_index = std::unordered_map(); sample_id_index = std::unordered_map(); - create_id_index(obs_ids, obs_id_index); - create_id_index(sample_ids, sample_id_index); - - malloc_resident(n_obs); + #pragma omp parallel for schedule(static) + for(int i = 0; i < 3; i++) { + if(i == 0) + create_id_index(obs_ids, obs_id_index); + else if(i == 1) + create_id_index(sample_ids, sample_id_index); + else if(i == 2) + malloc_resident(n_obs); + } uint32_t *current_indices = NULL; double *current_data = NULL; @@ -133,25 +138,38 @@ biom::biom(char** obs_ids_in, sample_ids.reserve(n_samples); obs_ids = std::vector(); obs_ids.reserve(n_obs); - - for(int i = 0; i < n_obs; i++) { - obs_ids.push_back(std::string(obs_ids_in[i])); - } - for(int i = 0; i < n_samples; i++) { - sample_ids.push_back(std::string(samp_ids_in[i])); + + #pragma omp parallel for schedule(static) + for(int x = 0; x < 2; x++) { + if(x == 0) { + for(int i = 0; i < n_obs; i++) { + obs_ids.push_back(std::string(obs_ids_in[i])); + } + } else { + for(int i = 0; i < n_samples; i++) { + sample_ids.push_back(std::string(samp_ids_in[i])); + } + } } /* define a mapping between an ID and its corresponding offset */ obs_id_index = std::unordered_map(); sample_id_index = std::unordered_map(); - create_id_index(obs_ids, obs_id_index); - create_id_index(sample_ids, sample_id_index); - - malloc_resident(n_obs); + #pragma omp parallel for schedule(static) + for(int i = 0; i < 3; i++) { + if(i == 0) + create_id_index(obs_ids, obs_id_index); + else if(i == 1) + create_id_index(sample_ids, sample_id_index); + else if(i == 2) + malloc_resident(n_obs); + } uint32_t *current_indices = NULL; double *current_data = NULL; + + #pragma omp parallel for schedule(static) for(unsigned int i = 0; i < n_obs; i++) { std::string id_ = obs_ids[i]; unsigned int n = get_obs_data_from_sparse(id_, indices, indptr, data, current_indices, current_data); @@ -435,6 +453,7 @@ unsigned int biom::get_sample_data_direct(const std::string &id, uint32_t *& cur double* biom::get_sample_counts() { double *sample_counts = (double*)calloc(sizeof(double), n_samples); + for(unsigned int i = 0; i < n_obs; i++) { unsigned int count = obs_counts_resident[i]; uint32_t *indices = obs_indices_resident[i]; From 044fe19940c5eb36f8eafdc9fe6bf3201f0a0d51 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Thu, 17 Mar 2022 16:21:11 -0700 Subject: [PATCH 24/31] parallelize get_sample_counts --- src/biom.cpp | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/src/biom.cpp b/src/biom.cpp index 062ab81..b36f388 100644 --- a/src/biom.cpp +++ b/src/biom.cpp @@ -452,16 +452,25 @@ unsigned int biom::get_sample_data_direct(const std::string &id, uint32_t *& cur } double* biom::get_sample_counts() { - double *sample_counts = (double*)calloc(sizeof(double), n_samples); - - for(unsigned int i = 0; i < n_obs; i++) { - unsigned int count = obs_counts_resident[i]; - uint32_t *indices = obs_indices_resident[i]; - double *data = obs_data_resident[i]; - for(unsigned int j = 0; j < count; j++) { - uint32_t index = indices[j]; - double datum = data[j]; - sample_counts[index] += datum; + double *sample_counts = (double*)malloc(sizeof(double) * n_samples); + + #pragma omp parallel + { + #pragma omp for + for(int i = 0; i < n_samples; i++) { + sample_counts[i] = 0; + } + + #pragma omp for reduction(+ : sample_counts[:n_samples]) + for(unsigned int i = 0; i < n_obs; i++) { + unsigned int count = obs_counts_resident[i]; + uint32_t *indices = obs_indices_resident[i]; + double *data = obs_data_resident[i]; + for(unsigned int j = 0; j < count; j++) { + uint32_t index = indices[j]; + double datum = data[j]; + sample_counts[index] += datum; + } } } return sample_counts; From d53283aef8e5fe24ac2c37e25c59620ea3b0b357 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Thu, 17 Mar 2022 18:47:08 -0700 Subject: [PATCH 25/31] remove much of one level of copying --- src/biom.cpp | 88 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 36 deletions(-) diff --git a/src/biom.cpp b/src/biom.cpp index b36f388..52c2873 100644 --- a/src/biom.cpp +++ b/src/biom.cpp @@ -78,21 +78,25 @@ biom::biom(std::string filename) : has_hdf5_backing(true) { } biom::~biom() { - if(obs_indices_resident != NULL && obs_data_resident != NULL) { - for(unsigned int i = 0; i < n_obs; i++) { - if(obs_indices_resident[i] != NULL) - free(obs_indices_resident[i]); - if(obs_data_resident[i] != NULL) - free(obs_data_resident[i]); + if(has_hdf5_backing) { + if(obs_indices_resident != NULL && obs_data_resident != NULL) { + for(unsigned int i = 0; i < n_obs; i++) { + if(obs_indices_resident[i] != NULL) + free(obs_indices_resident[i]); + if(obs_data_resident[i] != NULL) + free(obs_data_resident[i]); + } } - } - - if(obs_indices_resident != NULL) - free(obs_indices_resident); - if(obs_data_resident != NULL) - free(obs_data_resident); - if(obs_counts_resident != NULL) - free(obs_counts_resident); + + if(obs_indices_resident != NULL) + free(obs_indices_resident); + if(obs_data_resident != NULL) + free(obs_data_resident); + if(obs_counts_resident != NULL) + free(obs_counts_resident); + } + // else, it is the responsibility of the entity constructing this object + // to clean itself up } void biom::malloc_resident(uint32_t n_obs) { @@ -138,7 +142,7 @@ biom::biom(char** obs_ids_in, sample_ids.reserve(n_samples); obs_ids = std::vector(); obs_ids.reserve(n_obs); - + #pragma omp parallel for schedule(static) for(int x = 0; x < 2; x++) { if(x == 0) { @@ -168,14 +172,14 @@ biom::biom(char** obs_ids_in, uint32_t *current_indices = NULL; double *current_data = NULL; - + #pragma omp parallel for schedule(static) for(unsigned int i = 0; i < n_obs; i++) { std::string id_ = obs_ids[i]; unsigned int n = get_obs_data_from_sparse(id_, indices, indptr, data, current_indices, current_data); obs_counts_resident[i] = n; - obs_indices_resident[i] = current_indices; - obs_data_resident[i] = current_data; + //obs_indices_resident[i] = current_indices; + //obs_data_resident[i] = current_data; } sample_counts = get_sample_counts(); } @@ -276,25 +280,37 @@ unsigned int biom::get_obs_data_from_sparse(const std::string &id_, int32_t start = indptr[idx]; int32_t end = indptr[idx + 1]; unsigned int count = end - start; - - current_indices_out = (uint32_t*)malloc(sizeof(uint32_t) * count); - if(current_indices_out == NULL) { - fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", - sizeof(uint32_t) * count, __FILE__, __LINE__); - exit(EXIT_FAILURE); - } - - current_data_out = (double*)malloc(sizeof(double) * count); - if(current_data_out == NULL) { - fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", - sizeof(double) * count, __FILE__, __LINE__); - exit(EXIT_FAILURE); - } - for(int i = 0, offset = start ; offset < end; i++, offset++) { - current_indices_out[i] = index[offset]; - current_data_out[i] = data[offset]; - } + ////////////// + // dont actually do this, but remove const upstream as we are no longer + // copying but adopting? + // + // this "works" but we need to correct typing upstream as well + //////////////////// + uint32_t* index_ptr = (uint32_t*)(const_cast(index + start)); + double* data_ptr = const_cast(data + start); + + this->obs_indices_resident[idx] = index_ptr; + this->obs_data_resident[idx] = data_ptr; + + //current_indices_out = (uint32_t*)malloc(sizeof(uint32_t) * count); + //if(current_indices_out == NULL) { + // fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", + // sizeof(uint32_t) * count, __FILE__, __LINE__); + // exit(EXIT_FAILURE); + //} + + //current_data_out = (double*)malloc(sizeof(double) * count); + //if(current_data_out == NULL) { + // fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", + // sizeof(double) * count, __FILE__, __LINE__); + // exit(EXIT_FAILURE); + //} + + //for(int i = 0, offset = start ; offset < end; i++, offset++) { + // current_indices_out[i] = index[offset]; + // current_data_out[i] = data[offset]; + //} return count; } From f2826d1adb6a5be58954a74b1dc052626ce43445 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 18 Mar 2022 14:17:59 -0700 Subject: [PATCH 26/31] inmem now full matrix, and fix input types for biom --- src/api.cpp | 153 ++++++++++++++++++++++++++++++++------------ src/api.hpp | 31 +++++++-- src/biom.cpp | 70 ++++---------------- src/biom.hpp | 10 ++- src/status_enum.hpp | 2 +- src/test_su.cpp | 4 +- 6 files changed, 159 insertions(+), 111 deletions(-) diff --git a/src/api.cpp b/src/api.cpp index 1a291ce..1a4b656 100644 --- a/src/api.cpp +++ b/src/api.cpp @@ -149,6 +149,34 @@ void initialize_mat_no_biom(mat_t* &result, char** sample_ids, unsigned int n_sa } } +inline compute_status is_fp64_method(const std::string &method_string, bool &fp64) { + if ((method_string=="unweighted_fp32") || (method_string=="weighted_normalized_fp32") || (method_string=="weighted_unnormalized_fp32") || (method_string=="generalized_fp32")) { + fp64 = false; + } else if ((method_string=="unweighted") || (method_string=="weighted_normalized") || (method_string=="weighted_unnormalized") || (method_string=="generalized")) { + fp64 = true; + } else { + return unknown_method; + } + + return okay; +} + + +inline compute_status is_fp64(const std::string &method_string, const std::string &format_string, bool &fp64) { + if (format_string == "hdf5_fp32") { + fp64 = false; + } else if (format_string == "hdf5_fp64") { + fp64 = true; + } else if (format_string == "hdf5") { + return is_fp64_method(method_string, fp64); + } else { + return unknown_method; + } + + return okay; +} + + template void initialize_mat_full_no_biom_T(TMat* &result, const char* const * sample_ids, unsigned int n_samples, const char *mmap_dir /* if NULL or "", use malloc */) { @@ -435,38 +463,20 @@ compute_status faith_pd_one_off(const char* biom_filename, const char* tree_file return okay; } -compute_status one_off_inmem(const support_biom_t *table_data, const support_bptree_t *tree_data, - const char* unifrac_method, bool variance_adjust, double alpha, - bool bypass_tips, unsigned int nthreads, mat_t** result) { - su::biom table(table_data->obs_ids, - table_data->sample_ids, - table_data->indices, - table_data->indptr, - table_data->data, - table_data->n_obs, - table_data->n_samples, - table_data->nnz); - - su::BPTree tree(tree_data->structure, - tree_data->lengths, - tree_data->names, - tree_data->n_parens); - - return one_off_inmem_cpp(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result); -} - compute_status one_off(const char* biom_filename, const char* tree_filename, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int nthreads, mat_t** result) { CHECK_FILE(biom_filename, table_missing) CHECK_FILE(tree_filename, tree_missing) PARSE_TREE_TABLE(tree_filename, table_filename) + + // condensed form return one_off_inmem_cpp(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result); } // TMat mat_full_fp32_t template -compute_status one_off_matrix_T(const char* biom_filename, const char* tree_filename, +compute_status one_off_matrix_T(su::biom &table, su::BPTree &tree, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int nthreads, const char *mmap_dir, @@ -475,10 +485,8 @@ compute_status one_off_matrix_T(const char* biom_filename, const char* tree_file if (mmap_dir[0]==0) mmap_dir = NULL; // easier to have a simple test going on } - CHECK_FILE(biom_filename, table_missing) - CHECK_FILE(tree_filename, tree_missing) SET_METHOD(unifrac_method, unknown_method) - PARSE_SYNC_TREE_TABLE(tree_filename, table_filename) + SYNC_TREE_TABLE(tree, table) const unsigned int stripe_stop = (table.n_samples + 1) / 2; partial_mat_t *partial_mat = NULL; @@ -527,7 +535,10 @@ compute_status one_off_matrix(const char* biom_filename, const char* tree_filena bool bypass_tips, unsigned int nthreads, const char *mmap_dir, mat_full_fp64_t** result) { - return one_off_matrix_T(biom_filename,tree_filename,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result); + CHECK_FILE(biom_filename, table_missing) + CHECK_FILE(tree_filename, tree_missing) + PARSE_TREE_TABLE(tree_filename, biom_filename) + return one_off_matrix_T(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result); } compute_status one_off_matrix_fp32(const char* biom_filename, const char* tree_filename, @@ -535,29 +546,89 @@ compute_status one_off_matrix_fp32(const char* biom_filename, const char* tree_f bool bypass_tips, unsigned int nthreads, const char *mmap_dir, mat_full_fp32_t** result) { - return one_off_matrix_T(biom_filename,tree_filename,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result); + CHECK_FILE(biom_filename, table_missing) + CHECK_FILE(tree_filename, tree_missing) + PARSE_TREE_TABLE(tree_filename, biom_filename) + return one_off_matrix_T(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result); } -inline compute_status is_fp64(const std::string &method_string, const std::string &format_string, bool &fp64) { - if (format_string == "hdf5_fp32") { - fp64 = false; - } else if (format_string == "hdf5_fp64") { - fp64 = true; - } else if (format_string == "hdf5") { - if ((method_string=="unweighted_fp32") || (method_string=="weighted_normalized_fp32") || (method_string=="weighted_unnormalized_fp32") || (method_string=="generalized_fp32")) { - fp64 = false; - } else if ((method_string=="unweighted") || (method_string=="weighted_normalized") || (method_string=="weighted_unnormalized") || (method_string=="generalized")) { - fp64 = true; +compute_status one_off_inmem_matrix(su::biom &table, su::BPTree &tree, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int nthreads, + mat_full_fp64_t** result) { + // NULL -> mmap_dir, we're doing full in memory here + return one_off_matrix_T(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,NULL,result); +} + +compute_status one_off_inmem_matrix_fp32(su::biom &table, su::BPTree &tree, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int nthreads, + mat_full_fp32_t** result) { + // NULL -> mmap_dir, we're doing full in memory here + return one_off_matrix_T(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,NULL,result); +} + +compute_status one_off_inmem(const support_biom_t *table_data, const support_bptree_t *tree_data, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int nthreads, mat_full_fp64_t** result) { + bool fp64; + compute_status rc = is_fp64_method(unifrac_method, fp64); + + if (rc == okay) { + if (!fp64) { + return invalid_method; + } } else { - return unknown_method; + return rc; } - } else { - return unknown_method; - } - return okay; + su::biom table(table_data->obs_ids, + table_data->sample_ids, + table_data->indices, + table_data->indptr, + table_data->data, + table_data->n_obs, + table_data->n_samples, + table_data->nnz); + + su::BPTree tree(tree_data->structure, + tree_data->lengths, + tree_data->names, + tree_data->n_parens); + + return one_off_inmem_matrix(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result); } +compute_status one_off_inmem_fp32(const support_biom_t *table_data, const support_bptree_t *tree_data, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int nthreads, mat_full_fp32_t** result) { + bool fp64; + compute_status rc = is_fp64_method(unifrac_method, fp64); + + if (rc == okay) { + if (fp64) { + return invalid_method; + } + } else { + return rc; + } + + su::biom table(table_data->obs_ids, + table_data->sample_ids, + table_data->indices, + table_data->indptr, + table_data->data, + table_data->n_obs, + table_data->n_samples, + table_data->nnz); + + su::BPTree tree(tree_data->structure, + tree_data->lengths, + tree_data->names, + tree_data->n_parens); + + return one_off_inmem_matrix_fp32(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result); +} compute_status unifrac_to_file(const char* biom_filename, const char* tree_filename, const char* out_filename, const char* unifrac_method, bool variance_adjust, double alpha, diff --git a/src/api.hpp b/src/api.hpp index dfce358..5804810 100644 --- a/src/api.hpp +++ b/src/api.hpp @@ -133,8 +133,8 @@ typedef struct partial_dyn_mat { typedef struct support_biom { char** obs_ids; char** sample_ids; - int32_t* indices; - int32_t* indptr; + uint32_t* indices; + uint32_t* indptr; double* data; int n_obs; int n_samples; @@ -187,7 +187,7 @@ EXTERN ComputeStatus one_off(const char* biom_filename, const char* tree_filenam bool bypass_tips, unsigned int threads, mat_t** result); -/* Compute UniFrac - condensed form, in-memory +/* Compute UniFrac - against in-memory objects returning full form matrix * * table a constructed BIOM object * tree a constructed BPTree object @@ -196,7 +196,7 @@ EXTERN ComputeStatus one_off(const char* biom_filename, const char* tree_filenam * alpha GUniFrac alpha, only relevant if method == generalized. * bypass_tips disregard tips, reduces compute by about 50% * threads the number of threads to use. - * result the resulting distance matrix in condensed form, this is initialized within the method so using ** + * result the resulting distance matrix in full form, this is initialized within the method so using ** * * one_off_inmem returns the following error codes: * @@ -206,7 +206,28 @@ EXTERN ComputeStatus one_off(const char* biom_filename, const char* tree_filenam */ EXTERN ComputeStatus one_off_inmem(const support_biom_t *table_data, const support_bptree_t *tree_data, const char* unifrac_method, bool variance_adjust, double alpha, - bool bypass_tips, unsigned int threads, mat_t** result); + bool bypass_tips, unsigned int threads, mat_full_fp64_t** result); + +/* Compute UniFrac - against in-memory objects returning full form matrix, fp32 + * + * table a constructed BIOM object + * tree a constructed BPTree object + * unifrac_method the requested unifrac method. + * variance_adjust whether to apply variance adjustment. + * alpha GUniFrac alpha, only relevant if method == generalized. + * bypass_tips disregard tips, reduces compute by about 50% + * threads the number of threads to use. + * result the resulting distance matrix in full form, this is initialized within the method so using ** + * + * one_off_inmem returns the following error codes: + * + * okay : no problems encountered + * unknown_method : the requested method is unknown. + * table_empty : the table does not have any entries + */ +EXTERN ComputeStatus one_off_inmem_fp32(const support_biom_t *table_data, const support_bptree_t *tree_data, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int threads, mat_full_fp32_t** result); /* Compute UniFrac - matrix form * diff --git a/src/biom.cpp b/src/biom.cpp index 52c2873..6470983 100644 --- a/src/biom.cpp +++ b/src/biom.cpp @@ -126,11 +126,12 @@ biom::biom() : has_hdf5_backing(false) { malloc_resident(0); } +// not using const on indices/indptr/data as the pointers are being borrowed biom::biom(char** obs_ids_in, char** samp_ids_in, - const int32_t* indices, - const int32_t* indptr, - const double* data, + uint32_t* indices, + uint32_t* indptr, + double* data, const int n_obs, const int n_samples, const int nnz) : has_hdf5_backing(false) { @@ -170,16 +171,18 @@ biom::biom(char** obs_ids_in, malloc_resident(n_obs); } - uint32_t *current_indices = NULL; - double *current_data = NULL; - #pragma omp parallel for schedule(static) for(unsigned int i = 0; i < n_obs; i++) { - std::string id_ = obs_ids[i]; - unsigned int n = get_obs_data_from_sparse(id_, indices, indptr, data, current_indices, current_data); - obs_counts_resident[i] = n; - //obs_indices_resident[i] = current_indices; - //obs_data_resident[i] = current_data; + int32_t start = indptr[i]; + int32_t end = indptr[i + 1]; + unsigned int count = end - start; + + uint32_t* index_ptr = (indices + start); + double* data_ptr = (data + start); + + obs_indices_resident[i] = index_ptr; + obs_data_resident[i] = data_ptr; + obs_counts_resident[i] = count; } sample_counts = get_sample_counts(); } @@ -270,51 +273,6 @@ void biom::create_id_index(const std::vector &ids, } } -unsigned int biom::get_obs_data_from_sparse(const std::string &id_, - const int32_t* index, - const int32_t* indptr, - const double* data, - uint32_t *& current_indices_out, - double *& current_data_out) { - uint32_t idx = obs_id_index.at(id_); - int32_t start = indptr[idx]; - int32_t end = indptr[idx + 1]; - unsigned int count = end - start; - - ////////////// - // dont actually do this, but remove const upstream as we are no longer - // copying but adopting? - // - // this "works" but we need to correct typing upstream as well - //////////////////// - uint32_t* index_ptr = (uint32_t*)(const_cast(index + start)); - double* data_ptr = const_cast(data + start); - - this->obs_indices_resident[idx] = index_ptr; - this->obs_data_resident[idx] = data_ptr; - - //current_indices_out = (uint32_t*)malloc(sizeof(uint32_t) * count); - //if(current_indices_out == NULL) { - // fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", - // sizeof(uint32_t) * count, __FILE__, __LINE__); - // exit(EXIT_FAILURE); - //} - - //current_data_out = (double*)malloc(sizeof(double) * count); - //if(current_data_out == NULL) { - // fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", - // sizeof(double) * count, __FILE__, __LINE__); - // exit(EXIT_FAILURE); - //} - - //for(int i = 0, offset = start ; offset < end; i++, offset++) { - // current_indices_out[i] = index[offset]; - // current_data_out[i] = data[offset]; - //} - - return count; -} - unsigned int biom::get_obs_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out) { if(!has_hdf5_backing) { fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n", diff --git a/src/biom.hpp b/src/biom.hpp index 4c514c0..3fc2c6d 100644 --- a/src/biom.hpp +++ b/src/biom.hpp @@ -43,9 +43,9 @@ namespace su { */ biom(char** obs_ids, char** samp_ids, - const int32_t* index, - const int32_t* indptr, - const double* data, + uint32_t* index, + uint32_t* indptr, + double* data, const int n_obs, const int n_samples, const int nnz); @@ -99,9 +99,7 @@ namespace su { unsigned int get_obs_data_from_sparse(const std::string &id_, const int32_t* index, const int32_t* indptr, - const double* data, - uint32_t *& current_indices_out, - double *& current_data_out); + const double* data); unsigned int get_obs_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out); unsigned int get_sample_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out); double* get_sample_counts(); diff --git a/src/status_enum.hpp b/src/status_enum.hpp index c88bb69..5ba8d00 100644 --- a/src/status_enum.hpp +++ b/src/status_enum.hpp @@ -1,7 +1,7 @@ #ifndef _UNIFRAC_STATUS_H #define _UNIFRAC_STATUS_H -typedef enum compute_status {okay=0, tree_missing, table_missing, table_empty, unknown_method, table_and_tree_do_not_overlap, output_error} ComputeStatus; +typedef enum compute_status {okay=0, tree_missing, table_missing, table_empty, unknown_method, table_and_tree_do_not_overlap, output_error, invalid_method} ComputeStatus; typedef enum io_status {read_okay=0, write_okay, open_error, read_error, magic_incompatible, bad_header, unexpected_end, write_error} IOStatus; typedef enum merge_status {merge_okay=0, incomplete_stripe_set, sample_id_consistency, square_mismatch, partials_mismatch, stripes_overlap} MergeStatus; diff --git a/src/test_su.cpp b/src/test_su.cpp index bf6c296..512bc5b 100644 --- a/src/test_su.cpp +++ b/src/test_su.cpp @@ -506,8 +506,8 @@ void _exercise_get_obs_data(su::biom &table) { void test_biom_constructor_from_sparse() { SUITE_START("biom from sparse constructor"); - int32_t index[] = {2, 0, 1, 3, 4, 5, 2, 3, 5, 0, 1, 2, 5, 1, 2}; - int32_t indptr[] = {0, 1, 6, 9, 13, 15}; + uint32_t index[] = {2, 0, 1, 3, 4, 5, 2, 3, 5, 0, 1, 2, 5, 1, 2}; + uint32_t indptr[] = {0, 1, 6, 9, 13, 15}; double data[] = {1., 5., 1., 2., 3., 1., 1., 4., 2., 2., 1., 1., 1., 1., 1.}; char* obs_ids[] = {"GG_OTU_1", "GG_OTU_2", "GG_OTU_3", "GG_OTU_4", "GG_OTU_5"}; char* samp_ids[] = {"Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6"}; From f37912bdb7ce32c9c00d01b73b4b362f0b33e0db Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 18 Mar 2022 15:33:51 -0700 Subject: [PATCH 27/31] minor change to resize and some parallel w/ bptree --- src/biom.cpp | 9 +++++---- src/biom.hpp | 7 ------- src/tree.cpp | 13 +++++++------ 3 files changed, 12 insertions(+), 17 deletions(-) diff --git a/src/biom.cpp b/src/biom.cpp index 6470983..11698c7 100644 --- a/src/biom.cpp +++ b/src/biom.cpp @@ -135,24 +135,25 @@ biom::biom(char** obs_ids_in, const int n_obs, const int n_samples, const int nnz) : has_hdf5_backing(false) { + this->nnz = nnz; this->n_samples = n_samples; this->n_obs = n_obs; sample_ids = std::vector(); - sample_ids.reserve(n_samples); + sample_ids.resize(n_samples); obs_ids = std::vector(); - obs_ids.reserve(n_obs); + obs_ids.resize(n_obs); #pragma omp parallel for schedule(static) for(int x = 0; x < 2; x++) { if(x == 0) { for(int i = 0; i < n_obs; i++) { - obs_ids.push_back(std::string(obs_ids_in[i])); + obs_ids[i] = std::string(obs_ids_in[i]); } } else { for(int i = 0; i < n_samples; i++) { - sample_ids.push_back(std::string(samp_ids_in[i])); + sample_ids[i] = std::string(samp_ids_in[i]); } } } diff --git a/src/biom.hpp b/src/biom.hpp index 3fc2c6d..b47967d 100644 --- a/src/biom.hpp +++ b/src/biom.hpp @@ -93,13 +93,6 @@ namespace su { void malloc_resident(uint32_t n_obs); - /* allow load from scipy.sparse like data - * WARNING: assumes data are CSR, with observations as rows - */ - unsigned int get_obs_data_from_sparse(const std::string &id_, - const int32_t* index, - const int32_t* indptr, - const double* data); unsigned int get_obs_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out); unsigned int get_sample_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out); double* get_sample_counts(); diff --git a/src/tree.cpp b/src/tree.cpp index 500526e..fc67031 100644 --- a/src/tree.cpp +++ b/src/tree.cpp @@ -57,16 +57,17 @@ BPTree::BPTree(const bool* input_structure, const double* input_lengths, char** lengths = std::vector(); names = std::vector(); - structure.reserve(n_parens); - lengths.reserve(n_parens); - names.reserve(n_parens); + structure.resize(n_parens); + lengths.resize(n_parens); + names.resize(n_parens); nparens = n_parens; + #pragma omp parallel for schedule(static) for(int i = 0; i < n_parens; i++) { - structure.push_back(input_structure[i]); - lengths.push_back(input_lengths[i]); - names.push_back(std::string(input_names[i])); + structure[i] = input_structure[i]; + lengths[i] = input_lengths[i]; + names[i] = std::string(input_names[i]); } openclose = std::vector(); From 0f98a9014a985bf3736a1130b25ead24c9eb1a2e Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 18 Mar 2022 17:39:51 -0700 Subject: [PATCH 28/31] remove unneeded parallels --- src/biom.cpp | 27 +++++++++++---------------- src/tree.cpp | 2 +- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/src/biom.cpp b/src/biom.cpp index 11698c7..7ac5011 100644 --- a/src/biom.cpp +++ b/src/biom.cpp @@ -429,23 +429,18 @@ unsigned int biom::get_sample_data_direct(const std::string &id, uint32_t *& cur double* biom::get_sample_counts() { double *sample_counts = (double*)malloc(sizeof(double) * n_samples); - #pragma omp parallel - { - #pragma omp for - for(int i = 0; i < n_samples; i++) { - sample_counts[i] = 0; - } + for(int i = 0; i < n_samples; i++) { + sample_counts[i] = 0; + } - #pragma omp for reduction(+ : sample_counts[:n_samples]) - for(unsigned int i = 0; i < n_obs; i++) { - unsigned int count = obs_counts_resident[i]; - uint32_t *indices = obs_indices_resident[i]; - double *data = obs_data_resident[i]; - for(unsigned int j = 0; j < count; j++) { - uint32_t index = indices[j]; - double datum = data[j]; - sample_counts[index] += datum; - } + for(unsigned int i = 0; i < n_obs; i++) { + unsigned int count = obs_counts_resident[i]; + uint32_t *indices = obs_indices_resident[i]; + double *data = obs_data_resident[i]; + for(unsigned int j = 0; j < count; j++) { + uint32_t index = indices[j]; + double datum = data[j]; + sample_counts[index] += datum; } } return sample_counts; diff --git a/src/tree.cpp b/src/tree.cpp index fc67031..57efd24 100644 --- a/src/tree.cpp +++ b/src/tree.cpp @@ -63,7 +63,7 @@ BPTree::BPTree(const bool* input_structure, const double* input_lengths, char** nparens = n_parens; - #pragma omp parallel for schedule(static) + //#pragma omp parallel for schedule(static) for(int i = 0; i < n_parens; i++) { structure[i] = input_structure[i]; lengths[i] = input_lengths[i]; From 02529387069c51a2171033c45d81755061732ca5 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Sun, 20 Mar 2022 18:07:20 -0700 Subject: [PATCH 29/31] Allow the caller to allocate results --- src/api.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/api.cpp b/src/api.cpp index 1a4b656..98fd836 100644 --- a/src/api.cpp +++ b/src/api.cpp @@ -509,7 +509,10 @@ compute_status one_off_matrix_T(su::biom &table, su::BPTree &tree, destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, 0, stripe_stop); } - initialize_mat_full_no_biom_T(*result, partial_mat->sample_ids, partial_mat->n_samples,mmap_dir); + // allow the caller to allocate the memory + if((*result) == NULL) { + initialize_mat_full_no_biom_T(*result, partial_mat->sample_ids, partial_mat->n_samples,mmap_dir); + } if (((*result)==NULL) || ((*result)->matrix==NULL) || ((*result)->sample_ids==NULL) ) { fprintf(stderr, "Memory allocation error! (initialize_mat)\n"); @@ -640,7 +643,7 @@ compute_status unifrac_to_file(const char* biom_filename, const char* tree_filen if (rc==okay) { if (fp64) { - mat_full_fp64_t* result; + mat_full_fp64_t* result = NULL; rc = one_off_matrix(biom_filename, tree_filename, unifrac_method, variance_adjust, alpha, bypass_tips, threads, mmap_dir, @@ -654,7 +657,7 @@ compute_status unifrac_to_file(const char* biom_filename, const char* tree_filen if (iostatus!=write_okay) rc=output_error; } } else { - mat_full_fp32_t* result; + mat_full_fp32_t* result = NULL; rc = one_off_matrix_fp32(biom_filename, tree_filename, unifrac_method, variance_adjust, alpha, bypass_tips, threads, mmap_dir, @@ -1507,7 +1510,10 @@ MergeStatus merge_partial_to_matrix_T(partial_dyn_mat_t* * partial_mats, int n_p MergeStatus err = check_partial(partial_mats, n_partials, false); if (err!=merge_okay) return err; - initialize_mat_full_no_biom_T(*result, partial_mats[0]->sample_ids, partial_mats[0]->n_samples,mmap_dir); + // allow the caller to allocate + if((*result) == NULL) { + initialize_mat_full_no_biom_T(*result, partial_mats[0]->sample_ids, partial_mats[0]->n_samples,mmap_dir); + } if ((*result)==NULL) return incomplete_stripe_set; if ((*result)->matrix==NULL) return incomplete_stripe_set; From 4f1ab8a0fcb6c8855e531587dc4428e7e8174ed5 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Sun, 20 Mar 2022 18:22:42 -0700 Subject: [PATCH 30/31] partial maybe is sensitive --- src/api.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/api.cpp b/src/api.cpp index 98fd836..a66eb15 100644 --- a/src/api.cpp +++ b/src/api.cpp @@ -1510,10 +1510,7 @@ MergeStatus merge_partial_to_matrix_T(partial_dyn_mat_t* * partial_mats, int n_p MergeStatus err = check_partial(partial_mats, n_partials, false); if (err!=merge_okay) return err; - // allow the caller to allocate - if((*result) == NULL) { - initialize_mat_full_no_biom_T(*result, partial_mats[0]->sample_ids, partial_mats[0]->n_samples,mmap_dir); - } + initialize_mat_full_no_biom_T(*result, partial_mats[0]->sample_ids, partial_mats[0]->n_samples,mmap_dir); if ((*result)==NULL) return incomplete_stripe_set; if ((*result)->matrix==NULL) return incomplete_stripe_set; From 80aeab07204b1d9bfcfb799fc251696071ca5af4 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 21 Mar 2022 14:38:16 -0700 Subject: [PATCH 31/31] Addressing @sfiligoi's comments --- src/biom.cpp | 6 +----- src/test_su.cpp | 4 ++-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/biom.cpp b/src/biom.cpp index 7ac5011..0ad3236 100644 --- a/src/biom.cpp +++ b/src/biom.cpp @@ -427,11 +427,7 @@ unsigned int biom::get_sample_data_direct(const std::string &id, uint32_t *& cur } double* biom::get_sample_counts() { - double *sample_counts = (double*)malloc(sizeof(double) * n_samples); - - for(int i = 0; i < n_samples; i++) { - sample_counts[i] = 0; - } + double *sample_counts = (double*)calloc(sizeof(double), n_samples); for(unsigned int i = 0; i < n_obs; i++) { unsigned int count = obs_counts_resident[i]; diff --git a/src/test_su.cpp b/src/test_su.cpp index 512bc5b..d1b7169 100644 --- a/src/test_su.cpp +++ b/src/test_su.cpp @@ -153,7 +153,7 @@ void test_bptree_constructor_from_existing() { //11101000 su::BPTree existing = su::BPTree("(('123:foo; bar':1,b:2)c);"); su::BPTree tree = su::BPTree(existing.get_structure(), existing.lengths, existing.names); - + unsigned int exp_nparens = 8; std::vector exp_structure; exp_structure.push_back(true); @@ -462,7 +462,7 @@ void test_biom_constructor() { ASSERT(table.obs_ids == exp_oids); ASSERT(table.sample_indptr == exp_s_indptr); ASSERT(table.obs_indptr == exp_o_indptr); - + SUITE_END(); }