diff --git a/src/Makefile b/src/Makefile index 2b97c05..ed050ab 100644 --- a/src/Makefile +++ b/src/Makefile @@ -135,6 +135,7 @@ install: libssu.so ssu faithpd mkdir -p ${PREFIX}/include/unifrac cp task_parameters.hpp ${PREFIX}/include/unifrac/ cp api.hpp ${PREFIX}/include/unifrac/ + cp status_enum.hpp ${PREFIX}/include/unifrac/ rapi_test: main mkdir -p ~/.R diff --git a/src/api.cpp b/src/api.cpp index 4566ca7..a66eb15 100644 --- a/src/api.cpp +++ b/src/api.cpp @@ -48,22 +48,25 @@ return err; \ } -#define PARSE_SYNC_TREE_TABLE(tree_filename, table_filename) std::ifstream ifs(tree_filename); \ - std::string content = std::string(std::istreambuf_iterator(ifs), \ - std::istreambuf_iterator()); \ - su::BPTree tree = su::BPTree(content); \ - su::biom table = su::biom(biom_filename); \ - if(table.n_samples <= 0 | table.n_obs <= 0) { \ - return table_empty; \ - } \ - std::string bad_id = su::test_table_ids_are_subset_of_tree(table, tree); \ - if(bad_id != "") { \ - return table_and_tree_do_not_overlap; \ - } \ - std::unordered_set to_keep(table.obs_ids.begin(), \ - table.obs_ids.end()); \ - su::BPTree tree_sheared = tree.shear(to_keep).collapse(); - +#define SYNC_TREE_TABLE(tree, table) std::unordered_set to_keep(table.obs_ids.begin(), \ + table.obs_ids.end()); \ + su::BPTree tree_sheared = tree.shear(to_keep).collapse(); + +#define PARSE_TREE_TABLE(tree_filename, table_filename) std::ifstream ifs(tree_filename); \ + std::string content = std::string(std::istreambuf_iterator(ifs), \ + std::istreambuf_iterator()); \ + su::BPTree tree = su::BPTree(content); \ + su::biom table = su::biom(biom_filename); \ + if(table.n_samples <= 0 | table.n_obs <= 0) { \ + return table_empty; \ + } \ + std::string bad_id = su::test_table_ids_are_subset_of_tree(table, tree); \ + if(bad_id != "") { \ + return table_and_tree_do_not_overlap; \ + } + +#define PARSE_SYNC_TREE_TABLE(tree_filename, table_filename) PARSE_TREE_TABLE(tree_filename, table_filename) \ + SYNC_TREE_TABLE(tree, table) using namespace su; using namespace std; @@ -115,7 +118,7 @@ void initialize_mat(mat_t* &result, biom &table, bool is_upper_triangle) { } } -void initialize_results_vec(r_vec* &result, biom& table){ +void initialize_results_vec(r_vec* &result, biom &table){ // Stores results for Faith PD result = (r_vec*)malloc(sizeof(results_vec)); result->n_samples = table.n_samples; @@ -146,6 +149,34 @@ void initialize_mat_no_biom(mat_t* &result, char** sample_ids, unsigned int n_sa } } +inline compute_status is_fp64_method(const std::string &method_string, bool &fp64) { + if ((method_string=="unweighted_fp32") || (method_string=="weighted_normalized_fp32") || (method_string=="weighted_unnormalized_fp32") || (method_string=="generalized_fp32")) { + fp64 = false; + } else if ((method_string=="unweighted") || (method_string=="weighted_normalized") || (method_string=="weighted_unnormalized") || (method_string=="generalized")) { + fp64 = true; + } else { + return unknown_method; + } + + return okay; +} + + +inline compute_status is_fp64(const std::string &method_string, const std::string &format_string, bool &fp64) { + if (format_string == "hdf5_fp32") { + fp64 = false; + } else if (format_string == "hdf5_fp64") { + fp64 = true; + } else if (format_string == "hdf5") { + return is_fp64_method(method_string, fp64); + } else { + return unknown_method; + } + + return okay; +} + + template void initialize_mat_full_no_biom_T(TMat* &result, const char* const * sample_ids, unsigned int n_samples, const char *mmap_dir /* if NULL or "", use malloc */) { @@ -348,6 +379,37 @@ void set_tasks(std::vector &tasks, } } +compute_status one_off_inmem_cpp(su::biom &table, su::BPTree &tree, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int nthreads, mat_t** result) { + SYNC_TREE_TABLE(tree, table) + SET_METHOD(unifrac_method, unknown_method) + + const unsigned int stripe_stop = (table.n_samples + 1) / 2; + std::vector dm_stripes(stripe_stop); + std::vector dm_stripes_total(stripe_stop); + + if(nthreads > dm_stripes.size()) { + fprintf(stderr, "More threads were requested than stripes. Using %d threads.\n", dm_stripes.size()); + nthreads = dm_stripes.size(); + } + + std::vector tasks(nthreads); + std::vector threads(nthreads); + + set_tasks(tasks, alpha, table.n_samples, 0, stripe_stop, bypass_tips, nthreads); + su::process_stripes(table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks); + + initialize_mat(*result, table, true); // true -> is_upper_triangle + for(unsigned int tid = 0; tid < threads.size(); tid++) { + su::stripes_to_condensed_form(dm_stripes,table.n_samples,(*result)->condensed_form,tasks[tid].start,tasks[tid].stop); + } + + destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, 0, 0); + + return okay; +} + compute_status partial(const char* biom_filename, const char* tree_filename, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int nthreads, unsigned int stripe_start, unsigned int stripe_stop, @@ -404,40 +466,17 @@ compute_status faith_pd_one_off(const char* biom_filename, const char* tree_file compute_status one_off(const char* biom_filename, const char* tree_filename, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int nthreads, mat_t** result) { - CHECK_FILE(biom_filename, table_missing) CHECK_FILE(tree_filename, tree_missing) - SET_METHOD(unifrac_method, unknown_method) - PARSE_SYNC_TREE_TABLE(tree_filename, table_filename) + PARSE_TREE_TABLE(tree_filename, table_filename) - const unsigned int stripe_stop = (table.n_samples + 1) / 2; - std::vector dm_stripes(stripe_stop); - std::vector dm_stripes_total(stripe_stop); - - if(nthreads > dm_stripes.size()) { - fprintf(stderr, "More threads were requested than stripes. Using %d threads.\n", dm_stripes.size()); - nthreads = dm_stripes.size(); - } - - std::vector tasks(nthreads); - std::vector threads(nthreads); - - set_tasks(tasks, alpha, table.n_samples, 0, stripe_stop, bypass_tips, nthreads); - su::process_stripes(table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks); - - initialize_mat(*result, table, true); // true -> is_upper_triangle - for(unsigned int tid = 0; tid < threads.size(); tid++) { - su::stripes_to_condensed_form(dm_stripes,table.n_samples,(*result)->condensed_form,tasks[tid].start,tasks[tid].stop); - } - - destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, 0, 0); - - return okay; + // condensed form + return one_off_inmem_cpp(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result); } // TMat mat_full_fp32_t template -compute_status one_off_matrix_T(const char* biom_filename, const char* tree_filename, +compute_status one_off_matrix_T(su::biom &table, su::BPTree &tree, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int nthreads, const char *mmap_dir, @@ -446,10 +485,8 @@ compute_status one_off_matrix_T(const char* biom_filename, const char* tree_file if (mmap_dir[0]==0) mmap_dir = NULL; // easier to have a simple test going on } - CHECK_FILE(biom_filename, table_missing) - CHECK_FILE(tree_filename, tree_missing) SET_METHOD(unifrac_method, unknown_method) - PARSE_SYNC_TREE_TABLE(tree_filename, table_filename) + SYNC_TREE_TABLE(tree, table) const unsigned int stripe_stop = (table.n_samples + 1) / 2; partial_mat_t *partial_mat = NULL; @@ -472,7 +509,10 @@ compute_status one_off_matrix_T(const char* biom_filename, const char* tree_file destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, 0, stripe_stop); } - initialize_mat_full_no_biom_T(*result, partial_mat->sample_ids, partial_mat->n_samples,mmap_dir); + // allow the caller to allocate the memory + if((*result) == NULL) { + initialize_mat_full_no_biom_T(*result, partial_mat->sample_ids, partial_mat->n_samples,mmap_dir); + } if (((*result)==NULL) || ((*result)->matrix==NULL) || ((*result)->sample_ids==NULL) ) { fprintf(stderr, "Memory allocation error! (initialize_mat)\n"); @@ -498,7 +538,10 @@ compute_status one_off_matrix(const char* biom_filename, const char* tree_filena bool bypass_tips, unsigned int nthreads, const char *mmap_dir, mat_full_fp64_t** result) { - return one_off_matrix_T(biom_filename,tree_filename,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result); + CHECK_FILE(biom_filename, table_missing) + CHECK_FILE(tree_filename, tree_missing) + PARSE_TREE_TABLE(tree_filename, biom_filename) + return one_off_matrix_T(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result); } compute_status one_off_matrix_fp32(const char* biom_filename, const char* tree_filename, @@ -506,29 +549,89 @@ compute_status one_off_matrix_fp32(const char* biom_filename, const char* tree_f bool bypass_tips, unsigned int nthreads, const char *mmap_dir, mat_full_fp32_t** result) { - return one_off_matrix_T(biom_filename,tree_filename,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result); + CHECK_FILE(biom_filename, table_missing) + CHECK_FILE(tree_filename, tree_missing) + PARSE_TREE_TABLE(tree_filename, biom_filename) + return one_off_matrix_T(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result); } -inline compute_status is_fp64(const std::string &method_string, const std::string &format_string, bool &fp64) { - if (format_string == "hdf5_fp32") { - fp64 = false; - } else if (format_string == "hdf5_fp64") { - fp64 = true; - } else if (format_string == "hdf5") { - if ((method_string=="unweighted_fp32") || (method_string=="weighted_normalized_fp32") || (method_string=="weighted_unnormalized_fp32") || (method_string=="generalized_fp32")) { - fp64 = false; - } else if ((method_string=="unweighted") || (method_string=="weighted_normalized") || (method_string=="weighted_unnormalized") || (method_string=="generalized")) { - fp64 = true; +compute_status one_off_inmem_matrix(su::biom &table, su::BPTree &tree, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int nthreads, + mat_full_fp64_t** result) { + // NULL -> mmap_dir, we're doing full in memory here + return one_off_matrix_T(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,NULL,result); +} + +compute_status one_off_inmem_matrix_fp32(su::biom &table, su::BPTree &tree, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int nthreads, + mat_full_fp32_t** result) { + // NULL -> mmap_dir, we're doing full in memory here + return one_off_matrix_T(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,NULL,result); +} + +compute_status one_off_inmem(const support_biom_t *table_data, const support_bptree_t *tree_data, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int nthreads, mat_full_fp64_t** result) { + bool fp64; + compute_status rc = is_fp64_method(unifrac_method, fp64); + + if (rc == okay) { + if (!fp64) { + return invalid_method; + } } else { - return unknown_method; + return rc; } - } else { - return unknown_method; - } - return okay; + su::biom table(table_data->obs_ids, + table_data->sample_ids, + table_data->indices, + table_data->indptr, + table_data->data, + table_data->n_obs, + table_data->n_samples, + table_data->nnz); + + su::BPTree tree(tree_data->structure, + tree_data->lengths, + tree_data->names, + tree_data->n_parens); + + return one_off_inmem_matrix(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result); } +compute_status one_off_inmem_fp32(const support_biom_t *table_data, const support_bptree_t *tree_data, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int nthreads, mat_full_fp32_t** result) { + bool fp64; + compute_status rc = is_fp64_method(unifrac_method, fp64); + + if (rc == okay) { + if (fp64) { + return invalid_method; + } + } else { + return rc; + } + + su::biom table(table_data->obs_ids, + table_data->sample_ids, + table_data->indices, + table_data->indptr, + table_data->data, + table_data->n_obs, + table_data->n_samples, + table_data->nnz); + + su::BPTree tree(tree_data->structure, + tree_data->lengths, + tree_data->names, + tree_data->n_parens); + + return one_off_inmem_matrix_fp32(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result); +} compute_status unifrac_to_file(const char* biom_filename, const char* tree_filename, const char* out_filename, const char* unifrac_method, bool variance_adjust, double alpha, @@ -540,7 +643,7 @@ compute_status unifrac_to_file(const char* biom_filename, const char* tree_filen if (rc==okay) { if (fp64) { - mat_full_fp64_t* result; + mat_full_fp64_t* result = NULL; rc = one_off_matrix(biom_filename, tree_filename, unifrac_method, variance_adjust, alpha, bypass_tips, threads, mmap_dir, @@ -554,7 +657,7 @@ compute_status unifrac_to_file(const char* biom_filename, const char* tree_filen if (iostatus!=write_okay) rc=output_error; } } else { - mat_full_fp32_t* result; + mat_full_fp32_t* result = NULL; rc = one_off_matrix_fp32(biom_filename, tree_filename, unifrac_method, variance_adjust, alpha, bypass_tips, threads, mmap_dir, diff --git a/src/api.hpp b/src/api.hpp index be3bd63..5804810 100644 --- a/src/api.hpp +++ b/src/api.hpp @@ -1,4 +1,5 @@ #include "task_parameters.hpp" +#include "status_enum.hpp" #ifdef __cplusplus #include @@ -14,10 +15,6 @@ #define PARTIAL_MAGIC_V2 0x088ABA02 -typedef enum compute_status {okay=0, tree_missing, table_missing, table_empty, unknown_method, table_and_tree_do_not_overlap, output_error} ComputeStatus; -typedef enum io_status {read_okay=0, write_okay, open_error, read_error, magic_incompatible, bad_header, unexpected_end, write_error} IOStatus; -typedef enum merge_status {merge_okay=0, incomplete_stripe_set, sample_id_consistency, square_mismatch, partials_mismatch, stripes_overlap} MergeStatus; - /* a result matrix * * n_samples the number of samples. @@ -122,14 +119,49 @@ typedef struct partial_dyn_mat { char* filename; } partial_dyn_mat_t; +/* support structure to carry in biom table information + * + * obs_ids the observation IDs + * sample_ids the sample IDs + * indices the indices of the data values + * indptr the row offset of the data values + * data the actual matrix values + * n_obs the number of observations, corresponding to length of obs_ids + * n_samples the number of samples, corresponding to the length of sample_ids + * nnz the number of nonzero values, corresponding to the length of data and indices + */ +typedef struct support_biom { + char** obs_ids; + char** sample_ids; + uint32_t* indices; + uint32_t* indptr; + double* data; + int n_obs; + int n_samples; + int nnz; +} support_biom_t; + +/* support structure to carry in bptree information + * + * structure the topology of the tree + * lengths the branch lengths of the tree + * names the names of the tips and internal nodes of hte tree + * n_parens the length of the structure array + */ +typedef struct support_bptree { + bool* structure; + double* lengths; + char** names; + int n_parens; +} support_bptree_t; -void destroy_mat(mat_t** result); -void destroy_mat_full_fp64(mat_full_fp64_t** result); -void destroy_mat_full_fp32(mat_full_fp32_t** result); -void destroy_partial_mat(partial_mat_t** result); -void destroy_partial_dyn_mat(partial_dyn_mat_t** result); -void destroy_results_vec(r_vec** result); +EXTERN void destroy_mat(mat_t** result); +EXTERN void destroy_mat_full_fp64(mat_full_fp64_t** result); +EXTERN void destroy_mat_full_fp32(mat_full_fp32_t** result); +EXTERN void destroy_partial_mat(partial_mat_t** result); +EXTERN void destroy_partial_dyn_mat(partial_dyn_mat_t** result); +EXTERN void destroy_results_vec(r_vec** result); /* Compute UniFrac - condensed form * @@ -154,6 +186,49 @@ EXTERN ComputeStatus one_off(const char* biom_filename, const char* tree_filenam const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int threads, mat_t** result); + +/* Compute UniFrac - against in-memory objects returning full form matrix + * + * table a constructed BIOM object + * tree a constructed BPTree object + * unifrac_method the requested unifrac method. + * variance_adjust whether to apply variance adjustment. + * alpha GUniFrac alpha, only relevant if method == generalized. + * bypass_tips disregard tips, reduces compute by about 50% + * threads the number of threads to use. + * result the resulting distance matrix in full form, this is initialized within the method so using ** + * + * one_off_inmem returns the following error codes: + * + * okay : no problems encountered + * unknown_method : the requested method is unknown. + * table_empty : the table does not have any entries + */ +EXTERN ComputeStatus one_off_inmem(const support_biom_t *table_data, const support_bptree_t *tree_data, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int threads, mat_full_fp64_t** result); + +/* Compute UniFrac - against in-memory objects returning full form matrix, fp32 + * + * table a constructed BIOM object + * tree a constructed BPTree object + * unifrac_method the requested unifrac method. + * variance_adjust whether to apply variance adjustment. + * alpha GUniFrac alpha, only relevant if method == generalized. + * bypass_tips disregard tips, reduces compute by about 50% + * threads the number of threads to use. + * result the resulting distance matrix in full form, this is initialized within the method so using ** + * + * one_off_inmem returns the following error codes: + * + * okay : no problems encountered + * unknown_method : the requested method is unknown. + * table_empty : the table does not have any entries + */ +EXTERN ComputeStatus one_off_inmem_fp32(const support_biom_t *table_data, const support_bptree_t *tree_data, + const char* unifrac_method, bool variance_adjust, double alpha, + bool bypass_tips, unsigned int threads, mat_full_fp32_t** result); + /* Compute UniFrac - matrix form * * biom_filename the filename to the biom table. diff --git a/src/biom.cpp b/src/biom.cpp index a62a9e2..0ad3236 100644 --- a/src/biom.cpp +++ b/src/biom.cpp @@ -26,7 +26,7 @@ const std::string SAMPLE_INDICES = std::string("/sample/matrix/indices"); const std::string SAMPLE_DATA = std::string("/sample/matrix/data"); const std::string SAMPLE_IDS = std::string("/sample/ids"); -biom::biom(std::string filename) { +biom::biom(std::string filename) : has_hdf5_backing(true) { file = H5File(filename.c_str(), H5F_ACC_RDONLY); /* establish the datasets */ @@ -55,9 +55,51 @@ biom::biom(std::string filename) { obs_id_index = std::unordered_map(); sample_id_index = std::unordered_map(); - create_id_index(obs_ids, obs_id_index); - create_id_index(sample_ids, sample_id_index); + #pragma omp parallel for schedule(static) + for(int i = 0; i < 3; i++) { + if(i == 0) + create_id_index(obs_ids, obs_id_index); + else if(i == 1) + create_id_index(sample_ids, sample_id_index); + else if(i == 2) + malloc_resident(n_obs); + } + + uint32_t *current_indices = NULL; + double *current_data = NULL; + for(unsigned int i = 0; i < obs_ids.size(); i++) { + std::string id_ = obs_ids[i]; + unsigned int n = get_obs_data_direct(id_, current_indices, current_data); + obs_counts_resident[i] = n; + obs_indices_resident[i] = current_indices; + obs_data_resident[i] = current_data; + } + sample_counts = get_sample_counts(); +} + +biom::~biom() { + if(has_hdf5_backing) { + if(obs_indices_resident != NULL && obs_data_resident != NULL) { + for(unsigned int i = 0; i < n_obs; i++) { + if(obs_indices_resident[i] != NULL) + free(obs_indices_resident[i]); + if(obs_data_resident[i] != NULL) + free(obs_data_resident[i]); + } + } + + if(obs_indices_resident != NULL) + free(obs_indices_resident); + if(obs_data_resident != NULL) + free(obs_data_resident); + if(obs_counts_resident != NULL) + free(obs_counts_resident); + } + // else, it is the responsibility of the entity constructing this object + // to clean itself up +} +void biom::malloc_resident(uint32_t n_obs) { /* load obs sparse data */ obs_indices_resident = (uint32_t**)malloc(sizeof(uint32_t**) * n_obs); if(obs_indices_resident == NULL) { @@ -77,30 +119,82 @@ biom::biom(std::string filename) { sizeof(unsigned int) * n_obs, __FILE__, __LINE__); exit(EXIT_FAILURE); } +} - uint32_t *current_indices = NULL; - double *current_data = NULL; - for(unsigned int i = 0; i < obs_ids.size(); i++) { - std::string id_ = obs_ids[i]; - unsigned int n = get_obs_data_direct(id_, current_indices, current_data); - obs_counts_resident[i] = n; - obs_indices_resident[i] = current_indices; - obs_data_resident[i] = current_data; - } - sample_counts = get_sample_counts(); +biom::biom() : has_hdf5_backing(false) { + n_obs = 0; + malloc_resident(0); } -biom::~biom() { - for(unsigned int i = 0; i < n_obs; i++) { - free(obs_indices_resident[i]); - free(obs_data_resident[i]); +// not using const on indices/indptr/data as the pointers are being borrowed +biom::biom(char** obs_ids_in, + char** samp_ids_in, + uint32_t* indices, + uint32_t* indptr, + double* data, + const int n_obs, + const int n_samples, + const int nnz) : has_hdf5_backing(false) { + + this->nnz = nnz; + this->n_samples = n_samples; + this->n_obs = n_obs; + + sample_ids = std::vector(); + sample_ids.resize(n_samples); + obs_ids = std::vector(); + obs_ids.resize(n_obs); + + #pragma omp parallel for schedule(static) + for(int x = 0; x < 2; x++) { + if(x == 0) { + for(int i = 0; i < n_obs; i++) { + obs_ids[i] = std::string(obs_ids_in[i]); + } + } else { + for(int i = 0; i < n_samples; i++) { + sample_ids[i] = std::string(samp_ids_in[i]); + } + } } - free(obs_indices_resident); - free(obs_data_resident); - free(obs_counts_resident); + + /* define a mapping between an ID and its corresponding offset */ + obs_id_index = std::unordered_map(); + sample_id_index = std::unordered_map(); + + #pragma omp parallel for schedule(static) + for(int i = 0; i < 3; i++) { + if(i == 0) + create_id_index(obs_ids, obs_id_index); + else if(i == 1) + create_id_index(sample_ids, sample_id_index); + else if(i == 2) + malloc_resident(n_obs); + } + + #pragma omp parallel for schedule(static) + for(unsigned int i = 0; i < n_obs; i++) { + int32_t start = indptr[i]; + int32_t end = indptr[i + 1]; + unsigned int count = end - start; + + uint32_t* index_ptr = (indices + start); + double* data_ptr = (data + start); + + obs_indices_resident[i] = index_ptr; + obs_data_resident[i] = data_ptr; + obs_counts_resident[i] = count; + } + sample_counts = get_sample_counts(); } void biom::set_nnz() { + if(!has_hdf5_backing) { + fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n", + __FILE__, __LINE__); + exit(EXIT_FAILURE); + } + // should these be cached? DataType dtype = obs_data.getDataType(); DataSpace dataspace = obs_data.getSpace(); @@ -111,6 +205,12 @@ void biom::set_nnz() { } void biom::load_ids(const char *path, std::vector &ids) { + if(!has_hdf5_backing) { + fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n", + __FILE__, __LINE__); + exit(EXIT_FAILURE); + } + DataSet ds_ids = file.openDataSet(path); DataType dtype = ds_ids.getDataType(); DataSpace dataspace = ds_ids.getSpace(); @@ -138,6 +238,12 @@ void biom::load_ids(const char *path, std::vector &ids) { } void biom::load_indptr(const char *path, std::vector &indptr) { + if(!has_hdf5_backing) { + fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n", + __FILE__, __LINE__); + exit(EXIT_FAILURE); + } + DataSet ds = file.openDataSet(path); DataType dtype = ds.getDataType(); DataSpace dataspace = ds.getSpace(); @@ -159,7 +265,7 @@ void biom::load_indptr(const char *path, std::vector &indptr) { free(dataout); } -void biom::create_id_index(std::vector &ids, +void biom::create_id_index(const std::vector &ids, std::unordered_map &map) { uint32_t count = 0; map.reserve(ids.size()); @@ -169,6 +275,12 @@ void biom::create_id_index(std::vector &ids, } unsigned int biom::get_obs_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out) { + if(!has_hdf5_backing) { + fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n", + __FILE__, __LINE__); + exit(EXIT_FAILURE); + } + uint32_t idx = obs_id_index.at(id); uint32_t start = obs_indptr[idx]; uint32_t end = obs_indptr[idx + 1]; @@ -270,6 +382,12 @@ void biom::get_obs_data_range(const std::string &id, unsigned int start, unsigne } unsigned int biom::get_sample_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out) { + if(!has_hdf5_backing) { + fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n", + __FILE__, __LINE__); + exit(EXIT_FAILURE); + } + uint32_t idx = sample_id_index.at(id); uint32_t start = sample_indptr[idx]; uint32_t end = sample_indptr[idx + 1]; @@ -310,6 +428,7 @@ unsigned int biom::get_sample_data_direct(const std::string &id, uint32_t *& cur double* biom::get_sample_counts() { double *sample_counts = (double*)calloc(sizeof(double), n_samples); + for(unsigned int i = 0; i < n_obs; i++) { unsigned int count = obs_counts_resident[i]; uint32_t *indices = obs_indices_resident[i]; diff --git a/src/biom.hpp b/src/biom.hpp index 7e3fdb1..b47967d 100644 --- a/src/biom.hpp +++ b/src/biom.hpp @@ -21,12 +21,35 @@ namespace su { class biom : public biom_interface { public: + /* nullary constructor */ + biom(); + /* default constructor * * @param filename The path to the BIOM table to read */ biom(std::string filename); + /* constructor from compress sparse data + * + * @param obs_ids vector of observation identifiers + * @param samp_ids vector of sample identifiers + * @param index vector of index positions + * @param indptr vector of indptr positions + * @param data vector of observation counts + * @param n_obs number of observations + * @param n_samples number of samples + * @param nnz number of data points + */ + biom(char** obs_ids, + char** samp_ids, + uint32_t* index, + uint32_t* indptr, + double* data, + const int n_obs, + const int n_samples, + const int nnz); + /* default destructor * * Temporary arrays are freed @@ -55,8 +78,9 @@ namespace su { */ void get_obs_data_range(const std::string &id, unsigned int start, unsigned int end, bool normalize, double* out) const; void get_obs_data_range(const std::string &id, unsigned int start, unsigned int end, bool normalize, float* out) const; - private: + bool has_hdf5_backing = false; + /* retain DataSet handles within the HDF5 file */ H5::DataSet obs_indices; H5::DataSet sample_indices; @@ -66,11 +90,14 @@ namespace su { uint32_t **obs_indices_resident; double **obs_data_resident; unsigned int *obs_counts_resident; + + void malloc_resident(uint32_t n_obs); unsigned int get_obs_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out); unsigned int get_sample_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out); double* get_sample_counts(); + /* At construction, lookups mapping IDs -> index position within an * axis are defined */ @@ -100,7 +127,7 @@ namespace su { * @param ids A vector of IDs to index * @param map A hash table to populate */ - void create_id_index(std::vector &ids, + void create_id_index(const std::vector &ids, std::unordered_map &map); diff --git a/src/status_enum.hpp b/src/status_enum.hpp new file mode 100644 index 0000000..5ba8d00 --- /dev/null +++ b/src/status_enum.hpp @@ -0,0 +1,8 @@ +#ifndef _UNIFRAC_STATUS_H +#define _UNIFRAC_STATUS_H + +typedef enum compute_status {okay=0, tree_missing, table_missing, table_empty, unknown_method, table_and_tree_do_not_overlap, output_error, invalid_method} ComputeStatus; +typedef enum io_status {read_okay=0, write_okay, open_error, read_error, magic_incompatible, bad_header, unexpected_end, write_error} IOStatus; +typedef enum merge_status {merge_okay=0, incomplete_stripe_set, sample_id_consistency, square_mismatch, partials_mismatch, stripes_overlap} MergeStatus; + +#endif diff --git a/src/test_su.cpp b/src/test_su.cpp index 9ef3d33..d1b7169 100644 --- a/src/test_su.cpp +++ b/src/test_su.cpp @@ -153,9 +153,8 @@ void test_bptree_constructor_from_existing() { //11101000 su::BPTree existing = su::BPTree("(('123:foo; bar':1,b:2)c);"); su::BPTree tree = su::BPTree(existing.get_structure(), existing.lengths, existing.names); - + unsigned int exp_nparens = 8; - std::vector exp_structure; exp_structure.push_back(true); exp_structure.push_back(true); @@ -467,10 +466,7 @@ void test_biom_constructor() { SUITE_END(); } -void test_biom_get_obs_data() { - SUITE_START("biom get obs data"); - - su::biom table = su::biom("test.biom"); +void _exercise_get_obs_data(su::biom &table) { double exp0[] = {0.0, 0.0, 1.0, 0.0, 0.0, 0.0}; std::vector exp0_vec = _double_array_to_vector(exp0, 6); double exp1[] = {5.0, 1.0, 0.0, 2.0, 3.0, 1.0}; @@ -506,9 +502,43 @@ void test_biom_get_obs_data() { ASSERT(vec_almost_equal(obs_vec, exp4_vec)); free(out); +} + +void test_biom_constructor_from_sparse() { + SUITE_START("biom from sparse constructor"); + uint32_t index[] = {2, 0, 1, 3, 4, 5, 2, 3, 5, 0, 1, 2, 5, 1, 2}; + uint32_t indptr[] = {0, 1, 6, 9, 13, 15}; + double data[] = {1., 5., 1., 2., 3., 1., 1., 4., 2., 2., 1., 1., 1., 1., 1.}; + char* obs_ids[] = {"GG_OTU_1", "GG_OTU_2", "GG_OTU_3", "GG_OTU_4", "GG_OTU_5"}; + char* samp_ids[] = {"Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6"}; + + su::biom table = su::biom(obs_ids, samp_ids, index, indptr, data, 5, 6, 15); + _exercise_get_obs_data(table); + + SUITE_END(); +} + +void test_biom_nullary() { + SUITE_START("biom nullary"); + su::biom table = su::biom(); + SUITE_END(); +} + +void test_bptree_nullary() { + SUITE_START("bptree nullary"); + su::BPTree tree = su::BPTree(); + SUITE_END(); +} + +void test_biom_get_obs_data() { + SUITE_START("biom get obs data"); + + su::biom table = su::biom("test.biom"); + _exercise_get_obs_data(table); SUITE_END(); } + void test_bptree_leftchild() { SUITE_START("test bptree left child"); su::BPTree tree = su::BPTree("((3,4,(6)5)2,7,((10,100)9)8)1;"); @@ -1802,6 +1832,66 @@ void test_set_tasks() { SUITE_END(); } +void test_bptree_cstyle_constructor() { + SUITE_START("test bptree constructor from c-style data"); + //01234567 + //11101000 + bool structure[] = {true, true, true, false, true, false, false, false}; + double lengths[] = {0, 0, 1, 0, 2, 0, 0, 0}; + char* names[] = {"", "c", "123:foo; bar", "", "b", "", "", ""}; + su::BPTree tree = su::BPTree(structure, lengths, names, 8); + + unsigned int exp_nparens = 8; + + std::vector exp_structure; + exp_structure.push_back(true); + exp_structure.push_back(true); + exp_structure.push_back(true); + exp_structure.push_back(false); + exp_structure.push_back(true); + exp_structure.push_back(false); + exp_structure.push_back(false); + exp_structure.push_back(false); + + std::vector exp_openclose; + exp_openclose.push_back(7); + exp_openclose.push_back(6); + exp_openclose.push_back(3); + exp_openclose.push_back(2); + exp_openclose.push_back(5); + exp_openclose.push_back(4); + exp_openclose.push_back(1); + exp_openclose.push_back(0); + + std::vector exp_names; + exp_names.push_back(std::string()); + exp_names.push_back(std::string("c")); + exp_names.push_back(std::string("123:foo; bar")); + exp_names.push_back(std::string()); + exp_names.push_back(std::string("b")); + exp_names.push_back(std::string()); + exp_names.push_back(std::string()); + exp_names.push_back(std::string()); + + std::vector exp_lengths; + exp_lengths.push_back(0.0); + exp_lengths.push_back(0.0); + exp_lengths.push_back(1.0); + exp_lengths.push_back(0.0); + exp_lengths.push_back(2.0); + exp_lengths.push_back(0.0); + exp_lengths.push_back(0.0); + exp_lengths.push_back(0.0); + + ASSERT(tree.nparens == exp_nparens); + ASSERT(tree.get_structure() == exp_structure); + ASSERT(tree.get_openclose() == exp_openclose); + ASSERT(tree.lengths == exp_lengths); + ASSERT(tree.names == exp_names); + + SUITE_END(); +} + void test_bptree_constructor_newline_bug() { SUITE_START("test bptree constructor newline bug"); su::BPTree tree = su::BPTree("((362be41f31fd26be95ae43a8769b91c0:0.116350803,(a16679d5a10caa9753f171977552d920:0.105836235,((a7acc2abb505c3ee177a12e514d3b994:0.008268754,(4e22aa3508b98813f52e1a12ffdb74ad:0.03144211,8139c4ac825dae48454fb4800fb87896:0.043622957)0.923:0.046588301)0.997:0.120902074,((2d3df7387323e2edcbbfcb6e56a02710:0.031543994,3f6752aabcc291b67a063fb6492fd107:0.091571442)0.759:0.016335166,((d599ebe277afb0dfd4ad3c2176afc50e:5e-09,84d0affc7243c7d6261f3a7d680b873f:0.010245188)0.883:0.048993011,51121722488d0c3da1388d1b117cd239:0.119447926)0.763:0.035660204)0.921:0.058191474)0.776:0.02854575)0.657:0.052060833)0.658:0.032547569,(99647b51f775c8ddde8ed36a7d60dbcd:0.173334268,(f18a9c8112372e2916a66a9778f3741b:0.194813398,(5833416522de0cca717a1abf720079ac:5e-09,(2bf1067d2cd4f09671e3ebe5500205ca:0.031692682,(b32621bcd86cb99e846d8f6fee7c9ab8:0.031330707,1016319c25196d73bdb3096d86a9df2f:5e-09)0.058:0.01028612)0.849:0.010284866)0.791:0.041353384)0.922:0.109470534):0.022169824000000005)root;\n\n"); @@ -1818,6 +1908,8 @@ int main(int argc, char** argv) { test_bptree_constructor_edgecases(); test_bptree_constructor_quoted_comma(); test_bptree_constructor_quoted_parens(); + test_bptree_cstyle_constructor(); + test_bptree_nullary(); test_bptree_postorder(); test_bptree_preorder(); test_bptree_parent(); @@ -1832,6 +1924,8 @@ int main(int argc, char** argv) { test_bptree_collapse_edge(); test_biom_constructor(); + test_biom_constructor_from_sparse(); + test_biom_nullary(); test_biom_get_obs_data(); test_propstack_constructor(); diff --git a/src/tree.cpp b/src/tree.cpp index 3dc6f98..57efd24 100644 --- a/src/tree.cpp +++ b/src/tree.cpp @@ -4,6 +4,8 @@ using namespace su; +BPTree::BPTree() { } + BPTree::BPTree(std::string newick) { openclose = std::vector(); lengths = std::vector(); @@ -50,6 +52,36 @@ BPTree::BPTree(std::vector input_structure, std::vector input_leng index_and_cache(); } +BPTree::BPTree(const bool* input_structure, const double* input_lengths, char** input_names, const int n_parens) { + structure = std::vector(); + lengths = std::vector(); + names = std::vector(); + + structure.resize(n_parens); + lengths.resize(n_parens); + names.resize(n_parens); + + nparens = n_parens; + + //#pragma omp parallel for schedule(static) + for(int i = 0; i < n_parens; i++) { + structure[i] = input_structure[i]; + lengths[i] = input_lengths[i]; + names[i] = std::string(input_names[i]); + } + + openclose = std::vector(); + select_0_index = std::vector(); + select_1_index = std::vector(); + openclose.resize(nparens); + select_0_index.resize(nparens / 2); + select_1_index.resize(nparens / 2); + excess.resize(nparens); + + structure_to_openclose(); + index_and_cache(); +} + BPTree BPTree::mask(std::vector topology_mask, std::vector in_lengths) { std::vector new_structure = std::vector(); @@ -197,7 +229,7 @@ void BPTree::index_and_cache() { } } -uint32_t BPTree::postorderselect(uint32_t k) const { +uint32_t BPTree::postorderselect(uint32_t k) const { return open(select_0_index[k]); } diff --git a/src/tree.hpp b/src/tree.hpp index 99f61df..cee4b4c 100644 --- a/src/tree.hpp +++ b/src/tree.hpp @@ -19,6 +19,9 @@ namespace su { class BPTree { public: + /* nullary constructor */ + BPTree(); + /* tracked attributes */ std::vector lengths; std::vector names; @@ -41,6 +44,15 @@ namespace su { BPTree(std::vector input_structure, std::vector input_lengths, std::vector input_names); ~BPTree(); + /* constructor from a defined topology using c-types + * + * @param input_structure A boolean array defining the topology + * @param input_lengths A double array of the branch lengths + * @param input_names A char* array of the names + * @param n_parens The length of the topology + */ + BPTree(const bool* input_structure, const double* input_lengths, char** input_names, const int n_parens); + /* postorder tree traversal * * Get the index position of the ith node in a postorder tree