From 1727690224fb9c521ebed4b1a8a8dbfce480dfdc Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Tue, 28 Jan 2025 15:50:27 -0600 Subject: [PATCH] consider the mask when filling bp_length --- src/subcommand/similarity_main.cpp | 67 ++++++++++++++++-------------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/src/subcommand/similarity_main.cpp b/src/subcommand/similarity_main.cpp index 7f05ff4d..a9257dae 100644 --- a/src/subcommand/similarity_main.cpp +++ b/src/subcommand/similarity_main.cpp @@ -140,6 +140,36 @@ args::Group threading_opts(parser, "[ Threading ]"); }); } + // ska::flat_hash_map, uint64_t> leads to huge memory usage with deep graphs + // Load mask if specified + std::vector node_mask(graph.get_node_count(), true); // Default all nodes included + if (mask_file) { + std::ifstream mask_in(args::get(mask_file)); + std::string line; + uint64_t line_count = 0; + while (std::getline(mask_in, line)) { + if (line_count >= graph.get_node_count()) { + std::cerr << "[odgi::similarity] error: mask file has more lines than graph nodes (" + << graph.get_node_count() << ")" << std::endl; + return 1; + } + if (line == "0") { + node_mask[line_count] = false; + } else if (line == "1") { + node_mask[line_count] = true; + } else { + std::cerr << "[odgi::similarity] error: mask file should contain only 0 or 1 values, found: " << line << std::endl; + return 1; + } + line_count++; + } + if (line_count != graph.get_node_count()) { + std::cerr << "[odgi::similarity] error: mask file should have exactly " << graph.get_node_count() + << " lines, found: " << line_count << std::endl; + return 1; + } + } + auto get_path_name = (using_delim ? (std::function) @@ -186,7 +216,12 @@ args::Group threading_opts(parser, "[ Threading ]"); graph.for_each_step_in_path( p, [&](const step_handle_t& s) { - path_length += graph.get_length(graph.get_handle_of_step(s)); + auto h = graph.get_handle_of_step(s); + // Skip masked-out nodes + if (!node_mask[graph.get_id(h) - 1]) { + return; + } + path_length += graph.get_length(h); }); #pragma omp critical (bp_count) bp_count[get_path_id(p)] += path_length; @@ -199,36 +234,6 @@ args::Group threading_opts(parser, "[ Threading ]"); graph.get_node_count(), "[odgi::similarity] collecting path intersection lengths"); } - // ska::flat_hash_map, uint64_t> leads to huge memory usage with deep graphs - // Load mask if specified - std::vector node_mask(graph.get_node_count(), true); // Default all nodes included - if (mask_file) { - std::ifstream mask_in(args::get(mask_file)); - std::string line; - uint64_t line_count = 0; - while (std::getline(mask_in, line)) { - if (line_count >= graph.get_node_count()) { - std::cerr << "[odgi::similarity] error: mask file has more lines than graph nodes (" - << graph.get_node_count() << ")" << std::endl; - return 1; - } - if (line == "0") { - node_mask[line_count] = false; - } else if (line == "1") { - node_mask[line_count] = true; - } else { - std::cerr << "[odgi::similarity] error: mask file should contain only 0 or 1 values, found: " << line << std::endl; - return 1; - } - line_count++; - } - if (line_count != graph.get_node_count()) { - std::cerr << "[odgi::similarity] error: mask file should have exactly " << graph.get_node_count() - << " lines, found: " << line_count << std::endl; - return 1; - } - } - ska::flat_hash_map path_intersection_length; graph.for_each_handle( [&](const handle_t& h) {