Add kmer complexity filter

marbl · Jul 10, 2023 · 0ce0c7d · 0ce0c7d
1 parent 1db87ea
commit 0ce0c7d
Show file tree

Hide file tree

Showing 4 changed files with 22 additions and 9 deletions.
diff --git a/src/map/include/base_types.hpp b/src/map/include/base_types.hpp
@@ -166,7 +166,7 @@ namespace skch
 
                                                         //--for split read mapping
 
-    long double seqComplexity;                               // Estimated sequence complexity
+    long double kmerComplexity;                               // Estimated sequence complexity
     int n_merged;                                       // how many mappings we've merged into this one
     offset_t splitMappingId;                            // To identify split mappings that are chained
     uint8_t discard;                                    // set to 1 for deletion
@@ -276,7 +276,7 @@ namespace skch
       MinmerVec minmerTableQuery;         //Vector of minmers in the query
       MinmerVec seedHits;                 //Vector of minmers in the reference
       int refGroup;                       //Prefix group of sequence
-      float seqComplexity;                //Estimated sequence complexity
+      float kmerComplexity;                //Estimated sequence complexity
     };
 }
 

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
@@ -804,7 +804,7 @@ namespace skch
           int orig_len = Q.minmerTableQuery.size();
 #endif
           const double max_hash_01 = (long double)(Q.minmerTableQuery.back().hash) / std::numeric_limits<hash_t>::max();
-          Q.seqComplexity = (double(Q.minmerTableQuery.size()) / max_hash_01) / ((Q.len - param.kmerSize + 1)*2);
+          Q.kmerComplexity = (double(Q.minmerTableQuery.size()) / max_hash_01) / ((Q.len - param.kmerSize + 1)*2);
 
           // TODO remove them from the original sketch instead of removing for each read
           auto new_end = std::remove_if(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end(), [&](auto& mi) {
@@ -1191,8 +1191,9 @@ namespace skch
               // if we are in all-vs-all mode, it isn't a self-mapping,
               // and if we are self-mapping, the query is shorter than the target
               const auto& ref = this->refSketch.metadata[l2.seqId];
-              if((param.keep_low_pct_id && nucIdentityUpperBound >= param.percentageIdentity)
-                  || nucIdentity >= param.percentageIdentity)
+              if((Q.kmerComplexity >= param.kmerComplexityThreshold)
+                  && ((param.keep_low_pct_id && nucIdentityUpperBound >= param.percentageIdentity)
+                  || nucIdentity >= param.percentageIdentity))
               {
                 //Track the best jaccard numerator
                 bestJaccardNumerator = std::max<double>(bestJaccardNumerator, l2.sharedSketchSize);
@@ -1215,7 +1216,7 @@ namespace skch
                   res.blockLength = std::max(res.refEndPos - res.refStartPos, res.queryEndPos - res.queryStartPos);
                   res.approxMatches = std::round(res.nucIdentity * res.blockLength / 100.0);
                   res.strand = l2.strand; 
-                  res.seqComplexity = Q.seqComplexity;
+                  res.kmerComplexity = Q.kmerComplexity;
 
                   res.selfMapFilter = ((param.skip_self || param.skip_prefix) && Q.fullLen > ref.len);
 
@@ -1528,8 +1529,8 @@ namespace skch
                                   [](double x, MappingResult &e){ return x + e.nucIdentity; })     )/ std::distance(it, it_end);
 
             //Mean sequence complexity of all mappings in the chain
-            it->seqComplexity = (   std::accumulate(it, it_end, 0.0,
-                                  [](double x, MappingResult &e){ return x + e.seqComplexity; })     )/ std::distance(it, it_end);
+            it->kmerComplexity = (   std::accumulate(it, it_end, 0.0,
+                                  [](double x, MappingResult &e){ return x + e.kmerComplexity; })     )/ std::distance(it, it_end);
 
             //Discard other mappings of this chain
             std::for_each( std::next(it), it_end, [&](MappingResult &e){ e.discard = 1; });
@@ -1749,7 +1750,7 @@ namespace skch
                      << sep << e.blockLength
                      << sep << fakeMapQ
                      << sep << "id:f:" << (param.report_ANI_percentage ? 100.0 : 1.0) * e.nucIdentity
-                     << sep << "sc:f:" << e.seqComplexity;
+                     << sep << "kc:f:" << e.kmerComplexity;
             if (!param.mergeMappings) 
             {
               outstrm << sep << "jc:f:" << float(e.conservedSketches) / e.sketchSize;

diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp
@@ -60,6 +60,7 @@ struct Parameters
     bool keep_low_pct_id;                             //true if we should keep mappings whose estimated identity < percentageIdentity
     bool report_ANI_percentage;                       //true if ANI should be in [0,100] as opposed to [0,1] (this is necessary for wfmash
     bool filterLengthMismatches;                      //true if filtering out length mismatches
+    float kmerComplexityThreshold;                    //minimum kmer complexity to consider (default 0)
 
 
     int sketchSize;

diff --git a/src/map/include/parseCmdArgs.hpp b/src/map/include/parseCmdArgs.hpp
@@ -95,6 +95,7 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
     cmd.defineOptionAlternative("kmer","k");
 
     cmd.defineOption("kmerThreshold", "ignore the top \% most-frequent kmer window [default: 0.001]", ArgvParser::OptionRequiresValue);
+    cmd.defineOption("kmerComplexity", "threshold for kmer complexity [0, 1] [default : 0.0]", ArgvParser::OptionRequiresValue);
 
 
     //cmd.defineOption("shortenCandidateRegions", "Only compute rolling minhash score for small regions around positions where the intersection of reference and query minmers is locally maximal. Results in slighty faster runtimes at the cost of mapping placement and ANI prediction.");
@@ -510,6 +511,16 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
       parameters.percentageIdentity = 0.85;
     str.clear();
 
+    if(cmd.foundOption("kmerComplexity"))
+    {
+      str << cmd.optionValue("kmerComplexity");
+      str >> parameters.kmerComplexityThreshold;
+      str.clear();
+    }
+    else
+      parameters.kmerComplexityThreshold = 0.0;
+    str.clear();
+
 
     if (cmd.foundOption("hgFilterAniDiff")) {
       str << cmd.optionValue("hgFilterAniDiff");