Skip to content

Commit

Permalink
Add kmer complexity filter
Browse files Browse the repository at this point in the history
  • Loading branch information
bkille committed Jul 10, 2023
1 parent 1db87ea commit 0ce0c7d
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 9 deletions.
4 changes: 2 additions & 2 deletions src/map/include/base_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ namespace skch

//--for split read mapping

long double seqComplexity; // Estimated sequence complexity
long double kmerComplexity; // Estimated sequence complexity
int n_merged; // how many mappings we've merged into this one
offset_t splitMappingId; // To identify split mappings that are chained
uint8_t discard; // set to 1 for deletion
Expand Down Expand Up @@ -276,7 +276,7 @@ namespace skch
MinmerVec minmerTableQuery; //Vector of minmers in the query
MinmerVec seedHits; //Vector of minmers in the reference
int refGroup; //Prefix group of sequence
float seqComplexity; //Estimated sequence complexity
float kmerComplexity; //Estimated sequence complexity
};
}

Expand Down
15 changes: 8 additions & 7 deletions src/map/include/computeMap.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -804,7 +804,7 @@ namespace skch
int orig_len = Q.minmerTableQuery.size();
#endif
const double max_hash_01 = (long double)(Q.minmerTableQuery.back().hash) / std::numeric_limits<hash_t>::max();
Q.seqComplexity = (double(Q.minmerTableQuery.size()) / max_hash_01) / ((Q.len - param.kmerSize + 1)*2);
Q.kmerComplexity = (double(Q.minmerTableQuery.size()) / max_hash_01) / ((Q.len - param.kmerSize + 1)*2);

// TODO remove them from the original sketch instead of removing for each read
auto new_end = std::remove_if(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end(), [&](auto& mi) {
Expand Down Expand Up @@ -1191,8 +1191,9 @@ namespace skch
// if we are in all-vs-all mode, it isn't a self-mapping,
// and if we are self-mapping, the query is shorter than the target
const auto& ref = this->refSketch.metadata[l2.seqId];
if((param.keep_low_pct_id && nucIdentityUpperBound >= param.percentageIdentity)
|| nucIdentity >= param.percentageIdentity)
if((Q.kmerComplexity >= param.kmerComplexityThreshold)
&& ((param.keep_low_pct_id && nucIdentityUpperBound >= param.percentageIdentity)
|| nucIdentity >= param.percentageIdentity))
{
//Track the best jaccard numerator
bestJaccardNumerator = std::max<double>(bestJaccardNumerator, l2.sharedSketchSize);
Expand All @@ -1215,7 +1216,7 @@ namespace skch
res.blockLength = std::max(res.refEndPos - res.refStartPos, res.queryEndPos - res.queryStartPos);
res.approxMatches = std::round(res.nucIdentity * res.blockLength / 100.0);
res.strand = l2.strand;
res.seqComplexity = Q.seqComplexity;
res.kmerComplexity = Q.kmerComplexity;

res.selfMapFilter = ((param.skip_self || param.skip_prefix) && Q.fullLen > ref.len);

Expand Down Expand Up @@ -1528,8 +1529,8 @@ namespace skch
[](double x, MappingResult &e){ return x + e.nucIdentity; }) )/ std::distance(it, it_end);

//Mean sequence complexity of all mappings in the chain
it->seqComplexity = ( std::accumulate(it, it_end, 0.0,
[](double x, MappingResult &e){ return x + e.seqComplexity; }) )/ std::distance(it, it_end);
it->kmerComplexity = ( std::accumulate(it, it_end, 0.0,
[](double x, MappingResult &e){ return x + e.kmerComplexity; }) )/ std::distance(it, it_end);

//Discard other mappings of this chain
std::for_each( std::next(it), it_end, [&](MappingResult &e){ e.discard = 1; });
Expand Down Expand Up @@ -1749,7 +1750,7 @@ namespace skch
<< sep << e.blockLength
<< sep << fakeMapQ
<< sep << "id:f:" << (param.report_ANI_percentage ? 100.0 : 1.0) * e.nucIdentity
<< sep << "sc:f:" << e.seqComplexity;
<< sep << "kc:f:" << e.kmerComplexity;
if (!param.mergeMappings)
{
outstrm << sep << "jc:f:" << float(e.conservedSketches) / e.sketchSize;
Expand Down
1 change: 1 addition & 0 deletions src/map/include/map_parameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ struct Parameters
bool keep_low_pct_id; //true if we should keep mappings whose estimated identity < percentageIdentity
bool report_ANI_percentage; //true if ANI should be in [0,100] as opposed to [0,1] (this is necessary for wfmash
bool filterLengthMismatches; //true if filtering out length mismatches
float kmerComplexityThreshold; //minimum kmer complexity to consider (default 0)


int sketchSize;
Expand Down
11 changes: 11 additions & 0 deletions src/map/include/parseCmdArgs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
cmd.defineOptionAlternative("kmer","k");

cmd.defineOption("kmerThreshold", "ignore the top \% most-frequent kmer window [default: 0.001]", ArgvParser::OptionRequiresValue);
cmd.defineOption("kmerComplexity", "threshold for kmer complexity [0, 1] [default : 0.0]", ArgvParser::OptionRequiresValue);


//cmd.defineOption("shortenCandidateRegions", "Only compute rolling minhash score for small regions around positions where the intersection of reference and query minmers is locally maximal. Results in slighty faster runtimes at the cost of mapping placement and ANI prediction.");
Expand Down Expand Up @@ -510,6 +511,16 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
parameters.percentageIdentity = 0.85;
str.clear();

if(cmd.foundOption("kmerComplexity"))
{
str << cmd.optionValue("kmerComplexity");
str >> parameters.kmerComplexityThreshold;
str.clear();
}
else
parameters.kmerComplexityThreshold = 0.0;
str.clear();


if (cmd.foundOption("hgFilterAniDiff")) {
str << cmd.optionValue("hgFilterAniDiff");
Expand Down

0 comments on commit 0ce0c7d

Please sign in to comment.