Skip to content

Commit

Permalink
Added very explicitly written logic to choose a minimum kmer count fo…
Browse files Browse the repository at this point in the history
…r filtering depending on whether the user has specified their own value or if there are enough fastq files to fit the mixture model
  • Loading branch information
jhellewell14 committed Jan 20, 2025
1 parent 755b6ce commit 41fdd25
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 7 deletions.
4 changes: 2 additions & 2 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,8 @@ pub enum Commands {
single_strand: bool,

/// Minimum k-mer count (with reads)
#[arg(long, default_value_t = DEFAULT_MINCOUNT)]
min_count: u16,
#[arg(long)]
min_count: Option<u16>,

/// Minimum k-mer quality (with reads)
#[arg(long, default_value_t = DEFAULT_MINQUAL)]
Expand Down
66 changes: 61 additions & 5 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -503,17 +503,73 @@ pub fn main() {

// Read input
let input_files = get_input_list(file_list, seq_files);
let rc = !*single_strand;

// Are there >=2 fastq files?
let enough_fastq = input_files
.iter()
.filter(|file| file.2.is_some())
.count()
.ge(&2);

// Is min_count provided by the user?
let not_provided = min_count.is_none();

// Minimum kmer count logic
let min_kmer_count: u16 = if enough_fastq & not_provided {
// Calculate cutoff
// Get first two fastq files
let mut files_iter = input_files
.iter()
.filter(|file| file.2.is_some())
.take(2)
.map(|x| x.1.clone());
let fastq_fwd: String = files_iter.next().unwrap();
let fastq_rev: String = files_iter.next().unwrap();

let cutoff;
if *k <= 31 {
log::info!("k={}: using 64-bit representation", *k);
let mut cov =
CoverageHistogram::<u64>::new(&fastq_fwd, &fastq_rev, *k, rc, args.verbose);
cutoff = cov.fit_histogram().expect("Couldn't fit coverage model");
cov.plot_hist();
} else {
log::info!("k={}: using 128-bit representation", *k);
let mut cov = CoverageHistogram::<u128>::new(
&fastq_fwd,
&fastq_rev,
*k,
rc,
args.verbose,
);
cutoff = cov.fit_histogram().expect("Couldn't fit coverage model");
cov.plot_hist();
}
cutoff as u16
} else if !not_provided {
// check provided value and use
let val = min_count.unwrap();
if val.ge(&1) {
val
} else {
panic!("Minimum k-mer count cannot be less than 1");
}
} else {
// Use default
log::info!("Not enough fastq files, using default kmer count of 5");
DEFAULT_MINCOUNT
};

let quality = QualOpts {
min_count: *min_count,
min_count: min_kmer_count,
min_qual: *min_qual,
qual_filter: *qual_filter,
};

// Build, merge
let rc = !*single_strand;

if *k <= 31 {
log::info!("k={}: using 64-bit representation", *k);
// log::info!("k={}: using 64-bit representation", *k);
let merged_dict = build_and_merge::<u64>(
&input_files,
*k,
Expand All @@ -526,7 +582,7 @@ pub fn main() {
// Save
save_skf(&merged_dict, output);
} else {
log::info!("k={}: using 128-bit representation", *k);
// log::info!("k={}: using 128-bit representation", *k);
let merged_dict = build_and_merge::<u128>(
&input_files,
*k,
Expand Down

0 comments on commit 41fdd25

Please sign in to comment.