Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Adding bounds checking for --scaled and --num in sourmash sketch #1711

Merged
merged 16 commits into from
Aug 27, 2021
50 changes: 26 additions & 24 deletions src/sourmash/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import os
import argparse
from sourmash.logging import notify
from sourmash.sourmash_args import check_scaled_bounds
from sourmash.sourmash_args import check_num_bounds
keyabarve marked this conversation as resolved.
Show resolved Hide resolved


def add_moltype_args(parser):
Expand Down Expand Up @@ -96,20 +98,20 @@ def command_list(dirpath):
return sorted(basenames)


def check_scaled_bounds(arg):
actual_min_val = 0
min_val = 100
max_val = 1e6
# def check_scaled_bounds(arg):
# actual_min_val = 0
# min_val = 100
# max_val = 1e6

f = float(arg)
# f = float(arg)

if f < actual_min_val:
raise argparse.ArgumentTypeError(f"ERROR: --scaled value must be positive")
if f < min_val:
notify('WARNING: --scaled value should be >= 100. Continuing anyway.')
if f > max_val:
notify('WARNING: --scaled value should be <= 1e6. Continuing anyway.')
return f
# if f < actual_min_val:
# raise argparse.ArgumentTypeError(f"ERROR: --scaled value must be positive")
# if f < min_val:
# notify('WARNING: --scaled value should be >= 100. Continuing anyway.')
# if f > max_val:
# notify('WARNING: --scaled value should be <= 1e6. Continuing anyway.')
# return f


def add_scaled_arg(parser, default=None):
Expand All @@ -119,20 +121,20 @@ def add_scaled_arg(parser, default=None):
)


def check_num_bounds(arg):
actual_min_val = 0
min_val = 50
max_val = 50000
# def check_num_bounds(arg):
# actual_min_val = 0
# min_val = 50
# max_val = 50000

f = int(arg)
# f = int(arg)

if f < actual_min_val:
raise argparse.ArgumentTypeError(f"ERROR: --num-hashes value must be positive")
if f < min_val:
notify('WARNING: --num-hashes value should be >= 50. Continuing anyway.')
if f > max_val:
notify('WARNING: --num-hashes value should be <= 50000. Continuing anyway.')
return f
# if f < actual_min_val:
# raise argparse.ArgumentTypeError(f"ERROR: --num-hashes value must be positive")
# if f < min_val:
# notify('WARNING: --num-hashes value should be >= 50. Continuing anyway.')
# if f > max_val:
# notify('WARNING: --num-hashes value should be <= 50000. Continuing anyway.')
# return f


def add_num_arg(parser, default=0):
Expand Down
35 changes: 28 additions & 7 deletions src/sourmash/command_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
from .logging import notify, error, set_quiet
from .command_compute import (_compute_individual, _compute_merged,
ComputeParameters)

from sourmash.sourmash_args import check_scaled_bounds
from sourmash.sourmash_args import check_num_bounds
keyabarve marked this conversation as resolved.
Show resolved Hide resolved

DEFAULTS = dict(
dna='k=31,scaled=1000,noabund',
Expand Down Expand Up @@ -38,8 +39,18 @@ def _parse_params_str(params_str):
num = int(num)
except ValueError:
raise ValueError(f"cannot parse num='{num}' as a number")
if num < 0:
raise ValueError(f"num is {num}, must be >= 0")
# if num < 0:
# raise ValueError(f"num is {num}, must be >= 0")

# if num < 0:
# raise ValueError(f"ERROR: num value must be positive")
# if num < 50:
# notify('WARNING: num value should be >= 50. Continuing anyway.')
# if num > 50000:
# notify('WARNING: num value should be <= 50000. Continuing anyway.')

num = check_num_bounds(num)

params['num'] = int(item[4:])
params['scaled'] = 0
elif item.startswith('scaled='):
Expand All @@ -50,10 +61,20 @@ def _parse_params_str(params_str):
scaled = int(scaled)
except ValueError:
raise ValueError(f"cannot parse scaled='{scaled}' as an integer")
if scaled < 0:
raise ValueError(f"scaled is {scaled}, must be >= 1")
if scaled > 1e8:
notify(f"WARNING: scaled value of {scaled} is nonsensical!?")
# if scaled < 0:
# raise ValueError(f"scaled is {scaled}, must be >= 1")
# if scaled > 1e8:
# notify(f"WARNING: scaled value of {scaled} is nonsensical!?")

# if scaled < 0:
# raise ValueError(f"ERROR: scaled value must be positive")
# if scaled < 100:
# notify('WARNING: scaled value should be >= 100. Continuing anyway.')
# if scaled > 1e6:
# notify('WARNING: scaled value should be <= 1e6. Continuing anyway.')

scaled = check_scaled_bounds(scaled)

params['scaled'] = scaled
params['num'] = 0
elif item.startswith('seed='):
Expand Down
33 changes: 33 additions & 0 deletions src/sourmash/sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,44 @@
from . import signature as sigmod
from .picklist import SignaturePicklist, PickStyle
from .manifest import CollectionManifest
import argparse
keyabarve marked this conversation as resolved.
Show resolved Hide resolved


DEFAULT_LOAD_K = 31


def check_scaled_bounds(arg):
actual_min_val = 0
keyabarve marked this conversation as resolved.
Show resolved Hide resolved
min_val = 100
max_val = 1e6

f = float(arg)

if f < actual_min_val:
raise argparse.ArgumentTypeError(f"ERROR: scaled value must be positive")
if f < min_val:
notify('WARNING: scaled value should be >= 100. Continuing anyway.')
if f > max_val:
notify('WARNING: scaled value should be <= 1e6. Continuing anyway.')
return f


def check_num_bounds(arg):
actual_min_val = 0
min_val = 50
max_val = 50000

f = int(arg)

if f < actual_min_val:
raise argparse.ArgumentTypeError(f"ERROR: num value must be positive")
if f < min_val:
notify('WARNING: num value should be >= 50. Continuing anyway.')
if f > max_val:
notify('WARNING: num value should be <= 50000. Continuing anyway.')
return f


def get_moltype(sig, require=False):
mh = sig.minhash
if mh.moltype in ('DNA', 'dayhoff', 'hp', 'protein'):
Expand Down
84 changes: 82 additions & 2 deletions tests/test_sourmash_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,84 @@
from sourmash_tst_utils import SourmashCommandFailed


def test_do_sourmash_sketch_check_scaled_bounds_negative():
with utils.TempDirectory() as location:
keyabarve marked this conversation as resolved.
Show resolved Hide resolved
testdata1 = utils.get_test_data('short.fa')
status, out, err = utils.runscript('sourmash',
['sketch', 'translate',
'-p', 'scaled=-5',
testdata1],
in_directory=location,
fail_ok=True)

assert "ERROR: scaled value must be positive" in err


def test_do_sourmash_sketch_check_scaled_bounds_less_than_minimum():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
status, out, err = utils.runscript('sourmash',
['sketch', 'translate',
'-p', 'scaled=50',
testdata1],
in_directory=location,
fail_ok=True)

assert "WARNING: scaled value should be >= 100. Continuing anyway." in err


def test_do_sourmash_sketch_check_scaled_bounds_more_than_maximum():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
status, out, err = utils.runscript('sourmash',
['sketch', 'translate',
'-p', 'scaled=1000000000',
testdata1],
in_directory=location,
fail_ok=True)

assert "WARNING: scaled value should be <= 1e6. Continuing anyway." in err


def test_do_sourmash_sketch_check_num_bounds_negative():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
status, out, err = utils.runscript('sourmash',
['sketch', 'translate',
'-p', 'num=-5',
testdata1],
in_directory=location,
fail_ok=True)

assert "ERROR: num value must be positive" in err


def test_do_sourmash_sketch_check_num_bounds_less_than_minimum():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
status, out, err = utils.runscript('sourmash',
['sketch', 'translate',
'-p', 'num=25',
testdata1],
in_directory=location,
fail_ok=True)

assert "WARNING: num value should be >= 50. Continuing anyway." in err


def test_do_sourmash_sketch_check_num_bounds_more_than_maximum():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
status, out, err = utils.runscript('sourmash',
['sketch', 'translate',
'-p', 'num=100000',
testdata1],
in_directory=location,
fail_ok=True)

assert "WARNING: num value should be <= 50000. Continuing anyway." in err


def test_dna_defaults():
factory = _signatures_for_sketch_factory([], 'dna', False)
params_list = list(factory.get_compute_params())
Expand Down Expand Up @@ -847,7 +925,8 @@ def test_do_sourmash_sketchdna_with_bad_scaled():
fail_ok=True)

assert status != 0
assert 'scaled is -1, must be >= 1' in err
# assert 'scaled is -1, must be >= 1' in err
assert 'ERROR: scaled value must be positive' in err

status, out, err = utils.runscript('sourmash',
['sketch', 'dna',
Expand All @@ -866,7 +945,8 @@ def test_do_sourmash_sketchdna_with_bad_scaled():
in_directory=location)

assert status == 0
assert 'WARNING: scaled value of 1000000000 is nonsensical!?' in err
# assert 'WARNING: scaled value of 1000000000 is nonsensical!?' in err
assert 'WARNING: scaled value should be <= 1e6. Continuing anyway.' in err


def test_do_sketch_with_seed():
Expand Down