Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] refactor LCA_Database to support programmatic creation. #946

Merged
merged 38 commits into from
Apr 18, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
ecf1e56
calculate fraction of match that was in original query, for gather
ctb Apr 7, 2020
26db2af
Update sourmash/search.py
ctb Apr 10, 2020
b02177c
restore column order; fix downsampling in contained_by
ctb Apr 10, 2020
5a030ef
add explicit tests for the various f_ columns in gather csv output
ctb Apr 10, 2020
0372a6f
fix test for py27
ctb Apr 13, 2020
cb3ffe6
fix comments, simplify code
ctb Apr 13, 2020
2e021b5
update gather docs
ctb Apr 13, 2020
0d79fa0
update docs to be more correct and clearer
ctb Apr 14, 2020
210ffa2
initial start on refactor of LCA_Database to support programmatic cre…
ctb Apr 15, 2020
7a71c4a
moar refactor
ctb Apr 15, 2020
b4389ae
Merge branch 'master' into update/gather_outputs
ctb Apr 15, 2020
82d0b7d
Merge branch 'master' of github.com:dib-lab/sourmash into refactor/lc…
ctb Apr 15, 2020
d286e87
Merge branch 'update/gather_outputs' of github.com:dib-lab/sourmash i…
ctb Apr 15, 2020
9d1d8d7
move idx and lid creation into class
ctb Apr 15, 2020
8a14595
move stuff into __init__
ctb Apr 15, 2020
1240c7b
factor out signature insertion code
ctb Apr 15, 2020
54b6aa4
Merge branch 'master' of github.com:dib-lab/sourmash into refactor/lc…
ctb Apr 15, 2020
958edfd
Merge branch 'master' into refactor/lca_db_create
ctb Apr 15, 2020
f1fc1e9
Merge branch 'master' of github.com:dib-lab/sourmash into refactor/lc…
ctb Apr 15, 2020
491de56
commentary/cleanup
ctb Apr 16, 2020
b0561d9
interim commit for refactored ident -> lineage code
ctb Apr 16, 2020
7b93bf0
remove now-unnecessary cleanup code
ctb Apr 16, 2020
427e6fe
re-add missing set
ctb Apr 16, 2020
40b5de7
create new lca.lca_db module
ctb Apr 16, 2020
c41ac92
merge refactored LCA_Database creation code into main class
ctb Apr 16, 2020
3e2e694
some initial API tests
ctb Apr 16, 2020
aba6b86
refactor new LCA_Database.insert_signature to match Index.insert sign…
ctb Apr 16, 2020
1a9dcf1
add cache invalidation to insert_signature
ctb Apr 16, 2020
b8bdd9a
define insert to match Index behavior
ctb Apr 16, 2020
22fec7b
moar documentation and cleanup
ctb Apr 16, 2020
e6b0f9a
comment find_signatures
ctb Apr 16, 2020
7cf31a8
add test for some basic lineage functionality
ctb Apr 17, 2020
85f8711
added some whitebox tests of LCA_Database
ctb Apr 17, 2020
c2ad151
more comments & tests; make _find_signatures internal only
ctb Apr 18, 2020
4c05cbd
test scaling during insert and after
ctb Apr 18, 2020
5de5779
more commenting
ctb Apr 18, 2020
5dc6f8b
pull in more from lca_utils
ctb Apr 18, 2020
c1eee31
Merge branch 'master' of github.com:dib-lab/sourmash into refactor/lc…
ctb Apr 18, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions sourmash/lca/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
"LCA and reverse index utilities."

from .lca_db import LCA_Database
from .lca_utils import (taxlist, zip_lineage, build_tree, find_lca,
gather_assignments, LineagePair, display_lineage,
count_lca_for_assignments)

from .command_index import index
from .command_classify import classify
from .command_summarize import summarize_main
from .command_rankinfo import rankinfo_main
from .command_gather import gather_main
from .__main__ import main

140 changes: 37 additions & 103 deletions sourmash/lca/command_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
import csv
from collections import defaultdict

from .. import sourmash_args, load_signatures
from ..logging import notify, error, debug, set_quiet
from sourmash import sourmash_args, load_signatures
from sourmash.logging import notify, error, debug, set_quiet
from . import lca_utils
from .lca_utils import LineagePair
from ..sourmash_args import DEFAULT_LOAD_K
from .lca_db import LCA_Database
from sourmash.sourmash_args import DEFAULT_LOAD_K


def load_taxonomy_assignments(filename, delimiter=',', start_column=2,
Expand Down Expand Up @@ -152,56 +153,12 @@ def index(args):
use_headers=not args.no_headers,
force=args.force)

# convert identities to numbers.
ident_to_idx = {}
idx_to_lid = {}

lid_to_lineage = {}
lineage_to_lid = {}

arg_d = dict(next_index=0, next_lid=0) # hack to keep from using nonlocal

def get_ident_index(ident, fail_on_duplicate=False, arg_d=arg_d):
idx = ident_to_idx.get(ident)
if fail_on_duplicate:
assert idx is None # should be no duplicate identities

if idx is None:
idx = arg_d['next_index']
arg_d['next_index'] += 1

ident_to_idx[ident] = idx

return idx

def get_lineage_id(lineage, arg_d=arg_d):
# lineage -> id
lid = lineage_to_lid.get(lineage)
if lid is None:
lid = arg_d['next_lid']
arg_d['next_lid'] += 1

lineage_to_lid[lineage] = lid
lid_to_lineage[lid] = lineage

return lid

for (ident, lineage) in assignments.items():
idx = get_ident_index(ident, fail_on_duplicate=True)
lid = get_lineage_id(lineage)

# index -> lineage id
idx_to_lid[idx] = lid

notify('{} distinct identities in spreadsheet out of {} rows.',
len(idx_to_lid), num_rows)
len(assignments), num_rows)
notify('{} distinct lineages in spreadsheet out of {} rows.',
len(set(idx_to_lid.values())), num_rows)
len(set(assignments.values())), num_rows)

# load signatures, construct index of hashvals to ident
hashval_to_idx = defaultdict(set)
md5_to_name = {}
ident_to_name = {}
db = LCA_Database(args.ksize, args.scaled)

# notify('finding signatures...')
if args.traverse_directory:
Expand All @@ -213,6 +170,9 @@ def get_lineage_id(lineage, arg_d=arg_d):
else:
inp_files = list(args.signatures)

# track duplicates
md5_to_name = {}

#
# main loop, connecting lineage ID to signature.
#
Expand All @@ -221,7 +181,7 @@ def get_lineage_id(lineage, arg_d=arg_d):
total_n = len(inp_files)
record_duplicates = set()
record_no_lineage = set()
record_remnants = set(ident_to_idx.keys())
record_remnants = set(assignments)
record_used_lineages = set()
record_used_idents = set()
n_skipped = 0
Expand All @@ -232,110 +192,84 @@ def get_lineage_id(lineage, arg_d=arg_d):
notify('\r... loading signature {} (file {} of {}); skipped {} so far', sig.name()[:30], n, total_n, n_skipped, end='')
debug(filename, sig.name())

# block off duplicates.
if sig.md5sum() in md5_to_name:
debug('WARNING: in file {}, duplicate md5sum: {}; skipping', filename, sig.md5sum())
record_duplicates.add(filename)
continue

md5_to_name[sig.md5sum()] = sig.name()

# parse identifier, potentially with splitting
ident = sig.name()
if args.split_identifiers: # hack for NCBI-style names, etc.
ident = ident.split(' ')[0].split('.')[0]

# store full name
ident_to_name[ident] = sig.name()
lineage = assignments.get(ident)

# store md5 -> name too
md5_to_name[sig.md5sum()] = sig.name()

# remove from our list of remnant lineages
# punt if no lineage and --require-taxonomy
if lineage is None and args.require_taxonomy:
debug('(skipping, because --require-taxonomy was specified)')
n_skipped += 1
continue

# add the signature into the database.
db.insert(sig, ident=ident, lineage=lineage)

# remove from our list of remaining lineages
try:
record_remnants.remove(ident)
except KeyError:
# @CTB
pass

# track ident as used
record_used_idents.add(ident)

# downsample to specified scaled; this has the side effect of
# making sure they're all at the same scaled value!
minhash = sig.minhash.downsample_scaled(args.scaled)

# connect hashvals to identity (and maybe lineage)
idx = get_ident_index(ident)
lid = idx_to_lid.get(idx)

lineage = None
if lid is not None:
lineage = lid_to_lineage.get(lid)

# track lineage info - either no lineage, or this lineage used.
if lineage is None:
debug('WARNING: no lineage assignment for {}.', ident)
record_no_lineage.add(ident)
else:
record_used_lineages.add(lineage)

if lineage is None and args.require_taxonomy:
debug('(skipping, because --require-taxonomy was specified)')
n_skipped += 1
continue

for hashval in minhash.get_mins():
hashval_to_idx[hashval].add(idx)
# end main add signatures loop

notify(u'\r\033[K', end=u'')

# check -- did we find any signatures?
if n == 0:
error('ERROR: no signatures found. ??')
if args.traverse_directory and not args.force:
error('(note, with --traverse-directory, you may want to use -f)')
sys.exit(1)

if not hashval_to_idx:
# check -- did the signatures we found have any hashes?
if not db.hashval_to_idx:
error('ERROR: no hash values found - are there any signatures?')
sys.exit(1)

# summarize:
notify('{} assigned lineages out of {} distinct lineages in spreadsheet.',
len(record_used_lineages), len(set(assignments.values())))
unused_lineages = set(assignments.values()) - record_used_lineages

notify('{} identifiers used out of {} distinct identifiers in spreadsheet.',
len(record_used_idents), len(set(assignments)))

# remove unused identifiers
unused_identifiers = set(assignments) - record_used_idents
for ident in unused_identifiers:
assert ident not in ident_to_name
idx = get_ident_index(ident)
del ident_to_idx[ident]
if idx in idx_to_lid:
del idx_to_lid[idx]

# remove unusued lineages and lids
for lineage in unused_lineages:
lid = lineage_to_lid[lineage]
del lineage_to_lid[lineage]
del lid_to_lineage[lid]

# now, save!
db_outfile = args.lca_db_out
if not (db_outfile.endswith('.lca.json') or \
db_outfile.endswith('.lca.json.gz')):
db_outfile.endswith('.lca.json.gz')): # logic -> db.save
db_outfile += '.lca.json'
notify('saving to LCA DB: {}'.format(db_outfile))

db = lca_utils.LCA_Database()
db.ident_to_name = ident_to_name
db.ident_to_idx = ident_to_idx
db.idx_to_lid = idx_to_lid
db.lineage_to_lid = lineage_to_lid
db.lid_to_lineage = lid_to_lineage
db.hashval_to_idx = hashval_to_idx

db.ksize = int(args.ksize)
db.scaled = int(args.scaled)

db.save(db_outfile)

## done!

# output a record of stuff if requested/available:
if record_duplicates or record_no_lineage or record_remnants or unused_lineages:
if record_duplicates:
notify('WARNING: {} duplicate signatures.', len(record_duplicates))
Expand Down
Loading