Skip to content

Commit

Permalink
Merge branch 'latest' into dk/allow_empty_picklist
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb authored Jul 7, 2022
2 parents 632d6fd + 50bc6dd commit 685c1fd
Show file tree
Hide file tree
Showing 3 changed files with 2 additions and 230 deletions.
123 changes: 1 addition & 122 deletions src/sourmash/index/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,6 @@
StandaloneManifestIndex - load manifests directly, and do lazy loading of
signatures on demand. No signatures are kept in memory.
LazyLoadedIndex - selection on manifests with loading of index on demand.
(Consider using StandaloneManifestIndex instead.)
CounterGather - an ancillary class returned by the 'counter_gather()' method.
"""

Expand Down Expand Up @@ -462,7 +459,7 @@ class LazyLinearIndex(Index):
signatures are actually requested (hence, 'lazy').
* this class stores the provided index 'db' in memory. If you need
a class that does lazy loading of signatures from disk and does not
store signatures in memory, see LazyLoadedIndex.
store signatures in memory, see StandaloneManifestIndex.
* if you want efficient manifest-based selection, consider
MultiIndex (signatures in memory).
"""
Expand Down Expand Up @@ -1058,119 +1055,6 @@ def select(self, **kwargs):
prepend_location=self.prepend_location)


class LazyLoadedIndex(Index):
"""Given an index location and a manifest, do select only on the manifest
until signatures are actually requested, and only then load the index.
This class is useful when you have an index object that consume
memory when it is loaded (e.g. JSON signature files, or LCA
databases) and you want to avoid keeping them in memory. The
downside of using this class is that it will load the signatures
from disk every time they are needed (e.g. 'find(...)', 'signatures()').
Wrapper class; signatures dynamically loaded from disk; uses manifests.
CTB: This may be redundant with StandaloneManifestIndex.
"""
def __init__(self, filename, manifest):
"Create an Index with given filename and manifest."
if not os.path.exists(filename):
raise ValueError(f"'{filename}' must exist when creating LazyLoadedIndex")

if manifest is None:
raise ValueError("manifest cannot be None")

self.filename = filename
self.manifest = manifest

@property
def location(self):
"the 'location' attribute for this index will be the filename."
return self.filename

def signatures(self):
"yield all signatures from the manifest."
if not len(self):
# nothing in manifest? done!
return []

# ok - something in manifest, let's go get those signatures!
picklist = self.manifest.to_picklist()
idx = sourmash.load_file_as_index(self.location)

# convert remaining manifest into picklist
# CTB: one optimization down the road is, for storage-backed
# Index objects, to just reach in and get the signatures directly,
# without going through 'select'. Still, this is nice for abstraction
# because we don't need to care what the index is - it'll work on
# anything. It just might be a bit slower.
idx = idx.select(picklist=picklist)

# extract signatures.
for ss in idx.signatures():
yield ss

def find(self, *args, **kwargs):
"""Run find after loading and selecting; this provides 'search',
"'gather', and 'prefetch' functionality, which are built on 'find'.
"""
if not len(self):
# nothing in manifest? done!
return []

# ok - something in manifest, let's go get those signatures!
picklist = self.manifest.to_picklist()
idx = sourmash.load_file_as_index(self.location)

# convert remaining manifest into picklist
# CTB: one optimization down the road is, for storage-backed
# Index objects, to just reach in and get the signatures directly,
# without going through 'select'. Still, this is nice for abstraction
# because we don't need to care what the index is - it'll work on
# anything. It just might be a bit slower.
idx = idx.select(picklist=picklist)

for x in idx.find(*args, **kwargs):
yield x

def __len__(self):
"track index size based on the manifest."
return len(self.manifest)

def __bool__(self):
return bool(self.manifest)

@classmethod
def load(cls, location):
"""Load index from given location, but retain only the manifest.
Fail if no manifest.
"""
idx = sourmash.load_file_as_index(location)
manifest = idx.manifest

if not idx.manifest:
raise ValueError(f"no manifest on index at {location}")

del idx
# NOTE: index is not retained outside this scope, just location.

return cls(location, manifest)

def insert(self, *args):
raise NotImplementedError

def save(self, *args):
raise NotImplementedError

def select(self, **kwargs):
"Run 'select' on manifest, return new object with new manifest."
manifest = self.manifest
new_manifest = manifest.select_to_manifest(**kwargs)

return LazyLoadedIndex(self.filename, new_manifest)


class StandaloneManifestIndex(Index):
"""Load a standalone manifest as an Index.
Expand All @@ -1191,11 +1075,6 @@ class StandaloneManifestIndex(Index):
StandaloneManifestIndex does _not_ store signatures in memory.
This class overlaps in concept with LazyLoadedIndex and behaves
identically when a manifest contains only rows from a single
on-disk Index object. However, unlike LazyLoadedIndex, this class
can be used to reference multiple on-disk Index objects.
This class also overlaps in concept with MultiIndex when
MultiIndex.load_from_pathlist is used to load other Index
objects. However, this class does not store any signatures in
Expand Down
97 changes: 0 additions & 97 deletions tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2264,103 +2264,6 @@ def test_lazy_index_wraps_multi_index_location():
lazy2.signatures_with_location()):
assert ss_tup == ss_lazy_tup

def test_lazy_loaded_index_1(runtmp):
# some basic tests for LazyLoadedIndex
lcafile = utils.get_test_data('prot/protein.lca.json.gz')
sigzip = utils.get_test_data('prot/protein.zip')

with pytest.raises(ValueError) as exc:
db = index.LazyLoadedIndex.load(lcafile)
# no manifest on LCA database
assert "no manifest on index at" in str(exc)

# load something, check that it's only accessed upon .signatures(...)
test_zip = runtmp.output('test.zip')
shutil.copyfile(sigzip, test_zip)
db = index.LazyLoadedIndex.load(test_zip)
assert len(db) == 2
assert db.location == test_zip

# now remove!
os.unlink(test_zip)

# can still access manifest...
assert len(db) == 2

# ...but we should get an error when we call signatures.
with pytest.raises(ValueError):
list(db.signatures())

# but put it back, and all is forgiven. yay!
shutil.copyfile(sigzip, test_zip)
x = list(db.signatures())
assert len(x) == 2


def test_lazy_loaded_index_2_empty(runtmp):
# some basic tests for LazyLoadedIndex that is empty
sigzip = utils.get_test_data('prot/protein.zip')

# load something:
test_zip = runtmp.output('test.zip')
shutil.copyfile(sigzip, test_zip)
db = index.LazyLoadedIndex.load(test_zip)
assert len(db) == 2
assert db.location == test_zip
assert bool(db)

# select to empty:
db = db.select(ksize=50)

assert len(db) == 0
assert not bool(db)

x = list(db.signatures())
assert len(x) == 0


def test_lazy_loaded_index_3_find(runtmp):
# test 'find'
query_file = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
sigzip = utils.get_test_data('prot/protein.zip')

# load something:
test_zip = runtmp.output('test.zip')
shutil.copyfile(sigzip, test_zip)
db = index.LazyLoadedIndex.load(test_zip)

# can we find matches? should find two.
query = sourmash.load_one_signature(query_file)
assert query.minhash.ksize == 19
x = db.search(query, threshold=0.0)
x = list(x)
assert len(x) == 2

# no matches!
db = db.select(ksize=20)
x = db.search(query, threshold=0.0)
x = list(x)
assert len(x) == 0


def test_lazy_loaded_index_4_nofile(runtmp):
# test check for filename must exist
with pytest.raises(ValueError) as exc:
index.LazyLoadedIndex(runtmp.output('xyz'), True)

assert "must exist when creating" in str(exc)


def test_lazy_loaded_index_4_nomanifest(runtmp):
# test check for empty manifest
sig2 = utils.get_test_data("2.fa.sig")

with pytest.raises(ValueError) as exc:
index.LazyLoadedIndex(sig2, None)

assert "manifest cannot be None" in str(exc)


def test_revindex_index_search():
# confirm that RevIndex works
sig2 = utils.get_test_data("2.fa.sig")
Expand Down
12 changes: 1 addition & 11 deletions tests/test_index_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from sourmash import SourmashSignature
from sourmash.index import (LinearIndex, ZipFileLinearIndex,
LazyLinearIndex, MultiIndex,
StandaloneManifestIndex, LazyLoadedIndex)
StandaloneManifestIndex)
from sourmash.index.sqlite_index import SqliteIndex
from sourmash.index.revindex import RevIndex
from sourmash.sbt import SBT, GraphFactory
Expand Down Expand Up @@ -147,15 +147,6 @@ def build_sqlite_index(runtmp):
return db


def build_lazy_loaded_index(runtmp):
db = build_lca_index(runtmp)
outfile = runtmp.output('db.lca.json')
db.save(outfile)

mf = CollectionManifest.create_manifest(db._signatures_with_internal())
return LazyLoadedIndex(outfile, mf)


def build_revindex(runtmp):
ss2, ss47, ss63 = _load_three_sigs()

Expand Down Expand Up @@ -194,7 +185,6 @@ def build_lca_index_save_load_sql(runtmp):
build_lca_index_save_load,
build_sqlite_index,
build_lca_index_save_load_sql,
build_lazy_loaded_index,
# build_revindex,
]
)
Expand Down

0 comments on commit 685c1fd

Please sign in to comment.