From 50bc6ddc2ba681e46501d14488a09df1c6d7d3de Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 7 Jul 2022 11:43:12 -0700 Subject: [PATCH] remove LazyLoadedIndex (#2104) --- src/sourmash/index/__init__.py | 123 +-------------------------------- tests/test_index.py | 97 -------------------------- tests/test_index_protocol.py | 12 +--- 3 files changed, 2 insertions(+), 230 deletions(-) diff --git a/src/sourmash/index/__init__.py b/src/sourmash/index/__init__.py index d575132437..5ef8304fd9 100644 --- a/src/sourmash/index/__init__.py +++ b/src/sourmash/index/__init__.py @@ -31,9 +31,6 @@ StandaloneManifestIndex - load manifests directly, and do lazy loading of signatures on demand. No signatures are kept in memory. -LazyLoadedIndex - selection on manifests with loading of index on demand. -(Consider using StandaloneManifestIndex instead.) - CounterGather - an ancillary class returned by the 'counter_gather()' method. """ @@ -462,7 +459,7 @@ class LazyLinearIndex(Index): signatures are actually requested (hence, 'lazy'). * this class stores the provided index 'db' in memory. If you need a class that does lazy loading of signatures from disk and does not - store signatures in memory, see LazyLoadedIndex. + store signatures in memory, see StandaloneManifestIndex. * if you want efficient manifest-based selection, consider MultiIndex (signatures in memory). """ @@ -1058,119 +1055,6 @@ def select(self, **kwargs): prepend_location=self.prepend_location) -class LazyLoadedIndex(Index): - """Given an index location and a manifest, do select only on the manifest - until signatures are actually requested, and only then load the index. - - This class is useful when you have an index object that consume - memory when it is loaded (e.g. JSON signature files, or LCA - databases) and you want to avoid keeping them in memory. The - downside of using this class is that it will load the signatures - from disk every time they are needed (e.g. 'find(...)', 'signatures()'). - - Wrapper class; signatures dynamically loaded from disk; uses manifests. - - CTB: This may be redundant with StandaloneManifestIndex. - """ - def __init__(self, filename, manifest): - "Create an Index with given filename and manifest." - if not os.path.exists(filename): - raise ValueError(f"'{filename}' must exist when creating LazyLoadedIndex") - - if manifest is None: - raise ValueError("manifest cannot be None") - - self.filename = filename - self.manifest = manifest - - @property - def location(self): - "the 'location' attribute for this index will be the filename." - return self.filename - - def signatures(self): - "yield all signatures from the manifest." - if not len(self): - # nothing in manifest? done! - return [] - - # ok - something in manifest, let's go get those signatures! - picklist = self.manifest.to_picklist() - idx = sourmash.load_file_as_index(self.location) - - # convert remaining manifest into picklist - # CTB: one optimization down the road is, for storage-backed - # Index objects, to just reach in and get the signatures directly, - # without going through 'select'. Still, this is nice for abstraction - # because we don't need to care what the index is - it'll work on - # anything. It just might be a bit slower. - idx = idx.select(picklist=picklist) - - # extract signatures. - for ss in idx.signatures(): - yield ss - - def find(self, *args, **kwargs): - """Run find after loading and selecting; this provides 'search', - "'gather', and 'prefetch' functionality, which are built on 'find'. - """ - if not len(self): - # nothing in manifest? done! - return [] - - # ok - something in manifest, let's go get those signatures! - picklist = self.manifest.to_picklist() - idx = sourmash.load_file_as_index(self.location) - - # convert remaining manifest into picklist - # CTB: one optimization down the road is, for storage-backed - # Index objects, to just reach in and get the signatures directly, - # without going through 'select'. Still, this is nice for abstraction - # because we don't need to care what the index is - it'll work on - # anything. It just might be a bit slower. - idx = idx.select(picklist=picklist) - - for x in idx.find(*args, **kwargs): - yield x - - def __len__(self): - "track index size based on the manifest." - return len(self.manifest) - - def __bool__(self): - return bool(self.manifest) - - @classmethod - def load(cls, location): - """Load index from given location, but retain only the manifest. - - Fail if no manifest. - """ - idx = sourmash.load_file_as_index(location) - manifest = idx.manifest - - if not idx.manifest: - raise ValueError(f"no manifest on index at {location}") - - del idx - # NOTE: index is not retained outside this scope, just location. - - return cls(location, manifest) - - def insert(self, *args): - raise NotImplementedError - - def save(self, *args): - raise NotImplementedError - - def select(self, **kwargs): - "Run 'select' on manifest, return new object with new manifest." - manifest = self.manifest - new_manifest = manifest.select_to_manifest(**kwargs) - - return LazyLoadedIndex(self.filename, new_manifest) - - class StandaloneManifestIndex(Index): """Load a standalone manifest as an Index. @@ -1191,11 +1075,6 @@ class StandaloneManifestIndex(Index): StandaloneManifestIndex does _not_ store signatures in memory. - This class overlaps in concept with LazyLoadedIndex and behaves - identically when a manifest contains only rows from a single - on-disk Index object. However, unlike LazyLoadedIndex, this class - can be used to reference multiple on-disk Index objects. - This class also overlaps in concept with MultiIndex when MultiIndex.load_from_pathlist is used to load other Index objects. However, this class does not store any signatures in diff --git a/tests/test_index.py b/tests/test_index.py index 24468db9bf..bd216b8f32 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -2264,103 +2264,6 @@ def test_lazy_index_wraps_multi_index_location(): lazy2.signatures_with_location()): assert ss_tup == ss_lazy_tup -def test_lazy_loaded_index_1(runtmp): - # some basic tests for LazyLoadedIndex - lcafile = utils.get_test_data('prot/protein.lca.json.gz') - sigzip = utils.get_test_data('prot/protein.zip') - - with pytest.raises(ValueError) as exc: - db = index.LazyLoadedIndex.load(lcafile) - # no manifest on LCA database - assert "no manifest on index at" in str(exc) - - # load something, check that it's only accessed upon .signatures(...) - test_zip = runtmp.output('test.zip') - shutil.copyfile(sigzip, test_zip) - db = index.LazyLoadedIndex.load(test_zip) - assert len(db) == 2 - assert db.location == test_zip - - # now remove! - os.unlink(test_zip) - - # can still access manifest... - assert len(db) == 2 - - # ...but we should get an error when we call signatures. - with pytest.raises(ValueError): - list(db.signatures()) - - # but put it back, and all is forgiven. yay! - shutil.copyfile(sigzip, test_zip) - x = list(db.signatures()) - assert len(x) == 2 - - -def test_lazy_loaded_index_2_empty(runtmp): - # some basic tests for LazyLoadedIndex that is empty - sigzip = utils.get_test_data('prot/protein.zip') - - # load something: - test_zip = runtmp.output('test.zip') - shutil.copyfile(sigzip, test_zip) - db = index.LazyLoadedIndex.load(test_zip) - assert len(db) == 2 - assert db.location == test_zip - assert bool(db) - - # select to empty: - db = db.select(ksize=50) - - assert len(db) == 0 - assert not bool(db) - - x = list(db.signatures()) - assert len(x) == 0 - - -def test_lazy_loaded_index_3_find(runtmp): - # test 'find' - query_file = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigzip = utils.get_test_data('prot/protein.zip') - - # load something: - test_zip = runtmp.output('test.zip') - shutil.copyfile(sigzip, test_zip) - db = index.LazyLoadedIndex.load(test_zip) - - # can we find matches? should find two. - query = sourmash.load_one_signature(query_file) - assert query.minhash.ksize == 19 - x = db.search(query, threshold=0.0) - x = list(x) - assert len(x) == 2 - - # no matches! - db = db.select(ksize=20) - x = db.search(query, threshold=0.0) - x = list(x) - assert len(x) == 0 - - -def test_lazy_loaded_index_4_nofile(runtmp): - # test check for filename must exist - with pytest.raises(ValueError) as exc: - index.LazyLoadedIndex(runtmp.output('xyz'), True) - - assert "must exist when creating" in str(exc) - - -def test_lazy_loaded_index_4_nomanifest(runtmp): - # test check for empty manifest - sig2 = utils.get_test_data("2.fa.sig") - - with pytest.raises(ValueError) as exc: - index.LazyLoadedIndex(sig2, None) - - assert "manifest cannot be None" in str(exc) - - def test_revindex_index_search(): # confirm that RevIndex works sig2 = utils.get_test_data("2.fa.sig") diff --git a/tests/test_index_protocol.py b/tests/test_index_protocol.py index 22498fcd04..15fd70aad0 100644 --- a/tests/test_index_protocol.py +++ b/tests/test_index_protocol.py @@ -9,7 +9,7 @@ from sourmash import SourmashSignature from sourmash.index import (LinearIndex, ZipFileLinearIndex, LazyLinearIndex, MultiIndex, - StandaloneManifestIndex, LazyLoadedIndex) + StandaloneManifestIndex) from sourmash.index.sqlite_index import SqliteIndex from sourmash.index.revindex import RevIndex from sourmash.sbt import SBT, GraphFactory @@ -147,15 +147,6 @@ def build_sqlite_index(runtmp): return db -def build_lazy_loaded_index(runtmp): - db = build_lca_index(runtmp) - outfile = runtmp.output('db.lca.json') - db.save(outfile) - - mf = CollectionManifest.create_manifest(db._signatures_with_internal()) - return LazyLoadedIndex(outfile, mf) - - def build_revindex(runtmp): ss2, ss47, ss63 = _load_three_sigs() @@ -194,7 +185,6 @@ def build_lca_index_save_load_sql(runtmp): build_lca_index_save_load, build_sqlite_index, build_lca_index_save_load_sql, - build_lazy_loaded_index, # build_revindex, ] )