diff --git a/pyproject.toml b/pyproject.toml index 98690e7c..ec93b7b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,8 @@ dependencies = [ "importlib_resources", "jsonschema", "pygit2", - "outpack-query-parser" + "outpack-query-parser", + "humanize" ] [project.urls] @@ -44,6 +45,7 @@ path = "src/outpack/__about__.py" dependencies = [ "coverage[toml]>=6.5", "pytest", + "pytest_mock", "sphinx", "sphinx-rtd-theme", "myst-parser", diff --git a/src/outpack/filestore.py b/src/outpack/filestore.py index 2ad1e176..1bdcb3d5 100644 --- a/src/outpack/filestore.py +++ b/src/outpack/filestore.py @@ -2,40 +2,47 @@ import os.path import shutil import stat +import tempfile +from contextlib import contextmanager +from pathlib import Path from outpack.hash import Hash, hash_parse, hash_validate_file class FileStore: def __init__(self, path): - self._path = path + self._path = Path(path) os.makedirs(path, exist_ok=True) def filename(self, hash): dat = hash_parse(hash) - return os.path.join( - self._path, dat.algorithm, dat.value[:2], dat.value[2:] - ) + return self._path / dat.algorithm / dat.value[:2] / dat.value[2:] - def get(self, hash, dst): + def get(self, hash, dst, *, overwrite=False): src = self.filename(hash) if not os.path.exists(src): msg = f"Hash '{hash}' not found in store" raise Exception(msg) os.makedirs(os.path.dirname(dst), exist_ok=True) - # todo - control over overwrite args needed. + if not overwrite and os.path.exists(dst): + msg = f"Failed to copy '{src}' to '{dst}', file already exists" + raise Exception(msg) shutil.copyfile(src, dst) def exists(self, hash): return os.path.exists(self.filename(hash)) - def put(self, src, hash): + def put(self, src, hash, *, move=False): hash_validate_file(src, hash) dst = self.filename(hash) if not os.path.exists(dst): os.makedirs(os.path.dirname(dst), exist_ok=True) - shutil.copyfile(src, dst) - os.chmod(dst, stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH) + if move: + shutil.move(src, dst) + else: + shutil.copyfile(src, dst) + # Make file readonly for everyone + dst.chmod(0o444) return hash def ls(self): @@ -43,9 +50,51 @@ def ls(self): # (os.walk, Path.glob etc), but this is probably clearest. ret = [] for algorithm in os.listdir(self._path): - path_alg = os.path.join(self._path, algorithm) + path_alg = self._path / algorithm for prefix in os.listdir(path_alg): path_prefix = os.path.join(path_alg, prefix) for suffix in os.listdir(path_prefix): ret.append(Hash(algorithm, prefix + suffix)) return ret + + def destroy(self) -> None: + def onerror(func, path, _exc_info): + """ + Error handler for ``shutil.rmtree``. + + If the error is due to an access error (read only file) + it attempts to add write permission and then retries. + + If the error is for another reason it re-raises the error. + We manually remove write permission in ``put`` above so this + is expected. 
+ + Note we only need this on windows, on Linux shutils.rmtree will + successfully remove the dir and its contents without having + to add write permission to individual files + + Usage : ``shutil.rmtree(path, onerror=onerror)`` + """ + if not os.access(path, os.W_OK): + os.chmod(path, stat.S_IWUSR) + func(path) + else: + raise + + shutil.rmtree(self._path, onerror=onerror) + + @contextmanager + def tmp(self): + # On a newer version of tempfile we could use `delete_on_close = False` + # see + # https://github.com/mrc-ide/outpack-py/pull/33#discussion_r1500522877 + path = self._path / "tmp" + path.mkdir(exist_ok=True) + f = tempfile.NamedTemporaryFile(dir=path, delete=False) + try: + yield f.name + finally: + try: + os.unlink(f.name) + except OSError: + pass diff --git a/src/outpack/index.py b/src/outpack/index.py index d84b9e6b..cd4649f9 100644 --- a/src/outpack/index.py +++ b/src/outpack/index.py @@ -1,14 +1,19 @@ import pathlib from dataclasses import dataclass -from typing import List +from typing import Dict, List -from outpack.metadata import read_metadata_core, read_packet_location +from outpack.metadata import ( + MetadataCore, + PacketLocation, + read_metadata_core, + read_packet_location, +) @dataclass class IndexData: - metadata: dict - location: dict + metadata: Dict[str, MetadataCore] + location: Dict[str, Dict[str, PacketLocation]] unpacked: List[str] @staticmethod @@ -29,21 +34,28 @@ def refresh(self): self.data = _index_update(self._path, self.data) return self - def all_metadata(self): + def all_metadata(self) -> Dict[str, MetadataCore]: return self.refresh().data.metadata - def metadata(self, id): + def metadata(self, id) -> MetadataCore: if id in self.data.metadata: return self.data.metadata[id] return self.refresh().data.metadata[id] - def all_locations(self): + def all_locations(self) -> Dict[str, Dict[str, PacketLocation]]: return self.refresh().data.location - def location(self, name): + def location(self, name) -> Dict[str, PacketLocation]: return self.refresh().data.location[name] - def unpacked(self): + def packets_in_location(self, name) -> List[str]: + try: + packets = list(self.location(name).keys()) + except KeyError: + packets = [] + return packets + + def unpacked(self) -> List[str]: return self.refresh().data.unpacked @@ -62,7 +74,7 @@ def _read_metadata(path_root, data): return data -def _read_locations(path_root, data): +def _read_locations(path_root, data) -> Dict[str, Dict[str, PacketLocation]]: path = path_root / ".outpack" / "location" for loc in path.iterdir(): if loc.name not in data: diff --git a/src/outpack/location.py b/src/outpack/location.py index cd4cdcaf..365f95b8 100644 --- a/src/outpack/location.py +++ b/src/outpack/location.py @@ -1,13 +1,8 @@ import collections -import os import shutil -from typing import List from outpack.config import Location, update_config -from outpack.hash import hash_validate_string from outpack.location_path import OutpackLocationPath -from outpack.metadata import PacketLocation -from outpack.packet import mark_known from outpack.root import root_open from outpack.static import ( LOCATION_LOCAL, @@ -96,7 +91,7 @@ def location_resolve_valid( isinstance(item, str) for item in location ): unknown = set(location).difference(outpack_location_list(root)) - if len(unknown) > 0: + if unknown: unknown_text = "', '".join(unknown) msg = f"Unknown location: '{unknown_text}'" raise Exception(msg) @@ -119,111 +114,6 @@ def location_resolve_valid( return location -def outpack_location_pull_metadata(location=None, 
root=None, *, locate=True): - root = root_open(root, locate=locate) - location_name = location_resolve_valid( - location, - root, - include_local=False, - include_orphan=False, - allow_no_locations=True, - ) - for name in location_name: - driver = _location_driver(name, root) - _pull_all_metadata(driver, root, name) - known_packets = [] - for packet_location in root.index.all_locations().values(): - known_packets.extend(list(packet_location.values())) - _validate_hashes(driver, name, known_packets) - _mark_all_known(driver, root, name) - - # TODO: mrc-4601 deorphan recovered packets - - -def _pull_all_metadata(driver, root, location_name): - known_there = driver.list() - known_here = root.index.all_metadata().keys() - for packet_id in known_there: - if packet_id not in known_here: - _pull_packet_metadata(driver, root, location_name, packet_id) - - -def _get_remove_location_hint(location_name): - return ( - f'Probably all you can do at this point is ' - f'remove this location from your configuration ' - f'by running ' - f'orderly_location_remove("{location_name}")' - ) - - -def _pull_packet_metadata(driver, root, location_name, packet_id): - metadata = driver.metadata(packet_id)[packet_id] - expected_hash = driver.list()[packet_id].hash - - hash_validate_string( - metadata, - expected_hash, - f"metadata for '{packet_id}' from '{location_name}'", - [ - "This is bad news, I'm afraid. Your location is sending data " - "that does not match the hash it says it does. Please let us " - "know how this might have happened.", - _get_remove_location_hint(location_name), - ], - ) - - path_metadata = root.path / ".outpack" / "metadata" - os.makedirs(path_metadata, exist_ok=True) - filename = path_metadata / packet_id - with open(filename, "w") as f: - f.writelines(metadata) - - -def _validate_hashes(driver, location_name, packets: List[PacketLocation]): - mismatched_hashes = set() - known_there = driver.list() - for packet in packets: - if known_there.get(packet.packet) is not None: - hash_there = known_there[packet.packet].hash - hash_here = packet.hash - if hash_there != hash_here: - mismatched_hashes.add(packet.packet) - - if len(mismatched_hashes) > 0: - id_text = "', '".join(mismatched_hashes) - msg = ( - f"Location '{location_name}' has conflicting metadata\n" - f"This is really bad news. We have been offered metadata " - f"from '{location_name}' that has a different hash to " - f"metadata that we have already imported from other " - f"locations. I'm not going to import this new metadata, " - f"but there's no guarantee that the older metadata is " - f"actually what you want!\nConflicts for: '{id_text}'\n" - f"We would be interested in this case, please let us know\n" - f"{_get_remove_location_hint(location_name)}" - ) - raise Exception(msg) - - -def _mark_all_known(driver, root, location_name): - try: - known_here = root.index.location(location_name) - except KeyError: - known_here = {} - - known_there = driver.list() - for packet_id in known_there: - if packet_id not in known_here.keys(): - mark_known( - root, - packet_id, - location_name, - known_there[packet_id].hash, - known_there[packet_id].time, - ) - - def _location_check_new_name(root, name): if _location_exists(root, name): msg = f"A location with name '{name}' already exists" @@ -240,6 +130,13 @@ def _location_exists(root, name): return name in outpack_location_list(root) +# TODO: Create a driver interface type +# atm we can't specify a type for driver return +# in this function. 
We want to return either an +# OutpackLocationPath driver or an http driver +# or other types down the line. We could set union type but +# would be nicer to use an interface-like pattern +# see mrc-5043 def _location_driver(location_name, root): location = root.config.location[location_name] if location.type == "path": diff --git a/src/outpack/location_path.py b/src/outpack/location_path.py index 48dfa7ab..288caae2 100644 --- a/src/outpack/location_path.py +++ b/src/outpack/location_path.py @@ -19,7 +19,7 @@ def metadata(self, packet_ids): all_ids = self.__root.index.location(LOCATION_LOCAL).keys() missing_ids = set(packet_ids).difference(all_ids) - if len(missing_ids) > 0: + if missing_ids: missing_msg = "', '".join(missing_ids) msg = f"Some packet ids not found: '{missing_msg}'" raise Exception(msg) @@ -32,7 +32,6 @@ def metadata(self, packet_ids): def fetch_file(self, hash, dest): if self.__root.config.core.use_file_store: path = self.__root.files.filename(hash) - print(path) if not os.path.exists(path): msg = f"Hash '{hash}' not found at location" raise Exception(msg) diff --git a/src/outpack/location_pull.py b/src/outpack/location_pull.py new file mode 100644 index 00000000..36d6e49b --- /dev/null +++ b/src/outpack/location_pull.py @@ -0,0 +1,467 @@ +import itertools +import os +import time +from contextlib import contextmanager +from dataclasses import dataclass +from typing import Dict, Generator, List, Optional, Set, Union + +import humanize + +from outpack.filestore import FileStore +from outpack.hash import hash_validate_string +from outpack.location import _location_driver, location_resolve_valid +from outpack.metadata import ( + MetadataCore, + PacketFileWithLocation, + PacketLocation, +) +from outpack.packet import mark_known +from outpack.root import OutpackRoot, find_file_by_hash, root_open +from outpack.search_options import SearchOptions +from outpack.static import LOCATION_LOCAL +from outpack.util import format_list, partition, pl + + +def outpack_location_pull_metadata(location=None, root=None, *, locate=True): + root = root_open(root, locate=locate) + location_name = location_resolve_valid( + location, + root, + include_local=False, + include_orphan=False, + allow_no_locations=True, + ) + for name in location_name: + driver = _location_driver(name, root) + _pull_all_metadata(driver, root, name) + known_packets = [] + for packet_location in root.index.all_locations().values(): + known_packets.extend(list(packet_location.values())) + _validate_hashes(driver, name, known_packets) + _mark_all_known(driver, root, name) + + # TODO: mrc-4601 deorphan recovered packets + + +def _pull_packet_metadata(driver, root, location_name, packet_id): + metadata = driver.metadata(packet_id)[packet_id] + expected_hash = driver.list()[packet_id].hash + + hash_validate_string( + metadata, + expected_hash, + f"metadata for '{packet_id}' from '{location_name}'", + [ + "This is bad news, I'm afraid. Your location is sending data " + "that does not match the hash it says it does. 
Please let us " + "know how this might have happened.", + _get_remove_location_hint(location_name), + ], + ) + + path_metadata = root.path / ".outpack" / "metadata" + os.makedirs(path_metadata, exist_ok=True) + filename = path_metadata / packet_id + with open(filename, "w") as f: + f.writelines(metadata) + + +def _get_remove_location_hint(location_name): + return ( + f'Probably all you can do at this point is ' + f'remove this location from your configuration ' + f'by running ' + f'outpack_location_remove("{location_name}")' + ) + + +def _validate_hashes(driver, location_name, packets: List[PacketLocation]): + mismatched_hashes = set() + known_there = driver.list() + for packet in packets: + if known_there.get(packet.packet) is not None: + hash_there = known_there[packet.packet].hash + hash_here = packet.hash + if hash_there != hash_here: + mismatched_hashes.add(packet.packet) + + if mismatched_hashes: + id_text = "', '".join(mismatched_hashes) + msg = ( + f"Location '{location_name}' has conflicting metadata\n" + f"This is really bad news. We have been offered metadata " + f"from '{location_name}' that has a different hash to " + f"metadata that we have already imported from other " + f"locations. I'm not going to import this new metadata, " + f"but there's no guarantee that the older metadata is " + f"actually what you want!\nConflicts for: '{id_text}'\n" + f"We would be interested in this case, please let us know\n" + f"{_get_remove_location_hint(location_name)}" + ) + raise Exception(msg) + + +def _mark_all_known(driver, root, location_name): + try: + known_here = root.index.location(location_name) + except KeyError: + known_here = {} + + known_there = driver.list() + for packet_id in known_there: + if packet_id not in known_here.keys(): + mark_known( + root, + packet_id, + location_name, + known_there[packet_id].hash, + known_there[packet_id].time, + ) + + +def outpack_location_pull_packet( + ids: Union[str, List[str]], + *, + options=None, + recursive=None, + root=None, + locate=True, +): + root = root_open(root, locate=locate) + options = SearchOptions(options) + if isinstance(ids, str): + ids = [ids] + plan = _location_build_pull_plan(ids, options.location, recursive, root) + + ## Warn people of extra pulls and skips + if plan.info.n_extra > 0: + print( + f"Also pulling {plan.info.n_extra} " + f"{pl(plan.info.n_extra, 'packet')}, which are " + f"dependencies of those requested" + ) + if plan.info.n_skip > 0: + print( + f"Skipping {plan.info.n_skip} of {plan.info.n_total} " + f"{pl(plan.info.n_total, 'packet')} which are already " + f"unpacked" + ) + + with _location_pull_files(plan.files, root) as store: + use_archive = root.config.core.path_archive is not None + n_packets = len(plan.packets) + time_start = time.time() + for idx, packet in enumerate(plan.packets.values()): + if use_archive: + print( + f"Writing files for '{packet.packet}' (packet {idx + 1}/" + f"{n_packets})" + ) + _location_pull_files_archive(packet.packet, store, root) + + mark_known( + root, packet.packet, LOCATION_LOCAL, packet.hash, time.time() + ) + + print( + f"Unpacked {n_packets} {pl(n_packets, 'packet')} in " + f"{humanize.time.precisedelta(int(time.time() - time_start))}." 
+ ) + + return list(plan.packets.keys()) + + +# This approach may be suboptimal in the case where the user does not +# already have a file store, as it means that files will be copied +# around and hashed more than ideal: +# +# * hash the candidate file +# * rehash on entry into the file store +# * copy into the file store +# * copy from the file store into the final location +# +# So in the case where a hash is only present once in a chain of +# packets being pulled this will be one too many hashes and one too +# many copies. +# +# However, this approach makes the logic fairly easy to deal with, +# and copes well with data races and corruption of data on disk +# (e.g., users having edited files that we rely on, or editing them +# after we hash them the first time). +@contextmanager +def _location_pull_files( + files: List[PacketFileWithLocation], root: OutpackRoot +) -> Generator[FileStore, None, None]: + store = root.files + cleanup_store = False + if store is not None: + exists, missing = partition(lambda file: store.exists(file.hash), files) + + if exists: + print( + f"Found {len(exists)} {pl(exists, 'file')} in the " + f"file store" + ) + else: + print("Looking for suitable files already on disk") + store = _temporary_filestore(root) + cleanup_store = True + + missing = [] + no_found = 0 + for file in files: + path = find_file_by_hash(root, file.hash) + if path is not None: + store.put(path, file.hash) + no_found += 1 + else: + missing.append(file) + + print(f"Found {no_found} {pl(no_found, 'file')} on disk ") + + if len(missing) == 0: + print("All files available locally, no need to fetch any") + else: + locations = {file.location for file in missing} + total_size = humanize.naturalsize(sum(file.size for file in missing)) + print( + f"Need to fetch {len(missing)} {pl(missing, 'file')} " + f"({total_size}) from {len(locations)} " + f"{pl(locations, 'location')}" + ) + for location in locations: + from_this_location = [ + file for file in missing if file.location == location + ] + _location_pull_hash_store( + from_this_location, + location, + _location_driver(location, root), + store, + ) + + try: + yield store + finally: + if cleanup_store: + store.destroy() + + +def _location_pull_hash_store( + files: List[PacketFileWithLocation], + location_name: str, + driver, + store: FileStore, +): + no_of_files = len(files) + # TODO: show a nice progress bar for users + for idx, file in enumerate(files): + print( + f"Fetching file {idx + 1}/{no_of_files} " + f"({humanize.naturalsize(file.size)}) from '{location_name}'" + ) + with store.tmp() as path: + tmp = driver.fetch_file(file.hash, path) + store.put(tmp, file.hash) + + +def _location_pull_files_archive(packet_id: str, store, root: OutpackRoot): + meta = root.index.metadata(packet_id) + dest_root = ( + root.path / root.config.core.path_archive / meta.name / packet_id + ) + for file in meta.files: + store.get(file.hash, dest_root / file.path, overwrite=True) + + +def _pull_all_metadata(driver, root, location_name): + known_there = driver.list() + known_here = root.index.all_metadata().keys() + for packet_id in known_there: + if packet_id not in known_here: + _pull_packet_metadata(driver, root, location_name, packet_id) + + +@dataclass +class PullPlanInfo: + n_extra: int + n_skip: int + n_total: int + + +@dataclass +class LocationPullPlan: + packets: Dict[str, PacketLocation] + files: List[PacketFileWithLocation] + info: PullPlanInfo + + +@dataclass +class PullPlanPackets: + requested: List[str] + full: List[str] + skip: Set[str] + fetch: 
Set[str] + + +def _location_build_pull_plan( + packet_ids: List[str], + locations: Optional[List[str]], + recursive: Optional[bool], + root: OutpackRoot, +) -> LocationPullPlan: + packets = _location_build_pull_plan_packets( + packet_ids, root, recursive=recursive + ) + locations = _location_build_pull_plan_location(packets, locations, root) + files = _location_build_pull_plan_files(packets.fetch, locations, root) + fetch = _location_build_packet_locations(packets.fetch, locations, root) + + info = PullPlanInfo( + n_extra=len(packets.full) - len(packets.requested), + n_skip=len(packets.skip), + n_total=len(packets.full), + ) + + return LocationPullPlan(packets=fetch, files=files, info=info) + + +def _location_build_pull_plan_packets( + packet_ids: List[str], root: OutpackRoot, *, recursive: Optional[bool] +) -> PullPlanPackets: + requested = packet_ids + if recursive is None: + recursive = root.config.core.require_complete_tree + if root.config.core.require_complete_tree and not recursive: + msg = """'recursive' must be True (or None) with your configuration +Because 'core.require_complete_tree' is true, we can't do a \ +non-recursive pull, as this might leave an incomplete tree""" + raise Exception(msg) + + index = root.index + if recursive: + full = _find_all_dependencies(packet_ids, index.all_metadata()) + else: + full = packet_ids + + skip = set(full).intersection(index.unpacked()) + fetch = set(full).difference(skip) + + return PullPlanPackets( + requested=requested, full=full, skip=skip, fetch=fetch + ) + + +def _find_all_dependencies( + packet_ids: List[str], metadata: Dict[str, MetadataCore] +) -> List[str]: + ret = set(packet_ids) + packets = set(packet_ids) + while packets: + dependency_ids = { + dependencies.packet + for packet_id in packets + if packet_id in metadata.keys() + for dependencies in ( + [] + if metadata.get(packet_id) is None + else metadata[packet_id].depends + ) + } + packets = dependency_ids.difference(ret) + ret = packets.union(ret) + + return sorted(ret) + + +def _location_build_pull_plan_location( + packets: PullPlanPackets, locations: Optional[List[str]], root: OutpackRoot +) -> List[str]: + location_names = location_resolve_valid( + locations, + root, + include_local=False, + include_orphan=False, + allow_no_locations=len(packets.fetch) == 0, + ) + + known_packets = [ + root.index.packets_in_location(location_name) + for location_name in location_names + ] + missing = packets.fetch.difference(itertools.chain(*known_packets)) + if missing: + extra = missing.difference(packets.requested) + if extra: + hint = ( + f"{len(extra)} missing " + f"{pl(extra, 'packet was', 'packets were')} " + f"requested as " + f"{pl(extra, 'dependency', 'dependencies')} " + f"of the {pl(extra, 'one')} you asked for: " + f"{format_list(extra)}" + ) + else: + # In the case where the above is used, we probably have + # up-to-date metadata, so we don't display this. + hint = "Do you need to run 'outpack_location_pull_metadata()'?" 
+ + msg = ( + f"Failed to find {pl(missing, 'packet')} " + f"{format_list(missing)}\n" + f"Looked in {pl(location_names, 'location')} " + f"{format_list(location_names)}\n" + hint + ) + raise Exception(msg) + + return location_names + + +def _location_build_pull_plan_files( + packet_ids: Set[str], locations: List[str], root: OutpackRoot +) -> List[PacketFileWithLocation]: + metadata = root.index.all_metadata() + file_hashes = { + file.hash + for packet_id in packet_ids + for file in metadata[packet_id].files + } + n_files = len(file_hashes) + + if n_files == 0: + return [] + + # Find first location within the set which contains each packet + # We've already checked earlier that the file is in at least 1 + # location so we don't have to worry about that here + all_files = [] + seen_hashes = set() + for location_name in locations: + location_packets = set(root.index.packets_in_location(location_name)) + packets_in_location = location_packets.intersection(packet_ids) + for packet_id in packets_in_location: + for file in metadata[packet_id].files: + file_with_location = PacketFileWithLocation.from_packet_file( + file, location_name + ) + if file.hash not in seen_hashes: + seen_hashes.add(file_with_location.hash) + all_files.append(file_with_location) + + return all_files + + +def _location_build_packet_locations( + packets: Set[str], locations: List[str], root: OutpackRoot +) -> Dict[str, PacketLocation]: + packets_fetch = {} + for location in locations: + packets_from_location = root.index.location(location) + packets_in_this_location = packets & packets_from_location.keys() + for packet_id in packets_in_this_location: + packets_fetch[packet_id] = packets_from_location[packet_id] + return packets_fetch + + +def _temporary_filestore(root: OutpackRoot) -> FileStore: + return FileStore(root.path / "orderly" / "pull") diff --git a/src/outpack/metadata.py b/src/outpack/metadata.py index 26046a5c..8411dbdb 100644 --- a/src/outpack/metadata.py +++ b/src/outpack/metadata.py @@ -23,6 +23,18 @@ def from_file(directory, path, hash_algorithm): return PacketFile(path, s, h) +@dataclass +class PacketFileWithLocation: + path: str + size: float + hash: str + location: str + + @staticmethod + def from_packet_file(file: PacketFile, location: str): + return PacketFileWithLocation(file.path, file.size, file.hash, location) + + @dataclass_json() @dataclass class PacketDependsPath: @@ -74,11 +86,11 @@ class PacketLocation: hash: str -def read_metadata_core(path): +def read_metadata_core(path) -> MetadataCore: with open(path) as f: - return MetadataCore.from_json(f.read().strip()) + return MetadataCore.from_json(f.read().strip()) # type: ignore -def read_packet_location(path): +def read_packet_location(path) -> PacketLocation: with open(path) as f: - return PacketLocation.from_json(f.read().strip()) + return PacketLocation.from_json(f.read().strip()) # type: ignore diff --git a/src/outpack/root.py b/src/outpack/root.py index 672af845..cdb10b65 100644 --- a/src/outpack/root.py +++ b/src/outpack/root.py @@ -1,7 +1,7 @@ import os import shutil from pathlib import Path -from typing import Union +from typing import Optional, Union from outpack.config import read_config from outpack.filestore import FileStore @@ -11,7 +11,7 @@ class OutpackRoot: - files = None + files: Optional[FileStore] = None def __init__(self, path): self.path = Path(path) @@ -26,7 +26,7 @@ def export_file(self, id, there, here, dest): dest = Path(dest) here_full = dest / here if self.config.core.use_file_store: - self.files.get(hash, here_full) + 
self.files.get(hash, here_full, overwrite=False) else: # consider starting from the case most likely to contain # this hash, since we already know that it's 'id' unless @@ -78,8 +78,8 @@ def find_file_by_hash(root, hash): return path else: msg = ( - f"Rejecting file from archive '{f.path}'" - f"in {meta.name}/{meta.id}" + f"Rejecting file from archive '{f.path}' " + f"in '{meta.name}/{meta.id}'" ) print(msg) return None diff --git a/src/outpack/search.py b/src/outpack/search.py index 55646a98..aa455801 100644 --- a/src/outpack/search.py +++ b/src/outpack/search.py @@ -4,6 +4,7 @@ import outpack_query_parser as parser +from outpack.ids import is_outpack_id from outpack.metadata import MetadataCore, Parameters from outpack.root import OutpackRoot, root_open from outpack.search_options import SearchOptions @@ -70,6 +71,8 @@ def as_query(query: Union[Query, str]) -> Query: if isinstance(query, Query): return query else: + if is_outpack_id(query): + query = f"'{query}'" return Query.parse(query) diff --git a/src/outpack/util.py b/src/outpack/util.py index 697cc393..e319ad47 100644 --- a/src/outpack/util.py +++ b/src/outpack/util.py @@ -3,6 +3,7 @@ import runpy import time from contextlib import contextmanager +from itertools import filterfalse, tee from pathlib import Path @@ -106,3 +107,30 @@ def read_string(path): with open(path) as f: lines = f.read().rstrip() return lines + + +def format_list(x): + return ", ".join("'" + item + "'" for item in x) + + +def pl(x, singular, plural=None): + if plural is None: + plural = singular + "s" + + if isinstance(x, int): + length = x + else: + length = len(x) + return f"{singular if length == 1 else plural}" + + +def partition(pred, iterable): + """Partition entries into false entries and true entries. + + This is slightly modified version of partition from itertools + recipes https://docs.python.org/dev/library/itertools.html#itertools-recipes + If *pred* is slow, consider wrapping it with functools.lru_cache(). + """ + # partition(is_odd, range(10)) --> 1 3 5 7 9 and 0 2 4 6 8 + t1, t2 = tee(iterable) + return list(filter(pred, t1)), list(filterfalse(pred, t2)) diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py index 1e0d7205..8d0e0a42 100644 --- a/tests/helpers/__init__.py +++ b/tests/helpers/__init__.py @@ -1,12 +1,16 @@ import random import shutil +import string from contextlib import contextmanager from pathlib import Path from tempfile import TemporaryDirectory +from typing import List, Optional from outpack.init import outpack_init +from outpack.metadata import MetadataCore, PacketDepends from outpack.packet import Packet from outpack.root import root_open +from outpack.schema import outpack_schema_version @contextmanager @@ -22,7 +26,8 @@ def create_packet(root, name, *, packet_id=None, parameters=None): p = Packet(root, src, name, id=packet_id, parameters=parameters) try: yield p - except BaseException: + except BaseException as e: + print("Error in packet creation: ", e) p.end(insert=False) else: p.end(insert=True) @@ -41,11 +46,39 @@ def create_random_packet(root, name="data", *, parameters=None, packet_id=None): return p.id +## Create a chain of packets a, b, c, ... 
that depend on each other +def create_random_packet_chain(root, length, base=None): + ids = {} + for i in range(length): + name = chr(i + ord("a")) + with create_packet(root, name) as p: + if base is not None: + p.use_dependency(base, {"input.txt": "data.txt"}) + + d = [f"{random.random()}\n" for _ in range(10)] # noqa: S311 + with open(p.path / "data.txt", "w") as f: + f.writelines(d) + + ids[name] = p.id + base = p.id + + return ids + + def create_temporary_root(path, **kwargs): outpack_init(path, **kwargs) return root_open(path, locate=False) +def create_temporary_roots(path, location_names=None, **kwargs): + if location_names is None: + location_names = ["src", "dst"] + root = {} + for name in location_names: + root[name] = create_temporary_root(path / name, **kwargs) + return root + + def copy_examples(names, root): if isinstance(names, str): names = [names] @@ -53,3 +86,47 @@ def copy_examples(names, root): path_src = root.path / "src" / nm path_src.parent.mkdir(parents=True, exist_ok=True) shutil.copytree(Path("tests/orderly/examples") / nm, path_src) + + +def create_metadata_depends(id: str, depends: Optional[List[str]] = None): + if depends is None: + depends = [] + dependencies = [ + PacketDepends(dependency_id, "", []) for dependency_id in depends + ] + return { + id: MetadataCore( + outpack_schema_version(), + id, + "name_" + random_characters(4), + {}, + {}, + [], + dependencies, + None, + None, + ) + } + + +def random_characters(n): + return "".join( + random.choice(string.ascii_letters + string.digits) # noqa: S311 + for _ in range(n) + ) + + +# Like Rs rep function, useful for setting up test values +def rep(x, each): + ret = [] + if isinstance(each, int): + each = [each] * len(x) + if len(x) != len(each): + msg = ( + "Repeats must be int or same length as the thing you want to repeat" + ) + raise Exception(msg) + for item, times in zip(x, each): + ret.extend([item] * times) + + return ret diff --git a/tests/test_filestore.py b/tests/test_filestore.py index 56e4a896..af12abea 100644 --- a/tests/test_filestore.py +++ b/tests/test_filestore.py @@ -1,3 +1,5 @@ +import os +import platform import random import pytest @@ -25,23 +27,86 @@ def test_can_store_files(tmp_path): f.write(randstr(10)) s = FileStore(str(tmp_path / "store")) + path_a = tmp_path / "tmp" / "a" + hash_a = hash_file(path_a, "md5") + assert not s.exists(hash_a) + assert s.put(path_a, hash_a, move=True) == hash_a + assert s.exists(hash_a) + assert not path_a.exists() + assert s.ls() == [hash_a] + + path_b = tmp_path / "tmp" / "b" + hash_b = hash_file(path_b, "md5") + assert not s.exists(hash_b) + assert s.put(path_b, hash_b) == hash_b + assert s.exists(hash_b) + assert path_b.exists() + assert all(h in s.ls() for h in [hash_a, hash_b]) + + dest = tmp_path / "dest" + s.get(hash_a, dest, overwrite=False) + assert dest.exists() + assert hash_file(dest, "md5") == hash_a + + for i in range(9): + p = tmp_path / "tmp" / letters[i + 1] + s.put(p, hash_file(p, "md5")) + + assert len(s.ls()) == 10 + + +@pytest.mark.skipif( + platform.system() != "Windows", + reason="destroy onerror handler only invoked on Windows \ +so only run this test on Windows", +) +def test_destroy_store_raises_error(tmp_path, mocker): + store_path = tmp_path / "store" + + mocker.patch("os.access", return_value=True) + + store = FileStore(str(store_path)) + assert store_path.exists() + file_path = tmp_path / "a" + with open(file_path, "w") as f: + f.write(randstr(10)) + file_hash = hash_file(file_path, "md5") + assert store.put(file_path, file_hash, 
move=False) == file_hash + assert store.ls() == [file_hash] + + # Error raised from anything other than file permission issue + with pytest.raises(Exception, match="Access is denied"): + store.destroy() + + +def test_get_files_fails_if_overwrite_false(tmp_path): + tmp = tmp_path / "tmp" + tmp.mkdir() + letters = [chr(i + ord("a")) for i in range(10)] + for i in range(10): + with open(tmp_path / "tmp" / letters[i], "w") as f: + f.write(randstr(10)) + + store = FileStore(str(tmp_path / "store")) p = tmp_path / "tmp" / "a" h = hash_file(p, "md5") - assert not s.exists(h) - assert s.put(p, h) == h - assert s.exists(h) - assert s.ls() == [h] + assert not store.exists(h) + assert store.put(p, h) == h + assert store.exists(h) dest = tmp_path / "dest" - s.get(h, dest) + assert not dest.exists() + store.get(h, dest, overwrite=False) assert dest.exists() assert hash_file(dest, "md5") == h - for i in range(10): - p = tmp_path / "tmp" / letters[i] - s.put(p, hash_file(p, "md5")) + store.get(h, dest, overwrite=True) + assert dest.exists() + assert hash_file(dest, "md5") == h - assert len(s.ls()) == 10 + with pytest.raises(Exception) as e: + store.get(h, dest, overwrite=False) + assert e.match(r"Failed to copy '.+' to '.+', file already exists") def test_if_hash_not_found(tmp_path): @@ -50,5 +115,14 @@ def test_if_hash_not_found(tmp_path): h = Hash("md5", "7c4d97e580abb6c2ffb8b1872907d84b") dest = tmp_path / "dest" with pytest.raises(Exception) as e: - s.get(h, dest) + s.get(h, dest, overwrite=False) assert e.match("Hash 'md5:7c4d97e.+' not found in store") + + +def test_can_create_filename_within_the_store(tmp_path): + path = str(tmp_path / "store") + store = FileStore(path) + with store.tmp() as temp_file: + assert os.path.dirname(temp_file) == str(store._path / "tmp") + assert os.path.exists(temp_file) + assert os.path.exists(os.path.dirname(temp_file)) diff --git a/tests/test_location.py b/tests/test_location.py index 1f31568f..3da0dea4 100644 --- a/tests/test_location.py +++ b/tests/test_location.py @@ -1,19 +1,14 @@ -import json - import pytest -from outpack.ids import outpack_id from outpack.location import ( location_resolve_valid, outpack_location_add, outpack_location_list, - outpack_location_pull_metadata, outpack_location_remove, outpack_location_rename, ) -from outpack.util import read_string -from .helpers import create_random_packet, create_temporary_root +from .helpers import create_temporary_root def test_no_locations_except_local_by_default(tmp_path): @@ -217,205 +212,6 @@ def test_cant_add_wip_location_type(tmp_path): assert e.match("Cannot add a location with type 'custom' yet.") -def test_can_pull_metadata_from_a_file_base_location(tmp_path): - root_upstream = create_temporary_root( - tmp_path / "upstream", use_file_store=True - ) - - ids = [create_random_packet(root_upstream) for _ in range(3)] - root_downstream = create_temporary_root( - tmp_path / "downstream", use_file_store=True - ) - - outpack_location_add( - "upstream", - "path", - {"path": str(root_upstream.path)}, - root=root_downstream, - ) - assert outpack_location_list(root_downstream) == ["local", "upstream"] - - outpack_location_pull_metadata("upstream", root=root_downstream) - - metadata = root_downstream.index.all_metadata() - assert len(metadata) == 3 - assert set(metadata.keys()) == set(ids) - assert metadata == root_upstream.index.all_metadata() - - packets = root_downstream.index.location("upstream") - assert len(packets) == 3 - assert set(packets.keys()) == set(ids) - - -def 
test_can_pull_empty_metadata(tmp_path): - root_upstream = create_temporary_root( - tmp_path / "upstream", use_file_store=True - ) - root_downstream = create_temporary_root( - tmp_path / "downstream", use_file_store=True - ) - - outpack_location_add( - "upstream", - "path", - {"path": str(root_upstream.path)}, - root=root_downstream, - ) - outpack_location_pull_metadata("upstream", root=root_downstream) - - index = root_downstream.index.data - assert len(index.metadata) == 0 - - -def test_can_pull_metadata_from_subset_of_locations(tmp_path): - root = {"a": create_temporary_root(tmp_path / "a", use_file_store=True)} - - location_names = ["x", "y", "z"] - for name in location_names: - root[name] = create_temporary_root(tmp_path / name, use_file_store=True) - outpack_location_add( - name, "path", {"path": str(root[name].path)}, root=root["a"] - ) - - assert outpack_location_list(root["a"]) == ["local", "x", "y", "z"] - - ids = {} - for name in location_names: - ids[name] = [create_random_packet(root[name]) for _ in range(3)] - - outpack_location_pull_metadata(["x", "y"], root=root["a"]) - index = root["a"].index - - assert set(index.all_metadata()) == set(ids["x"] + ids["y"]) - locations = index.all_locations() - assert set(locations.keys()) == {"local", "x", "y"} - assert len(locations["local"]) == 0 - assert len(locations["x"]) == 3 - assert len(locations["y"]) == 3 - - x_metadata = root["x"].index.all_metadata().keys() - y_metadata = root["y"].index.all_metadata().keys() - for packet_id in index.all_metadata().keys(): - if packet_id in ids["x"]: - assert packet_id in x_metadata - else: - assert packet_id in y_metadata - - outpack_location_pull_metadata(root=root["a"]) - index = root["a"].index - - assert set(index.all_metadata()) == set(ids["x"] + ids["y"] + ids["z"]) - locations = index.all_locations() - assert set(locations.keys()) == {"local", "x", "y", "z"} - assert len(locations["local"]) == 0 - assert len(locations["x"]) == 3 - assert len(locations["y"]) == 3 - assert len(locations["z"]) == 3 - z_metadata = root["z"].index.all_metadata().keys() - for packet_id in index.all_metadata().keys(): - if packet_id in ids["x"]: - assert packet_id in x_metadata - elif packet_id in ids["y"]: - assert packet_id in y_metadata - else: - assert packet_id in z_metadata - - -def test_cant_pull_metadata_from_an_unknown_location(tmp_path): - root = create_temporary_root(tmp_path) - with pytest.raises(Exception) as e: - outpack_location_pull_metadata("upstream", root=root) - - assert e.match("Unknown location: 'upstream'") - - -def test_noop_to_pull_metadata_from_no_locations(tmp_path): - root = create_temporary_root(tmp_path) - outpack_location_pull_metadata("local", root=root) - outpack_location_pull_metadata(root=root) - - -def test_handle_metadata_where_hash_does_not_match_reported(tmp_path): - here = create_temporary_root(tmp_path / "here") - there = create_temporary_root(tmp_path / "there") - outpack_location_add("server", "path", {"path": str(there.path)}, root=here) - packet_id = create_random_packet(there) - - path_metadata = there.path / ".outpack" / "metadata" / packet_id - parsed = json.loads(read_string(path_metadata)) - with open(path_metadata, "w") as f: - f.write(json.dumps(parsed, indent=4)) - - with pytest.raises(Exception) as e: - outpack_location_pull_metadata(root=here) - - assert e.match( - f"Hash of metadata for '{packet_id}' from 'server' does not match:" - ) - assert e.match("This is bad news") - assert e.match("remove this location") - - -def 
test_handle_metadata_where_two_locations_differ_in_hash_for_same_id( - tmp_path, -): - root = {} - - for name in ["a", "b", "us"]: - root[name] = create_temporary_root(tmp_path / name) - - packet_id = outpack_id() - create_random_packet(root["a"], packet_id=packet_id) - create_random_packet(root["b"], packet_id=packet_id) - - outpack_location_add( - "a", "path", {"path": str(root["a"].path)}, root=root["us"] - ) - outpack_location_add( - "b", "path", {"path": str(root["b"].path)}, root=root["us"] - ) - - outpack_location_pull_metadata(location="a", root=root["us"]) - - with pytest.raises(Exception) as e: - outpack_location_pull_metadata(location="b", root=root["us"]) - - assert e.match( - "We have been offered metadata from 'b' that has a different" - ) - assert e.match(f"Conflicts for: '{packet_id}'") - assert e.match("please let us know") - assert e.match("remove this location") - - -def test_can_pull_metadata_through_chain_of_locations(tmp_path): - root = {} - for name in ["a", "b", "c", "d"]: - root[name] = create_temporary_root(tmp_path / name) - - # More interesting topology, with a chain of locations, but d also - # knowing directly about an earlier location - # > a -> b -> c -> d - # > `-------/ - outpack_location_add( - "a", "path", {"path": str(root["a"].path)}, root=root["b"] - ) - outpack_location_add( - "b", "path", {"path": str(root["b"].path)}, root=root["c"] - ) - outpack_location_add( - "b", "path", {"path": str(root["b"].path)}, root=root["d"] - ) - outpack_location_add( - "c", "path", {"path": str(root["c"].path)}, root=root["d"] - ) - - # Create a packet and make sure it's in both b and c - create_random_packet(root["a"]) - outpack_location_pull_metadata(root=root["b"]) - # TODO: complete test once orderly_location_pull_packet - - def test_can_resolve_locations(tmp_path): root = {} for name in ["dst", "a", "b", "c", "d"]: @@ -500,27 +296,3 @@ def test_can_resolve_locations(tmp_path): ) assert e.match("Unknown location: '[fg]', '[fg]'") - - -def test_informative_error_when_no_locations_configured(tmp_path): - root = create_temporary_root(tmp_path) - - locations = location_resolve_valid( - None, - root, - include_local=False, - include_orphan=False, - allow_no_locations=True, - ) - assert locations == [] - - with pytest.raises(Exception) as e: - location_resolve_valid( - None, - root, - include_local=False, - include_orphan=False, - allow_no_locations=False, - ) - - assert e.match("No suitable location found") diff --git a/tests/test_location_pull.py b/tests/test_location_pull.py new file mode 100644 index 00000000..bd8f5302 --- /dev/null +++ b/tests/test_location_pull.py @@ -0,0 +1,693 @@ +import contextlib +import io +import json +import os +import re +from operator import itemgetter + +import pytest + +from outpack.hash import hash_file +from outpack.ids import outpack_id +from outpack.location import ( + location_resolve_valid, + outpack_location_add, + outpack_location_list, +) +from outpack.location_pull import ( + PullPlanInfo, + _find_all_dependencies, + _location_build_pull_plan, + outpack_location_pull_metadata, + outpack_location_pull_packet, +) +from outpack.packet import Packet +from outpack.util import read_string + +from .helpers import ( + create_metadata_depends, + create_random_packet, + create_random_packet_chain, + create_temporary_root, + create_temporary_roots, + rep, +) + + +def test_can_pull_metadata_from_a_file_base_location(tmp_path): + root_upstream = create_temporary_root( + tmp_path / "upstream", use_file_store=True + ) + + ids = 
[create_random_packet(root_upstream) for _ in range(3)] + root_downstream = create_temporary_root( + tmp_path / "downstream", use_file_store=True + ) + + outpack_location_add( + "upstream", + "path", + {"path": str(root_upstream.path)}, + root=root_downstream, + ) + assert outpack_location_list(root_downstream) == ["local", "upstream"] + + outpack_location_pull_metadata("upstream", root=root_downstream) + + metadata = root_downstream.index.all_metadata() + assert len(metadata) == 3 + assert set(metadata.keys()) == set(ids) + assert metadata == root_upstream.index.all_metadata() + + packets = root_downstream.index.location("upstream") + assert len(packets) == 3 + assert set(packets.keys()) == set(ids) + + +def test_can_pull_empty_metadata(tmp_path): + root_upstream = create_temporary_root( + tmp_path / "upstream", use_file_store=True + ) + root_downstream = create_temporary_root( + tmp_path / "downstream", use_file_store=True + ) + + outpack_location_add( + "upstream", + "path", + {"path": str(root_upstream.path)}, + root=root_downstream, + ) + outpack_location_pull_metadata("upstream", root=root_downstream) + + index = root_downstream.index.data + assert len(index.metadata) == 0 + + +def test_can_pull_metadata_from_subset_of_locations(tmp_path): + root = {"a": create_temporary_root(tmp_path / "a", use_file_store=True)} + + location_names = ["x", "y", "z"] + for name in location_names: + root[name] = create_temporary_root(tmp_path / name, use_file_store=True) + outpack_location_add( + name, "path", {"path": str(root[name].path)}, root=root["a"] + ) + + assert outpack_location_list(root["a"]) == ["local", "x", "y", "z"] + + ids = {} + for name in location_names: + ids[name] = [create_random_packet(root[name]) for _ in range(3)] + + outpack_location_pull_metadata(["x", "y"], root=root["a"]) + index = root["a"].index + + assert set(index.all_metadata()) == set(ids["x"] + ids["y"]) + locations = index.all_locations() + assert set(locations.keys()) == {"local", "x", "y"} + assert len(locations["local"]) == 0 + assert len(locations["x"]) == 3 + assert len(locations["y"]) == 3 + + x_metadata = root["x"].index.all_metadata().keys() + y_metadata = root["y"].index.all_metadata().keys() + for packet_id in index.all_metadata().keys(): + if packet_id in ids["x"]: + assert packet_id in x_metadata + else: + assert packet_id in y_metadata + + outpack_location_pull_metadata(root=root["a"]) + index = root["a"].index + + assert set(index.all_metadata()) == set(ids["x"] + ids["y"] + ids["z"]) + locations = index.all_locations() + assert set(locations.keys()) == {"local", "x", "y", "z"} + assert len(locations["local"]) == 0 + assert len(locations["x"]) == 3 + assert len(locations["y"]) == 3 + assert len(locations["z"]) == 3 + z_metadata = root["z"].index.all_metadata().keys() + for packet_id in index.all_metadata().keys(): + if packet_id in ids["x"]: + assert packet_id in x_metadata + elif packet_id in ids["y"]: + assert packet_id in y_metadata + else: + assert packet_id in z_metadata + + +def test_cant_pull_metadata_from_an_unknown_location(tmp_path): + root = create_temporary_root(tmp_path) + with pytest.raises(Exception) as e: + outpack_location_pull_metadata("upstream", root=root) + + assert e.match("Unknown location: 'upstream'") + + +def test_noop_to_pull_metadata_from_no_locations(tmp_path): + root = create_temporary_root(tmp_path) + outpack_location_pull_metadata("local", root=root) + outpack_location_pull_metadata(root=root) + + +def test_handle_metadata_where_hash_does_not_match_reported(tmp_path): + 
here = create_temporary_root(tmp_path / "here") + there = create_temporary_root(tmp_path / "there") + outpack_location_add("server", "path", {"path": str(there.path)}, root=here) + packet_id = create_random_packet(there) + + path_metadata = there.path / ".outpack" / "metadata" / packet_id + parsed = json.loads(read_string(path_metadata)) + with open(path_metadata, "w") as f: + f.write(json.dumps(parsed, indent=4)) + + with pytest.raises(Exception) as e: + outpack_location_pull_metadata(root=here) + + assert e.match( + f"Hash of metadata for '{packet_id}' from 'server' does not match:" + ) + assert e.match("This is bad news") + assert e.match("remove this location") + + +def test_handle_metadata_where_two_locations_differ_in_hash_for_same_id( + tmp_path, +): + root = {} + + for name in ["a", "b", "us"]: + root[name] = create_temporary_root(tmp_path / name) + + packet_id = outpack_id() + create_random_packet(root["a"], packet_id=packet_id) + create_random_packet(root["b"], packet_id=packet_id) + + outpack_location_add( + "a", "path", {"path": str(root["a"].path)}, root=root["us"] + ) + outpack_location_add( + "b", "path", {"path": str(root["b"].path)}, root=root["us"] + ) + + outpack_location_pull_metadata(location="a", root=root["us"]) + + with pytest.raises(Exception) as e: + outpack_location_pull_metadata(location="b", root=root["us"]) + + assert e.match( + "We have been offered metadata from 'b' that has a different" + ) + assert e.match(f"Conflicts for: '{packet_id}'") + assert e.match("please let us know") + assert e.match("remove this location") + + +def test_can_pull_metadata_through_chain_of_locations(tmp_path): + root = {} + for name in ["a", "b", "c", "d"]: + root[name] = create_temporary_root(tmp_path / name) + + # More interesting topology, with a chain of locations, but d also + # knowing directly about an earlier location + # > a -> b -> c -> d + # > `-------/ + outpack_location_add( + "a", "path", {"path": str(root["a"].path)}, root=root["b"] + ) + outpack_location_add( + "b", "path", {"path": str(root["b"].path)}, root=root["c"] + ) + outpack_location_add( + "b", "path", {"path": str(root["b"].path)}, root=root["d"] + ) + outpack_location_add( + "c", "path", {"path": str(root["c"].path)}, root=root["d"] + ) + + # Create a packet and make sure it's in both b and c + create_random_packet(root["a"]) + outpack_location_pull_metadata(root=root["b"]) + # TODO: complete test once orderly_location_pull_packet + + +def test_informative_error_when_no_locations_configured(tmp_path): + root = create_temporary_root(tmp_path) + + locations = location_resolve_valid( + None, + root, + include_local=False, + include_orphan=False, + allow_no_locations=True, + ) + assert locations == [] + + with pytest.raises(Exception) as e: + location_resolve_valid( + None, + root, + include_local=False, + include_orphan=False, + allow_no_locations=False, + ) + + assert e.match("No suitable location found") + + with pytest.raises(Exception) as e: + outpack_location_pull_packet(outpack_id(), root=root) + assert e.match("No suitable location found") + + +def test_can_resolve_dependencies_where_there_are_none(): + metadata = create_metadata_depends("a") + deps = _find_all_dependencies(["a"], metadata) + assert deps == ["a"] + + metadata = { + **create_metadata_depends("a"), + **create_metadata_depends("b", ["a"]), + } + deps = _find_all_dependencies(["a"], metadata) + assert deps == ["a"] + + +def test_can_find_dependencies(): + metadata = { + **create_metadata_depends("a"), + **create_metadata_depends("b"), + 
**create_metadata_depends("c"), + **create_metadata_depends("d", ["a", "b"]), + **create_metadata_depends("e", ["b", "c"]), + **create_metadata_depends("f", ["a", "c"]), + **create_metadata_depends("g", ["a", "f", "c"]), + **create_metadata_depends("h", ["a", "b", "c"]), + **create_metadata_depends("i", ["f"]), + **create_metadata_depends("j", ["i", "e", "a"]), + } + + assert _find_all_dependencies(["a"], metadata) == ["a"] + assert _find_all_dependencies(["b"], metadata) == ["b"] + assert _find_all_dependencies(["c"], metadata) == ["c"] + + assert _find_all_dependencies(["d"], metadata) == ["a", "b", "d"] + assert _find_all_dependencies(["e"], metadata) == ["b", "c", "e"] + assert _find_all_dependencies(["f"], metadata) == ["a", "c", "f"] + + assert _find_all_dependencies(["g"], metadata) == ["a", "c", "f", "g"] + assert _find_all_dependencies(["h"], metadata) == ["a", "b", "c", "h"] + assert _find_all_dependencies(["i"], metadata) == ["a", "c", "f", "i"] + assert _find_all_dependencies(["j"], metadata) == [ + "a", + "b", + "c", + "e", + "f", + "i", + "j", + ] + + +def test_can_find_multiple_dependencies_at_once(): + metadata = { + **create_metadata_depends("a"), + **create_metadata_depends("b"), + **create_metadata_depends("c"), + **create_metadata_depends("d", ["a", "b"]), + **create_metadata_depends("e", ["b", "c"]), + **create_metadata_depends("f", ["a", "c"]), + **create_metadata_depends("g", ["a", "f", "c"]), + **create_metadata_depends("h", ["a", "b", "c"]), + **create_metadata_depends("i", ["f"]), + **create_metadata_depends("j", ["i", "e", "a"]), + } + + assert _find_all_dependencies([], metadata) == [] + assert _find_all_dependencies(["c", "b", "a"], metadata) == ["a", "b", "c"] + assert _find_all_dependencies(["d", "e", "f"], metadata) == [ + "a", + "b", + "c", + "d", + "e", + "f", + ] + + +def test_can_pull_packet_from_location_into_another_file_store(tmp_path): + root = create_temporary_roots(tmp_path, use_file_store=True) + + id = create_random_packet(root["src"]) + outpack_location_add( + "src", "path", {"path": str(root["src"].path)}, root=root["dst"] + ) + outpack_location_pull_metadata(root=root["dst"]) + outpack_location_pull_packet(id, root=root["dst"]) + + index = root["dst"].index + assert index.unpacked() == [id] + assert os.path.exists( + root["dst"].path / "archive" / "data" / id / "data.txt" + ) + + meta = index.metadata(id) + assert all(root["dst"].files.exists(file.hash) for file in meta.files) + + +def test_can_pull_packet_from_location_file_store_only(tmp_path): + root = create_temporary_roots( + tmp_path, use_file_store=True, path_archive=None + ) + + id = create_random_packet(root["src"]) + outpack_location_add( + "src", "path", {"path": str(root["src"].path)}, root=root["dst"] + ) + outpack_location_pull_metadata(root=root["dst"]) + outpack_location_pull_packet(id, root=root["dst"]) + + index = root["dst"].index + assert index.unpacked() == [id] + assert not os.path.exists( + root["dst"].path / "archive" / "data" / id / "data.txt" + ) + + meta = index.metadata(id) + assert all(root["dst"].files.exists(file.hash) for file in meta.files) + + +def test_can_pull_packet_from_one_location_to_another_archive(tmp_path): + root = create_temporary_roots(tmp_path, use_file_store=False) + + id = create_random_packet(root["src"]) + outpack_location_add( + "src", "path", {"path": str(root["src"].path)}, root=root["dst"] + ) + outpack_location_pull_metadata(root=root["dst"]) + outpack_location_pull_packet(id, root=root["dst"]) + + index = root["dst"].index + assert 
index.unpacked() == [id] + assert os.path.exists( + root["dst"].path / "archive" / "data" / id / "data.txt" + ) + + +def test_detect_and_avoid_modified_files_in_source_repository(tmp_path): + root = create_temporary_roots(tmp_path, use_file_store=False) + + tmp_dir = tmp_path / "new_dir" + os.mkdir(tmp_dir) + with open(tmp_dir / "a.txt", "w") as f: + f.writelines("my data a") + with open(tmp_dir / "b.txt", "w") as f: + f.writelines("my data b") + ids = [None] * 2 + for i in range(len(ids)): + p = Packet(root["src"], tmp_dir, "data") + p.end() + ids[i] = p.id + + outpack_location_add( + "src", "path", {"path": str(root["src"].path)}, root["dst"] + ) + outpack_location_pull_metadata(root=root["dst"]) + + ## When I corrupt the file in the first ID by truncating it: + src_data = root["src"].path / "archive" / "data" + dest_data = root["dst"].path / "archive" / "data" + with open(src_data / ids[0] / "a.txt", "w") as f: + f.truncate(0) + + ## and then try and pull it, user is warned + f = io.StringIO() + with contextlib.redirect_stdout(f): + outpack_location_pull_packet(ids[0], root=root["dst"]) + + assert re.search( + r"Rejecting file from archive 'a\.txt' in 'data/", f.getvalue() + ) + + assert hash_file(dest_data / ids[0] / "a.txt") == hash_file( + src_data / ids[1] / "a.txt" + ) + assert hash_file(dest_data / ids[0] / "b.txt") == hash_file( + src_data / ids[1] / "b.txt" + ) + + +def test_do_not_unpack_packet_twice(tmp_path): + root = create_temporary_roots(tmp_path) + + id = create_random_packet(root["src"]) + outpack_location_add( + "src", "path", {"path": str(root["src"].path)}, root["dst"] + ) + outpack_location_pull_metadata(root=root["dst"]) + + assert outpack_location_pull_packet(id, root=root["dst"]) == [id] + assert outpack_location_pull_packet(id, root=root["dst"]) == [] + + +def test_sensible_error_if_packet_not_known(tmp_path): + root = create_temporary_roots(tmp_path) + id = create_random_packet(root["src"]) + outpack_location_add( + "src", "path", {"path": str(root["src"].path)}, root=root["dst"] + ) + + with pytest.raises(Exception) as e: + outpack_location_pull_packet(id, root=root["dst"]) + assert e.match(f"Failed to find packet '{id}'") + assert e.match("Looked in location 'src'") + assert e.match("Do you need to run 'outpack_location_pull_metadata'?") + + +def test_error_if_dependent_packet_not_known(tmp_path): + root = create_temporary_roots( + tmp_path, ["a", "c"], require_complete_tree=True + ) + root["b"] = create_temporary_root( + tmp_path / "b", require_complete_tree=False + ) + + ids = create_random_packet_chain(root["a"], 5) + outpack_location_add( + "a", "path", {"path": str(root["a"].path)}, root=root["b"] + ) + outpack_location_pull_metadata(root=root["b"]) + outpack_location_pull_packet(ids["e"], root=root["b"]) + + outpack_location_add( + "b", "path", {"path": str(root["b"].path)}, root=root["c"] + ) + outpack_location_pull_metadata(root=root["c"]) + + with pytest.raises(Exception) as e: + outpack_location_pull_packet(ids["e"], root=root["c"]) + assert e.match(f"Failed to find packet '{ids['d']}") + assert e.match("Looked in location 'b'") + assert e.match( + "1 missing packet was requested as dependency of the " + f"one you asked for: '{ids['d']}'" + ) + + +def test_can_pull_a_tree_recursively(tmp_path): + root = create_temporary_roots(tmp_path) + ids = create_random_packet_chain(root["src"], 3) + + outpack_location_add( + "src", "path", {"path": str(root["src"].path)}, root["dst"] + ) + outpack_location_pull_metadata(root=root["dst"]) + pulled_packets = 
+    pulled_packets = outpack_location_pull_packet(
+        ids["c"], recursive=True, root=root["dst"]
+    )
+    assert set(pulled_packets) == set(itemgetter("a", "b", "c")(ids))
+
+    assert set(root["dst"].index.unpacked()) == set(
+        root["src"].index.unpacked()
+    )
+
+    pulled_packets = outpack_location_pull_packet(
+        ids["c"], recursive=True, root=root["dst"]
+    )
+    assert pulled_packets == []
+
+
+def test_can_filter_locations(tmp_path):
+    location_names = ["dst", "a", "b", "c", "d"]
+    root = create_temporary_roots(tmp_path, location_names, use_file_store=True)
+    for name in location_names:
+        if name != "dst":
+            outpack_location_add(
+                name, "path", {"path": str(root[name].path)}, root=root["dst"]
+            )
+
+    ids_a = [create_random_packet(root["a"]) for _ in range(3)]
+    outpack_location_add(
+        "a", "path", {"path": str(root["a"].path)}, root=root["b"]
+    )
+    outpack_location_pull_metadata(root=root["b"])
+    outpack_location_pull_packet(ids_a, root=root["b"])
+
+    ids_b = ids_a + [create_random_packet(root["b"]) for _ in range(3)]
+    ids_c = [create_random_packet(root["c"]) for _ in range(3)]
+    outpack_location_add(
+        "a", "path", {"path": str(root["a"].path)}, root=root["d"]
+    )
+    outpack_location_add(
+        "c", "path", {"path": str(root["c"].path)}, root=root["d"]
+    )
+    outpack_location_pull_metadata(root=root["d"])
+    outpack_location_pull_packet(ids_a, root=root["d"])
+    outpack_location_pull_packet(ids_c, root=root["d"])
+    ids_d = ids_c + [create_random_packet(root["d"]) for _ in range(3)]
+
+    outpack_location_pull_metadata(root=root["dst"])
+
+    ids = list(set(ids_a + ids_b + ids_c + ids_d))
+
+    def locs(location_names):
+        return location_resolve_valid(
+            location_names,
+            root["dst"],
+            include_local=False,
+            include_orphan=False,
+            allow_no_locations=False,
+        )
+
+    plan = _location_build_pull_plan(ids, None, None, root=root["dst"])
+    locations = [file.location for file in plan.files]
+    assert locations == rep(["a", "b", "c", "d"], 3)
+
+    # Invert order, prefer "d"
+    plan = _location_build_pull_plan(
+        ids, locs(["d", "c", "b", "a"]), None, root=root["dst"]
+    )
+    locations = [file.location for file in plan.files]
+    assert locations == rep(["d", "b"], [9, 3])
+
+    # Drop redundant locations
+    plan = _location_build_pull_plan(
+        ids, locs(["b", "d"]), None, root=root["dst"]
+    )
+    locations = [file.location for file in plan.files]
+    assert locations == rep(["b", "d"], 6)
+
+    # Corner cases
+    plan = _location_build_pull_plan([ids_a[0]], None, None, root=root["dst"])
+    locations = [file.location for file in plan.files]
+    assert locations == ["a"]
+
+    plan = _location_build_pull_plan([], None, None, root=root["dst"])
+    assert plan.files == []
+    assert plan.packets == {}
+    assert plan.info == PullPlanInfo(0, 0, 0)
+
+    # Failure to find packets
+    with pytest.raises(Exception) as e:
+        _location_build_pull_plan(ids, ["a", "b", "c"], None, root=root["dst"])
+    assert "Looked in locations 'a', 'b', 'c'" in e.value.args[0]
+    assert (
+        "Do you need to run 'outpack_location_pull_metadata()'?"
+        in e.value.args[0]
+    )
+
+
+def test_nonrecursive_pulls_are_prevented_by_configuration(tmp_path):
+    root = create_temporary_roots(tmp_path, require_complete_tree=True)
+    ids = create_random_packet_chain(root["src"], 3)
+
+    with pytest.raises(Exception) as e:
+        outpack_location_pull_packet(
+            ids["c"], recursive=False, root=root["dst"]
+        )
+
+    assert (
+        "'recursive' must be True (or None) with your configuration"
+        in e.value.args[0]
+    )
+
+
+def test_if_recursive_pulls_are_required_pulls_are_default_recursive(tmp_path):
+    root = create_temporary_roots(
+        tmp_path, ["src", "shallow"], require_complete_tree=False
+    )
+    root["deep"] = create_temporary_root(tmp_path, require_complete_tree=True)
+
+    ids = create_random_packet_chain(root["src"], 3)
+
+    for r in [root["shallow"], root["deep"]]:
+        outpack_location_add(
+            "src", "path", {"path": str(root["src"].path)}, root=r
+        )
+        outpack_location_pull_metadata(root=r)
+
+    outpack_location_pull_packet(ids["c"], recursive=None, root=root["shallow"])
+    assert root["shallow"].index.unpacked() == [ids["c"]]
+
+    outpack_location_pull_packet(ids["c"], recursive=None, root=root["deep"])
+    assert root["deep"].index.unpacked() == list(ids.values())
+
+
+## TODO: Uncomment when wired up with searching
+# def test_can_pull_packets_as_result_of_query(tmp_path):
+#     TODO
+# def test_pull_packet_errors_if_allow_remote_is_false(tmp_path):
+#     root = create_temporary_roots(tmp_path)
+#     id = create_random_packet(root["src"])
+#
+#     outpack_location_add("src", "path",
+#                          {"path": str(root["src"].path)},
+#                          root=root["dst"])
+#     outpack_location_pull_metadata(root=root["dst"])
+#
+#     with pytest.raises(Exception) as e:
+#         outpack_location_pull_packet(None, options={"allow_remote": False}, root=root["dst"])
+#     assert e.match("If specifying 'options', 'allow_remote' must be True")
+
+
+def test_skip_files_in_file_store(tmp_path):
+    root = create_temporary_roots(tmp_path, use_file_store=True)
+
+    ids = create_random_packet_chain(root["src"], 3)
+    outpack_location_add(
+        "src", "path", {"path": str(root["src"].path)}, root=root["dst"]
+    )
+
+    outpack_location_pull_metadata(root=root["dst"])
+    outpack_location_pull_packet(ids["a"], root=root["dst"])
+
+    f = io.StringIO()
+    with contextlib.redirect_stdout(f):
+        outpack_location_pull_packet(ids["b"], root=root["dst"])
+    text = f.getvalue()
+    assert re.search("Found 1 file in the file store", text)
+    assert re.search(
+        r"Need to fetch 1 file \([0-9]* Bytes\) from 1 location", text
+    )
+
+
+def test_skip_files_already_on_disk(tmp_path):
+    root = create_temporary_roots(tmp_path, use_file_store=False)
+
+    ids = create_random_packet_chain(root["src"], 3)
+    outpack_location_add(
+        "src", "path", {"path": str(root["src"].path)}, root=root["dst"]
+    )
+
+    outpack_location_pull_metadata(root=root["dst"])
+    outpack_location_pull_packet(ids["a"], root=root["dst"])
+
+    f = io.StringIO()
+    with contextlib.redirect_stdout(f):
+        outpack_location_pull_packet(ids["b"], root=root["dst"])
+    text = f.getvalue()
+    assert re.search("Found 1 file on disk", text)
+    assert re.search(
+        r"Need to fetch 1 file \([0-9]* Bytes\) from 1 location", text
+    )
diff --git a/tests/test_root.py b/tests/test_root.py
index 1b283abe..d1bbf517 100644
--- a/tests/test_root.py
+++ b/tests/test_root.py
@@ -89,7 +89,7 @@ def test_can_reject_corrupted_files(tmp_path, capsys):
     captured = capsys.readouterr()
     assert (
         captured.out
-        == f"Rejecting file from archive 'data.txt'in data/{id[1]}\n"
+        == f"Rejecting file from archive 'data.txt' in 'data/{id[1]}'\n"
     )
/ "dest" with pytest.raises(Exception, match="File not found in archive"): diff --git a/tests/test_tools.py b/tests/test_tools.py index 78746dc4..87c6885b 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -29,9 +29,9 @@ def test_git_report_no_info_without_git_repo(tmp_path): def test_git_report_git_info_if_possible(tmp_path): sha = simple_git_example(tmp_path) res = git_info(tmp_path) - # This default branch name won't be robust to changes in future - # git versions - assert res == GitInfo(branch="master", sha=sha, url=[]) + assert res == GitInfo(branch="master", sha=sha, url=[]) or res == GitInfo( + branch="main", sha=sha, url=[] + ) def test_git_report_single_url(tmp_path): diff --git a/tests/test_util.py b/tests/test_util.py index 43c8fcf3..9d037942 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -7,9 +7,12 @@ assert_file_exists, expand_dirs, find_file_descend, + format_list, iso_time_str, match_value, num_to_time, + partition, + pl, read_string, run_script, time_to_num, @@ -115,3 +118,31 @@ def test_can_inject_data_into_run(tmp_path): assert tmp_path.joinpath("result.txt").exists() with open(tmp_path.joinpath("result.txt")) as f: assert f.read() == "hello" + + +def test_can_format_list(): + assert format_list(["one", "two"]) == "'one', 'two'" + assert format_list(["one"]) == "'one'" + assert format_list({"one", "two"}) in ("'one', 'two'", "'two', 'one'") + assert format_list({"one", "one"}) == "'one'" # noqa:B033 + + +def test_can_pluralise(): + assert pl([], "item") == "items" + assert pl(["one"], "item") == "item" + assert pl(["one", "two"], "item") == "items" + assert pl({"Inky"}, "octopus", "octopodes") == "octopus" + assert pl({"Inky", "Tentacool"}, "octopus", "octopodes") == "octopodes" + assert pl(2, "item") == "items" + assert pl(1, "item") == "item" + + +def test_can_partition(): + test_list = ["one", "two", "three", "four", "five"] + true_list, false_list = partition(lambda x: "e" in x, test_list) + assert true_list == ["one", "three", "five"] + assert false_list == ["two", "four"] + + true_list, false_list = partition(lambda x: "z" in x, test_list) + assert true_list == [] + assert false_list == test_list