Skip to content

Commit

Permalink
Merge pull request #33 from mrc-ide/mrc-4629
Browse files Browse the repository at this point in the history
mrc-4629: Add function to pull a packet from a location
  • Loading branch information
r-ash authored Feb 28, 2024
2 parents 2014c26 + 45c7e77 commit 1ebc054
Show file tree
Hide file tree
Showing 17 changed files with 1,503 additions and 387 deletions.
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ dependencies = [
"importlib_resources",
"jsonschema",
"pygit2",
"outpack-query-parser"
"outpack-query-parser",
"humanize"
]

[project.urls]
Expand All @@ -44,6 +45,7 @@ path = "src/outpack/__about__.py"
dependencies = [
"coverage[toml]>=6.5",
"pytest",
"pytest_mock",
"sphinx",
"sphinx-rtd-theme",
"myst-parser",
Expand Down
69 changes: 59 additions & 10 deletions src/outpack/filestore.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,50 +2,99 @@
import os.path
import shutil
import stat
import tempfile
from contextlib import contextmanager
from pathlib import Path

from outpack.hash import Hash, hash_parse, hash_validate_file


class FileStore:
    """Content-addressed file store rooted at a directory.

    A file with hash ``<algorithm>:<value>`` is kept at
    ``<root>/<algorithm>/<value[:2]>/<value[2:]>`` and is made read-only
    once it has been added.  A ``tmp`` directory directly under the root
    is reserved for scratch files handed out by :meth:`tmp`.
    """

    # Name of the scratch directory used by ``tmp``.  It sits next to the
    # per-algorithm directories, so ``ls`` must not treat it as one.
    _TMP_DIR = "tmp"

    def __init__(self, path):
        """Open the store at ``path``, creating the directory if needed."""
        self._path = Path(path)
        os.makedirs(path, exist_ok=True)

    def filename(self, hash):
        """Return the path inside the store for ``hash``.

        The file is not required to exist.
        """
        dat = hash_parse(hash)
        return self._path / dat.algorithm / dat.value[:2] / dat.value[2:]

    def get(self, hash, dst, *, overwrite=False):
        """Copy the file stored under ``hash`` to ``dst``.

        Missing parent directories of ``dst`` are created.

        Raises:
            Exception: if ``hash`` is not in the store, or if ``dst``
                already exists and ``overwrite`` is false.
        """
        src = self.filename(hash)
        if not os.path.exists(src):
            msg = f"Hash '{hash}' not found in store"
            raise Exception(msg)
        # A bare file name has an empty dirname, and os.makedirs("")
        # raises, so only create the parent when there is one.
        parent = os.path.dirname(dst)
        if parent:
            os.makedirs(parent, exist_ok=True)
        if not overwrite and os.path.exists(dst):
            msg = f"Failed to copy '{src}' to '{dst}', file already exists"
            raise Exception(msg)
        shutil.copyfile(src, dst)

    def exists(self, hash):
        """Return whether a file for ``hash`` is present in the store."""
        return os.path.exists(self.filename(hash))

    def put(self, src, hash, *, move=False):
        """Add the file ``src`` to the store under ``hash`` and return ``hash``.

        ``src`` is validated against ``hash`` first.  If the store already
        holds a file for ``hash`` nothing is copied or moved.  With
        ``move=True`` the source file is moved rather than copied.
        """
        hash_validate_file(src, hash)
        dst = self.filename(hash)
        if not os.path.exists(dst):
            dst.parent.mkdir(parents=True, exist_ok=True)
            if move:
                shutil.move(src, dst)
            else:
                shutil.copyfile(src, dst)
            # Make file readonly for everyone
            dst.chmod(0o444)
        return hash

    def ls(self):
        """Return a ``Hash`` for every file currently in the store."""
        # Lots of ways of pulling this off with higher-order functions
        # (os.walk, Path.glob etc), but this is probably clearest.
        ret = []
        for algorithm in os.listdir(self._path):
            if algorithm == self._TMP_DIR:
                # Scratch files live directly in this directory; they are
                # not <prefix>/<suffix> entries and listing into them
                # would fail.
                continue
            path_alg = self._path / algorithm
            for prefix in os.listdir(path_alg):
                path_prefix = path_alg / prefix
                for suffix in os.listdir(path_prefix):
                    ret.append(Hash(algorithm, prefix + suffix))
        return ret

    def destroy(self) -> None:
        """Delete the store directory and everything in it."""

        def onerror(func, path, _exc_info):
            """
            Error handler for ``shutil.rmtree``.
            If the error is due to an access error (read only file)
            it attempts to add write permission and then retries.
            If the error is for another reason it re-raises the error.
            We manually remove write permission in ``put`` above so this
            is expected.
            Note we only need this on Windows; on Linux ``shutil.rmtree``
            will successfully remove the dir and its contents without
            having to add write permission to individual files.
            Usage : ``shutil.rmtree(path, onerror=onerror)``
            """
            if not os.access(path, os.W_OK):
                os.chmod(path, stat.S_IWUSR)
                func(path)
            else:
                # Called from inside rmtree's exception handler, so a bare
                # raise re-raises the error that triggered this callback.
                raise

        shutil.rmtree(self._path, onerror=onerror)

    @contextmanager
    def tmp(self):
        """Yield the name of a fresh scratch file inside the store.

        The file is created (empty) under ``<root>/tmp`` and is removed
        when the context exits, if it still exists at that point.
        """
        # On a newer version of tempfile we could use `delete_on_close = False`
        # see
        # https://github.com/mrc-ide/outpack-py/pull/33#discussion_r1500522877
        path = self._path / self._TMP_DIR
        path.mkdir(exist_ok=True)
        # Only the reserved name is needed.  Close the handle straight away
        # so it is not leaked and so the file can be rewritten, moved or
        # unlinked by the caller on platforms that lock open files.
        with tempfile.NamedTemporaryFile(dir=path, delete=False) as f:
            name = f.name
        try:
            yield name
        finally:
            try:
                os.unlink(name)
            except OSError:
                # Best effort: the caller may already have moved or
                # removed the file.
                pass
32 changes: 22 additions & 10 deletions src/outpack/index.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
import pathlib
from dataclasses import dataclass
from typing import List
from typing import Dict, List

from outpack.metadata import read_metadata_core, read_packet_location
from outpack.metadata import (
MetadataCore,
PacketLocation,
read_metadata_core,
read_packet_location,
)


@dataclass
class IndexData:
metadata: dict
location: dict
metadata: Dict[str, MetadataCore]
location: Dict[str, Dict[str, PacketLocation]]
unpacked: List[str]

@staticmethod
Expand All @@ -29,21 +34,28 @@ def refresh(self):
self.data = _index_update(self._path, self.data)
return self

def all_metadata(self) -> Dict[str, MetadataCore]:
    """Refresh the index and return its full metadata mapping."""
    refreshed = self.refresh()
    return refreshed.data.metadata

def metadata(self, id) -> MetadataCore:
    """Return the metadata stored for packet ``id``.

    The currently loaded data is consulted first; the index is refreshed
    only when ``id`` is not already present.
    """
    known = self.data.metadata
    if id not in known:
        known = self.refresh().data.metadata
    return known[id]

def all_locations(self) -> Dict[str, Dict[str, PacketLocation]]:
    """Refresh the index and return the mapping of all location data."""
    refreshed = self.refresh()
    return refreshed.data.location

def location(self, name) -> Dict[str, PacketLocation]:
    """Refresh the index and return the entries held for location ``name``.

    The lookup is a plain subscript, so an unknown ``name`` propagates
    whatever the underlying mapping raises.
    """
    by_location = self.refresh().data.location
    return by_location[name]

def unpacked(self):
def packets_in_location(self, name) -> List[str]:
    """Return the packet ids recorded for location ``name``.

    An empty list is returned when looking up ``name`` raises ``KeyError``.
    """
    try:
        entries = self.location(name)
    except KeyError:
        return []
    return list(entries.keys())

def unpacked(self) -> List[str]:
    """Refresh the index and return its list of unpacked packets."""
    refreshed = self.refresh()
    return refreshed.data.unpacked


Expand All @@ -62,7 +74,7 @@ def _read_metadata(path_root, data):
return data


def _read_locations(path_root, data):
def _read_locations(path_root, data) -> Dict[str, Dict[str, PacketLocation]]:
path = path_root / ".outpack" / "location"
for loc in path.iterdir():
if loc.name not in data:
Expand Down
119 changes: 8 additions & 111 deletions src/outpack/location.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,8 @@
import collections
import os
import shutil
from typing import List

from outpack.config import Location, update_config
from outpack.hash import hash_validate_string
from outpack.location_path import OutpackLocationPath
from outpack.metadata import PacketLocation
from outpack.packet import mark_known
from outpack.root import root_open
from outpack.static import (
LOCATION_LOCAL,
Expand Down Expand Up @@ -96,7 +91,7 @@ def location_resolve_valid(
isinstance(item, str) for item in location
):
unknown = set(location).difference(outpack_location_list(root))
if len(unknown) > 0:
if unknown:
unknown_text = "', '".join(unknown)
msg = f"Unknown location: '{unknown_text}'"
raise Exception(msg)
Expand All @@ -119,111 +114,6 @@ def location_resolve_valid(
return location


def outpack_location_pull_metadata(location=None, root=None, *, locate=True):
    """Pull packet metadata from the resolved locations into ``root``.

    ``location`` is resolved with ``location_resolve_valid`` (local and
    orphan locations excluded, an empty result allowed).  For each
    resolved location, in order: missing metadata is pulled, hashes of
    all packets already known to the index are checked against that
    location, and the location's packets are marked as known.
    """
    root = root_open(root, locate=locate)
    resolved = location_resolve_valid(
        location,
        root,
        include_local=False,
        include_orphan=False,
        allow_no_locations=True,
    )
    for name in resolved:
        driver = _location_driver(name, root)
        _pull_all_metadata(driver, root, name)
        # Flatten every location's entries into one list for validation.
        known_packets = []
        for entries in root.index.all_locations().values():
            known_packets += list(entries.values())
        _validate_hashes(driver, name, known_packets)
        _mark_all_known(driver, root, name)

    # TODO: mrc-4601 deorphan recovered packets


def _pull_all_metadata(driver, root, location_name):
known_there = driver.list()
known_here = root.index.all_metadata().keys()
for packet_id in known_there:
if packet_id not in known_here:
_pull_packet_metadata(driver, root, location_name, packet_id)


def _get_remove_location_hint(location_name):
return (
f'Probably all you can do at this point is '
f'remove this location from your configuration '
f'by running '
f'orderly_location_remove("{location_name}")'
)


def _pull_packet_metadata(driver, root, location_name, packet_id):
    """Fetch, validate and store the metadata for one packet.

    The metadata returned by the driver is checked against the hash the
    driver lists for ``packet_id`` and then written to
    ``<root.path>/.outpack/metadata/<packet_id>``.
    """
    metadata = driver.metadata(packet_id)[packet_id]
    expected_hash = driver.list()[packet_id].hash

    description = f"metadata for '{packet_id}' from '{location_name}'"
    hints = [
        "This is bad news, I'm afraid. Your location is sending data "
        "that does not match the hash it says it does. Please let us "
        "know how this might have happened.",
        _get_remove_location_hint(location_name),
    ]
    hash_validate_string(metadata, expected_hash, description, hints)

    metadata_dir = root.path / ".outpack" / "metadata"
    os.makedirs(metadata_dir, exist_ok=True)
    # NOTE(review): presumably ``metadata`` is a single string, in which
    # case writelines emits it unchanged - confirm against the driver.
    with open(metadata_dir / packet_id, "w") as out:
        out.writelines(metadata)


def _validate_hashes(driver, location_name, packets: List[PacketLocation]):
mismatched_hashes = set()
known_there = driver.list()
for packet in packets:
if known_there.get(packet.packet) is not None:
hash_there = known_there[packet.packet].hash
hash_here = packet.hash
if hash_there != hash_here:
mismatched_hashes.add(packet.packet)

if len(mismatched_hashes) > 0:
id_text = "', '".join(mismatched_hashes)
msg = (
f"Location '{location_name}' has conflicting metadata\n"
f"This is really bad news. We have been offered metadata "
f"from '{location_name}' that has a different hash to "
f"metadata that we have already imported from other "
f"locations. I'm not going to import this new metadata, "
f"but there's no guarantee that the older metadata is "
f"actually what you want!\nConflicts for: '{id_text}'\n"
f"We would be interested in this case, please let us know\n"
f"{_get_remove_location_hint(location_name)}"
)
raise Exception(msg)


def _mark_all_known(driver, root, location_name):
try:
known_here = root.index.location(location_name)
except KeyError:
known_here = {}

known_there = driver.list()
for packet_id in known_there:
if packet_id not in known_here.keys():
mark_known(
root,
packet_id,
location_name,
known_there[packet_id].hash,
known_there[packet_id].time,
)


def _location_check_new_name(root, name):
if _location_exists(root, name):
msg = f"A location with name '{name}' already exists"
Expand All @@ -240,6 +130,13 @@ def _location_exists(root, name):
return name in outpack_location_list(root)


# TODO: Create a driver interface type.
# At the moment we can't specify a return type for the driver
# in this function. We want to return either an
# OutpackLocationPath driver, an HTTP driver, or other driver
# types down the line. We could use a union type, but it
# would be nicer to use an interface-like pattern;
# see mrc-5043.
def _location_driver(location_name, root):
location = root.config.location[location_name]
if location.type == "path":
Expand Down
3 changes: 1 addition & 2 deletions src/outpack/location_path.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def metadata(self, packet_ids):

all_ids = self.__root.index.location(LOCATION_LOCAL).keys()
missing_ids = set(packet_ids).difference(all_ids)
if len(missing_ids) > 0:
if missing_ids:
missing_msg = "', '".join(missing_ids)
msg = f"Some packet ids not found: '{missing_msg}'"
raise Exception(msg)
Expand All @@ -32,7 +32,6 @@ def metadata(self, packet_ids):
def fetch_file(self, hash, dest):
if self.__root.config.core.use_file_store:
path = self.__root.files.filename(hash)
print(path)
if not os.path.exists(path):
msg = f"Hash '{hash}' not found at location"
raise Exception(msg)
Expand Down
Loading

0 comments on commit 1ebc054

Please sign in to comment.