Skip to content

Commit

Permalink
Merge pull request conda-incubator#21 from FaustinCarter/feature_diff…
Browse files Browse the repository at this point in the history
…-tar

Add optional targets for reference.json and update.tar to conda-diff-tar
  • Loading branch information
xhochy authored Oct 2, 2020
2 parents 992a0da + 262d1aa commit b9e5923
Show file tree
Hide file tree
Showing 5 changed files with 228 additions and 32 deletions.
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ cover/
nosetests.xml
coverage.xml

# Possible files generated during tests
reference.json
update.tar

# Translations
*.mo
*.pot
Expand Down Expand Up @@ -71,6 +75,9 @@ docs/_build/
#pycharm
.idea/*

#vscode
.vscode

#Dolphin browser files
.directory/
.directory
Expand Down
23 changes: 20 additions & 3 deletions conda_mirror/conda_mirror.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,12 @@ def _remove_package(pkg_path, reason):
The reason why the package is being removed
"""
msg = "Removing: %s. Reason: %s" % (pkg_path, reason)
logger.warning(msg)
if logger:
logger.warning(msg)
else:
# Logging breaks in multiprocessing in Windows
# TODO: Fix this properly with a logging Queue
sys.stdout.write("Warning: " + msg)
os.remove(pkg_path)
return pkg_path, msg

Expand Down Expand Up @@ -656,13 +661,25 @@ def _validate_or_remove_package(args):
try:
package_metadata = package_repodata[package]
except KeyError:
logger.warning("%s is not in the upstream index. Removing...", package)
log_msg = f"{package} is not in the upstream index. Removing..."
if logger:
logger.warning(log_msg)
else:
# Windows does not handle multiprocessing logging well
# TODO: Fix this properly with a logging Queue
sys.stdout.write("Warning: " + log_msg)
reason = "Package is not in the repodata index"
package_path = os.path.join(package_directory, package)
return _remove_package(package_path, reason=reason)
# validate the integrity of the package, the size of the package and
# its hashes
logger.info("Validating {:4d} of {:4d}: {}.".format(num + 1, num_packages, package))
log_msg = "Validating {:4d} of {:4d}: {}.".format(num + 1, num_packages, package)
if logger:
logger.info(log_msg)
else:
# Windows does not handle multiprocessing logging well
# TODO: Fix this properly with a logging Queue
sys.stdout.write("Info: "+log_msg)
package_path = os.path.join(package_directory, package)
return _validate(
package_path, md5=package_metadata.get("md5"), size=package_metadata.get("size")
Expand Down
82 changes: 69 additions & 13 deletions conda_mirror/diff_tar.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
from os.path import abspath, isdir, join, relpath


REFERENCE_PATH = "./reference.json"
# Fallback paths used when the caller does not pass explicit file locations
# to write_reference / read_reference / tar_repo.
DEFAULT_REFERENCE_PATH = "./reference.json"
DEFAULT_UPDATE_PATH = "./update.tar"


class NoReferenceError(FileNotFoundError):
Expand Down Expand Up @@ -72,38 +73,44 @@ def verify_all_repos(mirror_dir):
print("MD5 mismatch: %s" % path)


def write_reference(mirror_dir, outfile=None):
    """
    Serialize the content of all repodata.json files under *mirror_dir*
    (the "reference file") as pretty-printed JSON and write it to
    *outfile*, falling back to DEFAULT_REFERENCE_PATH when none is given.
    """
    target = outfile if outfile else DEFAULT_REFERENCE_PATH
    payload = json.dumps(all_repodata(mirror_dir), indent=2, sort_keys=True)
    # Guarantee a trailing newline so the file plays well with line tools.
    if not payload.endswith("\n"):
        payload += "\n"
    with open(target, "w") as fh:
        fh.write(payload)


def read_reference(infile=None):
    """
    Load the "reference file" (JSON) from *infile* — defaulting to
    DEFAULT_REFERENCE_PATH — and return its content as a dictionary.

    Raises NoReferenceError when the file does not exist.
    """
    path = infile if infile else DEFAULT_REFERENCE_PATH
    try:
        with open(path) as fh:
            return json.load(fh)
    except FileNotFoundError as exc:
        # Wrap so callers can distinguish a missing reference file.
        raise NoReferenceError(exc)


def get_updates(mirror_dir):
def get_updates(mirror_dir, infile=None):
"""
Compare the "reference file" to the actual the repository (all the
repodata.json files) and iterate the new and updates files in the
repository. That is, the files which need to go into the differential
tarball.
"""
d1 = read_reference()
if not infile:
infile = DEFAULT_REFERENCE_PATH
d1 = read_reference(infile)
d2 = all_repodata(mirror_dir)
for repo_path, index2 in d2.items():
index1 = d1.get(repo_path, {})
Expand All @@ -116,12 +123,21 @@ def get_updates(mirror_dir):
yield relpath(join(repo_path, fn), mirror_dir)


def tar_repo(mirror_dir, outfile="update.tar", verbose=False):
def tar_repo(
mirror_dir,
infile=None,
outfile=None,
verbose=False
):
"""
Write the so-called differential tarball, see get_updates().
"""
if not infile:
infile = DEFAULT_REFERENCE_PATH
if not outfile:
outfile = DEFAULT_UPDATE_PATH
t = tarfile.open(outfile, "w")
for f in get_updates(mirror_dir):
for f in get_updates(mirror_dir, infile):
if verbose:
print("adding: %s" % f)
t.add(join(mirror_dir, f), f)
Expand Down Expand Up @@ -151,6 +167,21 @@ def main():
"--reference", action="store_true", help="create a reference point file"
)

p.add_argument(
"-o",
"--outfile",
action="store",
help="Path to references json file when using --reference, "
"or update tarfile when using --create",
)

p.add_argument(
"-i",
"--infile",
action="store",
help="Path to specify references json file when using --create or --show"
)

p.add_argument(
"--show",
action="store_true",
Expand Down Expand Up @@ -184,17 +215,42 @@ def main():

try:
if args.create:
tar_repo(mirror_dir, verbose=args.verbose)
if args.outfile:
outfile = args.outfile
else:
outfile = DEFAULT_UPDATE_PATH

if args.infile:
infile = args.infile
else:
infile = DEFAULT_REFERENCE_PATH

tar_repo(mirror_dir, infile, outfile, verbose=args.verbose)

elif args.verify:
verify_all_repos(mirror_dir)

elif args.show:
for path in get_updates(mirror_dir):
if args.infile:
infile = args.infile
else:
infile = DEFAULT_REFERENCE_PATH

if args.outfile:
p.error("--outfile not allowed with --show")

for path in get_updates(mirror_dir, infile):
print(path)

elif args.reference:
write_reference(mirror_dir)
if args.infile:
p.error("--infile not allowed with --reference")
if args.outfile:
outfile = args.outfile
else:
outfile = DEFAULT_REFERENCE_PATH

write_reference(mirror_dir, outfile)

else:
print("Nothing done.")
Expand All @@ -205,7 +261,7 @@ def main():
Error: no such file: %s
Please use the --reference option before creating a differential tarball.\
"""
% REFERENCE_PATH
% DEFAULT_REFERENCE_PATH
)


Expand Down
50 changes: 42 additions & 8 deletions diff-tar.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,59 @@ Create differential tarballs
This tools allows you to create differential tarballs of a (usually
mirrored) conda repository. The resulting tarball can be used to update
a copy of the mirror on a remote (air-gapped) system, without having to
copy the entire conda repository. The workflow is as follows:
copy the entire conda repository.

Usage:
------
Running `conda-diff-tar --help` will show the following output:

```
usage: conda-diff-tar [-h] [--create] [--reference] [-o OUTFILE] [-i INFILE]
[--show] [--verify] [-v] [--version]
[REPOSITORY]
create "differential" tarballs of a conda repository
positional arguments:
REPOSITORY path to repository directory
optional arguments:
-h, --help show this help message and exit
--create create differential tarball
--reference create a reference point file
-o OUTFILE, --outfile OUTFILE
Path to references json file when using --reference,
or update tarfile when using --create
-i INFILE, --infile INFILE
Path to specify references json file when using
--create or --show
--show show the files in respect to the latest reference
point file (which would be included in the
differential tarball)
--verify verify the mirror repository and exit
-v, --verbose
--version print version and exit
```

Example workflow:
-----------------

1. we assume that the remote and local repository are in sync
2. create a `reference.json` file of the local repository
2. create a `reference.json` file of the local repository with the `--reference` flag
3. update the local repository using `conda-mirror` or some other tools
4. create the "differential" tarball
4. create the "differential" tarball with the `--create` flag
5. move the differential tarball to the remote machine, and unpack it
6. now that the remote repository is up-to-date, we should create a new
`reference.json` on the local machine. That is, step 2
`reference.json` on the local machine. That is, repeat step 2


Notes:
------

The file `reference.json` is a collection of all `repodata.json`
files (`linux-64`, `win-32`, `noarch`, etc.) of the local repository.
The file `reference.json` (or whatever you named it) is a collection of all `repodata.json`
files (`linux-64`, `win-32`, `noarch`, etc.) in the local repository.
It is created in order to compare a future state of the repository to the
state of the repository when `reference.json` it was created.
state of the repository when `reference.json` was created.

The differential tarball contains files which either have been updated (such
as `repodata.json`) or new files (new conda packages). It is meant to be
Expand All @@ -32,7 +67,6 @@ unpacked on top of the existing mirror on the remote machine by:
# or by using tar's -C option from any directory
tar xf update.tar -C <repository>


Example:
--------

Expand Down
Loading

0 comments on commit b9e5923

Please sign in to comment.