From 32d188bf22f8c33f229044b9fd4fc674f9f396e6 Mon Sep 17 00:00:00 2001 From: Eric Dill Date: Tue, 17 May 2022 18:53:01 -0400 Subject: [PATCH 1/5] Add flag to only mirror up to N packages --- conda_mirror/conda_mirror.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/conda_mirror/conda_mirror.py b/conda_mirror/conda_mirror.py index 448b45c..6d90556 100644 --- a/conda_mirror/conda_mirror.py +++ b/conda_mirror/conda_mirror.py @@ -2,6 +2,7 @@ import bz2 import fnmatch import hashlib +import itertools import json import logging import multiprocessing @@ -387,6 +388,13 @@ def _make_arg_parser(): dest="show_progress", help="Do not display progress bars.", ) + ap.add_argument( + "--max-packages", + action="store", + type=int, + dest="max_packages", + help="Limit the total number of packages downloaded", + ) return ap @@ -499,6 +507,7 @@ def pdb_hook(exctype, value, traceback): "ssl_verify": args.ssl_verify, "max_retries": args.max_retries, "show_progress": args.show_progress, + "max_packages": args.max_packages, } @@ -610,7 +619,12 @@ def get_repodata(channel, platform, proxies=None, ssl_verify=None): channel=channel, platform=platform, file_name="repodata.json" ) - resp = requests.get(url, proxies=proxies, verify=ssl_verify).json() + resp = requests.get(url, proxies=proxies, verify=ssl_verify) + try: + resp.raise_for_status() + except requests.exceptions.HTTPError: + raise RuntimeError(f"platform {platform} for channel {channel} not found on anaconda.org") + resp = resp.json() info = resp.get("info", {}) packages = resp.get("packages", {}) # Patch the repodata.json so that all package info dicts contain a "subdir" @@ -897,6 +911,7 @@ def main( chunk_size: int = DEFAULT_CHUNK_SIZE, max_retries=100, show_progress: bool = True, + max_packages=None ): """ @@ -953,6 +968,9 @@ def main( default 100. show_progress: bool Show progress bar while downloading. True by default. + max_packages : int, optional + Maximum number of packages to mirror. If not set, will mirror all packages + in list Returns ------- @@ -992,6 +1010,7 @@ def main( 'size': 1960193, 'version': '8.5.18'} """ + logger.debug(f"Local values in main: {pformat(locals())}") # Steps: # 1. figure out blacklisted packages # 2. un-blacklist packages that are actually whitelisted @@ -1083,6 +1102,8 @@ def main( # mirror list local_packages = _list_conda_packages(local_directory) to_mirror = possible_packages_to_mirror - set(local_packages) + if max_packages is not None: + to_mirror = set(itertools.islice(to_mirror, max_packages)) logger.info("PACKAGES TO MIRROR") logger.info(pformat(sorted(to_mirror))) summary["to-mirror"].update(to_mirror) From 360a13fcda627983947e30d61c90587e77660326 Mon Sep 17 00:00:00 2001 From: Eric Dill Date: Tue, 17 May 2022 19:05:07 -0400 Subject: [PATCH 2/5] Add checkpointing every 100 packages --- conda_mirror/conda_mirror.py | 99 ++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 44 deletions(-) diff --git a/conda_mirror/conda_mirror.py b/conda_mirror/conda_mirror.py index 6d90556..58cf94d 100644 --- a/conda_mirror/conda_mirror.py +++ b/conda_mirror/conda_mirror.py @@ -1122,13 +1122,14 @@ def main( session = requests.Session() with tempfile.TemporaryDirectory(dir=temp_directory) as download_dir: logger.info("downloading to the tempdir %s", download_dir) - for package_name in tqdm( - sorted(to_mirror), - desc=platform, - unit="package", - leave=False, - disable=not show_progress, - ): + for package_name, package_counter in enumerate( + tqdm( + sorted(to_mirror), + desc=platform, + unit="package", + leave=False, + disable=not show_progress, + )): url = download_url.format( channel=channel, platform=platform, file_name=package_name ) @@ -1169,43 +1170,14 @@ def main( logger.exception("Unexpected error: %s. Aborting download.", ex) break - # validate all packages in the download directory - validation_results = _validate_packages( - packages, download_dir, num_threads=num_threads - ) - summary["validating-new"].update(validation_results) - logger.debug( - "Newly downloaded files at %s are %s", - download_dir, - pformat(os.listdir(download_dir)), - ) - - # 8. Use already downloaded repodata.json contents but prune it of - # packages we don't want - repodata = {"info": info, "packages": packages} - - # compute the packages that we have locally - packages_we_have = set(local_packages + _list_conda_packages(download_dir)) - # remake the packages dictionary with only the packages we have - # locally - repodata["packages"] = { - name: info - for name, info in repodata["packages"].items() - if name in packages_we_have - } - _write_repodata(download_dir, repodata) - - # move new conda packages - for f in _list_conda_packages(download_dir): - old_path = os.path.join(download_dir, f) - new_path = os.path.join(local_directory, f) - logger.info("moving %s to %s", old_path, new_path) - shutil.move(old_path, new_path) - - for f in ("repodata.json", "repodata.json.bz2"): - download_path = os.path.join(download_dir, f) - move_path = os.path.join(local_directory, f) - shutil.move(download_path, move_path) + if (package_counter+1) % 100 == 0: + # Every 100 packages, pause to validate and move packages + # If we dont do this then whenever an invocation is interrupted, nothing is saved. + # This serves as basically a checkpoint + _validate_and_move(packages, download_dir, num_threads, summary, info, local_packages, local_directory) + + # When finished with the loop, validate and move the remaining packages + _validate_and_move(packages, download_dir, num_threads, summary, info, local_packages, local_directory) # Also need to make a "noarch" channel or conda gets mad noarch_path = os.path.join(target_directory, "noarch") @@ -1216,6 +1188,45 @@ def main( return summary +def _validate_and_move(packages, download_dir, num_threads, summary, info, local_packages, local_directory): + # validate all packages in the download directory + validation_results = _validate_packages( + packages, download_dir, num_threads=num_threads + ) + summary["validating-new"].update(validation_results) + logger.debug( + "Newly downloaded files at %s are %s", + download_dir, + pformat(os.listdir(download_dir)), + ) + + # 8. Use already downloaded repodata.json contents but prune it of + # packages we don't want + repodata = {"info": info, "packages": packages} + + # compute the packages that we have locally + packages_we_have = set(local_packages + _list_conda_packages(download_dir)) + # remake the packages dictionary with only the packages we have + # locally + repodata["packages"] = { + name: info + for name, info in repodata["packages"].items() + if name in packages_we_have + } + _write_repodata(download_dir, repodata) + + # move new conda packages + for f in _list_conda_packages(download_dir): + old_path = os.path.join(download_dir, f) + new_path = os.path.join(local_directory, f) + logger.info("moving %s to %s", old_path, new_path) + shutil.move(old_path, new_path) + + for f in ("repodata.json", "repodata.json.bz2"): + download_path = os.path.join(download_dir, f) + move_path = os.path.join(local_directory, f) + shutil.move(download_path, move_path) + def _write_repodata(package_dir, repodata_dict): data = json.dumps(repodata_dict, indent=2, sort_keys=True) From 130ad83a2fb674a901425bc33cdd8d5d74f3751a Mon Sep 17 00:00:00 2001 From: Eric Dill Date: Tue, 17 May 2022 19:12:33 -0500 Subject: [PATCH 3/5] Updates from debugging on mirroring system --- conda_mirror/conda_mirror.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/conda_mirror/conda_mirror.py b/conda_mirror/conda_mirror.py index 58cf94d..16edc9a 100644 --- a/conda_mirror/conda_mirror.py +++ b/conda_mirror/conda_mirror.py @@ -1122,7 +1122,7 @@ def main( session = requests.Session() with tempfile.TemporaryDirectory(dir=temp_directory) as download_dir: logger.info("downloading to the tempdir %s", download_dir) - for package_name, package_counter in enumerate( + for package_counter, package_name in enumerate( tqdm( sorted(to_mirror), desc=platform, @@ -1170,14 +1170,16 @@ def main( logger.exception("Unexpected error: %s. Aborting download.", ex) break - if (package_counter+1) % 100 == 0: + if (package_counter+1) % 15 == 0: # Every 100 packages, pause to validate and move packages # If we dont do this then whenever an invocation is interrupted, nothing is saved. # This serves as basically a checkpoint _validate_and_move(packages, download_dir, num_threads, summary, info, local_packages, local_directory) + # After moving packages to their ultimate resting place, update the packages we have locally + local_packages = _list_conda_packages(local_directory) - # When finished with the loop, validate and move the remaining packages - _validate_and_move(packages, download_dir, num_threads, summary, info, local_packages, local_directory) + # When finished with the loop, validate and move the remaining packages + _validate_and_move(packages, download_dir, num_threads, summary, info, local_packages, local_directory) # Also need to make a "noarch" channel or conda gets mad noarch_path = os.path.join(target_directory, "noarch") From 5da0db76a449868a7f98d990953728ec88f3b8a4 Mon Sep 17 00:00:00 2001 From: Eric Dill Date: Wed, 18 May 2022 13:01:58 -0400 Subject: [PATCH 4/5] reformat with black --- conda_mirror/conda_mirror.py | 40 ++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/conda_mirror/conda_mirror.py b/conda_mirror/conda_mirror.py index 16edc9a..0ef707b 100644 --- a/conda_mirror/conda_mirror.py +++ b/conda_mirror/conda_mirror.py @@ -623,7 +623,9 @@ def get_repodata(channel, platform, proxies=None, ssl_verify=None): try: resp.raise_for_status() except requests.exceptions.HTTPError: - raise RuntimeError(f"platform {platform} for channel {channel} not found on anaconda.org") + raise RuntimeError( + f"platform {platform} for channel {channel} not found on anaconda.org" + ) resp = resp.json() info = resp.get("info", {}) packages = resp.get("packages", {}) @@ -911,7 +913,7 @@ def main( chunk_size: int = DEFAULT_CHUNK_SIZE, max_retries=100, show_progress: bool = True, - max_packages=None + max_packages=None, ): """ @@ -969,7 +971,7 @@ def main( show_progress: bool Show progress bar while downloading. True by default. max_packages : int, optional - Maximum number of packages to mirror. If not set, will mirror all packages + Maximum number of packages to mirror. If not set, will mirror all packages in list Returns @@ -1129,7 +1131,8 @@ def main( unit="package", leave=False, disable=not show_progress, - )): + ) + ): url = download_url.format( channel=channel, platform=platform, file_name=package_name ) @@ -1170,16 +1173,32 @@ def main( logger.exception("Unexpected error: %s. Aborting download.", ex) break - if (package_counter+1) % 15 == 0: + if (package_counter + 1) % 15 == 0: # Every 100 packages, pause to validate and move packages # If we dont do this then whenever an invocation is interrupted, nothing is saved. # This serves as basically a checkpoint - _validate_and_move(packages, download_dir, num_threads, summary, info, local_packages, local_directory) + _validate_and_move( + packages, + download_dir, + num_threads, + summary, + info, + local_packages, + local_directory, + ) # After moving packages to their ultimate resting place, update the packages we have locally local_packages = _list_conda_packages(local_directory) - + # When finished with the loop, validate and move the remaining packages - _validate_and_move(packages, download_dir, num_threads, summary, info, local_packages, local_directory) + _validate_and_move( + packages, + download_dir, + num_threads, + summary, + info, + local_packages, + local_directory, + ) # Also need to make a "noarch" channel or conda gets mad noarch_path = os.path.join(target_directory, "noarch") @@ -1190,7 +1209,10 @@ def main( return summary -def _validate_and_move(packages, download_dir, num_threads, summary, info, local_packages, local_directory): + +def _validate_and_move( + packages, download_dir, num_threads, summary, info, local_packages, local_directory +): # validate all packages in the download directory validation_results = _validate_packages( packages, download_dir, num_threads=num_threads From 60a205e7bb80661d636381c06f84f30d52b18681 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 27 Feb 2023 09:08:23 +0100 Subject: [PATCH 5/5] Update conda_mirror/conda_mirror.py --- conda_mirror/conda_mirror.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conda_mirror/conda_mirror.py b/conda_mirror/conda_mirror.py index 84868c8..73ec033 100644 --- a/conda_mirror/conda_mirror.py +++ b/conda_mirror/conda_mirror.py @@ -1211,7 +1211,8 @@ def main( local_packages, local_directory, ) - # After moving packages to their ultimate resting place, update the packages we have locally + # After moving packages to their ultimate resting place, + # update the packages we have locally local_packages = _list_conda_packages(local_directory) # When finished with the loop, validate and move the remaining packages