src/cmd-buildfetch

#!/usr/bin/env python3
# NOTE: PYTHONUNBUFFERED is set in the entrypoint for unbuffered output
#
# Fetches the bare minimum from external servers to create the next build. May
# require configured AWS credentials if bucket and objects are not public.

import argparse
import os
import subprocess
import sys
import requests
import shutil
from tenacity import retry, retry_if_exception_type

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from cosalib.builds import Builds, BUILDFILES
from cosalib.cmdlib import (
    get_basearch,
    load_json,
    retry_callback,
    retry_stop,
    rm_allow_noent,
    sha256sum_file)
from cosalib.s3 import S3

retry_requests_exception = (retry_if_exception_type(requests.Timeout) |
                            retry_if_exception_type(requests.ReadTimeout) |
                            retry_if_exception_type(requests.ConnectTimeout) |
                            retry_if_exception_type(requests.ConnectionError))

FCOS_STREAMS_URL = "https://builds.coreos.fedoraproject.org/prod/streams"

# https://github.com/coreos/fedora-coreos-tracker/blob/main/Design.md#version-numbers
FCOS_VERSION_STREAM_MAPPING = {
    '1':  'next',
    '2':  'testing',
    '3':  'stable',
    '10': 'next-devel',
    '20': 'testing-devel',
    '91': 'rawhide',
    '92': 'branched',
    '93': 'bodhi-updates-testing',
    '94': 'bodhi-updates',
}


def get_stream_for_build(build):
    z = build.split('.')[2]
    return FCOS_VERSION_STREAM_MAPPING[z]


def main():
    args = parse_args()
    if args.aws_config_file:
        os.environ["AWS_CONFIG_FILE"] = args.aws_config_file

    if not args.url:
        if args.build:
            stream = get_stream_for_build(args.build)
            if args.stream and stream != args.stream:
                raise Exception("A conflicting --build and --stream were provided")
        else:
            stream = args.stream or 'testing-devel'
        args.url = f'{FCOS_STREAMS_URL}/{stream}/builds'
    if args.url.startswith("s3://"):
        fetcher = S3Fetcher(args.url)
    elif args.url.startswith("http://") or args.url.startswith("https://"):
        fetcher = HTTPFetcher(args.url)
    elif args.url.startswith("file://") or args.url.startswith("/"):
        fetcher = LocalFetcher(args.url)
    else:
        raise Exception("Invalid scheme: only file://, s3://, and http(s):// supported")

    builds = None
    if fetcher.exists('builds.json'):
        # Check to see if local builds.json has been modified with local builds
        if os.path.isfile(BUILDFILES['sourcedata']) \
           and os.path.isfile(BUILDFILES['list']):
            # If we have local builds, don't overwrite that by default.
            havelocalchanges = subprocess.call(['cmp', BUILDFILES['sourcedata'], BUILDFILES['list']],
                                               stdout=subprocess.DEVNULL,
                                               stderr=subprocess.DEVNULL) != 0
            if havelocalchanges:
                if args.force:
                    print(f"Detected local modifications to {BUILDFILES['list']}")
                    print("Forcing update as requested by --force")
                else:
                    raise SystemExit(f"{BUILDFILES['list']} modified locally. "
                                     "Run with --force to overwrite local changes")

        # Download builds.json to local builds.json
        fetcher.fetch('builds.json', dest=BUILDFILES['list'])
        print(f"Updated {BUILDFILES['list']}")
        # Record the origin and original state
        with open(BUILDFILES['sourceurl'], 'w') as f:
            f.write(args.url + '\n')
        # Copy the builds.json to the local sourcedata file so we can
        # detect local modifications.
        subprocess.check_call(['cp-reflink', BUILDFILES['list'], BUILDFILES['sourcedata']])
        builds = Builds()
    else:
        print("No builds.json found")
        return

    if builds.is_empty():
        print("Remote has empty build list!")
        return

    buildid = args.build or builds.get_latest()
    # Let's handle args.arch. If the user didn't specify an arch
    # then operate on the current arch of the system. If the user
    # gave us the special value of --arch=all then download the
    # build metadata for all architectures related to this build.
    if len(args.arch) == 0:
        arches = [get_basearch()]
    elif args.arch == ['all']:
        arches = builds.get_build_arches(buildid)
    else:
        arches = args.arch

    # If someone passed in --find-build-for-arch they want us to
    # find the most recent build that was successful for $arch.
    # Since this may be a different build ID for the different
    # given possible architectures we'll limit the use of this option
    # to a single given arch.
    if args.find_build_for_arch:
        if len(arches) != 1:
            raise Exception("Must provide single arch when finding build for architecture")
        buildid = builds.get_latest_for_arch(arches[0])
        if buildid is None:
            print(f"No builds for arch {arches[0]} found in the history")
            return

    for arch in arches:
        # If the architecture doesn't exist then assume there were
        # no builds for this architecture yet, which can only happen
        # if someone passed in architecture value(s)
        if arch not in builds.get_build_arches(buildid):
            print(f"No {arch} artifacts for build {buildid}")
            continue

        builddir = builds.get_build_dir(buildid, arch)
        os.makedirs(builddir, exist_ok=True)

        # trim out the leading builds/
        assert builddir.startswith("builds/")
        builddir = builddir[len("builds/"):]

        default_objects = ['meta.json', 'commitmeta.json', 'ostree-commit-object']
        objects = default_objects + args.file
        for f in objects:
            fetcher.fetch(f'{builddir}/{f}')

        buildmeta = load_json(f'builds/{builddir}/meta.json')

        # dedupe any possible duplicates in the list
        args.artifact = set(args.artifact)

        if 'all' in args.artifact:
            artifacts = buildmeta['images'].keys()
        else:
            artifacts = args.artifact

        for imgname in artifacts:
            if imgname not in buildmeta['images']:
                raise Exception(f"Requested artifact {imgname} not available in build")
            img = buildmeta['images'][imgname]
            imgpath = img['path']
            fetcher.fetch(f'{builddir}/{imgpath}')
            sha256sum = sha256sum_file(f'builds/{builddir}/{imgpath}')
            if img['sha256'] != sha256sum:
                print(f"Calcluated sha256sum: {sha256sum}")
                print(f"Expected  sha256sum: {img['sha256']}")
                raise Exception(f"Downloaded checksum for {imgpath} does not match expected")

        # also nuke the any local matching OStree ref, since we want to build on
        # top of this new one
        if 'ref' in buildmeta and os.path.isdir('tmp/repo'):
            subprocess.check_call(['ostree', 'refs', '--repo', 'tmp/repo',
                                   '--delete', buildmeta['ref']],
                                  stdout=subprocess.DEVNULL)

    # and finally the symlink
    if args.build is None or args.build == builds.get_latest():
        rm_allow_noent('builds/latest')
        os.symlink(buildid, 'builds/latest')


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", metavar='URL', default="",
                        help="URL from which to fetch metadata")
    parser.add_argument("--stream", metavar='STREAM', action='store',
                        help="stream from which to fetch metadata")
    parser.add_argument("-b", "--build", action='store',
                        help="Fetch specified build instead of latest")
    parser.add_argument("--force", action='store_true',
                        help="Assuming local changes, force update {BUILDFILES['list']}")
    parser.add_argument("--arch", default=[], action='append',
                        help="the target architecture(s)")
    parser.add_argument("--artifact", default=[], action='append',
                        help="Fetch given image artifact(s)", metavar="ARTIFACT")
    parser.add_argument("--file", default=[], action='append',
                        help="Fetch given non-artifact file(s)")
    parser.add_argument("--aws-config-file", metavar='CONFIG', default="",
                        help="Path to AWS config file")
    parser.add_argument("--find-build-for-arch", action='store_true',
                        help="Traverse build history to find latest for given architecture")
    return parser.parse_args()


class Fetcher(object):

    def __init__(self, url_base):
        self.url_base = url_base

    def fetch(self, path, dest=None):
        # if no specific dest given, assume it's a path under builds/
        if dest is None:
            dest = f'builds/{path}'
        # NB: `urllib.parse.urljoin()` does not do what one thinks it does.
        # Using `os.path.join()` is a hack, but eh... we're not planning to run
        # on Windows anytime soon.
        url = os.path.join(self.url_base, path)
        print(f"Fetching: {url}")
        # ensure the dir for dest file exists
        # otherwise s3 download_file won't be able to write temp file
        os.makedirs(os.path.dirname(dest), exist_ok=True)
        self.fetch_impl(url, dest)
        return dest

    def fetch_impl(self, url, dest):
        raise NotImplementedError

    def exists_impl(self, url):
        raise NotImplementedError

    def fetch_json(self, path):
        return load_json(self.fetch(path))

    def exists(self, path):
        url = os.path.join(self.url_base, path)
        return self.exists_impl(url)


class HTTPFetcher(Fetcher):

    def __init__(self, url_base):
        super().__init__(url_base)

    @retry(stop=retry_stop, retry=retry_requests_exception, before_sleep=retry_callback)
    def fetch_impl(self, url, dest):
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(dest, mode='wb') as f:
                # Stream file data from the network to the file in these size chunks.
                # 30 MiB is somewhat arbitrary but should be easily supported on most systems
                # without transfer slowdown.
                max_chunk_size = 30 * 1024 * 1024

                # If the HTTP headers have encoded the file transfer as chunks already, respect those instead
                # of our hardcoded max size.
                if 'chunked' in r.headers.get('transfer-encoding', list()):
                    max_chunk_size = None

                # With stream=True above, read data from the network and write it to the file in chunks
                # rather than trying to put it all in RAM and then write it all to disk.
                # For large ociarchive files on lower-RAM systems, this can cause a crash, and the performance
                # trade-off for chunking it is usually negligible unless the files are extra huge, the disk IO cache is
                # very small, and the network pipe is very large.
                for chunk in r.iter_content(chunk_size=max_chunk_size):
                    f.write(chunk)

    @retry(stop=retry_stop, retry=retry_requests_exception, before_sleep=retry_callback)
    def exists_impl(self, url):
        with requests.head(url) as r:
            if r.status_code == 200:
                return True
            # just treat 403 as ENOENT too; this is common for APIs to do (like
            # AWS) and we don't support HTTP basic auth here anyway
            if r.status_code in [404, 403]:
                return False
            raise Exception(f"Received rc {r.status_code} for {url}")


class S3Fetcher(Fetcher):
    def __init__(self, url_base):
        super().__init__(url_base)
        self.s3_client = S3()

    def fetch_impl(self, url, dest):
        assert url.startswith("s3://")
        bucket, key = url[len("s3://"):].split('/', 1)
        # this function does not need to be retried with the decorator as download_file would
        # retry automatically based on s3config settings
        self.s3_client.download_file(bucket, key, dest)

    def exists_impl(self, url):
        assert url.startswith("s3://")
        bucket, key = url[len("s3://"):].split('/', 1)
        # sanity check that the bucket exists and we have access to it
        self.s3_client.head_bucket(bucket=bucket)
        return self.s3_client.head_object(bucket=bucket, key=key)


class LocalFetcher(Fetcher):

    def __init__(self, url_base):
        if url_base.startswith("file://"):
            url_base = url_base[len("file://"):]
        super().__init__(url_base)

    def fetch_impl(self, url, dest):
        shutil.copyfile(url, dest)

    def exists_impl(self, url):
        return os.path.exists(url)


if __name__ == '__main__':
    sys.exit(main())