From ea27e9e8bd0137ee290e9778db09519b5e47270e Mon Sep 17 00:00:00 2001 From: Fred Park Date: Wed, 27 Jun 2018 08:49:32 -0700 Subject: [PATCH] Support XFS filesystems in storage clusters - Allow mdadm-based RAID-0 arrays to expand (experimental) - Greatly expand remote fs guide with configuration/usage explanations - Fix blocking bug with fs commands - Resolves #219 --- convoy/batch.py | 5 + convoy/remotefs.py | 18 +- convoy/settings.py | 6 +- docs/14-batch-shipyard-configuration-jobs.md | 4 +- docs/15-batch-shipyard-configuration-fs.md | 18 +- docs/65-batch-shipyard-remote-fs.md | 254 +++++++++++++++++- docs/66-batch-shipyard-resource-monitoring.md | 4 +- docs/97-faq.md | 1 + schemas/fs.yaml | 1 + scripts/shipyard_remotefs_bootstrap.sh | 56 +++- scripts/shipyard_remotefs_stat.sh | 5 + shipyard.py | 1 - 12 files changed, 336 insertions(+), 37 deletions(-) diff --git a/convoy/batch.py b/convoy/batch.py index cf52c816..dbb73a4c 100644 --- a/convoy/batch.py +++ b/convoy/batch.py @@ -2980,6 +2980,7 @@ def egress_service_logs( ('waiting for {} log files to be uploaded; this may take ' 'some time, please be patient').format( resp.number_of_files_uploaded)) + count = 0 while True: blobs = blob_client.list_blobs( cont, prefix=resp.virtual_directory_name, @@ -2992,6 +2993,10 @@ def egress_service_logs( resp.virtual_directory_name, storage_settings.account)) break + count += 1 + if count > 150: + logger.error('exceeded wait timeout for log egress') + return time.sleep(2) if generate_sas: sas = storage.create_saskey( diff --git a/convoy/remotefs.py b/convoy/remotefs.py index c70ed6ad..f47f99f3 100644 --- a/convoy/remotefs.py +++ b/convoy/remotefs.py @@ -526,8 +526,8 @@ def create_storage_cluster( not settings.is_premium_storage_vm_size( rfs.storage_cluster.vm_size)): raise RuntimeError( - ('Premium storage requires a DS, DS_V2, FS, GS or LS ' - 'series vm_size instead of {}'.format( + ('Premium storage requires premium storage capable ' + 'vm_size instead of {}'.format( rfs.storage_cluster.vm_size))) # confirm before proceeding if not util.confirm_action( @@ -1034,13 +1034,12 @@ def expand_storage_cluster( # check vms vms = {} new_disk_count = 0 + mdadm_expand = False for i in range(rfs.storage_cluster.vm_count): # check if this vm filesystem supports expanding if (rfs.storage_cluster.vm_disk_map[i].filesystem != 'btrfs' and rfs.storage_cluster.vm_disk_map[i].raid_level == 0): - raise RuntimeError( - 'Cannot expand mdadm-based RAID-0 volumes. Please re-create ' - 'your storage cluster with btrfs using new disks.') + mdadm_expand = True vm_name = settings.generate_virtual_machine_name( rfs.storage_cluster, i) try: @@ -1098,6 +1097,13 @@ def expand_storage_cluster( logger.error( 'no new disks detected for storage cluster {}'.format(sc_id)) return False + if mdadm_expand: + logger.warning( + '**WARNING** cluster expansion is being performed on mdadm-based ' + 'RAID arrays. This feature is experimental and can take an ' + 'extremely long time. 
Any interruption or unrecoverable ' + 'failure can result in data loss.') + del mdadm_expand # confirm before proceeding if not util.confirm_action( config, 'expand storage cluster {}'.format(sc_id)): @@ -1150,7 +1156,7 @@ def expand_storage_cluster( return False logger.debug( 'waiting for disks to attach to virtual machines and expanding ' - 'the gluster volume, this may take a while') + 'the volume; please be patient as this can take a very long time') for offset in async_ops: premium, op = async_ops[offset] vm = op.result() diff --git a/convoy/settings.py b/convoy/settings.py index d1b90ad6..656f850c 100644 --- a/convoy/settings.py +++ b/convoy/settings.py @@ -4121,7 +4121,7 @@ def remotefs_settings(config, sc_id=None): ('All disks {} for vm {} are not specified in ' 'managed_disks:disk_names ({})').format( disk_array, vmkey, _disk_set)) - raid_level = _kv_read(vmd_conf[vmkey], 'raid_level', -1) + raid_level = _kv_read(vmd_conf[vmkey], 'raid_level', default=-1) if len(disk_array) == 1 and raid_level != -1: raise ValueError( 'Cannot specify a RAID-level with 1 disk in array') @@ -4131,10 +4131,6 @@ def remotefs_settings(config, sc_id=None): if raid_level != 0: raise ValueError('Unsupported RAID level {}'.format( raid_level)) - filesystem = vmd_conf[vmkey]['filesystem'] - if filesystem != 'btrfs' and not filesystem.startswith('ext'): - raise ValueError('Unsupported filesystem type {}'.format( - filesystem)) disk_map[int(vmkey)] = MappedVmDiskSettings( disk_array=disk_array, filesystem=vmd_conf[vmkey]['filesystem'], diff --git a/docs/14-batch-shipyard-configuration-jobs.md b/docs/14-batch-shipyard-configuration-jobs.md index 245dc46d..b12ef7bf 100644 --- a/docs/14-batch-shipyard-configuration-jobs.md +++ b/docs/14-batch-shipyard-configuration-jobs.md @@ -481,7 +481,9 @@ directory for the container execution is not explicitly set. The default is directory, you can pass the appropriate working directory parameter to the container runtime through either `additional_docker_run_options` or `additional_singularity_options`. A working directory option specified within -that property takes precedence over this option. +that property takes precedence over this option. Note that this option does +not work in `native` mode currently; `native` mode will always override this +option to `batch`. The required `tasks` property is an array of tasks to add to the job: diff --git a/docs/15-batch-shipyard-configuration-fs.md b/docs/15-batch-shipyard-configuration-fs.md index c8e06652..af2d1f6c 100644 --- a/docs/15-batch-shipyard-configuration-fs.md +++ b/docs/15-batch-shipyard-configuration-fs.md @@ -365,16 +365,16 @@ The number of entries in this map must match the `vm_count`. attach to this instance. These disks must be provisioned before creating the storage cluster. * (required) `filesystem` is the filesystem to use. Valid values are - `btrfs`, `ext4`, `ext3` and `ext2`. `btrfs` is generally stable for - RAID-0, with better features and data integrity protection. `btrfs` - also allows for RAID-0 expansion and is the only filesystem - compatible with the `fs cluster expand` command. + `btrfs`, `ext4`, `ext3`, `ext2`, and `xfs`. `btrfs` is generally + stable for RAID-0, with many features, data integrity + protection and also allows for direct RAID-0 expansion. `xfs` + is a high performance, stable and proven option. * (optional for single disk, required for multiple disks) `raid_level` is the RAID level to apply to the disks in the `disk_array`. The - only valid value for multiple disks is `0`. 
Note that if you wish - to expand the number of disks in the array in the future, you must - use `btrfs` as the filesystem. At least two disks per virtual - machine are required for RAID-0. + only valid value for multiple disks is `0`. At least two disks per + virtual machine are required for RAID-0. Note that if you wish + to expand the number of disks in the array in the future, it is + strongly recommended to use `btrfs` as the filesystem. * (optional) `prometheus` properties are to control if collectors for metrics to export to [Prometheus](https://prometheus.io/) monitoring are enabled. Note that all exporters do not have their ports exposed to the internet by @@ -392,7 +392,7 @@ cluster VMs. * (optional) `options` is a list of options to pass to the node exporter instance running on all nodes. The following collectors are force disabled, in addition to others disabled by - default: textfile, wifi, xfs, zfs. The nfs collector is enabled if + default: textfile, wifi, zfs. The nfs collector is enabled if the file server is NFS, automatically. ## Remote Filesystems with Batch Shipyard Guide diff --git a/docs/65-batch-shipyard-remote-fs.md b/docs/65-batch-shipyard-remote-fs.md index cb96e1c0..92fe1d83 100644 --- a/docs/65-batch-shipyard-remote-fs.md +++ b/docs/65-batch-shipyard-remote-fs.md @@ -42,8 +42,8 @@ status queries tailored to file server types and hassle-free SSH for administration * Support for cluster suspension (deallocation) and restart * Support for definining and managing multiple clusters simultaneously -* Support for [btrfs](https://en.wikipedia.org/wiki/Btrfs) along with -ext4, ext3 and ext2 filesystems +* Support for [btrfs](https://en.wikipedia.org/wiki/Btrfs), +[XFS](https://en.wikipedia.org/wiki/XFS), ext4, ext3 and ext2 filesystems * Automatic disk array construction via RAID-0 through btrfs or Linux software RAID (mdadm) * Consistent private IP address allocation per virtual machine and virtual @@ -168,12 +168,258 @@ set, however, if using a premium storage virtual machine size along with all premium disks, then you may qualify for [single instance SLA](https://azure.microsoft.com/support/legal/sla/virtual-machines). -## Configuration and Usage Documentation +## Configuration +In order to create storage clusters, there are a few configuration changes +that must be made to enable this feature. + +### Azure Active Directory Authentication Required +Azure Active Directory authentication is required to create storage clusters. +Additionally, if leveraging integration features with Batch pools, then the +virtual network shared between the storage cluster and the Batch pool must +be the same. + +Your service principal requires at least `Contributor` role permission in +order to create the resources required for the storage cluster. + +#### Credentials Configuration +The following is an example for Azure Active Directory authentication in the +credentials configuration. + +```yaml +credentials: + # management settings required with aad auth + management: + aad: + # valid aad settings (or at the global level) + subscription_id: # subscription id required + # ... other required settings +``` + +### RemoteFS Configuration Please see [this page](15-batch-shipyard-configuration-fs.md) for a full explanation of each remote filesystem and storage cluster configuration -option. Please see [this page](20-batch-shipyard-usage.md) for documentation +option. + +The following will step through and explain the major configuration +portions. 
The RemoteFS configuration file has four properties under the top-level `remote_fs` key: + +```yaml +remote_fs: + resource_group: # resource group for all resources, can be overridden + location: # Azure region for all storage cluster resources + managed_disks: + # disk settings + storage_clusters: + # storage cluster settings +``` + +It is important to specify a location that is appropriate for your storage +cluster and, if joining to a Batch pool, it must be within the same region. + +#### Managed Disks Configuration +The `managed_disks` section describes disks to be created for use with +storage clusters. + +```yaml + managed_disks: + resource_group: # optional resource group just for the disks + # premium disks have provisioned IOPS and can provide higher throughput + # and lower latency with consistency. If selecting premium disks, + # you must use a premium storage compatible vm_size. + premium: true + disk_size_gb: # size of the disk, please see the Azure Managed Disks docs + disk_names: + - # list of disk names +``` + +#### Storage Cluster Configuration +The `storage_clusters` section describes one or more storage clusters to +create and manage. + +```yaml + storage_clusters: + # unique name of the storage cluster, this is the "storage cluster id" + mystoragecluster: + resource_group: # optional resource group just for the storage cluster + hostname_prefix: # hostname prefix and prefix for all resources created + ssh: + # ssh settings + public_ip: + enabled: # true or false for enabling public ip. If public ip is not + # enabled, then it is only accessible via the private network. + static: # true or false if public ip should be static + virtual_network: + # virtual network settings. If joining to a Batch pool, ensure that + # the virtual network resides in the same region and subscription + # as the Batch account. It is recommended that the storage cluster + # is in a different subnet than that of the Batch pool. + network_security: + # network security rules, only "ssh" is required. All other settings + # are for external access and not needed for joining with Batch pools + # as traffic remains private/internal only for that scenario. + file_server: + type: # nfs or glusterfs + mountpoint: # the mountpoint on the storage cluster nodes + mount_options: + - # fstab mount options in list format + server_options: + glusterfs: # this section is only needed for "glusterfs" type + transport: tcp # tcp is only supported for now + volume_name: # name of the gluster volume + volume_type: # type of volume to create. This must be compatible + # with the number of bricks. + # other key:value pair tuning options can be specified here + nfs: # this section is only needed for "nfs" type + # key:value (where value is a list) mapping of /etc/exports options + samba: + # optional section, if samba server setup is required + vm_count: # 1 for nfs, 2+ for glusterfs + vm_size: # Azure VM size to use. This must be a premium storage compatible + # size if using premium managed disks. + fault_domains: # optional tuning for the number of fault domains + accelerated_networking: # true to enable accelerated networking + vm_disk_map: + # cardinal mapping of VMs to their disk arrays, e.g.: + '0': # note that this key must be a string + disk_array: + - # list of disks in this disk array + filesystem: # filesystem to use, see documentation on available kinds + raid_level: # this should be set to 0 if disk_array has more than 1 + # disk. If disk_array has only 1 disk, then this property + # should be omitted.
+ prometheus: + # optional monitoring settings + +### Batch Pool Integration +If you wish to use your storage cluster in conjunction with a Batch pool, then +you will need to modify the credentials, global, pool, and jobs configuration +files. + +#### Credentials Configuration +Azure Active Directory authentication for Batch is required for joining a +storage cluster with a [Batch pool](64-batch-shipyard-byovnet.md). + +```yaml +credentials: + # batch aad settings required if joining batch pools + batch: + aad: + # valid aad settings (or at the global level) + account_service_url: # valid batch service url + resource_group: # batch account resource group + management: + aad: + # valid aad settings (or at the global level) + subscription_id: # subscription id required + # ... other required settings +``` + +#### Global Configuration +You must specify the storage cluster under `global_resources` such that +bound Batch pools will provision the correct software to mount the storage +cluster. + +```yaml +# ... other global configuration settings +global_resources: + # ... other global resources settings + volumes: + shared_data_volumes: + mystoragecluster: # this name must match exactly with the storage cluster + # id from the RemoteFS configuration that you intend + # to link + volume_driver: storage_cluster + container_path: # the path to mount this storage cluster in + # containers when jobs/tasks execute + mount_options: # optional fstab mount options + bind_options: # optional bind options to the container, default is "rw" +``` + +#### Pool Configuration +The pool configuration file must specify a valid virtual network. Because +of this requirement, you must use Azure Active Directory authentication for +Batch. + +```yaml +pool_specification: + # ... other pool settings + virtual_network: + # virtual network settings must specify the same virtual network as the + # RemoteFS configuration. However, it is strongly recommended to have + # the Batch pool compute nodes reside in a different subnet. +``` + +#### Jobs Configuration +The jobs configuration must refer to the shared data volume so that +the volume is mounted into the container for the task or for all +tasks under a job. + +```yaml +job_specifications: + - id: # job id + shared_data_volumes: + # this name must match exactly with the global_resources + # shared_data_volumes name. If specified at the job level, then all + # tasks under the job will mount this volume. + - mystoragecluster + # ... other job settings + tasks: + - shared_data_volumes: + - # storage cluster can be specified for fine-grained control at + # a per task level + # ... other task settings +``` + +## Usage Documentation +The workflow for creating a storage cluster is to first create the managed +disks and then the storage cluster itself. Below is an example of command usage. + +```shell +# create managed disks +shipyard fs disks add + +# create storage cluster +shipyard fs cluster add +``` + +If there were provisioning errors during `fs cluster add` but the provisioning +had not yet reached the VM creation phase, you can remove the orphaned +resources with: + +```shell +# clean up a failed provisioning that did not reach VM creation +shipyard fs cluster del --generate-from-prefix +``` + +If any VMs were created and the provisioning failed after that, you can +delete normally (without `--generate-from-prefix`). + +Once the storage cluster is no longer needed, you can either suspend +it or delete it.
Note that suspending a glusterfs +storage cluster is considered experimental. + +```shell +# suspend a storage cluster +shipyard fs cluster suspend + +# restart a suspended storage cluster +shipyard fs cluster start + +# delete a storage cluster +shipyard fs cluster del +``` + +Please see [this page](20-batch-shipyard-usage.md) for detailed documentation on `fs` command usage. +### Usage with Batch Pools +If joining to a Batch pool, the storage cluster must be created first. +After which, commands such as `pool add` and `jobs add` should work +normally with the storage cluster mounted into containers if configuration +is correct. + ## Sample Recipes Sample recipes for RemoteFS storage clusters of NFS and GlusterFS types can be found in the diff --git a/docs/66-batch-shipyard-resource-monitoring.md b/docs/66-batch-shipyard-resource-monitoring.md index a378f499..247fd975 100644 --- a/docs/66-batch-shipyard-resource-monitoring.md +++ b/docs/66-batch-shipyard-resource-monitoring.md @@ -7,7 +7,7 @@ for monitoring Batch pools and RemoteFS clusters. ## Overview For many scenarios, it is often desirable to have visibility into a set of machines to gain insights through certain metrics over time. A global -monitoring resource is valuable to peer into per-machine and aggregate +monitoring resource is valuable to avail per-machine and aggregate metrics for Batch processing workloads as jobs are processed for measurements such as CPU, memory and network usage. As Batch Shipyard's execution model is based on containers, insights into container behavior is also desirable @@ -249,7 +249,7 @@ the credentials entirely within KeyVault. Please see the guide for more information. Additionally, Azure Active Directory authentication is required under -`management` and a valid `subscription_id` must be provided. Additionally, +`management` and a valid `subscription_id` must be provided. Moreover, if monitoring Batch pools, Batch authentication must be through Azure Active Directory for joining a [virtual network](64-batch-shipyard-byovnet.md). diff --git a/docs/97-faq.md b/docs/97-faq.md index fa364ad3..18f852ea 100644 --- a/docs/97-faq.md +++ b/docs/97-faq.md @@ -83,6 +83,7 @@ multi-instance, then it is strongly recommended to use `native` mode. Disadvantages of `native` mode are: * Singularity containers are not supported. +* `default_working_dir` in jobs cannot be changed from `batch`. * `input_data` of any kind at the task-level is not possible; you must either use `resource_files` or build your own solution. 
* `output_data` options are limited and egress to `azure_storage` Azure Files diff --git a/schemas/fs.yaml b/schemas/fs.yaml index d7d00231..623b87ff 100644 --- a/schemas/fs.yaml +++ b/schemas/fs.yaml @@ -204,6 +204,7 @@ mapping: - type: str filesystem: type: str + enum: ['btrfs', 'ext2', 'ext3', 'ext4', 'xfs'] raid_level: type: int prometheus: diff --git a/scripts/shipyard_remotefs_bootstrap.sh b/scripts/shipyard_remotefs_bootstrap.sh index bc03aa76..44a125de 100755 --- a/scripts/shipyard_remotefs_bootstrap.sh +++ b/scripts/shipyard_remotefs_bootstrap.sh @@ -336,7 +336,7 @@ install_and_start_node_exporter() { ne_port=${pneo[0]} pneo=("${pneo[@]:1}") cat << EOF > /etc/node_exporter.conf -OPTIONS="$nfs --no-collector.textfile --no-collector.wifi --no-collector.xfs --no-collector.zfs --web.listen-address=\":${ne_port}\" ${pneo[@]}" +OPTIONS="$nfs --no-collector.textfile --no-collector.wifi --no-collector.zfs --web.listen-address=\":${ne_port}\" ${pneo[@]}" EOF cat << 'EOF' > /etc/systemd/system/node-exporter.service [Unit] @@ -661,12 +661,32 @@ if [ "$raid_level" -ge 0 ]; then btrfs device add "${raid_array[@]}" $mountpath raid_resized=1 else - # add new block device first - echo "Adding devices ${raid_array[*]} to $target_md" - mdadm --add $target_md "${raid_array[@]}" - # grow the array - echo "Growing array $target_md to a total of $numdisks devices" - mdadm --grow --raid-devices="$numdisks" "$target_md" + # increase raid rebuild/resync/reshape speed + oldmin=$(cat /proc/sys/dev/raid/speed_limit_min) + oldmax=$(cat /proc/sys/dev/raid/speed_limit_max) + echo 100000 > /proc/sys/dev/raid/speed_limit_min + echo 500000 > /proc/sys/dev/raid/speed_limit_max + # add new block device and grow + echo "Growing array $target_md to a total of $numdisks devices: ${raid_array[*]}" + mdadm --grow "$target_md" --raid-devices="$numdisks" --add "${raid_array[@]}" + sleep 5 + mdadm --detail --scan + # wait until reshape completes + set +e + while : + do + if ! mdadm --detail --scan | grep "spares="; then + break + fi + sleep 10 + done + # ensure array is back to RAID-0 + if ! mdadm --detail "$target_md" | grep "Raid Level : raid0"; then + mdadm --grow --level 0 "$target_md" + fi + set -e + echo "$oldmin" > /proc/sys/dev/raid/speed_limit_min + echo "$oldmax" > /proc/sys/dev/raid/speed_limit_max raid_resized=1 fi fi @@ -675,7 +695,7 @@ if [ "$raid_level" -ge 0 ]; then btrfs filesystem show else cat /proc/mdstat - mdadm --detail $target_md + mdadm --detail "$target_md" fi # get uuid of first disk as target uuid if not populated if [ -z "$target_uuid" ]; then @@ -692,9 +712,23 @@ if [ $format_target -eq 1 ]; then echo "Target not specified for format" exit 1 fi + sleep 5 echo "Creating filesystem on $target_md" if [ "$filesystem" == "btrfs" ]; then mkfs.btrfs "$target_md" + elif [ "$filesystem" == "xfs" ]; then + mdadm --detail --scan + set +e + # let mkfs.xfs automatically select the appropriate su/sw + if ! 
mkfs.xfs "$target_md"; then + # mkfs.xfs can sometimes fail because it can't query the + # underlying device, try to re-assemble and retry format + set -e + mdadm --verbose --assemble "$target_md" "${raid_array[@]}" + mdadm --detail --scan + mkfs.xfs "$target_md" + fi + set -e elif [[ $filesystem == ext* ]]; then mkfs."${filesystem}" -m 0 "$target_md" else @@ -746,7 +780,9 @@ if [ $attach_disks -eq 0 ]; then if [ "$filesystem" == "btrfs" ]; then # also enable ssd optimizations on btrfs mount_options+=",nobarrier,ssd" - else + elif [ "$filesystem" == "xfs" ]; then + mount_options+=",nobarrier" + elif [[ $filesystem == ext* ]]; then mount_options+=",barrier=0" fi else @@ -791,6 +827,8 @@ if [ $raid_resized -eq 1 ]; then btrfs filesystem balance $mountpath echo "Rebalance of btrfs on $mountpath complete." fi + elif [ "$filesystem" == "xfs" ]; then + xfs_growfs $mountpath elif [[ $filesystem == ext* ]]; then resize2fs $mountpath else diff --git a/scripts/shipyard_remotefs_stat.sh b/scripts/shipyard_remotefs_stat.sh index f37f6368..58c5045f 100755 --- a/scripts/shipyard_remotefs_stat.sh +++ b/scripts/shipyard_remotefs_stat.sh @@ -141,6 +141,11 @@ if [ "$raid_level" -ge 0 ]; then fi echo "mdadm detail:" mdadm --detail "${target[0]}" + if [ "$filesystem" == "xfs" ]; then + echo "" + echo "xfs detail:" + xfs_info "$mountpath" + fi fi fi diff --git a/shipyard.py b/shipyard.py index d3c85dc9..6076d957 100755 --- a/shipyard.py +++ b/shipyard.py @@ -152,7 +152,6 @@ def initialize_for_fs(self): skip_global_config=False, skip_pool_config=True, skip_monitor_config=True, fs_storage=True) self._ensure_credentials_section('storage') - self._ensure_credentials_section('remote_fs') _, self.resource_client, self.compute_client, self.network_client, \ self.storage_mgmt_client, _, _ = \ convoy.clients.create_all_clients(self)