From 97d73da1fa0987136d9b55d249a44d057efccd04 Mon Sep 17 00:00:00 2001 From: Volodymyr Samotiy Date: Sat, 1 Dec 2018 01:56:02 +0200 Subject: [PATCH] [mlnx|ffb]: Add fast-fast flow in fast(warm)-reboot script (#390) * [mlnx|ffb] Add fast-fast flow in fast(warm)-reboot script Signed-off-by: Stepan Blyschak * [Mellanox|FFB]: Fix review comments * Change naming convention from "fast-fast" to "fastfast" Signed-off-by: Volodymyr Samotiy --- scripts/fast-reboot | 102 +++++++++++++++++++++++++++++++++++++++----- show/mlnx.py | 75 ++++++++++++++++++++++++++++++-- 2 files changed, 163 insertions(+), 14 deletions(-) diff --git a/scripts/fast-reboot b/scripts/fast-reboot index 8e2a94b95289..434412bde85e 100755 --- a/scripts/fast-reboot +++ b/scripts/fast-reboot @@ -3,9 +3,10 @@ REBOOT_USER=$(logname) REBOOT_TIME=$(date) REBOOT_CAUSE_FILE="/var/cache/sonic/reboot-cause.txt" -REBOOT_TYPE=$(basename $0) WARM_DIR=/host/warmboot REDIS_FILE=dump.rdb +REBOOT_SCRIPT_NAME=$(basename $0) +REBOOT_TYPE="${REBOOT_SCRIPT_NAME}" # Check root privileges if [[ "$EUID" -ne 0 ]] @@ -14,6 +15,8 @@ then exit 1 fi +sonic_asic_type=$(sonic-cfggen -y /etc/sonic/sonic_version.yml -v asic_type) + function clear_warm_boot() { config warm_restart disable || /bin/true @@ -25,6 +28,19 @@ function clear_warm_boot() fi } +function cleanup_except_table() +{ + local REDIS_DB_NUMBER="$1" + local TABLE_PREFIX="$2" + redis-cli -n "${REDIS_DB_NUMBER}" eval " + for _, k in ipairs(redis.call('keys', '*')) do + if not string.match(k, '${TABLE_PREFIX}') then + redis.call('del', k) + end + end + " 0 +} + function initialize_pre_shutdown() { TABLE="WARM_RESTART_TABLE|warm-shutdown" @@ -86,9 +102,27 @@ case "$REBOOT_TYPE" in BOOT_TYPE_ARG=$REBOOT_TYPE ;; "warm-reboot") - BOOT_TYPE_ARG="warm" - trap clear_warm_boot EXIT HUP INT QUIT TERM KILL ABRT ALRM - config warm_restart enable system + if [[ "$sonic_asic_type" == "mellanox" ]]; then + REBOOT_TYPE="fastfast-reboot" + BOOT_TYPE_ARG="fastfast" + # source mlnx-ffb.sh file 
with + # functions to check ISSU upgrade/do ISSU start + source mlnx-ffb.sh + + trap clear_warm_boot EXIT HUP INT QUIT TERM KILL ABRT ALRM + + # Set warm reboot flag for some components. + # In fastfast boot flow, only APPL layer dockers + # are enabled to perform warm restart + config warm_restart disable system + config warm_restart disable swss + config warm_restart enable bgp + config warm_restart enable teamd + else + BOOT_TYPE_ARG="warm" + trap clear_warm_boot EXIT HUP INT QUIT TERM KILL ABRT ALRM + config warm_restart enable system + fi ;; *) echo "Not supported reboot type: $REBOOT_TYPE" >&2 @@ -118,11 +152,22 @@ else fi INITRD=$(echo $KERNEL_IMAGE | sed 's/vmlinuz/initrd.img/g') -sonic_asic_type=$(sonic-cfggen -y /etc/sonic/sonic_version.yml -v asic_type) - # Install new FW for mellanox platforms before control plane goes down # So on boot switch will not spend time to upgrade FW increasing the CP downtime if [[ "$sonic_asic_type" == "mellanox" ]]; then + + if [[ "$REBOOT_TYPE" = "fastfast-reboot" ]]; then + check_issu_enabled || { + echo "Warm reboot is not supported by this HWSKU" + exit 1 + } + + check_sdk_upgrade || { + echo "Warm reboot is not supported" + exit 1 + } + fi + echo "Prepare MLNX ASIC to ${REBOOT_TYPE}: install new FW if required" MLNX_EXIT_SUCCESS="0" @@ -136,12 +181,20 @@ if [[ "$sonic_asic_type" == "mellanox" ]]; then echo "Failed to burn MLNX FW: errno=${MLNX_EXIT_CODE}" exit "${MLNX_EXIT_ERROR}" fi + + if [[ "$REBOOT_TYPE" = "fastfast-reboot" ]]; then + issu_start || { + echo "ISSU start failed" + echo "Cold reboot may be requiered to recover" + exit 1 + } + fi fi # Load kernel into the memory /sbin/kexec -l "$KERNEL_IMAGE" --initrd="$INITRD" --append="$BOOT_OPTIONS" -if [[ "$REBOOT_TYPE" = "fast-reboot" ]]; then +if [[ "$REBOOT_TYPE" = "fast-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then # Dump the ARP and FDB tables to files also as default routes for both IPv4 and IPv6 # into /host/fast-reboot mkdir -p /host/fast-reboot @@ 
-180,7 +233,28 @@ fi # Kill swss dockers docker kill swss -# Pre-shutdown syncd and stop teamd gracefully + +# Warm reboot: dump state to host disk +if [[ "$REBOOT_TYPE" = "fastfast-reboot" ]]; then + mkdir -p $WARM_DIR + + # Dump route table from APPL DB. + # This route table will be used by fpmsyncd + # reconciliation logic + cleanup_except_table 0 'ROUTE_TABLE' + cleanup_except_table 4 'WARM_RESTART_TABLE' + cleanup_except_table 6 'WARM_RESTART_TABLE' + + redis-cli -n 1 FLUSHDB + redis-cli -n 2 FLUSHDB + redis-cli -n 5 FLUSHDB + + redis-cli save + docker cp database:/var/lib/redis/$REDIS_FILE $WARM_DIR + docker exec -i database rm /var/lib/redis/$REDIS_FILE +fi + +# Pre-shutdown syncd if [[ "$REBOOT_TYPE" = "warm-reboot" ]]; then initialize_pre_shutdown @@ -189,7 +263,10 @@ if [[ "$REBOOT_TYPE" = "warm-reboot" ]]; then wait_for_pre_shutdown_complete_or_fail backup_datebase +fi +# Stop teamd gracefully +if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then # Send USR1 signal to all teamd instances to stop them # It will prepare teamd for warm-reboot # Note: We must send USR1 signal before syncd, because it will send the last packet through CPU port @@ -197,7 +274,12 @@ if [[ "$REBOOT_TYPE" = "warm-reboot" ]]; then fi # syncd service stop is capable of handling both warm/fast/cold shutdown -systemctl stop syncd +if [[ "$sonic_asic_type" = "mellanox" ]]; then + docker kill syncd +else + # syncd service stop is capable of handling both warm/fast/cold shutdown + systemctl stop syncd +fi # Kill other containers to make the reboot faster docker ps -q | xargs docker kill > /dev/null @@ -223,7 +305,7 @@ fi # Update the reboot cause file to reflect that user issued this script # Upon next boot, the contents of this file will be used to determine the # cause of the previous reboot -echo "User issued '${REBOOT_TYPE}' command [User: ${REBOOT_USER}, Time: ${REBOOT_TIME}]" > ${REBOOT_CAUSE_FILE} +echo "User issued '${REBOOT_SCRIPT_NAME}' command 
[User: ${REBOOT_USER}, Time: ${REBOOT_TIME}]" > ${REBOOT_CAUSE_FILE} # Wait until all buffers synced with disk sync diff --git a/show/mlnx.py b/show/mlnx.py index e741a8864c37..aff2da0b85f4 100644 --- a/show/mlnx.py +++ b/show/mlnx.py @@ -9,6 +9,9 @@ import sys import subprocess import click + import sonic_platform + from swsssdk import ConfigDBConnector + import xml.etree.ElementTree as ET except ImportError as e: raise ImportError("%s - required module not found" % str(e)) @@ -18,9 +21,12 @@ SNIFFER_CONF_FILE_IN_CONTAINER = CONTAINER_NAME + ':' + SNIFFER_CONF_FILE TMP_SNIFFER_CONF_FILE = '/tmp/tmp.conf' +HWSKU_PATH = '/usr/share/sonic/hwsku/' + +SAI_PROFILE_DELIMITER = '=' # run command -def run_command(command, display_cmd=False, ignore_error=False): +def run_command(command, display_cmd=False, ignore_error=False, print_to_console=True): """Run bash command and print output to stdout """ if display_cmd == True: @@ -29,12 +35,14 @@ def run_command(command, display_cmd=False, ignore_error=False): proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) (out, err) = proc.communicate() - if len(out) > 0: + if len(out) > 0 and print_to_console: click.echo(out) if proc.returncode != 0 and not ignore_error: sys.exit(proc.returncode) + return out, err + # 'mlnx' group @click.group() @@ -61,8 +69,57 @@ def sniffer_status_get(env_variable_name): return enabled -@mlnx.command() -def sniffer(): +def is_issu_status_enabled(): + """ This function parses the SAI XML profile used for mlnx to + get whether ISSU is enabled or disabled + @return: True/False + """ + + # ISSU disabled if node in XML config wasn't found + issu_enabled = False + + # Get the SAI XML path from sai.profile + sai_profile_path = '/{}/sai.profile'.format(HWSKU_PATH) + + DOCKER_CAT_COMMAND = 'docker exec -ti {container_name} cat {path}' + + command = DOCKER_CAT_COMMAND.format(container_name=CONTAINER_NAME, path=sai_profile_path) + sai_profile_content, _ = run_command(command, 
print_to_console=False) + + sai_profile_kvs = {} + + for line in sai_profile_content.split('\n'): + if not SAI_PROFILE_DELIMITER in line: + continue + key, value = line.split(SAI_PROFILE_DELIMITER) + sai_profile_kvs[key] = value.strip() + + try: + sai_xml_path = sai_profile_kvs['SAI_INIT_CONFIG_FILE'] + except KeyError: + print >> sys.stderr, "Failed to get SAI XML from sai profile" + sys.exit(1) + + # Get ISSU from SAI XML + command = DOCKER_CAT_COMMAND.format(container_name=CONTAINER_NAME, path=sai_xml_path) + sai_xml_content, _ = run_command(command, print_to_console=False) + + try: + root = ET.fromstring(sai_xml_content) + except ET.ParseError: + print >> sys.stderr, "Failed to parse SAI xml" + sys.exit(1) + + el = root.find('platform_info').find('issu-enabled') + + if el is not None: + issu_enabled = int(el.text) == 1 + + return issu_enabled + + +@mlnx.command('sniffer') +def sniffer_status(): """ Show sniffer status """ components = ['sdk'] env_variable_strings = [ENV_VARIABLE_SX_SNIFFER] @@ -72,3 +129,13 @@ def sniffer(): print components[index] + " sniffer is enabled" else: print components[index] + " sniffer is disabled" + + +@mlnx.command('issu') +def issu_status(): + """ Show ISSU status """ + + res = is_issu_status_enabled() + + print 'ISSU is enabled' if res else 'ISSU is disabled' +