diff --git a/scripts/generate_dump b/scripts/generate_dump index bc33c0bcc8..2d19b59374 100755 --- a/scripts/generate_dump +++ b/scripts/generate_dump @@ -38,6 +38,9 @@ HOME=${HOME:-/root} USER=${USER:-root} TIMEOUT_MIN="5" SKIP_BCMCMD=0 +SAVE_STDERR=true +RETURN_CODE=0 + handle_signal() { @@ -47,7 +50,15 @@ handle_signal() } trap 'handle_signal' SIGINT +handle_error() { + if [ "$1" != "0" ]; then + echo "ERR: RC:-$1 observed on line $2" >&2 + RETURN_CODE=1 + fi +} + save_bcmcmd() { + trap 'handle_error $? $LINENO' ERR local start_t=$(date +%s%3N) local end_t=0 local cmd="$1" @@ -56,7 +67,9 @@ save_bcmcmd() { local do_gzip=${3:-false} local tarpath="${BASE}/dump/$filename" local timeout_cmd="timeout --foreground ${TIMEOUT_MIN}m" - [ ! -d $LOGDIR ] && $MKDIR $V -p $LOGDIR + if [ ! -d $LOGDIR ]; then + $MKDIR $V -p $LOGDIR + fi if [ $SKIP_BCMCMD -eq 1 ]; then echo "Skip $cmd" @@ -69,14 +82,15 @@ save_bcmcmd() { if $NOOP; then echo "${timeout_cmd} $cmd &> '${filepath}'" else - eval "${timeout_cmd} $cmd" &> "${filepath}" - ret=$? + ret=0 + eval "${timeout_cmd} $cmd" &> "${filepath}" || ret=$? if [ $ret -ne 0 ]; then if [ $ret -eq 124 ]; then echo "Command: $cmd timedout after ${TIMEOUT_MIN} minutes." else - grep "polling socket timeout: Success" ${filepath} &>/dev/null - if [ $? -eq 0 ]; then + RC=0 + grep "polling socket timeout: Success" ${filepath} &>/dev/null || RC=$? + if [ $RC -eq 0 ]; then echo "bcmcmd command timeout. Setting SKIP_BCMCMD to true ..." SKIP_BCMCMD=1 fi @@ -107,6 +121,7 @@ save_bcmcmd() { # None ############################################################################### save_bcmcmd_all_ns() { + trap 'handle_error $? $LINENO' ERR local do_gzip=${3:-false} if [[ ( "$NUM_ASICS" > 1 ) ]]; then @@ -139,26 +154,29 @@ save_bcmcmd_all_ns() { # cmd: The command to run. Make sure that arguments with spaces have quotes # filename: the filename to save the output as in $BASE/dump # do_gzip: (OPTIONAL) true or false. 
Should the output be gzipped -# save_stderr: (OPTIONAL) true or false. Should the stderr output be saved # Returns: # None ############################################################################### save_cmd() { + trap 'handle_error $? $LINENO' ERR local start_t=$(date +%s%3N) local end_t=0 local cmd="$1" local filename=$2 local filepath="${LOGDIR}/$filename" local do_gzip=${3:-false} - local save_stderr=${4:-true} local tarpath="${BASE}/dump/$filename" local timeout_cmd="timeout --foreground ${TIMEOUT_MIN}m" - local redirect="&>" - [ ! -d $LOGDIR ] && $MKDIR $V -p $LOGDIR + local redirect='&>' + local redirect_eval='2>&1' + if [ ! -d $LOGDIR ]; then + $MKDIR $V -p $LOGDIR + fi - if ! $save_stderr + if ! $SAVE_STDERR then redirect=">" + redirect_eval="" fi # eval required here to re-evaluate the $cmd properly at runtime @@ -168,12 +186,13 @@ save_cmd() { if $do_gzip; then tarpath="${tarpath}.gz" filepath="${filepath}.gz" - local cmds="$cmd 2>&1 | gzip -c > '${filepath}'" + local cmds="$cmd $redirect_eval | gzip -c > '${filepath}'" if $NOOP; then echo "${timeout_cmd} bash -c \"${cmds}\"" else - eval "${timeout_cmd} bash -c \"${cmds}\"" - if [ $? -ne 0 ]; then + RC=0 + eval "${timeout_cmd} bash -c \"${cmds}\"" || RC=$? + if [ $RC -ne 0 ]; then echo "Command: $cmds timedout after ${TIMEOUT_MIN} minutes." fi fi @@ -181,8 +200,9 @@ save_cmd() { if $NOOP; then echo "${timeout_cmd} $cmd $redirect '$filepath'" else - eval "${timeout_cmd} $cmd" "$redirect" "$filepath" - if [ $? -ne 0 ]; then + RC=0 + eval "${timeout_cmd} $cmd" "$redirect" "$filepath" || RC=$? + if [ $RC -ne 0 ]; then echo "Command: $cmd timedout after ${TIMEOUT_MIN} minutes." fi fi @@ -207,6 +227,7 @@ save_cmd() { # None ############################################################################### save_cmd_all_ns() { + trap 'handle_error $? 
$LINENO' ERR local do_zip=${3:-false} # host or default namespace @@ -235,6 +256,7 @@ save_cmd_all_ns() { # None ############################################################################### copy_from_docker() { + trap 'handle_error $? $LINENO' ERR local start_t=$(date +%s%3N) local end_t=0 local docker=$1 @@ -242,19 +264,20 @@ copy_from_docker() { local dstpath=$3 local timeout_cmd="timeout --foreground ${TIMEOUT_MIN}m" - local touch_cmd="sudo docker exec -i ${docker} touch ${filename}" + local touch_cmd="sudo docker exec ${docker} touch ${filename}" local cp_cmd="sudo docker cp ${docker}:${filename} ${dstpath}" if $NOOP; then echo "${timeout_cmd} ${touch_cmd}" echo "${timeout_cmd} ${cp_cmd}" else - eval "${timeout_cmd} ${touch_cmd}" - if [ $? -ne 0 ]; then + RC=0 + eval "${timeout_cmd} ${touch_cmd}" || RC=$? + if [ $RC -ne 0 ]; then echo "Command: $touch_cmd timedout after ${TIMEOUT_MIN} minutes." fi - eval "${timeout_cmd} ${cp_cmd}" - if [ $? -ne 0 ]; then + RC=0; eval "${timeout_cmd} ${cp_cmd}" || RC=$? # reset RC so a failed touch is not reported as a cp failure + if [ $RC -ne 0 ]; then echo "Command: $cp_cmd timedout after ${TIMEOUT_MIN} minutes." fi fi @@ -276,6 +299,7 @@ copy_from_docker() { # None ############################################################################### copy_from_masic_docker() { + trap 'handle_error $? $LINENO' ERR local docker=$1 local filename=$2 local dstpath=$3 @@ -301,6 +325,7 @@ copy_from_masic_docker() { # vtysh namespace option ############################################################################### get_vtysh_namespace() { + trap 'handle_error $? $LINENO' ERR local asic_id=${1:-""} local ns="" if [[ ( $asic_id = "" ) ]] ; then @@ -325,6 +350,7 @@ get_vtysh_namespace() { # None ############################################################################### save_vtysh() { + trap 'handle_error $? 
$LINENO' ERR local vtysh_cmd=$1 local filename=$2 local do_gzip=${3:-false} @@ -354,6 +380,7 @@ save_vtysh() { # None ############################################################################### save_ip() { + trap 'handle_error $? $LINENO' ERR local ip_args=$1 local filename="ip.$2" local do_gzip=${3:-false} @@ -372,6 +399,7 @@ save_ip() { # None ############################################################################### save_bridge() { + trap 'handle_error $? $LINENO' ERR local br_args=$1 local filename="bridge.$2" local do_gzip=${3:-false} @@ -388,6 +416,7 @@ save_bridge() { # None ############################################################################### save_bridge_info() { + trap 'handle_error $? $LINENO' ERR save_bridge "fdb show" "fdb" save_bridge "vlan show" "vlan" } @@ -404,6 +433,7 @@ save_bridge_info() { # None ############################################################################### save_bgp_neighbor() { + trap 'handle_error $? $LINENO' ERR local timeout_cmd="timeout --foreground ${TIMEOUT_MIN}m" local asic_id=${1:-""} local ns=$(get_vtysh_namespace $asic_id) @@ -440,6 +470,7 @@ save_bgp_neighbor() { # None ############################################################################### save_bgp_neighbor_all_ns() { + trap 'handle_error $? $LINENO' ERR if [[ ( "$NUM_ASICS" == 1 ) ]] ; then save_bgp_neighbor else @@ -460,6 +491,7 @@ save_bgp_neighbor_all_ns() { # None ############################################################################### save_nat_info() { + trap 'handle_error $? $LINENO' ERR save_cmd_all_ns "iptables -t nat -nv -L" "nat.iptables" save_cmd_all_ns "conntrack -j -L" "nat.conntrack" save_cmd_all_ns "conntrack -j -L | wc" "nat.conntrackcount" @@ -478,6 +510,7 @@ save_nat_info() { # None ############################################################################### save_bfd_info() { + trap 'handle_error $? 
$LINENO' ERR save_vtysh "show bfd peers" "frr.bfd.peers" save_vtysh "show bfd peers counters" "frr.bfd.peers.counters" save_vtysh "show bfd peers json" "frr.bfd.peers.json" @@ -494,6 +527,7 @@ save_bfd_info() { # None ############################################################################### save_ip_info() { + trap 'handle_error $? $LINENO' ERR save_ip "link" "link" save_ip "addr" "addr" save_ip "rule" "rule" @@ -512,6 +546,7 @@ save_ip_info() { # None ############################################################################### save_bgp_info() { + trap 'handle_error $? $LINENO' ERR save_vtysh "show ip bgp summary" "bgp.summary" save_vtysh "show ip bgp neighbors" "bgp.neighbors" save_vtysh "show ip bgp" "bgp.table" @@ -531,6 +566,7 @@ save_bgp_info() { # None ############################################################################### save_frr_info() { + trap 'handle_error $? $LINENO' ERR save_vtysh "show running-config" "frr.running_config" save_vtysh "show ip route vrf all" "frr.ip_route" save_vtysh "show ipv6 route vrf all" "frr.ip6_route" @@ -550,6 +586,7 @@ save_frr_info() { # None ############################################################################### save_redis_info() { + trap 'handle_error $? $LINENO' ERR save_redis "APPL_DB" save_redis "ASIC_DB" save_redis "COUNTERS_DB" @@ -577,6 +614,7 @@ save_redis_info() { # None ############################################################################### save_proc() { + trap 'handle_error $? $LINENO' ERR local procfiles="$@" $MKDIR $V -p $TARDIR/proc for f in $procfiles @@ -602,6 +640,7 @@ save_proc() { # None ############################################################################### save_redis() { + trap 'handle_error $? $LINENO' ERR local db_name=$1 if [ $# -ge 2 ] && [ -n "$2" ]; then local dest_file_name=$2 @@ -621,12 +660,13 @@ save_redis() { # None ############################################################################### save_saidump() { + trap 'handle_error $? 
$LINENO' ERR if [[ ( "$NUM_ASICS" == 1 ) ]] ; then - save_cmd "docker exec -it syncd saidump" "saidump" + save_cmd "docker exec syncd saidump" "saidump" else for (( i=0; i<$NUM_ASICS; i++ )) do - save_cmd "docker exec -it syncd$i saidump" "saidump$i" + save_cmd "docker exec syncd$i saidump" "saidump$i" done fi } @@ -641,6 +681,7 @@ save_saidump() { # None ############################################################################### save_platform_info() { + trap 'handle_error $? $LINENO' ERR save_cmd "show platform syseeprom" "syseeprom" save_cmd "show platform psustatus" "psustatus" save_cmd "show platform ssdhealth" "ssdhealth" @@ -668,6 +709,7 @@ save_platform_info() { # None ############################################################################### save_file() { + trap 'handle_error $? $LINENO' ERR local start_t=$(date +%s%3N) local end_t=0 local orig_path=$1 @@ -676,7 +718,9 @@ save_file() { local tar_path="${BASE}/$supp_dir/$(basename $orig_path)" local do_gzip=${3:-true} local do_tar_append=${4:-true} - [ ! -d "$TARDIR/$supp_dir" ] && $MKDIR $V -p "$TARDIR/$supp_dir" + if [ ! -d "$TARDIR/$supp_dir" ]; then + $MKDIR $V -p "$TARDIR/$supp_dir" + fi if $do_gzip; then gz_path="${gz_path}.gz" @@ -714,6 +758,7 @@ save_file() { # None ############################################################################### find_files() { + trap 'handle_error $? $LINENO' ERR local -r directory=$1 $TOUCH --date="${SINCE_DATE}" "${REFERENCE_FILE}" local -r find_command="find -L $directory -type f -newer ${REFERENCE_FILE}" @@ -758,11 +803,12 @@ enable_logrotate() { # None ############################################################################### collect_mellanox() { + trap 'handle_error $? 
$LINENO' ERR local sai_dump_folder="/tmp/saisdkdump" local sai_dump_filename="${sai_dump_folder}/sai_sdk_dump_$(date +"%m_%d_%Y_%I_%M_%p")" - ${CMD_PREFIX}docker exec -it syncd mkdir -p $sai_dump_folder - ${CMD_PREFIX}docker exec -it syncd saisdkdump -f $sai_dump_filename + ${CMD_PREFIX}docker exec syncd mkdir -p $sai_dump_folder + ${CMD_PREFIX}docker exec syncd saisdkdump -f $sai_dump_filename copy_from_docker syncd $sai_dump_folder $sai_dump_folder echo "$sai_dump_folder" @@ -771,13 +817,13 @@ collect_mellanox() { done ${CMD_PREFIX}rm -rf $sai_dump_folder - ${CMD_PREFIX}docker exec -it syncd rm -rf $sai_dump_folder + ${CMD_PREFIX}docker exec syncd rm -rf $sai_dump_folder # Save SDK error dumps local sdk_dump_path=`${CMD_PREFIX}docker exec syncd cat /tmp/sai.profile|grep "SAI_DUMP_STORE_PATH"|cut -d = -f2` - if [[ $sdk_dump_path ]]; then + if [[ -d $sdk_dump_path ]]; then copy_from_docker syncd $sdk_dump_path /tmp/sdk-dumps - for file in $(find /tmp/sdk-dumps); do + for file in $(find /tmp/sdk-dumps -type f); do save_file ${file} sai_sdk_dump false done rm -rf /tmp/sdk-dumps @@ -794,6 +840,7 @@ collect_mellanox() { # None ############################################################################### collect_broadcom() { + trap 'handle_error $? $LINENO' ERR local platform=$(show platform summary --json | python -c 'import sys, json; \ print(json.load(sys.stdin)["platform"])') local hwsku=$(show platform summary --json | python -c 'import sys, json; \ @@ -880,6 +927,7 @@ collect_broadcom() { # None ############################################################################### save_log_files() { + trap 'handle_error $? $LINENO' ERR disable_logrotate trap enable_logrotate HUP INT QUIT TERM KILL ABRT ALRM @@ -920,6 +968,7 @@ save_log_files() { ############################################################################### save_warmboot_files() { # Copy the warmboot files + trap 'handle_error $? 
$LINENO' ERR start_t=$(date +%s%3N) if $NOOP; then echo "$CP $V -rf /host/warmboot $TARDIR" @@ -947,26 +996,31 @@ save_warmboot_files() { ############################################################################### save_crash_files() { # archive core dump files - for file in $(find_files "/var/core/"); do - # don't gzip already-gzipped log files :) - if [ -z "${file##*.gz}" ]; then - save_file $file core false - else - save_file $file core true - fi - done + trap 'handle_error $? $LINENO' ERR + if [ -d /var/core/ ]; then + for file in $(find_files "/var/core/"); do + # don't gzip already-gzipped log files :) + if [ -z "${file##*.gz}" ]; then + save_file $file core false + else + save_file $file core true + fi + done + fi # archive kernel dump files - [ -d /var/crash/ ] && for file in $(find_files "/var/crash/"); do - # don't gzip already-gzipped dmesg files :) - if [ ! ${file} = "/var/crash/kexec_cmd" -a ! ${file} = "/var/crash/export" ]; then - if [[ ${file} == *"kdump."* ]]; then - save_file $file kdump false - else - save_file $file kdump true + if [ -d /var/crash/ ]; then + for file in $(find_files "/var/crash/"); do + # don't gzip already-gzipped dmesg files :) + if [ ! ${file} = "/var/crash/kexec_cmd" -a ! ${file} = "/var/crash/export" ]; then + if [[ ${file} == *"kdump."* ]]; then + save_file $file kdump false + else + save_file $file kdump true + fi fi - fi - done + done + fi } ############################################################################### @@ -979,9 +1033,15 @@ save_crash_files() { # ASIC Count ############################################################################### get_asic_count() { + trap 'handle_error $? $LINENO' ERR + local redirect_eval="2>&1" + if ! 
$SAVE_STDERR + then + redirect_eval="" + fi local cmd="show platform summary --json | python -c 'import sys, json; \ print(json.load(sys.stdin)[\"asic_count\"])'" - echo `eval ${cmd} 2>&1` + echo `eval ${cmd} ${redirect_eval}` } ############################################################################### @@ -995,6 +1055,7 @@ get_asic_count() { # None ############################################################################### save_counter_snapshot() { + trap 'handle_error $? $LINENO' ERR local asic_name="$1" local idx=$2 counter_t=$(date +'%d/%m/%Y %H:%M:%S:%6N') @@ -1018,6 +1079,7 @@ save_counter_snapshot() { save_cmd_all_ns "ifconfig -a" "ifconfig.counters_$idx" } + ############################################################################### # Main generate_dump routine # Globals: @@ -1028,6 +1090,7 @@ save_counter_snapshot() { # None ############################################################################### main() { + trap 'handle_error $? $LINENO' ERR local start_t=0 local end_t=0 if [ `whoami` != root ] && ! 
$NOOP; @@ -1106,12 +1169,12 @@ main() { if [[ ( "$NUM_ASICS" > 1 ) ]]; then for (( i=0; i<$NUM_ASICS; i++ )) do - save_cmd "docker exec -it lldp$i lldpcli show statistics" "lldp$i.statistics" + save_cmd "docker exec lldp$i lldpcli show statistics" "lldp$i.statistics" save_cmd "docker logs bgp$i" "docker.bgp$i.log" save_cmd "docker logs swss$i" "docker.swss$i.log" done else - save_cmd "docker exec -it lldp lldpcli show statistics" "lldp.statistics" + save_cmd "docker exec lldp lldpcli show statistics" "lldp.statistics" save_cmd "docker logs bgp" "docker.bgp.log" save_cmd "docker logs swss" "docker.swss.log" fi @@ -1132,12 +1195,14 @@ main() { save_cmd "docker ps -a" "docker.ps" save_cmd "docker top pmon" "docker.pmon" - - local -r dump_plugins="$(find ${PLUGINS_DIR} -type f -executable)" - for plugin in $dump_plugins; do - # save stdout output of plugin and gzip it - save_cmd "$plugin" "$(basename $plugin)" true false - done + + if [[ -d ${PLUGINS_DIR} ]]; then + local -r dump_plugins="$(find ${PLUGINS_DIR} -type f -executable)" + for plugin in $dump_plugins; do + # save stdout output of plugin and gzip it + save_cmd "$plugin" "$(basename $plugin)" true + done + fi save_saidump @@ -1201,8 +1266,9 @@ main() { $RM $V -rf $TARDIR if $DO_COMPRESS; then - $GZIP $V $TARFILE - if [ $? -eq 0 ]; then + RC=0 + $GZIP $V $TARFILE || RC=$? + if [ $RC -eq 0 ]; then TARFILE="${TARFILE}.gz" else echo "WARNING: gzip operation appears to have failed." >&2 @@ -1210,6 +1276,11 @@ main() { fi echo ${TARFILE} + + if ! $SAVE_STDERR + then + exit $RETURN_CODE + fi } ############################################################################### @@ -1265,11 +1336,13 @@ OPTIONS "24 March", "yesterday", etc. 
-t TIMEOUT_MINS Command level timeout in minutes - + -r + Redirect any intermediate errors to STDERR EOF } -while getopts ":xnvhzas:t:" opt; do + +while getopts ":xnvhzas:t:r" opt; do case $opt in x) # enable bash debugging @@ -1311,6 +1384,9 @@ while getopts ":xnvhzas:t:" opt; do t) TIMEOUT_MIN="${OPTARG}" ;; + r) + SAVE_STDERR=false + ;; /?) echo "Invalid option: -$OPTARG" >&2 exit 1 diff --git a/show/main.py b/show/main.py index 08e6ff61d7..ca392a7618 100755 --- a/show/main.py +++ b/show/main.py @@ -1061,7 +1061,8 @@ def users(verbose): @click.option('--verbose', is_flag=True, help="Enable verbose output") @click.option('--allow-process-stop', is_flag=True, help="Dump additional data which may require system interruption") @click.option('--silent', is_flag=True, help="Run techsupport in silent mode") -def techsupport(since, global_timeout, cmd_timeout, verbose, allow_process_stop, silent): +@click.option('--redirect-stderr', '-r', is_flag=True, help="Redirect any intermediate error to STDERR") +def techsupport(since, global_timeout, cmd_timeout, verbose, allow_process_stop, silent, redirect_stderr): """Gather information for troubleshooting""" cmd = "sudo timeout -s SIGTERM --foreground {}m".format(global_timeout) @@ -1077,6 +1078,8 @@ def techsupport(since, global_timeout, cmd_timeout, verbose, allow_process_stop, if since: cmd += " -s '{}'".format(since) cmd += " -t {}".format(cmd_timeout) + if redirect_stderr: + cmd += " -r" run_command(cmd, display_cmd=verbose)