Skip to content

Commit

Permalink
[202205] [generate dump] Move the Core/Log collection to the End of p…
Browse files Browse the repository at this point in the history
…rocess Execution and removed default timeout (#2230)

Thus, the core/log collection has been moved to the end of the dump process.

However, there is a catch with the above change. For example, if the system is in an unstable state and most of the individual commands start to time out, the techsupport dump eventually times out at 30 minutes (because of the global timeout). The dump is then pretty useless, since it might not contain any useful information at all (the logs and core files are now collected last).
Thus, I've removed the default global timeout. Clients can (and should) knowingly provide a value using the -g option if the execution time has to be capped.

A global timeout of 60 minutes is used for Auto-techsupport invocations.

Co-authored-by: Vivek Reddy Karri <vkarri@nvidia.com>
  • Loading branch information
vivekrnv and vivekrnv authored Jun 24, 2022
1 parent 785508d commit 430cd65
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 16 deletions.
2 changes: 1 addition & 1 deletion scripts/coredump_gen_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def parse_ts_dump_name(self, ts_stdout):
return ""

def invoke_ts_cmd(self, since_cfg, num_retry=0):
cmd_opts = ["show", "techsupport", "--silent", "--since", since_cfg]
cmd_opts = ["show", "techsupport", "--silent", "--global-timeout", TS_GLOBAL_TIMEOUT, "--since", since_cfg]
cmd = " ".join(cmd_opts)
rc, stdout, stderr = subprocess_exec(cmd_opts, env=ENV_VAR)
new_dump = ""
Expand Down
9 changes: 4 additions & 5 deletions scripts/generate_dump
Original file line number Diff line number Diff line change
Expand Up @@ -1287,11 +1287,6 @@ main() {
end_t=$(date +%s%3N)
echo "[ Capture Proc State ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO

# Save logs and cores early
save_log_files
save_crash_files
save_warmboot_files

# Save all the processes within each docker
save_cmd "show services" services.summary

Expand Down Expand Up @@ -1426,6 +1421,10 @@ main() {
end_t=$(date +%s%3N)
echo "[ TAR /etc Files ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO

save_log_files
save_crash_files
save_warmboot_files

finalize
}

Expand Down
7 changes: 5 additions & 2 deletions show/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1138,7 +1138,7 @@ def users(verbose):

@cli.command()
@click.option('--since', required=False, help="Collect logs and core files since given date")
@click.option('-g', '--global-timeout', default=30, type=int, help="Global timeout in minutes. Default 30 mins")
@click.option('-g', '--global-timeout', required=False, type=int, help="Global timeout in minutes. WARN: Dump might be incomplete if enforced")
@click.option('-c', '--cmd-timeout', default=5, type=int, help="Individual command timeout in minutes. Default 5 mins")
@click.option('--verbose', is_flag=True, help="Enable verbose output")
@click.option('--allow-process-stop', is_flag=True, help="Dump additional data which may require system interruption")
Expand All @@ -1147,7 +1147,10 @@ def users(verbose):
@click.option('--redirect-stderr', '-r', is_flag=True, help="Redirect an intermediate errors to STDERR")
def techsupport(since, global_timeout, cmd_timeout, verbose, allow_process_stop, silent, debug_dump, redirect_stderr):
"""Gather information for troubleshooting"""
cmd = "sudo timeout --kill-after={}s -s SIGTERM --foreground {}m".format(COMMAND_TIMEOUT, global_timeout)
cmd = "sudo"

if global_timeout:
cmd += " timeout --kill-after={}s -s SIGTERM --foreground {}m".format(COMMAND_TIMEOUT, global_timeout)

if allow_process_stop:
cmd += " -a"
Expand Down
21 changes: 21 additions & 0 deletions tests/coredump_gen_handler_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
/tmp/saisdkdump
"""

TS_DEFAULT_CMD = "show techsupport --silent --global-timeout 60 --since 2 days ago"

def signal_handler(signum, frame):
raise Exception("Timed out!")

Expand Down Expand Up @@ -427,4 +429,23 @@ def mock_cmd(cmd, env):
assert False, "Method should not time out"
finally:
signal.alarm(0)

def test_auto_ts_options(self):
"""
Scenario: Check if the techsupport is called as expected
"""
db_wrap = Db()
redis_mock = db_wrap.db
set_auto_ts_cfg(redis_mock, state="enabled", since_cfg="2 days ago")
set_feature_table_cfg(redis_mock, state="enabled")
with Patcher() as patcher:
def mock_cmd(cmd, env):
cmd_str = " ".join(cmd)
if "show techsupport" in cmd_str and cmd_str != TS_DEFAULT_CMD:
assert False, "Expected TS_CMD: {}, Recieved: {}".format(TS_DEFAULT_CMD, cmd_str)
return 0, AUTO_TS_STDOUT, ""
cdump_mod.subprocess_exec = mock_cmd
patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz")
cls = cdump_mod.CriticalProcCoreDumpHandle("orchagent.12345.123.core.gz", "swss", redis_mock)
cls.handle_core_dump_creation_event()

14 changes: 7 additions & 7 deletions tests/techsupport_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,18 @@
from unittest.mock import patch, Mock
from click.testing import CliRunner

EXPECTED_BASE_COMMAND = 'sudo timeout --kill-after=300s -s SIGTERM --foreground '
EXPECTED_BASE_COMMAND = 'sudo '

@patch("show.main.run_command")
@pytest.mark.parametrize(
"cli_arguments,expected",
[
([], '30m generate_dump -v -t 5'),
(['--since', '2 days ago'], "30m generate_dump -v -s '2 days ago' -t 5"),
(['-g', '50'], '50m generate_dump -v -t 5'),
(['--allow-process-stop'], '30m -a generate_dump -v -t 5'),
(['--silent'], '30m generate_dump -t 5'),
(['--debug-dump', '--redirect-stderr'], '30m generate_dump -v -d -t 5 -r'),
([], 'generate_dump -v -t 5'),
(['--since', '2 days ago'], "generate_dump -v -s '2 days ago' -t 5"),
(['-g', '50'], 'timeout --kill-after=300s -s SIGTERM --foreground 50m generate_dump -v -t 5'),
(['--allow-process-stop'], '-a generate_dump -v -t 5'),
(['--silent'], 'generate_dump -t 5'),
(['--debug-dump', '--redirect-stderr'], 'generate_dump -v -d -t 5 -r'),
]
)
def test_techsupport(run_command, cli_arguments, expected):
Expand Down
3 changes: 2 additions & 1 deletion utilities_common/auto_techsupport_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"CFG_CORE_USAGE", "CFG_SINCE", "FEATURE", "STATE_DB",
"TS_MAP", "CORE_DUMP", "TIMESTAMP", "CONTAINER", "TIME_BUF",
"SINCE_DEFAULT", "TS_PTRN_GLOB", "EXT_LOCKFAIL", "EXT_RETRY",
"EXT_SUCCESS", "MAX_RETRY_LIMIT"
"EXT_SUCCESS", "MAX_RETRY_LIMIT", "TS_GLOBAL_TIMEOUT"
] + [ # Methods
"verify_recent_file_creation",
"get_ts_dumps",
Expand Down Expand Up @@ -60,6 +60,7 @@

TIME_BUF = 20
SINCE_DEFAULT = "2 days ago"
TS_GLOBAL_TIMEOUT = "60"

# Techsupport Exit Codes
EXT_LOCKFAIL = 2
Expand Down

0 comments on commit 430cd65

Please sign in to comment.