
[pytest] Ignore specific syslog error messages when testing autorestart #2430

Merged
merged 3 commits on Oct 31, 2020
122 changes: 113 additions & 9 deletions tests/autorestart/test_container_autorestart.py
@@ -21,6 +21,76 @@
CONTAINER_RESTART_THRESHOLD_SECS = 180


@pytest.fixture(autouse=True)
def ignore_expected_loganalyzer_exception(duthost, loganalyzer):
    """
    Ignore expected failure/error messages while testing the autorestart feature.

    First, since we kill a critical process in a specific container to test the autorestart
    feature, we expect to see error messages fired by Monit such as
    "ERR monit[563]: 'lldp|lldpd' status failed (1) -- 'lldpd' is not running."

    Second, if the teammgrd process is killed for testing, the orchagent process in the
    swss container will write error messages into syslog such as
    "ERR swss#orchagent: :- removeLag: Failed to remove ref count 1 LAG PortChannel10008."
    When teamd is restarted, an error message appears in syslog: "ERR teamd#teamsyncd:
    :- readData: netlink reports an error=-33 on reading a netlink socket."

    Third, while the pmon container is being restarted because the ledd process was killed for
    testing, the xcvrd process will write an error message into syslog such as "ERR pmon#xcvrd[29]: :- initializeGlobalConfig:
    Sonic database config global file doesn't exist at /var/run/redis/sonic-db/database_global.json."
    and the thermalctld process will write an error message such as "ERR pmon#thermalctld[33]:
    Caught exception while initializing thermal manager."

    Fourth, if the orchagent process is killed and the swss container is restarted, the syncd process
    will write error messages such as "ERR syncd#syncd: [none] driverEgressMemoryUpdate:1395
    Error getting cosq for port 1.". At the same time, the syncd process also writes WARNING messages
    into syslog such as "WARNING syncd#syncd: :- saiDiscover: skipping since it causes crash:
    SAI_STP_ATTR_BRIDGE_ID". Since these warning messages contain the keyword "crash", LogAnalyzer
    would otherwise fail on them.

    Fifth, systemd will fire an error message such as "ERR systemd[1]: Failed to start SNMP/TEAMD container."
    when the SNMP/TEAMD container hits its start limit. route_check.py also writes an error message into syslog.
    """
    monit_ignoreRegex = [
        ".*ERR monit.*",
    ]
    swss_ignoreRegex = [
        ".*ERR swss#orchagent.*removeLag.*",
    ]
    pmon_ignoreRegex = [
        ".*ERR pmon#xcvrd.*initializeGlobalConfig.*",
        ".*ERR pmon#thermalctld.*Caught exception while initializing thermal manager.*",
    ]
    syncd_ignoreRegex = [
        ".*ERR syncd#syncd.*driverEgressMemoryUpdate.*",
        ".*ERR syncd#syncd.*brcm_sai*",
        ".*WARNING syncd#syncd.*saiDiscover: skipping since it causes crash.*",
    ]
    teamd_ignoreRegex = [
        ".*ERR teamd#teamsyncd.*readData.*netlink reports an error=-33 on reading a netlink socket.*",
    ]
    systemd_ignoreRegex = [
        ".*ERR systemd.*Failed to start .* container*",
    ]
    kernel_ignoreRegex = [
        ".*ERR kernel.*PortChannel.*",
    ]
    other_ignoreRegex = [
        ".*ERR route_check.*",
    ]
    if loganalyzer:
        loganalyzer.ignore_regex.extend(monit_ignoreRegex)
        loganalyzer.ignore_regex.extend(swss_ignoreRegex)
        loganalyzer.ignore_regex.extend(pmon_ignoreRegex)
        loganalyzer.ignore_regex.extend(syncd_ignoreRegex)
        loganalyzer.ignore_regex.extend(teamd_ignoreRegex)
        loganalyzer.ignore_regex.extend(systemd_ignoreRegex)
        loganalyzer.ignore_regex.extend(kernel_ignoreRegex)
        loganalyzer.ignore_regex.extend(other_ignoreRegex)
Collaborator:
@yozhao101 this is not exactly what I had in mind when I asked for splitting. But I understand that you have to put all the regexes up front because you are looping through all services and running the test against them; you currently cannot really apply an individual regex for an individual service.

I'll approve this PR and let it go in. Please open a separate PR to parameterize the test so that it runs for each service individually; that way you can apply the ignore regexes per service.

Contributor Author:

Sorry, I misunderstood what you meant in our meeting. I will create a separate PR to improve the ignore regexes.
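For illustration, here is a minimal sketch of the kind of per-service parameterization suggested above; the `CONTAINER_IGNORE_REGEX` mapping, the service list, and the fixture/test names are assumptions for this sketch, not the actual follow-up PR (`loganalyzer`, `duthost`, and `tbinfo` are fixtures provided by the test framework):

```python
# Hypothetical sketch only: the mapping and names below are illustrative assumptions.
import pytest

# Syslog patterns expected only while a specific service is under test.
CONTAINER_IGNORE_REGEX = {
    "swss": [
        ".*ERR swss#orchagent.*removeLag.*",
        ".*ERR syncd#syncd.*driverEgressMemoryUpdate.*",
    ],
    "pmon": [
        ".*ERR pmon#xcvrd.*initializeGlobalConfig.*",
        ".*ERR pmon#thermalctld.*Caught exception while initializing thermal manager.*",
    ],
    "teamd": [
        ".*ERR teamd#teamsyncd.*readData.*netlink reports an error=-33.*",
    ],
}

# Patterns expected no matter which container is being restarted.
COMMON_IGNORE_REGEX = [
    ".*ERR monit.*",
    ".*ERR systemd.*Failed to start .* container.*",
]


@pytest.fixture(params=sorted(CONTAINER_IGNORE_REGEX))
def container_name(request):
    """One test item per service instead of a loop inside the test body."""
    return request.param


@pytest.fixture(autouse=True)
def ignore_expected_loganalyzer_exception(loganalyzer, container_name):
    """Register only the regexes relevant to the container currently under test."""
    if loganalyzer:
        loganalyzer.ignore_regex.extend(COMMON_IGNORE_REGEX)
        loganalyzer.ignore_regex.extend(CONTAINER_IGNORE_REGEX[container_name])


def test_container_autorestart(duthost, tbinfo, container_name):
    # Exercise the autorestart feature for container_name only.
    ...
```

With a parametrized `container_name` fixture, pytest generates one test item per service, so failures are reported per service and the ignore list stays scoped to the service that actually produces those messages.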



def get_group_program_info(duthost, container_name, group_name):
    """
    @summary: Get program names, running status and their pids by analyzing the command
@@ -34,7 +104,7 @@ def get_group_program_info(duthost, container_name, group_name):
    program_status = None
    program_pid = -1

    program_list = duthost.shell("docker exec {} supervisorctl status".format(container_name))
    program_list = duthost.shell("docker exec {} supervisorctl status".format(container_name), module_ignore_errors=True)
    for program_info in program_list["stdout_lines"]:
        if program_info.find(group_name) != -1:
            program_name = program_info.split()[0].split(':')[1].strip()
@@ -59,7 +129,7 @@ def get_program_info(duthost, container_name, program_name):
    program_status = None
    program_pid = -1

    program_list = duthost.shell("docker exec {} supervisorctl status".format(container_name))
    program_list = duthost.shell("docker exec {} supervisorctl status".format(container_name), module_ignore_errors=True)
    for program_info in program_list["stdout_lines"]:
        if program_info.find(program_name) != -1:
            program_status = program_info.split()[1].strip()
@@ -137,6 +207,20 @@ def is_hiting_start_limit(duthost, container_name):
    return False


def clear_failed_flag_and_restart(duthost, container_name):
    """
    @summary: If a container hits the start limit, clear the reset-failed flag and
              restart it.
    """
    logger.info("{} hits start limit and clear reset-failed flag".format(container_name))
    duthost.shell("sudo systemctl reset-failed {}.service".format(container_name))
    duthost.shell("sudo systemctl start {}.service".format(container_name))
    restarted = wait_until(CONTAINER_RESTART_THRESHOLD_SECS,
                           CONTAINER_CHECK_INTERVAL_SECS,
                           check_container_state, duthost, container_name, True)
    pytest_assert(restarted, "Failed to restart container '{}' after reset-failed was cleared".format(container_name))


def verify_autorestart_with_critical_process(duthost, container_name, program_name,
                                             program_status, program_pid):
    """
@@ -165,13 +249,7 @@ def verify_autorestart_with_critical_process(duthost, container_name, program_na
                           check_container_state, duthost, container_name, True)
    if not restarted:
        if is_hiting_start_limit(duthost, container_name):
            logger.info("{} hits start limit and clear reset-failed flag".format(container_name))
            duthost.shell("sudo systemctl reset-failed {}.service".format(container_name))
            duthost.shell("sudo systemctl start {}.service".format(container_name))
            restarted = wait_until(CONTAINER_RESTART_THRESHOLD_SECS,
                                   CONTAINER_CHECK_INTERVAL_SECS,
                                   check_container_state, duthost, container_name, True)
            pytest_assert(restarted, "Failed to restart container '{}' after reset-failed was cleared".format(container_name))
            clear_failed_flag_and_restart(duthost, container_name)
        else:
            pytest.fail("Failed to restart container '{}'".format(container_name))

@@ -202,6 +280,30 @@ def verify_no_autorestart_with_non_critical_process(duthost, container_name, pro
    logger.info("Restart the program '{}' in container '{}'".format(program_name, container_name))
    duthost.shell("docker exec {} supervisorctl start {}".format(container_name, program_name))

def check_all_critical_processes_status(duthost):
    processes_status = duthost.all_critical_process_status()
    for container_name, processes in processes_status.items():
        if processes["status"] is False or len(processes["exited_critical_process"]) > 0:
            return False

    return True


def postcheck_critical_processes_status(duthost, container_autorestart_states):
    """
    @summary: Post-check that all critical processes are alive after testing
              the autorestart feature.
              First restart any containers which hit the start limit, then do the post-check.
    """
    for container_name in container_autorestart_states.keys():
        if is_hiting_start_limit(duthost, container_name):
            clear_failed_flag_and_restart(duthost, container_name)

    pytest_assert(wait_until(CONTAINER_RESTART_THRESHOLD_SECS,
                             CONTAINER_CHECK_INTERVAL_SECS,
                             check_all_critical_processes_status, duthost),
                  "Post-check of the health of critical processes failed.")


def test_containers_autorestart(duthost, tbinfo):
    """
@@ -275,3 +377,5 @@ def test_containers_autorestart(duthost, tbinfo):
        duthost.shell("sudo config feature autorestart {} disabled".format(container_name))

        logger.info("End of testing the container '{}'".format(container_name))

    postcheck_critical_processes_status(duthost, container_autorestart_states)