Skip to content

Commit

Permalink
Provision error logs (#310)
Browse files Browse the repository at this point in the history
* Capture provision error logs in device connector to be processed by agent

* Handle exceptions and logging in the provision function

* Add agent test for provision error file parsing

* Capture exception messages from all phases

* Remove catch decorator and change logging decorator to return exit codes

* Fix error logging for exceptions with no cause

* Add more exceptions to catch when parsing error logs
  • Loading branch information
val500 authored Sep 12, 2024
1 parent 3b388ab commit f92a3b1
Show file tree
Hide file tree
Showing 20 changed files with 272 additions and 68 deletions.
35 changes: 32 additions & 3 deletions agent/testflinger_agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,29 @@ def secure_filter(member, path):
return tarfile.data_filter(member, path)


def parse_error_logs(error_log_path: str, phase: str):
with open(error_log_path, "r") as error_file:
error_file_contents = error_file.read()
try:
exception_info = json.loads(error_file_contents)[
f"{phase}_exception_info"
]
if exception_info["exception_cause"] is None:
detail = "%s: %s" % (
exception_info["exception_name"],
exception_info["exception_message"],
)
else:
detail = "%s: %s caused by %s" % (
exception_info["exception_name"],
exception_info["exception_message"],
exception_info["exception_cause"],
)
return detail
except (json.JSONDecodeError, KeyError):
return ""


class TestflingerAgent:
def __init__(self, client):
self.client = client
Expand Down Expand Up @@ -253,6 +276,12 @@ def process_jobs(self):
if job_data.get("attachments_status") == "complete":
self.unpack_attachments(job_data, cwd=Path(rundir))

error_log_path = os.path.join(
rundir, "device-connector-error.json"
)
# Clear error log before starting
open(error_log_path, "w").close()

for phase in TEST_PHASES:
# First make sure the job hasn't been cancelled
if (
Expand All @@ -265,14 +294,13 @@ def process_jobs(self):

self.client.post_job_state(job.job_id, phase)
self.set_agent_state(phase)

event_emitter.emit_event(TestEvent(phase + "_start"))
exit_code, exit_event, exit_reason = job.run_test_phase(
phase, rundir
)
self.client.post_influx(phase, exit_code)
event_emitter.emit_event(exit_event, exit_reason)

detail = ""
if exit_code:
# exit code 46 is our indication that recovery failed!
# In this case, we need to mark the device offline
Expand All @@ -281,9 +309,10 @@ def process_jobs(self):
exit_event = TestEvent.RECOVERY_FAIL
else:
exit_event = TestEvent(phase + "_fail")
detail = parse_error_logs(error_log_path, phase)
else:
exit_event = TestEvent(phase + "_success")
event_emitter.emit_event(exit_event)
event_emitter.emit_event(exit_event, detail)
if phase == TestPhase.PROVISION:
self.client.post_provision_log(
job.job_id, exit_code, exit_event
Expand Down
138 changes: 138 additions & 0 deletions agent/testflinger_agent/tests/test_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,3 +584,141 @@ def test_post_provision_log_fail(self, agent, requests_mock):
) as mock_post_provision_log:
agent.process_jobs()
mock_post_provision_log.assert_called_with(*expected_log_params)

def test_provision_error_in_event_detail(self, agent, requests_mock):
"""Tests provision log error messages in event log detail field"""
self.config["test_command"] = "echo test1"
job_id = str(uuid.uuid1())
fake_job_data = {
"job_id": job_id,
"job_queue": "test",
"test_data": {"test_cmds": "foo"},
"job_status_webhook": "https://mywebhook",
}
requests_mock.get(
"http://127.0.0.1:8000/v1/job?queue=test",
[{"text": json.dumps(fake_job_data)}, {"text": "{}"}],
)
status_url = f"http://127.0.0.1:8000/v1/job/{job_id}/events"
requests_mock.post(status_url, status_code=200)

provision_exception_info = {
"provision_exception_info": {
"exception_name": "MyExceptionName",
"exception_message": "MyExceptionMessage",
"exception_cause": "MyExceptionCause",
}
}

with patch("shutil.rmtree"):
with patch(
"testflinger_agent.agent.TestflingerJob.run_test_phase"
) as mock_run_test_phase:

def run_test_phase_side_effect(phase, rundir):
if phase == "provision":
provision_log_path = os.path.join(
rundir, "device-connector-error.json"
)
with open(
provision_log_path, "w"
) as provision_log_file:
provision_log_file.write(
json.dumps(provision_exception_info)
)
provision_log_file.close()
return 99, None, ""
else:
return 0, None, ""

mock_run_test_phase.side_effect = run_test_phase_side_effect
agent.process_jobs()

status_update_requests = list(
filter(
lambda req: req.url == status_url,
requests_mock.request_history,
)
)
event_list = status_update_requests[-1].json()["events"]
provision_fail_events = list(
filter(
lambda event: event["event_name"] == "provision_fail",
event_list,
)
)
assert len(provision_fail_events) == 1
provision_fail_event_detail = provision_fail_events[0]["detail"]
assert (
provision_fail_event_detail
== "MyExceptionName: MyExceptionMessage caused by MyExceptionCause"
)

def test_provision_error_no_cause(self, agent, requests_mock):
"""Tests provision log error messages for exceptions with no cause"""
self.config["test_command"] = "echo test1"
job_id = str(uuid.uuid1())
fake_job_data = {
"job_id": job_id,
"job_queue": "test",
"test_data": {"test_cmds": "foo"},
"job_status_webhook": "https://mywebhook",
}
requests_mock.get(
"http://127.0.0.1:8000/v1/job?queue=test",
[{"text": json.dumps(fake_job_data)}, {"text": "{}"}],
)
status_url = f"http://127.0.0.1:8000/v1/job/{job_id}/events"
requests_mock.post(status_url, status_code=200)

provision_exception_info = {
"provision_exception_info": {
"exception_name": "MyExceptionName",
"exception_message": "MyExceptionMessage",
"exception_cause": None,
}
}

with patch("shutil.rmtree"):
with patch(
"testflinger_agent.agent.TestflingerJob.run_test_phase"
) as mock_run_test_phase:

def run_test_phase_side_effect(phase, rundir):
if phase == "provision":
provision_log_path = os.path.join(
rundir, "device-connector-error.json"
)
with open(
provision_log_path, "w"
) as provision_log_file:
provision_log_file.write(
json.dumps(provision_exception_info)
)
provision_log_file.close()
return 99, None, ""
else:
return 0, None, ""

mock_run_test_phase.side_effect = run_test_phase_side_effect
agent.process_jobs()

status_update_requests = list(
filter(
lambda req: req.url == status_url,
requests_mock.request_history,
)
)
event_list = status_update_requests[-1].json()["events"]
provision_fail_events = list(
filter(
lambda event: event["event_name"] == "provision_fail",
event_list,
)
)
assert len(provision_fail_events) == 1
provision_fail_event_detail = provision_fail_events[0]["detail"]
assert (
provision_fail_event_detail
== "MyExceptionName: MyExceptionMessage"
)
37 changes: 36 additions & 1 deletion device-connectors/src/testflinger_device_connectors/cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,13 @@
import logging
import yaml
import sys
from typing import Callable
import json

from testflinger_device_connectors import configure_logging
from testflinger_device_connectors.devices import (
DEVICE_CONNECTORS,
RecoveryError,
get_device_stage_func,
)

Expand Down Expand Up @@ -70,6 +73,36 @@ def get_args(argv=None):
return parser.parse_args(argv)


def add_exception_logging_to_file(func: Callable, stage: str):
"""Decorator function to add logging of exceptions to a json file"""

def _wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except Exception as exception:
if exception.__cause__ is None:
exception_cause = None
else:
exception_cause = repr(exception.__cause__)
exception_info = {
f"{stage}_exception_info": {
"exception_name": type(exception).__name__,
"exception_message": str(exception),
"exception_cause": exception_cause,
}
}
with open(
"device-connector-error.json", "a", encoding="utf-8"
) as error_file:
error_file.write(json.dumps(exception_info))
if isinstance(exception, RecoveryError):
return 46
else:
return 1

return _wrapper


def main():
"""
Dynamically load the selected module and call the selected method
Expand All @@ -79,5 +112,7 @@ def main():
config = yaml.safe_load(configfile)
configure_logging(config)

func = get_device_stage_func(args.device, args.stage)
func = add_exception_logging_to_file(
get_device_stage_func(args.device, args.stage), args.stage
)
sys.exit(func(args))
Original file line number Diff line number Diff line change
Expand Up @@ -313,26 +313,6 @@ def cleanup(self, _):
pass


def catch(exception, returnval=0):
"""Decorator for catching Exceptions and returning values instead
This is useful because for certain things, like RecoveryError, we
need to give the calling process a hint that we failed for that
reason, so it can act accordingly, by disabling the device for example
"""

def _wrapper(func):
def wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except exception:
return returnval

return wrapper

return _wrapper


def get_device_stage_func(device: str, stage: str) -> Callable:
"""
Load the selected device connector and return the selected stage method
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@

from testflinger_device_connectors.devices import (
DefaultDevice,
RecoveryError,
SerialLogger,
catch,
)
from testflinger_device_connectors.devices.cm3.cm3 import CM3

Expand All @@ -32,7 +30,6 @@
class DeviceConnector(DefaultDevice):
"""Tool for provisioning baremetal with a given image."""

@catch(RecoveryError, 46)
def provision(self, args):
"""Method called when the command is invoked."""
with open(args.config) as configfile:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@

from testflinger_device_connectors.devices import (
DefaultDevice,
RecoveryError,
catch,
)
from .dell_oemscript import DellOemScript

Expand All @@ -34,7 +32,6 @@
class DeviceConnector(DefaultDevice):
"""Tool for provisioning Dell OEM devices with an oem image."""

@catch(RecoveryError, 46)
def provision(self, args):
"""Method called when the command is invoked."""
device = DellOemScript(args.config, args.job_data)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@

from testflinger_device_connectors.devices import (
DefaultDevice,
RecoveryError,
SerialLogger,
catch,
)
from testflinger_device_connectors.devices.dragonboard.dragonboard import (
Dragonboard,
Expand All @@ -34,7 +32,6 @@
class DeviceConnector(DefaultDevice):
"""Tool for provisioning baremetal with a given image."""

@catch(RecoveryError, 46)
def provision(self, args):
"""Method called when the command is invoked."""
with open(args.config) as configfile:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@

from testflinger_device_connectors.devices import (
DefaultDevice,
RecoveryError,
catch,
)


Expand All @@ -34,7 +32,6 @@
class DeviceConnector(DefaultDevice):
"""Fake device connector."""

@catch(RecoveryError, 46)
def provision(self, args):
"""dummy provision"""
with open(args.job_data) as json_file:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@

from testflinger_device_connectors.devices import (
DefaultDevice,
RecoveryError,
catch,
)
from .hp_oemscript import HPOemScript

Expand All @@ -34,7 +32,6 @@
class DeviceConnector(DefaultDevice):
"""Tool for provisioning HP OEM devices with an oem image."""

@catch(RecoveryError, 46)
def provision(self, args):
"""Method called when the command is invoked."""
device = HPOemScript(args.config, args.job_data)
Expand Down
Loading

0 comments on commit f92a3b1

Please sign in to comment.