Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Provision error logs #310

Merged
merged 7 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 32 additions & 3 deletions agent/testflinger_agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,29 @@ def secure_filter(member, path):
return tarfile.data_filter(member, path)


def parse_error_logs(error_log_path: str, phase: str):
with open(error_log_path, "r") as error_file:
plars marked this conversation as resolved.
Show resolved Hide resolved
error_file_contents = error_file.read()
try:
exception_info = json.loads(error_file_contents)[
f"{phase}_exception_info"
]
if exception_info["exception_cause"] is None:
detail = "%s: %s" % (
exception_info["exception_name"],
exception_info["exception_message"],
)
else:
detail = "%s: %s caused by %s" % (
exception_info["exception_name"],
exception_info["exception_message"],
exception_info["exception_cause"],
)
return detail
except (json.JSONDecodeError, KeyError):
return ""


class TestflingerAgent:
def __init__(self, client):
self.client = client
Expand Down Expand Up @@ -253,6 +276,12 @@ def process_jobs(self):
if job_data.get("attachments_status") == "complete":
self.unpack_attachments(job_data, cwd=Path(rundir))

error_log_path = os.path.join(
rundir, "device-connector-error.json"
)
# Clear error log before starting
open(error_log_path, "w").close()

for phase in TEST_PHASES:
# First make sure the job hasn't been cancelled
if (
Expand All @@ -265,14 +294,13 @@ def process_jobs(self):

self.client.post_job_state(job.job_id, phase)
self.set_agent_state(phase)

event_emitter.emit_event(TestEvent(phase + "_start"))
exit_code, exit_event, exit_reason = job.run_test_phase(
phase, rundir
)
self.client.post_influx(phase, exit_code)
event_emitter.emit_event(exit_event, exit_reason)

detail = ""
if exit_code:
# exit code 46 is our indication that recovery failed!
# In this case, we need to mark the device offline
Expand All @@ -281,9 +309,10 @@ def process_jobs(self):
exit_event = TestEvent.RECOVERY_FAIL
else:
exit_event = TestEvent(phase + "_fail")
detail = parse_error_logs(error_log_path, phase)
else:
exit_event = TestEvent(phase + "_success")
event_emitter.emit_event(exit_event)
event_emitter.emit_event(exit_event, detail)
if phase == TestPhase.PROVISION:
self.client.post_provision_log(
job.job_id, exit_code, exit_event
Expand Down
138 changes: 138 additions & 0 deletions agent/testflinger_agent/tests/test_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,3 +584,141 @@ def test_post_provision_log_fail(self, agent, requests_mock):
) as mock_post_provision_log:
agent.process_jobs()
mock_post_provision_log.assert_called_with(*expected_log_params)

def test_provision_error_in_event_detail(self, agent, requests_mock):
plars marked this conversation as resolved.
Show resolved Hide resolved
"""Tests provision log error messages in event log detail field"""
self.config["test_command"] = "echo test1"
job_id = str(uuid.uuid1())
fake_job_data = {
"job_id": job_id,
"job_queue": "test",
"test_data": {"test_cmds": "foo"},
"job_status_webhook": "https://mywebhook",
}
requests_mock.get(
"http://127.0.0.1:8000/v1/job?queue=test",
[{"text": json.dumps(fake_job_data)}, {"text": "{}"}],
)
status_url = f"http://127.0.0.1:8000/v1/job/{job_id}/events"
requests_mock.post(status_url, status_code=200)

provision_exception_info = {
"provision_exception_info": {
"exception_name": "MyExceptionName",
"exception_message": "MyExceptionMessage",
"exception_cause": "MyExceptionCause",
}
}

with patch("shutil.rmtree"):
with patch(
"testflinger_agent.agent.TestflingerJob.run_test_phase"
) as mock_run_test_phase:

def run_test_phase_side_effect(phase, rundir):
if phase == "provision":
provision_log_path = os.path.join(
rundir, "device-connector-error.json"
)
with open(
provision_log_path, "w"
) as provision_log_file:
provision_log_file.write(
json.dumps(provision_exception_info)
)
provision_log_file.close()
return 99, None, ""
else:
return 0, None, ""

mock_run_test_phase.side_effect = run_test_phase_side_effect
agent.process_jobs()

status_update_requests = list(
filter(
lambda req: req.url == status_url,
requests_mock.request_history,
)
)
event_list = status_update_requests[-1].json()["events"]
provision_fail_events = list(
filter(
lambda event: event["event_name"] == "provision_fail",
event_list,
)
)
assert len(provision_fail_events) == 1
provision_fail_event_detail = provision_fail_events[0]["detail"]
assert (
provision_fail_event_detail
== "MyExceptionName: MyExceptionMessage caused by MyExceptionCause"
)

def test_provision_error_no_cause(self, agent, requests_mock):
"""Tests provision log error messages for exceptions with no cause"""
self.config["test_command"] = "echo test1"
job_id = str(uuid.uuid1())
fake_job_data = {
"job_id": job_id,
"job_queue": "test",
"test_data": {"test_cmds": "foo"},
"job_status_webhook": "https://mywebhook",
}
requests_mock.get(
"http://127.0.0.1:8000/v1/job?queue=test",
[{"text": json.dumps(fake_job_data)}, {"text": "{}"}],
)
status_url = f"http://127.0.0.1:8000/v1/job/{job_id}/events"
requests_mock.post(status_url, status_code=200)

provision_exception_info = {
"provision_exception_info": {
"exception_name": "MyExceptionName",
"exception_message": "MyExceptionMessage",
"exception_cause": None,
}
}

with patch("shutil.rmtree"):
with patch(
"testflinger_agent.agent.TestflingerJob.run_test_phase"
) as mock_run_test_phase:

def run_test_phase_side_effect(phase, rundir):
if phase == "provision":
provision_log_path = os.path.join(
rundir, "device-connector-error.json"
)
with open(
provision_log_path, "w"
) as provision_log_file:
provision_log_file.write(
json.dumps(provision_exception_info)
)
provision_log_file.close()
return 99, None, ""
else:
return 0, None, ""

mock_run_test_phase.side_effect = run_test_phase_side_effect
agent.process_jobs()

status_update_requests = list(
filter(
lambda req: req.url == status_url,
requests_mock.request_history,
)
)
event_list = status_update_requests[-1].json()["events"]
provision_fail_events = list(
filter(
lambda event: event["event_name"] == "provision_fail",
event_list,
)
)
assert len(provision_fail_events) == 1
provision_fail_event_detail = provision_fail_events[0]["detail"]
assert (
provision_fail_event_detail
== "MyExceptionName: MyExceptionMessage"
)
37 changes: 36 additions & 1 deletion device-connectors/src/testflinger_device_connectors/cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,13 @@
import logging
import yaml
import sys
from typing import Callable
import json

from testflinger_device_connectors import configure_logging
from testflinger_device_connectors.devices import (
DEVICE_CONNECTORS,
RecoveryError,
get_device_stage_func,
)

Expand Down Expand Up @@ -70,6 +73,36 @@ def get_args(argv=None):
return parser.parse_args(argv)


def add_exception_logging_to_file(func: Callable, stage: str):
"""Decorator function to add logging of exceptions to a json file"""

def _wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except Exception as exception:
if exception.__cause__ is None:
exception_cause = None
else:
exception_cause = repr(exception.__cause__)
exception_info = {
f"{stage}_exception_info": {
"exception_name": type(exception).__name__,
"exception_message": str(exception),
"exception_cause": exception_cause,
}
}
with open(
"device-connector-error.json", "a", encoding="utf-8"
) as error_file:
error_file.write(json.dumps(exception_info))
if isinstance(exception, RecoveryError):
return 46
else:
return 1

return _wrapper


def main():
"""
Dynamically load the selected module and call the selected method
Expand All @@ -79,5 +112,7 @@ def main():
config = yaml.safe_load(configfile)
configure_logging(config)

func = get_device_stage_func(args.device, args.stage)
func = add_exception_logging_to_file(
get_device_stage_func(args.device, args.stage), args.stage
)
sys.exit(func(args))
Original file line number Diff line number Diff line change
Expand Up @@ -307,26 +307,6 @@ def cleanup(self, _):
pass


def catch(exception, returnval=0):
"""Decorator for catching Exceptions and returning values instead

This is useful because for certain things, like RecoveryError, we
need to give the calling process a hint that we failed for that
reason, so it can act accordingly, by disabling the device for example
"""

def _wrapper(func):
def wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except exception:
return returnval

return wrapper

return _wrapper


def get_device_stage_func(device: str, stage: str) -> Callable:
"""
Load the selected device connector and return the selected stage method
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@

from testflinger_device_connectors.devices import (
DefaultDevice,
RecoveryError,
SerialLogger,
catch,
)
from testflinger_device_connectors.devices.cm3.cm3 import CM3

Expand All @@ -32,7 +30,6 @@
class DeviceConnector(DefaultDevice):
"""Tool for provisioning baremetal with a given image."""

@catch(RecoveryError, 46)
def provision(self, args):
"""Method called when the command is invoked."""
with open(args.config) as configfile:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@

from testflinger_device_connectors.devices import (
DefaultDevice,
RecoveryError,
catch,
)
from .dell_oemscript import DellOemScript

Expand All @@ -34,7 +32,6 @@
class DeviceConnector(DefaultDevice):
"""Tool for provisioning Dell OEM devices with an oem image."""

@catch(RecoveryError, 46)
def provision(self, args):
"""Method called when the command is invoked."""
device = DellOemScript(args.config, args.job_data)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@

from testflinger_device_connectors.devices import (
DefaultDevice,
RecoveryError,
SerialLogger,
catch,
)
from testflinger_device_connectors.devices.dragonboard.dragonboard import (
Dragonboard,
Expand All @@ -34,7 +32,6 @@
class DeviceConnector(DefaultDevice):
"""Tool for provisioning baremetal with a given image."""

@catch(RecoveryError, 46)
def provision(self, args):
"""Method called when the command is invoked."""
with open(args.config) as configfile:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@

from testflinger_device_connectors.devices import (
DefaultDevice,
RecoveryError,
catch,
)


Expand All @@ -34,7 +32,6 @@
class DeviceConnector(DefaultDevice):
"""Fake device connector."""

@catch(RecoveryError, 46)
def provision(self, args):
"""dummy provision"""
with open(args.job_data) as json_file:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@

from testflinger_device_connectors.devices import (
DefaultDevice,
RecoveryError,
catch,
)
from .hp_oemscript import HPOemScript

Expand All @@ -34,7 +32,6 @@
class DeviceConnector(DefaultDevice):
"""Tool for provisioning HP OEM devices with an oem image."""

@catch(RecoveryError, 46)
def provision(self, args):
"""Method called when the command is invoked."""
device = HPOemScript(args.config, args.job_data)
Expand Down
Loading