Skip to content

Commit

Permalink
Log radar_interface CAN error, Process Comm Errors (commaai#95)
Browse files Browse the repository at this point in the history
* Log radar_interface CAN error

* Add support for process comm error logging

* Improve Process Comm error logging

* CAN and CommError throttling
  • Loading branch information
rafcabezas authored Aug 21, 2019
1 parent 14eef4f commit 6852d99
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 30 deletions.
6 changes: 0 additions & 6 deletions selfdrive/can/plant_can_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from selfdrive.car.honda.hondacan import fix
from common.realtime import sec_since_boot
from common.dbc import dbc
from selfdrive.tinklad.tinkla_interface import TinklaClient

class CANParser(object):
def __init__(self, dbc_f, signals, checks=None):
Expand Down Expand Up @@ -57,8 +56,6 @@ def __init__(self, dbc_f, signals, checks=None):
for i, x in enumerate(self._msgs):
self._message_indices[x].append(i)

self.tinklaClient = TinklaClient()

def update_can(self, can_recv):
msgs_upd = []
cn_vl_max = 5 # no more than 5 wrong counter checks
Expand All @@ -82,7 +79,6 @@ def update_can(self, can_recv):
# compare recalculated vs received checksum
if msg_vl != cdat:
print("CHECKSUM FAIL: {0}".format(hex(msg)))
self.tinklaClient.logCANErrorEvent(canMessage=msg, additionalInformation="Checksum failure")
self.ck[msg] = False
self.ok[msg] = False
# counter check
Expand All @@ -98,7 +94,6 @@ def update_can(self, can_recv):
# message status is invalid if we received too many wrong counter values
if self.cn_vl[msg] >= cn_vl_max:
print("COUNTER WRONG: {0}".format(hex(msg)))
self.tinklaClient.logCANErrorEvent(canMessage=msg, additionalInformation="Too many wrong counter values")
self.ok[msg] = False

# update msg time stamps and counter value
Expand Down Expand Up @@ -134,5 +129,4 @@ def _check_dead_msgs(self):
for msg in set(self._msgs):
if msg in self.msgs_ck and self.sec_since_boot_cached - self.ct[msg] > 10./self.frqs[msg]:
self.ok[msg] = False
self.tinklaClient.logCANErrorEvent(canMessage=msg, additionalInformation="Dead message")

5 changes: 5 additions & 0 deletions selfdrive/car/tesla/radar_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from selfdrive.services import service_list
import selfdrive.messaging as messaging
from selfdrive.car.tesla.readconfig import CarSettings
from selfdrive.tinklad.tinkla_interface import TinklaClient

#RADAR_A_MSGS = list(range(0x371, 0x37F , 3))
#RADAR_B_MSGS = list(range(0x372, 0x37F, 3))
Expand Down Expand Up @@ -47,6 +48,9 @@ def _create_radard_can_parser():


class RadarInterface(object):

tinklaClient = TinklaClient()

def __init__(self,CP):
# radar
self.pts = {}
Expand Down Expand Up @@ -159,6 +163,7 @@ def _update(self, updated_messages):
errors = []
if not self.rcp.can_valid:
errors.append("canError")
self.tinklaClient.logCANErrorEvent(source="radar_interface", canMessage=0, additionalInformation="Invalid CAN Count")
ret.errors = errors
return ret,self.extPts.values()

Expand Down
14 changes: 13 additions & 1 deletion selfdrive/controls/controlsd.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,12 @@
from selfdrive.controls.lib.driver_monitor import DriverStatus, MAX_TERMINAL_ALERTS
from selfdrive.controls.lib.planner import LON_MPC_STEP
from selfdrive.locationd.calibration_helpers import Calibration, Filter
from selfdrive.tinklad.tinkla_interface import TinklaClient


ThermalStatus = log.ThermalData.ThermalStatus
State = log.ControlsState.OpenpilotState


def isActive(state):
  """Check if the actuators are enabled.

  Args:
    state: controls state value, compared against members of `State`.
  Returns:
    bool: True when `state` is `State.enabled` or `State.softDisabling`.
  """
  return state in [State.enabled, State.softDisabling]
Expand Down Expand Up @@ -414,6 +415,13 @@ def data_send(sm, CS, CI, CP, VM, state, events, actuators, v_cruise_kph, rk, ca

return CC, events_bytes

def logAllAliveAndValidInfoToTinklad(sm, tinklaClient):
  """Report the first not-alive or not-valid tracked process to tinklad.

  Args:
    sm: object exposing `all_alive()` and `all_valid()`, each returning an
      `(allOk, processName, count)` tuple.
    tinklaClient: object exposing `logProcessCommErrorEvent(...)`.

  A "Not Alive" failure takes priority over a "Not Valid" one. Nothing is
  logged when both checks pass, so a call made while everything is healthy
  does not emit a bogus "Not Valid" event with an empty process name.
  """
  areAllAlive, aliveProcessName, aliveCount = sm.all_alive()
  areAllValid, validProcessName, validCount = sm.all_valid()
  if not areAllAlive:
    tinklaClient.logProcessCommErrorEvent(source="carcontroller", processName=aliveProcessName, count=aliveCount, eventType="Not Alive")
  elif not areAllValid:
    tinklaClient.logProcessCommErrorEvent(source="carcontroller", processName=validProcessName, count=validCount, eventType="Not Valid")

def controlsd_thread(gctx=None):
gc.disable()
Expand All @@ -423,6 +431,8 @@ def controlsd_thread(gctx=None):

params = Params()

tinklaClient = TinklaClient()

# Pub Sockets
sendcan = messaging.pub_sock(service_list['sendcan'].port)
controlsstate = messaging.pub_sock(service_list['controlsState'].port)
Expand Down Expand Up @@ -515,6 +525,7 @@ def controlsd_thread(gctx=None):
# Create alerts
if not sm.all_alive_and_valid():
events.append(create_event('commIssue', [ET.NO_ENTRY, ET.SOFT_DISABLE]))
logAllAliveAndValidInfoToTinklad(sm=sm, tinklaClient=tinklaClient)
if not sm['pathPlan'].mpcSolutionValid:
events.append(create_event('plannerError', [ET.NO_ENTRY, ET.IMMEDIATE_DISABLE]))
if not sm['pathPlan'].sensorValid:
Expand All @@ -529,6 +540,7 @@ def controlsd_thread(gctx=None):
events.append(create_event('radarCanError', [ET.NO_ENTRY, ET.SOFT_DISABLE]))
if not CS.canValid:
events.append(create_event('canError', [ET.NO_ENTRY, ET.IMMEDIATE_DISABLE]))
tinklaClient.logCANErrorEvent(source="carcontroller", canMessage=0, additionalInformation="Invalid CAN")
if not sounds_available:
events.append(create_event('soundsUnavailable', [ET.NO_ENTRY, ET.PERMANENT]))

Expand Down
57 changes: 54 additions & 3 deletions selfdrive/messaging.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,16 +162,67 @@ def update_msgs(self, cur_time, msgs):
self.alive[s] = True

def all_alive(self, service_list=None):
  """Returns alive state for tracked processes.

  Args:
    service_list (list): Optional service list. When None, every key of
      `self.alive` is checked.
  Returns:
    tuple: areAllAlive, processName, count
      `processName` and `count` describe the first service found not alive
      (`count` is read from `self.alive_cnt`); they are "" and 0 when every
      service is alive.

  NOTE: the returned tuple is always truthy. Callers must unpack it (or read
  element 0) instead of using the result directly as a boolean.
  """
  if service_list is None:  # check all
    service_list = self.alive.keys()
  for s in service_list:
    if not self.alive[s]:
      # Only the first failing service is reported.
      return (False, s, self.alive_cnt[s])
  return (True, "", 0)

def all_valid(self, service_list=None):
  """Returns valid state for tracked processes.

  Args:
    service_list (list): Optional service list. When None, every key of
      `self.valid` is checked.
  Returns:
    tuple: areAllValid, processName, count
      `processName` and `count` describe the first service found not valid
      (`count` is read from `self.valid_cnt`); they are "" and 0 when every
      service is valid.

  NOTE: the returned tuple is always truthy. Callers must unpack it (or read
  element 0) instead of using the result directly as a boolean.
  """
  if service_list is None:  # check all
    service_list = self.valid.keys()
  for s in service_list:
    if not self.valid[s]:
      # Only the first failing service is reported.
      return (False, s, self.valid_cnt[s])
  return (True, "", 0)

def all_alive_and_valid_with_info(self, service_list=None):
  """Returns alive and valid state for tracked processes.

  Args:
    service_list (list): Optional service list. When None, every key of
      `self.alive` is checked.
  Returns:
    tuple: areAllAlive, areAllValid, aliveProcessName, aliveCount, validProcessName, validCount
  """
  if service_list is None:  # check all
    service_list = self.alive.keys()
  areAllAlive, aliveProcessName, aliveCount = self.all_alive(service_list=service_list)
  areAllValid, validProcessName, validCount = self.all_valid(service_list=service_list)
  # validCount is the sixth element promised by the docstring.
  return (areAllAlive, areAllValid, aliveProcessName, aliveCount, validProcessName, validCount)

def all_alive_and_valid(self, service_list=None):
  """Return True only when every tracked service is both alive and valid.

  Args:
    service_list (list): Optional service list. When None, every key of
      `self.alive` is checked.
  Returns:
    bool: areAllAlive and areAllValid.
  """
  if service_list is None:  # check all
    service_list = self.alive.keys()
  # all_alive()/all_valid() return (flag, processName, count) tuples, which
  # are always truthy, so the flags must be extracted before combining them.
  areAllAlive = self.all_alive(service_list=service_list)[0]
  areAllValid = self.all_valid(service_list=service_list)[0]
  return areAllAlive and areAllValid
62 changes: 50 additions & 12 deletions selfdrive/tinklad/tinkla_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import zmq
import datetime
import tinklad
import time

## For helpers:
import traceback
Expand All @@ -20,10 +21,20 @@ def now_iso8601():
class TinklaClient():
sock = None
pid = None
lastCanErrorTimestamp = 0
lastProcessErrorTimestamp = 0

eventCategoryKeys = tinklad.TinklaInterfaceEventCategoryKeys()
messageTypeKeys = tinklad.TinklaInterfaceMessageKeys()
actions = tinklad.TinklaInterfaceActions()

# Configurable:
# Note: when throttling is enabled, events that arrive within the throttling period are dropped (not queued)
shouldThrottleCanErrorEvents = True
shouldThrottleProcessCommErrorEvents = True
# Set to 30 minutes for now because we are seeing a lot of plan and pathPlan issues.
# Should be lowered to around 1 minute once those issues are resolved.
throttlingPeriodInSeconds = (60*30) # One event every `throttlingPeriodInSeconds`

def start_client(self):
if os.getpid() == self.pid:
Expand Down Expand Up @@ -90,43 +101,70 @@ def attemptToSendPendingMessages(self):

## Helpers:

def logCrashStackTraceEvent(self, openPilotId = None):
  """Log a "crash" event carrying the currently handled exception's traceback.

  Args:
    openPilotId: id stored in the event's `openPilotId` field. Defaults to
      `self.openPilotId` when None.

  The event text is the user info, git info and the output of
  `traceback.format_exc()` with both quote characters replaced by backticks.
  """
  if openPilotId is None:
    openPilotId = self.openPilotId
  event = tinkla.Interface.Event.new_message(
    openPilotId=openPilotId,
    source="n/a",
    category=self.eventCategoryKeys.crash,
    name="crash",
  )
  trace = traceback.format_exc().replace('"', '`').replace("'", '`')
  # NOTE(review): the text below reports self.openPilotId even when the caller
  # passed a different openPilotId for the event field - confirm this is intended.
  userInfo = "User Handle: %s \nOpenPilotId: %s" % (self.userHandle, self.openPilotId)
  gitInfo = "Git Remote: %s\nBranch: %s\nCommit: %s" % (self.gitRemote, self.gitBranch, self.gitHash)
  event.value.textValue="%s\n%s\n%s" % (userInfo, gitInfo, trace)
  self.logUserEvent(event)

def logCANErrorEvent(self, source, canMessage, additionalInformation, openPilotId = None):
  """Log a "CAN Error" event, subject to throttling.

  Args:
    source: name of the component reporting the error; stored as the event source.
    canMessage: integer CAN message id; rendered with `hex()` in the event text.
    additionalInformation: free-form text appended to the event text.
    openPilotId: id stored in the event's `openPilotId` field. Defaults to
      `self.openPilotId` when None.

  When `self.shouldThrottleCanErrorEvents` is set, an event arriving less than
  `self.throttlingPeriodInSeconds` after the last sent one is dropped.
  """
  if self.shouldThrottleCanErrorEvents:
    now = time.time()
    if now - self.lastCanErrorTimestamp < self.throttlingPeriodInSeconds:
      return
    # Record the send time before building the event so that the next
    # call is measured from this one.
    self.lastCanErrorTimestamp = now

  if openPilotId is None:
    openPilotId = self.openPilotId
  event = tinkla.Interface.Event.new_message(
    openPilotId=openPilotId,
    source=source,
    category=self.eventCategoryKeys.canError,
    name="CAN Error",
  )
  canInfo = "Can Message: {0}".format(hex(canMessage))
  userInfo = "User Handle: %s \nOpenPilotId: %s" % (self.userHandle, self.openPilotId)
  gitInfo = "Git Remote: %s\nBranch: %s\nCommit: %s" % (self.gitRemote, self.gitBranch, self.gitHash)
  event.value.textValue="%s\n%s\n%s\n%s" % (userInfo, gitInfo, canInfo, additionalInformation)
  self.logUserEvent(event)

def logProcessCommErrorEvent(self, source, processName, count, eventType, openPilotId = None):
  """Log a "Process Comm Error" event, subject to throttling.

  Args:
    source: name of the component reporting the error; included in the event text.
    processName: name of the failing process; stored as the event source and
      included in the event text.
    count: integer counter for the failing process (formatted with `%d`).
    eventType: short description of the failure, e.g. "Not Alive" or "Not Valid".
    openPilotId: id stored in the event's `openPilotId` field. Defaults to
      `self.openPilotId` when None.

  When `self.shouldThrottleProcessCommErrorEvents` is set, an event arriving
  less than `self.throttlingPeriodInSeconds` after the last sent one is dropped.
  """
  if self.shouldThrottleProcessCommErrorEvents:
    now = time.time()
    if now - self.lastProcessErrorTimestamp < self.throttlingPeriodInSeconds:
      return
    self.lastProcessErrorTimestamp = now

  if openPilotId is None:
    openPilotId = self.openPilotId
  event = tinkla.Interface.Event.new_message(
    openPilotId=openPilotId,
    source=processName,
    category=self.eventCategoryKeys.processCommError,
    name="Process Comm Error",
  )
  additionalInformation = "Process: '%s' \nType: '%s' \nCount: '%d' \nSource: '%s'" % (processName, eventType, count, source)
  userInfo = "User Handle: %s \nOpenPilotId: %s" % (self.userHandle, self.openPilotId)
  gitInfo = "Git Remote: %s\nBranch: %s\nCommit: %s" % (self.gitRemote, self.gitBranch, self.gitHash)
  event.value.textValue="%s\n%s\n%s" % (userInfo, gitInfo, additionalInformation)
  self.logUserEvent(event)

def print_msg(self, message):
  """Print `message` to standard output."""
  print(message)

def __init__(self):
carSettings = CarSettings()
params = Params()
self.dongleId = params.get("DongleId")
self.openPilotId = params.get("DongleId")
self.userHandle = carSettings.userHandle
self.gitRemote = params.get("GitRemote")
self.gitBranch = params.get("GitBranch")
Expand Down
25 changes: 17 additions & 8 deletions selfdrive/tinklad/tinkladTestClient.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,13 @@ class TinklaTestClient():
def __init__(self):
#self.start_server()
self.tinklaClient = TinklaClient()
openPilotId = "test_openpilotId"
source = "tinkladTestClient"
userHandle = "test_user_handle"

info = tinkla.Interface.UserInfo.new_message(
openPilotId="test_openpilotId",
userHandle="test_user_handle",
openPilotId=openPilotId,
userHandle=userHandle,
gitRemote="test_github.com/something",
gitBranch="test_gitbranch",
gitHash="test_123456"
Expand All @@ -24,8 +27,8 @@ def __init__(self):
print("Info Time Elapsed = %d" % (elapsed_time_us))

event = tinkla.Interface.Event.new_message(
openPilotId="test_openpilotId",
source="unittest",
openPilotId=openPilotId,
source=source,
category=self.tinklaClient.eventCategoryKeys.userAction,
name="pull_stalk",
)
Expand All @@ -36,18 +39,24 @@ def __init__(self):
print("Event Time Elapsed = %d" % (elapsed_time_us))

carsettings = CarSettings("./bb_openpilot_config.cfg")
userHandle = carsettings.userHandle
carsettings.userHandle = userHandle
print("userHandle = '%s'" % (userHandle))

print("attemptToSendPendingMessages")
self.tinklaClient.attemptToSendPendingMessages()

print("send crash log")
self.tinklaClient.logCrashStackTraceEvent(dongleId="test_openpilotId")
self.tinklaClient.logCrashStackTraceEvent(openPilotId=openPilotId)

print("send can error")
self.tinklaClient.logCANErrorEvent(canMessage=123, additionalInformation="test can error logging", dongleId="test_openpilotId")
self.tinklaClient.logCANErrorEvent(source=source, canMessage=1, additionalInformation="test can error logging", openPilotId=openPilotId)
time.sleep(1)
self.tinklaClient.logCANErrorEvent(source=source, canMessage=2, additionalInformation="test can error logging", openPilotId=openPilotId)

print("send process comm error")
self.tinklaClient.logProcessCommErrorEvent(source=source, processName="processNameWouldBeHere1", count=10, eventType="Not Alive", openPilotId=openPilotId)
time.sleep(1)
self.tinklaClient.logProcessCommErrorEvent(source=source, processName="processNameWouldBeHere2", count=10, eventType="Not Alive", openPilotId=openPilotId)

if __name__ == "__main__":
TinklaTestClient()

0 comments on commit 6852d99

Please sign in to comment.