Skip to content

Commit

Permalink
Merge pull request #26 from stephenxs/sfp-bit-map-error-status
Browse files Browse the repository at this point in the history
Enhanced - Handle the error status returned by platform APIs
  • Loading branch information
Junchao-Mellanox authored Jun 8, 2021
2 parents 1ad32df + 08ab761 commit f7b5949
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 100 deletions.
49 changes: 5 additions & 44 deletions sonic-xcvrd/tests/test_xcvrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from sonic_py_common import daemon_base
from swsscommon import swsscommon
from sonic_platform_base.sfp_base import SfpBase
from .mock_swsscommon import Table


Expand Down Expand Up @@ -315,46 +316,6 @@ def test_get_media_settings_key(self):
assert result == ['MOLEX-1064141421', 'QSFP+']
# TODO: Ensure that error message was logged

def test_update_port_transceiver_status_table(self):
logical_port_name = "Ethernet0"
status_tbl = Table("STATE_DB", TRANSCEIVER_STATUS_TABLE)
update_port_transceiver_status_table(logical_port_name, status_tbl, SFP_STATUS_INSERTED)
entry = status_tbl.get(logical_port_name)
print(entry[1])
print(entry[0][0])
assert status_tbl.get(logical_port_name)[0][1] == SFP_STATUS_INSERTED
assert status_tbl.get(logical_port_name)[1][1] == 'N/A'

update_port_transceiver_status_table(logical_port_name, status_tbl, SFP_STATUS_REMOVED)
assert status_tbl.get(logical_port_name)[0][1] == SFP_STATUS_REMOVED
assert status_tbl.get(logical_port_name)[1][1] == 'N/A'

error_dict = {
'3': 'SFP_STATUS_ERR_I2C_STUCK',
'5': 'SFP_STATUS_ERR_BAD_EEPROM',
'9': 'SFP_STATUS_ERR_UNSUPPORTED_CABLE',
'17': 'SFP_STATUS_ERR_HIGH_TEMP',
'33': 'SFP_STATUS_ERR_BAD_CABLE'
}

# Test single errors
for error_value, error_msg in error_dict.items():
update_port_transceiver_status_table(logical_port_name, status_tbl, error_value, True)
assert status_tbl.get(logical_port_name)[0][1] == SFP_STATUS_INSERTED
assert status_tbl.get(logical_port_name)[1][1] == error_msg

# Test multiple errors
update_port_transceiver_status_table(logical_port_name, status_tbl, '63', True)
assert status_tbl.get(logical_port_name)[0][1] == SFP_STATUS_INSERTED
error = status_tbl.get(logical_port_name)[1][1]
for error_msg in error_dict.values():
assert error_msg in error

# Test unsupported errors
status_tbl = Table("STATE_DB", TRANSCEIVER_STATUS_TABLE)
update_port_transceiver_status_table(logical_port_name, status_tbl, '1024', True)
assert status_tbl.get(logical_port_name) is None

def test_detect_port_in_error_status(self):
class MockTable:
def get(self, key):
Expand All @@ -364,13 +325,13 @@ def get(self, key):
status_tbl.get = MagicMock(return_value=(True, {'error': 'N/A'}))
assert not detect_port_in_error_status(None, status_tbl)

status_tbl.get = MagicMock(return_value=(True, {'error': 'SFP_STATUS_ERR_I2C_STUCK'}))
status_tbl.get = MagicMock(return_value=(True, {'error': SfpBase.SFP_ERROR_DESCRIPTION_BLOCKING}))
assert detect_port_in_error_status(None, status_tbl)

def test_is_error_sfp_status(self):
error_values = ['3', '5', '9', '17', '33']
error_values = [7, 11, 19, 35]
for error_value in error_values:
assert is_error_block_eeprom_reading(error_value)

assert not is_error_block_eeprom_reading(SFP_STATUS_INSERTED)
assert not is_error_block_eeprom_reading(SFP_STATUS_REMOVED)
assert not is_error_block_eeprom_reading(int(SFP_STATUS_INSERTED))
assert not is_error_block_eeprom_reading(int(SFP_STATUS_REMOVED))
74 changes: 41 additions & 33 deletions sonic-xcvrd/xcvrd/xcvrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,11 +176,13 @@ def _wrapper_get_transceiver_change_event(timeout):
if platform_chassis is not None:
try:
status, events = platform_chassis.get_change_event(timeout)
sfp_events = events['sfp']
return status, sfp_events
sfp_events = events.get('sfp')
sfp_errors = events.get('sfp_error')
return status, sfp_events, sfp_errors
except NotImplementedError:
pass
return platform_sfputil.get_transceiver_change_event(timeout)
status, events = platform_sfputil.get_transceiver_change_event(timeout)
return status, events, None


def _wrapper_get_sfp_type(physical_port):
Expand All @@ -191,6 +193,14 @@ def _wrapper_get_sfp_type(physical_port):
pass
return None


def _wrapper_get_sfp_error_description(physical_port):
    """Return the error description reported by the SFP on ``physical_port``.

    The value comes from ``get_error_description()`` of the SFP object that
    the platform chassis returns for the port.

    Returns None when no platform chassis is available or when the platform
    API raises NotImplementedError.
    """
    if not platform_chassis:
        return None

    try:
        sfp = platform_chassis.get_sfp(physical_port)
        return sfp.get_error_description()
    except NotImplementedError:
        # Platform does not implement the API: treat as "no description".
        return None
# Remove unnecessary unit from the raw data


Expand Down Expand Up @@ -759,22 +769,9 @@ def waiting_time_compensation_with_sleep(time_start, time_to_wait):
# Update port SFP status table on receiving SFP change event


def update_port_transceiver_status_table(logical_port_name, status_tbl, status, has_error=False):
if not has_error:
fvs = swsscommon.FieldValuePairs([('status', status), ('error', 'N/A')])
status_tbl.set(logical_port_name, fvs)
else:
error_list = []
int_status = int(status)
for error_code, error_msg in sfp_status_helper.SFP_STATUS_ERR_DICT.items():
if error_code & int_status:
error_list.append(error_msg)
if error_list:
fvs = swsscommon.FieldValuePairs([('status', str(int_status & 1)), ('error', '|'.join(error_list))])
status_tbl.set(logical_port_name, fvs)
else:
# SFP return unkown event, just ignore for now.
helper_logger.log_warning("Got unknown event {}, ignored".format(status))
def update_port_transceiver_status_table(logical_port_name, status_tbl, status, error_descriptions='N/A'):
    """Store the 'status' and 'error' fields of a port in the status table.

    Args:
        logical_port_name: key under which the entry is written.
        status_tbl: table object exposing ``set(key, field_value_pairs)``.
        status: value written to the 'status' field.
        error_descriptions: value written to the 'error' field; defaults to
            'N/A' when the caller has no error to report.
    """
    field_values = [('status', status), ('error', error_descriptions)]
    status_tbl.set(logical_port_name, swsscommon.FieldValuePairs(field_values))


# Delete port from SFP status table
Expand Down Expand Up @@ -1003,7 +1000,7 @@ def task_worker(self, stopping_event, sfp_error_event, y_cable_presence):
while not stopping_event.is_set():
next_state = state
time_start = time.time()
status, port_dict = _wrapper_get_transceiver_change_event(timeout)
status, port_dict, error_dict = _wrapper_get_transceiver_change_event(timeout)
if not port_dict:
continue
helper_logger.log_debug("Got event {} {} in state {}".format(status, port_dict, state))
Expand Down Expand Up @@ -1083,21 +1080,32 @@ def task_worker(self, stopping_event, sfp_error_event, y_cable_presence):
helper_logger.log_info("Got SFP removed event")
update_port_transceiver_status_table(
logical_port, status_tbl[asic_index], sfp_status_helper.SFP_STATUS_REMOVED)
helper_logger.log_info("receive plug out and pdate port sfp status table.")
helper_logger.log_info("receive plug out and update port sfp status table.")
del_port_sfp_dom_info_from_db(logical_port, int_tbl[asic_index], dom_tbl[asic_index])
else:
helper_logger.log_info("Got SFP Error event")
# Add port to error table to stop accessing eeprom of it
# If the port already in the error table, the stored error code will
# be updated to the new one.
update_port_transceiver_status_table(logical_port, status_tbl[asic_index], value, True)
helper_logger.log_info("receive error update port sfp status table.")
# In this case EEPROM is not accessible, so remove the DOM info
# since it will be outdated if long time no update.
# but will keep the interface info in the DB since it static.
if sfp_status_helper.is_error_block_eeprom_reading(value):
del_port_sfp_dom_info_from_db(logical_port, None, dom_tbl[asic_index])

try:
error_bits = int(value)
helper_logger.log_info("Got SFP error event {}".format(value))

error_descriptions = sfp_status_helper.fetch_generic_error_description(error_bits)

if sfp_status_helper.has_vendor_specific_error(error_bits):
if error_dict:
vendor_specific_error_description = error_dict.get(key)
else:
vendor_specific_error_description = _wrapper_get_sfp_error_description(key)
error_descriptions.append(vendor_specific_error_description)

# Add error info to database
# Any existing error will be replaced by the new one.
update_port_transceiver_status_table(logical_port, status_tbl[asic_index], value, '|'.join(error_descriptions))
helper_logger.log_info("Receive error update port sfp status table.")
# In this case EEPROM is not accessible. The DOM info will be removed since it can be out-of-date.
# The interface info remains in the DB since it is static.
if sfp_status_helper.is_error_block_eeprom_reading(error_bits):
del_port_sfp_dom_info_from_db(logical_port, None, dom_tbl[asic_index])
except (TypeError, ValueError) as e:
logger.log_error("Got unrecognized event {}, ignored".format(value))

# Since ports could be connected to a mux cable, if there is a change event process the change for being on a Y cable Port
y_cable_helper.change_ports_status_for_y_cable_change_event(
Expand Down
46 changes: 24 additions & 22 deletions sonic-xcvrd/xcvrd/xcvrd_utilities/sfp_status_helper.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,37 @@
from sonic_platform_base.sfp_base import SfpBase

# SFP status definition, shall be aligned with the definition in get_change_event() of ChassisBase
SFP_STATUS_REMOVED = '0'
SFP_STATUS_INSERTED = '1'

# SFP error code dictinary, new elements can be added if new errors need to be supported.
SFP_STATUS_ERR_DICT = {
2: 'SFP_STATUS_ERR_I2C_STUCK',
4: 'SFP_STATUS_ERR_BAD_EEPROM',
8: 'SFP_STATUS_ERR_UNSUPPORTED_CABLE',
16: 'SFP_STATUS_ERR_HIGH_TEMP',
32: 'SFP_STATUS_ERR_BAD_CABLE'
}

error_code_block_eeprom_reading = set((error_code for error_code in SFP_STATUS_ERR_DICT.keys()))
error_str_block_eeprom_reading = set((error for error in SFP_STATUS_ERR_DICT.values()))


def is_error_block_eeprom_reading(status):
int_status = int(status)
for error_code in error_code_block_eeprom_reading:
if int_status & error_code:
return True
return False
# Bit masks applied to the SFP event bitmap.
# Bit 0 is covered by neither error mask (presumably the inserted/removed
# state, matching SFP_STATUS_REMOVED/SFP_STATUS_INSERTED -- confirm against
# ChassisBase.get_change_event()).
SFP_ERRORS_BLOCKING_MASK = 0x02                # bit 1: error that blocks EEPROM reading
SFP_ERRORS_GENERIC_MASK = 0x0000FFFE           # bits 1-15: generic errors
SFP_ERRORS_VENDOR_SPECIFIC_MASK = 0xFFFF0000   # bits 16-31: vendor specific errors


def is_error_block_eeprom_reading(error_bits):
    """Tell whether the event bitmap carries the EEPROM-blocking error bit.

    Args:
        error_bits: event bitmap as an int, or as a decimal string such as
            '3'. Strings are accepted because the previous implementation
            coerced its argument with ``int()``.

    Returns:
        True if any bit of SFP_ERRORS_BLOCKING_MASK is set, False otherwise.

    Raises:
        ValueError: if ``error_bits`` is a string that is not a valid integer.
        TypeError: if ``error_bits`` cannot be converted to an integer.
    """
    return (int(error_bits) & SFP_ERRORS_BLOCKING_MASK) != 0


def has_vendor_specific_error(error_bits):
    """Tell whether the event bitmap carries any vendor specific error bit.

    Args:
        error_bits: event bitmap as an int or as a decimal string.

    Returns:
        True if any bit of SFP_ERRORS_VENDOR_SPECIFIC_MASK is set, False
        otherwise.

    Raises:
        ValueError: if ``error_bits`` is a string that is not a valid integer.
        TypeError: if ``error_bits`` cannot be converted to an integer.
    """
    return (int(error_bits) & SFP_ERRORS_VENDOR_SPECIFIC_MASK) != 0


def fetch_generic_error_description(error_bits):
    """Translate the generic error bits of an event bitmap into descriptions.

    Only bits selected by SFP_ERRORS_GENERIC_MASK are considered. Each bit
    found in SfpBase.SFP_ERROR_BIT_TO_DESCRIPTION_DICT contributes its
    description, in the dictionary's iteration order.

    Args:
        error_bits: integer event bitmap.

    Returns:
        A list of description entries; empty when no generic error bit is set.
    """
    masked_bits = error_bits & SFP_ERRORS_GENERIC_MASK
    if not masked_bits:
        return []

    return [
        description
        for bit, description in SfpBase.SFP_ERROR_BIT_TO_DESCRIPTION_DICT.items()
        if bit & masked_bits
    ]


def detect_port_in_error_status(logical_port_name, status_tbl):
    """Tell whether a port is recorded with the EEPROM-blocking error.

    Args:
        logical_port_name: key looked up in ``status_tbl``.
        status_tbl: table object whose ``get(key)`` returns a
            ``(found, field_value_pairs)`` tuple.

    Returns:
        True if the entry exists and its 'error' field contains
        SfpBase.SFP_ERROR_DESCRIPTION_BLOCKING; False if the entry is
        missing, has no 'error' field, or the field lacks that description.
    """
    rec, fvp = status_tbl.get(logical_port_name)
    if not rec:
        return False

    # An entry without an 'error' field yields None from .get(); fall back to
    # an empty string so the substring test cannot raise TypeError.
    error = dict(fvp).get('error') or ''
    return SfpBase.SFP_ERROR_DESCRIPTION_BLOCKING in error

10 changes: 9 additions & 1 deletion sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,11 +419,19 @@ def change_ports_status_for_y_cable_change_event(port_dict, y_cable_presence, st
helper_logger.log_info("Got SFP inserted event")
check_identifier_presence_and_update_mux_table_entry(
state_db, port_tbl, y_cable_tbl, static_tbl, mux_tbl, asic_index, logical_port_name, y_cable_presence)
elif value == sfp_status_helper.SFP_STATUS_REMOVED or sfp_status_helper.is_error_block_eeprom_reading(value):
elif value == sfp_status_helper.SFP_STATUS_REMOVED:
check_identifier_presence_and_delete_mux_table_entry(
state_db, port_tbl, asic_index, logical_port_name, y_cable_presence, delete_change_event)

else:
try:
# Now that the value is in bitmap format, let's convert it to number
event_bits = int(value)
if sfp_status_helper.is_error_block_eeprom_reading(event_bits):
check_identifier_presence_and_delete_mux_table_entry(
state_db, port_tbl, asic_index, logical_port_name, y_cable_presence, delete_change_event)
except:
pass
# SFP returned an unknown event; ignore it for now.
helper_logger.log_warning("Got unknown event {}, ignored".format(value))
continue
Expand Down

0 comments on commit f7b5949

Please sign in to comment.