Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[xcvrd] Add bitmap support for SFP error event #184

Merged
merged 11 commits into from
Jun 22, 2021
41 changes: 29 additions & 12 deletions sonic-xcvrd/tests/test_xcvrd.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
import os
import sys
import subprocess

import pytest
import unittest
from imp import load_source
if sys.version_info >= (3, 3):
from unittest.mock import MagicMock, patch
else:
from mock import MagicMock, patch

from sonic_py_common import daemon_base
from swsscommon import swsscommon
from sonic_platform_base.sfp_base import SfpBase
from .mock_swsscommon import Table


Expand All @@ -24,13 +22,12 @@
test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
scripts_path = os.path.join(modules_path, "xcvrd")
helper_file_path = os.path.join(scripts_path, "xcvrd_utilities"+"/y_cable_helper.py")
sys.path.insert(0, modules_path)

os.environ["XCVRD_UNIT_TESTING"] = "1"
load_source('y_cable_helper', scripts_path + '/xcvrd_utilities/y_cable_helper.py')
from y_cable_helper import *
from xcvrd.xcvrd import *
from xcvrd.xcvrd_utilities.y_cable_helper import *
from xcvrd.xcvrd_utilities.sfp_status_helper import *


class TestXcvrdScript(object):
Expand Down Expand Up @@ -219,9 +216,9 @@ def test_init_port_sfp_status_tbl(self):
init_port_sfp_status_tbl(stop_event)

@patch('xcvrd.xcvrd_utilities.y_cable_helper.y_cable_platform_sfputil', MagicMock(return_value=[0]))
@patch('y_cable_helper.logical_port_name_to_physical_port_list', MagicMock(return_value=[0]))
@patch('y_cable_helper._wrapper_get_presence', MagicMock(return_value=True))
@patch('y_cable_helper.get_muxcable_info', MagicMock(return_value={'tor_active': 'self',
@patch('xcvrd.xcvrd_utilities.y_cable_helper.logical_port_name_to_physical_port_list', MagicMock(return_value=[0]))
@patch('xcvrd.xcvrd_utilities.y_cable_helper._wrapper_get_presence', MagicMock(return_value=True))
@patch('xcvrd.xcvrd_utilities.y_cable_helper.get_muxcable_info', MagicMock(return_value={'tor_active': 'self',
'mux_direction': 'self',
'manual_switch_count': '7',
'auto_switch_count': '71',
Expand Down Expand Up @@ -258,9 +255,9 @@ def test_post_port_mux_info_to_db(self):
assert(rc != -1)

@patch('xcvrd.xcvrd_utilities.y_cable_helper.y_cable_platform_sfputil', MagicMock(return_value=[0]))
@patch('y_cable_helper.logical_port_name_to_physical_port_list', MagicMock(return_value=[0]))
@patch('y_cable_helper._wrapper_get_presence', MagicMock(return_value=True))
@patch('y_cable_helper.get_muxcable_static_info', MagicMock(return_value={'read_side': 'self',
@patch('xcvrd.xcvrd_utilities.y_cable_helper.logical_port_name_to_physical_port_list', MagicMock(return_value=[0]))
@patch('xcvrd.xcvrd_utilities.y_cable_helper._wrapper_get_presence', MagicMock(return_value=True))
@patch('xcvrd.xcvrd_utilities.y_cable_helper.get_muxcable_static_info', MagicMock(return_value={'read_side': 'self',
'nic_lane1_precursor1': '1',
'nic_lane1_precursor2': '-7',
'nic_lane1_maincursor': '-1',
Expand Down Expand Up @@ -318,3 +315,23 @@ def test_get_media_settings_key(self):
result = get_media_settings_key(0, xcvr_info_dict)
assert result == ['MOLEX-1064141421', 'QSFP+']
# TODO: Ensure that error message was logged

def test_detect_port_in_error_status(self):
class MockTable:
def get(self, key):
pass

status_tbl = MockTable()
status_tbl.get = MagicMock(return_value=(True, {'error': 'N/A'}))
assert not detect_port_in_error_status(None, status_tbl)

status_tbl.get = MagicMock(return_value=(True, {'error': SfpBase.SFP_ERROR_DESCRIPTION_BLOCKING}))
assert detect_port_in_error_status(None, status_tbl)

def test_is_error_sfp_status(self):
error_values = [7, 11, 19, 35]
for error_value in error_values:
assert is_error_block_eeprom_reading(error_value)

assert not is_error_block_eeprom_reading(int(SFP_STATUS_INSERTED))
assert not is_error_block_eeprom_reading(int(SFP_STATUS_REMOVED))
109 changes: 51 additions & 58 deletions sonic-xcvrd/xcvrd/xcvrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
import threading
import time

from enum import Enum
from sonic_py_common import daemon_base, device_info, logger
from sonic_py_common import multi_asic
from swsscommon import swsscommon

from .xcvrd_utilities import sfp_status_helper
from .xcvrd_utilities import y_cable_helper
except ImportError as e:
raise ImportError(str(e) + " - required module not found")
Expand All @@ -43,18 +43,6 @@
TIME_FOR_SFP_READY_SECS = 1
XCVRD_MAIN_THREAD_SLEEP_SECS = 60

# SFP status definition, shall be aligned with the definition in get_change_event() of ChassisBase
SFP_STATUS_REMOVED = '0'
SFP_STATUS_INSERTED = '1'

# SFP error code enum, new elements can be added to the enum if new errors need to be supported.
SFP_STATUS_ERR_ENUM = Enum('SFP_STATUS_ERR_ENUM', ['SFP_STATUS_ERR_I2C_STUCK', 'SFP_STATUS_ERR_BAD_EEPROM',
'SFP_STATUS_ERR_UNSUPPORTED_CABLE', 'SFP_STATUS_ERR_HIGH_TEMP',
'SFP_STATUS_ERR_BAD_CABLE'], start=2)

# Convert the error code to string and store them in a set for convenience
errors_block_eeprom_reading = set(str(error_code.value) for error_code in SFP_STATUS_ERR_ENUM)

EVENT_ON_ALL_SFP = '-1'
# events definition
SYSTEM_NOT_READY = 'system_not_ready'
Expand Down Expand Up @@ -188,11 +176,13 @@ def _wrapper_get_transceiver_change_event(timeout):
if platform_chassis is not None:
try:
status, events = platform_chassis.get_change_event(timeout)
sfp_events = events['sfp']
return status, sfp_events
sfp_events = events.get('sfp')
sfp_errors = events.get('sfp_error')
return status, sfp_events, sfp_errors
except NotImplementedError:
pass
return platform_sfputil.get_transceiver_change_event(timeout)
status, events = platform_sfputil.get_transceiver_change_event(timeout)
return status, events, None


def _wrapper_get_sfp_type(physical_port):
Expand All @@ -203,6 +193,14 @@ def _wrapper_get_sfp_type(physical_port):
pass
return None


def _wrapper_get_sfp_error_description(physical_port):
if platform_chassis:
try:
return platform_chassis.get_sfp(physical_port).get_error_description()
except NotImplementedError:
pass
return None
# Remove unnecessary unit from the raw data


Expand Down Expand Up @@ -540,7 +538,7 @@ def recover_missing_sfp_table_entries(sfp_util, int_tbl, status_tbl, stop_event)
continue

keys = int_tbl[asic_index].getKeys()
if logical_port_name not in keys and not detect_port_in_error_status(logical_port_name, status_tbl[asic_index]):
if logical_port_name not in keys and not sfp_status_helper.detect_port_in_error_status(logical_port_name, status_tbl[asic_index]):
post_port_sfp_info_to_db(logical_port_name, int_tbl[asic_index], transceiver_dict, stop_event)


Expand Down Expand Up @@ -771,30 +769,17 @@ def waiting_time_compensation_with_sleep(time_start, time_to_wait):
# Update port SFP status table on receiving SFP change event


def update_port_transceiver_status_table(logical_port_name, status_tbl, status):
fvs = swsscommon.FieldValuePairs([('status', status)])
def update_port_transceiver_status_table(logical_port_name, status_tbl, status, error_descriptions='N/A'):
fvs = swsscommon.FieldValuePairs([('status', status), ('error', error_descriptions)])
status_tbl.set(logical_port_name, fvs)


# Delete port from SFP status table


def delete_port_from_status_table(logical_port_name, status_tbl):
status_tbl._del(logical_port_name)

# Check whether port in error status


def detect_port_in_error_status(logical_port_name, status_tbl):
rec, fvp = status_tbl.get(logical_port_name)
if rec:
status_dict = dict(fvp)
if status_dict['status'] in errors_block_eeprom_reading:
return True
else:
return False
else:
return False

# Init TRANSCEIVER_STATUS table


Expand Down Expand Up @@ -824,16 +809,16 @@ def init_port_sfp_status_tbl(stop_event=threading.Event()):
physical_port_list = logical_port_name_to_physical_port_list(logical_port_name)
if physical_port_list is None:
helper_logger.log_error("No physical ports found for logical port '{}'".format(logical_port_name))
update_port_transceiver_status_table(logical_port_name, status_tbl[asic_index], SFP_STATUS_REMOVED)
update_port_transceiver_status_table(logical_port_name, status_tbl[asic_index], sfp_status_helper.SFP_STATUS_REMOVED)

for physical_port in physical_port_list:
if stop_event.is_set():
break

if not _wrapper_get_presence(physical_port):
update_port_transceiver_status_table(logical_port_name, status_tbl[asic_index], SFP_STATUS_REMOVED)
update_port_transceiver_status_table(logical_port_name, status_tbl[asic_index], sfp_status_helper.SFP_STATUS_REMOVED)
else:
update_port_transceiver_status_table(logical_port_name, status_tbl[asic_index], SFP_STATUS_INSERTED)
update_port_transceiver_status_table(logical_port_name, status_tbl[asic_index], sfp_status_helper.SFP_STATUS_INSERTED)

#
# Helper classes ===============================================================
Expand Down Expand Up @@ -872,7 +857,7 @@ def task_worker(self, y_cable_presence):
logger.log_warning("Got invalid asic index for {}, ignored".format(logical_port_name))
continue

if not detect_port_in_error_status(logical_port_name, status_tbl[asic_index]):
if not sfp_status_helper.detect_port_in_error_status(logical_port_name, status_tbl[asic_index]):
post_port_dom_info_to_db(logical_port_name, dom_tbl[asic_index], self.task_stopping_event)
post_port_dom_threshold_info_to_db(logical_port_name, dom_tbl[asic_index], self.task_stopping_event)
if y_cable_presence[0] is True:
Expand Down Expand Up @@ -1015,7 +1000,7 @@ def task_worker(self, stopping_event, sfp_error_event, y_cable_presence):
while not stopping_event.is_set():
next_state = state
time_start = time.time()
status, port_dict = _wrapper_get_transceiver_change_event(timeout)
status, port_dict, error_dict = _wrapper_get_transceiver_change_event(timeout)
if not port_dict:
continue
helper_logger.log_debug("Got event {} {} in state {}".format(status, port_dict, state))
Expand Down Expand Up @@ -1075,11 +1060,11 @@ def task_worker(self, stopping_event, sfp_error_event, y_cable_presence):
logger.log_warning("Got invalid asic index for {}, ignored".format(logical_port))
continue

if value == SFP_STATUS_INSERTED:
if value == sfp_status_helper.SFP_STATUS_INSERTED:
helper_logger.log_info("Got SFP inserted event")
# A plugin event will clear the error state.
update_port_transceiver_status_table(
logical_port, status_tbl[asic_index], SFP_STATUS_INSERTED)
logical_port, status_tbl[asic_index], sfp_status_helper.SFP_STATUS_INSERTED)
helper_logger.log_info("receive plug in and update port sfp status table.")
rc = post_port_sfp_info_to_db(logical_port, int_tbl[asic_index], transceiver_dict)
# If we didn't get the sfp info, assuming the eeprom is not ready, give a try again.
Expand All @@ -1091,28 +1076,36 @@ def task_worker(self, stopping_event, sfp_error_event, y_cable_presence):
post_port_dom_threshold_info_to_db(logical_port, dom_tbl[asic_index])
notify_media_setting(logical_port, transceiver_dict, app_port_tbl[asic_index])
transceiver_dict.clear()
elif value == SFP_STATUS_REMOVED:
elif value == sfp_status_helper.SFP_STATUS_REMOVED:
helper_logger.log_info("Got SFP removed event")
update_port_transceiver_status_table(
logical_port, status_tbl[asic_index], SFP_STATUS_REMOVED)
helper_logger.log_info("receive plug out and pdate port sfp status table.")
logical_port, status_tbl[asic_index], sfp_status_helper.SFP_STATUS_REMOVED)
helper_logger.log_info("receive plug out and update port sfp status table.")
del_port_sfp_dom_info_from_db(logical_port, int_tbl[asic_index], dom_tbl[asic_index])
elif value in errors_block_eeprom_reading:
helper_logger.log_info("Got SFP Error event")
# Add port to error table to stop accessing eeprom of it
# If the port already in the error table, the stored error code will
# be updated to the new one.
update_port_transceiver_status_table(logical_port, status_tbl[asic_index], value)
helper_logger.log_info("receive error update port sfp status table.")
# In this case EEPROM is not accessible, so remove the DOM info
# since it will be outdated if long time no update.
# but will keep the interface info in the DB since it static.
del_port_sfp_dom_info_from_db(logical_port, None, dom_tbl[asic_index])

else:
# SFP return unkown event, just ignore for now.
helper_logger.log_warning("Got unknown event {}, ignored".format(value))
continue
try:
error_bits = int(value)
helper_logger.log_info("Got SFP error event {}".format(value))

error_descriptions = sfp_status_helper.fetch_generic_error_description(error_bits)

if sfp_status_helper.has_vendor_specific_error(error_bits):
if error_dict:
vendor_specific_error_description = error_dict.get(key)
else:
vendor_specific_error_description = _wrapper_get_sfp_error_description(key)
error_descriptions.append(vendor_specific_error_description)

# Add error info to database
# Any existing error will be replaced by the new one.
update_port_transceiver_status_table(logical_port, status_tbl[asic_index], value, '|'.join(error_descriptions))
helper_logger.log_info("Receive error update port sfp status table.")
# In this case EEPROM is not accessible. The DOM info will be removed since it can be out-of-date.
# The interface info remains in the DB since it is static.
if sfp_status_helper.is_error_block_eeprom_reading(error_bits):
del_port_sfp_dom_info_from_db(logical_port, None, dom_tbl[asic_index])
except (TypeError, ValueError) as e:
logger.log_error("Got unrecognized event {}, ignored".format(value))

# Since ports could be connected to a mux cable, if there is a change event process the change for being on a Y cable Port
y_cable_helper.change_ports_status_for_y_cable_change_event(
Expand Down
37 changes: 37 additions & 0 deletions sonic-xcvrd/xcvrd/xcvrd_utilities/sfp_status_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from sonic_platform_base.sfp_base import SfpBase

# SFP status definition, shall be aligned with the definition in get_change_event() of ChassisBase
SFP_STATUS_REMOVED = '0'
SFP_STATUS_INSERTED = '1'

# SFP error code dictinary, new elements can be added if new errors need to be supported.
SFP_ERRORS_BLOCKING_MASK = 0x02
SFP_ERRORS_GENERIC_MASK = 0x0000FFFE
SFP_ERRORS_VENDOR_SPECIFIC_MASK = 0xFFFF0000

def is_error_block_eeprom_reading(error_bits):
return 0 != (error_bits & SFP_ERRORS_BLOCKING_MASK)


def has_vendor_specific_error(error_bits):
return 0 != (error_bits & SFP_ERRORS_VENDOR_SPECIFIC_MASK)


def fetch_generic_error_description(error_bits):
generic_error_bits = (error_bits & SFP_ERRORS_GENERIC_MASK)
error_descriptions = []
if generic_error_bits:
for error_bit, error_description in SfpBase.SFP_ERROR_BIT_TO_DESCRIPTION_DICT.items():
if error_bit & generic_error_bits:
error_descriptions.append(error_description)
return error_descriptions


def detect_port_in_error_status(logical_port_name, status_tbl):
rec, fvp = status_tbl.get(logical_port_name)
if rec:
status_dict = dict(fvp)
error = status_dict.get('error')
return SfpBase.SFP_ERROR_DESCRIPTION_BLOCKING in error
return False

Loading