-
Notifications
You must be signed in to change notification settings - Fork 166
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[202012] Refactor Pcied and add unittest #199
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
[pytest] | ||
addopts = --cov=scripts --cov-report html --cov-report term --cov-report xml --junitxml=test-results.xml -vv |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -5,31 +5,62 @@ | |||||
PCIe device monitoring daemon for SONiC | ||||||
""" | ||||||
|
||||||
try: | ||||||
import os | ||||||
import signal | ||||||
import sys | ||||||
import threading | ||||||
|
||||||
import swsssdk | ||||||
from sonic_py_common import daemon_base, device_info | ||||||
from swsscommon import swsscommon | ||||||
except ImportError as e: | ||||||
raise ImportError(str(e) + " - required module not found") | ||||||
import os | ||||||
import signal | ||||||
import sys | ||||||
import threading | ||||||
|
||||||
from sonic_py_common import daemon_base, device_info | ||||||
from swsscommon import swsscommon | ||||||
|
||||||
# | ||||||
# Constants ==================================================================== | ||||||
# | ||||||
|
||||||
# TODO: Once we no longer support Python 2, we can eliminate this and get the | ||||||
# name using the 'name' field (e.g., `signal.SIGINT.name`) starting with Python 3.5 | ||||||
SIGNALS_TO_NAMES_DICT = dict((getattr(signal, n), n) | ||||||
for n in dir(signal) if n.startswith('SIG') and '_' not in n) | ||||||
|
||||||
SYSLOG_IDENTIFIER = "pcied" | ||||||
|
||||||
PCIE_RESULT_REGEX = "PCIe Device Checking All Test" | ||||||
PCIE_TABLE_NAME = "PCIE_STATUS" | ||||||
PCIE_DEVICE_TABLE_NAME = "PCIE_DEVICE" | ||||||
|
||||||
PCIE_CONF_FILE = 'pcie.yaml' | ||||||
PCIE_STATUS_TABLE_NAME = "PCIE_DEVICES" | ||||||
|
||||||
PCIED_MAIN_THREAD_SLEEP_SECS = 60 | ||||||
REDIS_HOSTIP = "127.0.0.1" | ||||||
|
||||||
PCIEUTIL_CONF_FILE_ERROR = 1 | ||||||
PCIEUTIL_LOAD_ERROR = 2 | ||||||
|
||||||
platform_pcieutil = None | ||||||
|
||||||
exit_code = 0 | ||||||
|
||||||
# wrapper functions to call the platform api | ||||||
def load_platform_pcieutil(): | ||||||
_platform_pcieutil = None | ||||||
(platform_path, _) = device_info.get_paths_to_platform_and_hwsku_dirs() | ||||||
try: | ||||||
from sonic_platform.pcie import Pcie | ||||||
_platform_pcieutil = Pcie(platform_path) | ||||||
except ImportError as e: | ||||||
self.log_error("Failed to load platform Pcie module. Error : {}".format(str(e)), True) | ||||||
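There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. On master, this line caused pcied to enter the FATAL state (sonic-net/sonic-buildimage#7993): `load_platform_pcieutil()` is a module-level function, so `self` is undefined here and the `except` branch raises a NameError instead of logging the import failure.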
Suggested change
|
||||||
try: | ||||||
from sonic_platform_base.sonic_pcie.pcie_common import PcieUtil | ||||||
_platform_pcieutil = PcieUtil(platform_path) | ||||||
except ImportError as e: | ||||||
self.log_error("Failed to load default PcieUtil module. Error : {}".format(str(e)), True) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
return _platform_pcieutil | ||||||
|
||||||
def read_id_file(device_name): | ||||||
id = None | ||||||
dev_id_path = '/sys/bus/pci/devices/0000:%s/device' % device_name | ||||||
|
||||||
if os.path.exists(dev_id_path): | ||||||
with open(dev_id_path, 'r') as fd: | ||||||
id = fd.read().strip() | ||||||
return id | ||||||
|
||||||
# | ||||||
# Daemon ======================================================================= | ||||||
|
@@ -40,142 +71,145 @@ class DaemonPcied(daemon_base.DaemonBase): | |||||
def __init__(self, log_identifier): | ||||||
super(DaemonPcied, self).__init__(log_identifier) | ||||||
|
||||||
(platform_path, _) = device_info.get_paths_to_platform_and_hwsku_dirs() | ||||||
pciefilePath = os.path.join(platform_path, PCIE_CONF_FILE) | ||||||
if not os.path.exists(pciefilePath): | ||||||
self.log_error("Platform pcie configuration file doesn't exist! Exiting ...") | ||||||
sys.exit("Platform PCIe Configuration file doesn't exist!") | ||||||
|
||||||
self.timeout = PCIED_MAIN_THREAD_SLEEP_SECS | ||||||
self.stop_event = threading.Event() | ||||||
|
||||||
self.state_db = swsssdk.SonicV2Connector(host=REDIS_HOSTIP) | ||||||
self.state_db.connect("STATE_DB") | ||||||
state_db = daemon_base.db_connect("STATE_DB") | ||||||
self.device_table = swsscommon.Table(state_db, PCIE_DEVICE_TABLE_NAME) | ||||||
|
||||||
# Load AER-fields into STATEDB | ||||||
def update_aer_to_statedb(self, device_name, aer_stats): | ||||||
self.state_db = None | ||||||
self.device_table = None | ||||||
self.table = None | ||||||
self.resultInfo = [] | ||||||
self.device_name = None | ||||||
self.aer_stats = {} | ||||||
|
||||||
global platform_pcieutil | ||||||
|
||||||
platform_pcieutil = load_platform_pcieutil() | ||||||
if platform_pcieutil is None: | ||||||
sys.exit(PCIEUTIL_LOAD_ERROR) | ||||||
|
||||||
# Connect to STATE_DB and create pcie device table | ||||||
self.state_db = daemon_base.db_connect("STATE_DB") | ||||||
self.device_table = swsscommon.Table(self.state_db, PCIE_DEVICE_TABLE_NAME) | ||||||
self.status_table = swsscommon.Table(self.state_db, PCIE_STATUS_TABLE_NAME) | ||||||
|
||||||
def __del__(self): | ||||||
if self.device_table: | ||||||
table_keys = self.device_table.getKeys() | ||||||
for tk in table_keys: | ||||||
self.device_table._del(tk) | ||||||
if self.status_table: | ||||||
stable_keys = self.status_table.getKeys() | ||||||
for stk in stable_keys: | ||||||
self.status_table._del(stk) | ||||||
|
||||||
# load aer-fields into statedb | ||||||
def update_aer_to_statedb(self): | ||||||
if self.aer_stats is None: | ||||||
self.log_debug("PCIe device {} has no AER Stats".format(device_name)) | ||||||
return | ||||||
|
||||||
aer_fields = {} | ||||||
|
||||||
for field, value in aer_stats['correctable'].items(): | ||||||
correctable_field = "correctable|" + field | ||||||
aer_fields[correctable_field] = value | ||||||
|
||||||
for field, value in aer_stats['fatal'].items(): | ||||||
fatal_field = "fatal|" + field | ||||||
aer_fields[fatal_field] = value | ||||||
|
||||||
for field, value in aer_stats['non_fatal'].items(): | ||||||
non_fatal_field = "non_fatal|" + field | ||||||
aer_fields[non_fatal_field] = value | ||||||
for key, fv in self.aer_stats.items(): | ||||||
for field, value in fv.items(): | ||||||
key_field = "{}|{}".format(key,field) | ||||||
aer_fields[key_field] = value | ||||||
|
||||||
if aer_fields: | ||||||
formatted_fields = swsscommon.FieldValuePairs(list(aer_fields.items())) | ||||||
self.device_table.set(device_name, formatted_fields) | ||||||
self.device_table.set(self.device_name, formatted_fields) | ||||||
else: | ||||||
self.log_debug("PCIe device {} has no AER attriutes".format(device_name)) | ||||||
self.log_debug("PCIe device {} has no AER attriutes".format(self.device_name)) | ||||||
|
||||||
# Check the PCIe devices | ||||||
def check_pcie_devices(self): | ||||||
try: | ||||||
platform_path, _ = device_info.get_paths_to_platform_and_hwsku_dirs() | ||||||
from sonic_platform_base.sonic_pcie.pcie_common import PcieUtil | ||||||
platform_pcieutil = PcieUtil(platform_path) | ||||||
except ImportError as e: | ||||||
self.log_error("Failed to load default PcieUtil module. Error : {}".format(str(e)), True) | ||||||
raise e | ||||||
|
||||||
resultInfo = platform_pcieutil.get_pcie_check() | ||||||
err = 0 | ||||||
# Check the PCIe AER Stats | ||||||
def check_n_update_pcie_aer_stats(self, Bus, Dev, Fn): | ||||||
self.device_name = "%02x:%02x.%d" % (Bus, Dev, Fn) | ||||||
|
||||||
for item in resultInfo: | ||||||
if item["result"] == "Failed": | ||||||
self.log_warning("PCIe Device: " + item["name"] + " Not Found") | ||||||
err += 1 | ||||||
Id = read_id_file(self.device_name) | ||||||
|
||||||
self.aer_stats = {} | ||||||
if Id is not None: | ||||||
self.device_table.set(self.device_name, [('id', Id)]) | ||||||
self.aer_stats = platform_pcieutil.get_pcie_aer_stats(bus=Bus, dev=Dev, func=Fn) | ||||||
self.update_aer_to_statedb() | ||||||
|
||||||
|
||||||
# Update the PCIe devices status to DB | ||||||
def update_pcie_devices_status_db(self, err): | ||||||
if err: | ||||||
self.update_state_db("PCIE_DEVICES", "status", "FAILED") | ||||||
self.log_error("PCIe device status check : FAILED") | ||||||
pcie_status = "FAILED" | ||||||
self.log_error("PCIe device status check : {}".format(pcie_status)) | ||||||
else: | ||||||
self.update_state_db("PCIE_DEVICES", "status", "PASSED") | ||||||
self.log_info("PCIe device status check : PASSED") | ||||||
pcie_status = "PASSED" | ||||||
self.log_info("PCIe device status check : {}".format(pcie_status)) | ||||||
fvs = swsscommon.FieldValuePairs([ | ||||||
('status', pcie_status) | ||||||
]) | ||||||
|
||||||
# update AER-attributes to DB | ||||||
for item in resultInfo: | ||||||
if item["result"] == "Failed": | ||||||
continue | ||||||
self.status_table.set("status", fvs) | ||||||
|
||||||
Bus = int(item["bus"], 16) | ||||||
Dev = int(item["dev"], 16) | ||||||
Fn = int(item["fn"], 16) | ||||||
# Check the PCIe devices | ||||||
def check_pcie_devices(self): | ||||||
self.resultInfo = platform_pcieutil.get_pcie_check() | ||||||
err = 0 | ||||||
if self.resultInfo is None: | ||||||
return | ||||||
|
||||||
device_name = "%02x:%02x.%d" % (Bus, Dev, Fn) | ||||||
dev_id_path = '/sys/bus/pci/devices/0000:%s/device' % device_name | ||||||
with open(dev_id_path, 'r') as fd: | ||||||
Id = fd.read().strip() | ||||||
for result in self.resultInfo: | ||||||
if result["result"] == "Failed": | ||||||
self.log_warning("PCIe Device: " + result["name"] + " Not Found") | ||||||
err += 1 | ||||||
else: | ||||||
Bus = int(result["bus"], 16) | ||||||
Dev = int(result["dev"], 16) | ||||||
Fn = int(result["fn"], 16) | ||||||
# update AER-attributes to DB | ||||||
self.check_n_update_pcie_aer_stats(Bus, Dev, Fn) | ||||||
|
||||||
self.device_table.set(device_name, [('id', Id)]) | ||||||
aer_stats = platform_pcieutil.get_pcie_aer_stats(bus=Bus, device=Dev, func=Fn) | ||||||
self.update_aer_to_statedb(device_name, aer_stats) | ||||||
# update PCIe Device Status to DB | ||||||
self.update_pcie_devices_status_db(err) | ||||||
|
||||||
def read_state_db(self, key1, key2): | ||||||
return self.state_db.get('STATE_DB', key1, key2) | ||||||
# Override signal handler from DaemonBase | ||||||
def signal_handler(self, sig, frame): | ||||||
FATAL_SIGNALS = [signal.SIGINT, signal.SIGTERM] | ||||||
NONFATAL_SIGNALS = [signal.SIGHUP] | ||||||
|
||||||
def update_state_db(self, key1, key2, value): | ||||||
self.state_db.set('STATE_DB', key1, key2, value) | ||||||
global exit_code | ||||||
|
||||||
# Signal handler | ||||||
def signal_handler(self, sig, frame): | ||||||
if sig == signal.SIGHUP: | ||||||
self.log_info("Caught SIGHUP - ignoring...") | ||||||
elif sig == signal.SIGINT: | ||||||
self.log_info("Caught SIGINT - exiting...") | ||||||
self.stop_event.set() | ||||||
elif sig == signal.SIGTERM: | ||||||
self.log_info("Caught SIGTERM - exiting...") | ||||||
if sig in FATAL_SIGNALS: | ||||||
self.log_info("Caught signal '{}' - exiting...".format(SIGNALS_TO_NAMES_DICT[sig])) | ||||||
exit_code = 128 + sig # Make sure we exit with a non-zero code so that supervisor will try to restart us | ||||||
self.stop_event.set() | ||||||
elif sig in NONFATAL_SIGNALS: | ||||||
self.log_info("Caught signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig])) | ||||||
else: | ||||||
self.log_warning("Caught unhandled signal '" + sig + "'") | ||||||
self.log_warning("Caught unhandled signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig])) | ||||||
|
||||||
# Initialize daemon | ||||||
def init(self): | ||||||
self.log_info("Start daemon init...") | ||||||
|
||||||
# Deinitialize daemon | ||||||
def deinit(self): | ||||||
self.log_info("Start daemon deinit...") | ||||||
|
||||||
# Run daemon | ||||||
# Main daemon logic | ||||||
def run(self): | ||||||
self.log_info("Starting up...") | ||||||
|
||||||
# Start daemon initialization sequence | ||||||
self.init() | ||||||
|
||||||
# Start main loop | ||||||
self.log_info("Start daemon main loop") | ||||||
|
||||||
while not self.stop_event.wait(self.timeout): | ||||||
# Check the Pcie device status | ||||||
self.check_pcie_devices() | ||||||
|
||||||
self.log_info("Stop daemon main loop") | ||||||
if self.stop_event.wait(self.timeout): | ||||||
# We received a fatal signal | ||||||
return False | ||||||
|
||||||
# Start daemon deinitialization sequence | ||||||
self.deinit() | ||||||
|
||||||
self.log_info("Shutting down...") | ||||||
self.check_pcie_devices() | ||||||
|
||||||
return True | ||||||
# | ||||||
# Main ========================================================================= | ||||||
# | ||||||
|
||||||
|
||||||
def main(): | ||||||
pcied = DaemonPcied(SYSLOG_IDENTIFIER) | ||||||
pcied.run() | ||||||
|
||||||
pcied.log_info("Starting up...") | ||||||
|
||||||
while pcied.run(): | ||||||
pass | ||||||
|
||||||
pcied.log_info("Shutting down...") | ||||||
|
||||||
return exit_code | ||||||
|
||||||
if __name__ == '__main__': | ||||||
main() | ||||||
sys.exit(main()) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
[aliases] | ||
test=pytest |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This change is needed for the fixes on lines 48 and 53 to run.