Skip to content

Commit

Permalink
Adding support for persistent storage and retrieval of DPU reboot-cau…
Browse files Browse the repository at this point in the history
…se (#169)

* Adding support for persistent storage and retrieval of DPU reboot-cause

* Added support for persisting dpu reboot-cause on smartswitch host

* Working on coverage

* Working on ut coverage

* working on coverage

* working on coverage

* working on coverage

* working on coverage

* working on coverage

* Fixed a typo

* Working on coverage

* Fixing test failure

* improving coverage

* Improving coverage

* working on coverage

* Modifying reboot-cause workflow to meet multiple smartswitch vendor
hardware implementation requirements

* Fixing the assertions to meet the new change

* Fixed the DB

* Using the common API device_info.get_dpu_list()

* Addressed review comments

* Added new test file tests/process-reboot-cause_test.py

* Added the scripts_path

* Moved setup outside the test class

* Fixed the file name

* Fixing test issues

* Working on UT

* Fixed the number of arguments to load_module_from_source

* addressed review comments

* adding mock for uid

* passing uid arg

* Fixing test failure

* Fixing test failure

* Fixing test failure

* Fixing test failure

* Fixing test failure

* Fixing test failure

* Improving coverage

* Improving coverage

* Improving coverage

* Improving coverage

* Improving coverage

* Improving coverage

* Improving coverage

* Improving coverage

* Improving coverage

* Improving coverage

* Addressed review comments

* Addressed review comments

* Addressed review comments

* Addressed review comments

* Addressed review comments

* Addressed review comments

* Addressed review comments

* Addressed review comments

* Addressed review comments

* Addressed review comments

* Addressed review comments

* Addressed review comments: Using a common function read_reboot_cause_files_and_save_to_db for regular switch and smartswitch

* Working on coverage

* Working on coverage

* Working on coverage

* Working on coverage

* Addressed review comments

* Addressed review comments

* Addressed review comments

* Addressed review comments

* Fixed a test issue

* Fixed a test issue

* Fixed a test issue

* Fixed a test issue

* Fixed a test issue

* Fixed a test issue

* Fixed a test issue

* Fixed a test issue

* Fixed a test issue

* Fixed a test issue

* Fixed a bug

* Did a minor cleanup

* Addressed a review comment
  • Loading branch information
rameshraghupathy authored Feb 9, 2025
1 parent 5e08927 commit bb0a31c
Show file tree
Hide file tree
Showing 4 changed files with 237 additions and 20 deletions.
32 changes: 29 additions & 3 deletions scripts/determine-reboot-cause
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ VERSION = "1.0"
SYSLOG_IDENTIFIER = "determine-reboot-cause"

REBOOT_CAUSE_DIR = "/host/reboot-cause/"
REBOOT_CAUSE_MODULE_DIR = "/host/reboot-cause/module"
REBOOT_CAUSE_HISTORY_DIR = "/host/reboot-cause/history/"
REBOOT_CAUSE_FILE = os.path.join(REBOOT_CAUSE_DIR, "reboot-cause.txt")
PREVIOUS_REBOOT_CAUSE_FILE = os.path.join(REBOOT_CAUSE_DIR, "previous-reboot-cause.json")
Expand Down Expand Up @@ -136,10 +137,10 @@ def find_hardware_reboot_cause():


def get_reboot_cause_dict(previous_reboot_cause, comment, gen_time):
"""Store the key infomation of device reboot into a dictionary by parsing the string in
"""Store the key information of device reboot into a dictionary by parsing the string in
previous_reboot_cause.
If user issused a command to reboot device, then user, command and time will be
If user issued a command to reboot device, then user, command and time will be
stored into a dictionary.
If device was rebooted due to the kernel panic, then the string `Kernel Panic`
Expand Down Expand Up @@ -185,7 +186,7 @@ def determine_reboot_cause():

# The main decision logic of the reboot cause:
# If there is a valid hardware reboot cause indicated by platform API,
# check the software reboot cause to add additional rebot cause.
# check the software reboot cause to add additional reboot cause.
# If there is a reboot cause indicated by /proc/cmdline, and/or warmreboot/fastreboot/softreboot
# the software_reboot_cause which is the content of /hosts/reboot-cause/reboot-cause.txt
# will be treated as the additional reboot cause
Expand All @@ -211,6 +212,27 @@ def determine_reboot_cause():

return previous_reboot_cause, additional_reboot_info

def check_and_create_dpu_dirs():
    """Make sure each DPU has a reboot-cause directory and a history subdirectory.

    For every name returned by device_info.get_dpu_list(), the directory
    REBOOT_CAUSE_MODULE_DIR/<dpu> is created when absent and seeded with a
    reboot-cause.txt containing 'First boot'. The <dpu>/history directory is
    then created when absent.
    """
    for dpu_name in device_info.get_dpu_list():
        module_dir = os.path.join(REBOOT_CAUSE_MODULE_DIR, dpu_name)
        module_history_dir = os.path.join(module_dir, "history")

        if not os.path.exists(module_dir):
            os.makedirs(module_dir)

            # A freshly created DPU directory has no recorded cause yet, so
            # seed it with the 'First boot' marker.
            seed_path = os.path.join(module_dir, 'reboot-cause.txt')
            with open(seed_path, 'w') as seed_file:
                seed_file.write('First boot\n')

        # The history directory is checked independently of the DPU directory.
        if not os.path.exists(module_history_dir):
            os.makedirs(module_history_dir)

def main():
# Configure logger to log all messages INFO level and higher
Expand Down Expand Up @@ -261,6 +283,10 @@ def main():
with open(REBOOT_CAUSE_FILE, "w") as cause_file:
cause_file.write(REBOOT_CAUSE_UNKNOWN)

# Create directories for DPUs in SmartSwitch platforms
if device_info.is_smartswitch():
check_and_create_dpu_dirs()


if __name__ == "__main__":
main()
56 changes: 39 additions & 17 deletions scripts/process-reboot-cause
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@ try:

from swsscommon import swsscommon
from sonic_py_common import syslogger
from sonic_py_common import device_info
except ImportError as err:
raise ImportError("%s - required module not found" % str(err))

VERSION = "1.0"
CHASSIS_SERVER_PORT = 6380

SYSLOG_IDENTIFIER = "process-reboot-cause"

Expand All @@ -28,6 +30,7 @@ USER_ISSUED_REBOOT_CAUSE_REGEX ="User issued \'{}\' command [User: {}, Time: {}]

REBOOT_CAUSE_UNKNOWN = "Unknown"
REBOOT_CAUSE_TABLE_NAME = "REBOOT_CAUSE"
MAX_HISTORY_FILES = 10

REDIS_HOSTIP = "127.0.0.1"
state_db = None
Expand All @@ -37,39 +40,52 @@ sonic_logger = syslogger.SysLogger(SYSLOG_IDENTIFIER)


# ============================= Functions =============================
def read_reboot_cause_files_and_save_to_db(device='npu'):
    """Publish saved reboot-cause history files to Redis and prune old ones.

    The newest MAX_HISTORY_FILES files (by modification time) in the device's
    history directory are parsed as JSON and their 'cause', 'time', 'user'
    and 'comment' fields are written to the database. Any older files in the
    directory are deleted.

    Args:
        device: 'npu' (default) reads REBOOT_CAUSE_HISTORY_DIR and writes to
            STATE_DB on REDIS_HOSTIP under "REBOOT_CAUSE|<gen_time>". Any
            other value is treated as a DPU name: files are read from
            /host/reboot-cause/module/<device>/history and written to
            CHASSIS_STATE_DB on the chassis Redis server under
            "REBOOT_CAUSE|<DEVICE>|<name>".

    Files that are not valid JSON, are not JSON objects, or lack the key
    field ('gen_time' for the NPU, 'name' for a DPU) are logged and skipped.
    A missing history directory is logged and the function returns without
    touching the database contents.
    """
    if device == 'npu':
        db = swsscommon.SonicV2Connector(host=REDIS_HOSTIP)
        table = db.STATE_DB
        history_dir = REBOOT_CAUSE_HISTORY_DIR
        key_field = 'gen_time'
    else:
        db = swsscommon.SonicV2Connector(host="redis_chassis.server", port=CHASSIS_SERVER_PORT)
        table = db.CHASSIS_STATE_DB
        history_dir = os.path.join('/host/reboot-cause/module', device, 'history')
        key_field = 'name'
    db.connect(table)

    # main() only verifies the NPU history directory, so a DPU whose history
    # directory has not been created yet must not abort the whole run.
    if not os.path.exists(history_dir):
        sonic_logger.log_warning("Reboot cause history directory {} does not exist".format(history_dir))
        return

    # Newest first, ordered by file modification time.
    history_files = sorted(
        (os.path.join(history_dir, entry) for entry in os.listdir(history_dir)),
        key=os.path.getmtime,
        reverse=True,
    )

    for path in history_files[:MAX_HISTORY_FILES]:
        if not os.path.isfile(path):
            continue

        with open(path, "r") as cause_file:
            try:
                data = json.load(cause_file)
            except json.decoder.JSONDecodeError as je:
                sonic_logger.log_error("Unable to process reload cause file {}: {}".format(path, je))
                continue

        # Without the key field no database key can be built; skip the file
        # instead of raising KeyError/TypeError.
        if not isinstance(data, dict) or key_field not in data:
            sonic_logger.log_warning("Missing '{}' in reboot-cause file".format(key_field))
            continue

        if device == 'npu':
            _hash = '{}|{}'.format(REBOOT_CAUSE_TABLE_NAME, data['gen_time'])
        else:
            _hash = "{}|{}|{}".format(REBOOT_CAUSE_TABLE_NAME, device.upper(), data['name'])

        for field in ('cause', 'time', 'user', 'comment'):
            db.set(table, _hash, field, data.get(field, ''))

    # Everything beyond the newest MAX_HISTORY_FILES entries is pruned.
    for path in history_files[MAX_HISTORY_FILES:]:
        os.remove(path)


def main():
# Configure logger to log all messages INFO level and higher
sonic_logger.set_min_log_priority(sonic_logger.DEFAULT_LOG_LEVEL)
Expand All @@ -96,9 +112,15 @@ def main():
sonic_logger.log_info("Previous reboot cause: {}".format(previous_reboot_cause))

if os.path.exists(REBOOT_CAUSE_HISTORY_DIR):
# Read the previous reboot cause from saved reboot-cause files and save the previous reboot cause upto 10 entry to the state db
read_reboot_cause_files_and_save_state_db()

# Read the previous npu reboot cause from saved reboot-cause files
# Save the previous npu reboot cause upto 10 entry to the state db
read_reboot_cause_files_and_save_to_db('npu')
# Read the previous dpu reboot cause from saved reboot-cause files
# Save the previous dpu reboot cause upto 10 entry to the state db
if device_info.is_smartswitch():
dpu_list = device_info.get_dpu_list()
for dpu in dpu_list:
read_reboot_cause_files_and_save_to_db(dpu)

if __name__ == "__main__":
main()
37 changes: 37 additions & 0 deletions tests/determine-reboot-cause_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import shutil
import pytest
import json

from swsscommon import swsscommon
from sonic_py_common.general import load_module_from_source
Expand Down Expand Up @@ -33,6 +34,8 @@
determine_reboot_cause_path = os.path.join(scripts_path, 'determine-reboot-cause')
determine_reboot_cause = load_module_from_source('determine_reboot_cause', determine_reboot_cause_path)

# Get the function to create dpu dir
check_and_create_dpu_dirs = determine_reboot_cause.check_and_create_dpu_dirs

PROC_CMDLINE_CONTENTS = """\
BOOT_IMAGE=/image-20191130.52/boot/vmlinuz-4.9.0-11-2-amd64 root=/dev/sda4 rw console=tty0 console=ttyS1,9600n8 quiet net.ifnames=0 biosdevname=0 loop=image-20191130.52/fs.squashfs loopfstype=squashfs apparmor=1 security=apparmor varlog_size=4096 usbcore.autosuspend=-1 module_blacklist=gpio_ich SONIC_BOOT_TYPE=warm"""
Expand Down Expand Up @@ -73,6 +76,8 @@
EXPECTED_KERNEL_PANIC_REBOOT_CAUSE_DICT = {'comment': '', 'gen_time': '2021_3_28_13_48_49', 'cause': 'Kernel Panic', 'user': 'N/A', 'time': 'Sun Mar 28 13:45:12 UTC 2021'}

REBOOT_CAUSE_DIR="host/reboot-cause/"
PLATFORM_JSON_PATH = "/usr/share/sonic/device/test_platform/platform.json"
REBOOT_CAUSE_MODULE_DIR = "/host/reboot-cause/module"

class TestDetermineRebootCause(object):
def test_parse_warmfast_reboot_from_proc_cmdline(self):
Expand Down Expand Up @@ -206,3 +211,35 @@ def test_determine_reboot_cause_main_with_reboot_cause_dir(self):
determine_reboot_cause.main()
assert os.path.exists("host/reboot-cause/reboot-cause.txt") == True
assert os.path.exists("host/reboot-cause/previous-reboot-cause.json") == True

def create_mock_platform_json(self, dpus):
    """Write a platform.json at PLATFORM_JSON_PATH holding only a "DPUS" entry.

    The parent directory is created first if it does not already exist.
    """
    platform_dir = os.path.dirname(PLATFORM_JSON_PATH)
    os.makedirs(platform_dir, exist_ok=True)

    platform_data = {"DPUS": dpus}
    with open(PLATFORM_JSON_PATH, "w") as platform_file:
        json.dump(platform_data, platform_file)

@mock.patch('os.makedirs')
@mock.patch('builtins.open', new_callable=mock.mock_open)
@mock.patch('os.path.exists', side_effect=lambda path: False)
@mock.patch('sonic_py_common.device_info.is_smartswitch', return_value=True)
@mock.patch('sonic_py_common.device_info.get_dpu_list', return_value=["dpu0", "dpu1"])
def test_check_and_create_dpu_dirs(
    self,
    mock_get_dpu_list,
    mock_is_smartswitch,
    mock_exists,
    mock_open,
    mock_makedirs
):
    """With no path existing, every DPU gets its directory, history
    directory and a reboot-cause.txt opened for writing."""
    check_and_create_dpu_dirs()

    for dpu_name in ("dpu0", "dpu1"):
        expected_dir = os.path.join(REBOOT_CAUSE_MODULE_DIR, dpu_name)

        # The DPU directory and its history subdirectory are both created
        mock_makedirs.assert_any_call(expected_dir)
        mock_makedirs.assert_any_call(os.path.join(expected_dir, "history"))

        # reboot-cause.txt is opened for writing inside the DPU directory
        mock_open.assert_any_call(os.path.join(expected_dir, "reboot-cause.txt"), 'w')
132 changes: 132 additions & 0 deletions tests/process-reboot-cause_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import sys
import os
from unittest import TestCase
from unittest.mock import patch, MagicMock, mock_open
from io import StringIO
from sonic_py_common.general import load_module_from_source

# Test double standing in for the real Redis connector
from .mock_connector import MockConnector
import swsscommon

# Replace SonicV2Connector on the top-level swsscommon package with the test double.
# NOTE(review): the script under test does `from swsscommon import swsscommon`,
# i.e. it uses the inner module - confirm this assignment actually reaches it.
# Each test below also patches process_reboot_cause.swsscommon.SonicV2Connector.
swsscommon.SonicV2Connector = MockConnector

# Locate the scripts directory relative to this test file:
# test_path is the directory holding this file, modules_path is its parent.
test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
scripts_path = os.path.join(modules_path, "scripts")
sys.path.insert(0, modules_path)

# Load scripts/process-reboot-cause as the module 'process_reboot_cause' using
# load_module_from_source; the @patch("process_reboot_cause. ...") targets in the
# tests refer to this module name.
process_reboot_cause_path = os.path.join(scripts_path, "process-reboot-cause")
process_reboot_cause = load_module_from_source('process_reboot_cause', process_reboot_cause_path)

# Test cases for the loaded process_reboot_cause module follow
class TestProcessRebootCause(TestCase):
    """Tests for the process-reboot-cause script.

    Every test patches file-system access (open, listdir, isfile, exists,
    getmtime, remove), the SonicV2Connector used by the script, and the
    device_info helpers, so no real files or Redis servers are touched.
    """

    @patch("builtins.open", new_callable=mock_open, read_data='{"cause": "Non-Hardware", "user": "", "comment": "Switch rebooted DPU", "device": "DPU0", "time": "Fri Dec 13 01:12:36 AM UTC 2024", "gen_time": "2024_12_13_01_12_36"}')
    @patch("os.listdir", return_value=["file1.json", "file2.json"])
    @patch("os.path.isfile", return_value=True)
    @patch("os.path.exists", return_value=True)
    @patch("os.path.getmtime", side_effect=lambda path: 1700000000 if "file1.json" in path else 1700001000)
    @patch("os.remove")
    @patch("process_reboot_cause.swsscommon.SonicV2Connector")
    @patch("process_reboot_cause.device_info.is_smartswitch", return_value=False)
    @patch("sys.stdout", new_callable=StringIO)
    @patch("os.geteuid", return_value=0)
    @patch("process_reboot_cause.device_info.get_dpu_list", return_value=["dpu1"])
    def test_process_reboot_cause(self, mock_get_dpu_list, mock_geteuid, mock_stdout, mock_is_smartswitch, mock_connector, mock_remove, mock_getmtime, mock_exists, mock_isfile, mock_listdir, mock_open):
        """main() on a non-smartswitch connects to the database."""
        mock_db = MagicMock()
        mock_connector.return_value = mock_db

        # Simulate running the script
        with patch.object(sys, "argv", ["process-reboot-cause"]):
            process_reboot_cause.main()

        # Verify DB interactions
        mock_db.connect.assert_called()

    @patch("builtins.open", new_callable=mock_open, read_data='{"invalid_json": ')  # Malformed JSON
    @patch("os.listdir", return_value=["file1.json"])
    @patch("os.path.isfile", return_value=True)
    @patch("os.path.exists", return_value=True)
    @patch("os.path.getmtime", side_effect=lambda path: 1700000000 if "file1.json" in path else 1700001000)
    @patch("os.remove")
    @patch("process_reboot_cause.swsscommon.SonicV2Connector")
    @patch("process_reboot_cause.device_info.is_smartswitch", return_value=False)
    @patch("sys.stdout", new_callable=StringIO)
    @patch("os.geteuid", return_value=0)
    @patch("process_reboot_cause.device_info.get_dpu_list", return_value=["dpu1", "dpu2"])
    def test_invalid_json(
        self, mock_get_dpu_list, mock_geteuid, mock_stdout, mock_is_smartswitch,
        mock_connector, mock_remove, mock_getmtime, mock_exists, mock_isfile,
        mock_listdir, mock_open
    ):
        """A malformed history file is skipped: nothing is written or deleted."""
        mock_db = MagicMock()
        mock_connector.return_value = mock_db

        # The function handles the JSON decode error itself, so the call must
        # return normally; any exception escaping here is a real failure.
        process_reboot_cause.read_reboot_cause_files_and_save_to_db('npu')

        mock_db.connect.assert_called()
        mock_db.set.assert_not_called()
        # Only one history file exists, so nothing exceeds the retention limit
        mock_remove.assert_not_called()

    # Test read_reboot_cause_files_and_save_to_db - smartswitch
    @patch("builtins.open", new_callable=mock_open, read_data='{"cause": "Non-Hardware", "user": "admin", "name": "2024_12_13_01_12_36", "comment": "Switch rebooted DPU", "device": "DPU0", "time": "Fri Dec 13 01:12:36 AM UTC 2024"}')
    @patch("os.listdir", return_value=["file1.json"])
    @patch("os.path.isfile", return_value=True)
    @patch("os.path.exists", return_value=True)
    @patch("os.path.getmtime", side_effect=lambda path: 1700000000 if "file1.json" in path else 1700001000)
    @patch("os.remove")
    @patch("process_reboot_cause.swsscommon.SonicV2Connector")
    @patch("process_reboot_cause.device_info.is_smartswitch", return_value=True)
    @patch("sys.stdout", new_callable=StringIO)
    @patch("os.geteuid", return_value=0)
    @patch("process_reboot_cause.device_info.get_dpu_list", return_value=["dpu1"])
    def test_read_reboot_cause_files_and_save_to_db(
        self, mock_get_dpu_list, mock_geteuid, mock_stdout, mock_is_smartswitch,
        mock_connector, mock_remove, mock_getmtime, mock_exists, mock_isfile,
        mock_listdir, mock_open
    ):
        """A valid DPU history file is written to CHASSIS_STATE_DB under
        REBOOT_CAUSE|<DPU>|<name>."""
        mock_db = MagicMock()
        mock_connector.return_value = mock_db

        process_reboot_cause.read_reboot_cause_files_and_save_to_db('dpu1')

        # DPU data goes to the chassis Redis server, not the local one
        mock_connector.assert_called_once_with(
            host="redis_chassis.server",
            port=process_reboot_cause.CHASSIS_SERVER_PORT,
        )
        mock_db.connect.assert_called_once_with(mock_db.CHASSIS_STATE_DB)

        expected_key = "REBOOT_CAUSE|DPU1|2024_12_13_01_12_36"
        expected_fields = {
            'cause': "Non-Hardware",
            'time': "Fri Dec 13 01:12:36 AM UTC 2024",
            'user': "admin",
            'comment': "Switch rebooted DPU",
        }
        for field, value in expected_fields.items():
            mock_db.set.assert_any_call(mock_db.CHASSIS_STATE_DB, expected_key, field, value)
        mock_remove.assert_not_called()

    # Test read_reboot_cause_files_and_save_to_db - smartswitch - name not in data
    @patch("builtins.open", new_callable=mock_open, read_data='{"cause": "Non-Hardware", "user": "admin", "comment": "Switch rebooted DPU", "device": "DPU0", "time": "Fri Dec 13 01:12:36 AM UTC 2024"}')
    @patch("os.listdir", return_value=["file1.json"])
    @patch("os.path.isfile", return_value=True)
    @patch("os.path.exists", return_value=True)
    @patch("os.path.getmtime", side_effect=lambda path: 1700000000 if "file1.json" in path else 1700001000)
    @patch("os.remove")
    @patch("process_reboot_cause.swsscommon.SonicV2Connector")
    @patch("process_reboot_cause.device_info.is_smartswitch", return_value=True)
    @patch("sys.stdout", new_callable=StringIO)
    @patch("os.geteuid", return_value=0)
    @patch("process_reboot_cause.device_info.get_dpu_list", return_value=["dpu1"])
    def test_read_reboot_cause_files_name_not_in_data(
        self, mock_get_dpu_list, mock_geteuid, mock_stdout, mock_is_smartswitch,
        mock_connector, mock_remove, mock_getmtime, mock_exists, mock_isfile,
        mock_listdir, mock_open
    ):
        """A DPU history file without a 'name' field is skipped."""
        mock_db = MagicMock()
        mock_connector.return_value = mock_db

        process_reboot_cause.read_reboot_cause_files_and_save_to_db('dpu1')

        mock_db.connect.assert_called()
        # Without 'name' no database key can be built, so nothing is written
        mock_db.set.assert_not_called()
        mock_remove.assert_not_called()

0 comments on commit bb0a31c

Please sign in to comment.