Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ASIC/SDK health event #44

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion clear/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@
import click
import utilities_common.cli as clicommon
import utilities_common.multi_asic as multi_asic_util
from sonic_py_common import multi_asic
from sonic_py_common.general import getstatusoutput_noshell_pipe
from flow_counter_util.route import exit_if_route_flow_counter_not_support
from utilities_common import util_base
from show.plugins.pbh import read_pbh_counters
from config.plugins.pbh import serialize_pbh_counters
from . import plugins


# This is from the aliases example:
# https://github.com/pallets/click/blob/57c6f09611fc47ca80db0bd010f05998b3c0aa95/examples/aliases/aliases.py
class Config(object):
Expand Down Expand Up @@ -550,6 +550,28 @@ def route(prefix, vrf, namespace):
helper = util_base.UtilHelper()
helper.load_and_register_plugins(plugins, cli)

# ("sonic-clear asic-sdk-health-event")
@cli.command()
@click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False,
              help='Option needed for multi-asic only: provide namespace name',
              type=click.Choice(multi_asic_util.multi_asic_ns_choices()))
@clicommon.pass_db
def asic_sdk_health_event(db, namespace):
    """Clear received ASIC/SDK health events"""
    # db:        object injected by clicommon.pass_db; db.db_clients is indexed by namespace.
    # namespace: optional namespace name; when falsy, every namespace is processed.
    if multi_asic.get_num_asics() > 1:
        namespace_list = multi_asic.get_namespaces_from_linux()
    else:
        namespace_list = [multi_asic.DEFAULT_NAMESPACE]

    for ns in namespace_list:
        # Restrict to the requested namespace when one was given.
        if namespace and namespace != ns:
            continue

        state_db = db.db_clients[ns]
        # Use the per-namespace client's own STATE_DB id for both the scan and the delete.
        # NOTE(review): `or []` guards against connector implementations that return None
        # when no key matches -- confirm against the connector in use.
        for key in state_db.keys(state_db.STATE_DB, "ASIC_SDK_HEALTH_EVENT_TABLE*") or []:
            state_db.delete(state_db.STATE_DB, key)


# Run the click command group only when this file is executed as a script.
if __name__ == '__main__':
    cli()
119 changes: 119 additions & 0 deletions config/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7374,5 +7374,124 @@ def date(date, time):
clicommon.run_command(['timedatectl', 'set-time', date_time])


#
# 'asic-sdk-health-event' group ('config asic-sdk-health-event ...')
#
@config.group()
def asic_sdk_health_event():
    """Configuring asic-sdk-health-event"""
    # Pure click container group: sub-groups and commands attach to it below.


@asic_sdk_health_event.group()
def suppress():
    """Suppress ASIC/SDK health event"""
    # Pure click container group: one sub-command is registered on it per severity.


def handle_asic_sdk_health_suppress(db, severity, category_list, max_events, namespace):
    """Update the SUPPRESS_ASIC_SDK_HEALTH_EVENT entry of one severity.

    Args:
        db: object exposing per-namespace ``cfgdb_clients`` and ``db_clients`` mappings.
        severity: 'fatal', 'warning' or 'notice'; used as the CONFIG_DB entry key.
        category_list: 'none', 'all', or a comma separated list made of
            'software', 'firmware', 'cpu_hw', 'asic_hw'. Falsy when not provided.
        max_events: integer, or None when not provided. A value that is not
            greater than 0 removes a stored 'max_events' field.
        namespace: namespace to restrict the change to; falsy means every namespace.

    The current click context is failed (``ctx.fail``) when neither
    ``category_list`` nor ``max_events`` is provided, when an unknown category
    is given, or when STATE_DB does not report the required capability.
    """
    ctx = click.get_current_context()

    # Validate the arguments up front, before any database is read.
    if not category_list and max_events is None:
        ctx.fail("At least one argument should be provided!")

    suppressed_categories = None
    if category_list:
        categories = {"software", "firmware", "cpu_hw", "asic_hw"}

        if category_list == 'none':
            suppressed_categories = []
        elif category_list == 'all':
            # Sorted so the value written to CONFIG_DB does not depend on set iteration order.
            suppressed_categories = sorted(categories)
        else:
            suppressed_categories = category_list.split(',')

        unsupported_categories = set(suppressed_categories) - categories
        if unsupported_categories:
            ctx.fail("Invalid category(ies): {}".format(unsupported_categories))

    if multi_asic.get_num_asics() > 1:
        namespace_list = multi_asic.get_namespaces_from_linux()
    else:
        namespace_list = [DEFAULT_NAMESPACE]

    # STATE_DB capability field that must be "true" for each severity.
    severity_capabilities = {
        "fatal": "REG_FATAL_ASIC_SDK_HEALTH_CATEGORY",
        "warning": "REG_WARNING_ASIC_SDK_HEALTH_CATEGORY",
        "notice": "REG_NOTICE_ASIC_SDK_HEALTH_CATEGORY"
    }
    entry_name = "SWITCH_CAPABILITY|switch"

    for ns in namespace_list:
        if namespace and namespace != ns:
            continue

        config_db = db.cfgdb_clients[ns]
        state_db = db.db_clients[ns]

        if "true" != state_db.get(state_db.STATE_DB, entry_name, "ASIC_SDK_HEALTH_EVENT"):
            ctx.fail("ASIC/SDK health event is not supported on the platform")

        if "true" != state_db.get(state_db.STATE_DB, entry_name, severity_capabilities[severity]):
            ctx.fail("Suppressing ASIC/SDK health {} event is not supported on the platform".format(severity))

        entry = config_db.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity)
        # Set when a stored field is dropped, so an entry left empty is deleted below.
        need_remove = False

        if category_list:
            if suppressed_categories:
                entry["categories"] = suppressed_categories
            elif entry.get("categories"):
                # 'none' was requested: drop the stored categories.
                entry.pop("categories")
                need_remove = True

        if max_events is not None:
            if max_events > 0:
                entry["max_events"] = max_events
            elif entry.get("max_events"):
                # 0 was requested: drop the stored limit.
                entry.pop("max_events")
                need_remove = True

        if entry:
            config_db.set_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity, entry)
        elif need_remove:
            config_db.set_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity, None)


@suppress.command()
@click.option('--category-list', metavar='<category_list>', type=str, help="Categories to be suppressed")
@click.option('--max-events', metavar='<max_events>', type=click.IntRange(0), help="Maximum number of received events")
@click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False,
              help='Option needed for multi-asic only: provide namespace name',
              type=click.Choice(multi_asic_util.multi_asic_ns_choices()))
@clicommon.pass_db
def fatal(db, category_list, max_events, namespace):
    """Suppress ASIC/SDK health events of severity fatal"""
    # The docstring above is what click shows as this command's help text.
    handle_asic_sdk_health_suppress(db, 'fatal', category_list, max_events, namespace)


@suppress.command()
@click.option('--category-list', metavar='<category_list>', type=str, help="Categories to be suppressed")
@click.option('--max-events', metavar='<max_events>', type=click.IntRange(0), help="Maximum number of received events")
@click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False,
              help='Option needed for multi-asic only: provide namespace name',
              type=click.Choice(multi_asic_util.multi_asic_ns_choices()))
@clicommon.pass_db
def warning(db, category_list, max_events, namespace):
    """Suppress ASIC/SDK health events of severity warning"""
    # The docstring above is what click shows as this command's help text.
    handle_asic_sdk_health_suppress(db, 'warning', category_list, max_events, namespace)


@suppress.command()
@click.option('--category-list', metavar='<category_list>', type=str, help="Categories to be suppressed")
@click.option('--max-events', metavar='<max_events>', type=click.IntRange(0), help="Maximum number of received events")
@click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False,
              help='Option needed for multi-asic only: provide namespace name',
              type=click.Choice(multi_asic_util.multi_asic_ns_choices()))
@clicommon.pass_db
def notice(db, category_list, max_events, namespace):
    """Suppress ASIC/SDK health events of severity notice"""
    # The docstring above is what click shows as this command's help text.
    handle_asic_sdk_health_suppress(db, 'notice', category_list, max_events, namespace)


# Run the click command group only when this file is executed as a script.
if __name__ == '__main__':
    config()
156 changes: 156 additions & 0 deletions doc/Command-Reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@
* [ARP & NDP](#arp--ndp)
* [ARP show commands](#arp-show-commands)
* [NDP show commands](#ndp-show-commands)
* [ASIC SDK health event](#asic-sdk-health-event)
* [ASIC SDK health event config commands](#asic-sdk-health-event-config-commands)
* [ASIC SDK health event show commands](#asic-sdk-health-event-show-commands)
* [ASIC SDK health event clear commands](#asic-sdk-health-event-clear-commands)
* [BFD](#bfd)
* [BFD show commands](#bfd-show-commands)
* [BGP](#bgp)
Expand Down Expand Up @@ -1928,6 +1932,158 @@ This command is used to display: ACL rules, tables and their priority, ACL packe

If the `PACKETS COUNT` and `BYTES COUNT` fields have some numeric value it means that it is a SONiC ACL's and those counters are created in SONiC `COUNTERS_DB`.

## ASIC SDK health event

### ASIC SDK health event config commands

**config asic-sdk-health-event suppress**

This command configures which categories of ASIC/SDK health events are suppressed for a given severity, and the maximum number of events stored for that severity.

- Usage:
```
config asic-sdk-health-event suppress <severity> [--category-list <category-list>|none|all] [--max-events <max-events>] [-n <asicname>]
```

- Parameters:
- severity: Specify the severity of the ASIC/SDK health events to be suppressed. It can be one of `fatal`, `warning`, and `notice`.
- category-list: Specify the categories of the ASIC/SDK health events to be suppressed. It is a comma-separated list whose elements are each one of `software`, `firmware`, `cpu_hw`, `asic_hw`.
If the category-list is `none`, no category is suppressed and events of all categories will be notified for `severity`. In this case, it will not be stored in the CONFIG_DB.
If the category-list is `all`, all the categories are suppressed and no events will be notified for `severity`.
- max-events: Specify the maximum number of events of the severity to be stored in the STATE_DB.
There is no limitation if the max-events is 0. In this case, it will not be stored in the CONFIG_DB.

- Examples:
```
admin@sonic:~$ sudo config asic-sdk-health-event suppress fatal --category-list cpu_hw,software --max-events 10240
```

This command will suppress ASIC/SDK health events whose severity is fatal and whose category is cpu_hw or software. The maximum number of such events stored in the STATE_DB is 10240.

### ASIC SDK health event show commands

**show asic-sdk-health-event received**

This command displays the received ASIC/SDK health events.

- Usage:
```
show asic-sdk-health-event received [-n <asicname>]
```

- Details:
- show asic-sdk-health-event received: Display the ASIC/SDK health events received on all ASICs
- show asic-sdk-health-event received -n asic0: Display all the ASIC/SDK health events received on asic0


- Example:
```
admin@sonic:~$ show asic-sdk-health-event received
Time Severity Category Description
------------------- ----------- --------- -----------------
2023-10-20 05:07:34 fatal firmware Command timeout
2023-10-20 03:06:25 fatal software SDK daemon keep alive failed
2023-10-20 05:07:34 fatal asic_hw Uncorrectable ECC error
2023-10-20 01:58:43 notice asic_hw Correctable ECC error
```

- Example on a multi ASIC system:
```
admin@sonic:~$ show asic-sdk-health-event received
asic0:
Time Severity Category Description
------------------- ----------- --------- -----------------
2023-10-20 05:07:34 fatal firmware Command timeout
2023-10-20 03:06:25 fatal software SDK daemon keep alive failed
asic1:
Time Severity Category Description
------------------- ----------- --------- -----------------
2023-10-20 05:07:34 fatal asic_hw Uncorrectable ECC error
2023-10-20 01:58:43 notice asic_hw Correctable ECC error
```

Optionally, you can specify the ASIC name in order to display the ASIC/SDK health events received on that particular ASIC on a multi ASIC system.

- Example:
```
admin@sonic:~$ show asic-sdk-health-event received -n asic1
asic1:
Time Severity Category Description
------------------- ----------- --------- -----------------
2023-10-20 05:07:34 fatal firmware Command timeout
```

**show asic-sdk-health-event suppress-configuration**

This command displays the suppressed category list and maximum number of events of ASIC/SDK health events.

- Usage:
```
show asic-sdk-health-event suppress-configuration [-n <asicname>]
```

- Details:
- show asic-sdk-health-event suppress-configuration: Display the ASIC/SDK health event suppress category list and maximum number of events on all ASICs
- show asic-sdk-health-event suppress-configuration -n asic0: Display all the ASIC/SDK health event suppress category list and maximum number of events on asic0


- Example:
```
admin@sonic:~$ show asic-sdk-health-event suppress-configuration
Severity Suppressed category-list Max events
---------- -------------------------- ------------
fatal software unlimited
notice none 1024
warning firmware,asic_hw 10240
```

- Example on a multi ASIC system:
```
admin@sonic:~$ show asic-sdk-health-event suppress-configuration
asic0:
Severity Suppressed category-list Max events
---------- -------------------------- ------------
notice none 1024
warning firmware,asic_hw 10240
asic1:
Severity Suppressed category-list Max events
---------- -------------------------- ------------
fatal software unlimited
```

Optionally, you can specify the ASIC name in order to display the ASIC/SDK health event suppress category list and maximum number of events on that particular ASIC on a multi ASIC system.

- Example:
```
admin@sonic:~$ show asic-sdk-health-event suppress-configuration -n asic1
asic1:
Severity Suppressed category-list Max events
---------- -------------------------- ------------
fatal software unlimited
```

### ASIC SDK health event clear commands

**sonic-clear asic-sdk-health-event**

This command clears all the received ASIC/SDK health events.

- Usage:
```
sonic-clear asic-sdk-health-event [-n <asicname>]
```

- Details:
- sonic-clear asic-sdk-health-event: Clear the ASIC/SDK health events received on all ASICs
- sonic-clear asic-sdk-health-event -n asic0: Clear all the ASIC/SDK health events received on asic0


- Example:
```
admin@sonic:~$ sonic-clear asic-sdk-health-event
```

Go Back To [Beginning of the document](#) or [Beginning of this section](#asic-sdk-health-event)

## ARP & NDP

Expand Down
2 changes: 2 additions & 0 deletions scripts/generate_dump
Original file line number Diff line number Diff line change
Expand Up @@ -1869,6 +1869,8 @@ main() {
# 1st counter snapshot early. Need 2 snapshots to make sense of counters trend.
save_counter_snapshot $asic 1

save_cmd "show asic-sdk-health-event received" "asic.sdk.health.event" &

save_cmd "systemd-analyze blame" "systemd.analyze.blame" &
save_cmd "systemd-analyze dump" "systemd.analyze.dump" &
save_cmd "systemd-analyze plot" "systemd.analyze.plot.svg" &
Expand Down
Loading
Loading