Skip to content

Commit

Permalink
Add NIM saved views and logs pipeline (#19113)
Browse files Browse the repository at this point in the history
* Create Nvidia NIM scaffolding

* Add Initial Release changelog

* sync models and config

* Add metadata and tests

* Add Readme

* nvidia dash (#19074)

* nvidia dash

* nits

* more nits

* nit

* validate-assets fixes

* remove astericks in README hyperlink ref

* Add NIM saved views and logs pipeline

* Fix add saved view to manifest.json

* yaml fixes

* Fix metric_id and log parsing

* use notSpace instead of data

---------

Co-authored-by: Steven Yuen <steven.yuen@datadoghq.com>
  • Loading branch information
Kyle-Neale and steveny91 authored Dec 11, 2024
1 parent 76b7835 commit 33be8d2
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 0 deletions.
47 changes: 47 additions & 0 deletions nvidia_nim/assets/logs/nvidia_nim.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Log pipeline definition for the NVIDIA NIM integration (Datadog logs asset).
# Declares one grok parser covering the four observed NIM log line shapes,
# then remaps the parsed fields onto the reserved message/date/status
# attributes of the log.
id: nvidia_nim
# NOTE(review): hyphenated while `id` above is underscored — presumably
# intentional (commit history says metric_id was fixed deliberately);
# confirm against the integration's metric prefix before changing.
metric_id: nvidia-nim
backend_only: false
# No custom facets are defined; the key is kept (null value) for schema shape.
facets:
pipeline:
  type: pipeline
  name: 'NVIDIA NIM'
  enabled: true
  filter:
    query: source:nvidia_nim
  processors:
    # One grok parser handles all log formats; each sample below exercises
    # one of the match rules.
    - type: grok-parser
      name: Parse timestamp, level, logger, and message
      enabled: true
      source: message
      samples:
        - "2024-10-30 21:56:25,295 [INFO] PyTorch version 2.3.1 available."
        - "2024-10-30 21:58:26,914 [WARNING] [TRT-LLM] [W] Logger level already set from environment. Discard new verbosity: error"
        - "INFO 2024-10-30 21:56:28.831 ngc_injector.py:152] Valid profile: e45b4b991bbc51d0df3ce53e87060fc3a7f76555406ed534a8479c6faa706987 (tensorrt_llm-a10g-bf16-tp4-latency) on GPUs [0, 1, 2, 3]"
        - "WARNING 2024-10-30 21:58:27.670 arg_utils.py:775] Chunked prefill is enabled by default for models with max_model_len > 32K. Currently, chunked prefill might not work with some features or models. If you encounter any issues, please disable chunked prefill by setting --enable-chunked-prefill=False."
        - "[1730325496.647520] [dd317ab0670e:126 :0] parser.c:2305 UCX WARN (set UCX_WARN_UNUSED_ENV_VARS=n to suppress this warning)"
      grok:
        # Rules are tried top to bottom; the first match wins.
        #   nvidia_nim        -> "<ts> [LEVEL] [component] [x] msg"  (TRT-LLM tagged)
        #   nvidia_nim_logger -> "LEVEL <ts> file.py:line] msg"      (NIM logger style)
        #   generic_log       -> "<ts> [LEVEL] msg"
        #   component_log     -> "[epoch] [container:pid :tid] file:line COMP LEVEL (msg)"  (UCX style)
        # (rule name fixed: was misspelled "componont_log")
        matchRules: |
          nvidia_nim %{date("yyyy-MM-dd HH:mm:ss,SSS"):timestamp} \[%{_level}\] \[%{notSpace:component_name}\] \[%{word}\] %{_msg}
          nvidia_nim_logger %{_level} %{date("yyyy-MM-dd HH:mm:ss.SSS"):timestamp} %{_logger_name}:%{_logger_line}\] %{_msg}
          generic_log %{date("yyyy-MM-dd HH:mm:ss,SSS"):timestamp} \[%{_level}\] %{_msg}
          component_log \[%{number:timestamp}\]\W+\[%{notSpace:container_id}:%{number:pid}\W+:%{number:thread_id}\W+%{_logger_name}:%{_logger_line} %{word:component_name}\W+%{_level}\W+\(%{_msg}\)
        supportRules: |
          _logger_line %{notSpace:logger.line}
          _logger_name %{notSpace:logger.name}
          _level %{word:level}
          _msg %{data:msg}
    # Promote the grok-extracted fields to the reserved log attributes.
    - type: message-remapper
      name: Define `msg` as the official message of the log
      enabled: true
      sources:
        - msg
    - type: date-remapper
      name: Define `timestamp` as the official date of the log
      enabled: true
      sources:
        - timestamp
    - type: status-remapper
      name: Define `level` as the official status of the log
      enabled: true
      sources:
        - level
58 changes: 58 additions & 0 deletions nvidia_nim/assets/logs/nvidia_nim_tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Pipeline test cases for the NVIDIA NIM log pipeline (nvidia_nim.yaml).
# Each entry runs `sample` through the pipeline and asserts the parsed
# attributes in `result`. "source:LOGS_SOURCE" is the standard placeholder
# tag substituted by the asset test harness.
# NOTE(review): the UCX "component_log" sample from the pipeline has no
# test case here — consider adding one.
id: "nvidia_nim"
tests:
  # generic_log rule: "<ts> [LEVEL] msg". This log sample satisfies the validation.
  - sample: |-
      2024-10-30 21:56:25,295 [INFO] PyTorch version 2.3.1 available.
    result:
      custom:
        level: "INFO"
        timestamp: 1730325385295
      message: "PyTorch version 2.3.1 available."
      status: "info"
      tags:
        - "source:LOGS_SOURCE"
      timestamp: 1730325385295
  # nvidia_nim rule: "<ts> [LEVEL] [component] [x] msg" (TRT-LLM tagged).
  - sample: |-
      2024-10-30 21:58:26,914 [WARNING] [TRT-LLM] [W] Logger level already set from environment. Discard new verbosity: error
    result:
      custom:
        level: "WARNING"
        timestamp: 1730325506914
        component_name: "TRT-LLM"
      message: "Logger level already set from environment. Discard new verbosity: error"
      status: "warn"
      tags:
        - "source:LOGS_SOURCE"
      timestamp: 1730325506914
  # nvidia_nim_logger rule: "LEVEL <ts> file.py:line] msg".
  - sample: |-
      INFO 2024-10-30 21:56:28.831 ngc_injector.py:152] Valid profile: e45b4b991bbc51d0df3ce53e87060fc3a7f76555406ed534a8479c6faa706987 (tensorrt_llm-a10g-bf16-tp4-latency) on GPUs [0, 1, 2, 3]
    result:
      custom:
        level: "INFO"
        timestamp: 1730325388831
        logger:
          line: "152"
          name: "ngc_injector.py"
      message: "Valid profile: e45b4b991bbc51d0df3ce53e87060fc3a7f76555406ed534a8479c6faa706987 (tensorrt_llm-a10g-bf16-tp4-latency) on GPUs [0, 1, 2, 3]"
      status: "info"
      tags:
        - "source:LOGS_SOURCE"
      timestamp: 1730325388831
  # nvidia_nim_logger rule, WARNING level (status remapped to "warn").
  - sample: |-
      WARNING 2024-10-30 21:58:27.670 arg_utils.py:775] Chunked prefill is enabled by default for models with max_model_len > 32K. Currently, chunked prefill might not work with some features or models. If you encounter any issues, please disable chunked prefill by setting --enable-chunked-prefill=False.
    result:
      custom:
        level: "WARNING"
        timestamp: 1730325507670
        logger:
          line: "775"
          name: "arg_utils.py"
      message: "Chunked prefill is enabled by default for models with max_model_len > 32K. Currently, chunked prefill might not work with some features or models. If you encounter any issues, please disable chunked prefill by setting --enable-chunked-prefill=False."
      status: "warn"
      tags:
        - "source:LOGS_SOURCE"
      timestamp: 1730325507670
20 changes: 20 additions & 0 deletions nvidia_nim/assets/saved_views/nim_errors.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"name": "NVIDIA NIM Errors",
"options": {
"columns": [
"host",
"service"
],
"message_display": "inline",
"show_date_column": true,
"show_message_column": true,
"show_timeline": true
},
"page": "stream",
"query": "source:nvidia_nim status:error",
"timerange": {
"interval_ms": 900000
},
"type": "logs",
"visible_facets": []
}
3 changes: 3 additions & 0 deletions nvidia_nim/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@
},
"monitors": {
"Average Request Latency is High": "assets/monitors/latency.json"
},
"saved_views": {
"NVIDIA NIM Errors": "assets/saved_views/nim_errors.json"
}
},
"author": {
Expand Down

0 comments on commit 33be8d2

Please sign in to comment.