From 63ea8deb3f01aaa26dab3cd7408fb6b663082341 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 8 Oct 2024 21:17:52 +0200 Subject: [PATCH 01/29] Add check for ROCM >= 6.2 --- make/config_checks.mk | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/make/config_checks.mk b/make/config_checks.mk index c84edd4a2..949e38330 100644 --- a/make/config_checks.mk +++ b/make/config_checks.mk @@ -84,7 +84,14 @@ INCLUDES += -I$(CUDAINCLUDE) -I$(CUPTIINCLUDE) endif ifeq ($(strip $(ROCM_INTERFACE)), true) +ROCM_SDK_CHECK := $(shell which rocprofv3 2>/dev/null | wc -l) +ifeq ($(strip $(ROCM_SDK_CHECK)),0) # HSA includes 'hsa/xxx.h' and rocprofiler 'xxx.h' DEFINES += -D__HIP_PLATFORM_AMD__ INCLUDES += -I$(HIPINCLUDE) -I$(HSAINCLUDE) -I$(HSAINCLUDE)/hsa -I$(ROCPROFILERINCLUDE) -I$(RSMIINCLUDE) +else +$(info Compile for ROCm >= 6.2) +DEFINES += -DLIKWID_ROCPROF_SDK +INCLUDES += -I$(ROCPROFILERINCLUDE) -I$(RSMIINCLUDE) +endif endif From 90b84be06666a7db28aa1564ebe6f7adfff6181f Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 8 Oct 2024 21:18:35 +0200 Subject: [PATCH 02/29] Split ROCM backends in 'v1' and 'sdk' --- src/includes/rocmon_v1.h | 64 + .../{rocmon_types.h => rocmon_v1_types.h} | 10 +- src/rocmon.c | 2225 ++-------------- src/rocmon_marker.c | 4 +- src/rocmon_v1.c | 2275 +++++++++++++++++ 5 files changed, 2512 insertions(+), 2066 deletions(-) create mode 100644 src/includes/rocmon_v1.h rename src/includes/{rocmon_types.h => rocmon_v1_types.h} (90%) create mode 100644 src/rocmon_v1.c diff --git a/src/includes/rocmon_v1.h b/src/includes/rocmon_v1.h new file mode 100644 index 000000000..0ea8b70e0 --- /dev/null +++ b/src/includes/rocmon_v1.h @@ -0,0 +1,64 @@ +/* + * ======================================================================================= + * + * Filename: rocmon_v1.h + * + * Description: Header File of rocmon module for ROCm < 6.2. 
+ * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com + * Project: likwid + * + * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see <http://www.gnu.org/licenses/>. + * + * ======================================================================================= + */ +#ifndef LIKWID_ROCMON_V1_H +#define LIKWID_ROCMON_V1_H + +int rocmon_v1_init(int numGpus, const int* gpuIds); +void rocmon_v1_finalize(void); +int rocmon_v1_addEventSet(const char* eventString, int* gid); +int rocmon_v1_setupCounters(int gid); +int rocmon_v1_startCounters(void); +int rocmon_v1_stopCounters(void); +int rocmon_v1_readCounters(void); +double rocmon_v1_getResult(int gpuIdx, int groupId, int eventId); +double rocmon_v1_getLastResult(int gpuIdx, int groupId, int eventId); +int rocmon_v1_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list); +void rocmon_v1_freeEventsOfGpu(EventList_rocm_t list); +int rocmon_v1_switchActiveGroup(int newGroupId); +int rocmon_v1_getNumberOfGroups(void); +int rocmon_v1_getIdOfActiveGroup(void); +int rocmon_v1_getNumberOfGPUs(void); +int rocmon_v1_getNumberOfEvents(int groupId); +int rocmon_v1_getNumberOfMetrics(int groupId); +double rocmon_v1_getTimeOfGroup(int groupId); +double rocmon_v1_getLastTimeOfGroup(int groupId); +double rocmon_v1_getTimeToLastReadOfGroup(int groupId); +char* rocmon_v1_getEventName(int groupId, int eventId); 
+char* rocmon_v1_getCounterName(int groupId, int eventId); +char* rocmon_v1_getMetricName(int groupId, int metricId); +char* rocmon_v1_getGroupName(int groupId); +char* rocmon_v1_getGroupInfoShort(int groupId); +char* rocmon_v1_getGroupInfoLong(int groupId); +int rocmon_v1_getGroups(char*** groups, char*** shortinfos, char*** longinfos); +int rocmon_v1_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos); + + +#endif /* LIKWID_ROCMON_V1_H */ + diff --git a/src/includes/rocmon_types.h b/src/includes/rocmon_v1_types.h similarity index 90% rename from src/includes/rocmon_types.h rename to src/includes/rocmon_v1_types.h index 7af2e1518..a126077de 100644 --- a/src/includes/rocmon_types.h +++ b/src/includes/rocmon_v1_types.h @@ -35,8 +35,16 @@ #include // #include #ifndef ROCPROFILER_VERSION_MAJOR -#include +#ifdef HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE +#undef HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE #endif +#include +#endif +#include +#if AMDSMI_LIB_VERSION_YEAR == 23 && AMDSMI_LIB_VERSION_MAJOR == 4 && AMDSMI_LIB_VERSION_MINOR == 0 && AMDSMI_LIB_VERSION_RELEASE == 0 +typedef struct metrics_table_header_t metrics_table_header_t; +#endif +#include #include typedef struct { diff --git a/src/rocmon.c b/src/rocmon.c index ba7bdf85b..7e552f968 100644 --- a/src/rocmon.c +++ b/src/rocmon.c @@ -45,1816 +45,108 @@ #include #include -#include -#include -#if AMDSMI_LIB_VERSION_YEAR == 23 && AMDSMI_LIB_VERSION_MAJOR == 4 && AMDSMI_LIB_VERSION_MINOR == 0 && AMDSMI_LIB_VERSION_RELEASE == 0 -typedef struct metrics_table_header_t metrics_table_header_t; -#endif -#include - -// #include -// #include -// #include - -// Variables -static void *dl_hsa_lib = NULL; -static void *dl_profiler_lib = NULL; -static void *dl_rsmi_lib = NULL; - -RocmonContext *rocmon_context = NULL; -static bool rocmon_initialized = FALSE; -int likwid_rocmon_verbosity = DEBUGLEV_ONLY_ERROR; - -// Macros -#define membersize(type, member) sizeof(((type *) NULL)->member) -#define 
FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } -#define ROCM_CALL( call, args, handleerror ) \ - do { \ - hsa_status_t _status = (*call##_ptr)args; \ - if (_status != HSA_STATUS_SUCCESS && _status != HSA_STATUS_INFO_BREAK) { \ - const char* err = NULL; \ - fprintf(stderr, "Error: function %s failed with error %d\n", #call, _status); \ - rocprofiler_error_string(&err); \ - fprintf(stderr, "Error: %s\n", err); \ - handleerror; \ - } \ - } while (0) - -#define RSMI_CALL( call, args, handleerror ) \ - do { \ - rsmi_status_t _status = (*call##_ptr)args; \ - if (_status != RSMI_STATUS_SUCCESS) { \ - fprintf(stderr, "Error: function %s failed with error %d.\n", #call, _status); \ - handleerror; \ - } \ - } while (0) - -// ROCm function declarations -#define ROCMWEAK __attribute__(( weak )) -#define DECLAREFUNC_HSA(funcname, funcsig) hsa_status_t ROCMWEAK funcname funcsig; hsa_status_t ( *funcname##_ptr ) funcsig; -#define DECLAREFUNC_SMI(funcname, funcsig) rsmi_status_t ROCMWEAK funcname funcsig; rsmi_status_t ( *funcname##_ptr ) funcsig; - -DECLAREFUNC_HSA(hsa_init, ()); -DECLAREFUNC_HSA(hsa_shut_down, ()); -DECLAREFUNC_HSA(hsa_iterate_agents, (hsa_status_t (*callback)(hsa_agent_t agent, void* data), void* data)); -DECLAREFUNC_HSA(hsa_agent_get_info, (hsa_agent_t agent, hsa_agent_info_t attribute, void* value)); -DECLAREFUNC_HSA(hsa_system_get_info, (hsa_system_info_t attribute, void *value)); - -DECLAREFUNC_HSA(rocprofiler_iterate_info, (const hsa_agent_t* agent, rocprofiler_info_kind_t kind, hsa_status_t (*callback)(const rocprofiler_info_data_t, void* data), void* data)); -DECLAREFUNC_HSA(rocprofiler_close, (rocprofiler_t* context)); -DECLAREFUNC_HSA(rocprofiler_open, (hsa_agent_t agent, rocprofiler_feature_t* features, uint32_t feature_count, rocprofiler_t** context, uint32_t mode, rocprofiler_properties_t* properties)); -DECLAREFUNC_HSA(rocprofiler_error_string, ()); -DECLAREFUNC_HSA(rocprofiler_start, (rocprofiler_t* context, uint32_t group_index)); 
-DECLAREFUNC_HSA(rocprofiler_stop, (rocprofiler_t* context, uint32_t group_index)); -DECLAREFUNC_HSA(rocprofiler_read, (rocprofiler_t* context, uint32_t group_index)); -DECLAREFUNC_HSA(rocprofiler_get_data, (rocprofiler_t* context, uint32_t group_index)); -DECLAREFUNC_HSA(rocprofiler_get_metrics, (const rocprofiler_t* context)); - -DECLAREFUNC_SMI(rsmi_init, (uint64_t flags)); -DECLAREFUNC_SMI(rsmi_shut_down, ()); -DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_open, (uint32_t dv_ind, rsmi_func_id_iter_handle_t* handle)); -DECLAREFUNC_SMI(rsmi_dev_supported_variant_iterator_open, (rsmi_func_id_iter_handle_t obj_h, rsmi_func_id_iter_handle_t* var_iter)); -DECLAREFUNC_SMI(rsmi_func_iter_value_get, (rsmi_func_id_iter_handle_t handle, rsmi_func_id_value_t* value )); -DECLAREFUNC_SMI(rsmi_func_iter_next, (rsmi_func_id_iter_handle_t handle)); -DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_close, (rsmi_func_id_iter_handle_t* handle)); -DECLAREFUNC_SMI(rsmi_dev_power_ave_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* power)); -DECLAREFUNC_SMI(rsmi_dev_pci_throughput_get, (uint32_t dv_ind, uint64_t* sent, uint64_t* received, uint64_t* max_pkt_sz)); -DECLAREFUNC_SMI(rsmi_dev_pci_replay_counter_get, (uint32_t dv_ind, uint64_t* counter)); -DECLAREFUNC_SMI(rsmi_dev_memory_total_get, (uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t* total)); -DECLAREFUNC_SMI(rsmi_dev_memory_usage_get, (uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t* used )); -DECLAREFUNC_SMI(rsmi_dev_memory_busy_percent_get, (uint32_t dv_ind, uint32_t* busy_percent)); -DECLAREFUNC_SMI(rsmi_dev_memory_reserved_pages_get, (uint32_t dv_ind, uint32_t* num_pages, rsmi_retired_page_record_t* records)); -DECLAREFUNC_SMI(rsmi_dev_fan_rpms_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); -DECLAREFUNC_SMI(rsmi_dev_fan_speed_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); -DECLAREFUNC_SMI(rsmi_dev_fan_speed_max_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* 
max_speed)); -DECLAREFUNC_SMI(rsmi_dev_temp_metric_get, (uint32_t dv_ind, uint32_t sensor_type, rsmi_temperature_metric_t metric, int64_t* temperature)); -DECLAREFUNC_SMI(rsmi_dev_volt_metric_get, (uint32_t dv_ind, rsmi_voltage_type_t sensor_type, rsmi_voltage_metric_t metric, int64_t* voltage)); -DECLAREFUNC_SMI(rsmi_dev_overdrive_level_get, (uint32_t dv_ind, uint32_t* od)); -DECLAREFUNC_SMI(rsmi_dev_ecc_count_get, (uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_error_count_t* ec)); -DECLAREFUNC_SMI(rsmi_compute_process_info_get, (rsmi_process_info_t* procs, uint32_t* num_items)); - - -// ---------------------------------------------------- -// SMI event wrapper -// ---------------------------------------------------- - -static int -_smi_wrapper_pci_throughput_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t value; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _smi_wrapper_pci_throughput_get(%d, %d), deviceId, event->extra); - // Internal variant: 0 for sent, 1 for received bytes and 2 for max packet size - if (event->extra == 0) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, &value, NULL, NULL), return -1); - else if (event->extra == 1) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, &value, NULL), return -1); - else if (event->extra == 2) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, NULL, &value), return -1); - else return -1; - - result->fullValue += value; - result->lastValue = value; - - return 0; -} - - -static int -_smi_wrapper_pci_replay_counter_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t counter; - RSMI_CALL(rsmi_dev_pci_replay_counter_get, (deviceId, &counter), return -1); - result->fullValue += counter; - result->lastValue = counter; - - return 0; -} - - -static int -_smi_wrapper_power_ave_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t power; - RSMI_CALL(rsmi_dev_power_ave_get, (deviceId, event->subvariant, &power), return -1); - 
result->fullValue += power; - result->lastValue = power; - - return 0; -} - - -static int -_smi_wrapper_memory_total_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t total; - RSMI_CALL(rsmi_dev_memory_total_get, (deviceId, event->variant, &total), return -1); - result->fullValue += total; - result->lastValue = total; - - return 0; -} - - -static int -_smi_wrapper_memory_usage_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t used; - RSMI_CALL(rsmi_dev_memory_usage_get, (deviceId, event->variant, &used), return -1); - result->fullValue += used; - result->lastValue = used; - - return 0; -} - - -static int -_smi_wrapper_memory_busy_percent_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t percent; - RSMI_CALL(rsmi_dev_memory_busy_percent_get, (deviceId, &percent), return -1); - result->fullValue += percent; - result->lastValue = percent; - - return 0; -} - - -static int -_smi_wrapper_memory_reserved_pages_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t num_pages; - RSMI_CALL(rsmi_dev_memory_reserved_pages_get, (deviceId, &num_pages, NULL), return -1); - result->fullValue += num_pages; - result->lastValue = num_pages; - - return 0; -} - - -static int -_smi_wrapper_fan_rpms_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t speed; - RSMI_CALL(rsmi_dev_fan_rpms_get, (deviceId, event->subvariant, &speed), return -1); - result->fullValue += speed; - result->lastValue = speed; - - return 0; -} - - -static int -_smi_wrapper_fan_speed_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t speed; - RSMI_CALL(rsmi_dev_fan_speed_get, (deviceId, event->subvariant, &speed), return -1); - result->fullValue += speed; - result->lastValue = speed; - - return 0; -} - - -static int -_smi_wrapper_fan_speed_max_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t max_speed; - 
RSMI_CALL(rsmi_dev_fan_speed_max_get, (deviceId, event->subvariant, &max_speed), return -1); - result->fullValue += max_speed; - result->lastValue = max_speed; - - return 0; -} - - -static int -_smi_wrapper_temp_metric_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t temperature; - RSMI_CALL(rsmi_dev_temp_metric_get, (deviceId, event->subvariant, event->variant, &temperature), return -1); - result->fullValue += temperature; - result->lastValue = temperature; - - return 0; -} - - -static int -_smi_wrapper_volt_metric_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t voltage; - RSMI_CALL(rsmi_dev_volt_metric_get, (deviceId, event->subvariant, event->variant, &voltage), return -1); - result->fullValue += voltage; - result->lastValue = voltage; - - return 0; -} - - -static int -_smi_wrapper_overdrive_level_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t overdrive; - RSMI_CALL(rsmi_dev_overdrive_level_get, (deviceId, &overdrive), return -1); - result->fullValue += overdrive; - result->lastValue = overdrive; - - return 0; -} - - -static int -_smi_wrapper_ecc_count_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - rsmi_error_count_t error_count; - RSMI_CALL(rsmi_dev_ecc_count_get, (deviceId, event->variant, &error_count), return -1); - - if (event->extra == 0) - { - result->lastValue = error_count.correctable_err - result->fullValue; - result->fullValue = error_count.correctable_err; - } - else if (event->extra == 1) - { - result->lastValue = error_count.uncorrectable_err - result->fullValue; - result->fullValue = error_count.uncorrectable_err; - } - else - { - return -1; - } - - return 0; -} - - -static int -_smi_wrapper_compute_process_info_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t num_items; - RSMI_CALL(rsmi_compute_process_info_get, (NULL, &num_items), return -1); - result->fullValue += num_items; - 
result->lastValue = num_items; - - return 0; -} - - -// ---------------------------------------------------- -// Rocmon helper functions -// ---------------------------------------------------- - -static int -_rocmon_link_libraries() -{ - #define DLSYM_AND_CHECK( dllib, name ) name##_ptr = dlsym( dllib, #name ); if ( dlerror() != NULL ) { ERROR_PRINT(Failed to link #name); return -1; } - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm libraries); - - // Need to link in the ROCm HSA libraries - dl_hsa_lib = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL); - if (!dl_hsa_lib) - { - ERROR_PRINT(ROCm HSA library libhsa-runtime64.so not found: %s, dlerror()); - return -1; - } - - // Need to link in the Rocprofiler libraries - dl_profiler_lib = dlopen("librocprofiler64.so", RTLD_NOW | RTLD_GLOBAL); - if (!dl_profiler_lib) - { - dl_profiler_lib = dlopen("librocprofiler64.so.1", RTLD_NOW | RTLD_GLOBAL); - if (!dl_profiler_lib) - { - ERROR_PRINT(Rocprofiler library librocprofiler64.so not found: %s, dlerror()); - return -1; - } - } - - // Need to link in the Rocprofiler libraries - dl_rsmi_lib = dlopen("librocm_smi64.so", RTLD_NOW | RTLD_GLOBAL); - if (!dl_rsmi_lib) - { - ERROR_PRINT(ROCm SMI library librocm_smi64.so not found: %s, dlerror()); - return -1; - } - - // Link HSA functions - DLSYM_AND_CHECK(dl_hsa_lib, hsa_init); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_shut_down); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_iterate_agents); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_agent_get_info); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_system_get_info); - - // Link Rocprofiler functions - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_iterate_info); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_close); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_open); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_error_string); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_start); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_stop); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_read); - 
DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_get_data); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_get_metrics); - - // Link SMI functions - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_init); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_shut_down); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_func_iterator_open); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_variant_iterator_open); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_func_iter_value_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_func_iter_next); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_func_iterator_close); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_power_ave_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_pci_throughput_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_pci_replay_counter_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_total_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_usage_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_busy_percent_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_reserved_pages_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_rpms_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_speed_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_speed_max_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_temp_metric_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_volt_metric_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_overdrive_level_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_ecc_count_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_compute_process_info_get); - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm libraries done); - return 0; -} - -typedef struct { - RocmonContext* context; - int numGpus; - const int* gpuIds; -} iterate_agents_cb_arg; - -typedef struct { - RocmonDevice* device; - int currIndex; -} iterate_info_cb_arg; - - -static hsa_status_t -_rocmon_iterate_info_callback_count(const rocprofiler_info_data_t info, void* data) -{ - RocmonDevice* device = (RocmonDevice*) data; - if (device) { - device->numRocMetrics++; - } - return HSA_STATUS_SUCCESS; -} - -static void 
-_rocmon_print_rocprofiler_info_data(const rocprofiler_info_data_t info) -{ - if (info.kind != ROCPROFILER_INFO_KIND_METRIC) - { - return; - } - printf("Name '%s':\n", info.metric.name); - printf("\tKind: '%s'\n", (info.kind == ROCPROFILER_INFO_KIND_METRIC ? "Metric" : "Trace")); - printf("\tInstances: %d\n", info.metric.instances); - printf("\tDescription: '%s'\n", info.metric.description); - printf("\tExpression: '%s'\n", info.metric.expr); - printf("\tBlockName: '%s'\n", info.metric.block_name); - printf("\tBlockCounters: %d\n", info.metric.block_counters); -} - -static hsa_status_t -_rocmon_iterate_info_callback_add(const rocprofiler_info_data_t info, void* data) -{ - iterate_info_cb_arg* arg = (iterate_info_cb_arg*) data; - - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _rocmon_iterate_info_callback_add); - if (likwid_rocmon_verbosity == DEBUGLEV_DEVELOP) - { - _rocmon_print_rocprofiler_info_data(info); - } - // Check info kind - if (info.kind != ROCPROFILER_INFO_KIND_METRIC) - { - ERROR_PRINT(Wrong info kind %u, info.kind); - return HSA_STATUS_ERROR; - } - - // Check index - if (arg->currIndex >= arg->device->numRocMetrics) - { - ERROR_PRINT(Metric index out of bounds: %d, arg->currIndex); - return HSA_STATUS_ERROR; - } - - // Copy info data - rocprofiler_info_data_t* target_info = &arg->device->rocMetrics[arg->currIndex]; - memcpy(target_info, &info, sizeof(rocprofiler_info_data_t)); - arg->currIndex++; - - return HSA_STATUS_SUCCESS; -} - - -static hsa_status_t -_rocmon_iterate_agents_callback(hsa_agent_t agent, void* argv) -{ - // Count number of callback invocations as the devices id - static int nextDeviceId = 0; - int deviceId = nextDeviceId; - bool noAgent = false; - - iterate_agents_cb_arg *arg = (iterate_agents_cb_arg*) argv; - - // Check if device is a GPU - hsa_device_type_t type; - ROCM_CALL(hsa_agent_get_info, (agent, HSA_AGENT_INFO_DEVICE, &type), return -1); - if (type != HSA_DEVICE_TYPE_GPU) - { - return HSA_STATUS_SUCCESS; - } - nextDeviceId++; - - // 
Check if device is includes in arg->gpuIds - int gpuIndex = -1; - for (int i = 0; i < arg->numGpus; i++) - { - if (deviceId == arg->gpuIds[i]) - { - gpuIndex = i; - break; - } - } - if (gpuIndex < 0) - { - return HSA_STATUS_SUCCESS; - } - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing agent %d, gpuIndex); - - // Add agent to context - RocmonDevice *device = &arg->context->devices[gpuIndex]; - device->deviceId = deviceId; - device->hsa_agent = agent; - device->context = NULL; - device->numActiveRocEvents = 0; - device->activeRocEvents = NULL; - device->numGroupResults = 0; - device->groupResults = NULL; - - // Get number of available metrics - device->numRocMetrics = 0; - ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_count, device), return HSA_STATUS_ERROR); - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, RocProfiler provides %d events, device->numRocMetrics); - - // workaround for bug in ROCm 5.4.0 - if(device->numRocMetrics == 0) { - ROCM_CALL(rocprofiler_iterate_info, (NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_count, device), return HSA_STATUS_ERROR); - noAgent = true; - } - - // Allocate memory for metrics - device->rocMetrics = (rocprofiler_info_data_t*) malloc(device->numRocMetrics * sizeof(rocprofiler_info_data_t)); - if (device->rocMetrics == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate set of rocMetrics); - return HSA_STATUS_ERROR; - } - - // Initialize SMI events map - if (init_map(&device->smiMetrics, MAP_KEY_TYPE_STR, 0, &free) < 0) - { - ERROR_PLAIN_PRINT(Cannot init smiMetrics map); - return HSA_STATUS_ERROR; - } - - // Fetch metric informatino - iterate_info_cb_arg info_arg = { - .device = device, - .currIndex = 0, - }; - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, Read %d RocProfiler events for device %d, device->numRocMetrics, device->deviceId); - - // If the call fails with agent, call rocprofiler_iterate_info without agent - if(noAgent) - { - ROCM_CALL(rocprofiler_iterate_info, 
(NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_add, &info_arg), return HSA_STATUS_ERROR); - } else { - ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_add, &info_arg), return HSA_STATUS_ERROR); - } - - return HSA_STATUS_SUCCESS; -} - - -static int -_rocmon_parse_eventstring(const char* eventString, GroupInfo* group) -{ - int err = 0; - Configuration_t config = get_configuration(); - bstring eventBString = bfromcstr(eventString); - - if (bstrchrp(eventBString, ':', 0) != BSTR_ERR) - { - // If custom group -> perfgroup_customGroup - err = perfgroup_customGroup(eventString, group); - if (err < 0) - { - ERROR_PRINT(Cannot transform %s to performance group, eventString); - return err; - } - } - else - { - // If performance group -> perfgroup_readGroup - err = perfgroup_readGroup(config->groupPath, "amd_gpu", eventString, group); - if (err == -EACCES) - { - ERROR_PRINT(Access to performance group %s not allowed, eventString); - return err; - } - else if (err == -ENODEV) - { - ERROR_PRINT(Performance group %s only available with deactivated HyperThreading, eventString); - return err; - } - if (err < 0) - { - ERROR_PRINT(Cannot read performance group %s, eventString); - return err; - } - } - - return 0; -} - - -static int -_rocmon_get_timestamp(uint64_t* timestamp_ns) -{ - uint64_t timestamp; - - // Get timestamp from system - ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP, &timestamp), return -1); - // Convert to nanoseconds - *timestamp_ns = (uint64_t)((long double)timestamp * rocmon_context->hsa_timestamp_factor); - - return 0; -} - - -static int -_rocmon_getLastResult(RocmonDevice* device, int eventId, double* value) -{ - rocprofiler_data_t* data = &device->activeRocEvents[eventId].data; - - switch (data->kind) - { - case ROCPROFILER_DATA_KIND_INT32: - *value = (double) data->result_int32; - break; - case ROCPROFILER_DATA_KIND_INT64: - *value = (double) data->result_int64; - break; - 
case ROCPROFILER_DATA_KIND_FLOAT: - *value = (double) data->result_float; - break; - case ROCPROFILER_DATA_KIND_DOUBLE: - *value = data->result_double; - break; - - case ROCPROFILER_DATA_KIND_BYTES: - case ROCPROFILER_DATA_KIND_UNINIT: - default: - return -1; - } - - return 0; -} - - -static int -_rocmon_readCounters_rocprofiler(RocmonDevice* device) -{ - int ret; - - // Check if there are any counters to start - if (device->numActiveRocEvents <= 0) - { - return 0; - } - - if (!device->context) - { - return 0; - } - - ROCM_CALL(rocprofiler_read, (device->context, 0), return -1); - ROCM_CALL(rocprofiler_get_data, (device->context, 0), return -1); - ROCM_CALL(rocprofiler_get_metrics, (device->context), return -1); - - // Update results - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveRocEvents; i++) - { - RocmonEventResult* result = &groupResult->results[i]; - - // Read value - ret = _rocmon_getLastResult(device, i, &result->fullValue); - if (ret < 0) - { - return -1; - } - - // Calculate delta since last read - result->lastValue = result->fullValue - result->lastValue; - } - - return 0; -} - - -static int -_rocmon_readCounters_smi(RocmonDevice* device) -{ - // Check if there are any counters to start - if (device->numActiveSmiEvents <= 0) - { - return 0; - } - - // Save baseline values - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveSmiEvents; i++) - { - double value = 0; - RocmonSmiEvent* event = &device->activeSmiEvents[i]; - RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; - - // Measure counter - if (event->measureFunc) - { - event->measureFunc(device->deviceId, event, result); - } - } - - return 0; -} - - -static int -_rocmon_readCounters(uint64_t* (*getDestTimestampFunc)(RocmonDevice* device)) -{ - int ret; - - // Get timestamp - uint64_t timestamp; - if (ret = 
_rocmon_get_timestamp(&timestamp)) - { - return ret; - } - - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - - // Save timestamp - if (getDestTimestampFunc) - { - uint64_t* timestampDest = getDestTimestampFunc(device); - if (timestampDest) - { - *timestampDest = timestamp; - } - } - - // Read rocprofiler counters - ret = _rocmon_readCounters_rocprofiler(device); - if (ret < 0) return ret; - - // Read SMI counters - ret = _rocmon_readCounters_smi(device); - if (ret < 0) return ret; - } - - return 0; -} - - -static uint64_t* -_rocmon_get_read_time(RocmonDevice* device) -{ - return &device->time.read; -} - - -static uint64_t* -_rocmon_get_stop_time(RocmonDevice* device) -{ - return &device->time.stop; -} - - -// ---------------------------------------------------- -// Rocmon SMI helper functions -// ---------------------------------------------------- - -static bstring -_rocmon_smi_build_label(RocmonSmiEventType type, const char* funcname, uint64_t variant, uint64_t subvariant) -{ - switch (type) - { - case ROCMON_SMI_EVENT_TYPE_NORMAL: - return bfromcstr(funcname); - case ROCMON_SMI_EVENT_TYPE_VARIANT: - return bformat("%s|%" PRIu64, funcname, variant); - case ROCMON_SMI_EVENT_TYPE_SUBVARIANT: - return bformat("%s|%" PRIu64 "|%" PRIu64, funcname, variant, subvariant); - case ROCMON_SMI_EVENT_TYPE_INSTANCES: - return bfromcstr(funcname); - } -} - - -static int -_rocmon_smi_add_event_to_device(RocmonDevice* device, const char* funcname, RocmonSmiEventType type, int64_t variant, uint64_t subvariant) -{ - int ret; - - // Get event by label - RocmonSmiEventList* list = NULL; - bstring label = _rocmon_smi_build_label(type, funcname, variant, subvariant); - ret = get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list); - bdestroy(label); - if (ret < 0) - { - // Event not registered -> ignore - return 0; - } - - // For events with multiple sensor, only make one entry -> find if one exists - if (type 
== ROCMON_SMI_EVENT_TYPE_INSTANCES && subvariant > 0) - { - // Get list from map - for (int i = 0; i < list->numEntries; i++) - { - RocmonSmiEvent* event = &list->entries[i]; - RocmonSmiEvent* existingEvent = NULL; - ret = get_smap_by_key(device->smiMetrics, event->name, (void**)&existingEvent); - if (ret < 0) - { - ERROR_PRINT(Failed to find previous instance for event %s, event->name); - return -1; - } - - // Update instance information - existingEvent->instances++; - } - return 0; - } - - for (int i = 0; i < list->numEntries; i++) - { - RocmonSmiEvent* event = &list->entries[i]; - - // Allocate memory for device event description - RocmonSmiEvent* tmpEvent = (RocmonSmiEvent*) malloc(sizeof(RocmonSmiEvent)); - if (tmpEvent == NULL) - { - ERROR_PRINT(Failed to allocate memory for SMI event in device list %s, event->name); - return -ENOMEM; - } - - // Copy information from global description - memcpy(tmpEvent, event, sizeof(RocmonSmiEvent)); - tmpEvent->variant = variant; - tmpEvent->subvariant = subvariant; - tmpEvent->instances = 1; - - // Save event info to device event map - add_smap(device->smiMetrics, tmpEvent->name, tmpEvent); - } - - return 0; -} - - -static int -_rocmon_smi_get_function_subvariants(RocmonDevice* device, const char* funcname, uint64_t variant, rsmi_func_id_iter_handle_t var_iter) -{ - rsmi_func_id_iter_handle_t sub_var_iter; - rsmi_func_id_value_t value; - rsmi_status_t status; - int ret; - - // Get open subvariants iterator - status = (*rsmi_dev_supported_variant_iterator_open_ptr)(var_iter, &sub_var_iter); - if (status == RSMI_STATUS_NO_DATA) - { - // No subvariants - ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_VARIANT, variant, 0); - if (ret < 0) return -1; - return 0; - } - - // Subvariants available -> iterate them - do { - // Get subvariant information - (*rsmi_func_iter_value_get_ptr)(sub_var_iter, &value); - - // Process info - if (variant == RSMI_DEFAULT_VARIANT) - ret = 
_rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_INSTANCES, variant, value.id); - else - ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, variant, value.id); - if (ret < 0) return ret; - - // Advance iterator - status = (*rsmi_func_iter_next_ptr)(sub_var_iter); - } while (status != RSMI_STATUS_NO_DATA); - - // Close iterator - (*rsmi_dev_supported_func_iterator_close_ptr)(&sub_var_iter); - - return 0; -} - - -static int -_rocmon_smi_get_function_variants(RocmonDevice* device, const char* funcname, rsmi_func_id_iter_handle_t iter_handle) -{ - rsmi_func_id_iter_handle_t var_iter; - rsmi_func_id_value_t value; - rsmi_status_t status; - int ret; - - // Get open variants iterator - status = (*rsmi_dev_supported_variant_iterator_open_ptr)(iter_handle, &var_iter); - if (status == RSMI_STATUS_NO_DATA) - { - // No variants - ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0); - if (ret < 0) return -1; - return 0; - } - - // Variants available -> iterate them - do { - // Get variant information - (*rsmi_func_iter_value_get_ptr)(var_iter, &value); - - // Get function subvariants - ret = _rocmon_smi_get_function_subvariants(device, funcname, value.id, var_iter); - if (ret < 0) return -1; - - // Advance iterator - status = (*rsmi_func_iter_next_ptr)(var_iter); - } while (status != RSMI_STATUS_NO_DATA); - - // Close iterator - (*rsmi_dev_supported_func_iterator_close_ptr)(&var_iter); - - return 0; -} - - -static int -_rocmon_smi_get_functions(RocmonDevice* device) -{ - rsmi_func_id_iter_handle_t iter_handle; - rsmi_func_id_value_t value; - rsmi_status_t status; - int ret; - - // Open iterator - //(*rsmi_dev_supported_func_iterator_open_ptr)(device->deviceId, &iter_handle); - RSMI_CALL(rsmi_dev_supported_func_iterator_open, (device->deviceId, &iter_handle), { - return -1; - }); - - do - { - // Get function information - //(*rsmi_func_iter_value_get_ptr)(iter_handle, &value); - 
RSMI_CALL(rsmi_func_iter_value_get, (iter_handle, &value), { - ERROR_PRINT(Failed to get smi function value for device %d, device->deviceId); - RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); - return -1; - }); - - // Get function variants - ret = _rocmon_smi_get_function_variants(device, value.name, iter_handle); - if (ret < 0) - { - ERROR_PRINT(Failed to get smi function variants for device %d, device->deviceId); - RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); - return -1; - } - - // Advance iterator (cannot use RSMI_CALL macro here because we have an assignment, - // so we check that the function pointer exists to avoid segfaults.) - if (rsmi_func_iter_next_ptr) { - status = (*rsmi_func_iter_next_ptr)(iter_handle); - } - } while (status != RSMI_STATUS_NO_DATA); - - // Close iterator - //(*rsmi_dev_supported_func_iterator_close_ptr)(&iter_handle); - RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); - - // Add device independent functions - ret = _rocmon_smi_add_event_to_device(device, "rsmi_compute_process_info_get", ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0); - if (ret < 0) return -1; - - return 0; -} - -#define ADD_SMI_EVENT(name, type, smifunc, variant, subvariant, extra, measurefunc) if (_rocmon_smi_add_event_to_map(name, type, smifunc, variant, subvariant, extra, measurefunc) < 0) { return -1; } -#define ADD_SMI_EVENT_N(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_NORMAL, smifunc, 0, 0, extra, measurefunc) -#define ADD_SMI_EVENT_V(name, smifunc, variant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_VARIANT, smifunc, variant, 0, extra, measurefunc) -#define ADD_SMI_EVENT_S(name, smifunc, variant, subvariant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, smifunc, variant, subvariant, extra, measurefunc) -#define ADD_SMI_EVENT_I(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_INSTANCES, smifunc, 0, 0, 
extra, measurefunc) - -static int -_rocmon_smi_add_event_to_map(char* name, RocmonSmiEventType type, char* smifunc, uint64_t variant, uint64_t subvariant, uint64_t extra, RocmonSmiMeasureFunc measureFunc) -{ - // Add new event list to map (if not already present) - bstring label = _rocmon_smi_build_label(type, smifunc, variant, subvariant); - RocmonSmiEventList* list; - if (get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list) < 0) - { - // Allocate memory for event list - list = (RocmonSmiEventList*) malloc(sizeof(RocmonSmiEventList)); - if (list == NULL) - { - ERROR_PRINT(Failed to allocate memory for SMI event list %s, name); - return -ENOMEM; - } - list->entries = NULL; - list->numEntries = 0; - - add_smap(rocmon_context->smiEvents, bdata(label), list); - } - bdestroy(label); - - // Allocate memory for another event in list - list->numEntries++; - list->entries = (RocmonSmiEvent*) realloc(list->entries, list->numEntries * sizeof(RocmonSmiEvent)); - if (list->entries == NULL) - { - ERROR_PRINT(Failed to allocate memory for SMI event %s, name); - return -ENOMEM; - } - - // Set event properties - RocmonSmiEvent* event = &list->entries[list->numEntries-1]; - strncpy(event->name, name, sizeof(event->name)); - event->name[sizeof(event->name)] = '\0'; - event->type = type; - event->variant = variant; - event->subvariant = subvariant; - event->extra = extra; - event->instances = 0; // gets set when scanning supported device functions - event->measureFunc = measureFunc; - - return 0; -} - - -static void -_rcomon_smi_free_event_list(void* vlist) -{ - RocmonSmiEventList* list = (RocmonSmiEventList*)vlist; - if (list) - { - FREE_IF_NOT_NULL(list->entries); - free(list); - } -} - - -static int -_rocmon_smi_init_events() -{ - int ret; - - // Init map - ret = init_map(&rocmon_context->smiEvents, MAP_KEY_TYPE_STR, 0, &_rcomon_smi_free_event_list); - if (ret < 0) - { - ERROR_PRINT(Failed to create map for ROCm SMI events); - return -1; - } - - // Add events - 
ADD_SMI_EVENT_N("PCI_THROUGHPUT_SENT", "rsmi_dev_pci_throughput_get", 0, &_smi_wrapper_pci_throughput_get ); - ADD_SMI_EVENT_N("PCI_THROUGHPUT_RECEIVED", "rsmi_dev_pci_throughput_get", 1, &_smi_wrapper_pci_throughput_get ); - ADD_SMI_EVENT_N("PCI_THROUGHPUT_MAX_PKT_SZ", "rsmi_dev_pci_throughput_get", 2, &_smi_wrapper_pci_throughput_get ); - ADD_SMI_EVENT_N("PCI_REPLAY_COUNTER", "rsmi_dev_pci_replay_counter_get", 0, &_smi_wrapper_pci_replay_counter_get ); - ADD_SMI_EVENT_I("POWER_AVE", "rsmi_dev_power_ave_get", 0, &_smi_wrapper_power_ave_get ); - ADD_SMI_EVENT_V("MEMORY_TOTAL_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VRAM, 0, &_smi_wrapper_memory_total_get ); - ADD_SMI_EVENT_V("MEMORY_TOTAL_VIS_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_total_get ); - ADD_SMI_EVENT_V("MEMORY_TOTAL_GTT", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_total_get ); - ADD_SMI_EVENT_V("MEMORY_USAGE_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VRAM, 0, &_smi_wrapper_memory_usage_get ); - ADD_SMI_EVENT_V("MEMORY_USAGE_VIS_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_usage_get ); - ADD_SMI_EVENT_V("MEMORY_USAGE_GTT", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_usage_get ); - ADD_SMI_EVENT_N("MEMORY_BUSY_PERCENT", "rsmi_dev_memory_busy_percent_get", 0, &_smi_wrapper_memory_busy_percent_get ); - ADD_SMI_EVENT_N("MEMORY_NUM_RESERVED_PAGES", "rsmi_dev_memory_reserved_pages_get", 0, &_smi_wrapper_memory_reserved_pages_get ); - ADD_SMI_EVENT_I("FAN_RPMS", "rsmi_dev_fan_rpms_get", 0, &_smi_wrapper_fan_rpms_get ); - ADD_SMI_EVENT_I("FAN_SPEED", "rsmi_dev_fan_speed_get", 0, &_smi_wrapper_fan_speed_get ); - ADD_SMI_EVENT_I("FAN_SPEED_MAX", "rsmi_dev_fan_speed_max_get", 0, &_smi_wrapper_fan_speed_max_get ); - ADD_SMI_EVENT_S("TEMP_EDGE", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_EDGE, 0, &_smi_wrapper_temp_metric_get ); - 
ADD_SMI_EVENT_S("TEMP_JUNCTION", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_JUNCTION, 0, &_smi_wrapper_temp_metric_get ); - ADD_SMI_EVENT_S("TEMP_MEMORY", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_MEMORY, 0, &_smi_wrapper_temp_metric_get ); - ADD_SMI_EVENT_S("VOLT_VDDGFX", "rsmi_dev_volt_metric_get", RSMI_VOLT_CURRENT, RSMI_VOLT_TYPE_VDDGFX, 0, &_smi_wrapper_volt_metric_get ); - ADD_SMI_EVENT_N("OVERDRIVE_LEVEL", "rsmi_dev_overdrive_level_get", 0, &_smi_wrapper_overdrive_level_get ); - ADD_SMI_EVENT_V("ECC_COUNT_UMC_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_UMC_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SDMA_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SDMA_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_GFX_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_GFX_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 0, &_smi_wrapper_ecc_count_get ); - 
ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_HDP_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_HDP_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_DF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_DF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SMN_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SMN_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SEM_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SEM_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP0_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP0_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP1_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP1_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_FUSE_CORRECTABLE", 
"rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_FUSE_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_LAST_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_LAST_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_N("PROCS_USING_GPU", "rsmi_compute_process_info_get", 0, &_smi_wrapper_compute_process_info_get ); - - return 0; -} - - -int -rocmon_init(int numGpus, const int* gpuIds) -{ - hsa_status_t status; - - // check if already initialized - if (rocmon_initialized) - { - return 0; - } - if (rocmon_context != NULL) - { - return -EEXIST; - } - - // Validate arguments - if (numGpus <= 0) - { - ERROR_PRINT(Number of gpus must be greater than 0 but only %d given, numGpus); - return -EINVAL; - } - - // Initialize other parts - init_configuration(); - - // initialize libraries - int ret = _rocmon_link_libraries(); - if (ret < 0) - { - ERROR_PLAIN_PRINT(Failed to initialize libraries); - return ret; - } - - // Allocate memory for context - rocmon_context = (RocmonContext*) malloc(sizeof(RocmonContext)); - if (rocmon_context == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate Rocmon context); - return -ENOMEM; - } - rocmon_context->groups = NULL; - rocmon_context->numGroups = 0; - rocmon_context->numActiveGroups = 0; - - rocmon_context->devices = (RocmonDevice*) malloc(numGpus * sizeof(RocmonDevice)); - rocmon_context->numDevices = numGpus; - if (rocmon_context->devices == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate set of GPUs); - free(rocmon_context); - rocmon_context = NULL; - return -ENOMEM; - } - - // init hsa library - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing HSA); - ROCM_CALL(hsa_init, (), - { - ERROR_PLAIN_PRINT(Failed to init hsa library); - goto rocmon_init_hsa_failed; - }); - - // 
init rocm smi library - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RSMI); - RSMI_CALL(rsmi_init, (0), - { - ERROR_PLAIN_PRINT(Failed to init rocm_smi); - goto rocmon_init_rsmi_failed; - }); - - // Get hsa timestamp factor - uint64_t frequency_hz; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Getting HSA timestamp factor); - ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &frequency_hz), - { - ERROR_PLAIN_PRINT(Failed to get HSA timestamp factor); - goto rocmon_init_info_agents_failed; - }); - rocmon_context->hsa_timestamp_factor = (long double)1000000000 / (long double)frequency_hz; - - // initialize structures for specified devices (fetch ROCm specific info) - iterate_agents_cb_arg arg = { - .context = rocmon_context, - .numGpus = numGpus, - .gpuIds = gpuIds, - }; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Iterating through %d available agents, numGpus); - ROCM_CALL(hsa_iterate_agents, (_rocmon_iterate_agents_callback, &arg), - { - ERROR_PRINT(Error while iterating through available agents); - goto rocmon_init_info_agents_failed; - }); - - // Get available SMI events for devices - _rocmon_smi_init_events(); - for (int i = 0; i < rocmon_context->numDevices; i++) - { - if (_rocmon_smi_get_functions(&rocmon_context->devices[i]) < 0) - { - ERROR_PRINT(Failed to get SMI functions for device %d, rocmon_context->devices[i].deviceId); - goto rocmon_init_info_agents_failed; - } - } - - rocmon_initialized = TRUE; - return 0; -rocmon_init_info_agents_failed: - RSMI_CALL(rsmi_shut_down, (), { - // fall through - }); -rocmon_init_rsmi_failed: - ROCM_CALL(hsa_shut_down, (), { - // fall through - }); -rocmon_init_hsa_failed: - free(rocmon_context->devices); - free(rocmon_context); - rocmon_context = NULL; - return -1; -} - - -void -rocmon_finalize(void) -{ - RocmonContext* context = rocmon_context; - - if (!rocmon_initialized) - { - return; - } - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Finalize LIKWID ROCMON); - - if (context) - { - if (context->devices) - { - // 
Free each devices fields - for (int i = 0; i < context->numDevices; i++) - { - RocmonDevice* device = &context->devices[i]; - FREE_IF_NOT_NULL(device->rocMetrics); - FREE_IF_NOT_NULL(device->activeRocEvents); - FREE_IF_NOT_NULL(device->activeSmiEvents); - if (device->groupResults) - { - // Free events of event result lists - for (int j = 0; j < device->numGroupResults; j++) - { - FREE_IF_NOT_NULL(device->groupResults[i].results); - } - // Free list - free(device->groupResults); - } - if (device->context) - { - ROCM_CALL(rocprofiler_close, (device->context),); - } - destroy_smap(device->smiMetrics); - } - - free(context->devices); - context->devices = NULL; - } - - FREE_IF_NOT_NULL(context->groups); - destroy_smap(context->smiEvents); - - free(context); - context = NULL; - } - - RSMI_CALL(rsmi_shut_down, (), { - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown SMI); - // fall through - }); - ROCM_CALL(hsa_shut_down, (), { - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown HSA); - // fall through - }); -} - - -int -rocmon_addEventSet(const char* eventString, int* gid) -{ - // Check arguments - if (!eventString) - { - return -EINVAL; - } - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Allocate memory for event group if necessary - if (rocmon_context->numActiveGroups == rocmon_context->numGroups) - { - GroupInfo* tmpInfo = (GroupInfo*) realloc(rocmon_context->groups, (rocmon_context->numGroups+1) * sizeof(GroupInfo)); - if (tmpInfo == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate additional group); - return -ENOMEM; - } - rocmon_context->groups = tmpInfo; - rocmon_context->numGroups++; - } - - // Parse event string - int err = _rocmon_parse_eventstring(eventString, &rocmon_context->groups[rocmon_context->numActiveGroups]); - if (err < 0) - { - return err; - } - - // Allocate memory for event results - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - - // Allocate 
memory for event results - int numEvents = rocmon_context->groups[rocmon_context->numActiveGroups].nevents; - RocmonEventResult* tmpResults = (RocmonEventResult*) malloc(numEvents * sizeof(RocmonEventResult)); - if (tmpResults == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate event results); - return -ENOMEM; - } - - // Allocate memory for new event result list entry - RocmonEventResultList* tmpGroupResults = (RocmonEventResultList*) realloc(device->groupResults, (device->numGroupResults+1) * sizeof(RocmonEventResultList)); - if (tmpGroupResults == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate new event group result list); - return -ENOMEM; - } - - device->groupResults = tmpGroupResults; - device->groupResults[device->numGroupResults].results = tmpResults; - device->groupResults[device->numGroupResults].numResults = numEvents; - device->numGroupResults++; - } - - *gid = rocmon_context->numActiveGroups; - rocmon_context->numActiveGroups++; - return 0; -} - - -static int -_rocmon_setupCounters_rocprofiler(RocmonDevice* device, const char** events, int numEvents) -{ - // Close previous rocprofiler context - if (device->context) - { - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Closing previous rocprofiler context); - ROCM_CALL(rocprofiler_close, (device->context), return -1); - } - - // Look if the are any events - if (numEvents <= 0) - { - return 0; - } - - // Create feature array to monitor - rocprofiler_feature_t* features = (rocprofiler_feature_t*) malloc(numEvents * sizeof(rocprofiler_feature_t)); - if (features == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate feature list); - return -ENOMEM; - } - for (int i = 0; i < numEvents; i++) - { - features[i].kind = ROCPROFILER_FEATURE_KIND_METRIC; - features[i].name = events[i]; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP EVENT %d %s, i, events[i]); - } - - // Free previous feature array if present - FREE_IF_NOT_NULL(device->activeRocEvents); - - device->numActiveRocEvents = numEvents; - device->activeRocEvents = features; - - 
// Open context - rocprofiler_properties_t properties = {}; - properties.queue_depth = 128; - uint32_t mode = ROCPROFILER_MODE_STANDALONE | ROCPROFILER_MODE_CREATEQUEUE | ROCPROFILER_MODE_SINGLEGROUP; +#ifdef HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE +#undef HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE +#endif +#ifndef LIKWID_ROCPROF_SDK +#include +#include +#else +#include +#include +#endif - // Important: only a single profiling group is supported at this time which limits the number of events that can be monitored at a time. - ROCM_CALL(rocprofiler_open, (device->hsa_agent, device->activeRocEvents, device->numActiveRocEvents, &device->context, mode, &properties), return -1); +#include - return 0; -} -static int -_rocmon_setupCounters_smi(RocmonDevice* device, const char** events, int numEvents) +void +rocmon_finalize(void) { - int ret; - const int instanceNumLen = 5; - - // Delete previous events - if (device->activeSmiEvents) - { - device->activeSmiEvents = NULL; - device->numActiveSmiEvents = 0; - } - - // Look if the are any events - if (numEvents <= 0) - { - return 0; - } - - // Create event array - RocmonSmiEvent* activeEvents = (RocmonSmiEvent*) malloc(numEvents * sizeof(RocmonSmiEvent)); - if (activeEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate active event list); - return -ENOMEM; - } - - for (int i = 0; i < numEvents; i++) - { - char eventName[membersize(RocmonSmiEvent, name)]; - int instance = -1; - - // Parse event name -> normal event vs one with multiple instances (EVENT[0]) - const char* event = events[i]; - char* instancePart = strrchr(event, '['); - if (instancePart != NULL) - { - char withoutBrackets[instanceNumLen+1]; // +1 is '\0' - int partlen = strlen(instancePart); - - // Check if number fit in 'withoutBrackets' - if (partlen - 2 > instanceNumLen) - { - ERROR_PRINT(Instance number in '%s' is too large, event); - free(activeEvents); - return -EINVAL; - } - - // Copy instance number without brackets - strncpy(withoutBrackets, 
instancePart+1, partlen-2); - withoutBrackets[instanceNumLen] = '\0'; - - // Parse instance as number - char* endParsed; - instance = strtol(withoutBrackets, &endParsed, 10); - - // Check if parsing was successful - char* endOfString = &withoutBrackets[partlen-2]; - if (endParsed != endOfString) - { - ERROR_PRINT(Failed to parse instance number in '%s', event); - free(activeEvents); - return -EINVAL; - } - - // Copy event name without instance - int eventNameLen = instancePart - event; - strncpy(eventName, event, eventNameLen); - eventName[eventNameLen] = '\0'; - } - else - { - // Copy entire event name - strncpy(eventName, event, membersize(RocmonSmiEvent, name)); - } - - // Lookup event in available events - RocmonSmiEvent* metric = NULL; - ret = get_smap_by_key(device->smiMetrics, eventName, (void**)&metric); - if (ret < 0) - { - ERROR_PRINT(RSMI event '%s' not found for device %d, eventName, device->deviceId); - free(activeEvents); - return -EINVAL; - } - - // Copy event - RocmonSmiEvent* tmpEvent = &activeEvents[i]; - memcpy(tmpEvent, metric, sizeof(RocmonSmiEvent)); - - // Check if event supports instances - if (instance >= 0 && tmpEvent->type != ROCMON_SMI_EVENT_TYPE_INSTANCES) - { - ERROR_PRINT(Instance number given but event '%s' does not support one, eventName); - free(activeEvents); - return -EINVAL; - } - - // Check if event requires instances - if (instance < 0 && tmpEvent->type == ROCMON_SMI_EVENT_TYPE_INSTANCES) - { - ERROR_PRINT(No instance number given but event '%s' requires one, eventName); - free(activeEvents); - return -EINVAL; - } - - // Check if event has enough instances - if (instance >= 0 && instance >= metric->instances) - { - ERROR_PRINT(Instance %d seleced but event '%s' has only %d, instance, eventName, metric->instances); - free(activeEvents); - return -EINVAL; - } - - // Set instance number - if (instance >= 0) - { - tmpEvent->subvariant = instance; - } - } - - device->activeSmiEvents = activeEvents; - device->numActiveSmiEvents = 
numEvents; - - return 0; +#ifndef LIKWID_ROCPROF_SDK + rocmon_v1_finalize(); +#else + rocmon_sdk_finalize(); +#endif + return; } - int -rocmon_setupCounters(int gid) +rocmon_init(int numGpus, const int* gpuIds) { - int ret; - - // Check arguments - if (gid < 0 || gid >= rocmon_context->numActiveGroups) - { - return -EINVAL; - } - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Get group info - GroupInfo* group = &rocmon_context->groups[gid]; - - // - // Separate rocprofiler and SMI events - // - const char **smiEvents = NULL, **rocEvents = NULL; - int numSmiEvents = 0, numRocEvents = 0; - - // Allocate memory for string arrays - smiEvents = (const char**) malloc(group->nevents * sizeof(const char*)); - if (smiEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate smiEvent name array); - return -ENOMEM; - } - rocEvents = (const char**) malloc(group->nevents * sizeof(const char*)); - if (rocEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate rocEvent name array); - free(smiEvents); - return -ENOMEM; - } - - // Go through each event and sort it - for (int i = 0; i < group->nevents; i++) - { - const char* name = group->events[i]; - if (strncmp(name, "RSMI_", 5) == 0) - { - // RSMI event - smiEvents[numSmiEvents] = name + 5; // +5 removes 'RSMI_' prefix - numSmiEvents++; - } - else if (strncmp(name, "ROCP_", 5) == 0) - { - // Rocprofiler event - rocEvents[numRocEvents] = name + 5; // +5 removes 'ROCP_' prefix - numRocEvents++; - } - else - { - // Unknown event - ERROR_PRINT(Event '%s' has no prefix ('ROCP_' or 'RSMI_'), name); - return -EINVAL; - } - } - - // Add events to each device - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - - // Add rocprofiler events - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCPROFILER WITH %d events, numRocEvents); - ret = _rocmon_setupCounters_rocprofiler(device, rocEvents, numRocEvents); - if (ret < 0) - { - free(smiEvents); 
- free(rocEvents); - return ret; - } - - // Add SMI events - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCM SMI WITH %d events, numSmiEvents); - ret = _rocmon_setupCounters_smi(device, smiEvents, numSmiEvents); - if (ret < 0) - { - free(smiEvents); - free(rocEvents); - return ret; - } - } - rocmon_context->activeGroup = gid; - - // Cleanup - free(smiEvents); - free(rocEvents); - - return 0; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_init(numGpus, gpuIds); +#else + return rocmon_sdk_init(numGpus, gpuIds); +#endif } - -static int -_rocmon_startCounters_rocprofiler(RocmonDevice* device) +int +rocmon_addEventSet(const char* eventString, int* gid) { - // Check if there are any counters to start - if (device->numActiveRocEvents <= 0) - { - return 0; - } - - // Reset results - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveRocEvents; i++) - { - RocmonEventResult* result = &groupResult->results[i]; - result->lastValue = 0; - result->fullValue = 0; - } - - if (device->context) - { - ROCM_CALL(rocprofiler_start, (device->context, 0), return -1); - } - - return 0; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_addEventSet(eventString, gid); +#else + return rocmon_sdk_addEventSet(eventString, gid); +#endif } -static int -_rocmon_startCounters_smi(RocmonDevice* device) -{ - // Check if there are any counters to start - if (device->numActiveSmiEvents <= 0) - { - return 0; - } - - // Save baseline values - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveSmiEvents; i++) - { - double value = 0; - RocmonSmiEvent* event = &device->activeSmiEvents[i]; - RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; - - // Measure counter - if (event->measureFunc) - { - event->measureFunc(device->deviceId, event, result); - } - - // Save value - result->fullValue = 0; - } - return 0; +int 
+rocmon_setupCounters(int gid) +{ +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_setupCounters(gid); +#else + return rocmon_sdk_setupCounters(gid); +#endif } + int rocmon_startCounters(void) { - int ret; - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Get timestamp - uint64_t timestamp; - if (ret = _rocmon_get_timestamp(×tamp)) - { - return ret; - } - - // Start counters on each device - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - device->time.start = timestamp; - device->time.read = timestamp; - - // Start rocprofiler events - ret = _rocmon_startCounters_rocprofiler(device); - if (ret < 0) return ret; - - // Start SMI events - _rocmon_startCounters_smi(device); - if (ret < 0) return ret; - } - - return 0; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_startCounters(); +#else + return rocmon_sdk_startCounters(); +#endif } -static int -_rocmon_stopCounters_rocprofiler(RocmonDevice* device) -{ - if (device->context) - { - // Close context - ROCM_CALL(rocprofiler_stop, (device->context, 0), return -1); - } - - return 0; -} - int rocmon_stopCounters(void) { - int ret; - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Read counters - ret = _rocmon_readCounters(&_rocmon_get_stop_time); - if (ret < 0) return ret; - - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - - // Stop rocprofiler events - ret = _rocmon_stopCounters_rocprofiler(device); - if (ret < 0) return ret; - - // Nothing to stop for SMI events - } - - return 0; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_stopCounters(); +#else + return rocmon_sdk_stopCounters(); +#endif } int rocmon_readCounters(void) { - int ret; - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Read counters - ret = _rocmon_readCounters(&_rocmon_get_read_time); - if 
(ret < 0) return ret; - - return 0; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_readCounters(); +#else + return rocmon_sdk_readCounters(); +#endif } double rocmon_getResult(int gpuIdx, int groupId, int eventId) { - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Validate gpuIdx - if (gpuIdx < 0 || gpuIdx >= rocmon_context->numDevices) - { - return -EFAULT; - } - - // Validate groupId - RocmonDevice* device = &rocmon_context->devices[gpuIdx]; - if (groupId < 0 || groupId >= device->numGroupResults) - { - return -EFAULT; - } - - // Validate eventId - RocmonEventResultList* groupResult = &device->groupResults[groupId]; - if (eventId < 0 || eventId >= groupResult->numResults) - { - return -EFAULT; - } - - // Return result - return groupResult->results[eventId].fullValue; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getResult(gpuIdx, groupId, eventId); +#else + return rocmon_sdk_getResult(gpuIdx, groupId, eventId); +#endif } @@ -1862,413 +154,219 @@ rocmon_getResult(int gpuIdx, int groupId, int eventId) double rocmon_getLastResult(int gpuIdx, int groupId, int eventId) { - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Validate gpuIdx - if (gpuIdx < 0 || gpuIdx >= rocmon_context->numDevices) - { - return -EFAULT; - } - - // Validate groupId - RocmonDevice* device = &rocmon_context->devices[gpuIdx]; - if (groupId < 0 || groupId >= device->numGroupResults) - { - return -EFAULT; - } - - // Validate eventId - RocmonEventResultList* groupResult = &device->groupResults[groupId]; - if (eventId < 0 || eventId >= groupResult->numResults) - { - return -EFAULT; - } - - // Return result - return groupResult->results[eventId].lastValue; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getLastResult(gpuIdx, groupId, eventId); +#else + return rocmon_sdk_getLastResult(gpuIdx, groupId, eventId); +#endif } int rocmon_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list) { - // Ensure rocmon is 
initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Validate args - if (gpuIdx < 0 || gpuIdx > rocmon_context->numDevices) - { - return -EINVAL; - } - if (list == NULL) - { - return -EINVAL; - } - - RocmonDevice* device = &rocmon_context->devices[gpuIdx]; - - // Allocate list structure - EventList_rocm_t tmpList = (EventList_rocm_t) malloc(sizeof(EventList_rocm)); - if (tmpList == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate event list); - return -ENOMEM; - } - - // Get number of events - printf("NUmber of events %d + %d\n", device->numRocMetrics , get_map_size(device->smiMetrics)); - tmpList->numEvents = device->numRocMetrics + get_map_size(device->smiMetrics); - if (tmpList->numEvents == 0) - { - // No events -> return empty list - tmpList->events = NULL; - *list = tmpList; - return 0; - } - - // Allocate event array - tmpList->events = (Event_rocm_t*) malloc(tmpList->numEvents * sizeof(Event_rocm_t)); - if (tmpList->events == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate events for event list); - free(tmpList); - return -ENOMEM; - } - - // Copy rocprofiler event information - for (int i = 0; i < device->numRocMetrics; i++) - { - rocprofiler_info_data_t* event = &device->rocMetrics[i]; - Event_rocm_t* out = &tmpList->events[i]; - int len; - - // Copy name - printf("Name %s\n", event->metric.name); - len = strlen(event->metric.name) + 5 /* Prefix */ + 1 /* NULL byte */; - out->name = (char*) malloc(len); - if (out->name) - { - snprintf(out->name, len, "ROCP_%s", event->metric.name); - } - - // Copy description - len = strlen(event->metric.description) + 1 /* NULL byte */; - out->description = (char*) malloc(len); - if (out->description) - { - snprintf(out->description, len, "%s", event->metric.description); - } - - // Copy instances - out->instances = event->metric.instances; - } - - // Copy ROCm SMI metric information - for (int i = 0; i < get_map_size(device->smiMetrics); i++) - { - RocmonSmiEvent* event = NULL; - Event_rocm_t* out = 
&tmpList->events[device->numRocMetrics + i]; - int len; - - // Get event - if (get_smap_by_idx(device->smiMetrics, i, (void**)&event) < 0) - { - continue; - } - - // Copy name - len = strlen(event->name) + 5 /* Prefix */ + 1 /* NULL byte */; - out->name = (char*) malloc(len); - if (out->name) - { - snprintf(out->name, len, "RSMI_%s", event->name); - } - - // Copy description - char* description = "SMI Event"; // TODO: use real descriptions - len = strlen(description) + 1 /* NULL byte */; - out->description = (char*) malloc(len); - if (out->description) - { - snprintf(out->description, len, "%s", description); - } - - // Copy instances - out->instances = event->instances; - } - - *list = tmpList; - return 0; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getEventsOfGpu(gpuIdx, list); +#else + return rocmon_sdk_getEventsOfGpu(gpuIdx, list); +#endif } void rocmon_freeEventsOfGpu(EventList_rocm_t list) { -#define FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } - - // Check pointer - if (list == NULL) - { - return; - } - - if (list->events != NULL) - { - for (int i = 0; i < list->numEvents; i++) - { - Event_rocm_t* event = &list->events[i]; - FREE_IF_NOT_NULL(event->name); - FREE_IF_NOT_NULL(event->description); - } - free(list->events); - } - free(list); +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_freeEventsOfGpu(list); +#else + return rocmon_sdk_freeEventsOfGpu(list); +#endif } int rocmon_switchActiveGroup(int newGroupId) { - int ret; - - ret = rocmon_stopCounters(); - if (ret < 0) - { - return ret; - } - - ret = rocmon_setupCounters(newGroupId); - if (ret < 0) - { - return ret; - } - - ret = rocmon_startCounters(); - if (ret < 0) - { - return ret; - } - - return 0; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_switchActiveGroup(newGroupId); +#else + return rocmon_sdk_switchActiveGroup(newGroupId); +#endif } int rocmon_getNumberOfGroups(void) { - if (!rocmon_context || !rocmon_initialized) - { - return -EFAULT; - } - return 
rocmon_context->numActiveGroups; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getNumberOfGroups(); +#else + return rocmon_sdk_getNumberOfGroups(); +#endif } int rocmon_getIdOfActiveGroup(void) { - if (!rocmon_context || !rocmon_initialized) - { - return -EFAULT; - } - return rocmon_context->activeGroup; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getIdOfActiveGroup(); +#else + return rocmon_sdk_getIdOfActiveGroup(); +#endif } int rocmon_getNumberOfGPUs(void) { - if (!rocmon_context || !rocmon_initialized) - { - return -EFAULT; - } - return rocmon_context->numDevices; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getNumberOfGPUs(); +#else + return rocmon_sdk_getNumberOfGPUs(); +#endif } int rocmon_getNumberOfEvents(int groupId) { - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return -EFAULT; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->nevents; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getNumberOfEvents(groupId); +#else + return rocmon_sdk_getNumberOfEvents(groupId); +#endif } int rocmon_getNumberOfMetrics(int groupId) { - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) - { - return -EFAULT; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->nmetrics; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getNumberOfMetrics(groupId); +#else + return rocmon_sdk_getNumberOfMetrics(groupId); +#endif } double rocmon_getTimeOfGroup(int groupId) { - int i = 0; - double t = 0; - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) - { - return -EFAULT; - } - for (i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - t = MAX(t, (double)(device->time.stop - device->time.start)); - } - return t*1E-9; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getTimeOfGroup(groupId); +#else + return 
rocmon_sdk_getTimeOfGroup(groupId); +#endif } double rocmon_getLastTimeOfGroup(int groupId) { - int i = 0; - double t = 0; - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) - { - return -EFAULT; - } - for (i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - t = MAX(t, (double)(device->time.stop - device->time.read)); - } - return t*1E-9; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getLastTimeOfGroup(groupId); +#else + return rocmon_sdk_getLastTimeOfGroup(groupId); +#endif } double rocmon_getTimeToLastReadOfGroup(int groupId) { - int i = 0; - double t = 0; - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) - { - return -EFAULT; - } - for (i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - t = MAX(t, (double)(device->time.read - device->time.start)); - } - return t*1E-9; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getTimeToLastReadOfGroup(groupId); +#else + return rocmon_sdk_getTimeToLastReadOfGroup(groupId); +#endif } char* rocmon_getEventName(int groupId, int eventId) { - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - if ((eventId < 0) || (eventId >= ginfo->nevents)) - { - return NULL; - } - return ginfo->events[eventId]; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getEventName(groupId, eventId); +#else + return rocmon_sdk_getEventName(groupId, eventId); +#endif } char* rocmon_getCounterName(int groupId, int eventId) { - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - if ((eventId < 0) || (eventId >= ginfo->nevents)) - { - return NULL; - } - return 
ginfo->counters[eventId]; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getCounterName(groupId, eventId); +#else + return rocmon_sdk_getCounterName(groupId, eventId); +#endif } char* rocmon_getMetricName(int groupId, int metricId) { - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - if ((metricId < 0) || (metricId >= ginfo->nmetrics)) - { - return NULL; - } - return ginfo->metricnames[metricId]; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getMetricName(groupId, metricId); +#else + return rocmon_sdk_getMetricName(groupId, metricId); +#endif } char* rocmon_getGroupName(int groupId) { - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->groupname; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getGroupName(groupId); +#else + return rocmon_sdk_getGroupName(groupId); +#endif } char* rocmon_getGroupInfoShort(int groupId) { - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->shortinfo; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getGroupInfoShort(groupId); +#else + return rocmon_sdk_getGroupInfoShort(groupId); +#endif } char* rocmon_getGroupInfoLong(int groupId) { - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->longinfo; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getGroupInfoLong(groupId); +#else + return rocmon_sdk_getGroupInfoLong(groupId); +#endif } int rocmon_getGroups(char*** groups, char*** shortinfos, char*** longinfos) { - init_configuration(); - Configuration_t config = 
get_configuration(); - - return perfgroup_getGroups(config->groupPath, "amd_gpu", groups, shortinfos, longinfos); +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getGroups(groups, shortinfos, longinfos); +#else + return rocmon_sdk_getGroups(groups, shortinfos, longinfos); +#endif } int rocmon_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos) { - perfgroup_returnGroups(nrgroups, groups, shortinfos, longinfos); +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_returnGroups(nrgroups, groups, shortinfos, longinfos); +#else + return rocmon_sdk_returnGroups(nrgroups, groups, shortinfos, longinfos); +#endif } void rocmon_setVerbosity(int level) @@ -2279,5 +377,4 @@ void rocmon_setVerbosity(int level) } } - #endif /* LIKWID_WITH_ROCMON */ diff --git a/src/rocmon_marker.c b/src/rocmon_marker.c index 68337239d..01e43ffac 100644 --- a/src/rocmon_marker.c +++ b/src/rocmon_marker.c @@ -39,7 +39,9 @@ #include #include -#include +#ifndef LIKWID_ROCPROF_SDK +#include +#endif #define gettid() syscall(SYS_gettid) diff --git a/src/rocmon_v1.c b/src/rocmon_v1.c new file mode 100644 index 000000000..31ff459e8 --- /dev/null +++ b/src/rocmon_v1.c @@ -0,0 +1,2275 @@ + /* ======================================================================================= + * + * Filename: rocmon_v1.c + * + * Description: Main implementation of the performance monitoring module + * for AMD GPUs with ROCm < 6.2 + * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.roehl@googlemail.com + * Project: likwid + * + * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + * + * ======================================================================================= + */ +#ifdef LIKWID_WITH_ROCMON + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + + + + +// #include +// #include +// #include + +// Variables +static void *dl_hsa_lib = NULL; +static void *dl_profiler_lib = NULL; +static void *dl_rsmi_lib = NULL; + +RocmonContext *rocmon_context = NULL; +static bool rocmon_initialized = FALSE; +int likwid_rocmon_verbosity = DEBUGLEV_ONLY_ERROR; + +// Macros +#define membersize(type, member) sizeof(((type *) NULL)->member) +#define FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } +#define ROCM_CALL( call, args, handleerror ) \ + do { \ + hsa_status_t _status = (*call##_ptr)args; \ + if (_status != HSA_STATUS_SUCCESS && _status != HSA_STATUS_INFO_BREAK) { \ + const char* err = NULL; \ + fprintf(stderr, "Error: function %s failed with error %d\n", #call, _status); \ + rocprofiler_error_string(&err); \ + fprintf(stderr, "Error: %s\n", err); \ + handleerror; \ + } \ + } while (0) + +#define RSMI_CALL( call, args, handleerror ) \ + do { \ + rsmi_status_t _status = (*call##_ptr)args; \ + if (_status != RSMI_STATUS_SUCCESS) { \ + fprintf(stderr, "Error: function %s failed with error %d.\n", #call, _status); \ + handleerror; \ + } \ + } while (0) + +// ROCm function declarations +#define ROCMWEAK __attribute__(( weak )) +#define DECLAREFUNC_HSA(funcname, funcsig) hsa_status_t ROCMWEAK funcname funcsig; hsa_status_t ( *funcname##_ptr ) funcsig; +#define 
DECLAREFUNC_SMI(funcname, funcsig) rsmi_status_t ROCMWEAK funcname funcsig; rsmi_status_t ( *funcname##_ptr ) funcsig; + +DECLAREFUNC_HSA(hsa_init, ()); +DECLAREFUNC_HSA(hsa_shut_down, ()); +DECLAREFUNC_HSA(hsa_iterate_agents, (hsa_status_t (*callback)(hsa_agent_t agent, void* data), void* data)); +DECLAREFUNC_HSA(hsa_agent_get_info, (hsa_agent_t agent, hsa_agent_info_t attribute, void* value)); +DECLAREFUNC_HSA(hsa_system_get_info, (hsa_system_info_t attribute, void *value)); + +DECLAREFUNC_HSA(rocprofiler_iterate_info, (const hsa_agent_t* agent, rocprofiler_info_kind_t kind, hsa_status_t (*callback)(const rocprofiler_info_data_t, void* data), void* data)); +DECLAREFUNC_HSA(rocprofiler_close, (rocprofiler_t* context)); +DECLAREFUNC_HSA(rocprofiler_open, (hsa_agent_t agent, rocprofiler_feature_t* features, uint32_t feature_count, rocprofiler_t** context, uint32_t mode, rocprofiler_properties_t* properties)); +DECLAREFUNC_HSA(rocprofiler_error_string, ()); +DECLAREFUNC_HSA(rocprofiler_start, (rocprofiler_t* context, uint32_t group_index)); +DECLAREFUNC_HSA(rocprofiler_stop, (rocprofiler_t* context, uint32_t group_index)); +DECLAREFUNC_HSA(rocprofiler_read, (rocprofiler_t* context, uint32_t group_index)); +DECLAREFUNC_HSA(rocprofiler_get_data, (rocprofiler_t* context, uint32_t group_index)); +DECLAREFUNC_HSA(rocprofiler_get_metrics, (const rocprofiler_t* context)); + +DECLAREFUNC_SMI(rsmi_init, (uint64_t flags)); +DECLAREFUNC_SMI(rsmi_shut_down, ()); +DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_open, (uint32_t dv_ind, rsmi_func_id_iter_handle_t* handle)); +DECLAREFUNC_SMI(rsmi_dev_supported_variant_iterator_open, (rsmi_func_id_iter_handle_t obj_h, rsmi_func_id_iter_handle_t* var_iter)); +DECLAREFUNC_SMI(rsmi_func_iter_value_get, (rsmi_func_id_iter_handle_t handle, rsmi_func_id_value_t* value )); +DECLAREFUNC_SMI(rsmi_func_iter_next, (rsmi_func_id_iter_handle_t handle)); +DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_close, (rsmi_func_id_iter_handle_t* 
handle)); +DECLAREFUNC_SMI(rsmi_dev_power_ave_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* power)); +DECLAREFUNC_SMI(rsmi_dev_pci_throughput_get, (uint32_t dv_ind, uint64_t* sent, uint64_t* received, uint64_t* max_pkt_sz)); +DECLAREFUNC_SMI(rsmi_dev_pci_replay_counter_get, (uint32_t dv_ind, uint64_t* counter)); +DECLAREFUNC_SMI(rsmi_dev_memory_total_get, (uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t* total)); +DECLAREFUNC_SMI(rsmi_dev_memory_usage_get, (uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t* used )); +DECLAREFUNC_SMI(rsmi_dev_memory_busy_percent_get, (uint32_t dv_ind, uint32_t* busy_percent)); +DECLAREFUNC_SMI(rsmi_dev_memory_reserved_pages_get, (uint32_t dv_ind, uint32_t* num_pages, rsmi_retired_page_record_t* records)); +DECLAREFUNC_SMI(rsmi_dev_fan_rpms_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); +DECLAREFUNC_SMI(rsmi_dev_fan_speed_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); +DECLAREFUNC_SMI(rsmi_dev_fan_speed_max_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* max_speed)); +DECLAREFUNC_SMI(rsmi_dev_temp_metric_get, (uint32_t dv_ind, uint32_t sensor_type, rsmi_temperature_metric_t metric, int64_t* temperature)); +DECLAREFUNC_SMI(rsmi_dev_volt_metric_get, (uint32_t dv_ind, rsmi_voltage_type_t sensor_type, rsmi_voltage_metric_t metric, int64_t* voltage)); +DECLAREFUNC_SMI(rsmi_dev_overdrive_level_get, (uint32_t dv_ind, uint32_t* od)); +DECLAREFUNC_SMI(rsmi_dev_ecc_count_get, (uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_error_count_t* ec)); +DECLAREFUNC_SMI(rsmi_compute_process_info_get, (rsmi_process_info_t* procs, uint32_t* num_items)); + + +// ---------------------------------------------------- +// SMI event wrapper +// ---------------------------------------------------- + +static int +_smi_wrapper_pci_throughput_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t value; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _smi_wrapper_pci_throughput_get(%d, %d), 
deviceId, event->extra); + // Internal variant: 0 for sent, 1 for received bytes and 2 for max packet size + if (event->extra == 0) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, &value, NULL, NULL), return -1); + else if (event->extra == 1) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, &value, NULL), return -1); + else if (event->extra == 2) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, NULL, &value), return -1); + else return -1; + + result->fullValue += value; + result->lastValue = value; + + return 0; +} + + +static int +_smi_wrapper_pci_replay_counter_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t counter; + RSMI_CALL(rsmi_dev_pci_replay_counter_get, (deviceId, &counter), return -1); + result->fullValue += counter; + result->lastValue = counter; + + return 0; +} + + +static int +_smi_wrapper_power_ave_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t power; + RSMI_CALL(rsmi_dev_power_ave_get, (deviceId, event->subvariant, &power), return -1); + result->fullValue += power; + result->lastValue = power; + + return 0; +} + + +static int +_smi_wrapper_memory_total_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t total; + RSMI_CALL(rsmi_dev_memory_total_get, (deviceId, event->variant, &total), return -1); + result->fullValue += total; + result->lastValue = total; + + return 0; +} + + +static int +_smi_wrapper_memory_usage_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t used; + RSMI_CALL(rsmi_dev_memory_usage_get, (deviceId, event->variant, &used), return -1); + result->fullValue += used; + result->lastValue = used; + + return 0; +} + + +static int +_smi_wrapper_memory_busy_percent_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint32_t percent; + RSMI_CALL(rsmi_dev_memory_busy_percent_get, (deviceId, &percent), return -1); + result->fullValue += percent; + result->lastValue = percent; + 
+ return 0; +} + + +static int +_smi_wrapper_memory_reserved_pages_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint32_t num_pages; + RSMI_CALL(rsmi_dev_memory_reserved_pages_get, (deviceId, &num_pages, NULL), return -1); + result->fullValue += num_pages; + result->lastValue = num_pages; + + return 0; +} + + +static int +_smi_wrapper_fan_rpms_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + int64_t speed; + RSMI_CALL(rsmi_dev_fan_rpms_get, (deviceId, event->subvariant, &speed), return -1); + result->fullValue += speed; + result->lastValue = speed; + + return 0; +} + + +static int +_smi_wrapper_fan_speed_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + int64_t speed; + RSMI_CALL(rsmi_dev_fan_speed_get, (deviceId, event->subvariant, &speed), return -1); + result->fullValue += speed; + result->lastValue = speed; + + return 0; +} + + +/* Reads the maximum fan speed of sensor event->subvariant on device deviceId, adds it to result->fullValue and stores it in result->lastValue. Returns 0 on success, -1 if the SMI call fails. */ +static int +_smi_wrapper_fan_speed_max_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + /* Must be unsigned: rsmi_dev_fan_speed_max_get is declared in this file with a uint64_t* output parameter */ + uint64_t max_speed; + RSMI_CALL(rsmi_dev_fan_speed_max_get, (deviceId, event->subvariant, &max_speed), return -1); + result->fullValue += max_speed; + result->lastValue = max_speed; + + return 0; +} + + +static int +_smi_wrapper_temp_metric_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + int64_t temperature; + RSMI_CALL(rsmi_dev_temp_metric_get, (deviceId, event->subvariant, event->variant, &temperature), return -1); + result->fullValue += temperature; + result->lastValue = temperature; + + return 0; +} + + +static int +_smi_wrapper_volt_metric_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + int64_t voltage; + RSMI_CALL(rsmi_dev_volt_metric_get, (deviceId, event->subvariant, event->variant, &voltage), return -1); + result->fullValue += voltage; + result->lastValue = voltage; + + return 0; +} + + +static int +_smi_wrapper_overdrive_level_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{
+ uint32_t overdrive; + RSMI_CALL(rsmi_dev_overdrive_level_get, (deviceId, &overdrive), return -1); + result->fullValue += overdrive; + result->lastValue = overdrive; + + return 0; +} + + +static int +_smi_wrapper_ecc_count_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + rsmi_error_count_t error_count; + RSMI_CALL(rsmi_dev_ecc_count_get, (deviceId, event->variant, &error_count), return -1); + + if (event->extra == 0) + { + result->lastValue = error_count.correctable_err - result->fullValue; + result->fullValue = error_count.correctable_err; + } + else if (event->extra == 1) + { + result->lastValue = error_count.uncorrectable_err - result->fullValue; + result->fullValue = error_count.uncorrectable_err; + } + else + { + return -1; + } + + return 0; +} + + +static int +_smi_wrapper_compute_process_info_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint32_t num_items; + RSMI_CALL(rsmi_compute_process_info_get, (NULL, &num_items), return -1); + result->fullValue += num_items; + result->lastValue = num_items; + + return 0; +} + + +// ---------------------------------------------------- +// Rocmon helper functions +// ---------------------------------------------------- + +static int +_rocmon_link_libraries() +{ + #define DLSYM_AND_CHECK( dllib, name ) name##_ptr = dlsym( dllib, #name ); if ( dlerror() != NULL ) { ERROR_PRINT(Failed to link #name); return -1; } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm libraries); + + // Need to link in the ROCm HSA libraries + dl_hsa_lib = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL); + if (!dl_hsa_lib) + { + ERROR_PRINT(ROCm HSA library libhsa-runtime64.so not found: %s, dlerror()); + return -1; + } + + // Need to link in the Rocprofiler libraries + dl_profiler_lib = dlopen("librocprofiler64.so", RTLD_NOW | RTLD_GLOBAL); + if (!dl_profiler_lib) + { + dl_profiler_lib = dlopen("librocprofiler64.so.1", RTLD_NOW | RTLD_GLOBAL); + if (!dl_profiler_lib) + { + 
ERROR_PRINT(Rocprofiler library librocprofiler64.so not found: %s, dlerror()); + return -1; + } + } + + // Need to link in the Rocprofiler libraries + dl_rsmi_lib = dlopen("librocm_smi64.so", RTLD_NOW | RTLD_GLOBAL); + if (!dl_rsmi_lib) + { + ERROR_PRINT(ROCm SMI library librocm_smi64.so not found: %s, dlerror()); + return -1; + } + + // Link HSA functions + DLSYM_AND_CHECK(dl_hsa_lib, hsa_init); + DLSYM_AND_CHECK(dl_hsa_lib, hsa_shut_down); + DLSYM_AND_CHECK(dl_hsa_lib, hsa_iterate_agents); + DLSYM_AND_CHECK(dl_hsa_lib, hsa_agent_get_info); + DLSYM_AND_CHECK(dl_hsa_lib, hsa_system_get_info); + + // Link Rocprofiler functions + DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_iterate_info); + DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_close); + DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_open); + DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_error_string); + DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_start); + DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_stop); + DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_read); + DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_get_data); + DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_get_metrics); + + // Link SMI functions + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_init); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_shut_down); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_func_iterator_open); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_variant_iterator_open); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_func_iter_value_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_func_iter_next); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_func_iterator_close); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_power_ave_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_pci_throughput_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_pci_replay_counter_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_total_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_usage_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_busy_percent_get); + DLSYM_AND_CHECK(dl_rsmi_lib, 
rsmi_dev_memory_reserved_pages_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_rpms_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_speed_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_speed_max_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_temp_metric_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_volt_metric_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_overdrive_level_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_ecc_count_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_compute_process_info_get); + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm libraries done); + return 0; +} + +typedef struct { + RocmonContext* context; + int numGpus; + const int* gpuIds; +} iterate_agents_cb_arg; + +typedef struct { + RocmonDevice* device; + int currIndex; +} iterate_info_cb_arg; + + +static hsa_status_t +_rocmon_iterate_info_callback_count(const rocprofiler_info_data_t info, void* data) +{ + RocmonDevice* device = (RocmonDevice*) data; + if (device) { + device->numRocMetrics++; + } + return HSA_STATUS_SUCCESS; +} + +static void +_rocmon_print_rocprofiler_info_data(const rocprofiler_info_data_t info) +{ + if (info.kind != ROCPROFILER_INFO_KIND_METRIC) + { + return; + } + printf("Name '%s':\n", info.metric.name); + printf("\tKind: '%s'\n", (info.kind == ROCPROFILER_INFO_KIND_METRIC ? 
"Metric" : "Trace")); + printf("\tInstances: %d\n", info.metric.instances); + printf("\tDescription: '%s'\n", info.metric.description); + printf("\tExpression: '%s'\n", info.metric.expr); + printf("\tBlockName: '%s'\n", info.metric.block_name); + printf("\tBlockCounters: %d\n", info.metric.block_counters); +} + +static hsa_status_t +_rocmon_iterate_info_callback_add(const rocprofiler_info_data_t info, void* data) +{ + iterate_info_cb_arg* arg = (iterate_info_cb_arg*) data; + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _rocmon_iterate_info_callback_add); + if (likwid_rocmon_verbosity == DEBUGLEV_DEVELOP) + { + _rocmon_print_rocprofiler_info_data(info); + } + // Check info kind + if (info.kind != ROCPROFILER_INFO_KIND_METRIC) + { + ERROR_PRINT(Wrong info kind %u, info.kind); + return HSA_STATUS_ERROR; + } + + // Check index + if (arg->currIndex >= arg->device->numRocMetrics) + { + ERROR_PRINT(Metric index out of bounds: %d, arg->currIndex); + return HSA_STATUS_ERROR; + } + + // Copy info data + rocprofiler_info_data_t* target_info = &arg->device->rocMetrics[arg->currIndex]; + memcpy(target_info, &info, sizeof(rocprofiler_info_data_t)); + arg->currIndex++; + + return HSA_STATUS_SUCCESS; +} + + +static hsa_status_t +_rocmon_iterate_agents_callback(hsa_agent_t agent, void* argv) +{ + // Count number of callback invocations as the devices id + static int nextDeviceId = 0; + int deviceId = nextDeviceId; + bool noAgent = false; + + iterate_agents_cb_arg *arg = (iterate_agents_cb_arg*) argv; + + // Check if device is a GPU + hsa_device_type_t type; + ROCM_CALL(hsa_agent_get_info, (agent, HSA_AGENT_INFO_DEVICE, &type), return -1); + if (type != HSA_DEVICE_TYPE_GPU) + { + return HSA_STATUS_SUCCESS; + } + nextDeviceId++; + + // Check if device is includes in arg->gpuIds + int gpuIndex = -1; + for (int i = 0; i < arg->numGpus; i++) + { + if (deviceId == arg->gpuIds[i]) + { + gpuIndex = i; + break; + } + } + if (gpuIndex < 0) + { + return HSA_STATUS_SUCCESS; + } + 
ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing agent %d, gpuIndex); + + // Add agent to context + RocmonDevice *device = &arg->context->devices[gpuIndex]; + device->deviceId = deviceId; + device->hsa_agent = agent; + device->context = NULL; + device->numActiveRocEvents = 0; + device->activeRocEvents = NULL; + device->numGroupResults = 0; + device->groupResults = NULL; + + // Get number of available metrics + device->numRocMetrics = 0; + ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_count, device), return HSA_STATUS_ERROR); + ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, RocProfiler provides %d events, device->numRocMetrics); + + // workaround for bug in ROCm 5.4.0 + if(device->numRocMetrics == 0) { + ROCM_CALL(rocprofiler_iterate_info, (NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_count, device), return HSA_STATUS_ERROR); + noAgent = true; + } + + // Allocate memory for metrics + device->rocMetrics = (rocprofiler_info_data_t*) malloc(device->numRocMetrics * sizeof(rocprofiler_info_data_t)); + if (device->rocMetrics == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate set of rocMetrics); + return HSA_STATUS_ERROR; + } + + // Initialize SMI events map + if (init_map(&device->smiMetrics, MAP_KEY_TYPE_STR, 0, &free) < 0) + { + ERROR_PLAIN_PRINT(Cannot init smiMetrics map); + return HSA_STATUS_ERROR; + } + + // Fetch metric informatino + iterate_info_cb_arg info_arg = { + .device = device, + .currIndex = 0, + }; + ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, Read %d RocProfiler events for device %d, device->numRocMetrics, device->deviceId); + + // If the call fails with agent, call rocprofiler_iterate_info without agent + if(noAgent) + { + ROCM_CALL(rocprofiler_iterate_info, (NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_add, &info_arg), return HSA_STATUS_ERROR); + } else { + ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_add, &info_arg), 
return HSA_STATUS_ERROR); + } + + return HSA_STATUS_SUCCESS; +} + + +/* Fills 'group' from 'eventString': a string containing ':' is handed to perfgroup_customGroup, anything else is read as a named group for "amd_gpu" from config->groupPath. Returns 0 on success or the negative error code of the failing perfgroup call. */ +static int +_rocmon_parse_eventstring(const char* eventString, GroupInfo* group) +{ + int err = 0; + Configuration_t config = get_configuration(); + bstring eventBString = bfromcstr(eventString); + + /* Only the presence of ':' is needed; destroy the temporary bstring right away so that no return path leaks it */ + int isCustomGroup = (bstrchrp(eventBString, ':', 0) != BSTR_ERR); + bdestroy(eventBString); + + if (isCustomGroup) + { + // If custom group -> perfgroup_customGroup + err = perfgroup_customGroup(eventString, group); + if (err < 0) + { + ERROR_PRINT(Cannot transform %s to performance group, eventString); + return err; + } + } + else + { + // If performance group -> perfgroup_readGroup + err = perfgroup_readGroup(config->groupPath, "amd_gpu", eventString, group); + if (err == -EACCES) + { + ERROR_PRINT(Access to performance group %s not allowed, eventString); + return err; + } + else if (err == -ENODEV) + { + ERROR_PRINT(Performance group %s only available with deactivated HyperThreading, eventString); + return err; + } + if (err < 0) + { + ERROR_PRINT(Cannot read performance group %s, eventString); + return err; + } + } + + return 0; +} + + +static int +_rocmon_get_timestamp(uint64_t* timestamp_ns) +{ + uint64_t timestamp; + + // Get timestamp from system + ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP, &timestamp), return -1); + // Convert to nanoseconds + *timestamp_ns = (uint64_t)((long double)timestamp * rocmon_context->hsa_timestamp_factor); + + return 0; +} + + +static int +_rocmon_getLastResult(RocmonDevice* device, int eventId, double* value) +{ + rocprofiler_data_t* data = &device->activeRocEvents[eventId].data; + + switch (data->kind) + { + case ROCPROFILER_DATA_KIND_INT32: + *value = (double) data->result_int32; + break; + case ROCPROFILER_DATA_KIND_INT64: + *value = (double) data->result_int64; + break; + case ROCPROFILER_DATA_KIND_FLOAT: + *value = (double) data->result_float; + break; + case ROCPROFILER_DATA_KIND_DOUBLE: + *value = data->result_double; + break; + + case ROCPROFILER_DATA_KIND_BYTES: + case ROCPROFILER_DATA_KIND_UNINIT: + default: +
return -1; + } + + return 0; +} + + +static int +_rocmon_readCounters_rocprofiler(RocmonDevice* device) +{ + int ret; + + // Check if there are any counters to read + if (device->numActiveRocEvents <= 0) + { + return 0; + } + + if (!device->context) + { + return 0; + } + + ROCM_CALL(rocprofiler_read, (device->context, 0), return -1); + ROCM_CALL(rocprofiler_get_data, (device->context, 0), return -1); + ROCM_CALL(rocprofiler_get_metrics, (device->context), return -1); + + // Update results + RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; + for (int i = 0; i < device->numActiveRocEvents; i++) + { + RocmonEventResult* result = &groupResult->results[i]; + + // Read value + ret = _rocmon_getLastResult(device, i, &result->fullValue); + if (ret < 0) + { + return -1; + } + + // Calculate delta since last read + /* NOTE(review): this subtracts the previous lastValue, not the previous fullValue, from the new fullValue - confirm this is the intended delta */ + result->lastValue = result->fullValue - result->lastValue; + } + + return 0; +} + + +static int +_rocmon_readCounters_smi(RocmonDevice* device) +{ + // Check if there are any counters to read + if (device->numActiveSmiEvents <= 0) + { + return 0; + } + + // SMI results are stored after the rocprofiler results of the active group + RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; + for (int i = 0; i < device->numActiveSmiEvents; i++) + { + RocmonSmiEvent* event = &device->activeSmiEvents[i]; + RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; + + // Measure counter + if (event->measureFunc) + { + event->measureFunc(device->deviceId, event, result); + } + } + + return 0; +} + + +/* Takes one timestamp, then for every device stores it at the location returned by getDestTimestampFunc (if the callback and its result are non-NULL) and reads the rocprofiler and SMI counters. Returns 0 on success, otherwise the first non-zero/negative error encountered. */ +static int +_rocmon_readCounters(uint64_t* (*getDestTimestampFunc)(RocmonDevice* device)) +{ + int ret; + + // Get timestamp + uint64_t timestamp; + ret = _rocmon_get_timestamp(&timestamp); + if (ret != 0) + { + return ret; + } + + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + + // Save timestamp + if (getDestTimestampFunc) + { + uint64_t* timestampDest =
getDestTimestampFunc(device); + if (timestampDest) + { + *timestampDest = timestamp; + } + } + + // Read rocprofiler counters + ret = _rocmon_readCounters_rocprofiler(device); + if (ret < 0) return ret; + + // Read SMI counters + ret = _rocmon_readCounters_smi(device); + if (ret < 0) return ret; + } + + return 0; +} + + +static uint64_t* +_rocmon_get_read_time(RocmonDevice* device) +{ + return &device->time.read; +} + + +static uint64_t* +_rocmon_get_stop_time(RocmonDevice* device) +{ + return &device->time.stop; +} + + +// ---------------------------------------------------- +// Rocmon SMI helper functions +// ---------------------------------------------------- + +/* Builds the lookup label for an SMI function: "funcname" for NORMAL and INSTANCES events, "funcname|variant" for VARIANT events and "funcname|variant|subvariant" for SUBVARIANT events. The returned bstring is newly created; the caller in this file releases it with bdestroy(). */ +static bstring +_rocmon_smi_build_label(RocmonSmiEventType type, const char* funcname, uint64_t variant, uint64_t subvariant) +{ + switch (type) + { + case ROCMON_SMI_EVENT_TYPE_NORMAL: + return bfromcstr(funcname); + case ROCMON_SMI_EVENT_TYPE_VARIANT: + return bformat("%s|%" PRIu64, funcname, variant); + case ROCMON_SMI_EVENT_TYPE_SUBVARIANT: + return bformat("%s|%" PRIu64 "|%" PRIu64, funcname, variant, subvariant); + case ROCMON_SMI_EVENT_TYPE_INSTANCES: + default: + /* The default keeps every path returning a value: an unexpected type falls back to the plain function name instead of running off the end of a non-void function */ + return bfromcstr(funcname); + } +} + + +static int +_rocmon_smi_add_event_to_device(RocmonDevice* device, const char* funcname, RocmonSmiEventType type, int64_t variant, uint64_t subvariant) +{ + int ret; + + // Get event by label + RocmonSmiEventList* list = NULL; + bstring label = _rocmon_smi_build_label(type, funcname, variant, subvariant); + ret = get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list); + bdestroy(label); + if (ret < 0) + { + // Event not registered -> ignore + return 0; + } + + // For events with multiple sensor, only make one entry -> find if one exists + if (type == ROCMON_SMI_EVENT_TYPE_INSTANCES && subvariant > 0) + { + // Get list from map + for (int i = 0; i < list->numEntries; i++) + { + RocmonSmiEvent* event = &list->entries[i]; + RocmonSmiEvent* existingEvent = NULL; + ret =
get_smap_by_key(device->smiMetrics, event->name, (void**)&existingEvent); + if (ret < 0) + { + ERROR_PRINT(Failed to find previous instance for event %s, event->name); + return -1; + } + + // Update instance information + existingEvent->instances++; + } + return 0; + } + + for (int i = 0; i < list->numEntries; i++) + { + RocmonSmiEvent* event = &list->entries[i]; + + // Allocate memory for device event description + RocmonSmiEvent* tmpEvent = (RocmonSmiEvent*) malloc(sizeof(RocmonSmiEvent)); + if (tmpEvent == NULL) + { + ERROR_PRINT(Failed to allocate memory for SMI event in device list %s, event->name); + return -ENOMEM; + } + + // Copy information from global description + memcpy(tmpEvent, event, sizeof(RocmonSmiEvent)); + tmpEvent->variant = variant; + tmpEvent->subvariant = subvariant; + tmpEvent->instances = 1; + + // Save event info to device event map + add_smap(device->smiMetrics, tmpEvent->name, tmpEvent); + } + + return 0; +} + + +static int +_rocmon_smi_get_function_subvariants(RocmonDevice* device, const char* funcname, uint64_t variant, rsmi_func_id_iter_handle_t var_iter) +{ + rsmi_func_id_iter_handle_t sub_var_iter; + rsmi_func_id_value_t value; + rsmi_status_t status; + int ret; + + // Get open subvariants iterator + status = (*rsmi_dev_supported_variant_iterator_open_ptr)(var_iter, &sub_var_iter); + if (status == RSMI_STATUS_NO_DATA) + { + // No subvariants + ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_VARIANT, variant, 0); + if (ret < 0) return -1; + return 0; + } + + // Subvariants available -> iterate them + do { + // Get subvariant information + (*rsmi_func_iter_value_get_ptr)(sub_var_iter, &value); + + // Process info + if (variant == RSMI_DEFAULT_VARIANT) + ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_INSTANCES, variant, value.id); + else + ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, variant, value.id); + if (ret < 0) return ret; + 
+ // Advance iterator + status = (*rsmi_func_iter_next_ptr)(sub_var_iter); + } while (status != RSMI_STATUS_NO_DATA); + + // Close iterator + (*rsmi_dev_supported_func_iterator_close_ptr)(&sub_var_iter); + + return 0; +} + + +static int +_rocmon_smi_get_function_variants(RocmonDevice* device, const char* funcname, rsmi_func_id_iter_handle_t iter_handle) +{ + rsmi_func_id_iter_handle_t var_iter; + rsmi_func_id_value_t value; + rsmi_status_t status; + int ret; + + // Get open variants iterator + status = (*rsmi_dev_supported_variant_iterator_open_ptr)(iter_handle, &var_iter); + if (status == RSMI_STATUS_NO_DATA) + { + // No variants + ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0); + if (ret < 0) return -1; + return 0; + } + + // Variants available -> iterate them + do { + // Get variant information + (*rsmi_func_iter_value_get_ptr)(var_iter, &value); + + // Get function subvariants + ret = _rocmon_smi_get_function_subvariants(device, funcname, value.id, var_iter); + if (ret < 0) return -1; + + // Advance iterator + status = (*rsmi_func_iter_next_ptr)(var_iter); + } while (status != RSMI_STATUS_NO_DATA); + + // Close iterator + (*rsmi_dev_supported_func_iterator_close_ptr)(&var_iter); + + return 0; +} + + +static int +_rocmon_smi_get_functions(RocmonDevice* device) +{ + rsmi_func_id_iter_handle_t iter_handle; + rsmi_func_id_value_t value; + rsmi_status_t status; + int ret; + + // Open iterator + //(*rsmi_dev_supported_func_iterator_open_ptr)(device->deviceId, &iter_handle); + RSMI_CALL(rsmi_dev_supported_func_iterator_open, (device->deviceId, &iter_handle), { + return -1; + }); + + do + { + // Get function information + //(*rsmi_func_iter_value_get_ptr)(iter_handle, &value); + RSMI_CALL(rsmi_func_iter_value_get, (iter_handle, &value), { + ERROR_PRINT(Failed to get smi function value for device %d, device->deviceId); + RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); + return -1; + }); + + // Get 
function variants + ret = _rocmon_smi_get_function_variants(device, value.name, iter_handle); + if (ret < 0) + { + ERROR_PRINT(Failed to get smi function variants for device %d, device->deviceId); + RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); + return -1; + } + + // Advance iterator (cannot use RSMI_CALL macro here because we have an assignment, + // so we check that the function pointer exists to avoid segfaults.) + if (rsmi_func_iter_next_ptr) { + status = (*rsmi_func_iter_next_ptr)(iter_handle); + } + } while (status != RSMI_STATUS_NO_DATA); + + // Close iterator + //(*rsmi_dev_supported_func_iterator_close_ptr)(&iter_handle); + RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); + + // Add device independent functions + ret = _rocmon_smi_add_event_to_device(device, "rsmi_compute_process_info_get", ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0); + if (ret < 0) return -1; + + return 0; +} + +#define ADD_SMI_EVENT(name, type, smifunc, variant, subvariant, extra, measurefunc) if (_rocmon_smi_add_event_to_map(name, type, smifunc, variant, subvariant, extra, measurefunc) < 0) { return -1; } +#define ADD_SMI_EVENT_N(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_NORMAL, smifunc, 0, 0, extra, measurefunc) +#define ADD_SMI_EVENT_V(name, smifunc, variant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_VARIANT, smifunc, variant, 0, extra, measurefunc) +#define ADD_SMI_EVENT_S(name, smifunc, variant, subvariant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, smifunc, variant, subvariant, extra, measurefunc) +#define ADD_SMI_EVENT_I(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_INSTANCES, smifunc, 0, 0, extra, measurefunc) + +static int +_rocmon_smi_add_event_to_map(char* name, RocmonSmiEventType type, char* smifunc, uint64_t variant, uint64_t subvariant, uint64_t extra, RocmonSmiMeasureFunc measureFunc) +{ + // Add new event list to map (if 
not already present) + bstring label = _rocmon_smi_build_label(type, smifunc, variant, subvariant); + RocmonSmiEventList* list; + if (get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list) < 0) + { + // Allocate memory for event list + list = (RocmonSmiEventList*) malloc(sizeof(RocmonSmiEventList)); + if (list == NULL) + { + ERROR_PRINT(Failed to allocate memory for SMI event list %s, name); + return -ENOMEM; + } + list->entries = NULL; + list->numEntries = 0; + + add_smap(rocmon_context->smiEvents, bdata(label), list); + } + bdestroy(label); + + // Allocate memory for another event in list + list->numEntries++; + list->entries = (RocmonSmiEvent*) realloc(list->entries, list->numEntries * sizeof(RocmonSmiEvent)); + if (list->entries == NULL) + { + ERROR_PRINT(Failed to allocate memory for SMI event %s, name); + return -ENOMEM; + } + + // Set event properties + RocmonSmiEvent* event = &list->entries[list->numEntries-1]; + strncpy(event->name, name, sizeof(event->name)); + event->name[sizeof(event->name)] = '\0'; + event->type = type; + event->variant = variant; + event->subvariant = subvariant; + event->extra = extra; + event->instances = 0; // gets set when scanning supported device functions + event->measureFunc = measureFunc; + + return 0; +} + + +static void +_rcomon_smi_free_event_list(void* vlist) +{ + RocmonSmiEventList* list = (RocmonSmiEventList*)vlist; + if (list) + { + FREE_IF_NOT_NULL(list->entries); + free(list); + } +} + + +static int +_rocmon_smi_init_events() +{ + int ret; + + // Init map + ret = init_map(&rocmon_context->smiEvents, MAP_KEY_TYPE_STR, 0, &_rcomon_smi_free_event_list); + if (ret < 0) + { + ERROR_PRINT(Failed to create map for ROCm SMI events); + return -1; + } + + // Add events + ADD_SMI_EVENT_N("PCI_THROUGHPUT_SENT", "rsmi_dev_pci_throughput_get", 0, &_smi_wrapper_pci_throughput_get ); + ADD_SMI_EVENT_N("PCI_THROUGHPUT_RECEIVED", "rsmi_dev_pci_throughput_get", 1, &_smi_wrapper_pci_throughput_get ); + 
ADD_SMI_EVENT_N("PCI_THROUGHPUT_MAX_PKT_SZ", "rsmi_dev_pci_throughput_get", 2, &_smi_wrapper_pci_throughput_get ); + ADD_SMI_EVENT_N("PCI_REPLAY_COUNTER", "rsmi_dev_pci_replay_counter_get", 0, &_smi_wrapper_pci_replay_counter_get ); + ADD_SMI_EVENT_I("POWER_AVE", "rsmi_dev_power_ave_get", 0, &_smi_wrapper_power_ave_get ); + ADD_SMI_EVENT_V("MEMORY_TOTAL_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VRAM, 0, &_smi_wrapper_memory_total_get ); + ADD_SMI_EVENT_V("MEMORY_TOTAL_VIS_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_total_get ); + ADD_SMI_EVENT_V("MEMORY_TOTAL_GTT", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_total_get ); + ADD_SMI_EVENT_V("MEMORY_USAGE_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VRAM, 0, &_smi_wrapper_memory_usage_get ); + ADD_SMI_EVENT_V("MEMORY_USAGE_VIS_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_usage_get ); + ADD_SMI_EVENT_V("MEMORY_USAGE_GTT", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_usage_get ); + ADD_SMI_EVENT_N("MEMORY_BUSY_PERCENT", "rsmi_dev_memory_busy_percent_get", 0, &_smi_wrapper_memory_busy_percent_get ); + ADD_SMI_EVENT_N("MEMORY_NUM_RESERVED_PAGES", "rsmi_dev_memory_reserved_pages_get", 0, &_smi_wrapper_memory_reserved_pages_get ); + ADD_SMI_EVENT_I("FAN_RPMS", "rsmi_dev_fan_rpms_get", 0, &_smi_wrapper_fan_rpms_get ); + ADD_SMI_EVENT_I("FAN_SPEED", "rsmi_dev_fan_speed_get", 0, &_smi_wrapper_fan_speed_get ); + ADD_SMI_EVENT_I("FAN_SPEED_MAX", "rsmi_dev_fan_speed_max_get", 0, &_smi_wrapper_fan_speed_max_get ); + ADD_SMI_EVENT_S("TEMP_EDGE", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_EDGE, 0, &_smi_wrapper_temp_metric_get ); + ADD_SMI_EVENT_S("TEMP_JUNCTION", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_JUNCTION, 0, &_smi_wrapper_temp_metric_get ); + ADD_SMI_EVENT_S("TEMP_MEMORY", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, 
RSMI_TEMP_TYPE_MEMORY, 0, &_smi_wrapper_temp_metric_get ); + ADD_SMI_EVENT_S("VOLT_VDDGFX", "rsmi_dev_volt_metric_get", RSMI_VOLT_CURRENT, RSMI_VOLT_TYPE_VDDGFX, 0, &_smi_wrapper_volt_metric_get ); + ADD_SMI_EVENT_N("OVERDRIVE_LEVEL", "rsmi_dev_overdrive_level_get", 0, &_smi_wrapper_overdrive_level_get ); + ADD_SMI_EVENT_V("ECC_COUNT_UMC_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_UMC_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SDMA_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SDMA_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_GFX_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_GFX_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_HDP_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 0, 
&_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_HDP_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_DF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_DF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SMN_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SMN_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SEM_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SEM_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP0_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP0_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP1_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP1_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_FUSE_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_FUSE_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 1, &_smi_wrapper_ecc_count_get ); + 
ADD_SMI_EVENT_V("ECC_COUNT_LAST_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_LAST_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_N("PROCS_USING_GPU", "rsmi_compute_process_info_get", 0, &_smi_wrapper_compute_process_info_get ); + + return 0; +} + + +int +rocmon_v1_init(int numGpus, const int* gpuIds) +{ + hsa_status_t status; + + // check if already initialized + if (rocmon_initialized) + { + return 0; + } + if (rocmon_context != NULL) + { + return -EEXIST; + } + + // Validate arguments + if (numGpus <= 0) + { + ERROR_PRINT(Number of gpus must be greater than 0 but only %d given, numGpus); + return -EINVAL; + } + + // Initialize other parts + init_configuration(); + + // initialize libraries + int ret = _rocmon_link_libraries(); + if (ret < 0) + { + ERROR_PLAIN_PRINT(Failed to initialize libraries); + return ret; + } + + // Allocate memory for context + rocmon_context = (RocmonContext*) malloc(sizeof(RocmonContext)); + if (rocmon_context == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate Rocmon context); + return -ENOMEM; + } + rocmon_context->groups = NULL; + rocmon_context->numGroups = 0; + rocmon_context->numActiveGroups = 0; + + rocmon_context->devices = (RocmonDevice*) malloc(numGpus * sizeof(RocmonDevice)); + rocmon_context->numDevices = numGpus; + if (rocmon_context->devices == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate set of GPUs); + free(rocmon_context); + rocmon_context = NULL; + return -ENOMEM; + } + + // init hsa library + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing HSA); + ROCM_CALL(hsa_init, (), + { + ERROR_PLAIN_PRINT(Failed to init hsa library); + goto rocmon_init_hsa_failed; + }); + + // init rocm smi library + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RSMI); + RSMI_CALL(rsmi_init, (0), + { + ERROR_PLAIN_PRINT(Failed to init rocm_smi); + goto rocmon_init_rsmi_failed; + }); + + // Get hsa 
timestamp factor + uint64_t frequency_hz; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Getting HSA timestamp factor); + ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &frequency_hz), + { + ERROR_PLAIN_PRINT(Failed to get HSA timestamp factor); + goto rocmon_init_info_agents_failed; + }); + rocmon_context->hsa_timestamp_factor = (long double)1000000000 / (long double)frequency_hz; + + // initialize structures for specified devices (fetch ROCm specific info) + iterate_agents_cb_arg arg = { + .context = rocmon_context, + .numGpus = numGpus, + .gpuIds = gpuIds, + }; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Iterating through %d available agents, numGpus); + ROCM_CALL(hsa_iterate_agents, (_rocmon_iterate_agents_callback, &arg), + { + ERROR_PRINT(Error while iterating through available agents); + goto rocmon_init_info_agents_failed; + }); + + // Get available SMI events for devices + _rocmon_smi_init_events(); + for (int i = 0; i < rocmon_context->numDevices; i++) + { + if (_rocmon_smi_get_functions(&rocmon_context->devices[i]) < 0) + { + ERROR_PRINT(Failed to get SMI functions for device %d, rocmon_context->devices[i].deviceId); + goto rocmon_init_info_agents_failed; + } + } + + rocmon_initialized = TRUE; + return 0; +rocmon_init_info_agents_failed: + RSMI_CALL(rsmi_shut_down, (), { + // fall through + }); +rocmon_init_rsmi_failed: + ROCM_CALL(hsa_shut_down, (), { + // fall through + }); +rocmon_init_hsa_failed: + free(rocmon_context->devices); + free(rocmon_context); + rocmon_context = NULL; + return -1; +} + + +void +rocmon_v1_finalize(void) +{ + RocmonContext* context = rocmon_context; + + if (!rocmon_initialized) + { + return; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Finalize LIKWID ROCMON); + + if (context) + { + if (context->devices) + { + // Free each devices fields + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + FREE_IF_NOT_NULL(device->rocMetrics); + FREE_IF_NOT_NULL(device->activeRocEvents); + 
FREE_IF_NOT_NULL(device->activeSmiEvents); + if (device->groupResults) + { + // Free events of event result lists + for (int j = 0; j < device->numGroupResults; j++) + { + FREE_IF_NOT_NULL(device->groupResults[i].results); + } + // Free list + free(device->groupResults); + } + if (device->context) + { + ROCM_CALL(rocprofiler_close, (device->context),); + } + destroy_smap(device->smiMetrics); + } + + free(context->devices); + context->devices = NULL; + } + + FREE_IF_NOT_NULL(context->groups); + destroy_smap(context->smiEvents); + + free(context); + context = NULL; + } + + RSMI_CALL(rsmi_shut_down, (), { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown SMI); + // fall through + }); + ROCM_CALL(hsa_shut_down, (), { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown HSA); + // fall through + }); +} + + +int +rocmon_v1_addEventSet(const char* eventString, int* gid) +{ + // Check arguments + if (!eventString) + { + return -EINVAL; + } + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Allocate memory for event group if necessary + if (rocmon_context->numActiveGroups == rocmon_context->numGroups) + { + GroupInfo* tmpInfo = (GroupInfo*) realloc(rocmon_context->groups, (rocmon_context->numGroups+1) * sizeof(GroupInfo)); + if (tmpInfo == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate additional group); + return -ENOMEM; + } + rocmon_context->groups = tmpInfo; + rocmon_context->numGroups++; + } + + // Parse event string + int err = _rocmon_parse_eventstring(eventString, &rocmon_context->groups[rocmon_context->numActiveGroups]); + if (err < 0) + { + return err; + } + + // Allocate memory for event results + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + + // Allocate memory for event results + int numEvents = rocmon_context->groups[rocmon_context->numActiveGroups].nevents; + RocmonEventResult* tmpResults = (RocmonEventResult*) malloc(numEvents * sizeof(RocmonEventResult)); 
+ if (tmpResults == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate event results); + return -ENOMEM; + } + + // Allocate memory for new event result list entry + RocmonEventResultList* tmpGroupResults = (RocmonEventResultList*) realloc(device->groupResults, (device->numGroupResults+1) * sizeof(RocmonEventResultList)); + if (tmpGroupResults == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate new event group result list); + return -ENOMEM; + } + + device->groupResults = tmpGroupResults; + device->groupResults[device->numGroupResults].results = tmpResults; + device->groupResults[device->numGroupResults].numResults = numEvents; + device->numGroupResults++; + } + + *gid = rocmon_context->numActiveGroups; + rocmon_context->numActiveGroups++; + return 0; +} + + +static int +_rocmon_setupCounters_rocprofiler(RocmonDevice* device, const char** events, int numEvents) +{ + // Close previous rocprofiler context + if (device->context) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Closing previous rocprofiler context); + ROCM_CALL(rocprofiler_close, (device->context), return -1); + } + + // Look if the are any events + if (numEvents <= 0) + { + return 0; + } + + // Create feature array to monitor + rocprofiler_feature_t* features = (rocprofiler_feature_t*) malloc(numEvents * sizeof(rocprofiler_feature_t)); + if (features == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate feature list); + return -ENOMEM; + } + for (int i = 0; i < numEvents; i++) + { + features[i].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[i].name = events[i]; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP EVENT %d %s, i, events[i]); + } + + // Free previous feature array if present + FREE_IF_NOT_NULL(device->activeRocEvents); + + device->numActiveRocEvents = numEvents; + device->activeRocEvents = features; + + // Open context + rocprofiler_properties_t properties = {}; + properties.queue_depth = 128; + uint32_t mode = ROCPROFILER_MODE_STANDALONE | ROCPROFILER_MODE_CREATEQUEUE | ROCPROFILER_MODE_SINGLEGROUP; + + // 
Important: only a single profiling group is supported at this time which limits the number of events that can be monitored at a time. + ROCM_CALL(rocprofiler_open, (device->hsa_agent, device->activeRocEvents, device->numActiveRocEvents, &device->context, mode, &properties), return -1); + + return 0; +} + + +static int +_rocmon_setupCounters_smi(RocmonDevice* device, const char** events, int numEvents) +{ + int ret; + const int instanceNumLen = 5; + + // Delete previous events + if (device->activeSmiEvents) + { + device->activeSmiEvents = NULL; + device->numActiveSmiEvents = 0; + } + + // Look if the are any events + if (numEvents <= 0) + { + return 0; + } + + // Create event array + RocmonSmiEvent* activeEvents = (RocmonSmiEvent*) malloc(numEvents * sizeof(RocmonSmiEvent)); + if (activeEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate active event list); + return -ENOMEM; + } + + for (int i = 0; i < numEvents; i++) + { + char eventName[membersize(RocmonSmiEvent, name)]; + int instance = -1; + + // Parse event name -> normal event vs one with multiple instances (EVENT[0]) + const char* event = events[i]; + char* instancePart = strrchr(event, '['); + if (instancePart != NULL) + { + char withoutBrackets[instanceNumLen+1]; // +1 is '\0' + int partlen = strlen(instancePart); + + // Check if number fit in 'withoutBrackets' + if (partlen - 2 > instanceNumLen) + { + ERROR_PRINT(Instance number in '%s' is too large, event); + free(activeEvents); + return -EINVAL; + } + + // Copy instance number without brackets + strncpy(withoutBrackets, instancePart+1, partlen-2); + withoutBrackets[instanceNumLen] = '\0'; + + // Parse instance as number + char* endParsed; + instance = strtol(withoutBrackets, &endParsed, 10); + + // Check if parsing was successful + char* endOfString = &withoutBrackets[partlen-2]; + if (endParsed != endOfString) + { + ERROR_PRINT(Failed to parse instance number in '%s', event); + free(activeEvents); + return -EINVAL; + } + + // Copy event name without 
instance + int eventNameLen = instancePart - event; + strncpy(eventName, event, eventNameLen); + eventName[eventNameLen] = '\0'; + } + else + { + // Copy entire event name + strncpy(eventName, event, membersize(RocmonSmiEvent, name)); + } + + // Lookup event in available events + RocmonSmiEvent* metric = NULL; + ret = get_smap_by_key(device->smiMetrics, eventName, (void**)&metric); + if (ret < 0) + { + ERROR_PRINT(RSMI event '%s' not found for device %d, eventName, device->deviceId); + free(activeEvents); + return -EINVAL; + } + + // Copy event + RocmonSmiEvent* tmpEvent = &activeEvents[i]; + memcpy(tmpEvent, metric, sizeof(RocmonSmiEvent)); + + // Check if event supports instances + if (instance >= 0 && tmpEvent->type != ROCMON_SMI_EVENT_TYPE_INSTANCES) + { + ERROR_PRINT(Instance number given but event '%s' does not support one, eventName); + free(activeEvents); + return -EINVAL; + } + + // Check if event requires instances + if (instance < 0 && tmpEvent->type == ROCMON_SMI_EVENT_TYPE_INSTANCES) + { + ERROR_PRINT(No instance number given but event '%s' requires one, eventName); + free(activeEvents); + return -EINVAL; + } + + // Check if event has enough instances + if (instance >= 0 && instance >= metric->instances) + { + ERROR_PRINT(Instance %d seleced but event '%s' has only %d, instance, eventName, metric->instances); + free(activeEvents); + return -EINVAL; + } + + // Set instance number + if (instance >= 0) + { + tmpEvent->subvariant = instance; + } + } + + device->activeSmiEvents = activeEvents; + device->numActiveSmiEvents = numEvents; + + return 0; +} + + +int +rocmon_v1_setupCounters(int gid) +{ + int ret; + + // Check arguments + if (gid < 0 || gid >= rocmon_context->numActiveGroups) + { + return -EINVAL; + } + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Get group info + GroupInfo* group = &rocmon_context->groups[gid]; + + // + // Separate rocprofiler and SMI events + // + const char **smiEvents = NULL, 
**rocEvents = NULL; + int numSmiEvents = 0, numRocEvents = 0; + + // Allocate memory for string arrays + smiEvents = (const char**) malloc(group->nevents * sizeof(const char*)); + if (smiEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate smiEvent name array); + return -ENOMEM; + } + rocEvents = (const char**) malloc(group->nevents * sizeof(const char*)); + if (rocEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate rocEvent name array); + free(smiEvents); + return -ENOMEM; + } + + // Go through each event and sort it + for (int i = 0; i < group->nevents; i++) + { + const char* name = group->events[i]; + if (strncmp(name, "RSMI_", 5) == 0) + { + // RSMI event + smiEvents[numSmiEvents] = name + 5; // +5 removes 'RSMI_' prefix + numSmiEvents++; + } + else if (strncmp(name, "ROCP_", 5) == 0) + { + // Rocprofiler event + rocEvents[numRocEvents] = name + 5; // +5 removes 'ROCP_' prefix + numRocEvents++; + } + else + { + // Unknown event + ERROR_PRINT(Event '%s' has no prefix ('ROCP_' or 'RSMI_'), name); + return -EINVAL; + } + } + + // Add events to each device + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + + // Add rocprofiler events + ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCPROFILER WITH %d events, numRocEvents); + ret = _rocmon_setupCounters_rocprofiler(device, rocEvents, numRocEvents); + if (ret < 0) + { + free(smiEvents); + free(rocEvents); + return ret; + } + + // Add SMI events + ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCM SMI WITH %d events, numSmiEvents); + ret = _rocmon_setupCounters_smi(device, smiEvents, numSmiEvents); + if (ret < 0) + { + free(smiEvents); + free(rocEvents); + return ret; + } + } + rocmon_context->activeGroup = gid; + + // Cleanup + free(smiEvents); + free(rocEvents); + + return 0; +} + + +static int +_rocmon_startCounters_rocprofiler(RocmonDevice* device) +{ + // Check if there are any counters to start + if (device->numActiveRocEvents <= 0) + { + return 0; + } + 
+ // Reset results + RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; + for (int i = 0; i < device->numActiveRocEvents; i++) + { + RocmonEventResult* result = &groupResult->results[i]; + result->lastValue = 0; + result->fullValue = 0; + } + + if (device->context) + { + ROCM_CALL(rocprofiler_start, (device->context, 0), return -1); + } + + return 0; +} + + +static int +_rocmon_startCounters_smi(RocmonDevice* device) +{ + // Check if there are any counters to start + if (device->numActiveSmiEvents <= 0) + { + return 0; + } + + // Save baseline values + RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; + for (int i = 0; i < device->numActiveSmiEvents; i++) + { + double value = 0; + RocmonSmiEvent* event = &device->activeSmiEvents[i]; + RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; + + // Measure counter + if (event->measureFunc) + { + event->measureFunc(device->deviceId, event, result); + } + + // Save value + result->fullValue = 0; + } + + return 0; +} + + +int +rocmon_v1_startCounters(void) +{ + int ret; + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Get timestamp + uint64_t timestamp; + if (ret = _rocmon_get_timestamp(×tamp)) + { + return ret; + } + + // Start counters on each device + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + device->time.start = timestamp; + device->time.read = timestamp; + + // Start rocprofiler events + ret = _rocmon_startCounters_rocprofiler(device); + if (ret < 0) return ret; + + // Start SMI events + _rocmon_startCounters_smi(device); + if (ret < 0) return ret; + } + + return 0; +} + + +static int +_rocmon_stopCounters_rocprofiler(RocmonDevice* device) +{ + if (device->context) + { + // Close context + ROCM_CALL(rocprofiler_stop, (device->context, 0), return -1); + } + + return 0; +} + + +int 
+rocmon_v1_stopCounters(void) +{ + int ret; + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Read counters + ret = _rocmon_readCounters(&_rocmon_get_stop_time); + if (ret < 0) return ret; + + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + + // Stop rocprofiler events + ret = _rocmon_stopCounters_rocprofiler(device); + if (ret < 0) return ret; + + // Nothing to stop for SMI events + } + + return 0; +} + + +int +rocmon_v1_readCounters(void) +{ + int ret; + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Read counters + ret = _rocmon_readCounters(&_rocmon_get_read_time); + if (ret < 0) return ret; + + return 0; +} + + +double +rocmon_v1_getResult(int gpuIdx, int groupId, int eventId) +{ + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Validate gpuIdx + if (gpuIdx < 0 || gpuIdx >= rocmon_context->numDevices) + { + return -EFAULT; + } + + // Validate groupId + RocmonDevice* device = &rocmon_context->devices[gpuIdx]; + if (groupId < 0 || groupId >= device->numGroupResults) + { + return -EFAULT; + } + + // Validate eventId + RocmonEventResultList* groupResult = &device->groupResults[groupId]; + if (eventId < 0 || eventId >= groupResult->numResults) + { + return -EFAULT; + } + + // Return result + return groupResult->results[eventId].fullValue; +} + + +// TODO: multiple groups +double +rocmon_v1_getLastResult(int gpuIdx, int groupId, int eventId) +{ + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Validate gpuIdx + if (gpuIdx < 0 || gpuIdx >= rocmon_context->numDevices) + { + return -EFAULT; + } + + // Validate groupId + RocmonDevice* device = &rocmon_context->devices[gpuIdx]; + if (groupId < 0 || groupId >= device->numGroupResults) + { + return -EFAULT; + } + + // Validate eventId + RocmonEventResultList* groupResult = 
&device->groupResults[groupId]; + if (eventId < 0 || eventId >= groupResult->numResults) + { + return -EFAULT; + } + + // Return result + return groupResult->results[eventId].lastValue; +} + + +int +rocmon_v1_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list) +{ + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Validate args + if (gpuIdx < 0 || gpuIdx > rocmon_context->numDevices) + { + return -EINVAL; + } + if (list == NULL) + { + return -EINVAL; + } + + RocmonDevice* device = &rocmon_context->devices[gpuIdx]; + + // Allocate list structure + EventList_rocm_t tmpList = (EventList_rocm_t) malloc(sizeof(EventList_rocm)); + if (tmpList == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate event list); + return -ENOMEM; + } + + // Get number of events + printf("NUmber of events %d + %d\n", device->numRocMetrics , get_map_size(device->smiMetrics)); + tmpList->numEvents = device->numRocMetrics + get_map_size(device->smiMetrics); + if (tmpList->numEvents == 0) + { + // No events -> return empty list + tmpList->events = NULL; + *list = tmpList; + return 0; + } + + // Allocate event array + tmpList->events = (Event_rocm_t*) malloc(tmpList->numEvents * sizeof(Event_rocm_t)); + if (tmpList->events == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate events for event list); + free(tmpList); + return -ENOMEM; + } + + // Copy rocprofiler event information + for (int i = 0; i < device->numRocMetrics; i++) + { + rocprofiler_info_data_t* event = &device->rocMetrics[i]; + Event_rocm_t* out = &tmpList->events[i]; + int len; + + // Copy name + printf("Name %s\n", event->metric.name); + len = strlen(event->metric.name) + 5 /* Prefix */ + 1 /* NULL byte */; + out->name = (char*) malloc(len); + if (out->name) + { + snprintf(out->name, len, "ROCP_%s", event->metric.name); + } + + // Copy description + len = strlen(event->metric.description) + 1 /* NULL byte */; + out->description = (char*) malloc(len); + if (out->description) + { + 
snprintf(out->description, len, "%s", event->metric.description); + } + + // Copy instances + out->instances = event->metric.instances; + } + + // Copy ROCm SMI metric information + for (int i = 0; i < get_map_size(device->smiMetrics); i++) + { + RocmonSmiEvent* event = NULL; + Event_rocm_t* out = &tmpList->events[device->numRocMetrics + i]; + int len; + + // Get event + if (get_smap_by_idx(device->smiMetrics, i, (void**)&event) < 0) + { + continue; + } + + // Copy name + len = strlen(event->name) + 5 /* Prefix */ + 1 /* NULL byte */; + out->name = (char*) malloc(len); + if (out->name) + { + snprintf(out->name, len, "RSMI_%s", event->name); + } + + // Copy description + char* description = "SMI Event"; // TODO: use real descriptions + len = strlen(description) + 1 /* NULL byte */; + out->description = (char*) malloc(len); + if (out->description) + { + snprintf(out->description, len, "%s", description); + } + + // Copy instances + out->instances = event->instances; + } + + *list = tmpList; + return 0; +} + +void +rocmon_v1_freeEventsOfGpu(EventList_rocm_t list) +{ +#define FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } + + // Check pointer + if (list == NULL) + { + return; + } + + if (list->events != NULL) + { + for (int i = 0; i < list->numEvents; i++) + { + Event_rocm_t* event = &list->events[i]; + FREE_IF_NOT_NULL(event->name); + FREE_IF_NOT_NULL(event->description); + } + free(list->events); + } + free(list); +} + + +int +rocmon_v1_switchActiveGroup(int newGroupId) +{ + int ret; + + ret = rocmon_stopCounters(); + if (ret < 0) + { + return ret; + } + + ret = rocmon_setupCounters(newGroupId); + if (ret < 0) + { + return ret; + } + + ret = rocmon_startCounters(); + if (ret < 0) + { + return ret; + } + + return 0; +} + + +int +rocmon_v1_getNumberOfGroups(void) +{ + if (!rocmon_context || !rocmon_initialized) + { + return -EFAULT; + } + return rocmon_context->numActiveGroups; +} + + +int +rocmon_v1_getIdOfActiveGroup(void) +{ + if (!rocmon_context || 
!rocmon_initialized) + { + return -EFAULT; + } + return rocmon_context->activeGroup; +} + + +int +rocmon_v1_getNumberOfGPUs(void) +{ + if (!rocmon_context || !rocmon_initialized) + { + return -EFAULT; + } + return rocmon_context->numDevices; +} + + +int +rocmon_v1_getNumberOfEvents(int groupId) +{ + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return -EFAULT; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + return ginfo->nevents; +} + + +int +rocmon_v1_getNumberOfMetrics(int groupId) +{ + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) + { + return -EFAULT; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + return ginfo->nmetrics; +} + + +double +rocmon_v1_getTimeOfGroup(int groupId) +{ + int i = 0; + double t = 0; + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) + { + return -EFAULT; + } + for (i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + t = MAX(t, (double)(device->time.stop - device->time.start)); + } + return t*1E-9; +} + + +double +rocmon_v1_getLastTimeOfGroup(int groupId) +{ + int i = 0; + double t = 0; + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) + { + return -EFAULT; + } + for (i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + t = MAX(t, (double)(device->time.stop - device->time.read)); + } + return t*1E-9; +} + + +double +rocmon_v1_getTimeToLastReadOfGroup(int groupId) +{ + int i = 0; + double t = 0; + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) + { + return -EFAULT; + } + for (i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + t = MAX(t, (double)(device->time.read - 
device->time.start)); + } + return t*1E-9; +} + + +char* +rocmon_v1_getEventName(int groupId, int eventId) +{ + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + if ((eventId < 0) || (eventId >= ginfo->nevents)) + { + return NULL; + } + return ginfo->events[eventId]; +} + + +char* +rocmon_v1_getCounterName(int groupId, int eventId) +{ + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + if ((eventId < 0) || (eventId >= ginfo->nevents)) + { + return NULL; + } + return ginfo->counters[eventId]; +} + + +char* +rocmon_v1_getMetricName(int groupId, int metricId) +{ + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + if ((metricId < 0) || (metricId >= ginfo->nmetrics)) + { + return NULL; + } + return ginfo->metricnames[metricId]; +} + + +char* +rocmon_v1_getGroupName(int groupId) +{ + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + return ginfo->groupname; +} + + +char* +rocmon_v1_getGroupInfoShort(int groupId) +{ + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + return ginfo->shortinfo; +} + + +char* +rocmon_v1_getGroupInfoLong(int groupId) +{ + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + return ginfo->longinfo; +} + + +int +rocmon_v1_getGroups(char*** 
groups, char*** shortinfos, char*** longinfos) +{ + init_configuration(); + Configuration_t config = get_configuration(); + + return perfgroup_getGroups(config->groupPath, "amd_gpu_v1", groups, shortinfos, longinfos); +} + + +int +rocmon_v1_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos) +{ + perfgroup_returnGroups(nrgroups, groups, shortinfos, longinfos); +} + + + +#endif /* LIKWID_WITH_ROCMON */ From 5ec039fc80cffc267e59221c067e5cffd18cfda5 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 8 Oct 2024 21:18:54 +0200 Subject: [PATCH 03/29] Filter files based on ROCM version check --- Makefile | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 35c667a87..43b1a5b4d 100644 --- a/Makefile +++ b/Makefile @@ -150,6 +150,12 @@ ifneq ($(ROCM_INTERFACE), true) OBJ := $(filter-out $(BUILD_DIR)/rocmon.o,$(OBJ)) OBJ := $(filter-out $(BUILD_DIR)/rocmon_marker.o,$(OBJ)) OBJ := $(filter-out $(BUILD_DIR)/topology_rocm.o,$(OBJ)) +else +ifeq ($(strip $(ROCM_SDK_CHECK)),0) +OBJ := $(filter-out $(BUILD_DIR)/rocmon_sdk.o,$(OBJ)) +else +OBJ := $(filter-out $(BUILD_DIR)/rocmon_v1.o,$(OBJ)) +endif endif ifeq ($(COMPILER),GCCPOWER) OBJ := $(filter-out $(BUILD_DIR)/topology_cpuid.o,$(OBJ)) @@ -351,10 +357,16 @@ $(BUILD_DIR)/%.o: %.c $(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@ $(Q)$(CC) $(DEBUG_FLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d -$(BUILD_DIR)/rocmon_marker.o: rocmon_marker.c - @echo "===> COMPILE $@" +$(BUILD_DIR)/rocmon_%.o: rocmon_%.c + @echo "===> COMPILE $@ with redefined symbol HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE" $(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@ - $(Q)objcopy --redefine-sym HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE=HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE2 $@ + $(Q)objcopy --redefine-sym 
HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE=HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE_$@ $@ + +$(BUILD_DIR)/rocmon.o: rocmon.c + @echo "===> COMPILE $@ with redefined symbol HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE" + $(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@ + $(Q)objcopy --redefine-sym HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE=HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE_$@ $@ + $(BUILD_DIR)/%.o: %.cc @echo "===> COMPILE $@" From 90b6b2a534a4453596c4993df2d1016c0d9059c1 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 8 Oct 2024 21:29:26 +0200 Subject: [PATCH 04/29] Rename defines in rocmon_v1_types --- src/includes/rocmon_v1_types.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/includes/rocmon_v1_types.h b/src/includes/rocmon_v1_types.h index a126077de..5d06f85d3 100644 --- a/src/includes/rocmon_v1_types.h +++ b/src/includes/rocmon_v1_types.h @@ -1,11 +1,9 @@ /* * ======================================================================================= * - * Filename: nvmon_types.h + * Filename: rocmon_v1_types.h * - * Description: Header File of nvmon module. - * Configures and reads out performance counters - * on NVIDIA GPUs. Supports multi GPUs. + * Description: Header File of rocmon v1 module. 
* * Version: * Released: @@ -29,8 +27,8 @@ * * ======================================================================================= */ -#ifndef LIKWID_ROCMON_TYPES_H -#define LIKWID_ROCMON_TYPES_H +#ifndef LIKWID_ROCMON_V1_TYPES_H +#define LIKWID_ROCMON_V1_TYPES_H #include // #include @@ -148,4 +146,4 @@ typedef struct { int* gpulist; double** counters; } LikwidRocmResults; -#endif /* LIKWID_ROCMON_TYPES_H */ +#endif /* LIKWID_ROCMON_V1_TYPES_H */ From 032ad5d0e68e9c8386f7aa829543c1c4ff1efa89 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 8 Oct 2024 21:36:41 +0200 Subject: [PATCH 05/29] Rename groups for v1 and add groups for sdk --- groups/{amd_gpu => amd_gpu_sdk}/GDS.txt | 0 groups/{amd_gpu => amd_gpu_sdk}/MEM.txt | 0 groups/{amd_gpu => amd_gpu_sdk}/PCI.txt | 0 groups/{amd_gpu => amd_gpu_sdk}/POWER.txt | 0 groups/{amd_gpu => amd_gpu_sdk}/SALU.txt | 0 groups/{amd_gpu => amd_gpu_sdk}/SFETCH.txt | 0 groups/{amd_gpu => amd_gpu_sdk}/STALLED.txt | 0 groups/{amd_gpu => amd_gpu_sdk}/UTIL.txt | 0 groups/{amd_gpu => amd_gpu_sdk}/VALU.txt | 0 groups/{amd_gpu => amd_gpu_sdk}/WAVE.txt | 0 groups/amd_gpu_v1/GDS.txt | 15 ++++++++++++++ groups/amd_gpu_v1/MEM.txt | 18 ++++++++++++++++ groups/amd_gpu_v1/PCI.txt | 23 +++++++++++++++++++++ groups/amd_gpu_v1/POWER.txt | 21 +++++++++++++++++++ groups/amd_gpu_v1/SALU.txt | 15 ++++++++++++++ groups/amd_gpu_v1/SFETCH.txt | 15 ++++++++++++++ groups/amd_gpu_v1/STALLED.txt | 19 +++++++++++++++++ groups/amd_gpu_v1/UTIL.txt | 18 ++++++++++++++++ groups/amd_gpu_v1/VALU.txt | 15 ++++++++++++++ groups/amd_gpu_v1/WAVE.txt | 15 ++++++++++++++ 20 files changed, 174 insertions(+) rename groups/{amd_gpu => amd_gpu_sdk}/GDS.txt (100%) rename groups/{amd_gpu => amd_gpu_sdk}/MEM.txt (100%) rename groups/{amd_gpu => amd_gpu_sdk}/PCI.txt (100%) rename groups/{amd_gpu => amd_gpu_sdk}/POWER.txt (100%) rename groups/{amd_gpu => amd_gpu_sdk}/SALU.txt (100%) rename groups/{amd_gpu => amd_gpu_sdk}/SFETCH.txt (100%) rename groups/{amd_gpu => 
amd_gpu_sdk}/STALLED.txt (100%) rename groups/{amd_gpu => amd_gpu_sdk}/UTIL.txt (100%) rename groups/{amd_gpu => amd_gpu_sdk}/VALU.txt (100%) rename groups/{amd_gpu => amd_gpu_sdk}/WAVE.txt (100%) create mode 100644 groups/amd_gpu_v1/GDS.txt create mode 100644 groups/amd_gpu_v1/MEM.txt create mode 100644 groups/amd_gpu_v1/PCI.txt create mode 100644 groups/amd_gpu_v1/POWER.txt create mode 100644 groups/amd_gpu_v1/SALU.txt create mode 100644 groups/amd_gpu_v1/SFETCH.txt create mode 100644 groups/amd_gpu_v1/STALLED.txt create mode 100644 groups/amd_gpu_v1/UTIL.txt create mode 100644 groups/amd_gpu_v1/VALU.txt create mode 100644 groups/amd_gpu_v1/WAVE.txt diff --git a/groups/amd_gpu/GDS.txt b/groups/amd_gpu_sdk/GDS.txt similarity index 100% rename from groups/amd_gpu/GDS.txt rename to groups/amd_gpu_sdk/GDS.txt diff --git a/groups/amd_gpu/MEM.txt b/groups/amd_gpu_sdk/MEM.txt similarity index 100% rename from groups/amd_gpu/MEM.txt rename to groups/amd_gpu_sdk/MEM.txt diff --git a/groups/amd_gpu/PCI.txt b/groups/amd_gpu_sdk/PCI.txt similarity index 100% rename from groups/amd_gpu/PCI.txt rename to groups/amd_gpu_sdk/PCI.txt diff --git a/groups/amd_gpu/POWER.txt b/groups/amd_gpu_sdk/POWER.txt similarity index 100% rename from groups/amd_gpu/POWER.txt rename to groups/amd_gpu_sdk/POWER.txt diff --git a/groups/amd_gpu/SALU.txt b/groups/amd_gpu_sdk/SALU.txt similarity index 100% rename from groups/amd_gpu/SALU.txt rename to groups/amd_gpu_sdk/SALU.txt diff --git a/groups/amd_gpu/SFETCH.txt b/groups/amd_gpu_sdk/SFETCH.txt similarity index 100% rename from groups/amd_gpu/SFETCH.txt rename to groups/amd_gpu_sdk/SFETCH.txt diff --git a/groups/amd_gpu/STALLED.txt b/groups/amd_gpu_sdk/STALLED.txt similarity index 100% rename from groups/amd_gpu/STALLED.txt rename to groups/amd_gpu_sdk/STALLED.txt diff --git a/groups/amd_gpu/UTIL.txt b/groups/amd_gpu_sdk/UTIL.txt similarity index 100% rename from groups/amd_gpu/UTIL.txt rename to groups/amd_gpu_sdk/UTIL.txt diff --git 
a/groups/amd_gpu/VALU.txt b/groups/amd_gpu_sdk/VALU.txt similarity index 100% rename from groups/amd_gpu/VALU.txt rename to groups/amd_gpu_sdk/VALU.txt diff --git a/groups/amd_gpu/WAVE.txt b/groups/amd_gpu_sdk/WAVE.txt similarity index 100% rename from groups/amd_gpu/WAVE.txt rename to groups/amd_gpu_sdk/WAVE.txt diff --git a/groups/amd_gpu_v1/GDS.txt b/groups/amd_gpu_v1/GDS.txt new file mode 100644 index 000000000..39c3446be --- /dev/null +++ b/groups/amd_gpu_v1/GDS.txt @@ -0,0 +1,15 @@ +SHORT GDS Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_GDS +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU GDS rw insts per work-item ROCM0/ROCM1 + +LONG +Formulas: +GPU GDS rw insts per work-item = ROCP_SQ_INSTS_GDS/ROCP_SQ_WAVES +-- +The average number of GDS read or GDS write instructions executed +per work item (affected by flow control). diff --git a/groups/amd_gpu_v1/MEM.txt b/groups/amd_gpu_v1/MEM.txt new file mode 100644 index 000000000..acc63a627 --- /dev/null +++ b/groups/amd_gpu_v1/MEM.txt @@ -0,0 +1,18 @@ +SHORT Memory utilization + +EVENTSET +ROCM0 ROCP_TA_TA_BUSY +ROCM1 ROCP_GRBM_GUI_ACTIVE +ROCM2 ROCP_SE_NUM + +METRICS +GPU memory utilization 100*max(ROCM0,16)/ROCM1/ROCM2 + +LONG +Formulas: +GPU memory utilization = 100*max(ROCP_TA_TA_BUSY,16)/ROCP_GRBM_GUI_ACTIVE/ROCP_SE_NUM +-- +The percentage of GPUTime the memory unit is active. The result includes +the stall time (MemUnitStalled). This is measured with all extra fetches +and writes and any cache or memory effects taken into account. +Value range: 0% to 100% (fetch-bound). 
diff --git a/groups/amd_gpu_v1/PCI.txt b/groups/amd_gpu_v1/PCI.txt new file mode 100644 index 000000000..cefaf307d --- /dev/null +++ b/groups/amd_gpu_v1/PCI.txt @@ -0,0 +1,23 @@ +SHORT PCI Transfers + +EVENTSET +ROCM0 RSMI_PCI_THROUGHPUT_SENT +ROCM1 RSMI_PCI_THROUGHPUT_RECEIVED + + +METRICS +Runtime time +PCI sent ROCM0 +PCI received ROCM1 +PCI send bandwidth 1E-6*ROCM0/time +PCI recv bandwidth 1E-6*ROCM1/time + +LONG +Formulas: +PCI sent = RSMI_PCI_THROUGHPUT_SENT +PCI received = RSMI_PCI_THROUGHPUT_RECEIVED +PCI send bandwidth = 1E-6*RSMI_PCI_THROUGHPUT_SENT/runtime +PCI recv bandwidth = 1E-6*RSMI_PCI_THROUGHPUT_RECEIVED/runtime +-- +Currently not usable since the RSMI_PCI_THROUGHPUT_* events require +one second per call, so 2 seconds for both of them. diff --git a/groups/amd_gpu_v1/POWER.txt b/groups/amd_gpu_v1/POWER.txt new file mode 100644 index 000000000..49830efc0 --- /dev/null +++ b/groups/amd_gpu_v1/POWER.txt @@ -0,0 +1,21 @@ +SHORT Power, temperature and voltage + +EVENTSET +ROCM0 RSMI_POWER_AVE[0] +ROCM1 RSMI_TEMP_EDGE +ROCM2 RSMI_VOLT_VDDGFX + + +METRICS +Power average 1E-6*ROCM0 +Edge temperature 1E-3*ROCM1 +Voltage 1E-3*ROCM2 + +LONG +Formulas: +Power average = RSMI_POWER_AVE[0] +Edge temperature = 1E-3*RSMI_TEMP_EDGE +Voltage = 1E-3*RSMI_VOLT_VDDGFX +-- +Gets the current average power consumption in watts, the +temperature in celsius and the voltage in volts. diff --git a/groups/amd_gpu_v1/SALU.txt b/groups/amd_gpu_v1/SALU.txt new file mode 100644 index 000000000..a693421d1 --- /dev/null +++ b/groups/amd_gpu_v1/SALU.txt @@ -0,0 +1,15 @@ +SHORT SALU Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_SALU +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU SALU insts per work-item ROCM0/ROCM1 + +LONG +Formulas: +GPU SALU insts per work-item = ROCP_SQ_INSTS_SALU/ROCP_SQ_WAVES +-- +The average number of scalar ALU instructions executed per work-item +(affected by flow control). 
diff --git a/groups/amd_gpu_v1/SFETCH.txt b/groups/amd_gpu_v1/SFETCH.txt new file mode 100644 index 000000000..bd0dfc3ff --- /dev/null +++ b/groups/amd_gpu_v1/SFETCH.txt @@ -0,0 +1,15 @@ +SHORT SFetch Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_SMEM +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU SFETCH insts per work-item ROCM0/ROCM1 + +LONG +Formulas: +GPU SFETCH insts per work-item = ROCP_SQ_INSTS_SMEM/ROCP_SQ_WAVES +-- +The average number of scalar fetch instructions from the video memory +executed per work-item (affected by flow control). diff --git a/groups/amd_gpu_v1/STALLED.txt b/groups/amd_gpu_v1/STALLED.txt new file mode 100644 index 000000000..9d6dc42c4 --- /dev/null +++ b/groups/amd_gpu_v1/STALLED.txt @@ -0,0 +1,19 @@ +SHORT ALU stalled by LDS + +EVENTSET +ROCM0 ROCP_SQ_WAIT_INST_LDS +ROCM1 ROCP_SQ_WAVES +ROCM2 ROCP_GRBM_GUI_ACTIVE + +METRICS +GPU ALD stalled 100*ROCM0*4/ROCM1/ROCM2 + +LONG +Formulas: +GPU ALD stalled = 100*ROCP_SQ_WAIT_INST_LDS*4/ROCP_SQ_WAVES/ROCP_GRBM_GUI_ACTIVE +-- +The percentage of GPUTime ALU units are stalled by the LDS input queue +being full or the output queue being not ready. If there are LDS bank +conflicts, reduce them. Otherwise, try reducing the number of LDS +accesses if possible. +Value range: 0% (optimal) to 100% (bad). diff --git a/groups/amd_gpu_v1/UTIL.txt b/groups/amd_gpu_v1/UTIL.txt new file mode 100644 index 000000000..7d9271e11 --- /dev/null +++ b/groups/amd_gpu_v1/UTIL.txt @@ -0,0 +1,18 @@ +SHORT GPU utilization + +EVENTSET +ROCM0 ROCP_GRBM_COUNT +ROCM1 ROCP_GRBM_GUI_ACTIVE + + +METRICS +GPU utilization 100*ROCM1/ROCM0 + + +LONG +Formulas: +GPU utilization = 100*ROCP_GRBM_GUI_ACTIVE/ROCP_GRBM_COUNT +-- +This group reassembles the 'GPUBusy' metric provided by RocProfiler. +We should add, that we can select the GPUBusy metric directly and the +calculations are done internally in case the metric formula changes. 
diff --git a/groups/amd_gpu_v1/VALU.txt b/groups/amd_gpu_v1/VALU.txt new file mode 100644 index 000000000..5d57b9b20 --- /dev/null +++ b/groups/amd_gpu_v1/VALU.txt @@ -0,0 +1,15 @@ +SHORT VALU Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_VALU +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU VALU insts per work-item ROCM0/ROCM1 + +LONG +Formulas: +GPU VALU insts per work-item = ROCP_SQ_INSTS_VALU/ROCP_SQ_WAVES +-- +The average number of vector ALU instructions executed per work-item +(affected by flow control). diff --git a/groups/amd_gpu_v1/WAVE.txt b/groups/amd_gpu_v1/WAVE.txt new file mode 100644 index 000000000..fe8914ae1 --- /dev/null +++ b/groups/amd_gpu_v1/WAVE.txt @@ -0,0 +1,15 @@ +SHORT Wavefronts + +EVENTSET +ROCM0 ROCP_SQ_WAVES + + +METRICS +GPU wavefronts ROCM0 + + +LONG +Formulas: +GPU wavefronts = ROCP_SQ_WAVES +-- +Total Wavefronts From ae0c4e404f46812039dfc5fa23ccb4f687c0e89f Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 8 Oct 2024 21:37:26 +0200 Subject: [PATCH 06/29] Add skeleton for rocmon sdk --- src/includes/rocmon_sdk.h | 64 ++++++++ src/includes/rocmon_sdk_types.h | 35 +++++ src/rocmon_sdk.c | 251 ++++++++++++++++++++++++++++++++ 3 files changed, 350 insertions(+) create mode 100644 src/includes/rocmon_sdk.h create mode 100644 src/includes/rocmon_sdk_types.h create mode 100644 src/rocmon_sdk.c diff --git a/src/includes/rocmon_sdk.h b/src/includes/rocmon_sdk.h new file mode 100644 index 000000000..82b15b3ff --- /dev/null +++ b/src/includes/rocmon_sdk.h @@ -0,0 +1,64 @@ +/* + * ======================================================================================= + * + * Filename: rocmon_sdk.h + * + * Description: Header File of rocmon module for ROCm >= 6.2. 
+ * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com + * Project: likwid + * + * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + * + * ======================================================================================= + */ +#ifndef LIKWID_ROCMON_SDK_H +#define LIKWID_ROCMON_SDK_H + +int rocmon_sdk_init(int numGpus, const int* gpuIds); +void rocmon_sdk_finalize(void); +int rocmon_sdk_addEventSet(const char* eventString, int* gid); +int rocmon_sdk_setupCounters(int gid); +int rocmon_sdk_startCounters(void); +int rocmon_sdk_stopCounters(void); +int rocmon_sdk_readCounters(void); +double rocmon_sdk_getResult(int gpuIdx, int groupId, int eventId); +double rocmon_sdk_getLastResult(int gpuIdx, int groupId, int eventId); +int rocmon_sdk_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list); +void rocmon_sdk_freeEventsOfGpu(EventList_rocm_t list); +int rocmon_sdk_switchActiveGroup(int newGroupId); +int rocmon_sdk_getNumberOfGroups(void); +int rocmon_sdk_getIdOfActiveGroup(void); +int rocmon_sdk_getNumberOfGPUs(void); +int rocmon_sdk_getNumberOfEvents(int groupId); +int rocmon_sdk_getNumberOfMetrics(int groupId); +double rocmon_sdk_getTimeOfGroup(int groupId); +double rocmon_sdk_getLastTimeOfGroup(int groupId); +double rocmon_sdk_getTimeToLastReadOfGroup(int groupId); +char* rocmon_sdk_getEventName(int 
groupId, int eventId); +char* rocmon_sdk_getCounterName(int groupId, int eventId); +char* rocmon_sdk_getMetricName(int groupId, int metricId); +char* rocmon_sdk_getGroupName(int groupId); +char* rocmon_sdk_getGroupInfoShort(int groupId); +char* rocmon_sdk_getGroupInfoLong(int groupId); +int rocmon_sdk_getGroups(char*** groups, char*** shortinfos, char*** longinfos); +int rocmon_sdk_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos); + + +#endif /* LIKWID_ROCMON_SDK_H */ + diff --git a/src/includes/rocmon_sdk_types.h b/src/includes/rocmon_sdk_types.h new file mode 100644 index 000000000..280edb6c5 --- /dev/null +++ b/src/includes/rocmon_sdk_types.h @@ -0,0 +1,35 @@ +/* + * ======================================================================================= + * + * Filename: rocmon_sdk_types.h + * + * Description: Header File of rocmon sdk module for ROCM >= 6.2 + * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com + * Project: likwid + * + * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ * + * ======================================================================================= + */ +#ifndef LIKWID_ROCMON_SDK_TYPES_H +#define LIKWID_ROCMON_SDK_TYPES_H + + + +#endif /* LIKWID_ROCMON_SDK_TYPES_H */ diff --git a/src/rocmon_sdk.c b/src/rocmon_sdk.c new file mode 100644 index 000000000..7e66a1402 --- /dev/null +++ b/src/rocmon_sdk.c @@ -0,0 +1,251 @@ + /* ======================================================================================= + * + * Filename: rocmon_sdk.c + * + * Description: Main implementation of the performance monitoring module + * for AMD GPUs with ROCm >= 6.2 + * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.roehl@googlemail.com + * Project: likwid + * + * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ * + * ======================================================================================= + */ +#ifdef LIKWID_WITH_ROCMON + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +static bool rocmon_initialized = FALSE; +int likwid_rocmon_verbosity = DEBUGLEV_ONLY_ERROR; + +int +rocmon_sdk_init(int numGpus, const int* gpuIds) +{ + return 0; +} + + +void +rocmon_sdk_finalize(void) +{ + return; +} + + +int +rocmon_sdk_addEventSet(const char* eventString, int* gid) +{ + return 0; +} + +int +rocmon_sdk_setupCounters(int gid) +{ + return 0; +} + + +int +rocmon_sdk_startCounters(void) +{ + return 0; +} + +int +rocmon_sdk_stopCounters(void) +{ + return 0; +} + + +int +rocmon_sdk_readCounters(void) +{ + return 0; +} + + +double +rocmon_sdk_getResult(int gpuIdx, int groupId, int eventId) +{ + return 0.0; +} + + +// TODO: multiple groups +double +rocmon_sdk_getLastResult(int gpuIdx, int groupId, int eventId) +{ + return 0.0; +} + + +int +rocmon_sdk_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list) +{ + return -EINVAL; +} + +void +rocmon_sdk_freeEventsOfGpu(EventList_rocm_t list) +{ + return; +} + + +int +rocmon_sdk_switchActiveGroup(int newGroupId) +{ + return 0; +} + + +int +rocmon_sdk_getNumberOfGroups(void) +{ + return 0; +} + + +int +rocmon_sdk_getIdOfActiveGroup(void) +{ + return 0; +} + + +int +rocmon_sdk_getNumberOfGPUs(void) +{ + return 0; +} + + +int +rocmon_sdk_getNumberOfEvents(int groupId) +{ + return 0; +} + + +int +rocmon_sdk_getNumberOfMetrics(int groupId) +{ + return 0; +} + + +double +rocmon_sdk_getTimeOfGroup(int groupId) +{ + return 0; +} + + +double +rocmon_sdk_getLastTimeOfGroup(int groupId) +{ + return 0; +} + + +double +rocmon_sdk_getTimeToLastReadOfGroup(int groupId) +{ + return 0; +} + + +char* +rocmon_sdk_getEventName(int groupId, int eventId) +{ + return NULL; +} + + +char* +rocmon_sdk_getCounterName(int groupId, int eventId) +{ + return 
NULL; +} + + +char* +rocmon_sdk_getMetricName(int groupId, int metricId) +{ + return NULL; +} + + +char* +rocmon_sdk_getGroupName(int groupId) +{ + return NULL; +} + + +char* +rocmon_sdk_getGroupInfoShort(int groupId) +{ + return NULL; +} + + +char* +rocmon_sdk_getGroupInfoLong(int groupId) +{ + return NULL; +} + + +int +rocmon_sdk_getGroups(char*** groups, char*** shortinfos, char*** longinfos) +{ + init_configuration(); + Configuration_t config = get_configuration(); + + return perfgroup_getGroups(config->groupPath, "amd_gpu_sdk", groups, shortinfos, longinfos); +} + + +int +rocmon_sdk_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos) +{ + perfgroup_returnGroups(nrgroups, groups, shortinfos, longinfos); +} + + +#endif /* LIKWID_WITH_ROCMON */ From abc80014fb83097b571b176d3d6e8c1f6693eaa3 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 22 Oct 2024 12:32:18 +0200 Subject: [PATCH 07/29] Update Rocprofiler SDK support. Not working yet --- src/includes/rocmon.h | 8 + src/includes/rocmon_common_types.h | 228 +++ src/includes/rocmon_sdk.h | 1183 ++++++++++++++- src/includes/rocmon_sdk_types.h | 35 + src/includes/rocmon_smi.h | 1181 +++++++++++++++ src/includes/rocmon_smi_types.h | 81 + src/includes/rocmon_v1.h | 985 +++++++++++- src/includes/rocmon_v1_types.h | 120 +- src/rocmon.c | 891 +++++++++-- src/rocmon_marker.c | 24 +- src/rocmon_sdk.c | 251 --- src/rocmon_v1.c | 2275 ---------------------------- test/test_rocmon.c | 72 + 13 files changed, 4484 insertions(+), 2850 deletions(-) create mode 100644 src/includes/rocmon.h create mode 100644 src/includes/rocmon_common_types.h create mode 100644 src/includes/rocmon_smi.h create mode 100644 src/includes/rocmon_smi_types.h delete mode 100644 src/rocmon_sdk.c delete mode 100644 src/rocmon_v1.c create mode 100644 test/test_rocmon.c diff --git a/src/includes/rocmon.h b/src/includes/rocmon.h new file mode 100644 index 000000000..896138a99 --- /dev/null +++ b/src/includes/rocmon.h @@ -0,0 
+1,8 @@ +#ifndef LIKWID_INTERNAL_ROCMON_H +#define LIKWID_INTERNAL_ROCMON_H + +#include + +GroupInfo* rocmon_get_group(int gid); + +#endif diff --git a/src/includes/rocmon_common_types.h b/src/includes/rocmon_common_types.h new file mode 100644 index 000000000..fe48bc866 --- /dev/null +++ b/src/includes/rocmon_common_types.h @@ -0,0 +1,228 @@ +/* + * ======================================================================================= + * + * Filename: rocmon_common_types.h + * + * Description: Header File of rocmon for v1 and sdk backend. + * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com + * Project: likwid + * + * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ * + * ======================================================================================= + */ +#ifndef LIKWID_ROCMON_COMMON_TYPES_H +#define LIKWID_ROCMON_COMMON_TYPES_H + +#include + +#include +#if AMDSMI_LIB_VERSION_YEAR == 23 && AMDSMI_LIB_VERSION_MAJOR == 4 && AMDSMI_LIB_VERSION_MINOR == 0 && AMDSMI_LIB_VERSION_RELEASE == 0 +typedef struct metrics_table_header_t metrics_table_header_t; +#endif +#include +#include +#include +#ifdef ROCPROFILER_EXPORT +#undef ROCPROFILER_EXPORT +#endif +#ifdef ROCPROFILER_IMPORT +#undef ROCPROFILER_IMPORT +#endif +#ifdef ROCPROFILER_VERSION_MAJOR +#undef ROCPROFILER_VERSION_MAJOR +#endif +#ifdef ROCPROFILER_VERSION_MINOR +#undef ROCPROFILER_VERSION_MINOR +#endif +#ifdef ROCPROFILER_API +#undef ROCPROFILER_API +#endif +#include +#ifdef LIKWID_ROCPROF_SDK +#ifdef ROCPROFILER_EXPORT +#undef ROCPROFILER_EXPORT +#endif +#ifdef ROCPROFILER_IMPORT +#undef ROCPROFILER_IMPORT +#endif +#ifdef ROCPROFILER_VERSION_MAJOR +#undef ROCPROFILER_VERSION_MAJOR +#endif +#ifdef ROCPROFILER_VERSION_MINOR +#undef ROCPROFILER_VERSION_MINOR +#endif +#ifdef ROCPROFILER_API +#undef ROCPROFILER_API +#endif +#include +/*#ifdef ROCPROFILER_EXPORT*/ +/*#undef ROCPROFILER_EXPORT*/ +/*#endif*/ +/*#ifdef ROCPROFILER_IMPORT*/ +/*#undef ROCPROFILER_IMPORT*/ +/*#endif*/ +/*#ifdef ROCPROFILER_VERSION_MAJOR*/ +/*#undef ROCPROFILER_VERSION_MAJOR*/ +/*#endif*/ +/*#ifdef ROCPROFILER_VERSION_MINOR*/ +/*#undef ROCPROFILER_VERSION_MINOR*/ +/*#endif*/ +/*#ifdef ROCPROFILER_API*/ +/*#undef ROCPROFILER_API*/ +/*#endif*/ +#include +#endif + + + +#ifndef ROCMWEAK +#define ROCMWEAK __attribute__(( weak )) +#endif +#ifndef FREE_IF_NOT_NULL +#define FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } +#endif +/*#ifndef ARRAY_COUNT*/ +/*#define ARRAY_COUNT(arr) (sizeof(arr) / sizeof((arr)[0]))*/ +/*#endif*/ +/*#ifndef SIZEOF_STRUCT_MEMBER*/ +/*#define SIZEOF_STRUCT_MEMBER(type, member) (sizeof(((type *) NULL)->member))*/ +/*#endif*/ + +typedef struct { + double 
lastValue; + double fullValue; +} RocmonEventResult; + +typedef struct { + RocmonEventResult* results; // First rocprofiler results, then SMI results + int numResults; +} RocmonEventResultList; + +#include +#include + +typedef struct { + bstring tag; + int groupID; + int gpuCount; + int eventCount; + double* time; + uint32_t* count; + int* gpulist; + double** counters; +} LikwidRocmResults; + +typedef struct { + int deviceId; // LIKWID device id + int rocprof_v1; + int activeGroup; + + // Rocprofiler V1 + hsa_agent_t hsa_agent; // HSA agent handle for this device + rocprofiler_t* v1_context; // Rocprofiler context (has activeEvents configured) +#ifdef LIKWID_ROCPROF_SDK + // Rocprofiler SDK + rocprofiler_agent_t agent; + rocprofiler_context_id_t sdk_context; // Rocprofiler context (has activeEvents configured) + rocprofiler_buffer_id_t buffer; + rocprofiler_callback_thread_t thread; +#endif + + // Available rocprofiler metrics + rocprofiler_info_data_t* v1_rocMetrics; +#ifdef LIKWID_ROCPROF_SDK + rocprofiler_counter_info_v0_t* sdk_rocMetrics; +#endif + int numRocMetrics; + + // Available ROCm SMI events + Map_t smiMetrics; + + // Currently configured rocprofiler events (bound to context) + rocprofiler_feature_t* v1_activeRocEvents; +#ifdef LIKWID_ROCPROF_SDK + rocprofiler_counter_info_v0_t* sdk_activeRocEvents; +#endif + int numActiveRocEvents; + + // Currently configured ROCm SMI events + RocmonSmiEvent* activeSmiEvents; + int numActiveSmiEvents; + + // Results for all events in all event sets + RocmonEventResultList* groupResults; + int numGroupResults; + +#ifdef LIKWID_ROCPROF_SDK + rocprofiler_profile_config_id_t* profiles; + int numProfiles; +#endif + + // Timestamps in ns + struct { + uint64_t start; + uint64_t read; + uint64_t stop; + } time; + + // buffer? 
+} RocmonDevice; + +typedef enum { + ROCMON_STATE_FINALIZED = 0, + ROCMON_STATE_INITIALIZED, + ROCMON_STATE_SETUP, + ROCMON_STATE_RUNNING, + ROCMON_STATE_STOPPED, + MAX_ROCMON_STATE, +} RocmonContextState; +#define MIN_ROCMON_STATE ROCMON_STATE_FINALIZED + +typedef struct { + int numGroups; // Number of allocated groups + int numActiveGroups; // Number of used groups + int activeGroup; // Currently active group + GroupInfo *groups; + + // Devices (HSA agents) + RocmonDevice *devices; + int numDevices; + + // System information + long double hsa_timestamp_factor; // hsa_timestamp * hsa_timestamp_factor = timestamp_in_ns + + // Rocprofiler SDK agents with buffers +#ifdef LIKWID_ROCPROF_SDK + int num_sdk_agents; + RocprofilerSdkAgentData* agents; +#endif + + // ROCm SMI events + Map_t smiEvents; + + // Use legacy rocprofiler v1 + int use_rocprofiler_v1:1; + RocmonContextState state; +} RocmonContext; + +//extern static RocmonContext* rocmon_context; + + +#endif /* LIKWID_ROCMON_COMMON_TYPES_H */ diff --git a/src/includes/rocmon_sdk.h b/src/includes/rocmon_sdk.h index 82b15b3ff..b2d32df87 100644 --- a/src/includes/rocmon_sdk.h +++ b/src/includes/rocmon_sdk.h @@ -30,34 +30,1161 @@ #ifndef LIKWID_ROCMON_SDK_H #define LIKWID_ROCMON_SDK_H -int rocmon_sdk_init(int numGpus, const int* gpuIds); -void rocmon_sdk_finalize(void); -int rocmon_sdk_addEventSet(const char* eventString, int* gid); -int rocmon_sdk_setupCounters(int gid); -int rocmon_sdk_startCounters(void); -int rocmon_sdk_stopCounters(void); -int rocmon_sdk_readCounters(void); -double rocmon_sdk_getResult(int gpuIdx, int groupId, int eventId); -double rocmon_sdk_getLastResult(int gpuIdx, int groupId, int eventId); -int rocmon_sdk_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list); -void rocmon_sdk_freeEventsOfGpu(EventList_rocm_t list); -int rocmon_sdk_switchActiveGroup(int newGroupId); -int rocmon_sdk_getNumberOfGroups(void); -int rocmon_sdk_getIdOfActiveGroup(void); -int rocmon_sdk_getNumberOfGPUs(void); -int 
rocmon_sdk_getNumberOfEvents(int groupId); -int rocmon_sdk_getNumberOfMetrics(int groupId); -double rocmon_sdk_getTimeOfGroup(int groupId); -double rocmon_sdk_getLastTimeOfGroup(int groupId); -double rocmon_sdk_getTimeToLastReadOfGroup(int groupId); -char* rocmon_sdk_getEventName(int groupId, int eventId); -char* rocmon_sdk_getCounterName(int groupId, int eventId); -char* rocmon_sdk_getMetricName(int groupId, int metricId); -char* rocmon_sdk_getGroupName(int groupId); -char* rocmon_sdk_getGroupInfoShort(int groupId); -char* rocmon_sdk_getGroupInfoLong(int groupId); -int rocmon_sdk_getGroups(char*** groups, char*** shortinfos, char*** longinfos); -int rocmon_sdk_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos); +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + + +static int rocmon_sdk_initialized = FALSE; + +static void *rocmon_sdk_dl_profiler_lib = NULL; +static void *rocmon_sdk_dl_hsa_lib = NULL; +//static void *rocmon_sdk_dl_rsmi_lib = NULL; + + +// setup function for rocprofiler sdk +//rocprofiler_tool_configure_result_t* rocprofiler_configure(uint32_t, const char*, uint32_t, rocprofiler_client_id_t*); + +#ifndef ROCM_CALL +#define ROCM_CALL( call, args, handleerror ) \ + do { \ + hsa_status_t _status = (*call##_ptr)args; \ + if (_status != HSA_STATUS_SUCCESS && _status != HSA_STATUS_INFO_BREAK) { \ + const char* err = NULL; \ + fprintf(stderr, "Error: function %s failed with error %d\n", #call, _status); \ + rocprofiler_error_string(&err); \ + fprintf(stderr, "Error: %s\n", err); \ + handleerror; \ + } \ + } while (0) +#endif + +#ifndef ROCPROFILER_CALL +#define ROCPROFILER_CALL( call, args, handleerror ) \ + do { \ + rocprofiler_status_t _status = (*call##_ptr)args; \ + if(_status != ROCPROFILER_STATUS_SUCCESS) \ + { \ + fprintf(stderr, "Error: function %s failed with error %d\n", #call, 
_status); \ + handleerror; \ + } \ + } while (0); +#endif +// fprintf(stderr, "Error: %s\n", (*rocprofiler_get_status_string_ptr)(_status)); \ + +#ifndef DECLARE_ROCPROFILER_SDK +#define DECLARE_ROCPROFILER_SDK(funcname, funcsig) rocprofiler_status_t ROCMWEAK funcname funcsig; rocprofiler_status_t ( *funcname##_ptr ) funcsig; +#endif + + +DECLARE_ROCPROFILER_SDK(rocprofiler_create_context, (rocprofiler_context_id_t*)) +DECLARE_ROCPROFILER_SDK(rocprofiler_create_buffer, (rocprofiler_context_id_t, size_t, size_t, rocprofiler_buffer_policy_t, rocprofiler_buffer_tracing_cb_t, void*, rocprofiler_buffer_id_t*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_query_available_agents, (rocprofiler_agent_version_t, rocprofiler_query_available_agents_cb_t, size_t, void*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_get_timestamp, (rocprofiler_timestamp_t* ts)); +DECLARE_ROCPROFILER_SDK(rocprofiler_query_counter_info, (rocprofiler_counter_id_t, rocprofiler_counter_info_version_id_t, void*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_start_context, (rocprofiler_context_id_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_stop_context, (rocprofiler_context_id_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_create_profile_config, (rocprofiler_agent_id_t, rocprofiler_counter_id_t *, size_t, rocprofiler_profile_config_id_t *)); +DECLARE_ROCPROFILER_SDK(rocprofiler_destroy_profile_config, (rocprofiler_profile_config_id_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_configure_agent_profile_counting_service, (rocprofiler_context_id_t, rocprofiler_buffer_id_t, rocprofiler_agent_id_t, rocprofiler_agent_profile_callback_t, void*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_sample_agent_profile_counting_service, (rocprofiler_context_id_t, rocprofiler_user_data_t, rocprofiler_counter_flag_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_iterate_agent_supported_counters, (rocprofiler_agent_id_t, rocprofiler_available_counters_cb_t, void*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_flush_buffer, (rocprofiler_buffer_id_t)); 
+DECLARE_ROCPROFILER_SDK(rocprofiler_force_configure, (rocprofiler_configure_func_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_destroy_buffer, (rocprofiler_buffer_id_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_context_is_active, (rocprofiler_context_id_t, int*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_create_callback_thread, (rocprofiler_callback_thread_t*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_assign_callback_thread, (rocprofiler_buffer_id_t, rocprofiler_callback_thread_t)); + +const char *rocprofiler_get_status_string(rocprofiler_status_t); +const char * (*rocprofiler_get_status_string_ptr)(rocprofiler_status_t); + +#ifndef DECLAREFUNC_HSA +#define DECLAREFUNC_HSA(funcname, funcsig) hsa_status_t ROCMWEAK funcname funcsig; hsa_status_t ( *funcname##_ptr ) funcsig; +#endif +DECLAREFUNC_HSA(hsa_init, ()); +DECLAREFUNC_HSA(hsa_shut_down, ()); + + +typedef struct { + rocprofiler_agent_t *agents; + int num_agents; +} _rocmon_sdk_count_agents_cb_data; + +rocprofiler_status_t _rocmon_sdk_count_agents_cb(rocprofiler_agent_version_t agents_ver, + const void** agents_arr, + size_t num_agents, + void* udata) +{ + int gpu_agents = 0; + RocmonContext **stat_context = (RocmonContext **)udata; + RocmonContext* context = *stat_context; + RocmonDevice* devices = malloc(num_agents * sizeof(RocmonDevice)); + if (!devices) + { + return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES; + } + memset(devices, 0, num_agents * sizeof(RocmonDevice)); + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Found %d ROCm agents, num_agents); + for(size_t i = 0; i < num_agents; ++i) + { + const rocprofiler_agent_t* in_agent = agents_arr[i]; + if (in_agent->type == ROCPROFILER_AGENT_TYPE_GPU) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding AMD GPU at index %d, gpu_agents); + RocmonDevice* device = &devices[gpu_agents]; + device->agent = (rocprofiler_agent_t)*in_agent; + device->deviceId = in_agent->logical_node_type_id; + gpu_agents++; + } + } + context->devices = devices; + context->numDevices = gpu_agents; + return 
ROCPROFILER_STATUS_SUCCESS;
+}
+
+
+typedef struct {
+    rocprofiler_counter_info_v0_t *counters;
+    int num_counters;
+} _rocmon_sdk_fill_agent_counters_cb_data;
+
+/*
+ * Release a counter list built by _rocmon_sdk_fill_agent_counters_cb():
+ * first the owned strings of each of the num_counters entries, then the
+ * array itself. Entries may be only partially filled; NULL members are
+ * skipped because free(NULL) is a no-op.
+ */
+static void
+_rocmon_sdk_free_agent_counters_internal(int num_counters, rocprofiler_counter_info_v0_t* counters)
+{
+    if ((num_counters < 0) || (!counters))
+    {
+        return;
+    }
+    ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Freeing %d counters, num_counters);
+    for (int i = 0; i < num_counters; i++)
+    {
+        rocprofiler_counter_info_v0_t* info = &counters[i];
+        free((char*)info->name);
+        free((char*)info->description);
+        free((char*)info->block);
+        free((char*)info->expression);
+    }
+    free(counters);
+}
+
+/*
+ * Duplicate a string reported by rocprofiler into memory owned by rocmon.
+ * A NULL source yields a NULL copy and is not treated as an error.
+ * Returns 0 on success and -ENOMEM if the allocation fails.
+ */
+static int
+_rocmon_sdk_copy_string(const char* src, const char** dst)
+{
+    *dst = NULL;
+    if (!src)
+    {
+        return 0;
+    }
+    size_t len = strlen(src) + 1; // include the terminating NUL
+    char* copy = malloc(len);
+    if (!copy)
+    {
+        return -ENOMEM;
+    }
+    memcpy(copy, src, len);
+    *dst = copy;
+    return 0;
+}
+
+
+/*
+ * Callback for rocprofiler_iterate_agent_supported_counters().
+ *
+ * Queries the version-0 info record of every counter id in 'counters' and
+ * stores a deep copy (name, description, block, expression, id and the
+ * constant/derived flags) in a newly allocated array. On success the array
+ * and its length are handed over through 'udata', which must point to a
+ * _rocmon_sdk_fill_agent_counters_cb_data; the receiver releases it with
+ * _rocmon_sdk_free_agent_counters_internal().
+ *
+ * Returns ROCPROFILER_STATUS_SUCCESS, the status of the failing query, or
+ * ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES if memory runs out. Nothing is
+ * leaked and 'udata' is left untouched on failure.
+ */
+rocprofiler_status_t
+_rocmon_sdk_fill_agent_counters_cb(rocprofiler_agent_id_t agent,
+                                   rocprofiler_counter_id_t* counters,
+                                   size_t num_counters,
+                                   void* udata)
+{
+    _rocmon_sdk_fill_agent_counters_cb_data *data = (_rocmon_sdk_fill_agent_counters_cb_data*)udata;
+
+    if (num_counters == 0)
+    {
+        // Nothing to copy; avoid a zero-sized allocation
+        data->counters = NULL;
+        data->num_counters = 0;
+        return ROCPROFILER_STATUS_SUCCESS;
+    }
+
+    // Zero-initialized so that a partially filled list can be released safely
+    rocprofiler_counter_info_v0_t* out = calloc(num_counters, sizeof(*out));
+    if (!out)
+    {
+        return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES;
+    }
+    for (size_t i = 0; i < num_counters; i++)
+    {
+        rocprofiler_counter_info_v0_t info;
+        rocprofiler_status_t stat = (*rocprofiler_query_counter_info_ptr)(counters[i], (rocprofiler_counter_info_version_id_t)ROCPROFILER_COUNTER_INFO_VERSION_0, &info);
+        if (stat != ROCPROFILER_STATUS_SUCCESS)
+        {
+            ERROR_PRINT(Failed to query counter info for %d, (int)i);
+            _rocmon_sdk_free_agent_counters_internal((int)i, out);
+            return stat;
+        }
+        // Copy all strings completely; the records returned by rocprofiler
+        // are not owned by rocmon.
+        if ((_rocmon_sdk_copy_string(info.name, &out[i].name) < 0) ||
+            (_rocmon_sdk_copy_string(info.description, &out[i].description) < 0) ||
+            (_rocmon_sdk_copy_string(info.block, &out[i].block) < 0) ||
+            (_rocmon_sdk_copy_string(info.expression, &out[i].expression) < 0))
+        {
+            // i + 1: also release the strings already copied for entry i
+            _rocmon_sdk_free_agent_counters_internal((int)i + 1, out);
+            return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES;
+        }
+        out[i].id = info.id;
+        out[i].is_constant = info.is_constant;
+        out[i].is_derived = info.is_derived;
+    }
+    data->counters = out;
+    data->num_counters = (int)num_counters;
+    return ROCPROFILER_STATUS_SUCCESS;
+}
+
+/*
+ * Fetch the list of counters supported by the agent of 'device' and store
+ * it in device->sdk_rocMetrics / device->numRocMetrics.
+ * Returns 0 on success and -EFAULT if the rocprofiler iteration fails.
+ */
+int _rocmon_sdk_fill_agent_counters(RocmonDevice *device)
+{
+    _rocmon_sdk_fill_agent_counters_cb_data fill_data = {
+        .counters = NULL,
+        .num_counters = 0,
+    };
+
+    ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Getting counters for agent %d, device->deviceId);
+    rocprofiler_status_t _status = (rocprofiler_iterate_agent_supported_counters_ptr)(device->agent.id, _rocmon_sdk_fill_agent_counters_cb, &fill_data);
+    if (_status != ROCPROFILER_STATUS_SUCCESS)
+    {
+        return -EFAULT;
+    }
+    ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Agent %d provides %d counters, device->deviceId, fill_data.num_counters);
+    device->sdk_rocMetrics = fill_data.counters;
+    device->numRocMetrics = fill_data.num_counters;
+
+    return ROCPROFILER_STATUS_SUCCESS;
+}
+
+
+static void
+_rocmon_sdk_free_agent_counters(RocmonDevice *device)
+{
+    if (!device->sdk_rocMetrics)
+    {
+        return;
+    }
+    
ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Freeing counters for agent %d, device->deviceId); + _rocmon_sdk_free_agent_counters_internal(device->numRocMetrics, device->sdk_rocMetrics); + device->sdk_rocMetrics = NULL; + device->numRocMetrics = 0; +} + + +typedef struct { + rocprofiler_context_id_t* context; + rocprofiler_agent_t agent; + RocmonEventResultList* result; +} rocmon_sdk_read_buffers_cb; + +static void +_rocmon_sdk_read_buffers(rocprofiler_context_id_t context, + rocprofiler_buffer_id_t buffer, + rocprofiler_record_header_t** headers, + size_t num_headers, + void* udata, + uint64_t) +{ + rocmon_sdk_read_buffers_cb* cbdata = (rocmon_sdk_read_buffers_cb*)udata; + +/* if (cbdata->result->numResults == 0)*/ +/* {*/ +/* cbdata->result->results = malloc(sizeof(RocmonEventResult))*/ +/* }*/ + printf("_rocmon_sdk_read_buffers\n"); + for (int i = 0; i < num_headers; i++) + { + rocprofiler_record_header_t* h = headers[i]; + if(h->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS && h->kind == ROCPROFILER_COUNTER_RECORD_VALUE) + { + rocprofiler_record_counter_t* r = h->payload; + printf("Counter ID %d Value %f Dispatch %ld\n", r->id, r->counter_value, r->dispatch_id); + } + } + + +/* RocmonContext* mycontext = *cbdata->context;*/ +/* for (int i = 0; i < mycontext->numDevices; i++)*/ +/* {*/ +/* RocmonDevice* device = &mycontext->devices[i];*/ +/* if (device->agent.id.handle == cbdata->agent.id.handle)*/ +/* {*/ +/* RocmonEventResultList* groupResults = &device->groupResults[device->activeGroup];*/ + +/* for(int i = 0; i < num_headers; ++i)*/ +/* {*/ +/* rocprofiler_record_header_t* h = headers[i];*/ +/* if(h->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS && h->kind == ROCPROFILER_COUNTER_RECORD_VALUE)*/ +/* {*/ +/* rocprofiler_record_counter_t* r = h->payload;*/ +/* if (r->id >= 0 && r->id < groupResults->numResults)*/ +/* {*/ +/* RocmonEventResult* eventResult = &cbdata->result->results[r->id];*/ +/* double diff = r->counter_value - eventResult->fullValue;*/ +/* 
eventResult->lastValue = eventResult->fullValue;*/ +/* eventResult->fullValue += diff;*/ +/* }*/ +/* }*/ +/* }*/ +/* }*/ +/* }*/ + + return; +} + + +int +tool_init(rocprofiler_client_finalize_t fini, void* udata) +{ + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + RocmonContext** stat_context = (RocmonContext**)udata; + RocmonContext* context = *stat_context; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Running tool_init); + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initialize HSA); + hsa_status_t hstat = (*hsa_init_ptr)(); + if (hstat != HSA_STATUS_SUCCESS) + { + return -EFAULT; + } + + //ROCPROFILER_CALL(rocprofiler_query_available_agents, (ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), &agent_count), return -EFAULT;); + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Querying available agents); + stat = (*rocprofiler_query_available_agents_ptr)(ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), udata); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + return -EFAULT; + } + if (context->numDevices == 0) + { + FREE_IF_NOT_NULL(context->devices); + return -1; + } + + for (int i = 0; i < context->numDevices; i++) + { + rocprofiler_context_id_t device_context; + rocprofiler_buffer_id_t buffer; + rocprofiler_callback_thread_t thread; + RocmonDevice* device = &context->devices[i]; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating context for device %d, device->deviceId); + stat = (*rocprofiler_create_context_ptr)(&device_context); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + errno = EFAULT; + ERROR_PRINT(Failed to create context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + FREE_IF_NOT_NULL(context->devices); + return -EFAULT; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating buffer for device %d, device->deviceId); + stat = (*rocprofiler_create_buffer_ptr)(device_context, 100, 50, ROCPROFILER_BUFFER_POLICY_LOSSLESS, _rocmon_sdk_read_buffers, udata, 
&buffer); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + errno = EFAULT; + ERROR_PRINT(Failed to create buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + FREE_IF_NOT_NULL(context->devices); + return -EFAULT; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating callback thread for device %d, device->deviceId); + stat = (*rocprofiler_create_callback_thread_ptr)(&thread); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + errno = EFAULT; + ERROR_PRINT(Failed to create callback thread for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + FREE_IF_NOT_NULL(context->devices); + return -EFAULT; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Assign callback thread to buffer for device %d, device->deviceId); + stat = (*rocprofiler_assign_callback_thread_ptr)(buffer, thread); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + errno = EFAULT; + ERROR_PRINT(Failed to create callback thread for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + FREE_IF_NOT_NULL(context->devices); + return -EFAULT; + } + + device->sdk_context = device_context; + device->buffer = buffer; + device->thread = thread; + } + return 0; +} + + +void +tool_fini(void* udata) +{ + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Running tool_fini); + RocmonContext** stat_context = (RocmonContext**)udata; + RocmonContext* context = *stat_context; + if ((!context) || (!context->devices) || (context->numDevices == 0)) + { + return; + } + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + int active = 0; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Checking context for device %d, device->deviceId); + stat = (*rocprofiler_context_is_active_ptr)(device->sdk_context, &active); + if (active) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Stopping context for device %d, device->deviceId); + stat = 
(*rocprofiler_stop_context_ptr)(device->sdk_context);
+            if (stat != ROCPROFILER_STATUS_SUCCESS)
+            {
+                ERROR_PRINT(Failed to stop context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));
+            }
+        }
+        ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Flushing buffer for device %d, device->deviceId);
+        stat = (*rocprofiler_flush_buffer_ptr)(device->buffer);
+        if (stat != ROCPROFILER_STATUS_SUCCESS)
+        {
+            ERROR_PRINT(Failed to flush buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));
+        }
+        ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Destroying buffer for device %d, device->deviceId);
+        stat = (*rocprofiler_destroy_buffer_ptr)(device->buffer);
+        if (stat != ROCPROFILER_STATUS_SUCCESS)
+        {
+            ERROR_PRINT(Failed to destroy buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));
+        }
+        _rocmon_sdk_free_agent_counters(device);
+    }
+    ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown HSA);
+    (*hsa_shut_down_ptr)();
+}
+
+/*
+ * Profile selection callback registered together with the agent counting
+ * service. 'udata' is the RocmonDevice the service was configured for.
+ * If 'agent' matches that device, the profile stored at index
+ * device->activeGroup is applied through 'set_config'; an agent mismatch or
+ * an out-of-range active group is reported and no profile is set.
+ */
+void
+_rocmon_sdk_set_profile(rocprofiler_context_id_t context_id,
+                 rocprofiler_agent_id_t agent,
+                 rocprofiler_agent_set_profile_callback_t set_config,
+                 void* udata)
+{
+    ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _rocmon_sdk_set_profile);
+    RocmonDevice* device = (RocmonDevice*) udata;
+    if ((!device) || (!set_config))
+    {
+        ERROR_PLAIN_PRINT(Profile callback invoked without device or setter);
+        return;
+    }
+    if (device->agent.id.handle != agent.handle)
+    {
+        // deviceId is an int, so it has to be printed with %d
+        ERROR_PRINT(Mismatch between device %d agent and given agent, device->deviceId);
+        return;
+    }
+    if (device->activeGroup < 0 || device->activeGroup >= device->numProfiles)
+    {
+        ERROR_PRINT(Invalid active group for device %d, device->deviceId);
+        return;
+    }
+    ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Setting profile %d for device %d, device->activeGroup, device->deviceId);
+    set_config(context_id, device->profiles[device->activeGroup]);
+}
+
+
+
+static int
+_rocmon_sdk_link_libraries()
+{
+    #define DLSYM_AND_CHECK( dllib, name ) name##_ptr = dlsym( dllib, #name ); if ( dlerror() != NULL ) { ERROR_PRINT(Failed to link #name); return -1; }
+ ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm SDK libraries); + dlerror(); + // Need to link in the ROCm HSA libraries + rocmon_sdk_dl_hsa_lib = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_sdk_dl_hsa_lib) + { + ERROR_PRINT(ROCm HSA library libhsa-runtime64.so not found: %s, dlerror()); + return -1; + } + + // Need to link in the Rocprofiler libraries + rocmon_sdk_dl_profiler_lib = dlopen("librocprofiler-sdk.so", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_sdk_dl_profiler_lib) + { + // Delete last error + dlerror(); + rocmon_sdk_dl_profiler_lib = dlopen("librocprofiler-sdk.so.1", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_sdk_dl_profiler_lib) + { + ERROR_PRINT(Rocprofiler library librocprofiler-sdk.so not found: %s, dlerror()); + return -1; + } + } + + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_context); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_get_status_string); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_buffer); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_query_available_agents); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_get_timestamp); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_start_context); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_stop_context); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_profile_config); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_destroy_profile_config); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_configure_agent_profile_counting_service); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_iterate_agent_supported_counters); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_flush_buffer); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_query_counter_info); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_sample_agent_profile_counting_service); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, 
rocprofiler_force_configure);
+    DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_destroy_buffer);
+    DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_context_is_active);
+    DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_callback_thread);
+    DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_assign_callback_thread);
+
+    DLSYM_AND_CHECK(rocmon_sdk_dl_hsa_lib, hsa_init);
+    DLSYM_AND_CHECK(rocmon_sdk_dl_hsa_lib, hsa_shut_down);
+
+    ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm SDK libraries done);
+    return 0;
+}
+
+
+/*
+ * Tool configuration entry point handed to rocprofiler_force_configure().
+ * Registers LIKWID as client and returns the static tool description with
+ * tool_init/tool_fini and the address of rocmon_context as tool data.
+ */
+rocprofiler_tool_configure_result_t*
+rocprofiler_configure(uint32_t version,
+                      const char* runtime_version,
+                      uint32_t priority,
+                      rocprofiler_client_id_t* client_id)
+{
+    client_id->name = "LIKWID";
+    static rocprofiler_tool_configure_result_t config_result = {
+        .size = sizeof(rocprofiler_tool_configure_result_t),
+        .initialize = tool_init,
+        .finalize = tool_fini,
+        .tool_data = &rocmon_context,
+    };
+    return &config_result;
+}
+
+/*
+ * Initialize the rocprofiler SDK backend for the GPUs listed in gpuIds.
+ *
+ * Links the libraries, forces the rocprofiler tool configuration (which
+ * fills context->devices with all GPU agents) and then replaces that list
+ * by one that holds exactly the requested devices in the requested order,
+ * each with its list of supported counters.
+ *
+ * Returns 0 on success (also if already initialized), -EINVAL for invalid
+ * arguments, -EFAULT if the tool configuration fails, -ENODEV if no GPU or
+ * a requested GPU is not available and -ENOMEM on allocation failure.
+ * A failure to read the counters of a single device is reported but does
+ * not abort the initialization.
+ */
+int
+rocmon_sdk_init(RocmonContext* context, int numGpus, const int* gpuIds)
+{
+    int ret = 0;
+    rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS;
+    if ((numGpus < 0) || (!gpuIds) || (!context))
+    {
+        return -EINVAL;
+    }
+    if (rocmon_sdk_initialized)
+    {
+        return 0;
+    }
+
+    // initialize libraries
+    ret = _rocmon_sdk_link_libraries();
+    if (ret < 0)
+    {
+        return ret;
+    }
+
+    stat = (*rocprofiler_force_configure_ptr)(rocprofiler_configure);
+    if (stat != ROCPROFILER_STATUS_SUCCESS)
+    {
+        return -EFAULT;
+    }
+
+    if (context->numDevices == 0)
+    {
+        errno = ENODEV;
+        ERROR_PRINT(Cannot find ROCm GPUs);
+        return -ENODEV;
+    }
+
+    RocmonDevice* devices = calloc(numGpus, sizeof(RocmonDevice));
+    if (!devices)
+    {
+        return -ENOMEM;
+    }
+
+    for (int i = 0; i < numGpus; i++)
+    {
+        int idx = -1;
+        for (int j = 0; j < context->numDevices; j++)
+        {
+            if (gpuIds[i] == context->devices[j].deviceId)
+            {
+                idx = j;
+                break;
+            }
+        }
+        if (idx < 0)
+        {
+            errno = ENODEV;
+            ERROR_PRINT(Cannot find ROCm GPU %d, gpuIds[i]);
+            // Release the counter lists of the devices set up so far
+            for (int k = 0; k < i; k++)
+            {
+                _rocmon_sdk_free_agent_counters_internal(devices[k].numRocMetrics, devices[k].sdk_rocMetrics);
+            }
+            free(devices);
+            return -ENODEV;
+        }
+
+        // Take over agent, context, buffer and thread of the detected device
+        devices[i] = context->devices[idx];
+        RocmonDevice* out = &devices[i];
+        ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Fill agent counters for device %d, out->deviceId);
+        ret = _rocmon_sdk_fill_agent_counters(out);
+        if (ret < 0)
+        {
+            errno = -ret;
+            ERROR_PRINT(Failed to fill events for device %d, out->deviceId);
+        }
+    }
+    free(context->devices);
+    context->devices = devices;
+    context->numDevices = numGpus;
+
+    rocmon_sdk_initialized = TRUE;
+    return 0;
+}
+
+
+/*
+ * Release the per-device resources of the SDK backend: the active event
+ * list, the list of supported counters and all profile configurations
+ * including the array that holds them. The device array itself is not
+ * freed here. Resets the initialization flag.
+ */
+void
+rocmon_sdk_finalize(RocmonContext* context)
+{
+    if (context)
+    {
+        if (context->devices)
+        {
+            for (int i = 0; i < context->numDevices; i++)
+            {
+                //free device i
+                RocmonDevice* dev = &context->devices[i];
+                if (dev->sdk_activeRocEvents)
+                {
+                    free(dev->sdk_activeRocEvents);
+                    dev->sdk_activeRocEvents = NULL;
+                    dev->numActiveRocEvents = 0;
+                }
+                if (dev->sdk_rocMetrics)
+                {
+                    _rocmon_sdk_free_agent_counters_internal(dev->numRocMetrics, dev->sdk_rocMetrics);
+                    dev->sdk_rocMetrics = NULL;
+                    dev->numRocMetrics = 0;
+                }
+                if (dev->profiles)
+                {
+                    for (int j = 0; j < dev->numProfiles; j++)
+                    {
+                        (*rocprofiler_destroy_profile_config_ptr)(dev->profiles[j]);
+                    }
+                    // The array is grown with realloc() during setup
+                    free(dev->profiles);
+                    dev->profiles = NULL;
+                    dev->numProfiles = 0;
+                }
+            }
+        }
+/*        if (context->sdk_agents)*/
+/*        {*/
+/*            free(context->sdk_agents);*/
+/*            context->sdk_agents = NULL;*/
+/*            
free(context->sdk_agent_buffers);*/
+/*            context->sdk_agent_buffers = NULL;*/
+/*            context->num_sdk_agents = 0;*/
+/*        }*/
+    }
+    rocmon_sdk_initialized = 0;
+    return;
+}
+
+
+
+/*
+ * Create a rocprofiler profile configuration for the given event names and
+ * append it to device->profiles.
+ *
+ * Each name is looked up in the counters supported by the device. Unknown
+ * names are reported and skipped, so the profile may contain fewer counters
+ * than names were given.
+ *
+ * Returns 0 on success, -EINVAL for invalid arguments and -ENOMEM if an
+ * allocation or the profile creation fails.
+ */
+static int
+_rocmon_setupCounters_rocprofiler_sdk(RocmonDevice* device, const char** events, int numEvents)
+{
+    // A zero handle requests a fresh profile; a non-zero handle is
+    // interpreted by rocprofiler as an existing profile to copy from.
+    rocprofiler_profile_config_id_t profile = { .handle = 0 };
+    rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS;
+    if ((!device) || (!events) || (numEvents <= 0))
+    {
+        return -EINVAL;
+    }
+
+    int num_counters = 0;
+    rocprofiler_counter_id_t* counters = malloc(numEvents * sizeof(rocprofiler_counter_id_t));
+    if (!counters)
+    {
+        return -ENOMEM;
+    }
+
+    for (int i = 0; i < numEvents; i++)
+    {
+        int found = -1;
+        for (int j = 0; j < device->numRocMetrics; j++)
+        {
+            rocprofiler_counter_info_v0_t* m = &device->sdk_rocMetrics[j];
+            // NOTE(review): this is a prefix match on the metric name, so an
+            // event also matches a metric whose name is a prefix of it.
+            // Confirm whether an exact comparison is intended.
+            if (strncmp(events[i], m->name, strlen(m->name)) == 0)
+            {
+                found = j;
+                break;
+            }
+        }
+        if (found >= 0)
+        {
+            counters[num_counters++] = device->sdk_rocMetrics[found].id;
+        }
+        else
+        {
+            ERROR_PRINT(Unknown ROCm event %s, events[i]);
+        }
+    }
+
+    ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating profile for %d event(s) for device %d, num_counters, device->deviceId);
+    // The third argument is the number of counters, not the size in bytes
+    stat = (*rocprofiler_create_profile_config_ptr)(device->agent.id, counters, (size_t)num_counters, &profile);
+    if (stat != ROCPROFILER_STATUS_SUCCESS)
+    {
+        ERROR_PRINT(Failed to create profile: %s, (*rocprofiler_get_status_string_ptr)(stat));
+        free(counters);
+        return -ENOMEM;
+    }
+
+    ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Increasing profile space to %d for device %d, device->numProfiles + 1, device->deviceId);
+    rocprofiler_profile_config_id_t* profiles = realloc(device->profiles, (device->numProfiles+1) * sizeof(rocprofiler_profile_config_id_t));
+    if (!profiles)
+    {
+        (*rocprofiler_destroy_profile_config_ptr)(profile);
+        free(counters);
+        return -ENOMEM;
+    }
+    device->profiles = profiles;
+
+    ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding profile %d at idx %d for device %d, device->numProfiles, device->numProfiles, device->deviceId);
+    device->profiles[device->numProfiles++] = profile;
+    free(counters);
+    return 0;
+}
+
+/*
+ * Set up the rocprofiler events of group 'gid' on all devices.
+ *
+ * Only events whose name starts with 'ROCP_' are handled here; the prefix
+ * is stripped before the lookup. A group without such events is accepted
+ * and nothing is configured for it.
+ *
+ * Returns 0 on success, -EINVAL for an invalid context or group id, -EFAULT
+ * if the backend is not initialized, -ENOMEM on allocation failure or the
+ * error of the per-device setup.
+ */
+int
+rocmon_sdk_setupCounters(RocmonContext* context, int gid)
+{
+    int ret = 0;
+    int numRocEvents = 0;
+    const char **rocEvents = NULL;
+    // Check arguments
+    if ((!context) || gid < 0 || gid >= context->numActiveGroups)
+    {
+        return -EINVAL;
+    }
+
+    // Ensure rocmon is initialized
+    if (!rocmon_sdk_initialized)
+    {
+        ERROR_PRINT(Rocmon SDK not initialized);
+        return -EFAULT;
+    }
+
+    // Get group info
+    GroupInfo* group = &context->groups[gid];
+
+    // Allocate memory for string arrays
+    rocEvents = (const char**) malloc(group->nevents * sizeof(const char*));
+    if (rocEvents == NULL)
+    {
+        ERROR_PLAIN_PRINT(Cannot allocate rocEvent name array);
+        return -ENOMEM;
+    }
+
+    // Collect the rocprofiler events of the group
+    for (int i = 0; i < group->nevents; i++)
+    {
+        const char* name = group->events[i];
+        if (strncmp(name, "ROCP_", 5) == 0)
+        {
+            // Rocprofiler event
+            rocEvents[numRocEvents] = name + 5; // +5 removes 'ROCP_' prefix
+            numRocEvents++;
+        }
+    }
+    if (numRocEvents == 0)
+    {
+        free(rocEvents);
+        return 0;
+    }
+
+    // Add events to each device
+    for (int i = 0; i < context->numDevices; i++)
+    {
+        RocmonDevice* device = &context->devices[i];
+
+        // Add rocprofiler events
+        ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCPROFILER WITH %d events, numRocEvents);
+        ret = _rocmon_setupCounters_rocprofiler_sdk(device, rocEvents, numRocEvents);
+        if (ret < 0)
+        {
+            free(rocEvents);
+            return ret;
+        }
+
+    }
+    // Cleanup
+    free(rocEvents);
+
+    return 0;
+}
+
+static int _rocmon_sdk_get_timestamp(uint64_t* timestamp)
+{
+    rocprofiler_timestamp_t ts;
+    rocprofiler_status_t stat = (*rocprofiler_get_timestamp_ptr)(&ts);
+    if (stat != ROCPROFILER_STATUS_SUCCESS)
+    {
+        ERROR_PRINT(Failed to get timestamp: %s, (*rocprofiler_get_status_string_ptr)(stat));
+        return -EFAULT;
+    }
+
+
+    *timestamp = (uint64_t) ts;
+    return 0;
+}
+
+/*
+ * Start counting on a single device. If the rocprofiler context of the
+ * device is not active yet, the agent counting service is configured with
+ * _rocmon_sdk_set_profile() as profile callback and the context is started.
+ * A failing activity check is reported and treated as "not active".
+ * Returns 0 on success and -EFAULT if configuring or starting fails.
+ */
+static int
+_rocmon_startCounters_rocprofiler_sdk(RocmonDevice* device)
+{
+    int active = 0;
+    rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS;
+
+    // Use the pointer resolved with dlsym(); the weak declaration itself is
+    // not guaranteed to be resolved when the library is loaded at runtime.
+    stat = (*rocprofiler_context_is_active_ptr)(device->sdk_context, &active);
+    if (stat != ROCPROFILER_STATUS_SUCCESS)
+    {
+        ERROR_PRINT(Failed to check ROCm context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));
+    }
+    if (!active)
+    {
+        ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Configuring counting service for device %d, device->deviceId);
+        stat = (*rocprofiler_configure_agent_profile_counting_service_ptr)(device->sdk_context, device->buffer, device->agent.id, _rocmon_sdk_set_profile, device);
+        if (stat != ROCPROFILER_STATUS_SUCCESS)
+        {
+            ERROR_PRINT(Failed to configure counting service for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));
+            return -EFAULT;
+        }
+        ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Starting context for device %d, device->deviceId);
+        stat = (*rocprofiler_start_context_ptr)(device->sdk_context);
+        if (stat != ROCPROFILER_STATUS_SUCCESS)
+        {
+            ERROR_PRINT(Failed to start ROCm context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));
+            return -EFAULT;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Start counting on all devices of the context. The same timestamp is
+ * stored as start and read time of every device before it is started.
+ * Returns 0 on success, -EINVAL without context, -EFAULT if the backend is
+ * not initialized, or the error of the timestamp query or device start.
+ */
+int
+rocmon_sdk_startCounters(RocmonContext* context)
+{
+    int ret = 0;
+    uint64_t timestamp = 0;
+    if (!context)
+    {
+        return -EINVAL;
+    }
+    // Ensure rocmon is initialized
+    if (!rocmon_sdk_initialized)
+    {
+        ERROR_PRINT(Rocmon SDK not initialized);
+        return -EFAULT;
+    }
+
+    // Get timestamp
+    ret = _rocmon_sdk_get_timestamp(&timestamp);
+    if (ret < 0)
+    {
+        return ret;
+    }
+
+    // Start counters on each device
+    for (int i = 0; i < context->numDevices; i++)
+    {
+        RocmonDevice* device = &context->devices[i];
+        device->time.start = timestamp;
+        device->time.read = timestamp;
+
+        // Start rocprofiler events
+        ret = _rocmon_startCounters_rocprofiler_sdk(device);
+        if (ret < 0) return ret;
+
+    }
+
+    return 0;
+}
+
+
+/*
+ * Stop counting on a single device by stopping its rocprofiler context if
+ * it is active. Failures are reported but not propagated; the function
+ * always returns 0.
+ */
+static int
+_rocmon_stopCounters_rocprofiler_sdk(RocmonDevice* device)
+{
+    int active = 0;
+    rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS;
+    ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Checking context for device %d, device->deviceId);
+    // Call through the dlsym()-resolved pointer, not the weak declaration
+    stat = (*rocprofiler_context_is_active_ptr)(device->sdk_context, &active);
+    if (stat != ROCPROFILER_STATUS_SUCCESS)
+    {
+        ERROR_PRINT(Failed to check ROCm context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));
+    }
+    if (active)
+    {
+        ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Stopping context for device %d, device->deviceId);
+        stat = (*rocprofiler_stop_context_ptr)(device->sdk_context);
+        if (stat != ROCPROFILER_STATUS_SUCCESS)
+        {
+            ERROR_PRINT(Failed to stop ROCm context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));
+        }
+        // NOTE(review): the buffer is not flushed after stopping; the flush
+        // call was disabled in the original code. Confirm whether it is needed.
+    }
+    return 0;
+}
+
+/*
+ * Stop counting on all devices of the context and store the common stop
+ * timestamp in each device.
+ * Returns 0 on success, -EINVAL without context, -EFAULT if the backend is
+ * not initialized, or the error of the timestamp query or device stop.
+ */
+int
+rocmon_sdk_stopCounters(RocmonContext* context)
+{
+    int ret = 0;
+    uint64_t t = 0;
+    if (!context)
+    {
+        return -EINVAL;
+    }
+    // Ensure rocmon is initialized
+    if (!rocmon_sdk_initialized)
+    {
+        ERROR_PRINT(Rocmon SDK not initialized);
+        return -EFAULT;
+    }
+    // Get timestamp
+    ret = _rocmon_sdk_get_timestamp(&t);
+    if (ret < 0)
+    {
+        return ret;
+    }
+    for (int i = 0; i < context->numDevices; i++)
+    {
+        RocmonDevice* device = &context->devices[i];
+
+        // Stop rocprofiler events
+        ret = _rocmon_stopCounters_rocprofiler_sdk(device);
+        if (ret < 0) return ret;
+        device->time.stop = t;
+    }
+
+    return 0;
+}
+
+static int
+_rocmon_readCounters_rocprofiler_sdk(RocmonDevice* device) +{ + int active = 0; + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + // do read + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Checking context for device %d, device->deviceId); + stat = (*rocprofiler_context_is_active)(device->sdk_context, &active); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to check ROCm context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + } + if (active) + { + rocprofiler_user_data_t udata = { + .value = 0, + .ptr = NULL, + }; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Sampling counting service for device %d, device->deviceId); + stat = (*rocprofiler_sample_agent_profile_counting_service_ptr)(device->sdk_context, udata, ROCPROFILER_COUNTER_FLAG_NONE); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to sample counting service for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + return -EFAULT; + } + } +/* stat = (*rocprofiler_flush_buffer_ptr)(device->buffer);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* ERROR_PRINT(Failed to flush buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* return -EFAULT;*/ +/* }*/ + return 0; +} + + +int +rocmon_sdk_readCounters(RocmonContext* context) +{ + int ret = 0; + uint64_t t = 0; + // Ensure rocmon is initialized + if (!rocmon_sdk_initialized) + { + return -EFAULT; + } + ret = _rocmon_sdk_get_timestamp(&t); + if (ret < 0) + { + return ret; + } + + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + // Read counters + ret = _rocmon_readCounters_rocprofiler_sdk(device); + if (ret < 0) return ret; + device->time.read = t; + } + + return 0; +} + + + + +int +rocmon_sdk_getEventsOfGpu(RocmonContext* context, int gpuIdx, EventList_rocm_t* list) +{ + EventList_rocm_t tmpList = NULL; + Event_rocm_t* tmpEventList = NULL; + // Ensure rocmon is 
initialized + if (!rocmon_sdk_initialized) + { + return -EFAULT; + } + // Validate args + if ((gpuIdx < 0) || (gpuIdx > context->numDevices) || (!list)) + { + return -EINVAL; + } + + RocmonDevice* device = &context->devices[gpuIdx]; + + if (*list) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Reusing existing event list); + tmpList = *list; + } + else + { + // Allocate list structure + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Allocate new event list); + EventList_rocm_t tmpList = (EventList_rocm_t) malloc(sizeof(EventList_rocm)); + if (tmpList == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate event list); + return -ENOMEM; + } + tmpList->numEvents = 0; + tmpList->events = NULL; + } + + // Get number of events + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Add %d RocProfiler SDK events, device->numRocMetrics); + if (device->numRocMetrics == 0) + { + // No events -> return list + *list = tmpList; + return 0; + } + // (Re-)Allocate event array + tmpEventList = realloc(tmpList->events, (tmpList->numEvents + device->numRocMetrics) * sizeof(Event_rocm_t)); + if (!tmpEventList) + { + if (!*list) free(tmpList); + ERROR_PLAIN_PRINT(Cannot allocate events for event list); + return -ENOMEM; + } + tmpList->events = tmpEventList; + int startindex = tmpList->numEvents; + + // Copy rocprofiler event information + for (int i = 0; i < device->numRocMetrics; i++) + { + rocprofiler_counter_info_v0_t* event = &device->sdk_rocMetrics[i]; + Event_rocm_t* out = &tmpList->events[startindex + i]; + int len; + + // Copy name + len = strlen(event->name) + 5 /* Prefix */ + 1 /* NULL byte */; + out->name = (char*) malloc(len); + if (out->name) + { + snprintf(out->name, len, "ROCP_%s", event->name); + } + + // Copy description + len = strlen(event->description) + 1 /* NULL byte */; + out->description = (char*) malloc(len); + if (out->description) + { + snprintf(out->description, len, "%s", event->description); + } + tmpList->numEvents++; + } + *list = tmpList; + return 0; +} + + + + +int 
+rocmon_sdk_switchActiveGroup(RocmonContext* context, int newGroupId) +{ + int ret; + + ret = rocmon_sdk_stopCounters(context); + if (ret < 0) + { + return ret; + } + + ret = rocmon_sdk_setupCounters(context, newGroupId); + if (ret < 0) + { + return ret; + } + + ret = rocmon_sdk_startCounters(context); + if (ret < 0) + { + return ret; + } + + return 0; +} + + + #endif /* LIKWID_ROCMON_SDK_H */ diff --git a/src/includes/rocmon_sdk_types.h b/src/includes/rocmon_sdk_types.h index 280edb6c5..7c8da13fb 100644 --- a/src/includes/rocmon_sdk_types.h +++ b/src/includes/rocmon_sdk_types.h @@ -30,6 +30,41 @@ #ifndef LIKWID_ROCMON_SDK_TYPES_H #define LIKWID_ROCMON_SDK_TYPES_H +#include +/*#ifdef ROCPROFILER_EXPORT*/ +/*#undef ROCPROFILER_EXPORT*/ +/*#endif*/ +/*#ifdef ROCPROFILER_IMPORT*/ +/*#undef ROCPROFILER_IMPORT*/ +/*#endif*/ +/*#ifdef ROCPROFILER_VERSION_MAJOR*/ +/*#undef ROCPROFILER_VERSION_MAJOR*/ +/*#endif*/ +/*#ifdef ROCPROFILER_VERSION_MINOR*/ +/*#undef ROCPROFILER_VERSION_MINOR*/ +/*#endif*/ +/*#ifdef ROCPROFILER_API*/ +/*#undef ROCPROFILER_API*/ +/*#endif*/ +#include +/*#ifdef ROCPROFILER_API*/ +/*#undef ROCPROFILER_API*/ +/*#endif*/ +#include + + +typedef struct { + rocprofiler_agent_t* agent; + rocprofiler_buffer_id_t buffer; + rocprofiler_context_id_t context; + RocmonEventResultList *result; +} RocprofilerSdkAgentData; + +typedef struct { + int num_agents; + RocprofilerSdkAgentData* agents; +} RocprofilerSdkData; + #endif /* LIKWID_ROCMON_SDK_TYPES_H */ diff --git a/src/includes/rocmon_smi.h b/src/includes/rocmon_smi.h new file mode 100644 index 000000000..9c959a7fe --- /dev/null +++ b/src/includes/rocmon_smi.h @@ -0,0 +1,1181 @@ +/* + * ======================================================================================= + * + * Filename: rocmon_smi.h + * + * Description: Header File of rocmon module for ROCm SMI. 
+ * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com + * Project: likwid + * + * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + * + * ======================================================================================= + */ +#ifndef LIKWID_ROCMON_SMI_H +#define LIKWID_ROCMON_SMI_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +static void *rocmon_dl_rsmi_lib = NULL; + +static int rocmon_smi_initialized = 0; + +#ifndef RSMI_CALL +#define RSMI_CALL( call, args, handleerror ) \ + do { \ + rsmi_status_t _status = (*call##_ptr)args; \ + if (_status != RSMI_STATUS_SUCCESS) { \ + fprintf(stderr, "Error: function %s failed with error %d.\n", #call, _status); \ + handleerror; \ + } \ + } while (0) +#endif + +#ifndef DECLAREFUNC_SMI +#define DECLAREFUNC_SMI(funcname, funcsig) rsmi_status_t ROCMWEAK funcname funcsig; rsmi_status_t ( *funcname##_ptr ) funcsig; +#endif + +DECLAREFUNC_SMI(rsmi_init, (uint64_t flags)); +DECLAREFUNC_SMI(rsmi_shut_down, ()); +DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_open, (uint32_t dv_ind, rsmi_func_id_iter_handle_t* handle)); +DECLAREFUNC_SMI(rsmi_dev_supported_variant_iterator_open, (rsmi_func_id_iter_handle_t obj_h, 
rsmi_func_id_iter_handle_t* var_iter)); +DECLAREFUNC_SMI(rsmi_func_iter_value_get, (rsmi_func_id_iter_handle_t handle, rsmi_func_id_value_t* value )); +DECLAREFUNC_SMI(rsmi_func_iter_next, (rsmi_func_id_iter_handle_t handle)); +DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_close, (rsmi_func_id_iter_handle_t* handle)); +DECLAREFUNC_SMI(rsmi_dev_power_ave_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* power)); +DECLAREFUNC_SMI(rsmi_dev_pci_throughput_get, (uint32_t dv_ind, uint64_t* sent, uint64_t* received, uint64_t* max_pkt_sz)); +DECLAREFUNC_SMI(rsmi_dev_pci_replay_counter_get, (uint32_t dv_ind, uint64_t* counter)); +DECLAREFUNC_SMI(rsmi_dev_memory_total_get, (uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t* total)); +DECLAREFUNC_SMI(rsmi_dev_memory_usage_get, (uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t* used )); +DECLAREFUNC_SMI(rsmi_dev_memory_busy_percent_get, (uint32_t dv_ind, uint32_t* busy_percent)); +DECLAREFUNC_SMI(rsmi_dev_memory_reserved_pages_get, (uint32_t dv_ind, uint32_t* num_pages, rsmi_retired_page_record_t* records)); +DECLAREFUNC_SMI(rsmi_dev_fan_rpms_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); +DECLAREFUNC_SMI(rsmi_dev_fan_speed_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); +DECLAREFUNC_SMI(rsmi_dev_fan_speed_max_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* max_speed)); +DECLAREFUNC_SMI(rsmi_dev_temp_metric_get, (uint32_t dv_ind, uint32_t sensor_type, rsmi_temperature_metric_t metric, int64_t* temperature)); +DECLAREFUNC_SMI(rsmi_dev_volt_metric_get, (uint32_t dv_ind, rsmi_voltage_type_t sensor_type, rsmi_voltage_metric_t metric, int64_t* voltage)); +DECLAREFUNC_SMI(rsmi_dev_overdrive_level_get, (uint32_t dv_ind, uint32_t* od)); +DECLAREFUNC_SMI(rsmi_dev_ecc_count_get, (uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_error_count_t* ec)); +DECLAREFUNC_SMI(rsmi_compute_process_info_get, (rsmi_process_info_t* procs, uint32_t* num_items)); + + +// 
---------------------------------------------------- +// SMI event wrapper +// ---------------------------------------------------- + +static int +_smi_wrapper_pci_throughput_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t value; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _smi_wrapper_pci_throughput_get(%d, %d), deviceId, event->extra); + // Internal variant: 0 for sent, 1 for received bytes and 2 for max packet size + if (event->extra == 0) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, &value, NULL, NULL), return -1); + else if (event->extra == 1) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, &value, NULL), return -1); + else if (event->extra == 2) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, NULL, &value), return -1); + else return -1; + + result->fullValue += value; + result->lastValue = value; + + return 0; +} + + +static int +_smi_wrapper_pci_replay_counter_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t counter; + RSMI_CALL(rsmi_dev_pci_replay_counter_get, (deviceId, &counter), return -1); + result->fullValue += counter; + result->lastValue = counter; + + return 0; +} + + +static int +_smi_wrapper_power_ave_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t power; + RSMI_CALL(rsmi_dev_power_ave_get, (deviceId, event->subvariant, &power), return -1); + result->fullValue += power; + result->lastValue = power; + + return 0; +} + + +static int +_smi_wrapper_memory_total_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t total; + RSMI_CALL(rsmi_dev_memory_total_get, (deviceId, event->variant, &total), return -1); + result->fullValue += total; + result->lastValue = total; + + return 0; +} + + +static int +_smi_wrapper_memory_usage_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t used; + RSMI_CALL(rsmi_dev_memory_usage_get, (deviceId, event->variant, &used), return -1); + 
result->fullValue += used; + result->lastValue = used; + + return 0; +} + + +static int +_smi_wrapper_memory_busy_percent_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint32_t percent; + RSMI_CALL(rsmi_dev_memory_busy_percent_get, (deviceId, &percent), return -1); + result->fullValue += percent; + result->lastValue = percent; + + return 0; +} + + +static int +_smi_wrapper_memory_reserved_pages_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint32_t num_pages; + RSMI_CALL(rsmi_dev_memory_reserved_pages_get, (deviceId, &num_pages, NULL), return -1); + result->fullValue += num_pages; + result->lastValue = num_pages; + + return 0; +} + + +static int +_smi_wrapper_fan_rpms_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + int64_t speed; + RSMI_CALL(rsmi_dev_fan_rpms_get, (deviceId, event->subvariant, &speed), return -1); + result->fullValue += speed; + result->lastValue = speed; + + return 0; +} + + +static int +_smi_wrapper_fan_speed_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + int64_t speed; + RSMI_CALL(rsmi_dev_fan_speed_get, (deviceId, event->subvariant, &speed), return -1); + result->fullValue += speed; + result->lastValue = speed; + + return 0; +} + + +static int +_smi_wrapper_fan_speed_max_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + int64_t max_speed; + RSMI_CALL(rsmi_dev_fan_speed_max_get, (deviceId, event->subvariant, &max_speed), return -1); + result->fullValue += max_speed; + result->lastValue = max_speed; + + return 0; +} + + +static int +_smi_wrapper_temp_metric_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + int64_t temperature; + RSMI_CALL(rsmi_dev_temp_metric_get, (deviceId, event->subvariant, event->variant, &temperature), return -1); + result->fullValue += temperature; + result->lastValue = temperature; + + return 0; +} + + +static int +_smi_wrapper_volt_metric_get(int deviceId, RocmonSmiEvent* 
event, RocmonEventResult* result) +{ + int64_t voltage; + RSMI_CALL(rsmi_dev_volt_metric_get, (deviceId, event->subvariant, event->variant, &voltage), return -1); + result->fullValue += voltage; + result->lastValue = voltage; + + return 0; +} + + +static int +_smi_wrapper_overdrive_level_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint32_t overdrive; + RSMI_CALL(rsmi_dev_overdrive_level_get, (deviceId, &overdrive), return -1); + result->fullValue += overdrive; + result->lastValue = overdrive; + + return 0; +} + + +static int +_smi_wrapper_ecc_count_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + rsmi_error_count_t error_count; + RSMI_CALL(rsmi_dev_ecc_count_get, (deviceId, event->variant, &error_count), return -1); + + if (event->extra == 0) + { + result->lastValue = error_count.correctable_err - result->fullValue; + result->fullValue = error_count.correctable_err; + } + else if (event->extra == 1) + { + result->lastValue = error_count.uncorrectable_err - result->fullValue; + result->fullValue = error_count.uncorrectable_err; + } + else + { + return -1; + } + + return 0; +} + + +static int +_smi_wrapper_compute_process_info_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint32_t num_items; + RSMI_CALL(rsmi_compute_process_info_get, (NULL, &num_items), return -1); + result->fullValue += num_items; + result->lastValue = num_items; + + return 0; +} + + +static int +_rocmon_smi_link_libraries() +{ + #define DLSYM_AND_CHECK( dllib, name ) name##_ptr = dlsym( dllib, #name ); if ( dlerror() != NULL ) { ERROR_PRINT(Failed to link #name); return -1; } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD SMI libraries); + + // Need to link in the Rocprofiler libraries + rocmon_dl_rsmi_lib = dlopen("librocm_smi64.so", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_dl_rsmi_lib) + { + ERROR_PRINT(ROCm SMI library librocm_smi64.so not found: %s, dlerror()); + return -1; + } + + // Link SMI functions + 
DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_init); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_shut_down); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_supported_func_iterator_open); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_supported_variant_iterator_open); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_func_iter_value_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_func_iter_next); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_supported_func_iterator_close); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_power_ave_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_pci_throughput_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_pci_replay_counter_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_memory_total_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_memory_usage_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_memory_busy_percent_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_memory_reserved_pages_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_fan_rpms_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_fan_speed_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_fan_speed_max_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_temp_metric_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_volt_metric_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_overdrive_level_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_ecc_count_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_compute_process_info_get); + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm libraries done); + return 0; +} + + + + +// ---------------------------------------------------- +// Rocmon SMI helper functions +// ---------------------------------------------------- + +static bstring +_rocmon_smi_build_label(RocmonSmiEventType type, const char* funcname, uint64_t variant, uint64_t subvariant) +{ + switch (type) + { + case ROCMON_SMI_EVENT_TYPE_NORMAL: + return bfromcstr(funcname); + case ROCMON_SMI_EVENT_TYPE_VARIANT: + return bformat("%s|%" 
PRIu64, funcname, variant); + case ROCMON_SMI_EVENT_TYPE_SUBVARIANT: + return bformat("%s|%" PRIu64 "|%" PRIu64, funcname, variant, subvariant); + case ROCMON_SMI_EVENT_TYPE_INSTANCES: + return bfromcstr(funcname); + } +} + + +static int +_rocmon_smi_add_event_to_device(RocmonDevice* device, const char* funcname, RocmonSmiEventType type, int64_t variant, uint64_t subvariant) +{ + int ret; + + // Get event by label + RocmonSmiEventList* list = NULL; + bstring label = _rocmon_smi_build_label(type, funcname, variant, subvariant); + ret = get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list); + bdestroy(label); + if (ret < 0) + { + // Event not registered -> ignore + return 0; + } + + // For events with multiple sensor, only make one entry -> find if one exists + if (type == ROCMON_SMI_EVENT_TYPE_INSTANCES && subvariant > 0) + { + // Get list from map + for (int i = 0; i < list->numEntries; i++) + { + RocmonSmiEvent* event = &list->entries[i]; + RocmonSmiEvent* existingEvent = NULL; + ret = get_smap_by_key(device->smiMetrics, event->name, (void**)&existingEvent); + if (ret < 0) + { + ERROR_PRINT(Failed to find previous instance for event %s, event->name); + return -1; + } + + // Update instance information + existingEvent->instances++; + } + return 0; + } + + for (int i = 0; i < list->numEntries; i++) + { + RocmonSmiEvent* event = &list->entries[i]; + + // Allocate memory for device event description + RocmonSmiEvent* tmpEvent = (RocmonSmiEvent*) malloc(sizeof(RocmonSmiEvent)); + if (tmpEvent == NULL) + { + ERROR_PRINT(Failed to allocate memory for SMI event in device list %s, event->name); + return -ENOMEM; + } + + // Copy information from global description + memcpy(tmpEvent, event, sizeof(RocmonSmiEvent)); + tmpEvent->variant = variant; + tmpEvent->subvariant = subvariant; + tmpEvent->instances = 1; + + // Save event info to device event map + add_smap(device->smiMetrics, tmpEvent->name, tmpEvent); + } + + return 0; +} + + +static int 
+_rocmon_smi_get_function_subvariants(RocmonDevice* device, const char* funcname, uint64_t variant, rsmi_func_id_iter_handle_t var_iter) +{ + rsmi_func_id_iter_handle_t sub_var_iter; + rsmi_func_id_value_t value; + rsmi_status_t status; + int ret; + + // Get open subvariants iterator + status = (*rsmi_dev_supported_variant_iterator_open_ptr)(var_iter, &sub_var_iter); + if (status == RSMI_STATUS_NO_DATA) + { + // No subvariants + ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_VARIANT, variant, 0); + if (ret < 0) return -1; + return 0; + } + + // Subvariants available -> iterate them + do { + // Get subvariant information + (*rsmi_func_iter_value_get_ptr)(sub_var_iter, &value); + + // Process info + if (variant == RSMI_DEFAULT_VARIANT) + ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_INSTANCES, variant, value.id); + else + ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, variant, value.id); + if (ret < 0) return ret; + + // Advance iterator + status = (*rsmi_func_iter_next_ptr)(sub_var_iter); + } while (status != RSMI_STATUS_NO_DATA); + + // Close iterator + (*rsmi_dev_supported_func_iterator_close_ptr)(&sub_var_iter); + + return 0; +} + + +static int +_rocmon_smi_get_function_variants(RocmonDevice* device, const char* funcname, rsmi_func_id_iter_handle_t iter_handle) +{ + rsmi_func_id_iter_handle_t var_iter; + rsmi_func_id_value_t value; + rsmi_status_t status; + int ret; + + // Get open variants iterator + status = (*rsmi_dev_supported_variant_iterator_open_ptr)(iter_handle, &var_iter); + if (status == RSMI_STATUS_NO_DATA) + { + // No variants + ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0); + if (ret < 0) return -1; + return 0; + } + + // Variants available -> iterate them + do { + // Get variant information + (*rsmi_func_iter_value_get_ptr)(var_iter, &value); + + // Get function subvariants + ret = 
_rocmon_smi_get_function_subvariants(device, funcname, value.id, var_iter); + if (ret < 0) return -1; + + // Advance iterator + status = (*rsmi_func_iter_next_ptr)(var_iter); + } while (status != RSMI_STATUS_NO_DATA); + + // Close iterator + (*rsmi_dev_supported_func_iterator_close_ptr)(&var_iter); + + return 0; +} + + +static int +_rocmon_smi_get_functions(RocmonDevice* device) +{ + rsmi_func_id_iter_handle_t iter_handle; + rsmi_func_id_value_t value; + rsmi_status_t status; + int ret; + + // Open iterator + //(*rsmi_dev_supported_func_iterator_open_ptr)(device->deviceId, &iter_handle); + RSMI_CALL(rsmi_dev_supported_func_iterator_open, (device->deviceId, &iter_handle), { + return -1; + }); + + do + { + // Get function information + //(*rsmi_func_iter_value_get_ptr)(iter_handle, &value); + RSMI_CALL(rsmi_func_iter_value_get, (iter_handle, &value), { + ERROR_PRINT(Failed to get smi function value for device %d, device->deviceId); + RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); + return -1; + }); + + // Get function variants + ret = _rocmon_smi_get_function_variants(device, value.name, iter_handle); + if (ret < 0) + { + ERROR_PRINT(Failed to get smi function variants for device %d, device->deviceId); + RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); + return -1; + } + + // Advance iterator (cannot use RSMI_CALL macro here because we have an assignment, + // so we check that the function pointer exists to avoid segfaults.) 
+ if (rsmi_func_iter_next_ptr) { + status = (*rsmi_func_iter_next_ptr)(iter_handle); + } + } while (status != RSMI_STATUS_NO_DATA); + + // Close iterator + //(*rsmi_dev_supported_func_iterator_close_ptr)(&iter_handle); + RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); + + // Add device independent functions + ret = _rocmon_smi_add_event_to_device(device, "rsmi_compute_process_info_get", ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0); + if (ret < 0) return -1; + + return 0; +} + + + +static int +_rocmon_smi_add_event_to_map(char* name, RocmonSmiEventType type, char* smifunc, uint64_t variant, uint64_t subvariant, uint64_t extra, RocmonSmiMeasureFunc measureFunc) +{ + // Add new event list to map (if not already present) + bstring label = _rocmon_smi_build_label(type, smifunc, variant, subvariant); + RocmonSmiEventList* list; + if (get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list) < 0) + { + // Allocate memory for event list + list = (RocmonSmiEventList*) malloc(sizeof(RocmonSmiEventList)); + if (list == NULL) + { + ERROR_PRINT(Failed to allocate memory for SMI event list %s, name); + return -ENOMEM; + } + list->entries = NULL; + list->numEntries = 0; + + add_smap(rocmon_context->smiEvents, bdata(label), list); + } + bdestroy(label); + + // Allocate memory for another event in list + list->numEntries++; + list->entries = (RocmonSmiEvent*) realloc(list->entries, list->numEntries * sizeof(RocmonSmiEvent)); + if (list->entries == NULL) + { + ERROR_PRINT(Failed to allocate memory for SMI event %s, name); + return -ENOMEM; + } + + // Set event properties + RocmonSmiEvent* event = &list->entries[list->numEntries-1]; + strncpy(event->name, name, sizeof(event->name)); + event->name[sizeof(event->name)] = '\0'; + event->type = type; + event->variant = variant; + event->subvariant = subvariant; + event->extra = extra; + event->instances = 0; // gets set when scanning supported device functions + event->measureFunc = measureFunc; + + return 0; +} 
+ +#define ADD_SMI_EVENT(name, type, smifunc, variant, subvariant, extra, measurefunc) if (_rocmon_smi_add_event_to_map(name, type, smifunc, variant, subvariant, extra, measurefunc) < 0) { return -1; } +#define ADD_SMI_EVENT_N(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_NORMAL, smifunc, 0, 0, extra, measurefunc) +#define ADD_SMI_EVENT_V(name, smifunc, variant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_VARIANT, smifunc, variant, 0, extra, measurefunc) +#define ADD_SMI_EVENT_S(name, smifunc, variant, subvariant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, smifunc, variant, subvariant, extra, measurefunc) +#define ADD_SMI_EVENT_I(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_INSTANCES, smifunc, 0, 0, extra, measurefunc) + + +static void +_rcomon_smi_free_event_list(void* vlist) +{ + RocmonSmiEventList* list = (RocmonSmiEventList*)vlist; + if (list) + { + FREE_IF_NOT_NULL(list->entries); + free(list); + } +} + + +static int +_rocmon_smi_init_events(RocmonContext* context) +{ + int ret; + + // Init map + ret = init_map(&context->smiEvents, MAP_KEY_TYPE_STR, 0, &_rcomon_smi_free_event_list); + if (ret < 0) + { + ERROR_PRINT(Failed to create map for ROCm SMI events); + return ret; + } + + // Add events + ADD_SMI_EVENT_N("PCI_THROUGHPUT_SENT", "rsmi_dev_pci_throughput_get", 0, &_smi_wrapper_pci_throughput_get ); + ADD_SMI_EVENT_N("PCI_THROUGHPUT_RECEIVED", "rsmi_dev_pci_throughput_get", 1, &_smi_wrapper_pci_throughput_get ); + ADD_SMI_EVENT_N("PCI_THROUGHPUT_MAX_PKT_SZ", "rsmi_dev_pci_throughput_get", 2, &_smi_wrapper_pci_throughput_get ); + ADD_SMI_EVENT_N("PCI_REPLAY_COUNTER", "rsmi_dev_pci_replay_counter_get", 0, &_smi_wrapper_pci_replay_counter_get ); + ADD_SMI_EVENT_I("POWER_AVE", "rsmi_dev_power_ave_get", 0, &_smi_wrapper_power_ave_get ); + ADD_SMI_EVENT_V("MEMORY_TOTAL_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VRAM, 0, 
&_smi_wrapper_memory_total_get ); + ADD_SMI_EVENT_V("MEMORY_TOTAL_VIS_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_total_get ); + ADD_SMI_EVENT_V("MEMORY_TOTAL_GTT", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_total_get ); + ADD_SMI_EVENT_V("MEMORY_USAGE_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VRAM, 0, &_smi_wrapper_memory_usage_get ); + ADD_SMI_EVENT_V("MEMORY_USAGE_VIS_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_usage_get ); + ADD_SMI_EVENT_V("MEMORY_USAGE_GTT", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_usage_get ); + ADD_SMI_EVENT_N("MEMORY_BUSY_PERCENT", "rsmi_dev_memory_busy_percent_get", 0, &_smi_wrapper_memory_busy_percent_get ); + ADD_SMI_EVENT_N("MEMORY_NUM_RESERVED_PAGES", "rsmi_dev_memory_reserved_pages_get", 0, &_smi_wrapper_memory_reserved_pages_get ); + ADD_SMI_EVENT_I("FAN_RPMS", "rsmi_dev_fan_rpms_get", 0, &_smi_wrapper_fan_rpms_get ); + ADD_SMI_EVENT_I("FAN_SPEED", "rsmi_dev_fan_speed_get", 0, &_smi_wrapper_fan_speed_get ); + ADD_SMI_EVENT_I("FAN_SPEED_MAX", "rsmi_dev_fan_speed_max_get", 0, &_smi_wrapper_fan_speed_max_get ); + ADD_SMI_EVENT_S("TEMP_EDGE", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_EDGE, 0, &_smi_wrapper_temp_metric_get ); + ADD_SMI_EVENT_S("TEMP_JUNCTION", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_JUNCTION, 0, &_smi_wrapper_temp_metric_get ); + ADD_SMI_EVENT_S("TEMP_MEMORY", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_MEMORY, 0, &_smi_wrapper_temp_metric_get ); + ADD_SMI_EVENT_S("VOLT_VDDGFX", "rsmi_dev_volt_metric_get", RSMI_VOLT_CURRENT, RSMI_VOLT_TYPE_VDDGFX, 0, &_smi_wrapper_volt_metric_get ); + ADD_SMI_EVENT_N("OVERDRIVE_LEVEL", "rsmi_dev_overdrive_level_get", 0, &_smi_wrapper_overdrive_level_get ); + ADD_SMI_EVENT_V("ECC_COUNT_UMC_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 0, &_smi_wrapper_ecc_count_get ); + 
ADD_SMI_EVENT_V("ECC_COUNT_UMC_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SDMA_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SDMA_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_GFX_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_GFX_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_HDP_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_HDP_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 1, &_smi_wrapper_ecc_count_get ); + 
ADD_SMI_EVENT_V("ECC_COUNT_DF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_DF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SMN_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SMN_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SEM_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SEM_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP0_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP0_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP1_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP1_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_FUSE_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_FUSE_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_LAST_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_LAST_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_N("PROCS_USING_GPU", "rsmi_compute_process_info_get", 0, &_smi_wrapper_compute_process_info_get ); + + return 0; +} + +static int +_rocmon_setupCounters_smi(RocmonDevice* device, const 
char** events, int numEvents) +{ + int ret; + const int instanceNumLen = 5; + + // Delete previous events + if (device->activeSmiEvents) + { + free(device->activeSmiEvents); + device->activeSmiEvents = NULL; + device->numActiveSmiEvents = 0; + } + + // Look if the are any events + if (numEvents <= 0) + { + return 0; + } + + // Create event array + RocmonSmiEvent* activeEvents = (RocmonSmiEvent*) malloc(numEvents * sizeof(RocmonSmiEvent)); + if (activeEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate active event list); + return -ENOMEM; + } + + for (int i = 0; i < numEvents; i++) + { + char eventName[MAX_ROCMON_SMI_EVENT_NAME]; + int instance = -1; + + // Parse event name -> normal event vs one with multiple instances (EVENT[0]) + const char* event = events[i]; + char* instancePart = strrchr(event, '['); + if (instancePart != NULL) + { + char withoutBrackets[instanceNumLen+1]; // +1 is '\0' + int partlen = strlen(instancePart); + + // Check if number fit in 'withoutBrackets' + if (partlen - 2 > instanceNumLen) + { + ERROR_PRINT(Instance number in '%s' is too large, event); + free(activeEvents); + return -EINVAL; + } + + // Copy instance number without brackets + strncpy(withoutBrackets, instancePart+1, partlen-2); + withoutBrackets[instanceNumLen] = '\0'; + + // Parse instance as number + char* endParsed; + instance = strtol(withoutBrackets, &endParsed, 10); + + // Check if parsing was successful + char* endOfString = &withoutBrackets[partlen-2]; + if (endParsed != endOfString) + { + ERROR_PRINT(Failed to parse instance number in '%s', event); + free(activeEvents); + return -EINVAL; + } + + // Copy event name without instance + int eventNameLen = instancePart - event; + strncpy(eventName, event, eventNameLen); + eventName[eventNameLen] = '\0'; + } + else + { + // Copy entire event name + strncpy(eventName, event, MAX_ROCMON_SMI_EVENT_NAME); + } + + // Lookup event in available events + RocmonSmiEvent* metric = NULL; + ret = get_smap_by_key(device->smiMetrics, 
eventName, (void**)&metric); + if (ret < 0) + { + ERROR_PRINT(RSMI event '%s' not found for device %d, eventName, device->deviceId); + free(activeEvents); + return -EINVAL; + } + + // Copy event + RocmonSmiEvent* tmpEvent = &activeEvents[i]; + memcpy(tmpEvent, metric, sizeof(RocmonSmiEvent)); + + // Check if event supports instances + if (instance >= 0 && tmpEvent->type != ROCMON_SMI_EVENT_TYPE_INSTANCES) + { + ERROR_PRINT(Instance number given but event '%s' does not support one, eventName); + free(activeEvents); + return -EINVAL; + } + + // Check if event requires instances + if (instance < 0 && tmpEvent->type == ROCMON_SMI_EVENT_TYPE_INSTANCES) + { + ERROR_PRINT(No instance number given but event '%s' requires one, eventName); + free(activeEvents); + return -EINVAL; + } + + // Check if event has enough instances + if (instance >= 0 && instance >= metric->instances) + { + ERROR_PRINT(Instance %d seleced but event '%s' has only %d, instance, eventName, metric->instances); + free(activeEvents); + return -EINVAL; + } + + // Set instance number + if (instance >= 0) + { + tmpEvent->subvariant = instance; + } + } + + device->activeSmiEvents = activeEvents; + device->numActiveSmiEvents = numEvents; + + return 0; +} + + +int +rocmon_smi_setupCounters(RocmonContext* context, int gid) +{ + int ret = 0; + int numSmiEvents = 0; + const char **smiEvents = NULL; + // Check arguments + if (gid < 0 || gid >= context->numActiveGroups) + { + return -EINVAL; + } + + // Ensure rocmon is initialized + if (!rocmon_smi_initialized) + { + return -EFAULT; + } + + // Get group info + GroupInfo* group = &context->groups[gid]; + + // Allocate memory for string arrays + smiEvents = (const char**) malloc(group->nevents * sizeof(const char*)); + if (smiEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate smiEvents name array); + return -ENOMEM; + } + + // Go through each event and sort it + for (int i = 0; i < group->nevents; i++) + { + const char* name = group->events[i]; + if 
(strncmp(name, "RSMI_", 5) == 0) + { + // Rocprofiler event + smiEvents[numSmiEvents] = name + 5; // +5 removes 'ROCP_' prefix + numSmiEvents++; + } + } + if (numSmiEvents == 0) + { + free(smiEvents); + return 0; + } + + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + ret = _rocmon_setupCounters_smi(device, smiEvents, numSmiEvents); + if (ret < 0) + { + ERROR_PRINT(Failed to setup ROCMON SMI events for device %d, i); + } + } + free(smiEvents); + return 0; +} + +int +rocmon_smi_readCounters(RocmonContext* context) +{ + // Ensure rocmon is initialized + if (!rocmon_smi_initialized) + { + return -EFAULT; + } + if (context->activeGroup < 0) + { + return -EFAULT; + } + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + // Check if there are any counters to start + if (device->numActiveSmiEvents <= 0) + { + return 0; + } + + // Save baseline values + RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; + for (int i = 0; i < device->numActiveSmiEvents; i++) + { + double value = 0; + RocmonSmiEvent* event = &device->activeSmiEvents[i]; + RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; + // Measure counter + if (event->measureFunc) + { + event->measureFunc(device->deviceId, event, result); + } + } + } + return 0; +} + +int +rocmon_smi_startCounters(RocmonContext* context) +{ + // Ensure rocmon is initialized + if (!rocmon_smi_initialized) + { + return -EFAULT; + } + if (context->activeGroup < 0) + { + return -EFAULT; + } + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + // Check if there are any counters to start + if (device->numActiveSmiEvents <= 0) + { + return 0; + } + + // Save baseline values + RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; + for (int i = 0; i < device->numActiveSmiEvents; i++) + { + double 
value = 0; + RocmonSmiEvent* event = &device->activeSmiEvents[i]; + RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; + + // Measure counter + if (event->measureFunc) + { + event->measureFunc(device->deviceId, event, result); + } + + // Save value + result->fullValue = 0; + } + } + return 0; +} + +int +rocmon_smi_stopCounters(RocmonContext* context) +{ + int ret; + + // Ensure rocmon is initialized + if (!rocmon_smi_initialized) + { + return -EFAULT; + } + return 0; +} + + +static int +rocmon_smi_getEventsOfGpu(RocmonContext* context, int gpuIdx, EventList_rocm_t* list) +{ + EventList_rocm_t tmpList = NULL; + Event_rocm_t* tmpEventList = NULL; + // Ensure rocmon is initialized + if (!rocmon_smi_initialized) + { + return -EFAULT; + } + // Validate args + if ((gpuIdx < 0) || (gpuIdx > rocmon_context->numDevices) || (!list)) + { + return -EINVAL; + } + + RocmonDevice* device = &rocmon_context->devices[gpuIdx]; + + if (*list) + { + tmpList = *list; + } + else + { + // Allocate list structure + EventList_rocm_t tmpList = (EventList_rocm_t) malloc(sizeof(EventList_rocm)); + if (tmpList == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate event list); + return -ENOMEM; + } + memset(tmpList, 0, sizeof(EventList_rocm)); + } + + // Get number of events + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Add %d ROCm SMI events, get_map_size(device->smiMetrics)); + if (get_map_size(device->smiMetrics) == 0) + { + // No events -> return list + *list = tmpList; + return 0; + } + // (Re-)Allocate event array + tmpEventList = realloc(tmpList->events, (tmpList->numEvents + get_map_size(device->smiMetrics)) * sizeof(Event_rocm_t)); + if (!tmpEventList) + { + if (!*list) free(tmpList); + ERROR_PLAIN_PRINT(Cannot allocate events for event list); + return -ENOMEM; + } + tmpList->events = tmpEventList; + int startindex = tmpList->numEvents; + + // Copy ROCm SMI metric information + for (int i = 0; i < get_map_size(device->smiMetrics); i++) + { + RocmonSmiEvent* event = 
NULL; + Event_rocm_t* out = &tmpList->events[startindex + i]; + int len; + + // Get event + if (get_smap_by_idx(device->smiMetrics, i, (void**)&event) < 0) + { + continue; + } + + // Copy name + len = strlen(event->name) + 5 /* Prefix */ + 1 /* NULL byte */; + out->name = (char*) malloc(len); + if (out->name) + { + snprintf(out->name, len, "RSMI_%s", event->name); + } + + // Copy description + char* description = "SMI Event"; // TODO: use real descriptions + len = strlen(description) + 1 /* NULL byte */; + out->description = (char*) malloc(len); + if (out->description) + { + snprintf(out->description, len, "%s", description); + } + + // Copy instances + out->instances = event->instances; + tmpList->numEvents++; + } + + *list = tmpList; + return 0; +} + + +int rocmon_smi_init(RocmonContext* context, int numGpus, const int* gpuIds) +{ + int ret = 0; + if ((!context) || (numGpus <= 0) || (!gpuIds)) + { + return -EINVAL; + } + + ret = _rocmon_smi_link_libraries(); + if (ret < 0) + { + return -EFAULT; + } + + // init rocm smi library + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RSMI); + RSMI_CALL(rsmi_init, (0), + { + ERROR_PLAIN_PRINT(Failed to init rocm_smi); + goto rocmon_init_rsmi_failed; + }); + + // Get available SMI events for devices + _rocmon_smi_init_events(context); + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice *device = &context->devices[i]; + // Initialize SMI events map + if (init_map(&device->smiMetrics, MAP_KEY_TYPE_STR, 0, &free) < 0) + { + ERROR_PLAIN_PRINT(Cannot init smiMetrics map); + goto rocmon_init_rsmi_failed; + } + if (_rocmon_smi_get_functions(device) < 0) + { + ERROR_PRINT(Failed to get SMI functions for device %d, device->deviceId); + goto rocmon_init_rsmi_failed; + } + } + rocmon_smi_initialized = TRUE; + return 0; +rocmon_init_rsmi_failed: + RSMI_CALL(rsmi_shut_down, (), { + // fall through + }); + return 0; +} + + +void rocmon_smi_finalize(RocmonContext* context) +{ + if (!rocmon_smi_initialized) + { + 
return; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Finalize LIKWID ROCMON SMI); + if (context) + { + if (context->devices) + { + // Free each devices fields + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + if (device->activeSmiEvents) + { + free(device->activeSmiEvents); + device->activeSmiEvents = NULL; + device->numActiveSmiEvents = 0; + } + if (device->smiMetrics) + { + destroy_smap(device->smiMetrics); + device->smiMetrics = NULL; + } + } + } + } + + RSMI_CALL(rsmi_shut_down, (), { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown SMI); + // fall through + }); + rocmon_smi_initialized = FALSE; +} + +int +rocmon_smi_switchActiveGroup(RocmonContext* context, int newGroupId) +{ + int ret; + + ret = rocmon_smi_stopCounters(context); + if (ret < 0) + { + return ret; + } + + ret = rocmon_smi_setupCounters(context, newGroupId); + if (ret < 0) + { + return ret; + } + + ret = rocmon_smi_startCounters(context); + if (ret < 0) + { + return ret; + } + + return 0; +} + +#endif /* LIKWID_ROCMON_SMI_H */ diff --git a/src/includes/rocmon_smi_types.h b/src/includes/rocmon_smi_types.h new file mode 100644 index 000000000..cb6a5efae --- /dev/null +++ b/src/includes/rocmon_smi_types.h @@ -0,0 +1,81 @@ +/* + * ======================================================================================= + * + * Filename: rocmon_smi_types.h + * + * Description: Header File of rocmon for smi backend. + * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com + * Project: likwid + * + * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. 
struct RocmonSmiEvent_struct;
/*
 * Signature of the functions that measure a single SMI event. They are called
 * with the id of the device, the event to measure and the result slot of the
 * event in the active group.
 */
typedef int (*RocmonSmiMeasureFunc)(int deviceId, struct RocmonSmiEvent_struct* event, RocmonEventResult* result);

/*
 * Kind of an SMI event. Events of type ROCMON_SMI_EVENT_TYPE_INSTANCES have to
 * be selected with an instance suffix ("EVENT[0]"); for all other types an
 * instance suffix is rejected when the counters are set up.
 */
typedef enum {
    ROCMON_SMI_EVENT_TYPE_NORMAL = 0,
    ROCMON_SMI_EVENT_TYPE_VARIANT,
    ROCMON_SMI_EVENT_TYPE_SUBVARIANT,
    ROCMON_SMI_EVENT_TYPE_INSTANCES
} RocmonSmiEventType;

/* Size of the name buffer of an SMI event including the terminating NUL byte */
#define MAX_ROCMON_SMI_EVENT_NAME 40
/*
 * Description of a single SMI event.
 * NOTE(review): variant, subvariant and extra are presumably the additional
 * arguments handed to the wrapped rsmi_* function - confirm against the
 * ADD_SMI_EVENT_* macros and the measure functions.
 */
typedef struct RocmonSmiEvent_struct {
    char name[MAX_ROCMON_SMI_EVENT_NAME]; // event name without the 'RSMI_' prefix
    uint64_t variant;
    uint64_t subvariant;                  // overwritten with the selected instance for instance events
    uint64_t extra;
    int instances;                        // selected instance numbers have to be smaller than this
    RocmonSmiEventType type;
    RocmonSmiMeasureFunc measureFunc;     // may be NULL, the event is then not measured
} RocmonSmiEvent;

/* Array of SMI events together with its length */
typedef struct {
    RocmonSmiEvent* entries;
    int numEntries;
} RocmonSmiEventList;
b/src/includes/rocmon_v1.h @@ -30,34 +30,963 @@ #ifndef LIKWID_ROCMON_V1_H #define LIKWID_ROCMON_V1_H -int rocmon_v1_init(int numGpus, const int* gpuIds); -void rocmon_v1_finalize(void); -int rocmon_v1_addEventSet(const char* eventString, int* gid); -int rocmon_v1_setupCounters(int gid); -int rocmon_v1_startCounters(void); -int rocmon_v1_stopCounters(void); -int rocmon_v1_readCounters(void); -double rocmon_v1_getResult(int gpuIdx, int groupId, int eventId); -double rocmon_v1_getLastResult(int gpuIdx, int groupId, int eventId); -int rocmon_v1_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list); -void rocmon_v1_freeEventsOfGpu(EventList_rocm_t list); -int rocmon_v1_switchActiveGroup(int newGroupId); -int rocmon_v1_getNumberOfGroups(void); -int rocmon_v1_getIdOfActiveGroup(void); -int rocmon_v1_getNumberOfGPUs(void); -int rocmon_v1_getNumberOfEvents(int groupId); -int rocmon_v1_getNumberOfMetrics(int groupId); -double rocmon_v1_getTimeOfGroup(int groupId); -double rocmon_v1_getLastTimeOfGroup(int groupId); -double rocmon_v1_getTimeToLastReadOfGroup(int groupId); -char* rocmon_v1_getEventName(int groupId, int eventId); -char* rocmon_v1_getCounterName(int groupId, int eventId); -char* rocmon_v1_getMetricName(int groupId, int metricId); -char* rocmon_v1_getGroupName(int groupId); -char* rocmon_v1_getGroupInfoShort(int groupId); -char* rocmon_v1_getGroupInfoLong(int groupId); -int rocmon_v1_getGroups(char*** groups, char*** shortinfos, char*** longinfos); -int rocmon_v1_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos); +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + + + +// #include +// #include +// #include + +// Variables +static void *rocmon_v1_dl_hsa_lib = NULL; +static void *rocmon_v1_dl_profiler_lib = NULL; + + +static bool rocmon_v1_initialized = FALSE; + +// Macros +#ifndef FREE_IF_NOT_NULL +#define 
#ifndef ROCM_CALL
/*
 * Call a dynamically linked HSA/rocprofiler function through its '<call>_ptr'
 * function pointer and run 'handleerror' if it fails.
 *
 * call         name of the function without the '_ptr' suffix
 * args         parenthesized argument list
 * handleerror  statement(s) executed on failure, may be empty
 *
 * HSA_STATUS_SUCCESS and HSA_STATUS_INFO_BREAK are not treated as failures.
 * The error string is also fetched through the dlsym'ed pointer, like every
 * other ROCm function, and only if that pointer has been linked.
 */
#define ROCM_CALL( call, args, handleerror )                                  \
    do {                                                                      \
        hsa_status_t _status = (*call##_ptr)args;                             \
        if (_status != HSA_STATUS_SUCCESS && _status != HSA_STATUS_INFO_BREAK) {           \
            fprintf(stderr, "Error: function %s failed with error %d\n", #call, (int) _status);  \
            const char* err = NULL;                                           \
            if (rocprofiler_error_string_ptr) (*rocprofiler_error_string_ptr)(&err);       \
            if (err) fprintf(stderr, "Error: %s\n", err);                     \
            handleerror;                                                      \
        }                                                                     \
    } while (0)
#endif
---------------------------------------------------- +// Rocmon helper functions +// ---------------------------------------------------- + +static int +_rocmon_v1_link_libraries() +{ + #define DLSYM_AND_CHECK( dllib, name ) name##_ptr = dlsym( dllib, #name ); if ( dlerror() != NULL ) { ERROR_PRINT(Failed to link #name); return -1; } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm V1 libraries); + + // Need to link in the ROCm HSA libraries + rocmon_v1_dl_hsa_lib = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_v1_dl_hsa_lib) + { + ERROR_PRINT(ROCm HSA library libhsa-runtime64.so not found: %s, dlerror()); + return -1; + } + + // Need to link in the Rocprofiler libraries + rocmon_v1_dl_profiler_lib = dlopen("librocprofiler64.so", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_v1_dl_profiler_lib) + { + rocmon_v1_dl_profiler_lib = dlopen("librocprofiler64.so.1", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_v1_dl_profiler_lib) + { + ERROR_PRINT(Rocprofiler library librocprofiler64.so not found: %s, dlerror()); + return -1; + } + } + + // Link HSA functions + DLSYM_AND_CHECK(rocmon_v1_dl_hsa_lib, hsa_init); + DLSYM_AND_CHECK(rocmon_v1_dl_hsa_lib, hsa_shut_down); + DLSYM_AND_CHECK(rocmon_v1_dl_hsa_lib, hsa_iterate_agents); + DLSYM_AND_CHECK(rocmon_v1_dl_hsa_lib, hsa_agent_get_info); + DLSYM_AND_CHECK(rocmon_v1_dl_hsa_lib, hsa_system_get_info); + + // Link Rocprofiler functions + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_iterate_info); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_close); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_open); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_error_string); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_start); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_stop); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_read); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_get_data); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, 
rocprofiler_get_metrics); + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm V1 libraries done); + return 0; +} + +typedef struct { + RocmonContext* context; + int numGpus; + const int* gpuIds; +} iterate_agents_cb_arg; + +typedef struct { + RocmonDevice* device; + int currIndex; +} iterate_info_cb_arg; + + +static hsa_status_t +_rocmon_v1_iterate_info_callback_count(const rocprofiler_info_data_t info, void* data) +{ + RocmonDevice* device = (RocmonDevice*) data; + if (device) { + device->numRocMetrics++; + } + return HSA_STATUS_SUCCESS; +} + +static void +_rocmon_v1_print_rocprofiler_info_data(const rocprofiler_info_data_t info) +{ + if (info.kind != ROCPROFILER_INFO_KIND_METRIC) + { + return; + } + printf("Name '%s':\n", info.metric.name); + printf("\tKind: '%s'\n", (info.kind == ROCPROFILER_INFO_KIND_METRIC ? "Metric" : "Trace")); + printf("\tInstances: %d\n", info.metric.instances); + printf("\tDescription: '%s'\n", info.metric.description); + printf("\tExpression: '%s'\n", info.metric.expr); + printf("\tBlockName: '%s'\n", info.metric.block_name); + printf("\tBlockCounters: %d\n", info.metric.block_counters); +} + +static hsa_status_t +_rocmon_v1_iterate_info_callback_add(const rocprofiler_info_data_t info, void* data) +{ + iterate_info_cb_arg* arg = (iterate_info_cb_arg*) data; + + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _rocmon_iterate_info_callback_add); + if (likwid_rocmon_verbosity == DEBUGLEV_DEVELOP) + { + _rocmon_v1_print_rocprofiler_info_data(info); + } + // Check info kind + if (info.kind != ROCPROFILER_INFO_KIND_METRIC) + { + ERROR_PRINT(Wrong info kind %u, info.kind); + return HSA_STATUS_ERROR; + } + + // Check index + if (arg->currIndex >= arg->device->numRocMetrics) + { + ERROR_PRINT(Metric index out of bounds: %d, arg->currIndex); + return HSA_STATUS_ERROR; + } + + // Copy info data + rocprofiler_info_data_t* target_info = &arg->device->v1_rocMetrics[arg->currIndex]; + memcpy(target_info, &info, sizeof(rocprofiler_info_data_t)); + 
arg->currIndex++; + + return HSA_STATUS_SUCCESS; +} + + +static hsa_status_t +_rocmon_v1_iterate_agents_callback(hsa_agent_t agent, void* argv) +{ + // Count number of callback invocations as the devices id + static int nextDeviceId = 0; + int deviceId = nextDeviceId; + bool noAgent = false; + + iterate_agents_cb_arg *arg = (iterate_agents_cb_arg*) argv; + + // Check if device is a GPU + hsa_device_type_t type; + ROCM_CALL(hsa_agent_get_info, (agent, HSA_AGENT_INFO_DEVICE, &type), return -1); + if (type != HSA_DEVICE_TYPE_GPU) + { + return HSA_STATUS_SUCCESS; + } + nextDeviceId++; + + // Check if device is includes in arg->gpuIds + int gpuIndex = -1; + for (int i = 0; i < arg->numGpus; i++) + { + if (deviceId == arg->gpuIds[i]) + { + gpuIndex = i; + break; + } + } + if (gpuIndex < 0) + { + return HSA_STATUS_SUCCESS; + } + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing agent %d, gpuIndex); + + // Add agent to context + RocmonDevice *device = &arg->context->devices[gpuIndex]; + device->deviceId = deviceId; + device->hsa_agent = agent; + device->v1_context = NULL; + device->numActiveRocEvents = 0; + device->v1_activeRocEvents = NULL; + device->numGroupResults = 0; + device->groupResults = NULL; + + // Get number of available metrics + device->numRocMetrics = 0; + ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_v1_iterate_info_callback_count, device), return HSA_STATUS_ERROR); + //ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, RocProfiler provides %d events, device->numRocMetrics); + + // workaround for bug in ROCm 5.4.0 + if(device->numRocMetrics == 0) { + ROCM_CALL(rocprofiler_iterate_info, (NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_v1_iterate_info_callback_count, device), return HSA_STATUS_ERROR); + noAgent = true; + } + + // Allocate memory for metrics + device->v1_rocMetrics = (rocprofiler_info_data_t*) malloc(device->numRocMetrics * sizeof(rocprofiler_info_data_t)); + if (device->v1_rocMetrics == NULL) + { + ERROR_PLAIN_PRINT(Cannot 
allocate set of v1_rocMetrics); + return HSA_STATUS_ERROR; + } + + // Fetch metric informatino + iterate_info_cb_arg info_arg = { + .device = device, + .currIndex = 0, + }; + //ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, Read %d RocProfiler events for device %d, device->numRocMetrics, device->deviceId); + + // If the call fails with agent, call rocprofiler_iterate_info without agent + if(noAgent) + { + ROCM_CALL(rocprofiler_iterate_info, (NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_v1_iterate_info_callback_add, &info_arg), return HSA_STATUS_ERROR); + } else { + ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_v1_iterate_info_callback_add, &info_arg), return HSA_STATUS_ERROR); + } + + return HSA_STATUS_SUCCESS; +} + + + + + +static int +_rocmon_v1_get_timestamp(uint64_t* timestamp_ns) +{ + uint64_t timestamp; + + // Get timestamp from system + ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP, ×tamp), return -1); + // Convert to nanoseconds + *timestamp_ns = (uint64_t)((long double)timestamp * rocmon_context->hsa_timestamp_factor); + + return 0; +} + + +static int +_rocmon_v1_getLastResult(RocmonDevice* device, int eventId, double* value) +{ + rocprofiler_data_t* data = &device->v1_activeRocEvents[eventId].data; + + switch (data->kind) + { + case ROCPROFILER_DATA_KIND_INT32: + *value = (double) data->result_int32; + break; + case ROCPROFILER_DATA_KIND_INT64: + *value = (double) data->result_int64; + break; + case ROCPROFILER_DATA_KIND_FLOAT: + *value = (double) data->result_float; + break; + case ROCPROFILER_DATA_KIND_DOUBLE: + *value = data->result_double; + break; + + case ROCPROFILER_DATA_KIND_BYTES: + case ROCPROFILER_DATA_KIND_UNINIT: + default: + return -1; + } + + return 0; +} + + +static int +_rocmon_readCounters_rocprofiler_v1(RocmonDevice* device) +{ + int ret; + + // Check if there are any counters to start + if (device->numActiveRocEvents <= 0) + { + return 0; + } + + if (!device->v1_context) + { + return 0; + } + + 
ROCM_CALL(rocprofiler_read, (device->v1_context, 0), return -1); + ROCM_CALL(rocprofiler_get_data, (device->v1_context, 0), return -1); + ROCM_CALL(rocprofiler_get_metrics, (device->v1_context), return -1); + + // Update results + RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; + for (int i = 0; i < device->numActiveRocEvents; i++) + { + RocmonEventResult* result = &groupResult->results[i]; + + // Read value + ret = _rocmon_v1_getLastResult(device, i, &result->fullValue); + if (ret < 0) + { + return -1; + } + + // Calculate delta since last read + result->lastValue = result->fullValue - result->lastValue; + } + + return 0; +} + + + +int +_rocmon_v1_readCounters(RocmonContext* context, uint64_t* (*getDestTimestampFunc)(RocmonDevice* device)) +{ + int ret; + + // Get timestamp + uint64_t timestamp; + if (ret = _rocmon_v1_get_timestamp(×tamp)) + { + return ret; + } + + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + if (!device->rocprof_v1) continue; + + // Save timestamp + if (getDestTimestampFunc) + { + uint64_t* timestampDest = getDestTimestampFunc(device); + if (timestampDest) + { + *timestampDest = timestamp; + } + } + + // Read rocprofiler counters + ret = _rocmon_readCounters_rocprofiler_v1(device); + if (ret < 0) return ret; + } + + return 0; +} + + +static uint64_t* +_rocmon_v1_get_read_time(RocmonDevice* device) +{ + return &device->time.read; +} + + +static uint64_t* +_rocmon_v1_get_stop_time(RocmonDevice* device) +{ + return &device->time.stop; +} + + +int +rocmon_v1_init(RocmonContext* context, int numGpus, const int* gpuIds) +{ + hsa_status_t status = 0; + RocmonDevice* devices = NULL; + int num_devices = 0; + + // check if already initialized + if (rocmon_v1_initialized) + { + return 0; + } + if (context == NULL) + { + return -EEXIST; + } + + // Validate arguments + if (numGpus <= 0) + { + ERROR_PRINT(Number of gpus must be greater than 0 but only %d given, 
numGpus); + return -EINVAL; + } + + // Initialize other parts + init_configuration(); + + // initialize libraries + int ret = _rocmon_v1_link_libraries(); + if (ret < 0) + { + ERROR_PLAIN_PRINT(Failed to initialize libraries); + return ret; + } + + // init hsa library + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing HSA); + ROCM_CALL(hsa_init, (), + { + ERROR_PLAIN_PRINT(Failed to init hsa library); + goto rocmon_init_hsa_failed; + }); + + if (!context->devices) + { + context->devices = (RocmonDevice*) malloc(numGpus * sizeof(RocmonDevice)); + if (!context->devices) + { + ERROR_PLAIN_PRINT(Cannot allocate set of GPUs); + free(devices); + return -ENOMEM; + } + context->numDevices = numGpus; + } + // Get hsa timestamp factor + uint64_t frequency_hz; + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Getting HSA timestamp factor); + ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &frequency_hz), + { + ERROR_PLAIN_PRINT(Failed to get HSA timestamp factor); + goto rocmon_init_info_agents_failed; + }); + context->hsa_timestamp_factor = (long double)1000000000 / (long double)frequency_hz; + + // initialize structures for specified devices (fetch ROCm specific info) + iterate_agents_cb_arg arg = { + .context = context, + .numGpus = numGpus, + .gpuIds = gpuIds, + }; + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Iterating through %d available agents, numGpus); + ROCM_CALL(hsa_iterate_agents, (_rocmon_v1_iterate_agents_callback, &arg), + { + ERROR_PRINT(Error while iterating through available agents); + goto rocmon_init_info_agents_failed; + }); + + rocmon_v1_initialized = TRUE; + return 0; +rocmon_init_info_agents_failed: + ROCM_CALL(hsa_shut_down, (), { + // fall through + }); +rocmon_init_hsa_failed: + free(context->devices); + context->devices = NULL; + context->numDevices = 0; + return -1; +} + + +void +rocmon_v1_finalize(RocmonContext* context) +{ + + if (!rocmon_v1_initialized) + { + return; + } + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Finalize LIKWID 
ROCMON); + + if (context) + { + if (context->devices) + { + // Free each devices fields + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + if (device->rocprof_v1) + { + FREE_IF_NOT_NULL(device->v1_rocMetrics); + FREE_IF_NOT_NULL(device->v1_activeRocEvents); + } + if (device->groupResults) + { + // Free events of event result lists + for (int j = 0; j < device->numGroupResults; j++) + { + FREE_IF_NOT_NULL(device->groupResults[i].results); + } + // Free list + free(device->groupResults); + } + if (device->v1_context) + { + ROCM_CALL(rocprofiler_close, (device->v1_context),); + } + } + } + } + + ROCM_CALL(hsa_shut_down, (), { + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown HSA); + // fall through + }); +} + + +/*int*/ +/*rocmon_v1_addEventSet(const char* eventString, int* gid)*/ +/*{*/ +/* // Check arguments*/ +/* if (!eventString)*/ +/* {*/ +/* return -EINVAL;*/ +/* }*/ +/* */ +/* // Ensure rocmon is initialized*/ +/* if (!rocmon_v1_initialized)*/ +/* {*/ +/* return -EFAULT;*/ +/* }*/ + +/* // Allocate memory for event group if necessary*/ +/* if (rocmon_context->numActiveGroups == rocmon_context->numGroups)*/ +/* {*/ +/* GroupInfo* tmpInfo = (GroupInfo*) realloc(rocmon_context->groups, (rocmon_context->numGroups+1) * sizeof(GroupInfo));*/ +/* if (tmpInfo == NULL)*/ +/* {*/ +/* ERROR_PLAIN_PRINT(Cannot allocate additional group);*/ +/* return -ENOMEM;*/ +/* }*/ +/* rocmon_context->groups = tmpInfo;*/ +/* rocmon_context->numGroups++;*/ +/* }*/ + +/* // Parse event string*/ +/* int err = _rocmon_v1_parse_eventstring(eventString, &rocmon_context->groups[rocmon_context->numActiveGroups]);*/ +/* if (err < 0)*/ +/* {*/ +/* return err;*/ +/* }*/ + +/* */ + +/* *gid = rocmon_context->numActiveGroups;*/ +/* rocmon_context->numActiveGroups++;*/ +/* return 0;*/ +/*}*/ + + +int +_rocmon_setupCounters_rocprofiler_v1(RocmonDevice* device, const char** events, int numEvents) +{ + // Close previous rocprofiler context + if 
(device->v1_context) + { + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Closing previous rocprofiler context); + ROCM_CALL(rocprofiler_close, (device->v1_context), return -1); + } + + // Look if the are any events + if (numEvents <= 0) + { + return 0; + } + + // Create feature array to monitor + rocprofiler_feature_t* features = (rocprofiler_feature_t*) malloc(numEvents * sizeof(rocprofiler_feature_t)); + if (features == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate feature list); + return -ENOMEM; + } + for (int i = 0; i < numEvents; i++) + { + features[i].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[i].name = events[i]; + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEBUG, Setup ROCMON rocprofiler_v1 counter %d %s, i, events[i]); + } + + // Free previous feature array if present + FREE_IF_NOT_NULL(device->v1_activeRocEvents); + + device->numActiveRocEvents = numEvents; + device->v1_activeRocEvents = features; + + // Open context + rocprofiler_properties_t properties = {}; + properties.queue_depth = 128; + uint32_t mode = ROCPROFILER_MODE_STANDALONE | ROCPROFILER_MODE_CREATEQUEUE | ROCPROFILER_MODE_SINGLEGROUP; + + // Important: only a single profiling group is supported at this time which limits the number of events that can be monitored at a time. 
+ ROCM_CALL(rocprofiler_open, (device->hsa_agent, device->v1_activeRocEvents, device->numActiveRocEvents, &device->v1_context, mode, &properties), return -1); + + return 0; +} + + +int +rocmon_v1_setupCounters(RocmonContext* context, int gid) +{ + int ret; + + // Check arguments + if (gid < 0 || gid >= context->numActiveGroups) + { + return -EINVAL; + } + + // Ensure rocmon is initialized + if (!rocmon_v1_initialized) + { + return -EFAULT; + } + + // Get group info + GroupInfo* group = &context->groups[gid]; + + // + // Separate rocprofiler and SMI events + // + const char **rocEvents = NULL; + int numRocEvents = 0; + + // Allocate memory for string arrays + rocEvents = (const char**) malloc(group->nevents * sizeof(const char*)); + if (rocEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate rocEvent name array); + return -ENOMEM; + } + + // Go through each event and sort it + for (int i = 0; i < group->nevents; i++) + { + const char* name = group->events[i]; + if (strncmp(name, "ROCP_", 5) == 0) + { + // Rocprofiler event + rocEvents[numRocEvents] = name + 5; // +5 removes 'ROCP_' prefix + numRocEvents++; + } + } + + // Add events to each device + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + + // Add rocprofiler events + //ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCPROFILER WITH %d events, numRocEvents); + ret = _rocmon_setupCounters_rocprofiler_v1(device, rocEvents, numRocEvents); + if (ret < 0) + { + free(rocEvents); + return ret; + } + } + // Cleanup + free(rocEvents); + + return 0; +} + + +static int +_rocmon_startCounters_rocprofiler_v1(RocmonDevice* device) +{ + // Check if there are any counters to start + if (device->numActiveRocEvents <= 0) + { + return 0; + } + + // Reset results + RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; + for (int i = 0; i < device->numActiveRocEvents; i++) + { + RocmonEventResult* result = &groupResult->results[i]; + 
result->lastValue = 0; + result->fullValue = 0; + } + + if (device->v1_context) + { + ROCM_CALL(rocprofiler_start, (device->v1_context, 0), return -1); + } + + return 0; +} + + + +int +rocmon_v1_startCounters(RocmonContext* context) +{ + int ret; + + // Ensure rocmon is initialized + if (!rocmon_v1_initialized) + { + return -EFAULT; + } + + // Get timestamp + uint64_t timestamp; + if (ret = _rocmon_v1_get_timestamp(×tamp)) + { + return ret; + } + + // Start counters on each device + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + device->time.start = timestamp; + device->time.read = timestamp; + + // Start rocprofiler events + ret = _rocmon_startCounters_rocprofiler_v1(device); + if (ret < 0) return ret; + + // Start SMI events +/* _rocmon_startCounters_smi(device);*/ +/* if (ret < 0) return ret;*/ + } + + return 0; +} + + +static int +_rocmon_stopCounters_rocprofiler_v1(RocmonDevice* device) +{ + if (device->v1_context) + { + // Close context + ROCM_CALL(rocprofiler_stop, (device->v1_context, 0), return -1); + } + + return 0; +} + + +int +rocmon_v1_stopCounters(RocmonContext* context) +{ + int ret; + + // Ensure rocmon is initialized + if (!rocmon_v1_initialized) + { + return -EFAULT; + } + + // Read counters + ret = _rocmon_v1_readCounters(context, &_rocmon_v1_get_stop_time); + if (ret < 0) return ret; + + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + + // Stop rocprofiler events + ret = _rocmon_stopCounters_rocprofiler_v1(device); + if (ret < 0) return ret; + + // Nothing to stop for SMI events + } + + return 0; +} + + +int +rocmon_v1_readCounters(RocmonContext* context) +{ + int ret; + + // Ensure rocmon is initialized + if (!rocmon_v1_initialized) + { + return -EFAULT; + } + + // Read counters + ret = _rocmon_v1_readCounters(context, &_rocmon_v1_get_read_time); + if (ret < 0) return ret; + + return 0; +} + + +int +rocmon_v1_getEventsOfGpu(RocmonContext* 
context, int gpuIdx, EventList_rocm_t* list) +{ + EventList_rocm_t tmpList = NULL; + Event_rocm_t* tmpEventList = NULL; + // Ensure rocmon is initialized + if (!rocmon_v1_initialized) + { + return -EFAULT; + } + // Validate args + if ((gpuIdx < 0) || (gpuIdx > context->numDevices) || (!list)) + { + return -EINVAL; + } + + RocmonDevice* device = &context->devices[gpuIdx]; + + if (*list) + { + tmpList = *list; + } + else + { + // Allocate list structure + EventList_rocm_t tmpList = (EventList_rocm_t) malloc(sizeof(EventList_rocm)); + if (tmpList == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate event list); + return -ENOMEM; + } + memset(tmpList, 0, sizeof(EventList_rocm)); + } + + // Get number of events + printf("Number of events %d\n", device->numRocMetrics); + + if (device->numRocMetrics == 0) + { + // No events -> return list + *list = tmpList; + return 0; + } + // (Re-)Allocate event array + tmpEventList = realloc(tmpList->events, (tmpList->numEvents + device->numRocMetrics) * sizeof(Event_rocm_t)); + if (!tmpEventList) + { + if (!*list) free(tmpList); + ERROR_PLAIN_PRINT(Cannot allocate events for event list); + return -ENOMEM; + } + tmpList->events = tmpEventList; + int startindex = tmpList->numEvents; + + // Copy rocprofiler event information + for (int i = 0; i < device->numRocMetrics; i++) + { + rocprofiler_info_data_t* event = &device->v1_rocMetrics[i]; + Event_rocm_t* out = &tmpList->events[startindex + i]; + int len; + + // Copy name + printf("Name %s\n", event->metric.name); + len = strlen(event->metric.name) + 5 /* Prefix */ + 1 /* NULL byte */; + out->name = (char*) malloc(len); + if (out->name) + { + snprintf(out->name, len, "ROCP_%s", event->metric.name); + } + + // Copy description + len = strlen(event->metric.description) + 1 /* NULL byte */; + out->description = (char*) malloc(len); + if (out->description) + { + snprintf(out->description, len, "%s", event->metric.description); + } + tmpList->numEvents++; + } + *list = tmpList; + return 0; +} + + 
+int +rocmon_v1_switchActiveGroup(RocmonContext* context, int newGroupId) +{ + int ret; + + ret = rocmon_v1_stopCounters(context); + if (ret < 0) + { + return ret; + } + + ret = rocmon_v1_setupCounters(context, newGroupId); + if (ret < 0) + { + return ret; + } + + ret = rocmon_v1_startCounters(context); + if (ret < 0) + { + return ret; + } + + return 0; +} + #endif /* LIKWID_ROCMON_V1_H */ diff --git a/src/includes/rocmon_v1_types.h b/src/includes/rocmon_v1_types.h index 5d06f85d3..22d588a90 100644 --- a/src/includes/rocmon_v1_types.h +++ b/src/includes/rocmon_v1_types.h @@ -32,118 +32,28 @@ #include // #include -#ifndef ROCPROFILER_VERSION_MAJOR #ifdef HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE #undef HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE #endif -#include +#ifdef ROCPROFILER_EXPORT +#undef ROCPROFILER_EXPORT #endif -#include -#if AMDSMI_LIB_VERSION_YEAR == 23 && AMDSMI_LIB_VERSION_MAJOR == 4 && AMDSMI_LIB_VERSION_MINOR == 0 && AMDSMI_LIB_VERSION_RELEASE == 0 -typedef struct metrics_table_header_t metrics_table_header_t; +#ifdef ROCPROFILER_IMPORT +#undef ROCPROFILER_IMPORT #endif -#include -#include - -typedef struct { - double lastValue; - double fullValue; -} RocmonEventResult; - -typedef struct { - RocmonEventResult* results; // First rocprofiler results, then SMI results - int numResults; -} RocmonEventResultList; - - - -struct RocmonSmiEvent_struct; -typedef int (*RocmonSmiMeasureFunc)(int deviceId, struct RocmonSmiEvent_struct* event, RocmonEventResult* result); - -typedef enum { - ROCMON_SMI_EVENT_TYPE_NORMAL = 0, - ROCMON_SMI_EVENT_TYPE_VARIANT, - ROCMON_SMI_EVENT_TYPE_SUBVARIANT, - ROCMON_SMI_EVENT_TYPE_INSTANCES -} RocmonSmiEventType; - -typedef struct RocmonSmiEvent_struct { - char name[40]; - uint64_t variant; - uint64_t subvariant; - uint64_t extra; - int instances; - RocmonSmiEventType type; - RocmonSmiMeasureFunc measureFunc; -} RocmonSmiEvent; - -typedef struct { - RocmonSmiEvent* entries; - int numEntries; -} RocmonSmiEventList; - 
-typedef struct { - int deviceId; // LIKWID device id - - hsa_agent_t hsa_agent; // HSA agent handle for this device - rocprofiler_t* context; // Rocprofiler context (has activeEvents configured) - - // Available rocprofiler metrics - rocprofiler_info_data_t* rocMetrics; - int numRocMetrics; - - // Available ROCm SMI events - Map_t smiMetrics; - - // Currently configured rocprofiler events (bound to context) - rocprofiler_feature_t* activeRocEvents; - int numActiveRocEvents; - - // Currently configured ROCm SMI events - RocmonSmiEvent* activeSmiEvents; - int numActiveSmiEvents; - - // Results for all events in all event sets - RocmonEventResultList* groupResults; - int numGroupResults; - - // Timestamps in ns - struct { - uint64_t start; - uint64_t read; - uint64_t stop; - } time; -} RocmonDevice; - -typedef struct { - // Event Groups - GroupInfo *groups; - int numGroups; // Number of allocated groups - int numActiveGroups; // Number of used groups - int activeGroup; // Currently active group - - // Devices (HSA agents) - RocmonDevice *devices; - int numDevices; - - // System information - long double hsa_timestamp_factor; // hsa_timestamp * hsa_timestamp_factor = timestamp_in_ns +#ifdef ROCPROFILER_VERSION_MAJOR +#undef ROCPROFILER_VERSION_MAJOR +#endif +#ifdef ROCPROFILER_VERSION_MINOR +#undef ROCPROFILER_VERSION_MINOR +#endif +#ifdef ROCPROFILER_API +#undef ROCPROFILER_API +#endif +#include - // ROCm SMI events - Map_t smiEvents; -} RocmonContext; -extern RocmonContext *rocmon_context; +#include -typedef struct { - bstring tag; - int groupID; - int gpuCount; - int eventCount; - double* time; - uint32_t* count; - int* gpulist; - double** counters; -} LikwidRocmResults; #endif /* LIKWID_ROCMON_V1_TYPES_H */ diff --git a/src/rocmon.c b/src/rocmon.c index 7e552f968..743b3b33c 100644 --- a/src/rocmon.c +++ b/src/rocmon.c @@ -44,51 +44,296 @@ #include #include -#include +#include #ifdef HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE #undef 
HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE #endif -#ifndef LIKWID_ROCPROF_SDK -#include -#include -#else + +#include +int likwid_rocmon_verbosity = DEBUGLEV_ONLY_ERROR; +static int rocmon_initialized = 0; +static RocmonContext* rocmon_context = NULL; + +// Include backends +#include +#include +#ifdef LIKWID_ROCPROF_SDK #include #include #endif +#include +#include -#include - +//#include +const char* rocprofiler_group_arch = "amd_gpu"; void rocmon_finalize(void) { -#ifndef LIKWID_ROCPROF_SDK - rocmon_v1_finalize(); -#else - rocmon_sdk_finalize(); + if ((!rocmon_initialized) || (rocmon_context == NULL)) + { + rocmon_context = NULL; + rocmon_initialized = 0; + return; + } + if (rocmon_context->use_rocprofiler_v1) + { + rocmon_v1_finalize(rocmon_context); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + rocmon_sdk_finalize(rocmon_context); + } #endif + + rocmon_smi_finalize(rocmon_context); + + if (rocmon_context->devices) + { + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* dev = &rocmon_context->devices[i]; + if (dev->groupResults) + { + if (dev->groupResults->results) + { + free(dev->groupResults->results); + dev->groupResults->results = NULL; + dev->groupResults->numResults = 0; + } + free(dev->groupResults); + dev->groupResults = NULL; + } + } + free(rocmon_context->devices); + rocmon_context->devices = NULL; + rocmon_context->numDevices = 0; + } + if (rocmon_context->groups) + { + free(rocmon_context->groups); + rocmon_context->groups = NULL; + rocmon_context->numGroups = 0; + rocmon_context->numActiveGroups = 0; + rocmon_context->activeGroup = -1; + } + + free(rocmon_context); + rocmon_context = NULL; + + rocmon_initialized = FALSE; return; } int rocmon_init(int numGpus, const int* gpuIds) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_init(numGpus, gpuIds); + int err = 0; + + // check if already initialized + if (rocmon_initialized) + { + return 0; + } + if (rocmon_context != NULL) + { + return -EEXIST; + } + // Validate arguments + if 
(numGpus <= 0) + { + ERROR_PRINT(Number of gpus must be greater than 0 but only %d given, numGpus); + return -EINVAL; + } + if (!gpuIds) + { + ERROR_PRINT(Invalid GPU list); + return -EINVAL; + } + + // Initialize other parts + init_configuration(); + + // Allocate memory for context + rocmon_context = (RocmonContext*) malloc(sizeof(RocmonContext)); + if (rocmon_context == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate Rocmon context); + return -ENOMEM; + } + memset(rocmon_context, 0, sizeof(RocmonContext)); + rocmon_context->groups = NULL; + rocmon_context->devices = NULL; + +#ifdef LIKWID_ROCPROF_SDK + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RocProfiler SDK); + err = rocmon_sdk_init(rocmon_context, numGpus, gpuIds); + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RocProfiler SDK returned %d, err); #else - return rocmon_sdk_init(numGpus, gpuIds); + err = -1; #endif + if (err != 0) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RocProfiler V1); + err = rocmon_v1_init(rocmon_context, numGpus, gpuIds); + if (err == 0) + { + rocmon_context->use_rocprofiler_v1 = 1; + } + else + { + ERROR_PRINT(Failed to initialize Rocprofiler v1 and SDK); + free(rocmon_context); + rocmon_context = NULL; + return err; + } + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing ROCm SMI); + err = rocmon_smi_init(rocmon_context, numGpus, gpuIds); + if (err != 0) + { + // Only fail if there are no devices -> neither v1 nor sdk added them + if (rocmon_context->devices == NULL) + { + ERROR_PRINT(Failed to initialize Rocprofiler SMI); + free(rocmon_context); + rocmon_context = NULL; + return err; + } + } + rocmon_context->state = ROCMON_STATE_INITIALIZED; + rocmon_initialized = TRUE; + return err; +} + +int find_colon(const char* str) +{ + for (int i = 0; i < strlen(str); i++) + { + if (str[i] == ':') + { + return 1; + } + } + return 0; +} + +static int +_rocmon_parse_eventstring(const char* eventString, const char* arch, GroupInfo* group) +{ + int err = 0; + const char 
colon = ':'; + Configuration_t config = get_configuration(); + + if ((strstr(eventString, &colon) != NULL) || (find_colon(eventString))) + { + // If custom group -> perfgroup_customGroup + err = perfgroup_customGroup(eventString, group); + if (err < 0) + { + ERROR_PRINT(Cannot transform %s to performance group, eventString); + return err; + } + } + else + { + // If performance group -> perfgroup_readGroup + err = perfgroup_readGroup(config->groupPath, arch, eventString, group); + if (err == -EACCES) + { + ERROR_PRINT(Access to performance group %s not allowed, eventString); + return err; + } + else if (err == -ENODEV) + { + ERROR_PRINT(Performance group %s only available with deactivated HyperThreading, eventString); + return err; + } + if (err < 0) + { + ERROR_PRINT(Cannot read performance group %s for %s, eventString, arch); + return err; + } + } + + return 0; } int rocmon_addEventSet(const char* eventString, int* gid) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_addEventSet(eventString, gid); -#else - return rocmon_sdk_addEventSet(eventString, gid); -#endif + int ret = 0; + GroupInfo group = {}; + // Check arguments + if ((!gid) || (!eventString)) + { + return -EINVAL; + } + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + ERROR_PRINT(ROCMON not initialized); + return -EFAULT; + } + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding Eventstring %s, eventString); + ret = _rocmon_parse_eventstring(eventString, rocprofiler_group_arch, &group); + if (ret < 0) + { + return ret; + } + + // Allocate memory for event group if necessary + if (rocmon_context->numActiveGroups == rocmon_context->numGroups) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Increasing group space to %d, rocmon_context->numGroups+1); + GroupInfo* tmpInfo = (GroupInfo*) realloc(rocmon_context->groups, (rocmon_context->numGroups+1) * sizeof(GroupInfo)); + if (tmpInfo == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate additional group); + return -ENOMEM; + } + rocmon_context->groups = 
tmpInfo; + rocmon_context->numGroups++; + } + + // Allocate memory for event results + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Allocate result space); + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + + // Allocate memory for event results + int numEvents = group.nevents; + RocmonEventResult* tmpResults = (RocmonEventResult*) malloc(numEvents * sizeof(RocmonEventResult)); + if (tmpResults == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate event results); + return -ENOMEM; + } + + // Allocate memory for new event result list entry + RocmonEventResultList* tmpGroupResults = (RocmonEventResultList*) realloc(device->groupResults, (device->numGroupResults+1) * sizeof(RocmonEventResultList)); + if (tmpGroupResults == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate new event group result list); + return -ENOMEM; + } + + device->groupResults = tmpGroupResults; + device->groupResults[device->numGroupResults].results = tmpResults; + device->groupResults[device->numGroupResults].numResults = numEvents; + device->numGroupResults++; + } + + rocmon_context->groups[rocmon_context->numActiveGroups] = group; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Eventstring %s got GID %d, eventString, rocmon_context->numActiveGroups); + *gid = rocmon_context->numActiveGroups; + rocmon_context->numActiveGroups++; + return 0; } @@ -96,11 +341,119 @@ rocmon_addEventSet(const char* eventString, int* gid) int rocmon_setupCounters(int gid) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_setupCounters(gid); -#else - return rocmon_sdk_setupCounters(gid); + int ret; + + // Check arguments + if (gid < 0 || gid >= rocmon_context->numActiveGroups) + { + ERROR_PRINT(Invalid eventset ID %d, gid); + return -EINVAL; + } + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + ERROR_PRINT(Rocmon not initialized); + return -EFAULT; + } + if ((rocmon_context->state != ROCMON_STATE_STOPPED) && (rocmon_context->state != ROCMON_STATE_INITIALIZED)) + 
{ + ERROR_PRINT(Rocmon not in a valid state to setup -> %d, rocmon_context->state); + return -EFAULT; + } + + // Get group info + GroupInfo* group = &rocmon_context->groups[gid]; + + // + // Separate rocprofiler and SMI events + // + const char **smiEvents = NULL, **rocEvents = NULL; + int numSmiEvents = 0, numRocEvents = 0; + + // Allocate memory for string arrays + smiEvents = (const char**) malloc(group->nevents * sizeof(const char*)); + if (smiEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate smiEvent name array); + return -ENOMEM; + } + rocEvents = (const char**) malloc(group->nevents * sizeof(const char*)); + if (rocEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate rocEvent name array); + free(smiEvents); + return -ENOMEM; + } + + // Go through each event and sort it + for (int i = 0; i < group->nevents; i++) + { + const char* name = group->events[i]; + if (strncmp(name, "RSMI_", 5) == 0) + { + // RSMI event + smiEvents[numSmiEvents] = name + 5; // +5 removes 'RSMI_' prefix + numSmiEvents++; + } + else if (strncmp(name, "ROCP_", 5) == 0) + { + // Rocprofiler event + rocEvents[numRocEvents] = name + 5; // +5 removes 'ROCP_' prefix + numRocEvents++; + } + else + { + // Unknown event + ERROR_PRINT(Event '%s' has no prefix ('ROCP_' or 'RSMI_'), name); + return -EINVAL; + } + } + + // Add events to each device + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + + // Add rocprofiler events + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP ROCPROFILER WITH %d events, numRocEvents); + if (rocmon_context->use_rocprofiler_v1) + { + ret = rocmon_v1_setupCounters(rocmon_context, gid); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + ret = rocmon_sdk_setupCounters(rocmon_context, gid); + } #endif + if (ret < 0) + { + ERROR_PRINT(Setting up rocprofiler counters failed); + free(smiEvents); + free(rocEvents); + return ret; + } + + // Add SMI events + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP ROCM SMI WITH %d 
events, numSmiEvents); + ret = rocmon_smi_setupCounters(rocmon_context, gid); + if (ret < 0) + { + ERROR_PRINT(Setting up SMI counters failed); + free(smiEvents); + free(rocEvents); + return ret; + } + device->activeGroup = gid; + } + rocmon_context->activeGroup = gid; + rocmon_context->state = ROCMON_STATE_SETUP; + // Cleanup + free(smiEvents); + free(rocEvents); + + return 0; } @@ -108,273 +461,501 @@ rocmon_setupCounters(int gid) int rocmon_startCounters(void) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_startCounters(); -#else - return rocmon_sdk_startCounters(); + int ret = 0; + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + ERROR_PRINT(ROCMON not initialized); + return -EFAULT; + } + if ((rocmon_context->activeGroup < 0) || (rocmon_context->state != ROCMON_STATE_SETUP)) + { + ERROR_PRINT(No eventset configured for ROCMON); + return -EFAULT; + } + if (rocmon_context->use_rocprofiler_v1) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Starting ROCMON rocprofiler_v1 counters); + ret = rocmon_v1_startCounters(rocmon_context); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Starting ROCMON rocprofiler_sdk counters); + ret = rocmon_sdk_startCounters(rocmon_context); + } #endif + if (ret < 0) + { + return ret; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Starting ROCMON SMI counters); + ret = rocmon_smi_startCounters(rocmon_context); + if (ret < 0) + { + return ret; + } + rocmon_context->state = ROCMON_STATE_RUNNING; + return 0; } - int rocmon_stopCounters(void) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_stopCounters(); -#else - return rocmon_sdk_stopCounters(); + int ret = 0; + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + if ((rocmon_context->activeGroup < 0) || (rocmon_context->state != ROCMON_STATE_RUNNING)) + { + return -EFAULT; + } + if (rocmon_context->use_rocprofiler_v1) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Stopping ROCMON rocprofiler_v1 counters); + ret 
= rocmon_v1_stopCounters(rocmon_context); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Stopping ROCMON rocprofiler_sdk counters); + ret = rocmon_sdk_stopCounters(rocmon_context); + } #endif + if (ret < 0) + { + return ret; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Stopping ROCMON SMI counters); + ret = rocmon_smi_stopCounters(rocmon_context); + if (ret < 0) + { + return ret; + } + rocmon_context->state = ROCMON_STATE_STOPPED; + return 0; } - int rocmon_readCounters(void) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_readCounters(); -#else - return rocmon_sdk_readCounters(); + int ret = 0; + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + if ((rocmon_context->activeGroup < 0) || (rocmon_context->state != ROCMON_STATE_RUNNING)) + { + return -EFAULT; + } + if (rocmon_context->use_rocprofiler_v1) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Reading ROCMON rocprofiler_v1 counters); + ret = rocmon_v1_readCounters(rocmon_context); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Reading ROCMON rocprofiler_sdk counters); + ret = rocmon_sdk_readCounters(rocmon_context); + } #endif + if (ret < 0) + { + ERROR_PRINT(Failed to read ROCMON rocprofiler counters); + return ret; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Reading ROCMON SMI counters); + ret = rocmon_smi_readCounters(rocmon_context); + if (ret < 0) + { + ERROR_PRINT(Failed to read ROCMON SMI counters); + return ret; + } + return 0; } -double -rocmon_getResult(int gpuIdx, int groupId, int eventId) +int +rocmon_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getResult(gpuIdx, groupId, eventId); -#else - return rocmon_sdk_getResult(gpuIdx, groupId, eventId); + int ret = 0; + EventList_rocm_t l = malloc(sizeof(EventList_rocm)); + if (!l) + { + return -ENOMEM; + } + memset(l, 0, sizeof(EventList_rocm)); + if (rocmon_context->use_rocprofiler_v1) + { + 
ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding RocProfiler V1 events); + ret = rocmon_v1_getEventsOfGpu(rocmon_context, gpuIdx, &l); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding RocProfiler SDK events); + ret = rocmon_sdk_getEventsOfGpu(rocmon_context, gpuIdx, &l); + } #endif + if (ret < 0) + { + rocmon_freeEventsOfGpu(l); + return ret; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding ROCm SMI events); + ret = rocmon_smi_getEventsOfGpu(rocmon_context, gpuIdx, &l); + if (ret < 0) + { + rocmon_freeEventsOfGpu(l); + return ret; + } + *list = l; + return 0; } - -// TODO: multiple groups -double -rocmon_getLastResult(int gpuIdx, int groupId, int eventId) +void +rocmon_freeEventsOfGpu(EventList_rocm_t list) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getLastResult(gpuIdx, groupId, eventId); -#else - return rocmon_sdk_getLastResult(gpuIdx, groupId, eventId); -#endif + if (!list) + { + return; + } + if (list->events != NULL) + { + for (int i = 0; i < list->numEvents; i++) + { + Event_rocm_t* event = &list->events[i]; + if (event->name) { + free(event->name); + event->name = NULL; + } + if (event->description) { + free(event->description); + event->description = NULL; + } + } + free(list->events); + list->events = NULL; + } + free(list); + return; } int -rocmon_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list) +rocmon_switchActiveGroup(int newGroupId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getEventsOfGpu(gpuIdx, list); -#else - return rocmon_sdk_getEventsOfGpu(gpuIdx, list); + int ret = 0; + if (rocmon_context->use_rocprofiler_v1) + { + ret = rocmon_v1_switchActiveGroup(rocmon_context, newGroupId); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + ret = rocmon_sdk_switchActiveGroup(rocmon_context, newGroupId); + } #endif + if (ret < 0) + { + return ret; + } + ret = rocmon_smi_switchActiveGroup(rocmon_context, newGroupId); + if (ret < 0) + { + return ret; + } + return 0; } -void -rocmon_freeEventsOfGpu(EventList_rocm_t list) 
+ + +void rocmon_setVerbosity(int level) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_freeEventsOfGpu(list); -#else - return rocmon_sdk_freeEventsOfGpu(list); -#endif + if (level >= DEBUGLEV_ONLY_ERROR && level <= DEBUGLEV_DEVELOP) + { + likwid_rocmon_verbosity = level; + } } -int -rocmon_switchActiveGroup(int newGroupId) + +double +rocmon_getResult(int gpuIdx, int groupId, int eventId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_switchActiveGroup(newGroupId); -#else - return rocmon_sdk_switchActiveGroup(newGroupId); -#endif + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Validate gpuIdx + if (gpuIdx < 0 || gpuIdx >= rocmon_context->numDevices) + { + return -EFAULT; + } + + // Validate groupId + RocmonDevice* device = &rocmon_context->devices[gpuIdx]; + if (groupId < 0 || groupId >= device->numGroupResults) + { + return -EFAULT; + } + + // Validate eventId + RocmonEventResultList* groupResult = &device->groupResults[groupId]; + if (eventId < 0 || eventId >= groupResult->numResults) + { + return -EFAULT; + } + + // Return result + return groupResult->results[eventId].fullValue; +} + + +// TODO: multiple groups +double +rocmon_getLastResult(int gpuIdx, int groupId, int eventId) +{ + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Validate gpuIdx + if (gpuIdx < 0 || gpuIdx >= rocmon_context->numDevices) + { + return -EFAULT; + } + + // Validate groupId + RocmonDevice* device = &rocmon_context->devices[gpuIdx]; + if (groupId < 0 || groupId >= device->numGroupResults) + { + return -EFAULT; + } + + // Validate eventId + RocmonEventResultList* groupResult = &device->groupResults[groupId]; + if (eventId < 0 || eventId >= groupResult->numResults) + { + return -EFAULT; + } + + // Return result + return groupResult->results[eventId].lastValue; } int rocmon_getNumberOfGroups(void) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getNumberOfGroups(); -#else - return 
rocmon_sdk_getNumberOfGroups(); -#endif + if (!rocmon_context || !rocmon_initialized) + { + return -EFAULT; + } + return rocmon_context->numActiveGroups; } int rocmon_getIdOfActiveGroup(void) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getIdOfActiveGroup(); -#else - return rocmon_sdk_getIdOfActiveGroup(); -#endif + if (!rocmon_context || !rocmon_initialized) + { + return -EFAULT; + } + return rocmon_context->activeGroup; } int rocmon_getNumberOfGPUs(void) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getNumberOfGPUs(); -#else - return rocmon_sdk_getNumberOfGPUs(); -#endif + if (!rocmon_context || !rocmon_initialized) + { + return -EFAULT; + } + return rocmon_context->numDevices; } int rocmon_getNumberOfEvents(int groupId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getNumberOfEvents(groupId); -#else - return rocmon_sdk_getNumberOfEvents(groupId); -#endif + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return -EFAULT; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + return ginfo->nevents; } int rocmon_getNumberOfMetrics(int groupId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getNumberOfMetrics(groupId); -#else - return rocmon_sdk_getNumberOfMetrics(groupId); -#endif + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) + { + return -EFAULT; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + return ginfo->nmetrics; } double rocmon_getTimeOfGroup(int groupId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getTimeOfGroup(groupId); -#else - return rocmon_sdk_getTimeOfGroup(groupId); -#endif + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + return 0; } double rocmon_getLastTimeOfGroup(int groupId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getLastTimeOfGroup(groupId); -#else - return rocmon_sdk_getLastTimeOfGroup(groupId); -#endif + // Ensure rocmon is initialized + 
if (!rocmon_initialized) + { + return -EFAULT; + } + return 0; } double rocmon_getTimeToLastReadOfGroup(int groupId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getTimeToLastReadOfGroup(groupId); -#else - return rocmon_sdk_getTimeToLastReadOfGroup(groupId); -#endif + return 0; } char* rocmon_getEventName(int groupId, int eventId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getEventName(groupId, eventId); -#else - return rocmon_sdk_getEventName(groupId, eventId); -#endif + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + if ((eventId < 0) || (eventId >= ginfo->nevents)) + { + return NULL; + } + return ginfo->events[eventId]; } char* rocmon_getCounterName(int groupId, int eventId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getCounterName(groupId, eventId); -#else - return rocmon_sdk_getCounterName(groupId, eventId); -#endif + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + if ((eventId < 0) || (eventId >= ginfo->nevents)) + { + return NULL; + } + return ginfo->counters[eventId]; } char* rocmon_getMetricName(int groupId, int metricId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getMetricName(groupId, metricId); -#else - return rocmon_sdk_getMetricName(groupId, metricId); -#endif + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + if ((metricId < 0) || (metricId >= ginfo->nmetrics)) + { + return NULL; + } + return ginfo->metricnames[metricId]; } char* rocmon_getGroupName(int groupId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getGroupName(groupId); -#else - return rocmon_sdk_getGroupName(groupId); -#endif + if (!rocmon_context || 
!rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups)
+    {
+        return NULL;
+    }
+    GroupInfo* ginfo = &rocmon_context->groups[groupId];
+    return ginfo->groupname;
 }


 char*
 rocmon_getGroupInfoShort(int groupId)
 {
-#ifndef LIKWID_ROCPROF_SDK
-    return rocmon_v1_getGroupInfoShort(groupId);
-#else
-    return rocmon_sdk_getGroupInfoShort(groupId);
-#endif
+    // Returns the group's shortinfo field, or NULL when rocmon is not set up
+    // or groupId is out of range.
+    if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups)
+    {
+        return NULL;
+    }
+    GroupInfo* ginfo = &rocmon_context->groups[groupId];
+    return ginfo->shortinfo;
 }


 char*
 rocmon_getGroupInfoLong(int groupId)
 {
-#ifndef LIKWID_ROCPROF_SDK
-    return rocmon_v1_getGroupInfoLong(groupId);
-#else
-    return rocmon_sdk_getGroupInfoLong(groupId);
-#endif
+    // Returns the group's longinfo field, or NULL when rocmon is not set up
+    // or groupId is out of range.
+    if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups)
+    {
+        return NULL;
+    }
+    GroupInfo* ginfo = &rocmon_context->groups[groupId];
+    return ginfo->longinfo;
 }

-
 int
 rocmon_getGroups(char*** groups, char*** shortinfos, char*** longinfos)
 {
-#ifndef LIKWID_ROCPROF_SDK
-    return rocmon_v1_getGroups(groups, shortinfos, longinfos);
-#else
-    return rocmon_sdk_getGroups(groups, shortinfos, longinfos);
-#endif
+    // Forwards to perfgroup_getGroups() using the configured groupPath and
+    // rocprofiler_group_arch; its return value is passed through unchanged.
+    init_configuration();
+    Configuration_t config = get_configuration();
+
+
+    return perfgroup_getGroups(config->groupPath, rocprofiler_group_arch, groups, shortinfos, longinfos);
 }


 int
 rocmon_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos)
 {
-#ifndef LIKWID_ROCPROF_SDK
-    return rocmon_v1_returnGroups(nrgroups, groups, shortinfos, longinfos);
-#else
-    return rocmon_sdk_returnGroups(nrgroups, groups, shortinfos, longinfos);
-#endif
+    // Hands the arrays back to perfgroup_returnGroups().
+    perfgroup_returnGroups(nrgroups, groups, shortinfos, longinfos);
+    // This function is declared int: without an explicit return, a caller
+    // that inspects the result would read an indeterminate value.
+    return 0;
 }

-void rocmon_setVerbosity(int level)
+
+
+// only used internally by the ROCMON MarkerAPI
+// Yields NULL rather than dereferencing a missing context or indexing with
+// an out-of-range gid.
+GroupInfo* rocmon_get_group(int gid)
 {
-    if (level >= DEBUGLEV_ONLY_ERROR && level <= DEBUGLEV_DEVELOP)
+    if (rocmon_context && (gid >= 0) && (gid <
rocmon_context->numActiveGroups)) { - likwid_rocmon_verbosity = level; + return &rocmon_context->groups[gid]; } + return NULL; } + #endif /* LIKWID_WITH_ROCMON */ diff --git a/src/rocmon_marker.c b/src/rocmon_marker.c index 01e43ffac..dc7707022 100644 --- a/src/rocmon_marker.c +++ b/src/rocmon_marker.c @@ -39,11 +39,21 @@ #include #include -#ifndef LIKWID_ROCPROF_SDK +#include +#include #include +#ifdef LIKWID_ROCPROF_SDK +#include #endif +#include +#ifndef FREE_IF_NOT_NULL +#define FREE_IF_NOT_NULL(x) if (x != NULL) { free(x); x = NULL; } +#endif + +#ifndef gettid #define gettid() syscall(SYS_gettid) +#endif #ifndef NAN #define NAN (0.0/0.0) @@ -202,8 +212,6 @@ _rocmon_saveToFile(const char* markerfile) static void _rocmon_finalize(void) { -#define FREE_IF_NOT_NULL(x) if (x != NULL) { free(x); x = NULL; } - // Ensure markers were initialized if (!rocmon_marker_initialized) { @@ -316,7 +324,7 @@ rocmon_markerInit(void) ret = rocmon_addEventSet(bdata(gEventStrings->entry[i]), &gpu_groups[i]); if (ret < 0) { - fprintf(stderr,"Error setting up Rocmon Marker API.\n"); + fprintf(stderr,"Error setting up Rocmon Marker API: %d\n", ret); free(gpu_ids); free(gpu_maps); free(gpu_groups); @@ -337,7 +345,7 @@ rocmon_markerInit(void) ret = rocmon_setupCounters(gpu_groups[active_group]); if (ret) { - fprintf(stderr,"Error setting up Rocmon Marker API.\n"); + fprintf(stderr,"Error setting up Rocmon Marker API: %d\n", ret); free(gpu_ids); free(gpu_maps); free(gpu_groups); @@ -349,7 +357,7 @@ rocmon_markerInit(void) ret = rocmon_startCounters(); if (ret) { - fprintf(stderr,"Error starting up Rocmon Marker API.\n"); + fprintf(stderr,"Error starting up Rocmon Marker API: %d\n", ret); free(gpu_ids); free(gpu_maps); free(gpu_groups); @@ -1066,8 +1074,8 @@ rocmon_getMetricOfRegionGpu(int region, int metricId, int gpuId) { return NAN; } - GroupInfo* ginfo = &rocmon_context->groups[rocmMarkerResults[region].groupID]; - if (metricId < 0 || metricId >= ginfo->nmetrics) + GroupInfo* ginfo = 
rocmon_get_group(rocmMarkerResults[region].groupID); + if ((!ginfo) || (metricId < 0) || (metricId >= ginfo->nmetrics)) { return NAN; } diff --git a/src/rocmon_sdk.c b/src/rocmon_sdk.c deleted file mode 100644 index 7e66a1402..000000000 --- a/src/rocmon_sdk.c +++ /dev/null @@ -1,251 +0,0 @@ - /* ======================================================================================= - * - * Filename: rocmon_sdk.c - * - * Description: Main implementation of the performance monitoring module - * for AMD GPUs with ROCm >= 6.2 - * - * Version: - * Released: - * - * Author: Thomas Gruber (tg), thomas.roehl@googlemail.com - * Project: likwid - * - * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg - * - * This program is free software: you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free Software - * Foundation, either version 3 of the License, or (at your option) any later - * version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A - * PARTICULAR PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . 
- * - * ======================================================================================= - */ -#ifdef LIKWID_WITH_ROCMON - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include - -static bool rocmon_initialized = FALSE; -int likwid_rocmon_verbosity = DEBUGLEV_ONLY_ERROR; - -int -rocmon_sdk_init(int numGpus, const int* gpuIds) -{ - return 0; -} - - -void -rocmon_sdk_finalize(void) -{ - return; -} - - -int -rocmon_sdk_addEventSet(const char* eventString, int* gid) -{ - return 0; -} - -int -rocmon_sdk_setupCounters(int gid) -{ - return 0; -} - - -int -rocmon_sdk_startCounters(void) -{ - return 0; -} - -int -rocmon_sdk_stopCounters(void) -{ - return 0; -} - - -int -rocmon_sdk_readCounters(void) -{ - return 0; -} - - -double -rocmon_sdk_getResult(int gpuIdx, int groupId, int eventId) -{ - return 0.0; -} - - -// TODO: multiple groups -double -rocmon_sdk_getLastResult(int gpuIdx, int groupId, int eventId) -{ - return 0.0; -} - - -int -rocmon_sdk_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list) -{ - return -EINVAL; -} - -void -rocmon_sdk_freeEventsOfGpu(EventList_rocm_t list) -{ - return; -} - - -int -rocmon_sdk_switchActiveGroup(int newGroupId) -{ - return 0; -} - - -int -rocmon_sdk_getNumberOfGroups(void) -{ - return 0; -} - - -int -rocmon_sdk_getIdOfActiveGroup(void) -{ - return 0; -} - - -int -rocmon_sdk_getNumberOfGPUs(void) -{ - return 0; -} - - -int -rocmon_sdk_getNumberOfEvents(int groupId) -{ - return 0; -} - - -int -rocmon_sdk_getNumberOfMetrics(int groupId) -{ - return 0; -} - - -double -rocmon_sdk_getTimeOfGroup(int groupId) -{ - return 0; -} - - -double -rocmon_sdk_getLastTimeOfGroup(int groupId) -{ - return 0; -} - - -double -rocmon_sdk_getTimeToLastReadOfGroup(int groupId) -{ - return 0; -} - - -char* -rocmon_sdk_getEventName(int groupId, int eventId) -{ - return NULL; -} - - -char* -rocmon_sdk_getCounterName(int groupId, int eventId) -{ - return 
NULL; -} - - -char* -rocmon_sdk_getMetricName(int groupId, int metricId) -{ - return NULL; -} - - -char* -rocmon_sdk_getGroupName(int groupId) -{ - return NULL; -} - - -char* -rocmon_sdk_getGroupInfoShort(int groupId) -{ - return NULL; -} - - -char* -rocmon_sdk_getGroupInfoLong(int groupId) -{ - return NULL; -} - - -int -rocmon_sdk_getGroups(char*** groups, char*** shortinfos, char*** longinfos) -{ - init_configuration(); - Configuration_t config = get_configuration(); - - return perfgroup_getGroups(config->groupPath, "amd_gpu_sdk", groups, shortinfos, longinfos); -} - - -int -rocmon_sdk_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos) -{ - perfgroup_returnGroups(nrgroups, groups, shortinfos, longinfos); -} - - -#endif /* LIKWID_WITH_ROCMON */ diff --git a/src/rocmon_v1.c b/src/rocmon_v1.c deleted file mode 100644 index 31ff459e8..000000000 --- a/src/rocmon_v1.c +++ /dev/null @@ -1,2275 +0,0 @@ - /* ======================================================================================= - * - * Filename: rocmon_v1.c - * - * Description: Main implementation of the performance monitoring module - * for AMD GPUs with ROCm < 6.2 - * - * Version: - * Released: - * - * Author: Thomas Gruber (tg), thomas.roehl@googlemail.com - * Project: likwid - * - * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg - * - * This program is free software: you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free Software - * Foundation, either version 3 of the License, or (at your option) any later - * version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A - * PARTICULAR PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . 
- * - * ======================================================================================= - */ -#ifdef LIKWID_WITH_ROCMON - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include - - - - -// #include -// #include -// #include - -// Variables -static void *dl_hsa_lib = NULL; -static void *dl_profiler_lib = NULL; -static void *dl_rsmi_lib = NULL; - -RocmonContext *rocmon_context = NULL; -static bool rocmon_initialized = FALSE; -int likwid_rocmon_verbosity = DEBUGLEV_ONLY_ERROR; - -// Macros -#define membersize(type, member) sizeof(((type *) NULL)->member) -#define FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } -#define ROCM_CALL( call, args, handleerror ) \ - do { \ - hsa_status_t _status = (*call##_ptr)args; \ - if (_status != HSA_STATUS_SUCCESS && _status != HSA_STATUS_INFO_BREAK) { \ - const char* err = NULL; \ - fprintf(stderr, "Error: function %s failed with error %d\n", #call, _status); \ - rocprofiler_error_string(&err); \ - fprintf(stderr, "Error: %s\n", err); \ - handleerror; \ - } \ - } while (0) - -#define RSMI_CALL( call, args, handleerror ) \ - do { \ - rsmi_status_t _status = (*call##_ptr)args; \ - if (_status != RSMI_STATUS_SUCCESS) { \ - fprintf(stderr, "Error: function %s failed with error %d.\n", #call, _status); \ - handleerror; \ - } \ - } while (0) - -// ROCm function declarations -#define ROCMWEAK __attribute__(( weak )) -#define DECLAREFUNC_HSA(funcname, funcsig) hsa_status_t ROCMWEAK funcname funcsig; hsa_status_t ( *funcname##_ptr ) funcsig; -#define DECLAREFUNC_SMI(funcname, funcsig) rsmi_status_t ROCMWEAK funcname funcsig; rsmi_status_t ( *funcname##_ptr ) funcsig; - -DECLAREFUNC_HSA(hsa_init, ()); -DECLAREFUNC_HSA(hsa_shut_down, ()); -DECLAREFUNC_HSA(hsa_iterate_agents, (hsa_status_t (*callback)(hsa_agent_t agent, void* data), void* data)); -DECLAREFUNC_HSA(hsa_agent_get_info, (hsa_agent_t agent, hsa_agent_info_t 
attribute, void* value)); -DECLAREFUNC_HSA(hsa_system_get_info, (hsa_system_info_t attribute, void *value)); - -DECLAREFUNC_HSA(rocprofiler_iterate_info, (const hsa_agent_t* agent, rocprofiler_info_kind_t kind, hsa_status_t (*callback)(const rocprofiler_info_data_t, void* data), void* data)); -DECLAREFUNC_HSA(rocprofiler_close, (rocprofiler_t* context)); -DECLAREFUNC_HSA(rocprofiler_open, (hsa_agent_t agent, rocprofiler_feature_t* features, uint32_t feature_count, rocprofiler_t** context, uint32_t mode, rocprofiler_properties_t* properties)); -DECLAREFUNC_HSA(rocprofiler_error_string, ()); -DECLAREFUNC_HSA(rocprofiler_start, (rocprofiler_t* context, uint32_t group_index)); -DECLAREFUNC_HSA(rocprofiler_stop, (rocprofiler_t* context, uint32_t group_index)); -DECLAREFUNC_HSA(rocprofiler_read, (rocprofiler_t* context, uint32_t group_index)); -DECLAREFUNC_HSA(rocprofiler_get_data, (rocprofiler_t* context, uint32_t group_index)); -DECLAREFUNC_HSA(rocprofiler_get_metrics, (const rocprofiler_t* context)); - -DECLAREFUNC_SMI(rsmi_init, (uint64_t flags)); -DECLAREFUNC_SMI(rsmi_shut_down, ()); -DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_open, (uint32_t dv_ind, rsmi_func_id_iter_handle_t* handle)); -DECLAREFUNC_SMI(rsmi_dev_supported_variant_iterator_open, (rsmi_func_id_iter_handle_t obj_h, rsmi_func_id_iter_handle_t* var_iter)); -DECLAREFUNC_SMI(rsmi_func_iter_value_get, (rsmi_func_id_iter_handle_t handle, rsmi_func_id_value_t* value )); -DECLAREFUNC_SMI(rsmi_func_iter_next, (rsmi_func_id_iter_handle_t handle)); -DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_close, (rsmi_func_id_iter_handle_t* handle)); -DECLAREFUNC_SMI(rsmi_dev_power_ave_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* power)); -DECLAREFUNC_SMI(rsmi_dev_pci_throughput_get, (uint32_t dv_ind, uint64_t* sent, uint64_t* received, uint64_t* max_pkt_sz)); -DECLAREFUNC_SMI(rsmi_dev_pci_replay_counter_get, (uint32_t dv_ind, uint64_t* counter)); -DECLAREFUNC_SMI(rsmi_dev_memory_total_get, (uint32_t 
dv_ind, rsmi_memory_type_t mem_type, uint64_t* total)); -DECLAREFUNC_SMI(rsmi_dev_memory_usage_get, (uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t* used )); -DECLAREFUNC_SMI(rsmi_dev_memory_busy_percent_get, (uint32_t dv_ind, uint32_t* busy_percent)); -DECLAREFUNC_SMI(rsmi_dev_memory_reserved_pages_get, (uint32_t dv_ind, uint32_t* num_pages, rsmi_retired_page_record_t* records)); -DECLAREFUNC_SMI(rsmi_dev_fan_rpms_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); -DECLAREFUNC_SMI(rsmi_dev_fan_speed_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); -DECLAREFUNC_SMI(rsmi_dev_fan_speed_max_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* max_speed)); -DECLAREFUNC_SMI(rsmi_dev_temp_metric_get, (uint32_t dv_ind, uint32_t sensor_type, rsmi_temperature_metric_t metric, int64_t* temperature)); -DECLAREFUNC_SMI(rsmi_dev_volt_metric_get, (uint32_t dv_ind, rsmi_voltage_type_t sensor_type, rsmi_voltage_metric_t metric, int64_t* voltage)); -DECLAREFUNC_SMI(rsmi_dev_overdrive_level_get, (uint32_t dv_ind, uint32_t* od)); -DECLAREFUNC_SMI(rsmi_dev_ecc_count_get, (uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_error_count_t* ec)); -DECLAREFUNC_SMI(rsmi_compute_process_info_get, (rsmi_process_info_t* procs, uint32_t* num_items)); - - -// ---------------------------------------------------- -// SMI event wrapper -// ---------------------------------------------------- - -static int -_smi_wrapper_pci_throughput_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t value; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _smi_wrapper_pci_throughput_get(%d, %d), deviceId, event->extra); - // Internal variant: 0 for sent, 1 for received bytes and 2 for max packet size - if (event->extra == 0) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, &value, NULL, NULL), return -1); - else if (event->extra == 1) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, &value, NULL), return -1); - else if (event->extra == 2) 
RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, NULL, &value), return -1); - else return -1; - - result->fullValue += value; - result->lastValue = value; - - return 0; -} - - -static int -_smi_wrapper_pci_replay_counter_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t counter; - RSMI_CALL(rsmi_dev_pci_replay_counter_get, (deviceId, &counter), return -1); - result->fullValue += counter; - result->lastValue = counter; - - return 0; -} - - -static int -_smi_wrapper_power_ave_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t power; - RSMI_CALL(rsmi_dev_power_ave_get, (deviceId, event->subvariant, &power), return -1); - result->fullValue += power; - result->lastValue = power; - - return 0; -} - - -static int -_smi_wrapper_memory_total_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t total; - RSMI_CALL(rsmi_dev_memory_total_get, (deviceId, event->variant, &total), return -1); - result->fullValue += total; - result->lastValue = total; - - return 0; -} - - -static int -_smi_wrapper_memory_usage_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t used; - RSMI_CALL(rsmi_dev_memory_usage_get, (deviceId, event->variant, &used), return -1); - result->fullValue += used; - result->lastValue = used; - - return 0; -} - - -static int -_smi_wrapper_memory_busy_percent_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t percent; - RSMI_CALL(rsmi_dev_memory_busy_percent_get, (deviceId, &percent), return -1); - result->fullValue += percent; - result->lastValue = percent; - - return 0; -} - - -static int -_smi_wrapper_memory_reserved_pages_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t num_pages; - RSMI_CALL(rsmi_dev_memory_reserved_pages_get, (deviceId, &num_pages, NULL), return -1); - result->fullValue += num_pages; - result->lastValue = num_pages; - - return 0; -} - - -static int 
-_smi_wrapper_fan_rpms_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t speed; - RSMI_CALL(rsmi_dev_fan_rpms_get, (deviceId, event->subvariant, &speed), return -1); - result->fullValue += speed; - result->lastValue = speed; - - return 0; -} - - -static int -_smi_wrapper_fan_speed_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t speed; - RSMI_CALL(rsmi_dev_fan_speed_get, (deviceId, event->subvariant, &speed), return -1); - result->fullValue += speed; - result->lastValue = speed; - - return 0; -} - - -static int -_smi_wrapper_fan_speed_max_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t max_speed; - RSMI_CALL(rsmi_dev_fan_speed_max_get, (deviceId, event->subvariant, &max_speed), return -1); - result->fullValue += max_speed; - result->lastValue = max_speed; - - return 0; -} - - -static int -_smi_wrapper_temp_metric_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t temperature; - RSMI_CALL(rsmi_dev_temp_metric_get, (deviceId, event->subvariant, event->variant, &temperature), return -1); - result->fullValue += temperature; - result->lastValue = temperature; - - return 0; -} - - -static int -_smi_wrapper_volt_metric_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t voltage; - RSMI_CALL(rsmi_dev_volt_metric_get, (deviceId, event->subvariant, event->variant, &voltage), return -1); - result->fullValue += voltage; - result->lastValue = voltage; - - return 0; -} - - -static int -_smi_wrapper_overdrive_level_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t overdrive; - RSMI_CALL(rsmi_dev_overdrive_level_get, (deviceId, &overdrive), return -1); - result->fullValue += overdrive; - result->lastValue = overdrive; - - return 0; -} - - -static int -_smi_wrapper_ecc_count_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - rsmi_error_count_t error_count; - 
RSMI_CALL(rsmi_dev_ecc_count_get, (deviceId, event->variant, &error_count), return -1); - - if (event->extra == 0) - { - result->lastValue = error_count.correctable_err - result->fullValue; - result->fullValue = error_count.correctable_err; - } - else if (event->extra == 1) - { - result->lastValue = error_count.uncorrectable_err - result->fullValue; - result->fullValue = error_count.uncorrectable_err; - } - else - { - return -1; - } - - return 0; -} - - -static int -_smi_wrapper_compute_process_info_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t num_items; - RSMI_CALL(rsmi_compute_process_info_get, (NULL, &num_items), return -1); - result->fullValue += num_items; - result->lastValue = num_items; - - return 0; -} - - -// ---------------------------------------------------- -// Rocmon helper functions -// ---------------------------------------------------- - -static int -_rocmon_link_libraries() -{ - #define DLSYM_AND_CHECK( dllib, name ) name##_ptr = dlsym( dllib, #name ); if ( dlerror() != NULL ) { ERROR_PRINT(Failed to link #name); return -1; } - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm libraries); - - // Need to link in the ROCm HSA libraries - dl_hsa_lib = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL); - if (!dl_hsa_lib) - { - ERROR_PRINT(ROCm HSA library libhsa-runtime64.so not found: %s, dlerror()); - return -1; - } - - // Need to link in the Rocprofiler libraries - dl_profiler_lib = dlopen("librocprofiler64.so", RTLD_NOW | RTLD_GLOBAL); - if (!dl_profiler_lib) - { - dl_profiler_lib = dlopen("librocprofiler64.so.1", RTLD_NOW | RTLD_GLOBAL); - if (!dl_profiler_lib) - { - ERROR_PRINT(Rocprofiler library librocprofiler64.so not found: %s, dlerror()); - return -1; - } - } - - // Need to link in the Rocprofiler libraries - dl_rsmi_lib = dlopen("librocm_smi64.so", RTLD_NOW | RTLD_GLOBAL); - if (!dl_rsmi_lib) - { - ERROR_PRINT(ROCm SMI library librocm_smi64.so not found: %s, dlerror()); - return -1; - } - - // 
Link HSA functions - DLSYM_AND_CHECK(dl_hsa_lib, hsa_init); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_shut_down); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_iterate_agents); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_agent_get_info); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_system_get_info); - - // Link Rocprofiler functions - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_iterate_info); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_close); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_open); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_error_string); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_start); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_stop); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_read); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_get_data); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_get_metrics); - - // Link SMI functions - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_init); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_shut_down); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_func_iterator_open); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_variant_iterator_open); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_func_iter_value_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_func_iter_next); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_func_iterator_close); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_power_ave_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_pci_throughput_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_pci_replay_counter_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_total_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_usage_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_busy_percent_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_reserved_pages_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_rpms_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_speed_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_speed_max_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_temp_metric_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_volt_metric_get); - 
DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_overdrive_level_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_ecc_count_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_compute_process_info_get); - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm libraries done); - return 0; -} - -typedef struct { - RocmonContext* context; - int numGpus; - const int* gpuIds; -} iterate_agents_cb_arg; - -typedef struct { - RocmonDevice* device; - int currIndex; -} iterate_info_cb_arg; - - -static hsa_status_t -_rocmon_iterate_info_callback_count(const rocprofiler_info_data_t info, void* data) -{ - RocmonDevice* device = (RocmonDevice*) data; - if (device) { - device->numRocMetrics++; - } - return HSA_STATUS_SUCCESS; -} - -static void -_rocmon_print_rocprofiler_info_data(const rocprofiler_info_data_t info) -{ - if (info.kind != ROCPROFILER_INFO_KIND_METRIC) - { - return; - } - printf("Name '%s':\n", info.metric.name); - printf("\tKind: '%s'\n", (info.kind == ROCPROFILER_INFO_KIND_METRIC ? "Metric" : "Trace")); - printf("\tInstances: %d\n", info.metric.instances); - printf("\tDescription: '%s'\n", info.metric.description); - printf("\tExpression: '%s'\n", info.metric.expr); - printf("\tBlockName: '%s'\n", info.metric.block_name); - printf("\tBlockCounters: %d\n", info.metric.block_counters); -} - -static hsa_status_t -_rocmon_iterate_info_callback_add(const rocprofiler_info_data_t info, void* data) -{ - iterate_info_cb_arg* arg = (iterate_info_cb_arg*) data; - - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _rocmon_iterate_info_callback_add); - if (likwid_rocmon_verbosity == DEBUGLEV_DEVELOP) - { - _rocmon_print_rocprofiler_info_data(info); - } - // Check info kind - if (info.kind != ROCPROFILER_INFO_KIND_METRIC) - { - ERROR_PRINT(Wrong info kind %u, info.kind); - return HSA_STATUS_ERROR; - } - - // Check index - if (arg->currIndex >= arg->device->numRocMetrics) - { - ERROR_PRINT(Metric index out of bounds: %d, arg->currIndex); - return HSA_STATUS_ERROR; - } - - // Copy info data - 
rocprofiler_info_data_t* target_info = &arg->device->rocMetrics[arg->currIndex]; - memcpy(target_info, &info, sizeof(rocprofiler_info_data_t)); - arg->currIndex++; - - return HSA_STATUS_SUCCESS; -} - - -static hsa_status_t -_rocmon_iterate_agents_callback(hsa_agent_t agent, void* argv) -{ - // Count number of callback invocations as the devices id - static int nextDeviceId = 0; - int deviceId = nextDeviceId; - bool noAgent = false; - - iterate_agents_cb_arg *arg = (iterate_agents_cb_arg*) argv; - - // Check if device is a GPU - hsa_device_type_t type; - ROCM_CALL(hsa_agent_get_info, (agent, HSA_AGENT_INFO_DEVICE, &type), return -1); - if (type != HSA_DEVICE_TYPE_GPU) - { - return HSA_STATUS_SUCCESS; - } - nextDeviceId++; - - // Check if device is includes in arg->gpuIds - int gpuIndex = -1; - for (int i = 0; i < arg->numGpus; i++) - { - if (deviceId == arg->gpuIds[i]) - { - gpuIndex = i; - break; - } - } - if (gpuIndex < 0) - { - return HSA_STATUS_SUCCESS; - } - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing agent %d, gpuIndex); - - // Add agent to context - RocmonDevice *device = &arg->context->devices[gpuIndex]; - device->deviceId = deviceId; - device->hsa_agent = agent; - device->context = NULL; - device->numActiveRocEvents = 0; - device->activeRocEvents = NULL; - device->numGroupResults = 0; - device->groupResults = NULL; - - // Get number of available metrics - device->numRocMetrics = 0; - ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_count, device), return HSA_STATUS_ERROR); - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, RocProfiler provides %d events, device->numRocMetrics); - - // workaround for bug in ROCm 5.4.0 - if(device->numRocMetrics == 0) { - ROCM_CALL(rocprofiler_iterate_info, (NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_count, device), return HSA_STATUS_ERROR); - noAgent = true; - } - - // Allocate memory for metrics - device->rocMetrics = (rocprofiler_info_data_t*) 
malloc(device->numRocMetrics * sizeof(rocprofiler_info_data_t)); - if (device->rocMetrics == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate set of rocMetrics); - return HSA_STATUS_ERROR; - } - - // Initialize SMI events map - if (init_map(&device->smiMetrics, MAP_KEY_TYPE_STR, 0, &free) < 0) - { - ERROR_PLAIN_PRINT(Cannot init smiMetrics map); - return HSA_STATUS_ERROR; - } - - // Fetch metric informatino - iterate_info_cb_arg info_arg = { - .device = device, - .currIndex = 0, - }; - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, Read %d RocProfiler events for device %d, device->numRocMetrics, device->deviceId); - - // If the call fails with agent, call rocprofiler_iterate_info without agent - if(noAgent) - { - ROCM_CALL(rocprofiler_iterate_info, (NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_add, &info_arg), return HSA_STATUS_ERROR); - } else { - ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_add, &info_arg), return HSA_STATUS_ERROR); - } - - return HSA_STATUS_SUCCESS; -} - - -static int -_rocmon_parse_eventstring(const char* eventString, GroupInfo* group) -{ - int err = 0; - Configuration_t config = get_configuration(); - bstring eventBString = bfromcstr(eventString); - - if (bstrchrp(eventBString, ':', 0) != BSTR_ERR) - { - // If custom group -> perfgroup_customGroup - err = perfgroup_customGroup(eventString, group); - if (err < 0) - { - ERROR_PRINT(Cannot transform %s to performance group, eventString); - return err; - } - } - else - { - // If performance group -> perfgroup_readGroup - err = perfgroup_readGroup(config->groupPath, "amd_gpu", eventString, group); - if (err == -EACCES) - { - ERROR_PRINT(Access to performance group %s not allowed, eventString); - return err; - } - else if (err == -ENODEV) - { - ERROR_PRINT(Performance group %s only available with deactivated HyperThreading, eventString); - return err; - } - if (err < 0) - { - ERROR_PRINT(Cannot read performance group %s, eventString); 
- return err; - } - } - - return 0; -} - - -static int -_rocmon_get_timestamp(uint64_t* timestamp_ns) -{ - uint64_t timestamp; - - // Get timestamp from system - ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP, ×tamp), return -1); - // Convert to nanoseconds - *timestamp_ns = (uint64_t)((long double)timestamp * rocmon_context->hsa_timestamp_factor); - - return 0; -} - - -static int -_rocmon_getLastResult(RocmonDevice* device, int eventId, double* value) -{ - rocprofiler_data_t* data = &device->activeRocEvents[eventId].data; - - switch (data->kind) - { - case ROCPROFILER_DATA_KIND_INT32: - *value = (double) data->result_int32; - break; - case ROCPROFILER_DATA_KIND_INT64: - *value = (double) data->result_int64; - break; - case ROCPROFILER_DATA_KIND_FLOAT: - *value = (double) data->result_float; - break; - case ROCPROFILER_DATA_KIND_DOUBLE: - *value = data->result_double; - break; - - case ROCPROFILER_DATA_KIND_BYTES: - case ROCPROFILER_DATA_KIND_UNINIT: - default: - return -1; - } - - return 0; -} - - -static int -_rocmon_readCounters_rocprofiler(RocmonDevice* device) -{ - int ret; - - // Check if there are any counters to start - if (device->numActiveRocEvents <= 0) - { - return 0; - } - - if (!device->context) - { - return 0; - } - - ROCM_CALL(rocprofiler_read, (device->context, 0), return -1); - ROCM_CALL(rocprofiler_get_data, (device->context, 0), return -1); - ROCM_CALL(rocprofiler_get_metrics, (device->context), return -1); - - // Update results - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveRocEvents; i++) - { - RocmonEventResult* result = &groupResult->results[i]; - - // Read value - ret = _rocmon_getLastResult(device, i, &result->fullValue); - if (ret < 0) - { - return -1; - } - - // Calculate delta since last read - result->lastValue = result->fullValue - result->lastValue; - } - - return 0; -} - - -static int -_rocmon_readCounters_smi(RocmonDevice* device) -{ - // 
Check if there are any counters to start - if (device->numActiveSmiEvents <= 0) - { - return 0; - } - - // Save baseline values - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveSmiEvents; i++) - { - double value = 0; - RocmonSmiEvent* event = &device->activeSmiEvents[i]; - RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; - - // Measure counter - if (event->measureFunc) - { - event->measureFunc(device->deviceId, event, result); - } - } - - return 0; -} - - -static int -_rocmon_readCounters(uint64_t* (*getDestTimestampFunc)(RocmonDevice* device)) -{ - int ret; - - // Get timestamp - uint64_t timestamp; - if (ret = _rocmon_get_timestamp(×tamp)) - { - return ret; - } - - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - - // Save timestamp - if (getDestTimestampFunc) - { - uint64_t* timestampDest = getDestTimestampFunc(device); - if (timestampDest) - { - *timestampDest = timestamp; - } - } - - // Read rocprofiler counters - ret = _rocmon_readCounters_rocprofiler(device); - if (ret < 0) return ret; - - // Read SMI counters - ret = _rocmon_readCounters_smi(device); - if (ret < 0) return ret; - } - - return 0; -} - - -static uint64_t* -_rocmon_get_read_time(RocmonDevice* device) -{ - return &device->time.read; -} - - -static uint64_t* -_rocmon_get_stop_time(RocmonDevice* device) -{ - return &device->time.stop; -} - - -// ---------------------------------------------------- -// Rocmon SMI helper functions -// ---------------------------------------------------- - -static bstring -_rocmon_smi_build_label(RocmonSmiEventType type, const char* funcname, uint64_t variant, uint64_t subvariant) -{ - switch (type) - { - case ROCMON_SMI_EVENT_TYPE_NORMAL: - return bfromcstr(funcname); - case ROCMON_SMI_EVENT_TYPE_VARIANT: - return bformat("%s|%" PRIu64, funcname, variant); - case 
ROCMON_SMI_EVENT_TYPE_SUBVARIANT: - return bformat("%s|%" PRIu64 "|%" PRIu64, funcname, variant, subvariant); - case ROCMON_SMI_EVENT_TYPE_INSTANCES: - return bfromcstr(funcname); - } -} - - -static int -_rocmon_smi_add_event_to_device(RocmonDevice* device, const char* funcname, RocmonSmiEventType type, int64_t variant, uint64_t subvariant) -{ - int ret; - - // Get event by label - RocmonSmiEventList* list = NULL; - bstring label = _rocmon_smi_build_label(type, funcname, variant, subvariant); - ret = get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list); - bdestroy(label); - if (ret < 0) - { - // Event not registered -> ignore - return 0; - } - - // For events with multiple sensor, only make one entry -> find if one exists - if (type == ROCMON_SMI_EVENT_TYPE_INSTANCES && subvariant > 0) - { - // Get list from map - for (int i = 0; i < list->numEntries; i++) - { - RocmonSmiEvent* event = &list->entries[i]; - RocmonSmiEvent* existingEvent = NULL; - ret = get_smap_by_key(device->smiMetrics, event->name, (void**)&existingEvent); - if (ret < 0) - { - ERROR_PRINT(Failed to find previous instance for event %s, event->name); - return -1; - } - - // Update instance information - existingEvent->instances++; - } - return 0; - } - - for (int i = 0; i < list->numEntries; i++) - { - RocmonSmiEvent* event = &list->entries[i]; - - // Allocate memory for device event description - RocmonSmiEvent* tmpEvent = (RocmonSmiEvent*) malloc(sizeof(RocmonSmiEvent)); - if (tmpEvent == NULL) - { - ERROR_PRINT(Failed to allocate memory for SMI event in device list %s, event->name); - return -ENOMEM; - } - - // Copy information from global description - memcpy(tmpEvent, event, sizeof(RocmonSmiEvent)); - tmpEvent->variant = variant; - tmpEvent->subvariant = subvariant; - tmpEvent->instances = 1; - - // Save event info to device event map - add_smap(device->smiMetrics, tmpEvent->name, tmpEvent); - } - - return 0; -} - - -static int 
-_rocmon_smi_get_function_subvariants(RocmonDevice* device, const char* funcname, uint64_t variant, rsmi_func_id_iter_handle_t var_iter) -{ - rsmi_func_id_iter_handle_t sub_var_iter; - rsmi_func_id_value_t value; - rsmi_status_t status; - int ret; - - // Get open subvariants iterator - status = (*rsmi_dev_supported_variant_iterator_open_ptr)(var_iter, &sub_var_iter); - if (status == RSMI_STATUS_NO_DATA) - { - // No subvariants - ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_VARIANT, variant, 0); - if (ret < 0) return -1; - return 0; - } - - // Subvariants available -> iterate them - do { - // Get subvariant information - (*rsmi_func_iter_value_get_ptr)(sub_var_iter, &value); - - // Process info - if (variant == RSMI_DEFAULT_VARIANT) - ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_INSTANCES, variant, value.id); - else - ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, variant, value.id); - if (ret < 0) return ret; - - // Advance iterator - status = (*rsmi_func_iter_next_ptr)(sub_var_iter); - } while (status != RSMI_STATUS_NO_DATA); - - // Close iterator - (*rsmi_dev_supported_func_iterator_close_ptr)(&sub_var_iter); - - return 0; -} - - -static int -_rocmon_smi_get_function_variants(RocmonDevice* device, const char* funcname, rsmi_func_id_iter_handle_t iter_handle) -{ - rsmi_func_id_iter_handle_t var_iter; - rsmi_func_id_value_t value; - rsmi_status_t status; - int ret; - - // Get open variants iterator - status = (*rsmi_dev_supported_variant_iterator_open_ptr)(iter_handle, &var_iter); - if (status == RSMI_STATUS_NO_DATA) - { - // No variants - ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0); - if (ret < 0) return -1; - return 0; - } - - // Variants available -> iterate them - do { - // Get variant information - (*rsmi_func_iter_value_get_ptr)(var_iter, &value); - - // Get function subvariants - ret = 
_rocmon_smi_get_function_subvariants(device, funcname, value.id, var_iter); - if (ret < 0) return -1; - - // Advance iterator - status = (*rsmi_func_iter_next_ptr)(var_iter); - } while (status != RSMI_STATUS_NO_DATA); - - // Close iterator - (*rsmi_dev_supported_func_iterator_close_ptr)(&var_iter); - - return 0; -} - - -static int -_rocmon_smi_get_functions(RocmonDevice* device) -{ - rsmi_func_id_iter_handle_t iter_handle; - rsmi_func_id_value_t value; - rsmi_status_t status; - int ret; - - // Open iterator - //(*rsmi_dev_supported_func_iterator_open_ptr)(device->deviceId, &iter_handle); - RSMI_CALL(rsmi_dev_supported_func_iterator_open, (device->deviceId, &iter_handle), { - return -1; - }); - - do - { - // Get function information - //(*rsmi_func_iter_value_get_ptr)(iter_handle, &value); - RSMI_CALL(rsmi_func_iter_value_get, (iter_handle, &value), { - ERROR_PRINT(Failed to get smi function value for device %d, device->deviceId); - RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); - return -1; - }); - - // Get function variants - ret = _rocmon_smi_get_function_variants(device, value.name, iter_handle); - if (ret < 0) - { - ERROR_PRINT(Failed to get smi function variants for device %d, device->deviceId); - RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); - return -1; - } - - // Advance iterator (cannot use RSMI_CALL macro here because we have an assignment, - // so we check that the function pointer exists to avoid segfaults.) 
- if (rsmi_func_iter_next_ptr) { - status = (*rsmi_func_iter_next_ptr)(iter_handle); - } - } while (status != RSMI_STATUS_NO_DATA); - - // Close iterator - //(*rsmi_dev_supported_func_iterator_close_ptr)(&iter_handle); - RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); - - // Add device independent functions - ret = _rocmon_smi_add_event_to_device(device, "rsmi_compute_process_info_get", ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0); - if (ret < 0) return -1; - - return 0; -} - -#define ADD_SMI_EVENT(name, type, smifunc, variant, subvariant, extra, measurefunc) if (_rocmon_smi_add_event_to_map(name, type, smifunc, variant, subvariant, extra, measurefunc) < 0) { return -1; } -#define ADD_SMI_EVENT_N(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_NORMAL, smifunc, 0, 0, extra, measurefunc) -#define ADD_SMI_EVENT_V(name, smifunc, variant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_VARIANT, smifunc, variant, 0, extra, measurefunc) -#define ADD_SMI_EVENT_S(name, smifunc, variant, subvariant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, smifunc, variant, subvariant, extra, measurefunc) -#define ADD_SMI_EVENT_I(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_INSTANCES, smifunc, 0, 0, extra, measurefunc) - -static int -_rocmon_smi_add_event_to_map(char* name, RocmonSmiEventType type, char* smifunc, uint64_t variant, uint64_t subvariant, uint64_t extra, RocmonSmiMeasureFunc measureFunc) -{ - // Add new event list to map (if not already present) - bstring label = _rocmon_smi_build_label(type, smifunc, variant, subvariant); - RocmonSmiEventList* list; - if (get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list) < 0) - { - // Allocate memory for event list - list = (RocmonSmiEventList*) malloc(sizeof(RocmonSmiEventList)); - if (list == NULL) - { - ERROR_PRINT(Failed to allocate memory for SMI event list %s, name); - return -ENOMEM; - } - 
list->entries = NULL; - list->numEntries = 0; - - add_smap(rocmon_context->smiEvents, bdata(label), list); - } - bdestroy(label); - - // Allocate memory for another event in list - list->numEntries++; - list->entries = (RocmonSmiEvent*) realloc(list->entries, list->numEntries * sizeof(RocmonSmiEvent)); - if (list->entries == NULL) - { - ERROR_PRINT(Failed to allocate memory for SMI event %s, name); - return -ENOMEM; - } - - // Set event properties - RocmonSmiEvent* event = &list->entries[list->numEntries-1]; - strncpy(event->name, name, sizeof(event->name)); - event->name[sizeof(event->name)] = '\0'; - event->type = type; - event->variant = variant; - event->subvariant = subvariant; - event->extra = extra; - event->instances = 0; // gets set when scanning supported device functions - event->measureFunc = measureFunc; - - return 0; -} - - -static void -_rcomon_smi_free_event_list(void* vlist) -{ - RocmonSmiEventList* list = (RocmonSmiEventList*)vlist; - if (list) - { - FREE_IF_NOT_NULL(list->entries); - free(list); - } -} - - -static int -_rocmon_smi_init_events() -{ - int ret; - - // Init map - ret = init_map(&rocmon_context->smiEvents, MAP_KEY_TYPE_STR, 0, &_rcomon_smi_free_event_list); - if (ret < 0) - { - ERROR_PRINT(Failed to create map for ROCm SMI events); - return -1; - } - - // Add events - ADD_SMI_EVENT_N("PCI_THROUGHPUT_SENT", "rsmi_dev_pci_throughput_get", 0, &_smi_wrapper_pci_throughput_get ); - ADD_SMI_EVENT_N("PCI_THROUGHPUT_RECEIVED", "rsmi_dev_pci_throughput_get", 1, &_smi_wrapper_pci_throughput_get ); - ADD_SMI_EVENT_N("PCI_THROUGHPUT_MAX_PKT_SZ", "rsmi_dev_pci_throughput_get", 2, &_smi_wrapper_pci_throughput_get ); - ADD_SMI_EVENT_N("PCI_REPLAY_COUNTER", "rsmi_dev_pci_replay_counter_get", 0, &_smi_wrapper_pci_replay_counter_get ); - ADD_SMI_EVENT_I("POWER_AVE", "rsmi_dev_power_ave_get", 0, &_smi_wrapper_power_ave_get ); - ADD_SMI_EVENT_V("MEMORY_TOTAL_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VRAM, 0, &_smi_wrapper_memory_total_get ); - 
ADD_SMI_EVENT_V("MEMORY_TOTAL_VIS_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_total_get ); - ADD_SMI_EVENT_V("MEMORY_TOTAL_GTT", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_total_get ); - ADD_SMI_EVENT_V("MEMORY_USAGE_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VRAM, 0, &_smi_wrapper_memory_usage_get ); - ADD_SMI_EVENT_V("MEMORY_USAGE_VIS_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_usage_get ); - ADD_SMI_EVENT_V("MEMORY_USAGE_GTT", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_usage_get ); - ADD_SMI_EVENT_N("MEMORY_BUSY_PERCENT", "rsmi_dev_memory_busy_percent_get", 0, &_smi_wrapper_memory_busy_percent_get ); - ADD_SMI_EVENT_N("MEMORY_NUM_RESERVED_PAGES", "rsmi_dev_memory_reserved_pages_get", 0, &_smi_wrapper_memory_reserved_pages_get ); - ADD_SMI_EVENT_I("FAN_RPMS", "rsmi_dev_fan_rpms_get", 0, &_smi_wrapper_fan_rpms_get ); - ADD_SMI_EVENT_I("FAN_SPEED", "rsmi_dev_fan_speed_get", 0, &_smi_wrapper_fan_speed_get ); - ADD_SMI_EVENT_I("FAN_SPEED_MAX", "rsmi_dev_fan_speed_max_get", 0, &_smi_wrapper_fan_speed_max_get ); - ADD_SMI_EVENT_S("TEMP_EDGE", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_EDGE, 0, &_smi_wrapper_temp_metric_get ); - ADD_SMI_EVENT_S("TEMP_JUNCTION", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_JUNCTION, 0, &_smi_wrapper_temp_metric_get ); - ADD_SMI_EVENT_S("TEMP_MEMORY", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_MEMORY, 0, &_smi_wrapper_temp_metric_get ); - ADD_SMI_EVENT_S("VOLT_VDDGFX", "rsmi_dev_volt_metric_get", RSMI_VOLT_CURRENT, RSMI_VOLT_TYPE_VDDGFX, 0, &_smi_wrapper_volt_metric_get ); - ADD_SMI_EVENT_N("OVERDRIVE_LEVEL", "rsmi_dev_overdrive_level_get", 0, &_smi_wrapper_overdrive_level_get ); - ADD_SMI_EVENT_V("ECC_COUNT_UMC_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 0, &_smi_wrapper_ecc_count_get ); - 
ADD_SMI_EVENT_V("ECC_COUNT_UMC_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SDMA_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SDMA_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_GFX_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_GFX_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_HDP_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_HDP_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 1, &_smi_wrapper_ecc_count_get ); - 
ADD_SMI_EVENT_V("ECC_COUNT_DF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_DF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SMN_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SMN_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SEM_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SEM_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP0_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP0_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP1_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP1_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_FUSE_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_FUSE_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_LAST_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_LAST_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_N("PROCS_USING_GPU", "rsmi_compute_process_info_get", 0, &_smi_wrapper_compute_process_info_get ); - - return 0; -} - - -int -rocmon_v1_init(int numGpus, const int* gpuIds) -{ - 
hsa_status_t status; - - // check if already initialized - if (rocmon_initialized) - { - return 0; - } - if (rocmon_context != NULL) - { - return -EEXIST; - } - - // Validate arguments - if (numGpus <= 0) - { - ERROR_PRINT(Number of gpus must be greater than 0 but only %d given, numGpus); - return -EINVAL; - } - - // Initialize other parts - init_configuration(); - - // initialize libraries - int ret = _rocmon_link_libraries(); - if (ret < 0) - { - ERROR_PLAIN_PRINT(Failed to initialize libraries); - return ret; - } - - // Allocate memory for context - rocmon_context = (RocmonContext*) malloc(sizeof(RocmonContext)); - if (rocmon_context == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate Rocmon context); - return -ENOMEM; - } - rocmon_context->groups = NULL; - rocmon_context->numGroups = 0; - rocmon_context->numActiveGroups = 0; - - rocmon_context->devices = (RocmonDevice*) malloc(numGpus * sizeof(RocmonDevice)); - rocmon_context->numDevices = numGpus; - if (rocmon_context->devices == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate set of GPUs); - free(rocmon_context); - rocmon_context = NULL; - return -ENOMEM; - } - - // init hsa library - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing HSA); - ROCM_CALL(hsa_init, (), - { - ERROR_PLAIN_PRINT(Failed to init hsa library); - goto rocmon_init_hsa_failed; - }); - - // init rocm smi library - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RSMI); - RSMI_CALL(rsmi_init, (0), - { - ERROR_PLAIN_PRINT(Failed to init rocm_smi); - goto rocmon_init_rsmi_failed; - }); - - // Get hsa timestamp factor - uint64_t frequency_hz; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Getting HSA timestamp factor); - ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &frequency_hz), - { - ERROR_PLAIN_PRINT(Failed to get HSA timestamp factor); - goto rocmon_init_info_agents_failed; - }); - rocmon_context->hsa_timestamp_factor = (long double)1000000000 / (long double)frequency_hz; - - // initialize structures for specified devices 
(fetch ROCm specific info) - iterate_agents_cb_arg arg = { - .context = rocmon_context, - .numGpus = numGpus, - .gpuIds = gpuIds, - }; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Iterating through %d available agents, numGpus); - ROCM_CALL(hsa_iterate_agents, (_rocmon_iterate_agents_callback, &arg), - { - ERROR_PRINT(Error while iterating through available agents); - goto rocmon_init_info_agents_failed; - }); - - // Get available SMI events for devices - _rocmon_smi_init_events(); - for (int i = 0; i < rocmon_context->numDevices; i++) - { - if (_rocmon_smi_get_functions(&rocmon_context->devices[i]) < 0) - { - ERROR_PRINT(Failed to get SMI functions for device %d, rocmon_context->devices[i].deviceId); - goto rocmon_init_info_agents_failed; - } - } - - rocmon_initialized = TRUE; - return 0; -rocmon_init_info_agents_failed: - RSMI_CALL(rsmi_shut_down, (), { - // fall through - }); -rocmon_init_rsmi_failed: - ROCM_CALL(hsa_shut_down, (), { - // fall through - }); -rocmon_init_hsa_failed: - free(rocmon_context->devices); - free(rocmon_context); - rocmon_context = NULL; - return -1; -} - - -void -rocmon_v1_finalize(void) -{ - RocmonContext* context = rocmon_context; - - if (!rocmon_initialized) - { - return; - } - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Finalize LIKWID ROCMON); - - if (context) - { - if (context->devices) - { - // Free each devices fields - for (int i = 0; i < context->numDevices; i++) - { - RocmonDevice* device = &context->devices[i]; - FREE_IF_NOT_NULL(device->rocMetrics); - FREE_IF_NOT_NULL(device->activeRocEvents); - FREE_IF_NOT_NULL(device->activeSmiEvents); - if (device->groupResults) - { - // Free events of event result lists - for (int j = 0; j < device->numGroupResults; j++) - { - FREE_IF_NOT_NULL(device->groupResults[i].results); - } - // Free list - free(device->groupResults); - } - if (device->context) - { - ROCM_CALL(rocprofiler_close, (device->context),); - } - destroy_smap(device->smiMetrics); - } - - free(context->devices); - context->devices = 
NULL; - } - - FREE_IF_NOT_NULL(context->groups); - destroy_smap(context->smiEvents); - - free(context); - context = NULL; - } - - RSMI_CALL(rsmi_shut_down, (), { - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown SMI); - // fall through - }); - ROCM_CALL(hsa_shut_down, (), { - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown HSA); - // fall through - }); -} - - -int -rocmon_v1_addEventSet(const char* eventString, int* gid) -{ - // Check arguments - if (!eventString) - { - return -EINVAL; - } - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Allocate memory for event group if necessary - if (rocmon_context->numActiveGroups == rocmon_context->numGroups) - { - GroupInfo* tmpInfo = (GroupInfo*) realloc(rocmon_context->groups, (rocmon_context->numGroups+1) * sizeof(GroupInfo)); - if (tmpInfo == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate additional group); - return -ENOMEM; - } - rocmon_context->groups = tmpInfo; - rocmon_context->numGroups++; - } - - // Parse event string - int err = _rocmon_parse_eventstring(eventString, &rocmon_context->groups[rocmon_context->numActiveGroups]); - if (err < 0) - { - return err; - } - - // Allocate memory for event results - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - - // Allocate memory for event results - int numEvents = rocmon_context->groups[rocmon_context->numActiveGroups].nevents; - RocmonEventResult* tmpResults = (RocmonEventResult*) malloc(numEvents * sizeof(RocmonEventResult)); - if (tmpResults == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate event results); - return -ENOMEM; - } - - // Allocate memory for new event result list entry - RocmonEventResultList* tmpGroupResults = (RocmonEventResultList*) realloc(device->groupResults, (device->numGroupResults+1) * sizeof(RocmonEventResultList)); - if (tmpGroupResults == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate new event group result list); - return -ENOMEM; - } - - 
device->groupResults = tmpGroupResults; - device->groupResults[device->numGroupResults].results = tmpResults; - device->groupResults[device->numGroupResults].numResults = numEvents; - device->numGroupResults++; - } - - *gid = rocmon_context->numActiveGroups; - rocmon_context->numActiveGroups++; - return 0; -} - - -static int -_rocmon_setupCounters_rocprofiler(RocmonDevice* device, const char** events, int numEvents) -{ - // Close previous rocprofiler context - if (device->context) - { - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Closing previous rocprofiler context); - ROCM_CALL(rocprofiler_close, (device->context), return -1); - } - - // Look if the are any events - if (numEvents <= 0) - { - return 0; - } - - // Create feature array to monitor - rocprofiler_feature_t* features = (rocprofiler_feature_t*) malloc(numEvents * sizeof(rocprofiler_feature_t)); - if (features == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate feature list); - return -ENOMEM; - } - for (int i = 0; i < numEvents; i++) - { - features[i].kind = ROCPROFILER_FEATURE_KIND_METRIC; - features[i].name = events[i]; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP EVENT %d %s, i, events[i]); - } - - // Free previous feature array if present - FREE_IF_NOT_NULL(device->activeRocEvents); - - device->numActiveRocEvents = numEvents; - device->activeRocEvents = features; - - // Open context - rocprofiler_properties_t properties = {}; - properties.queue_depth = 128; - uint32_t mode = ROCPROFILER_MODE_STANDALONE | ROCPROFILER_MODE_CREATEQUEUE | ROCPROFILER_MODE_SINGLEGROUP; - - // Important: only a single profiling group is supported at this time which limits the number of events that can be monitored at a time. 
- ROCM_CALL(rocprofiler_open, (device->hsa_agent, device->activeRocEvents, device->numActiveRocEvents, &device->context, mode, &properties), return -1); - - return 0; -} - - -static int -_rocmon_setupCounters_smi(RocmonDevice* device, const char** events, int numEvents) -{ - int ret; - const int instanceNumLen = 5; - - // Delete previous events - if (device->activeSmiEvents) - { - device->activeSmiEvents = NULL; - device->numActiveSmiEvents = 0; - } - - // Look if the are any events - if (numEvents <= 0) - { - return 0; - } - - // Create event array - RocmonSmiEvent* activeEvents = (RocmonSmiEvent*) malloc(numEvents * sizeof(RocmonSmiEvent)); - if (activeEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate active event list); - return -ENOMEM; - } - - for (int i = 0; i < numEvents; i++) - { - char eventName[membersize(RocmonSmiEvent, name)]; - int instance = -1; - - // Parse event name -> normal event vs one with multiple instances (EVENT[0]) - const char* event = events[i]; - char* instancePart = strrchr(event, '['); - if (instancePart != NULL) - { - char withoutBrackets[instanceNumLen+1]; // +1 is '\0' - int partlen = strlen(instancePart); - - // Check if number fit in 'withoutBrackets' - if (partlen - 2 > instanceNumLen) - { - ERROR_PRINT(Instance number in '%s' is too large, event); - free(activeEvents); - return -EINVAL; - } - - // Copy instance number without brackets - strncpy(withoutBrackets, instancePart+1, partlen-2); - withoutBrackets[instanceNumLen] = '\0'; - - // Parse instance as number - char* endParsed; - instance = strtol(withoutBrackets, &endParsed, 10); - - // Check if parsing was successful - char* endOfString = &withoutBrackets[partlen-2]; - if (endParsed != endOfString) - { - ERROR_PRINT(Failed to parse instance number in '%s', event); - free(activeEvents); - return -EINVAL; - } - - // Copy event name without instance - int eventNameLen = instancePart - event; - strncpy(eventName, event, eventNameLen); - eventName[eventNameLen] = '\0'; - } 
- else - { - // Copy entire event name - strncpy(eventName, event, membersize(RocmonSmiEvent, name)); - } - - // Lookup event in available events - RocmonSmiEvent* metric = NULL; - ret = get_smap_by_key(device->smiMetrics, eventName, (void**)&metric); - if (ret < 0) - { - ERROR_PRINT(RSMI event '%s' not found for device %d, eventName, device->deviceId); - free(activeEvents); - return -EINVAL; - } - - // Copy event - RocmonSmiEvent* tmpEvent = &activeEvents[i]; - memcpy(tmpEvent, metric, sizeof(RocmonSmiEvent)); - - // Check if event supports instances - if (instance >= 0 && tmpEvent->type != ROCMON_SMI_EVENT_TYPE_INSTANCES) - { - ERROR_PRINT(Instance number given but event '%s' does not support one, eventName); - free(activeEvents); - return -EINVAL; - } - - // Check if event requires instances - if (instance < 0 && tmpEvent->type == ROCMON_SMI_EVENT_TYPE_INSTANCES) - { - ERROR_PRINT(No instance number given but event '%s' requires one, eventName); - free(activeEvents); - return -EINVAL; - } - - // Check if event has enough instances - if (instance >= 0 && instance >= metric->instances) - { - ERROR_PRINT(Instance %d seleced but event '%s' has only %d, instance, eventName, metric->instances); - free(activeEvents); - return -EINVAL; - } - - // Set instance number - if (instance >= 0) - { - tmpEvent->subvariant = instance; - } - } - - device->activeSmiEvents = activeEvents; - device->numActiveSmiEvents = numEvents; - - return 0; -} - - -int -rocmon_v1_setupCounters(int gid) -{ - int ret; - - // Check arguments - if (gid < 0 || gid >= rocmon_context->numActiveGroups) - { - return -EINVAL; - } - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Get group info - GroupInfo* group = &rocmon_context->groups[gid]; - - // - // Separate rocprofiler and SMI events - // - const char **smiEvents = NULL, **rocEvents = NULL; - int numSmiEvents = 0, numRocEvents = 0; - - // Allocate memory for string arrays - smiEvents = (const char**) 
malloc(group->nevents * sizeof(const char*)); - if (smiEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate smiEvent name array); - return -ENOMEM; - } - rocEvents = (const char**) malloc(group->nevents * sizeof(const char*)); - if (rocEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate rocEvent name array); - free(smiEvents); - return -ENOMEM; - } - - // Go through each event and sort it - for (int i = 0; i < group->nevents; i++) - { - const char* name = group->events[i]; - if (strncmp(name, "RSMI_", 5) == 0) - { - // RSMI event - smiEvents[numSmiEvents] = name + 5; // +5 removes 'RSMI_' prefix - numSmiEvents++; - } - else if (strncmp(name, "ROCP_", 5) == 0) - { - // Rocprofiler event - rocEvents[numRocEvents] = name + 5; // +5 removes 'ROCP_' prefix - numRocEvents++; - } - else - { - // Unknown event - ERROR_PRINT(Event '%s' has no prefix ('ROCP_' or 'RSMI_'), name); - return -EINVAL; - } - } - - // Add events to each device - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - - // Add rocprofiler events - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCPROFILER WITH %d events, numRocEvents); - ret = _rocmon_setupCounters_rocprofiler(device, rocEvents, numRocEvents); - if (ret < 0) - { - free(smiEvents); - free(rocEvents); - return ret; - } - - // Add SMI events - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCM SMI WITH %d events, numSmiEvents); - ret = _rocmon_setupCounters_smi(device, smiEvents, numSmiEvents); - if (ret < 0) - { - free(smiEvents); - free(rocEvents); - return ret; - } - } - rocmon_context->activeGroup = gid; - - // Cleanup - free(smiEvents); - free(rocEvents); - - return 0; -} - - -static int -_rocmon_startCounters_rocprofiler(RocmonDevice* device) -{ - // Check if there are any counters to start - if (device->numActiveRocEvents <= 0) - { - return 0; - } - - // Reset results - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < 
device->numActiveRocEvents; i++) - { - RocmonEventResult* result = &groupResult->results[i]; - result->lastValue = 0; - result->fullValue = 0; - } - - if (device->context) - { - ROCM_CALL(rocprofiler_start, (device->context, 0), return -1); - } - - return 0; -} - - -static int -_rocmon_startCounters_smi(RocmonDevice* device) -{ - // Check if there are any counters to start - if (device->numActiveSmiEvents <= 0) - { - return 0; - } - - // Save baseline values - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveSmiEvents; i++) - { - double value = 0; - RocmonSmiEvent* event = &device->activeSmiEvents[i]; - RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; - - // Measure counter - if (event->measureFunc) - { - event->measureFunc(device->deviceId, event, result); - } - - // Save value - result->fullValue = 0; - } - - return 0; -} - - -int -rocmon_v1_startCounters(void) -{ - int ret; - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Get timestamp - uint64_t timestamp; - if (ret = _rocmon_get_timestamp(×tamp)) - { - return ret; - } - - // Start counters on each device - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - device->time.start = timestamp; - device->time.read = timestamp; - - // Start rocprofiler events - ret = _rocmon_startCounters_rocprofiler(device); - if (ret < 0) return ret; - - // Start SMI events - _rocmon_startCounters_smi(device); - if (ret < 0) return ret; - } - - return 0; -} - - -static int -_rocmon_stopCounters_rocprofiler(RocmonDevice* device) -{ - if (device->context) - { - // Close context - ROCM_CALL(rocprofiler_stop, (device->context, 0), return -1); - } - - return 0; -} - - -int -rocmon_v1_stopCounters(void) -{ - int ret; - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Read 
counters - ret = _rocmon_readCounters(&_rocmon_get_stop_time); - if (ret < 0) return ret; - - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - - // Stop rocprofiler events - ret = _rocmon_stopCounters_rocprofiler(device); - if (ret < 0) return ret; - - // Nothing to stop for SMI events - } - - return 0; -} - - -int -rocmon_v1_readCounters(void) -{ - int ret; - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Read counters - ret = _rocmon_readCounters(&_rocmon_get_read_time); - if (ret < 0) return ret; - - return 0; -} - - -double -rocmon_v1_getResult(int gpuIdx, int groupId, int eventId) -{ - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Validate gpuIdx - if (gpuIdx < 0 || gpuIdx >= rocmon_context->numDevices) - { - return -EFAULT; - } - - // Validate groupId - RocmonDevice* device = &rocmon_context->devices[gpuIdx]; - if (groupId < 0 || groupId >= device->numGroupResults) - { - return -EFAULT; - } - - // Validate eventId - RocmonEventResultList* groupResult = &device->groupResults[groupId]; - if (eventId < 0 || eventId >= groupResult->numResults) - { - return -EFAULT; - } - - // Return result - return groupResult->results[eventId].fullValue; -} - - -// TODO: multiple groups -double -rocmon_v1_getLastResult(int gpuIdx, int groupId, int eventId) -{ - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Validate gpuIdx - if (gpuIdx < 0 || gpuIdx >= rocmon_context->numDevices) - { - return -EFAULT; - } - - // Validate groupId - RocmonDevice* device = &rocmon_context->devices[gpuIdx]; - if (groupId < 0 || groupId >= device->numGroupResults) - { - return -EFAULT; - } - - // Validate eventId - RocmonEventResultList* groupResult = &device->groupResults[groupId]; - if (eventId < 0 || eventId >= groupResult->numResults) - { - return -EFAULT; - } - - // Return result - return 
groupResult->results[eventId].lastValue; -} - - -int -rocmon_v1_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list) -{ - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Validate args - if (gpuIdx < 0 || gpuIdx > rocmon_context->numDevices) - { - return -EINVAL; - } - if (list == NULL) - { - return -EINVAL; - } - - RocmonDevice* device = &rocmon_context->devices[gpuIdx]; - - // Allocate list structure - EventList_rocm_t tmpList = (EventList_rocm_t) malloc(sizeof(EventList_rocm)); - if (tmpList == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate event list); - return -ENOMEM; - } - - // Get number of events - printf("NUmber of events %d + %d\n", device->numRocMetrics , get_map_size(device->smiMetrics)); - tmpList->numEvents = device->numRocMetrics + get_map_size(device->smiMetrics); - if (tmpList->numEvents == 0) - { - // No events -> return empty list - tmpList->events = NULL; - *list = tmpList; - return 0; - } - - // Allocate event array - tmpList->events = (Event_rocm_t*) malloc(tmpList->numEvents * sizeof(Event_rocm_t)); - if (tmpList->events == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate events for event list); - free(tmpList); - return -ENOMEM; - } - - // Copy rocprofiler event information - for (int i = 0; i < device->numRocMetrics; i++) - { - rocprofiler_info_data_t* event = &device->rocMetrics[i]; - Event_rocm_t* out = &tmpList->events[i]; - int len; - - // Copy name - printf("Name %s\n", event->metric.name); - len = strlen(event->metric.name) + 5 /* Prefix */ + 1 /* NULL byte */; - out->name = (char*) malloc(len); - if (out->name) - { - snprintf(out->name, len, "ROCP_%s", event->metric.name); - } - - // Copy description - len = strlen(event->metric.description) + 1 /* NULL byte */; - out->description = (char*) malloc(len); - if (out->description) - { - snprintf(out->description, len, "%s", event->metric.description); - } - - // Copy instances - out->instances = event->metric.instances; - } - - // Copy ROCm SMI 
metric information - for (int i = 0; i < get_map_size(device->smiMetrics); i++) - { - RocmonSmiEvent* event = NULL; - Event_rocm_t* out = &tmpList->events[device->numRocMetrics + i]; - int len; - - // Get event - if (get_smap_by_idx(device->smiMetrics, i, (void**)&event) < 0) - { - continue; - } - - // Copy name - len = strlen(event->name) + 5 /* Prefix */ + 1 /* NULL byte */; - out->name = (char*) malloc(len); - if (out->name) - { - snprintf(out->name, len, "RSMI_%s", event->name); - } - - // Copy description - char* description = "SMI Event"; // TODO: use real descriptions - len = strlen(description) + 1 /* NULL byte */; - out->description = (char*) malloc(len); - if (out->description) - { - snprintf(out->description, len, "%s", description); - } - - // Copy instances - out->instances = event->instances; - } - - *list = tmpList; - return 0; -} - -void -rocmon_v1_freeEventsOfGpu(EventList_rocm_t list) -{ -#define FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } - - // Check pointer - if (list == NULL) - { - return; - } - - if (list->events != NULL) - { - for (int i = 0; i < list->numEvents; i++) - { - Event_rocm_t* event = &list->events[i]; - FREE_IF_NOT_NULL(event->name); - FREE_IF_NOT_NULL(event->description); - } - free(list->events); - } - free(list); -} - - -int -rocmon_v1_switchActiveGroup(int newGroupId) -{ - int ret; - - ret = rocmon_stopCounters(); - if (ret < 0) - { - return ret; - } - - ret = rocmon_setupCounters(newGroupId); - if (ret < 0) - { - return ret; - } - - ret = rocmon_startCounters(); - if (ret < 0) - { - return ret; - } - - return 0; -} - - -int -rocmon_v1_getNumberOfGroups(void) -{ - if (!rocmon_context || !rocmon_initialized) - { - return -EFAULT; - } - return rocmon_context->numActiveGroups; -} - - -int -rocmon_v1_getIdOfActiveGroup(void) -{ - if (!rocmon_context || !rocmon_initialized) - { - return -EFAULT; - } - return rocmon_context->activeGroup; -} - - -int -rocmon_v1_getNumberOfGPUs(void) -{ - if (!rocmon_context || 
!rocmon_initialized) - { - return -EFAULT; - } - return rocmon_context->numDevices; -} - - -int -rocmon_v1_getNumberOfEvents(int groupId) -{ - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return -EFAULT; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->nevents; -} - - -int -rocmon_v1_getNumberOfMetrics(int groupId) -{ - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) - { - return -EFAULT; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->nmetrics; -} - - -double -rocmon_v1_getTimeOfGroup(int groupId) -{ - int i = 0; - double t = 0; - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) - { - return -EFAULT; - } - for (i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - t = MAX(t, (double)(device->time.stop - device->time.start)); - } - return t*1E-9; -} - - -double -rocmon_v1_getLastTimeOfGroup(int groupId) -{ - int i = 0; - double t = 0; - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) - { - return -EFAULT; - } - for (i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - t = MAX(t, (double)(device->time.stop - device->time.read)); - } - return t*1E-9; -} - - -double -rocmon_v1_getTimeToLastReadOfGroup(int groupId) -{ - int i = 0; - double t = 0; - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) - { - return -EFAULT; - } - for (i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - t = MAX(t, (double)(device->time.read - device->time.start)); - } - return t*1E-9; -} - - -char* -rocmon_v1_getEventName(int groupId, int eventId) -{ - if (!rocmon_context || !rocmon_initialized || 
(groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - if ((eventId < 0) || (eventId >= ginfo->nevents)) - { - return NULL; - } - return ginfo->events[eventId]; -} - - -char* -rocmon_v1_getCounterName(int groupId, int eventId) -{ - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - if ((eventId < 0) || (eventId >= ginfo->nevents)) - { - return NULL; - } - return ginfo->counters[eventId]; -} - - -char* -rocmon_v1_getMetricName(int groupId, int metricId) -{ - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - if ((metricId < 0) || (metricId >= ginfo->nmetrics)) - { - return NULL; - } - return ginfo->metricnames[metricId]; -} - - -char* -rocmon_v1_getGroupName(int groupId) -{ - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->groupname; -} - - -char* -rocmon_v1_getGroupInfoShort(int groupId) -{ - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->shortinfo; -} - - -char* -rocmon_v1_getGroupInfoLong(int groupId) -{ - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->longinfo; -} - - -int -rocmon_v1_getGroups(char*** groups, char*** shortinfos, char*** longinfos) -{ - init_configuration(); - Configuration_t config = get_configuration(); - - return 
perfgroup_getGroups(config->groupPath, "amd_gpu_v1", groups, shortinfos, longinfos); -} - - -int -rocmon_v1_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos) -{ - perfgroup_returnGroups(nrgroups, groups, shortinfos, longinfos); -} - - - -#endif /* LIKWID_WITH_ROCMON */ diff --git a/test/test_rocmon.c b/test/test_rocmon.c new file mode 100644 index 000000000..89df579b0 --- /dev/null +++ b/test/test_rocmon.c @@ -0,0 +1,72 @@ +#include +#include + + +#include + + + + + +int main(int argc, char* argv[]) +{ + int gpuId = 0; + int ret = 0; + int gid = -1; + rocmon_setVerbosity(DEBUGLEV_DEVELOP); + ret = rocmon_init(1, &gpuId); + if (ret < 0) + { + printf("rocmon_init failed with %d\n", ret); + return ret; + } + ret = rocmon_addEventSet("ROCP_SQ_WAVES:ROCM0", &gid); + if (ret < 0) + { + printf("rocmon_addEventSet failed with %d\n", ret); + rocmon_finalize(); + return ret; + } + printf("test_rocmon -- Event set ID %d\n", gid); + ret = rocmon_setupCounters(gid); + if (ret < 0) + { + printf("rocmon_setupCounters failed with %d\n", ret); + rocmon_finalize(); + return ret; + } + ret = rocmon_startCounters(); + if (ret < 0) + { + printf("rocmon_startCounters failed with %d\n", ret); + rocmon_finalize(); + return ret; + } + printf("test_rocmon -- Counters running\n"); + ret = rocmon_readCounters(); + if (ret < 0) + { + printf("rocmon_startCounters failed with %d\n", ret); + rocmon_finalize(); + return ret; + } + printf("test_rocmon -- Counters running\n"); + ret = rocmon_readCounters(); + if (ret < 0) + { + printf("rocmon_startCounters failed with %d\n", ret); + rocmon_finalize(); + return ret; + } + printf("test_rocmon -- Counters running\n"); + ret = rocmon_stopCounters(); + if (ret < 0) + { + printf("rocmon_stopCounters failed with %d\n", ret); + rocmon_finalize(); + return ret; + } + printf("test_rocmon -- Counters stopped\n"); + rocmon_finalize(); + return 0; +} From 47b230e7e11481b6ee8ccae4834e424edc35bfc8 Mon Sep 17 00:00:00 2001 From: Thomas 
Roehl Date: Tue, 8 Oct 2024 21:17:52 +0200 Subject: [PATCH 08/29] Add check for ROCM >= 6.2 --- make/config_checks.mk | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/make/config_checks.mk b/make/config_checks.mk index c84edd4a2..949e38330 100644 --- a/make/config_checks.mk +++ b/make/config_checks.mk @@ -84,7 +84,14 @@ INCLUDES += -I$(CUDAINCLUDE) -I$(CUPTIINCLUDE) endif ifeq ($(strip $(ROCM_INTERFACE)), true) +ROCM_SDK_CHECK := $(shell which rocprofv3 2>/dev/null | wc -l) +ifeq ($(strip $(ROCM_SDK_CHECK)),0) # HSA includes 'hsa/xxx.h' and rocprofiler 'xxx.h' DEFINES += -D__HIP_PLATFORM_AMD__ INCLUDES += -I$(HIPINCLUDE) -I$(HSAINCLUDE) -I$(HSAINCLUDE)/hsa -I$(ROCPROFILERINCLUDE) -I$(RSMIINCLUDE) +else +$(info Compile for ROCm >= 6.2) +DEFINES += -DLIKWID_ROCPROF_SDK +INCLUDES += -I$(ROCPROFILERINCLUDE) -I$(RSMIINCLUDE) +endif endif From 4c877e2e596e02b0915ad80e62bf1d92e1ae15a5 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 8 Oct 2024 21:18:35 +0200 Subject: [PATCH 09/29] Split ROCM backends in 'v1' and 'sdk' --- src/includes/rocmon_v1.h | 64 + .../{rocmon_types.h => rocmon_v1_types.h} | 10 +- src/rocmon.c | 2225 ++-------------- src/rocmon_marker.c | 4 +- src/rocmon_v1.c | 2275 +++++++++++++++++ 5 files changed, 2512 insertions(+), 2066 deletions(-) create mode 100644 src/includes/rocmon_v1.h rename src/includes/{rocmon_types.h => rocmon_v1_types.h} (90%) create mode 100644 src/rocmon_v1.c diff --git a/src/includes/rocmon_v1.h b/src/includes/rocmon_v1.h new file mode 100644 index 000000000..0ea8b70e0 --- /dev/null +++ b/src/includes/rocmon_v1.h @@ -0,0 +1,64 @@ +/* + * ======================================================================================= + * + * Filename: rocmon_v1.h + * + * Description: Header File of rocmon module for ROCm < 6.2. 
+ * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com + * Project: likwid + * + * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + * + * ======================================================================================= + */ +#ifndef LIKWID_ROCMON_V1_H +#define LIKWID_ROCMON_V1_H + +int rocmon_v1_init(int numGpus, const int* gpuIds); +void rocmon_v1_finalize(void); +int rocmon_v1_addEventSet(const char* eventString, int* gid); +int rocmon_v1_setupCounters(int gid); +int rocmon_v1_startCounters(void); +int rocmon_v1_stopCounters(void); +int rocmon_v1_readCounters(void); +double rocmon_v1_getResult(int gpuIdx, int groupId, int eventId); +double rocmon_v1_getLastResult(int gpuIdx, int groupId, int eventId); +int rocmon_v1_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list); +void rocmon_v1_freeEventsOfGpu(EventList_rocm_t list); +int rocmon_v1_switchActiveGroup(int newGroupId); +int rocmon_v1_getNumberOfGroups(void); +int rocmon_v1_getIdOfActiveGroup(void); +int rocmon_v1_getNumberOfGPUs(void); +int rocmon_v1_getNumberOfEvents(int groupId); +int rocmon_v1_getNumberOfMetrics(int groupId); +double rocmon_v1_getTimeOfGroup(int groupId); +double rocmon_v1_getLastTimeOfGroup(int groupId); +double rocmon_v1_getTimeToLastReadOfGroup(int groupId); +char* rocmon_v1_getEventName(int groupId, int eventId); 
+char* rocmon_v1_getCounterName(int groupId, int eventId); +char* rocmon_v1_getMetricName(int groupId, int metricId); +char* rocmon_v1_getGroupName(int groupId); +char* rocmon_v1_getGroupInfoShort(int groupId); +char* rocmon_v1_getGroupInfoLong(int groupId); +int rocmon_v1_getGroups(char*** groups, char*** shortinfos, char*** longinfos); +int rocmon_v1_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos); + + +#endif /* LIKWID_ROCMON_V1_H */ + diff --git a/src/includes/rocmon_types.h b/src/includes/rocmon_v1_types.h similarity index 90% rename from src/includes/rocmon_types.h rename to src/includes/rocmon_v1_types.h index 7af2e1518..a126077de 100644 --- a/src/includes/rocmon_types.h +++ b/src/includes/rocmon_v1_types.h @@ -35,8 +35,16 @@ #include // #include #ifndef ROCPROFILER_VERSION_MAJOR -#include +#ifdef HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE +#undef HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE #endif +#include +#endif +#include +#if AMDSMI_LIB_VERSION_YEAR == 23 && AMDSMI_LIB_VERSION_MAJOR == 4 && AMDSMI_LIB_VERSION_MINOR == 0 && AMDSMI_LIB_VERSION_RELEASE == 0 +typedef struct metrics_table_header_t metrics_table_header_t; +#endif +#include #include typedef struct { diff --git a/src/rocmon.c b/src/rocmon.c index ba7bdf85b..7e552f968 100644 --- a/src/rocmon.c +++ b/src/rocmon.c @@ -45,1816 +45,108 @@ #include #include -#include -#include -#if AMDSMI_LIB_VERSION_YEAR == 23 && AMDSMI_LIB_VERSION_MAJOR == 4 && AMDSMI_LIB_VERSION_MINOR == 0 && AMDSMI_LIB_VERSION_RELEASE == 0 -typedef struct metrics_table_header_t metrics_table_header_t; -#endif -#include - -// #include -// #include -// #include - -// Variables -static void *dl_hsa_lib = NULL; -static void *dl_profiler_lib = NULL; -static void *dl_rsmi_lib = NULL; - -RocmonContext *rocmon_context = NULL; -static bool rocmon_initialized = FALSE; -int likwid_rocmon_verbosity = DEBUGLEV_ONLY_ERROR; - -// Macros -#define membersize(type, member) sizeof(((type *) NULL)->member) -#define 
FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } -#define ROCM_CALL( call, args, handleerror ) \ - do { \ - hsa_status_t _status = (*call##_ptr)args; \ - if (_status != HSA_STATUS_SUCCESS && _status != HSA_STATUS_INFO_BREAK) { \ - const char* err = NULL; \ - fprintf(stderr, "Error: function %s failed with error %d\n", #call, _status); \ - rocprofiler_error_string(&err); \ - fprintf(stderr, "Error: %s\n", err); \ - handleerror; \ - } \ - } while (0) - -#define RSMI_CALL( call, args, handleerror ) \ - do { \ - rsmi_status_t _status = (*call##_ptr)args; \ - if (_status != RSMI_STATUS_SUCCESS) { \ - fprintf(stderr, "Error: function %s failed with error %d.\n", #call, _status); \ - handleerror; \ - } \ - } while (0) - -// ROCm function declarations -#define ROCMWEAK __attribute__(( weak )) -#define DECLAREFUNC_HSA(funcname, funcsig) hsa_status_t ROCMWEAK funcname funcsig; hsa_status_t ( *funcname##_ptr ) funcsig; -#define DECLAREFUNC_SMI(funcname, funcsig) rsmi_status_t ROCMWEAK funcname funcsig; rsmi_status_t ( *funcname##_ptr ) funcsig; - -DECLAREFUNC_HSA(hsa_init, ()); -DECLAREFUNC_HSA(hsa_shut_down, ()); -DECLAREFUNC_HSA(hsa_iterate_agents, (hsa_status_t (*callback)(hsa_agent_t agent, void* data), void* data)); -DECLAREFUNC_HSA(hsa_agent_get_info, (hsa_agent_t agent, hsa_agent_info_t attribute, void* value)); -DECLAREFUNC_HSA(hsa_system_get_info, (hsa_system_info_t attribute, void *value)); - -DECLAREFUNC_HSA(rocprofiler_iterate_info, (const hsa_agent_t* agent, rocprofiler_info_kind_t kind, hsa_status_t (*callback)(const rocprofiler_info_data_t, void* data), void* data)); -DECLAREFUNC_HSA(rocprofiler_close, (rocprofiler_t* context)); -DECLAREFUNC_HSA(rocprofiler_open, (hsa_agent_t agent, rocprofiler_feature_t* features, uint32_t feature_count, rocprofiler_t** context, uint32_t mode, rocprofiler_properties_t* properties)); -DECLAREFUNC_HSA(rocprofiler_error_string, ()); -DECLAREFUNC_HSA(rocprofiler_start, (rocprofiler_t* context, uint32_t group_index)); 
-DECLAREFUNC_HSA(rocprofiler_stop, (rocprofiler_t* context, uint32_t group_index)); -DECLAREFUNC_HSA(rocprofiler_read, (rocprofiler_t* context, uint32_t group_index)); -DECLAREFUNC_HSA(rocprofiler_get_data, (rocprofiler_t* context, uint32_t group_index)); -DECLAREFUNC_HSA(rocprofiler_get_metrics, (const rocprofiler_t* context)); - -DECLAREFUNC_SMI(rsmi_init, (uint64_t flags)); -DECLAREFUNC_SMI(rsmi_shut_down, ()); -DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_open, (uint32_t dv_ind, rsmi_func_id_iter_handle_t* handle)); -DECLAREFUNC_SMI(rsmi_dev_supported_variant_iterator_open, (rsmi_func_id_iter_handle_t obj_h, rsmi_func_id_iter_handle_t* var_iter)); -DECLAREFUNC_SMI(rsmi_func_iter_value_get, (rsmi_func_id_iter_handle_t handle, rsmi_func_id_value_t* value )); -DECLAREFUNC_SMI(rsmi_func_iter_next, (rsmi_func_id_iter_handle_t handle)); -DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_close, (rsmi_func_id_iter_handle_t* handle)); -DECLAREFUNC_SMI(rsmi_dev_power_ave_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* power)); -DECLAREFUNC_SMI(rsmi_dev_pci_throughput_get, (uint32_t dv_ind, uint64_t* sent, uint64_t* received, uint64_t* max_pkt_sz)); -DECLAREFUNC_SMI(rsmi_dev_pci_replay_counter_get, (uint32_t dv_ind, uint64_t* counter)); -DECLAREFUNC_SMI(rsmi_dev_memory_total_get, (uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t* total)); -DECLAREFUNC_SMI(rsmi_dev_memory_usage_get, (uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t* used )); -DECLAREFUNC_SMI(rsmi_dev_memory_busy_percent_get, (uint32_t dv_ind, uint32_t* busy_percent)); -DECLAREFUNC_SMI(rsmi_dev_memory_reserved_pages_get, (uint32_t dv_ind, uint32_t* num_pages, rsmi_retired_page_record_t* records)); -DECLAREFUNC_SMI(rsmi_dev_fan_rpms_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); -DECLAREFUNC_SMI(rsmi_dev_fan_speed_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); -DECLAREFUNC_SMI(rsmi_dev_fan_speed_max_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* 
max_speed)); -DECLAREFUNC_SMI(rsmi_dev_temp_metric_get, (uint32_t dv_ind, uint32_t sensor_type, rsmi_temperature_metric_t metric, int64_t* temperature)); -DECLAREFUNC_SMI(rsmi_dev_volt_metric_get, (uint32_t dv_ind, rsmi_voltage_type_t sensor_type, rsmi_voltage_metric_t metric, int64_t* voltage)); -DECLAREFUNC_SMI(rsmi_dev_overdrive_level_get, (uint32_t dv_ind, uint32_t* od)); -DECLAREFUNC_SMI(rsmi_dev_ecc_count_get, (uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_error_count_t* ec)); -DECLAREFUNC_SMI(rsmi_compute_process_info_get, (rsmi_process_info_t* procs, uint32_t* num_items)); - - -// ---------------------------------------------------- -// SMI event wrapper -// ---------------------------------------------------- - -static int -_smi_wrapper_pci_throughput_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t value; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _smi_wrapper_pci_throughput_get(%d, %d), deviceId, event->extra); - // Internal variant: 0 for sent, 1 for received bytes and 2 for max packet size - if (event->extra == 0) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, &value, NULL, NULL), return -1); - else if (event->extra == 1) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, &value, NULL), return -1); - else if (event->extra == 2) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, NULL, &value), return -1); - else return -1; - - result->fullValue += value; - result->lastValue = value; - - return 0; -} - - -static int -_smi_wrapper_pci_replay_counter_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t counter; - RSMI_CALL(rsmi_dev_pci_replay_counter_get, (deviceId, &counter), return -1); - result->fullValue += counter; - result->lastValue = counter; - - return 0; -} - - -static int -_smi_wrapper_power_ave_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t power; - RSMI_CALL(rsmi_dev_power_ave_get, (deviceId, event->subvariant, &power), return -1); - 
result->fullValue += power; - result->lastValue = power; - - return 0; -} - - -static int -_smi_wrapper_memory_total_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t total; - RSMI_CALL(rsmi_dev_memory_total_get, (deviceId, event->variant, &total), return -1); - result->fullValue += total; - result->lastValue = total; - - return 0; -} - - -static int -_smi_wrapper_memory_usage_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t used; - RSMI_CALL(rsmi_dev_memory_usage_get, (deviceId, event->variant, &used), return -1); - result->fullValue += used; - result->lastValue = used; - - return 0; -} - - -static int -_smi_wrapper_memory_busy_percent_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t percent; - RSMI_CALL(rsmi_dev_memory_busy_percent_get, (deviceId, &percent), return -1); - result->fullValue += percent; - result->lastValue = percent; - - return 0; -} - - -static int -_smi_wrapper_memory_reserved_pages_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t num_pages; - RSMI_CALL(rsmi_dev_memory_reserved_pages_get, (deviceId, &num_pages, NULL), return -1); - result->fullValue += num_pages; - result->lastValue = num_pages; - - return 0; -} - - -static int -_smi_wrapper_fan_rpms_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t speed; - RSMI_CALL(rsmi_dev_fan_rpms_get, (deviceId, event->subvariant, &speed), return -1); - result->fullValue += speed; - result->lastValue = speed; - - return 0; -} - - -static int -_smi_wrapper_fan_speed_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t speed; - RSMI_CALL(rsmi_dev_fan_speed_get, (deviceId, event->subvariant, &speed), return -1); - result->fullValue += speed; - result->lastValue = speed; - - return 0; -} - - -static int -_smi_wrapper_fan_speed_max_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t max_speed; - 
RSMI_CALL(rsmi_dev_fan_speed_max_get, (deviceId, event->subvariant, &max_speed), return -1); - result->fullValue += max_speed; - result->lastValue = max_speed; - - return 0; -} - - -static int -_smi_wrapper_temp_metric_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t temperature; - RSMI_CALL(rsmi_dev_temp_metric_get, (deviceId, event->subvariant, event->variant, &temperature), return -1); - result->fullValue += temperature; - result->lastValue = temperature; - - return 0; -} - - -static int -_smi_wrapper_volt_metric_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t voltage; - RSMI_CALL(rsmi_dev_volt_metric_get, (deviceId, event->subvariant, event->variant, &voltage), return -1); - result->fullValue += voltage; - result->lastValue = voltage; - - return 0; -} - - -static int -_smi_wrapper_overdrive_level_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t overdrive; - RSMI_CALL(rsmi_dev_overdrive_level_get, (deviceId, &overdrive), return -1); - result->fullValue += overdrive; - result->lastValue = overdrive; - - return 0; -} - - -static int -_smi_wrapper_ecc_count_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - rsmi_error_count_t error_count; - RSMI_CALL(rsmi_dev_ecc_count_get, (deviceId, event->variant, &error_count), return -1); - - if (event->extra == 0) - { - result->lastValue = error_count.correctable_err - result->fullValue; - result->fullValue = error_count.correctable_err; - } - else if (event->extra == 1) - { - result->lastValue = error_count.uncorrectable_err - result->fullValue; - result->fullValue = error_count.uncorrectable_err; - } - else - { - return -1; - } - - return 0; -} - - -static int -_smi_wrapper_compute_process_info_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t num_items; - RSMI_CALL(rsmi_compute_process_info_get, (NULL, &num_items), return -1); - result->fullValue += num_items; - 
result->lastValue = num_items; - - return 0; -} - - -// ---------------------------------------------------- -// Rocmon helper functions -// ---------------------------------------------------- - -static int -_rocmon_link_libraries() -{ - #define DLSYM_AND_CHECK( dllib, name ) name##_ptr = dlsym( dllib, #name ); if ( dlerror() != NULL ) { ERROR_PRINT(Failed to link #name); return -1; } - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm libraries); - - // Need to link in the ROCm HSA libraries - dl_hsa_lib = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL); - if (!dl_hsa_lib) - { - ERROR_PRINT(ROCm HSA library libhsa-runtime64.so not found: %s, dlerror()); - return -1; - } - - // Need to link in the Rocprofiler libraries - dl_profiler_lib = dlopen("librocprofiler64.so", RTLD_NOW | RTLD_GLOBAL); - if (!dl_profiler_lib) - { - dl_profiler_lib = dlopen("librocprofiler64.so.1", RTLD_NOW | RTLD_GLOBAL); - if (!dl_profiler_lib) - { - ERROR_PRINT(Rocprofiler library librocprofiler64.so not found: %s, dlerror()); - return -1; - } - } - - // Need to link in the Rocprofiler libraries - dl_rsmi_lib = dlopen("librocm_smi64.so", RTLD_NOW | RTLD_GLOBAL); - if (!dl_rsmi_lib) - { - ERROR_PRINT(ROCm SMI library librocm_smi64.so not found: %s, dlerror()); - return -1; - } - - // Link HSA functions - DLSYM_AND_CHECK(dl_hsa_lib, hsa_init); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_shut_down); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_iterate_agents); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_agent_get_info); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_system_get_info); - - // Link Rocprofiler functions - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_iterate_info); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_close); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_open); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_error_string); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_start); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_stop); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_read); - 
DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_get_data); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_get_metrics); - - // Link SMI functions - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_init); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_shut_down); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_func_iterator_open); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_variant_iterator_open); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_func_iter_value_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_func_iter_next); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_func_iterator_close); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_power_ave_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_pci_throughput_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_pci_replay_counter_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_total_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_usage_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_busy_percent_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_reserved_pages_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_rpms_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_speed_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_speed_max_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_temp_metric_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_volt_metric_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_overdrive_level_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_ecc_count_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_compute_process_info_get); - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm libraries done); - return 0; -} - -typedef struct { - RocmonContext* context; - int numGpus; - const int* gpuIds; -} iterate_agents_cb_arg; - -typedef struct { - RocmonDevice* device; - int currIndex; -} iterate_info_cb_arg; - - -static hsa_status_t -_rocmon_iterate_info_callback_count(const rocprofiler_info_data_t info, void* data) -{ - RocmonDevice* device = (RocmonDevice*) data; - if (device) { - device->numRocMetrics++; - } - return HSA_STATUS_SUCCESS; -} - -static void 
-_rocmon_print_rocprofiler_info_data(const rocprofiler_info_data_t info) -{ - if (info.kind != ROCPROFILER_INFO_KIND_METRIC) - { - return; - } - printf("Name '%s':\n", info.metric.name); - printf("\tKind: '%s'\n", (info.kind == ROCPROFILER_INFO_KIND_METRIC ? "Metric" : "Trace")); - printf("\tInstances: %d\n", info.metric.instances); - printf("\tDescription: '%s'\n", info.metric.description); - printf("\tExpression: '%s'\n", info.metric.expr); - printf("\tBlockName: '%s'\n", info.metric.block_name); - printf("\tBlockCounters: %d\n", info.metric.block_counters); -} - -static hsa_status_t -_rocmon_iterate_info_callback_add(const rocprofiler_info_data_t info, void* data) -{ - iterate_info_cb_arg* arg = (iterate_info_cb_arg*) data; - - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _rocmon_iterate_info_callback_add); - if (likwid_rocmon_verbosity == DEBUGLEV_DEVELOP) - { - _rocmon_print_rocprofiler_info_data(info); - } - // Check info kind - if (info.kind != ROCPROFILER_INFO_KIND_METRIC) - { - ERROR_PRINT(Wrong info kind %u, info.kind); - return HSA_STATUS_ERROR; - } - - // Check index - if (arg->currIndex >= arg->device->numRocMetrics) - { - ERROR_PRINT(Metric index out of bounds: %d, arg->currIndex); - return HSA_STATUS_ERROR; - } - - // Copy info data - rocprofiler_info_data_t* target_info = &arg->device->rocMetrics[arg->currIndex]; - memcpy(target_info, &info, sizeof(rocprofiler_info_data_t)); - arg->currIndex++; - - return HSA_STATUS_SUCCESS; -} - - -static hsa_status_t -_rocmon_iterate_agents_callback(hsa_agent_t agent, void* argv) -{ - // Count number of callback invocations as the devices id - static int nextDeviceId = 0; - int deviceId = nextDeviceId; - bool noAgent = false; - - iterate_agents_cb_arg *arg = (iterate_agents_cb_arg*) argv; - - // Check if device is a GPU - hsa_device_type_t type; - ROCM_CALL(hsa_agent_get_info, (agent, HSA_AGENT_INFO_DEVICE, &type), return -1); - if (type != HSA_DEVICE_TYPE_GPU) - { - return HSA_STATUS_SUCCESS; - } - nextDeviceId++; - - // 
Check if device is includes in arg->gpuIds - int gpuIndex = -1; - for (int i = 0; i < arg->numGpus; i++) - { - if (deviceId == arg->gpuIds[i]) - { - gpuIndex = i; - break; - } - } - if (gpuIndex < 0) - { - return HSA_STATUS_SUCCESS; - } - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing agent %d, gpuIndex); - - // Add agent to context - RocmonDevice *device = &arg->context->devices[gpuIndex]; - device->deviceId = deviceId; - device->hsa_agent = agent; - device->context = NULL; - device->numActiveRocEvents = 0; - device->activeRocEvents = NULL; - device->numGroupResults = 0; - device->groupResults = NULL; - - // Get number of available metrics - device->numRocMetrics = 0; - ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_count, device), return HSA_STATUS_ERROR); - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, RocProfiler provides %d events, device->numRocMetrics); - - // workaround for bug in ROCm 5.4.0 - if(device->numRocMetrics == 0) { - ROCM_CALL(rocprofiler_iterate_info, (NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_count, device), return HSA_STATUS_ERROR); - noAgent = true; - } - - // Allocate memory for metrics - device->rocMetrics = (rocprofiler_info_data_t*) malloc(device->numRocMetrics * sizeof(rocprofiler_info_data_t)); - if (device->rocMetrics == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate set of rocMetrics); - return HSA_STATUS_ERROR; - } - - // Initialize SMI events map - if (init_map(&device->smiMetrics, MAP_KEY_TYPE_STR, 0, &free) < 0) - { - ERROR_PLAIN_PRINT(Cannot init smiMetrics map); - return HSA_STATUS_ERROR; - } - - // Fetch metric informatino - iterate_info_cb_arg info_arg = { - .device = device, - .currIndex = 0, - }; - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, Read %d RocProfiler events for device %d, device->numRocMetrics, device->deviceId); - - // If the call fails with agent, call rocprofiler_iterate_info without agent - if(noAgent) - { - ROCM_CALL(rocprofiler_iterate_info, 
(NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_add, &info_arg), return HSA_STATUS_ERROR); - } else { - ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_add, &info_arg), return HSA_STATUS_ERROR); - } - - return HSA_STATUS_SUCCESS; -} - - -static int -_rocmon_parse_eventstring(const char* eventString, GroupInfo* group) -{ - int err = 0; - Configuration_t config = get_configuration(); - bstring eventBString = bfromcstr(eventString); - - if (bstrchrp(eventBString, ':', 0) != BSTR_ERR) - { - // If custom group -> perfgroup_customGroup - err = perfgroup_customGroup(eventString, group); - if (err < 0) - { - ERROR_PRINT(Cannot transform %s to performance group, eventString); - return err; - } - } - else - { - // If performance group -> perfgroup_readGroup - err = perfgroup_readGroup(config->groupPath, "amd_gpu", eventString, group); - if (err == -EACCES) - { - ERROR_PRINT(Access to performance group %s not allowed, eventString); - return err; - } - else if (err == -ENODEV) - { - ERROR_PRINT(Performance group %s only available with deactivated HyperThreading, eventString); - return err; - } - if (err < 0) - { - ERROR_PRINT(Cannot read performance group %s, eventString); - return err; - } - } - - return 0; -} - - -static int -_rocmon_get_timestamp(uint64_t* timestamp_ns) -{ - uint64_t timestamp; - - // Get timestamp from system - ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP, ×tamp), return -1); - // Convert to nanoseconds - *timestamp_ns = (uint64_t)((long double)timestamp * rocmon_context->hsa_timestamp_factor); - - return 0; -} - - -static int -_rocmon_getLastResult(RocmonDevice* device, int eventId, double* value) -{ - rocprofiler_data_t* data = &device->activeRocEvents[eventId].data; - - switch (data->kind) - { - case ROCPROFILER_DATA_KIND_INT32: - *value = (double) data->result_int32; - break; - case ROCPROFILER_DATA_KIND_INT64: - *value = (double) data->result_int64; - break; - 
case ROCPROFILER_DATA_KIND_FLOAT: - *value = (double) data->result_float; - break; - case ROCPROFILER_DATA_KIND_DOUBLE: - *value = data->result_double; - break; - - case ROCPROFILER_DATA_KIND_BYTES: - case ROCPROFILER_DATA_KIND_UNINIT: - default: - return -1; - } - - return 0; -} - - -static int -_rocmon_readCounters_rocprofiler(RocmonDevice* device) -{ - int ret; - - // Check if there are any counters to start - if (device->numActiveRocEvents <= 0) - { - return 0; - } - - if (!device->context) - { - return 0; - } - - ROCM_CALL(rocprofiler_read, (device->context, 0), return -1); - ROCM_CALL(rocprofiler_get_data, (device->context, 0), return -1); - ROCM_CALL(rocprofiler_get_metrics, (device->context), return -1); - - // Update results - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveRocEvents; i++) - { - RocmonEventResult* result = &groupResult->results[i]; - - // Read value - ret = _rocmon_getLastResult(device, i, &result->fullValue); - if (ret < 0) - { - return -1; - } - - // Calculate delta since last read - result->lastValue = result->fullValue - result->lastValue; - } - - return 0; -} - - -static int -_rocmon_readCounters_smi(RocmonDevice* device) -{ - // Check if there are any counters to start - if (device->numActiveSmiEvents <= 0) - { - return 0; - } - - // Save baseline values - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveSmiEvents; i++) - { - double value = 0; - RocmonSmiEvent* event = &device->activeSmiEvents[i]; - RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; - - // Measure counter - if (event->measureFunc) - { - event->measureFunc(device->deviceId, event, result); - } - } - - return 0; -} - - -static int -_rocmon_readCounters(uint64_t* (*getDestTimestampFunc)(RocmonDevice* device)) -{ - int ret; - - // Get timestamp - uint64_t timestamp; - if (ret = 
_rocmon_get_timestamp(×tamp)) - { - return ret; - } - - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - - // Save timestamp - if (getDestTimestampFunc) - { - uint64_t* timestampDest = getDestTimestampFunc(device); - if (timestampDest) - { - *timestampDest = timestamp; - } - } - - // Read rocprofiler counters - ret = _rocmon_readCounters_rocprofiler(device); - if (ret < 0) return ret; - - // Read SMI counters - ret = _rocmon_readCounters_smi(device); - if (ret < 0) return ret; - } - - return 0; -} - - -static uint64_t* -_rocmon_get_read_time(RocmonDevice* device) -{ - return &device->time.read; -} - - -static uint64_t* -_rocmon_get_stop_time(RocmonDevice* device) -{ - return &device->time.stop; -} - - -// ---------------------------------------------------- -// Rocmon SMI helper functions -// ---------------------------------------------------- - -static bstring -_rocmon_smi_build_label(RocmonSmiEventType type, const char* funcname, uint64_t variant, uint64_t subvariant) -{ - switch (type) - { - case ROCMON_SMI_EVENT_TYPE_NORMAL: - return bfromcstr(funcname); - case ROCMON_SMI_EVENT_TYPE_VARIANT: - return bformat("%s|%" PRIu64, funcname, variant); - case ROCMON_SMI_EVENT_TYPE_SUBVARIANT: - return bformat("%s|%" PRIu64 "|%" PRIu64, funcname, variant, subvariant); - case ROCMON_SMI_EVENT_TYPE_INSTANCES: - return bfromcstr(funcname); - } -} - - -static int -_rocmon_smi_add_event_to_device(RocmonDevice* device, const char* funcname, RocmonSmiEventType type, int64_t variant, uint64_t subvariant) -{ - int ret; - - // Get event by label - RocmonSmiEventList* list = NULL; - bstring label = _rocmon_smi_build_label(type, funcname, variant, subvariant); - ret = get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list); - bdestroy(label); - if (ret < 0) - { - // Event not registered -> ignore - return 0; - } - - // For events with multiple sensor, only make one entry -> find if one exists - if (type 
== ROCMON_SMI_EVENT_TYPE_INSTANCES && subvariant > 0) - { - // Get list from map - for (int i = 0; i < list->numEntries; i++) - { - RocmonSmiEvent* event = &list->entries[i]; - RocmonSmiEvent* existingEvent = NULL; - ret = get_smap_by_key(device->smiMetrics, event->name, (void**)&existingEvent); - if (ret < 0) - { - ERROR_PRINT(Failed to find previous instance for event %s, event->name); - return -1; - } - - // Update instance information - existingEvent->instances++; - } - return 0; - } - - for (int i = 0; i < list->numEntries; i++) - { - RocmonSmiEvent* event = &list->entries[i]; - - // Allocate memory for device event description - RocmonSmiEvent* tmpEvent = (RocmonSmiEvent*) malloc(sizeof(RocmonSmiEvent)); - if (tmpEvent == NULL) - { - ERROR_PRINT(Failed to allocate memory for SMI event in device list %s, event->name); - return -ENOMEM; - } - - // Copy information from global description - memcpy(tmpEvent, event, sizeof(RocmonSmiEvent)); - tmpEvent->variant = variant; - tmpEvent->subvariant = subvariant; - tmpEvent->instances = 1; - - // Save event info to device event map - add_smap(device->smiMetrics, tmpEvent->name, tmpEvent); - } - - return 0; -} - - -static int -_rocmon_smi_get_function_subvariants(RocmonDevice* device, const char* funcname, uint64_t variant, rsmi_func_id_iter_handle_t var_iter) -{ - rsmi_func_id_iter_handle_t sub_var_iter; - rsmi_func_id_value_t value; - rsmi_status_t status; - int ret; - - // Get open subvariants iterator - status = (*rsmi_dev_supported_variant_iterator_open_ptr)(var_iter, &sub_var_iter); - if (status == RSMI_STATUS_NO_DATA) - { - // No subvariants - ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_VARIANT, variant, 0); - if (ret < 0) return -1; - return 0; - } - - // Subvariants available -> iterate them - do { - // Get subvariant information - (*rsmi_func_iter_value_get_ptr)(sub_var_iter, &value); - - // Process info - if (variant == RSMI_DEFAULT_VARIANT) - ret = 
_rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_INSTANCES, variant, value.id); - else - ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, variant, value.id); - if (ret < 0) return ret; - - // Advance iterator - status = (*rsmi_func_iter_next_ptr)(sub_var_iter); - } while (status != RSMI_STATUS_NO_DATA); - - // Close iterator - (*rsmi_dev_supported_func_iterator_close_ptr)(&sub_var_iter); - - return 0; -} - - -static int -_rocmon_smi_get_function_variants(RocmonDevice* device, const char* funcname, rsmi_func_id_iter_handle_t iter_handle) -{ - rsmi_func_id_iter_handle_t var_iter; - rsmi_func_id_value_t value; - rsmi_status_t status; - int ret; - - // Get open variants iterator - status = (*rsmi_dev_supported_variant_iterator_open_ptr)(iter_handle, &var_iter); - if (status == RSMI_STATUS_NO_DATA) - { - // No variants - ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0); - if (ret < 0) return -1; - return 0; - } - - // Variants available -> iterate them - do { - // Get variant information - (*rsmi_func_iter_value_get_ptr)(var_iter, &value); - - // Get function subvariants - ret = _rocmon_smi_get_function_subvariants(device, funcname, value.id, var_iter); - if (ret < 0) return -1; - - // Advance iterator - status = (*rsmi_func_iter_next_ptr)(var_iter); - } while (status != RSMI_STATUS_NO_DATA); - - // Close iterator - (*rsmi_dev_supported_func_iterator_close_ptr)(&var_iter); - - return 0; -} - - -static int -_rocmon_smi_get_functions(RocmonDevice* device) -{ - rsmi_func_id_iter_handle_t iter_handle; - rsmi_func_id_value_t value; - rsmi_status_t status; - int ret; - - // Open iterator - //(*rsmi_dev_supported_func_iterator_open_ptr)(device->deviceId, &iter_handle); - RSMI_CALL(rsmi_dev_supported_func_iterator_open, (device->deviceId, &iter_handle), { - return -1; - }); - - do - { - // Get function information - //(*rsmi_func_iter_value_get_ptr)(iter_handle, &value); - 
RSMI_CALL(rsmi_func_iter_value_get, (iter_handle, &value), { - ERROR_PRINT(Failed to get smi function value for device %d, device->deviceId); - RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); - return -1; - }); - - // Get function variants - ret = _rocmon_smi_get_function_variants(device, value.name, iter_handle); - if (ret < 0) - { - ERROR_PRINT(Failed to get smi function variants for device %d, device->deviceId); - RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); - return -1; - } - - // Advance iterator (cannot use RSMI_CALL macro here because we have an assignment, - // so we check that the function pointer exists to avoid segfaults.) - if (rsmi_func_iter_next_ptr) { - status = (*rsmi_func_iter_next_ptr)(iter_handle); - } - } while (status != RSMI_STATUS_NO_DATA); - - // Close iterator - //(*rsmi_dev_supported_func_iterator_close_ptr)(&iter_handle); - RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); - - // Add device independent functions - ret = _rocmon_smi_add_event_to_device(device, "rsmi_compute_process_info_get", ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0); - if (ret < 0) return -1; - - return 0; -} - -#define ADD_SMI_EVENT(name, type, smifunc, variant, subvariant, extra, measurefunc) if (_rocmon_smi_add_event_to_map(name, type, smifunc, variant, subvariant, extra, measurefunc) < 0) { return -1; } -#define ADD_SMI_EVENT_N(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_NORMAL, smifunc, 0, 0, extra, measurefunc) -#define ADD_SMI_EVENT_V(name, smifunc, variant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_VARIANT, smifunc, variant, 0, extra, measurefunc) -#define ADD_SMI_EVENT_S(name, smifunc, variant, subvariant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, smifunc, variant, subvariant, extra, measurefunc) -#define ADD_SMI_EVENT_I(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_INSTANCES, smifunc, 0, 0, 
extra, measurefunc) - -static int -_rocmon_smi_add_event_to_map(char* name, RocmonSmiEventType type, char* smifunc, uint64_t variant, uint64_t subvariant, uint64_t extra, RocmonSmiMeasureFunc measureFunc) -{ - // Add new event list to map (if not already present) - bstring label = _rocmon_smi_build_label(type, smifunc, variant, subvariant); - RocmonSmiEventList* list; - if (get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list) < 0) - { - // Allocate memory for event list - list = (RocmonSmiEventList*) malloc(sizeof(RocmonSmiEventList)); - if (list == NULL) - { - ERROR_PRINT(Failed to allocate memory for SMI event list %s, name); - return -ENOMEM; - } - list->entries = NULL; - list->numEntries = 0; - - add_smap(rocmon_context->smiEvents, bdata(label), list); - } - bdestroy(label); - - // Allocate memory for another event in list - list->numEntries++; - list->entries = (RocmonSmiEvent*) realloc(list->entries, list->numEntries * sizeof(RocmonSmiEvent)); - if (list->entries == NULL) - { - ERROR_PRINT(Failed to allocate memory for SMI event %s, name); - return -ENOMEM; - } - - // Set event properties - RocmonSmiEvent* event = &list->entries[list->numEntries-1]; - strncpy(event->name, name, sizeof(event->name)); - event->name[sizeof(event->name)] = '\0'; - event->type = type; - event->variant = variant; - event->subvariant = subvariant; - event->extra = extra; - event->instances = 0; // gets set when scanning supported device functions - event->measureFunc = measureFunc; - - return 0; -} - - -static void -_rcomon_smi_free_event_list(void* vlist) -{ - RocmonSmiEventList* list = (RocmonSmiEventList*)vlist; - if (list) - { - FREE_IF_NOT_NULL(list->entries); - free(list); - } -} - - -static int -_rocmon_smi_init_events() -{ - int ret; - - // Init map - ret = init_map(&rocmon_context->smiEvents, MAP_KEY_TYPE_STR, 0, &_rcomon_smi_free_event_list); - if (ret < 0) - { - ERROR_PRINT(Failed to create map for ROCm SMI events); - return -1; - } - - // Add events - 
ADD_SMI_EVENT_N("PCI_THROUGHPUT_SENT", "rsmi_dev_pci_throughput_get", 0, &_smi_wrapper_pci_throughput_get ); - ADD_SMI_EVENT_N("PCI_THROUGHPUT_RECEIVED", "rsmi_dev_pci_throughput_get", 1, &_smi_wrapper_pci_throughput_get ); - ADD_SMI_EVENT_N("PCI_THROUGHPUT_MAX_PKT_SZ", "rsmi_dev_pci_throughput_get", 2, &_smi_wrapper_pci_throughput_get ); - ADD_SMI_EVENT_N("PCI_REPLAY_COUNTER", "rsmi_dev_pci_replay_counter_get", 0, &_smi_wrapper_pci_replay_counter_get ); - ADD_SMI_EVENT_I("POWER_AVE", "rsmi_dev_power_ave_get", 0, &_smi_wrapper_power_ave_get ); - ADD_SMI_EVENT_V("MEMORY_TOTAL_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VRAM, 0, &_smi_wrapper_memory_total_get ); - ADD_SMI_EVENT_V("MEMORY_TOTAL_VIS_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_total_get ); - ADD_SMI_EVENT_V("MEMORY_TOTAL_GTT", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_total_get ); - ADD_SMI_EVENT_V("MEMORY_USAGE_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VRAM, 0, &_smi_wrapper_memory_usage_get ); - ADD_SMI_EVENT_V("MEMORY_USAGE_VIS_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_usage_get ); - ADD_SMI_EVENT_V("MEMORY_USAGE_GTT", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_usage_get ); - ADD_SMI_EVENT_N("MEMORY_BUSY_PERCENT", "rsmi_dev_memory_busy_percent_get", 0, &_smi_wrapper_memory_busy_percent_get ); - ADD_SMI_EVENT_N("MEMORY_NUM_RESERVED_PAGES", "rsmi_dev_memory_reserved_pages_get", 0, &_smi_wrapper_memory_reserved_pages_get ); - ADD_SMI_EVENT_I("FAN_RPMS", "rsmi_dev_fan_rpms_get", 0, &_smi_wrapper_fan_rpms_get ); - ADD_SMI_EVENT_I("FAN_SPEED", "rsmi_dev_fan_speed_get", 0, &_smi_wrapper_fan_speed_get ); - ADD_SMI_EVENT_I("FAN_SPEED_MAX", "rsmi_dev_fan_speed_max_get", 0, &_smi_wrapper_fan_speed_max_get ); - ADD_SMI_EVENT_S("TEMP_EDGE", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_EDGE, 0, &_smi_wrapper_temp_metric_get ); - 
ADD_SMI_EVENT_S("TEMP_JUNCTION", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_JUNCTION, 0, &_smi_wrapper_temp_metric_get ); - ADD_SMI_EVENT_S("TEMP_MEMORY", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_MEMORY, 0, &_smi_wrapper_temp_metric_get ); - ADD_SMI_EVENT_S("VOLT_VDDGFX", "rsmi_dev_volt_metric_get", RSMI_VOLT_CURRENT, RSMI_VOLT_TYPE_VDDGFX, 0, &_smi_wrapper_volt_metric_get ); - ADD_SMI_EVENT_N("OVERDRIVE_LEVEL", "rsmi_dev_overdrive_level_get", 0, &_smi_wrapper_overdrive_level_get ); - ADD_SMI_EVENT_V("ECC_COUNT_UMC_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_UMC_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SDMA_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SDMA_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_GFX_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_GFX_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 0, &_smi_wrapper_ecc_count_get ); - 
ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_HDP_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_HDP_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_DF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_DF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SMN_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SMN_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SEM_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SEM_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP0_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP0_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP1_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP1_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_FUSE_CORRECTABLE", 
"rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_FUSE_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_LAST_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_LAST_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_N("PROCS_USING_GPU", "rsmi_compute_process_info_get", 0, &_smi_wrapper_compute_process_info_get ); - - return 0; -} - - -int -rocmon_init(int numGpus, const int* gpuIds) -{ - hsa_status_t status; - - // check if already initialized - if (rocmon_initialized) - { - return 0; - } - if (rocmon_context != NULL) - { - return -EEXIST; - } - - // Validate arguments - if (numGpus <= 0) - { - ERROR_PRINT(Number of gpus must be greater than 0 but only %d given, numGpus); - return -EINVAL; - } - - // Initialize other parts - init_configuration(); - - // initialize libraries - int ret = _rocmon_link_libraries(); - if (ret < 0) - { - ERROR_PLAIN_PRINT(Failed to initialize libraries); - return ret; - } - - // Allocate memory for context - rocmon_context = (RocmonContext*) malloc(sizeof(RocmonContext)); - if (rocmon_context == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate Rocmon context); - return -ENOMEM; - } - rocmon_context->groups = NULL; - rocmon_context->numGroups = 0; - rocmon_context->numActiveGroups = 0; - - rocmon_context->devices = (RocmonDevice*) malloc(numGpus * sizeof(RocmonDevice)); - rocmon_context->numDevices = numGpus; - if (rocmon_context->devices == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate set of GPUs); - free(rocmon_context); - rocmon_context = NULL; - return -ENOMEM; - } - - // init hsa library - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing HSA); - ROCM_CALL(hsa_init, (), - { - ERROR_PLAIN_PRINT(Failed to init hsa library); - goto rocmon_init_hsa_failed; - }); - - // 
init rocm smi library - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RSMI); - RSMI_CALL(rsmi_init, (0), - { - ERROR_PLAIN_PRINT(Failed to init rocm_smi); - goto rocmon_init_rsmi_failed; - }); - - // Get hsa timestamp factor - uint64_t frequency_hz; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Getting HSA timestamp factor); - ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &frequency_hz), - { - ERROR_PLAIN_PRINT(Failed to get HSA timestamp factor); - goto rocmon_init_info_agents_failed; - }); - rocmon_context->hsa_timestamp_factor = (long double)1000000000 / (long double)frequency_hz; - - // initialize structures for specified devices (fetch ROCm specific info) - iterate_agents_cb_arg arg = { - .context = rocmon_context, - .numGpus = numGpus, - .gpuIds = gpuIds, - }; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Iterating through %d available agents, numGpus); - ROCM_CALL(hsa_iterate_agents, (_rocmon_iterate_agents_callback, &arg), - { - ERROR_PRINT(Error while iterating through available agents); - goto rocmon_init_info_agents_failed; - }); - - // Get available SMI events for devices - _rocmon_smi_init_events(); - for (int i = 0; i < rocmon_context->numDevices; i++) - { - if (_rocmon_smi_get_functions(&rocmon_context->devices[i]) < 0) - { - ERROR_PRINT(Failed to get SMI functions for device %d, rocmon_context->devices[i].deviceId); - goto rocmon_init_info_agents_failed; - } - } - - rocmon_initialized = TRUE; - return 0; -rocmon_init_info_agents_failed: - RSMI_CALL(rsmi_shut_down, (), { - // fall through - }); -rocmon_init_rsmi_failed: - ROCM_CALL(hsa_shut_down, (), { - // fall through - }); -rocmon_init_hsa_failed: - free(rocmon_context->devices); - free(rocmon_context); - rocmon_context = NULL; - return -1; -} - - -void -rocmon_finalize(void) -{ - RocmonContext* context = rocmon_context; - - if (!rocmon_initialized) - { - return; - } - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Finalize LIKWID ROCMON); - - if (context) - { - if (context->devices) - { - // 
Free each devices fields - for (int i = 0; i < context->numDevices; i++) - { - RocmonDevice* device = &context->devices[i]; - FREE_IF_NOT_NULL(device->rocMetrics); - FREE_IF_NOT_NULL(device->activeRocEvents); - FREE_IF_NOT_NULL(device->activeSmiEvents); - if (device->groupResults) - { - // Free events of event result lists - for (int j = 0; j < device->numGroupResults; j++) - { - FREE_IF_NOT_NULL(device->groupResults[i].results); - } - // Free list - free(device->groupResults); - } - if (device->context) - { - ROCM_CALL(rocprofiler_close, (device->context),); - } - destroy_smap(device->smiMetrics); - } - - free(context->devices); - context->devices = NULL; - } - - FREE_IF_NOT_NULL(context->groups); - destroy_smap(context->smiEvents); - - free(context); - context = NULL; - } - - RSMI_CALL(rsmi_shut_down, (), { - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown SMI); - // fall through - }); - ROCM_CALL(hsa_shut_down, (), { - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown HSA); - // fall through - }); -} - - -int -rocmon_addEventSet(const char* eventString, int* gid) -{ - // Check arguments - if (!eventString) - { - return -EINVAL; - } - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Allocate memory for event group if necessary - if (rocmon_context->numActiveGroups == rocmon_context->numGroups) - { - GroupInfo* tmpInfo = (GroupInfo*) realloc(rocmon_context->groups, (rocmon_context->numGroups+1) * sizeof(GroupInfo)); - if (tmpInfo == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate additional group); - return -ENOMEM; - } - rocmon_context->groups = tmpInfo; - rocmon_context->numGroups++; - } - - // Parse event string - int err = _rocmon_parse_eventstring(eventString, &rocmon_context->groups[rocmon_context->numActiveGroups]); - if (err < 0) - { - return err; - } - - // Allocate memory for event results - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - - // Allocate 
memory for event results - int numEvents = rocmon_context->groups[rocmon_context->numActiveGroups].nevents; - RocmonEventResult* tmpResults = (RocmonEventResult*) malloc(numEvents * sizeof(RocmonEventResult)); - if (tmpResults == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate event results); - return -ENOMEM; - } - - // Allocate memory for new event result list entry - RocmonEventResultList* tmpGroupResults = (RocmonEventResultList*) realloc(device->groupResults, (device->numGroupResults+1) * sizeof(RocmonEventResultList)); - if (tmpGroupResults == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate new event group result list); - return -ENOMEM; - } - - device->groupResults = tmpGroupResults; - device->groupResults[device->numGroupResults].results = tmpResults; - device->groupResults[device->numGroupResults].numResults = numEvents; - device->numGroupResults++; - } - - *gid = rocmon_context->numActiveGroups; - rocmon_context->numActiveGroups++; - return 0; -} - - -static int -_rocmon_setupCounters_rocprofiler(RocmonDevice* device, const char** events, int numEvents) -{ - // Close previous rocprofiler context - if (device->context) - { - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Closing previous rocprofiler context); - ROCM_CALL(rocprofiler_close, (device->context), return -1); - } - - // Look if the are any events - if (numEvents <= 0) - { - return 0; - } - - // Create feature array to monitor - rocprofiler_feature_t* features = (rocprofiler_feature_t*) malloc(numEvents * sizeof(rocprofiler_feature_t)); - if (features == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate feature list); - return -ENOMEM; - } - for (int i = 0; i < numEvents; i++) - { - features[i].kind = ROCPROFILER_FEATURE_KIND_METRIC; - features[i].name = events[i]; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP EVENT %d %s, i, events[i]); - } - - // Free previous feature array if present - FREE_IF_NOT_NULL(device->activeRocEvents); - - device->numActiveRocEvents = numEvents; - device->activeRocEvents = features; - - 
// Open context - rocprofiler_properties_t properties = {}; - properties.queue_depth = 128; - uint32_t mode = ROCPROFILER_MODE_STANDALONE | ROCPROFILER_MODE_CREATEQUEUE | ROCPROFILER_MODE_SINGLEGROUP; +#ifdef HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE +#undef HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE +#endif +#ifndef LIKWID_ROCPROF_SDK +#include +#include +#else +#include +#include +#endif - // Important: only a single profiling group is supported at this time which limits the number of events that can be monitored at a time. - ROCM_CALL(rocprofiler_open, (device->hsa_agent, device->activeRocEvents, device->numActiveRocEvents, &device->context, mode, &properties), return -1); +#include - return 0; -} -static int -_rocmon_setupCounters_smi(RocmonDevice* device, const char** events, int numEvents) +void +rocmon_finalize(void) { - int ret; - const int instanceNumLen = 5; - - // Delete previous events - if (device->activeSmiEvents) - { - device->activeSmiEvents = NULL; - device->numActiveSmiEvents = 0; - } - - // Look if the are any events - if (numEvents <= 0) - { - return 0; - } - - // Create event array - RocmonSmiEvent* activeEvents = (RocmonSmiEvent*) malloc(numEvents * sizeof(RocmonSmiEvent)); - if (activeEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate active event list); - return -ENOMEM; - } - - for (int i = 0; i < numEvents; i++) - { - char eventName[membersize(RocmonSmiEvent, name)]; - int instance = -1; - - // Parse event name -> normal event vs one with multiple instances (EVENT[0]) - const char* event = events[i]; - char* instancePart = strrchr(event, '['); - if (instancePart != NULL) - { - char withoutBrackets[instanceNumLen+1]; // +1 is '\0' - int partlen = strlen(instancePart); - - // Check if number fit in 'withoutBrackets' - if (partlen - 2 > instanceNumLen) - { - ERROR_PRINT(Instance number in '%s' is too large, event); - free(activeEvents); - return -EINVAL; - } - - // Copy instance number without brackets - strncpy(withoutBrackets, 
instancePart+1, partlen-2); - withoutBrackets[instanceNumLen] = '\0'; - - // Parse instance as number - char* endParsed; - instance = strtol(withoutBrackets, &endParsed, 10); - - // Check if parsing was successful - char* endOfString = &withoutBrackets[partlen-2]; - if (endParsed != endOfString) - { - ERROR_PRINT(Failed to parse instance number in '%s', event); - free(activeEvents); - return -EINVAL; - } - - // Copy event name without instance - int eventNameLen = instancePart - event; - strncpy(eventName, event, eventNameLen); - eventName[eventNameLen] = '\0'; - } - else - { - // Copy entire event name - strncpy(eventName, event, membersize(RocmonSmiEvent, name)); - } - - // Lookup event in available events - RocmonSmiEvent* metric = NULL; - ret = get_smap_by_key(device->smiMetrics, eventName, (void**)&metric); - if (ret < 0) - { - ERROR_PRINT(RSMI event '%s' not found for device %d, eventName, device->deviceId); - free(activeEvents); - return -EINVAL; - } - - // Copy event - RocmonSmiEvent* tmpEvent = &activeEvents[i]; - memcpy(tmpEvent, metric, sizeof(RocmonSmiEvent)); - - // Check if event supports instances - if (instance >= 0 && tmpEvent->type != ROCMON_SMI_EVENT_TYPE_INSTANCES) - { - ERROR_PRINT(Instance number given but event '%s' does not support one, eventName); - free(activeEvents); - return -EINVAL; - } - - // Check if event requires instances - if (instance < 0 && tmpEvent->type == ROCMON_SMI_EVENT_TYPE_INSTANCES) - { - ERROR_PRINT(No instance number given but event '%s' requires one, eventName); - free(activeEvents); - return -EINVAL; - } - - // Check if event has enough instances - if (instance >= 0 && instance >= metric->instances) - { - ERROR_PRINT(Instance %d seleced but event '%s' has only %d, instance, eventName, metric->instances); - free(activeEvents); - return -EINVAL; - } - - // Set instance number - if (instance >= 0) - { - tmpEvent->subvariant = instance; - } - } - - device->activeSmiEvents = activeEvents; - device->numActiveSmiEvents = 
numEvents; - - return 0; +#ifndef LIKWID_ROCPROF_SDK + rocmon_v1_finalize(); +#else + rocmon_sdk_finalize(); +#endif + return; } - int -rocmon_setupCounters(int gid) +rocmon_init(int numGpus, const int* gpuIds) { - int ret; - - // Check arguments - if (gid < 0 || gid >= rocmon_context->numActiveGroups) - { - return -EINVAL; - } - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Get group info - GroupInfo* group = &rocmon_context->groups[gid]; - - // - // Separate rocprofiler and SMI events - // - const char **smiEvents = NULL, **rocEvents = NULL; - int numSmiEvents = 0, numRocEvents = 0; - - // Allocate memory for string arrays - smiEvents = (const char**) malloc(group->nevents * sizeof(const char*)); - if (smiEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate smiEvent name array); - return -ENOMEM; - } - rocEvents = (const char**) malloc(group->nevents * sizeof(const char*)); - if (rocEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate rocEvent name array); - free(smiEvents); - return -ENOMEM; - } - - // Go through each event and sort it - for (int i = 0; i < group->nevents; i++) - { - const char* name = group->events[i]; - if (strncmp(name, "RSMI_", 5) == 0) - { - // RSMI event - smiEvents[numSmiEvents] = name + 5; // +5 removes 'RSMI_' prefix - numSmiEvents++; - } - else if (strncmp(name, "ROCP_", 5) == 0) - { - // Rocprofiler event - rocEvents[numRocEvents] = name + 5; // +5 removes 'ROCP_' prefix - numRocEvents++; - } - else - { - // Unknown event - ERROR_PRINT(Event '%s' has no prefix ('ROCP_' or 'RSMI_'), name); - return -EINVAL; - } - } - - // Add events to each device - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - - // Add rocprofiler events - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCPROFILER WITH %d events, numRocEvents); - ret = _rocmon_setupCounters_rocprofiler(device, rocEvents, numRocEvents); - if (ret < 0) - { - free(smiEvents); 
- free(rocEvents); - return ret; - } - - // Add SMI events - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCM SMI WITH %d events, numSmiEvents); - ret = _rocmon_setupCounters_smi(device, smiEvents, numSmiEvents); - if (ret < 0) - { - free(smiEvents); - free(rocEvents); - return ret; - } - } - rocmon_context->activeGroup = gid; - - // Cleanup - free(smiEvents); - free(rocEvents); - - return 0; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_init(numGpus, gpuIds); +#else + return rocmon_sdk_init(numGpus, gpuIds); +#endif } - -static int -_rocmon_startCounters_rocprofiler(RocmonDevice* device) +int +rocmon_addEventSet(const char* eventString, int* gid) { - // Check if there are any counters to start - if (device->numActiveRocEvents <= 0) - { - return 0; - } - - // Reset results - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveRocEvents; i++) - { - RocmonEventResult* result = &groupResult->results[i]; - result->lastValue = 0; - result->fullValue = 0; - } - - if (device->context) - { - ROCM_CALL(rocprofiler_start, (device->context, 0), return -1); - } - - return 0; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_addEventSet(eventString, gid); +#else + return rocmon_sdk_addEventSet(eventString, gid); +#endif } -static int -_rocmon_startCounters_smi(RocmonDevice* device) -{ - // Check if there are any counters to start - if (device->numActiveSmiEvents <= 0) - { - return 0; - } - - // Save baseline values - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveSmiEvents; i++) - { - double value = 0; - RocmonSmiEvent* event = &device->activeSmiEvents[i]; - RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; - - // Measure counter - if (event->measureFunc) - { - event->measureFunc(device->deviceId, event, result); - } - - // Save value - result->fullValue = 0; - } - return 0; +int 
+rocmon_setupCounters(int gid) +{ +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_setupCounters(gid); +#else + return rocmon_sdk_setupCounters(gid); +#endif } + int rocmon_startCounters(void) { - int ret; - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Get timestamp - uint64_t timestamp; - if (ret = _rocmon_get_timestamp(&timestamp)) - { - return ret; - } - - // Start counters on each device - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - device->time.start = timestamp; - device->time.read = timestamp; - - // Start rocprofiler events - ret = _rocmon_startCounters_rocprofiler(device); - if (ret < 0) return ret; - - // Start SMI events - _rocmon_startCounters_smi(device); - if (ret < 0) return ret; - } - - return 0; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_startCounters(); +#else + return rocmon_sdk_startCounters(); +#endif } -static int -_rocmon_stopCounters_rocprofiler(RocmonDevice* device) -{ - if (device->context) - { - // Close context - ROCM_CALL(rocprofiler_stop, (device->context, 0), return -1); - } - - return 0; -} - int rocmon_stopCounters(void) { - int ret; - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Read counters - ret = _rocmon_readCounters(&_rocmon_get_stop_time); - if (ret < 0) return ret; - - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - - // Stop rocprofiler events - ret = _rocmon_stopCounters_rocprofiler(device); - if (ret < 0) return ret; - - // Nothing to stop for SMI events - } - - return 0; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_stopCounters(); +#else + return rocmon_sdk_stopCounters(); +#endif } int rocmon_readCounters(void) { - int ret; - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Read counters - ret = _rocmon_readCounters(&_rocmon_get_read_time); - if
(ret < 0) return ret; - - return 0; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_readCounters(); +#else + return rocmon_sdk_readCounters(); +#endif } double rocmon_getResult(int gpuIdx, int groupId, int eventId) { - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Validate gpuIdx - if (gpuIdx < 0 || gpuIdx >= rocmon_context->numDevices) - { - return -EFAULT; - } - - // Validate groupId - RocmonDevice* device = &rocmon_context->devices[gpuIdx]; - if (groupId < 0 || groupId >= device->numGroupResults) - { - return -EFAULT; - } - - // Validate eventId - RocmonEventResultList* groupResult = &device->groupResults[groupId]; - if (eventId < 0 || eventId >= groupResult->numResults) - { - return -EFAULT; - } - - // Return result - return groupResult->results[eventId].fullValue; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getResult(gpuIdx, groupId, eventId); +#else + return rocmon_sdk_getResult(gpuIdx, groupId, eventId); +#endif } @@ -1862,413 +154,219 @@ rocmon_getResult(int gpuIdx, int groupId, int eventId) double rocmon_getLastResult(int gpuIdx, int groupId, int eventId) { - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Validate gpuIdx - if (gpuIdx < 0 || gpuIdx >= rocmon_context->numDevices) - { - return -EFAULT; - } - - // Validate groupId - RocmonDevice* device = &rocmon_context->devices[gpuIdx]; - if (groupId < 0 || groupId >= device->numGroupResults) - { - return -EFAULT; - } - - // Validate eventId - RocmonEventResultList* groupResult = &device->groupResults[groupId]; - if (eventId < 0 || eventId >= groupResult->numResults) - { - return -EFAULT; - } - - // Return result - return groupResult->results[eventId].lastValue; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getLastResult(gpuIdx, groupId, eventId); +#else + return rocmon_sdk_getLastResult(gpuIdx, groupId, eventId); +#endif } int rocmon_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list) { - // Ensure rocmon is 
initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Validate args - if (gpuIdx < 0 || gpuIdx > rocmon_context->numDevices) - { - return -EINVAL; - } - if (list == NULL) - { - return -EINVAL; - } - - RocmonDevice* device = &rocmon_context->devices[gpuIdx]; - - // Allocate list structure - EventList_rocm_t tmpList = (EventList_rocm_t) malloc(sizeof(EventList_rocm)); - if (tmpList == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate event list); - return -ENOMEM; - } - - // Get number of events - printf("NUmber of events %d + %d\n", device->numRocMetrics , get_map_size(device->smiMetrics)); - tmpList->numEvents = device->numRocMetrics + get_map_size(device->smiMetrics); - if (tmpList->numEvents == 0) - { - // No events -> return empty list - tmpList->events = NULL; - *list = tmpList; - return 0; - } - - // Allocate event array - tmpList->events = (Event_rocm_t*) malloc(tmpList->numEvents * sizeof(Event_rocm_t)); - if (tmpList->events == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate events for event list); - free(tmpList); - return -ENOMEM; - } - - // Copy rocprofiler event information - for (int i = 0; i < device->numRocMetrics; i++) - { - rocprofiler_info_data_t* event = &device->rocMetrics[i]; - Event_rocm_t* out = &tmpList->events[i]; - int len; - - // Copy name - printf("Name %s\n", event->metric.name); - len = strlen(event->metric.name) + 5 /* Prefix */ + 1 /* NULL byte */; - out->name = (char*) malloc(len); - if (out->name) - { - snprintf(out->name, len, "ROCP_%s", event->metric.name); - } - - // Copy description - len = strlen(event->metric.description) + 1 /* NULL byte */; - out->description = (char*) malloc(len); - if (out->description) - { - snprintf(out->description, len, "%s", event->metric.description); - } - - // Copy instances - out->instances = event->metric.instances; - } - - // Copy ROCm SMI metric information - for (int i = 0; i < get_map_size(device->smiMetrics); i++) - { - RocmonSmiEvent* event = NULL; - Event_rocm_t* out = 
&tmpList->events[device->numRocMetrics + i]; - int len; - - // Get event - if (get_smap_by_idx(device->smiMetrics, i, (void**)&event) < 0) - { - continue; - } - - // Copy name - len = strlen(event->name) + 5 /* Prefix */ + 1 /* NULL byte */; - out->name = (char*) malloc(len); - if (out->name) - { - snprintf(out->name, len, "RSMI_%s", event->name); - } - - // Copy description - char* description = "SMI Event"; // TODO: use real descriptions - len = strlen(description) + 1 /* NULL byte */; - out->description = (char*) malloc(len); - if (out->description) - { - snprintf(out->description, len, "%s", description); - } - - // Copy instances - out->instances = event->instances; - } - - *list = tmpList; - return 0; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getEventsOfGpu(gpuIdx, list); +#else + return rocmon_sdk_getEventsOfGpu(gpuIdx, list); +#endif } void rocmon_freeEventsOfGpu(EventList_rocm_t list) { -#define FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } - - // Check pointer - if (list == NULL) - { - return; - } - - if (list->events != NULL) - { - for (int i = 0; i < list->numEvents; i++) - { - Event_rocm_t* event = &list->events[i]; - FREE_IF_NOT_NULL(event->name); - FREE_IF_NOT_NULL(event->description); - } - free(list->events); - } - free(list); +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_freeEventsOfGpu(list); +#else + return rocmon_sdk_freeEventsOfGpu(list); +#endif } int rocmon_switchActiveGroup(int newGroupId) { - int ret; - - ret = rocmon_stopCounters(); - if (ret < 0) - { - return ret; - } - - ret = rocmon_setupCounters(newGroupId); - if (ret < 0) - { - return ret; - } - - ret = rocmon_startCounters(); - if (ret < 0) - { - return ret; - } - - return 0; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_switchActiveGroup(newGroupId); +#else + return rocmon_sdk_switchActiveGroup(newGroupId); +#endif } int rocmon_getNumberOfGroups(void) { - if (!rocmon_context || !rocmon_initialized) - { - return -EFAULT; - } - return 
rocmon_context->numActiveGroups; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getNumberOfGroups(); +#else + return rocmon_sdk_getNumberOfGroups(); +#endif } int rocmon_getIdOfActiveGroup(void) { - if (!rocmon_context || !rocmon_initialized) - { - return -EFAULT; - } - return rocmon_context->activeGroup; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getIdOfActiveGroup(); +#else + return rocmon_sdk_getIdOfActiveGroup(); +#endif } int rocmon_getNumberOfGPUs(void) { - if (!rocmon_context || !rocmon_initialized) - { - return -EFAULT; - } - return rocmon_context->numDevices; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getNumberOfGPUs(); +#else + return rocmon_sdk_getNumberOfGPUs(); +#endif } int rocmon_getNumberOfEvents(int groupId) { - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return -EFAULT; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->nevents; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getNumberOfEvents(groupId); +#else + return rocmon_sdk_getNumberOfEvents(groupId); +#endif } int rocmon_getNumberOfMetrics(int groupId) { - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) - { - return -EFAULT; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->nmetrics; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getNumberOfMetrics(groupId); +#else + return rocmon_sdk_getNumberOfMetrics(groupId); +#endif } double rocmon_getTimeOfGroup(int groupId) { - int i = 0; - double t = 0; - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) - { - return -EFAULT; - } - for (i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - t = MAX(t, (double)(device->time.stop - device->time.start)); - } - return t*1E-9; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getTimeOfGroup(groupId); +#else + return 
rocmon_sdk_getTimeOfGroup(groupId); +#endif } double rocmon_getLastTimeOfGroup(int groupId) { - int i = 0; - double t = 0; - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) - { - return -EFAULT; - } - for (i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - t = MAX(t, (double)(device->time.stop - device->time.read)); - } - return t*1E-9; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getLastTimeOfGroup(groupId); +#else + return rocmon_sdk_getLastTimeOfGroup(groupId); +#endif } double rocmon_getTimeToLastReadOfGroup(int groupId) { - int i = 0; - double t = 0; - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) - { - return -EFAULT; - } - for (i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - t = MAX(t, (double)(device->time.read - device->time.start)); - } - return t*1E-9; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getTimeToLastReadOfGroup(groupId); +#else + return rocmon_sdk_getTimeToLastReadOfGroup(groupId); +#endif } char* rocmon_getEventName(int groupId, int eventId) { - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - if ((eventId < 0) || (eventId >= ginfo->nevents)) - { - return NULL; - } - return ginfo->events[eventId]; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getEventName(groupId, eventId); +#else + return rocmon_sdk_getEventName(groupId, eventId); +#endif } char* rocmon_getCounterName(int groupId, int eventId) { - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - if ((eventId < 0) || (eventId >= ginfo->nevents)) - { - return NULL; - } - return 
ginfo->counters[eventId]; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getCounterName(groupId, eventId); +#else + return rocmon_sdk_getCounterName(groupId, eventId); +#endif } char* rocmon_getMetricName(int groupId, int metricId) { - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - if ((metricId < 0) || (metricId >= ginfo->nmetrics)) - { - return NULL; - } - return ginfo->metricnames[metricId]; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getMetricName(groupId, metricId); +#else + return rocmon_sdk_getMetricName(groupId, metricId); +#endif } char* rocmon_getGroupName(int groupId) { - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->groupname; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getGroupName(groupId); +#else + return rocmon_sdk_getGroupName(groupId); +#endif } char* rocmon_getGroupInfoShort(int groupId) { - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->shortinfo; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getGroupInfoShort(groupId); +#else + return rocmon_sdk_getGroupInfoShort(groupId); +#endif } char* rocmon_getGroupInfoLong(int groupId) { - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->longinfo; +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getGroupInfoLong(groupId); +#else + return rocmon_sdk_getGroupInfoLong(groupId); +#endif } int rocmon_getGroups(char*** groups, char*** shortinfos, char*** longinfos) { - init_configuration(); - Configuration_t config = 
get_configuration(); - - return perfgroup_getGroups(config->groupPath, "amd_gpu", groups, shortinfos, longinfos); +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_getGroups(groups, shortinfos, longinfos); +#else + return rocmon_sdk_getGroups(groups, shortinfos, longinfos); +#endif } int rocmon_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos) { - perfgroup_returnGroups(nrgroups, groups, shortinfos, longinfos); +#ifndef LIKWID_ROCPROF_SDK + return rocmon_v1_returnGroups(nrgroups, groups, shortinfos, longinfos); +#else + return rocmon_sdk_returnGroups(nrgroups, groups, shortinfos, longinfos); +#endif } void rocmon_setVerbosity(int level) @@ -2279,5 +377,4 @@ void rocmon_setVerbosity(int level) } } - #endif /* LIKWID_WITH_ROCMON */ diff --git a/src/rocmon_marker.c b/src/rocmon_marker.c index 68337239d..01e43ffac 100644 --- a/src/rocmon_marker.c +++ b/src/rocmon_marker.c @@ -39,7 +39,9 @@ #include #include -#include +#ifndef LIKWID_ROCPROF_SDK +#include +#endif #define gettid() syscall(SYS_gettid) diff --git a/src/rocmon_v1.c b/src/rocmon_v1.c new file mode 100644 index 000000000..31ff459e8 --- /dev/null +++ b/src/rocmon_v1.c @@ -0,0 +1,2275 @@ + /* ======================================================================================= + * + * Filename: rocmon_v1.c + * + * Description: Main implementation of the performance monitoring module + * for AMD GPUs with ROCm < 6.2 + * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.roehl@googlemail.com + * Project: likwid + * + * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + * + * ======================================================================================= + */ +#ifdef LIKWID_WITH_ROCMON + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + + + + +// #include +// #include +// #include + +// Variables +static void *dl_hsa_lib = NULL; +static void *dl_profiler_lib = NULL; +static void *dl_rsmi_lib = NULL; + +RocmonContext *rocmon_context = NULL; +static bool rocmon_initialized = FALSE; +int likwid_rocmon_verbosity = DEBUGLEV_ONLY_ERROR; + +// Macros +#define membersize(type, member) sizeof(((type *) NULL)->member) +#define FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } +#define ROCM_CALL( call, args, handleerror ) \ + do { \ + hsa_status_t _status = (*call##_ptr)args; \ + if (_status != HSA_STATUS_SUCCESS && _status != HSA_STATUS_INFO_BREAK) { \ + const char* err = NULL; \ + fprintf(stderr, "Error: function %s failed with error %d\n", #call, _status); \ + rocprofiler_error_string(&err); \ + fprintf(stderr, "Error: %s\n", err); \ + handleerror; \ + } \ + } while (0) + +#define RSMI_CALL( call, args, handleerror ) \ + do { \ + rsmi_status_t _status = (*call##_ptr)args; \ + if (_status != RSMI_STATUS_SUCCESS) { \ + fprintf(stderr, "Error: function %s failed with error %d.\n", #call, _status); \ + handleerror; \ + } \ + } while (0) + +// ROCm function declarations +#define ROCMWEAK __attribute__(( weak )) +#define DECLAREFUNC_HSA(funcname, funcsig) hsa_status_t ROCMWEAK funcname funcsig; hsa_status_t ( *funcname##_ptr ) funcsig; +#define 
DECLAREFUNC_SMI(funcname, funcsig) rsmi_status_t ROCMWEAK funcname funcsig; rsmi_status_t ( *funcname##_ptr ) funcsig; + +DECLAREFUNC_HSA(hsa_init, ()); +DECLAREFUNC_HSA(hsa_shut_down, ()); +DECLAREFUNC_HSA(hsa_iterate_agents, (hsa_status_t (*callback)(hsa_agent_t agent, void* data), void* data)); +DECLAREFUNC_HSA(hsa_agent_get_info, (hsa_agent_t agent, hsa_agent_info_t attribute, void* value)); +DECLAREFUNC_HSA(hsa_system_get_info, (hsa_system_info_t attribute, void *value)); + +DECLAREFUNC_HSA(rocprofiler_iterate_info, (const hsa_agent_t* agent, rocprofiler_info_kind_t kind, hsa_status_t (*callback)(const rocprofiler_info_data_t, void* data), void* data)); +DECLAREFUNC_HSA(rocprofiler_close, (rocprofiler_t* context)); +DECLAREFUNC_HSA(rocprofiler_open, (hsa_agent_t agent, rocprofiler_feature_t* features, uint32_t feature_count, rocprofiler_t** context, uint32_t mode, rocprofiler_properties_t* properties)); +DECLAREFUNC_HSA(rocprofiler_error_string, ()); +DECLAREFUNC_HSA(rocprofiler_start, (rocprofiler_t* context, uint32_t group_index)); +DECLAREFUNC_HSA(rocprofiler_stop, (rocprofiler_t* context, uint32_t group_index)); +DECLAREFUNC_HSA(rocprofiler_read, (rocprofiler_t* context, uint32_t group_index)); +DECLAREFUNC_HSA(rocprofiler_get_data, (rocprofiler_t* context, uint32_t group_index)); +DECLAREFUNC_HSA(rocprofiler_get_metrics, (const rocprofiler_t* context)); + +DECLAREFUNC_SMI(rsmi_init, (uint64_t flags)); +DECLAREFUNC_SMI(rsmi_shut_down, ()); +DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_open, (uint32_t dv_ind, rsmi_func_id_iter_handle_t* handle)); +DECLAREFUNC_SMI(rsmi_dev_supported_variant_iterator_open, (rsmi_func_id_iter_handle_t obj_h, rsmi_func_id_iter_handle_t* var_iter)); +DECLAREFUNC_SMI(rsmi_func_iter_value_get, (rsmi_func_id_iter_handle_t handle, rsmi_func_id_value_t* value )); +DECLAREFUNC_SMI(rsmi_func_iter_next, (rsmi_func_id_iter_handle_t handle)); +DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_close, (rsmi_func_id_iter_handle_t* 
handle)); +DECLAREFUNC_SMI(rsmi_dev_power_ave_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* power)); +DECLAREFUNC_SMI(rsmi_dev_pci_throughput_get, (uint32_t dv_ind, uint64_t* sent, uint64_t* received, uint64_t* max_pkt_sz)); +DECLAREFUNC_SMI(rsmi_dev_pci_replay_counter_get, (uint32_t dv_ind, uint64_t* counter)); +DECLAREFUNC_SMI(rsmi_dev_memory_total_get, (uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t* total)); +DECLAREFUNC_SMI(rsmi_dev_memory_usage_get, (uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t* used )); +DECLAREFUNC_SMI(rsmi_dev_memory_busy_percent_get, (uint32_t dv_ind, uint32_t* busy_percent)); +DECLAREFUNC_SMI(rsmi_dev_memory_reserved_pages_get, (uint32_t dv_ind, uint32_t* num_pages, rsmi_retired_page_record_t* records)); +DECLAREFUNC_SMI(rsmi_dev_fan_rpms_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); +DECLAREFUNC_SMI(rsmi_dev_fan_speed_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); +DECLAREFUNC_SMI(rsmi_dev_fan_speed_max_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* max_speed)); +DECLAREFUNC_SMI(rsmi_dev_temp_metric_get, (uint32_t dv_ind, uint32_t sensor_type, rsmi_temperature_metric_t metric, int64_t* temperature)); +DECLAREFUNC_SMI(rsmi_dev_volt_metric_get, (uint32_t dv_ind, rsmi_voltage_type_t sensor_type, rsmi_voltage_metric_t metric, int64_t* voltage)); +DECLAREFUNC_SMI(rsmi_dev_overdrive_level_get, (uint32_t dv_ind, uint32_t* od)); +DECLAREFUNC_SMI(rsmi_dev_ecc_count_get, (uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_error_count_t* ec)); +DECLAREFUNC_SMI(rsmi_compute_process_info_get, (rsmi_process_info_t* procs, uint32_t* num_items)); + + +// ---------------------------------------------------- +// SMI event wrapper +// ---------------------------------------------------- + +static int +_smi_wrapper_pci_throughput_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t value; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _smi_wrapper_pci_throughput_get(%d, %d), 
deviceId, event->extra); + // Internal variant: 0 for sent, 1 for received bytes and 2 for max packet size + if (event->extra == 0) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, &value, NULL, NULL), return -1); + else if (event->extra == 1) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, &value, NULL), return -1); + else if (event->extra == 2) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, NULL, &value), return -1); + else return -1; + + result->fullValue += value; + result->lastValue = value; + + return 0; +} + + +static int +_smi_wrapper_pci_replay_counter_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t counter; + RSMI_CALL(rsmi_dev_pci_replay_counter_get, (deviceId, &counter), return -1); + result->fullValue += counter; + result->lastValue = counter; + + return 0; +} + + +static int +_smi_wrapper_power_ave_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t power; + RSMI_CALL(rsmi_dev_power_ave_get, (deviceId, event->subvariant, &power), return -1); + result->fullValue += power; + result->lastValue = power; + + return 0; +} + + +static int +_smi_wrapper_memory_total_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t total; + RSMI_CALL(rsmi_dev_memory_total_get, (deviceId, event->variant, &total), return -1); + result->fullValue += total; + result->lastValue = total; + + return 0; +} + + +static int +_smi_wrapper_memory_usage_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t used; + RSMI_CALL(rsmi_dev_memory_usage_get, (deviceId, event->variant, &used), return -1); + result->fullValue += used; + result->lastValue = used; + + return 0; +} + + +static int +_smi_wrapper_memory_busy_percent_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint32_t percent; + RSMI_CALL(rsmi_dev_memory_busy_percent_get, (deviceId, &percent), return -1); + result->fullValue += percent; + result->lastValue = percent; + 
+ return 0; +} + + +static int +_smi_wrapper_memory_reserved_pages_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint32_t num_pages; + RSMI_CALL(rsmi_dev_memory_reserved_pages_get, (deviceId, &num_pages, NULL), return -1); + result->fullValue += num_pages; + result->lastValue = num_pages; + + return 0; +} + + +static int +_smi_wrapper_fan_rpms_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + int64_t speed; + RSMI_CALL(rsmi_dev_fan_rpms_get, (deviceId, event->subvariant, &speed), return -1); + result->fullValue += speed; + result->lastValue = speed; + + return 0; +} + + +static int +_smi_wrapper_fan_speed_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + int64_t speed; + RSMI_CALL(rsmi_dev_fan_speed_get, (deviceId, event->subvariant, &speed), return -1); + result->fullValue += speed; + result->lastValue = speed; + + return 0; +} + + +static int +_smi_wrapper_fan_speed_max_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t max_speed; /* unsigned: rsmi_dev_fan_speed_max_get is declared with a uint64_t* out-parameter */ + RSMI_CALL(rsmi_dev_fan_speed_max_get, (deviceId, event->subvariant, &max_speed), return -1); + result->fullValue += max_speed; + result->lastValue = max_speed; + + return 0; +} + + +static int +_smi_wrapper_temp_metric_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + int64_t temperature; + RSMI_CALL(rsmi_dev_temp_metric_get, (deviceId, event->subvariant, event->variant, &temperature), return -1); + result->fullValue += temperature; + result->lastValue = temperature; + + return 0; +} + + +static int +_smi_wrapper_volt_metric_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + int64_t voltage; + RSMI_CALL(rsmi_dev_volt_metric_get, (deviceId, event->subvariant, event->variant, &voltage), return -1); + result->fullValue += voltage; + result->lastValue = voltage; + + return 0; +} + + +static int +_smi_wrapper_overdrive_level_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{
+ uint32_t overdrive; + RSMI_CALL(rsmi_dev_overdrive_level_get, (deviceId, &overdrive), return -1); + result->fullValue += overdrive; + result->lastValue = overdrive; + + return 0; +} + + +static int +_smi_wrapper_ecc_count_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + rsmi_error_count_t error_count; + RSMI_CALL(rsmi_dev_ecc_count_get, (deviceId, event->variant, &error_count), return -1); + + if (event->extra == 0) + { + result->lastValue = error_count.correctable_err - result->fullValue; + result->fullValue = error_count.correctable_err; + } + else if (event->extra == 1) + { + result->lastValue = error_count.uncorrectable_err - result->fullValue; + result->fullValue = error_count.uncorrectable_err; + } + else + { + return -1; + } + + return 0; +} + + +static int +_smi_wrapper_compute_process_info_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint32_t num_items; + RSMI_CALL(rsmi_compute_process_info_get, (NULL, &num_items), return -1); + result->fullValue += num_items; + result->lastValue = num_items; + + return 0; +} + + +// ---------------------------------------------------- +// Rocmon helper functions +// ---------------------------------------------------- + +static int +_rocmon_link_libraries() +{ + #define DLSYM_AND_CHECK( dllib, name ) name##_ptr = dlsym( dllib, #name ); if ( dlerror() != NULL ) { ERROR_PRINT(Failed to link #name); return -1; } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm libraries); + + // Need to link in the ROCm HSA libraries + dl_hsa_lib = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL); + if (!dl_hsa_lib) + { + ERROR_PRINT(ROCm HSA library libhsa-runtime64.so not found: %s, dlerror()); + return -1; + } + + // Need to link in the Rocprofiler libraries + dl_profiler_lib = dlopen("librocprofiler64.so", RTLD_NOW | RTLD_GLOBAL); + if (!dl_profiler_lib) + { + dl_profiler_lib = dlopen("librocprofiler64.so.1", RTLD_NOW | RTLD_GLOBAL); + if (!dl_profiler_lib) + { + 
ERROR_PRINT(Rocprofiler library librocprofiler64.so not found: %s, dlerror()); + return -1; + } + } + + // Need to link in the Rocprofiler libraries + dl_rsmi_lib = dlopen("librocm_smi64.so", RTLD_NOW | RTLD_GLOBAL); + if (!dl_rsmi_lib) + { + ERROR_PRINT(ROCm SMI library librocm_smi64.so not found: %s, dlerror()); + return -1; + } + + // Link HSA functions + DLSYM_AND_CHECK(dl_hsa_lib, hsa_init); + DLSYM_AND_CHECK(dl_hsa_lib, hsa_shut_down); + DLSYM_AND_CHECK(dl_hsa_lib, hsa_iterate_agents); + DLSYM_AND_CHECK(dl_hsa_lib, hsa_agent_get_info); + DLSYM_AND_CHECK(dl_hsa_lib, hsa_system_get_info); + + // Link Rocprofiler functions + DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_iterate_info); + DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_close); + DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_open); + DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_error_string); + DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_start); + DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_stop); + DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_read); + DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_get_data); + DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_get_metrics); + + // Link SMI functions + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_init); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_shut_down); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_func_iterator_open); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_variant_iterator_open); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_func_iter_value_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_func_iter_next); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_func_iterator_close); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_power_ave_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_pci_throughput_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_pci_replay_counter_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_total_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_usage_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_busy_percent_get); + DLSYM_AND_CHECK(dl_rsmi_lib, 
rsmi_dev_memory_reserved_pages_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_rpms_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_speed_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_speed_max_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_temp_metric_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_volt_metric_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_overdrive_level_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_ecc_count_get); + DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_compute_process_info_get); + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm libraries done); + return 0; +} + +typedef struct { + RocmonContext* context; + int numGpus; + const int* gpuIds; +} iterate_agents_cb_arg; + +typedef struct { + RocmonDevice* device; + int currIndex; +} iterate_info_cb_arg; + + +static hsa_status_t +_rocmon_iterate_info_callback_count(const rocprofiler_info_data_t info, void* data) +{ + RocmonDevice* device = (RocmonDevice*) data; + if (device) { + device->numRocMetrics++; + } + return HSA_STATUS_SUCCESS; +} + +static void +_rocmon_print_rocprofiler_info_data(const rocprofiler_info_data_t info) +{ + if (info.kind != ROCPROFILER_INFO_KIND_METRIC) + { + return; + } + printf("Name '%s':\n", info.metric.name); + printf("\tKind: '%s'\n", (info.kind == ROCPROFILER_INFO_KIND_METRIC ? 
"Metric" : "Trace")); + printf("\tInstances: %d\n", info.metric.instances); + printf("\tDescription: '%s'\n", info.metric.description); + printf("\tExpression: '%s'\n", info.metric.expr); + printf("\tBlockName: '%s'\n", info.metric.block_name); + printf("\tBlockCounters: %d\n", info.metric.block_counters); +} + +static hsa_status_t +_rocmon_iterate_info_callback_add(const rocprofiler_info_data_t info, void* data) +{ + iterate_info_cb_arg* arg = (iterate_info_cb_arg*) data; + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _rocmon_iterate_info_callback_add); + if (likwid_rocmon_verbosity == DEBUGLEV_DEVELOP) + { + _rocmon_print_rocprofiler_info_data(info); + } + // Check info kind + if (info.kind != ROCPROFILER_INFO_KIND_METRIC) + { + ERROR_PRINT(Wrong info kind %u, info.kind); + return HSA_STATUS_ERROR; + } + + // Check index + if (arg->currIndex >= arg->device->numRocMetrics) + { + ERROR_PRINT(Metric index out of bounds: %d, arg->currIndex); + return HSA_STATUS_ERROR; + } + + // Copy info data + rocprofiler_info_data_t* target_info = &arg->device->rocMetrics[arg->currIndex]; + memcpy(target_info, &info, sizeof(rocprofiler_info_data_t)); + arg->currIndex++; + + return HSA_STATUS_SUCCESS; +} + + +static hsa_status_t +_rocmon_iterate_agents_callback(hsa_agent_t agent, void* argv) +{ + // Count number of callback invocations as the devices id + static int nextDeviceId = 0; + int deviceId = nextDeviceId; + bool noAgent = false; + + iterate_agents_cb_arg *arg = (iterate_agents_cb_arg*) argv; + + // Check if device is a GPU + hsa_device_type_t type; + ROCM_CALL(hsa_agent_get_info, (agent, HSA_AGENT_INFO_DEVICE, &type), return -1); + if (type != HSA_DEVICE_TYPE_GPU) + { + return HSA_STATUS_SUCCESS; + } + nextDeviceId++; + + // Check if device is includes in arg->gpuIds + int gpuIndex = -1; + for (int i = 0; i < arg->numGpus; i++) + { + if (deviceId == arg->gpuIds[i]) + { + gpuIndex = i; + break; + } + } + if (gpuIndex < 0) + { + return HSA_STATUS_SUCCESS; + } + 
ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing agent %d, gpuIndex); + + // Add agent to context + RocmonDevice *device = &arg->context->devices[gpuIndex]; + device->deviceId = deviceId; + device->hsa_agent = agent; + device->context = NULL; + device->numActiveRocEvents = 0; + device->activeRocEvents = NULL; + device->numGroupResults = 0; + device->groupResults = NULL; + + // Get number of available metrics + device->numRocMetrics = 0; + ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_count, device), return HSA_STATUS_ERROR); + ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, RocProfiler provides %d events, device->numRocMetrics); + + // workaround for bug in ROCm 5.4.0 + if(device->numRocMetrics == 0) { + ROCM_CALL(rocprofiler_iterate_info, (NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_count, device), return HSA_STATUS_ERROR); + noAgent = true; + } + + // Allocate memory for metrics + device->rocMetrics = (rocprofiler_info_data_t*) malloc(device->numRocMetrics * sizeof(rocprofiler_info_data_t)); + if (device->rocMetrics == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate set of rocMetrics); + return HSA_STATUS_ERROR; + } + + // Initialize SMI events map + if (init_map(&device->smiMetrics, MAP_KEY_TYPE_STR, 0, &free) < 0) + { + ERROR_PLAIN_PRINT(Cannot init smiMetrics map); + return HSA_STATUS_ERROR; + } + + // Fetch metric informatino + iterate_info_cb_arg info_arg = { + .device = device, + .currIndex = 0, + }; + ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, Read %d RocProfiler events for device %d, device->numRocMetrics, device->deviceId); + + // If the call fails with agent, call rocprofiler_iterate_info without agent + if(noAgent) + { + ROCM_CALL(rocprofiler_iterate_info, (NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_add, &info_arg), return HSA_STATUS_ERROR); + } else { + ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_add, &info_arg), 
return HSA_STATUS_ERROR); + } + + return HSA_STATUS_SUCCESS; +} + + +static int +_rocmon_parse_eventstring(const char* eventString, GroupInfo* group) +{ + int err = 0; + Configuration_t config = get_configuration(); + bstring eventBString = bfromcstr(eventString); + + if (bstrchrp(eventBString, ':', 0) != BSTR_ERR) + { + // If custom group -> perfgroup_customGroup + err = perfgroup_customGroup(eventString, group); + if (err < 0) + { + ERROR_PRINT(Cannot transform %s to performance group, eventString); + return err; + } + } + else + { + // If performance group -> perfgroup_readGroup + err = perfgroup_readGroup(config->groupPath, "amd_gpu", eventString, group); + if (err == -EACCES) + { + ERROR_PRINT(Access to performance group %s not allowed, eventString); + return err; + } + else if (err == -ENODEV) + { + ERROR_PRINT(Performance group %s only available with deactivated HyperThreading, eventString); + return err; + } + if (err < 0) + { + ERROR_PRINT(Cannot read performance group %s, eventString); + return err; + } + } + + return 0; +} + + +static int +_rocmon_get_timestamp(uint64_t* timestamp_ns) +{ + uint64_t timestamp; + + // Get timestamp from system + ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP, ×tamp), return -1); + // Convert to nanoseconds + *timestamp_ns = (uint64_t)((long double)timestamp * rocmon_context->hsa_timestamp_factor); + + return 0; +} + + +static int +_rocmon_getLastResult(RocmonDevice* device, int eventId, double* value) +{ + rocprofiler_data_t* data = &device->activeRocEvents[eventId].data; + + switch (data->kind) + { + case ROCPROFILER_DATA_KIND_INT32: + *value = (double) data->result_int32; + break; + case ROCPROFILER_DATA_KIND_INT64: + *value = (double) data->result_int64; + break; + case ROCPROFILER_DATA_KIND_FLOAT: + *value = (double) data->result_float; + break; + case ROCPROFILER_DATA_KIND_DOUBLE: + *value = data->result_double; + break; + + case ROCPROFILER_DATA_KIND_BYTES: + case ROCPROFILER_DATA_KIND_UNINIT: + default: + 
return -1; + } + + return 0; +} + + +static int +_rocmon_readCounters_rocprofiler(RocmonDevice* device) +{ + int ret; + + // Check if there are any counters to start + if (device->numActiveRocEvents <= 0) + { + return 0; + } + + if (!device->context) + { + return 0; + } + + ROCM_CALL(rocprofiler_read, (device->context, 0), return -1); + ROCM_CALL(rocprofiler_get_data, (device->context, 0), return -1); + ROCM_CALL(rocprofiler_get_metrics, (device->context), return -1); + + // Update results + RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; + for (int i = 0; i < device->numActiveRocEvents; i++) + { + RocmonEventResult* result = &groupResult->results[i]; + + // Read value + ret = _rocmon_getLastResult(device, i, &result->fullValue); + if (ret < 0) + { + return -1; + } + + // Calculate delta since last read + result->lastValue = result->fullValue - result->lastValue; + } + + return 0; +} + + +static int +_rocmon_readCounters_smi(RocmonDevice* device) +{ + // Check if there are any counters to start + if (device->numActiveSmiEvents <= 0) + { + return 0; + } + + // Save baseline values + RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; + for (int i = 0; i < device->numActiveSmiEvents; i++) + { + double value = 0; + RocmonSmiEvent* event = &device->activeSmiEvents[i]; + RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; + + // Measure counter + if (event->measureFunc) + { + event->measureFunc(device->deviceId, event, result); + } + } + + return 0; +} + + +static int +_rocmon_readCounters(uint64_t* (*getDestTimestampFunc)(RocmonDevice* device)) +{ + int ret; + + // Get timestamp + uint64_t timestamp; + if (ret = _rocmon_get_timestamp(×tamp)) + { + return ret; + } + + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + + // Save timestamp + if (getDestTimestampFunc) + { + uint64_t* timestampDest = 
getDestTimestampFunc(device); + if (timestampDest) + { + *timestampDest = timestamp; + } + } + + // Read rocprofiler counters + ret = _rocmon_readCounters_rocprofiler(device); + if (ret < 0) return ret; + + // Read SMI counters + ret = _rocmon_readCounters_smi(device); + if (ret < 0) return ret; + } + + return 0; +} + + +static uint64_t* +_rocmon_get_read_time(RocmonDevice* device) +{ + return &device->time.read; +} + + +static uint64_t* +_rocmon_get_stop_time(RocmonDevice* device) +{ + return &device->time.stop; +} + + +// ---------------------------------------------------- +// Rocmon SMI helper functions +// ---------------------------------------------------- + +static bstring +_rocmon_smi_build_label(RocmonSmiEventType type, const char* funcname, uint64_t variant, uint64_t subvariant) +{ + switch (type) + { + case ROCMON_SMI_EVENT_TYPE_NORMAL: + return bfromcstr(funcname); + case ROCMON_SMI_EVENT_TYPE_VARIANT: + return bformat("%s|%" PRIu64, funcname, variant); + case ROCMON_SMI_EVENT_TYPE_SUBVARIANT: + return bformat("%s|%" PRIu64 "|%" PRIu64, funcname, variant, subvariant); + case ROCMON_SMI_EVENT_TYPE_INSTANCES: + return bfromcstr(funcname); + } +} + + +static int +_rocmon_smi_add_event_to_device(RocmonDevice* device, const char* funcname, RocmonSmiEventType type, int64_t variant, uint64_t subvariant) +{ + int ret; + + // Get event by label + RocmonSmiEventList* list = NULL; + bstring label = _rocmon_smi_build_label(type, funcname, variant, subvariant); + ret = get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list); + bdestroy(label); + if (ret < 0) + { + // Event not registered -> ignore + return 0; + } + + // For events with multiple sensor, only make one entry -> find if one exists + if (type == ROCMON_SMI_EVENT_TYPE_INSTANCES && subvariant > 0) + { + // Get list from map + for (int i = 0; i < list->numEntries; i++) + { + RocmonSmiEvent* event = &list->entries[i]; + RocmonSmiEvent* existingEvent = NULL; + ret = 
get_smap_by_key(device->smiMetrics, event->name, (void**)&existingEvent); + if (ret < 0) + { + ERROR_PRINT(Failed to find previous instance for event %s, event->name); + return -1; + } + + // Update instance information + existingEvent->instances++; + } + return 0; + } + + for (int i = 0; i < list->numEntries; i++) + { + RocmonSmiEvent* event = &list->entries[i]; + + // Allocate memory for device event description + RocmonSmiEvent* tmpEvent = (RocmonSmiEvent*) malloc(sizeof(RocmonSmiEvent)); + if (tmpEvent == NULL) + { + ERROR_PRINT(Failed to allocate memory for SMI event in device list %s, event->name); + return -ENOMEM; + } + + // Copy information from global description + memcpy(tmpEvent, event, sizeof(RocmonSmiEvent)); + tmpEvent->variant = variant; + tmpEvent->subvariant = subvariant; + tmpEvent->instances = 1; + + // Save event info to device event map + add_smap(device->smiMetrics, tmpEvent->name, tmpEvent); + } + + return 0; +} + + +static int +_rocmon_smi_get_function_subvariants(RocmonDevice* device, const char* funcname, uint64_t variant, rsmi_func_id_iter_handle_t var_iter) +{ + rsmi_func_id_iter_handle_t sub_var_iter; + rsmi_func_id_value_t value; + rsmi_status_t status; + int ret; + + // Get open subvariants iterator + status = (*rsmi_dev_supported_variant_iterator_open_ptr)(var_iter, &sub_var_iter); + if (status == RSMI_STATUS_NO_DATA) + { + // No subvariants + ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_VARIANT, variant, 0); + if (ret < 0) return -1; + return 0; + } + + // Subvariants available -> iterate them + do { + // Get subvariant information + (*rsmi_func_iter_value_get_ptr)(sub_var_iter, &value); + + // Process info + if (variant == RSMI_DEFAULT_VARIANT) + ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_INSTANCES, variant, value.id); + else + ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, variant, value.id); + if (ret < 0) return ret; + 
+ // Advance iterator + status = (*rsmi_func_iter_next_ptr)(sub_var_iter); + } while (status != RSMI_STATUS_NO_DATA); + + // Close iterator + (*rsmi_dev_supported_func_iterator_close_ptr)(&sub_var_iter); + + return 0; +} + + +static int +_rocmon_smi_get_function_variants(RocmonDevice* device, const char* funcname, rsmi_func_id_iter_handle_t iter_handle) +{ + rsmi_func_id_iter_handle_t var_iter; + rsmi_func_id_value_t value; + rsmi_status_t status; + int ret; + + // Get open variants iterator + status = (*rsmi_dev_supported_variant_iterator_open_ptr)(iter_handle, &var_iter); + if (status == RSMI_STATUS_NO_DATA) + { + // No variants + ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0); + if (ret < 0) return -1; + return 0; + } + + // Variants available -> iterate them + do { + // Get variant information + (*rsmi_func_iter_value_get_ptr)(var_iter, &value); + + // Get function subvariants + ret = _rocmon_smi_get_function_subvariants(device, funcname, value.id, var_iter); + if (ret < 0) return -1; + + // Advance iterator + status = (*rsmi_func_iter_next_ptr)(var_iter); + } while (status != RSMI_STATUS_NO_DATA); + + // Close iterator + (*rsmi_dev_supported_func_iterator_close_ptr)(&var_iter); + + return 0; +} + + +static int +_rocmon_smi_get_functions(RocmonDevice* device) +{ + rsmi_func_id_iter_handle_t iter_handle; + rsmi_func_id_value_t value; + rsmi_status_t status; + int ret; + + // Open iterator + //(*rsmi_dev_supported_func_iterator_open_ptr)(device->deviceId, &iter_handle); + RSMI_CALL(rsmi_dev_supported_func_iterator_open, (device->deviceId, &iter_handle), { + return -1; + }); + + do + { + // Get function information + //(*rsmi_func_iter_value_get_ptr)(iter_handle, &value); + RSMI_CALL(rsmi_func_iter_value_get, (iter_handle, &value), { + ERROR_PRINT(Failed to get smi function value for device %d, device->deviceId); + RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); + return -1; + }); + + // Get 
function variants + ret = _rocmon_smi_get_function_variants(device, value.name, iter_handle); + if (ret < 0) + { + ERROR_PRINT(Failed to get smi function variants for device %d, device->deviceId); + RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); + return -1; + } + + // Advance iterator (cannot use RSMI_CALL macro here because we have an assignment, + // so we check that the function pointer exists to avoid segfaults.) + if (rsmi_func_iter_next_ptr) { + status = (*rsmi_func_iter_next_ptr)(iter_handle); + } + } while (status != RSMI_STATUS_NO_DATA); + + // Close iterator + //(*rsmi_dev_supported_func_iterator_close_ptr)(&iter_handle); + RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); + + // Add device independent functions + ret = _rocmon_smi_add_event_to_device(device, "rsmi_compute_process_info_get", ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0); + if (ret < 0) return -1; + + return 0; +} + +#define ADD_SMI_EVENT(name, type, smifunc, variant, subvariant, extra, measurefunc) if (_rocmon_smi_add_event_to_map(name, type, smifunc, variant, subvariant, extra, measurefunc) < 0) { return -1; } +#define ADD_SMI_EVENT_N(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_NORMAL, smifunc, 0, 0, extra, measurefunc) +#define ADD_SMI_EVENT_V(name, smifunc, variant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_VARIANT, smifunc, variant, 0, extra, measurefunc) +#define ADD_SMI_EVENT_S(name, smifunc, variant, subvariant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, smifunc, variant, subvariant, extra, measurefunc) +#define ADD_SMI_EVENT_I(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_INSTANCES, smifunc, 0, 0, extra, measurefunc) + +static int +_rocmon_smi_add_event_to_map(char* name, RocmonSmiEventType type, char* smifunc, uint64_t variant, uint64_t subvariant, uint64_t extra, RocmonSmiMeasureFunc measureFunc) +{ + // Add new event list to map (if 
not already present) + bstring label = _rocmon_smi_build_label(type, smifunc, variant, subvariant); + RocmonSmiEventList* list; + if (get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list) < 0) + { + // Allocate memory for event list + list = (RocmonSmiEventList*) malloc(sizeof(RocmonSmiEventList)); + if (list == NULL) + { + ERROR_PRINT(Failed to allocate memory for SMI event list %s, name); + return -ENOMEM; + } + list->entries = NULL; + list->numEntries = 0; + + add_smap(rocmon_context->smiEvents, bdata(label), list); + } + bdestroy(label); + + // Allocate memory for another event in list + list->numEntries++; + list->entries = (RocmonSmiEvent*) realloc(list->entries, list->numEntries * sizeof(RocmonSmiEvent)); + if (list->entries == NULL) + { + ERROR_PRINT(Failed to allocate memory for SMI event %s, name); + return -ENOMEM; + } + + // Set event properties + RocmonSmiEvent* event = &list->entries[list->numEntries-1]; + strncpy(event->name, name, sizeof(event->name)); + event->name[sizeof(event->name)] = '\0'; + event->type = type; + event->variant = variant; + event->subvariant = subvariant; + event->extra = extra; + event->instances = 0; // gets set when scanning supported device functions + event->measureFunc = measureFunc; + + return 0; +} + + +static void +_rcomon_smi_free_event_list(void* vlist) +{ + RocmonSmiEventList* list = (RocmonSmiEventList*)vlist; + if (list) + { + FREE_IF_NOT_NULL(list->entries); + free(list); + } +} + + +static int +_rocmon_smi_init_events() +{ + int ret; + + // Init map + ret = init_map(&rocmon_context->smiEvents, MAP_KEY_TYPE_STR, 0, &_rcomon_smi_free_event_list); + if (ret < 0) + { + ERROR_PRINT(Failed to create map for ROCm SMI events); + return -1; + } + + // Add events + ADD_SMI_EVENT_N("PCI_THROUGHPUT_SENT", "rsmi_dev_pci_throughput_get", 0, &_smi_wrapper_pci_throughput_get ); + ADD_SMI_EVENT_N("PCI_THROUGHPUT_RECEIVED", "rsmi_dev_pci_throughput_get", 1, &_smi_wrapper_pci_throughput_get ); + 
ADD_SMI_EVENT_N("PCI_THROUGHPUT_MAX_PKT_SZ", "rsmi_dev_pci_throughput_get", 2, &_smi_wrapper_pci_throughput_get ); + ADD_SMI_EVENT_N("PCI_REPLAY_COUNTER", "rsmi_dev_pci_replay_counter_get", 0, &_smi_wrapper_pci_replay_counter_get ); + ADD_SMI_EVENT_I("POWER_AVE", "rsmi_dev_power_ave_get", 0, &_smi_wrapper_power_ave_get ); + ADD_SMI_EVENT_V("MEMORY_TOTAL_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VRAM, 0, &_smi_wrapper_memory_total_get ); + ADD_SMI_EVENT_V("MEMORY_TOTAL_VIS_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_total_get ); + ADD_SMI_EVENT_V("MEMORY_TOTAL_GTT", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_total_get ); + ADD_SMI_EVENT_V("MEMORY_USAGE_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VRAM, 0, &_smi_wrapper_memory_usage_get ); + ADD_SMI_EVENT_V("MEMORY_USAGE_VIS_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_usage_get ); + ADD_SMI_EVENT_V("MEMORY_USAGE_GTT", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_usage_get ); + ADD_SMI_EVENT_N("MEMORY_BUSY_PERCENT", "rsmi_dev_memory_busy_percent_get", 0, &_smi_wrapper_memory_busy_percent_get ); + ADD_SMI_EVENT_N("MEMORY_NUM_RESERVED_PAGES", "rsmi_dev_memory_reserved_pages_get", 0, &_smi_wrapper_memory_reserved_pages_get ); + ADD_SMI_EVENT_I("FAN_RPMS", "rsmi_dev_fan_rpms_get", 0, &_smi_wrapper_fan_rpms_get ); + ADD_SMI_EVENT_I("FAN_SPEED", "rsmi_dev_fan_speed_get", 0, &_smi_wrapper_fan_speed_get ); + ADD_SMI_EVENT_I("FAN_SPEED_MAX", "rsmi_dev_fan_speed_max_get", 0, &_smi_wrapper_fan_speed_max_get ); + ADD_SMI_EVENT_S("TEMP_EDGE", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_EDGE, 0, &_smi_wrapper_temp_metric_get ); + ADD_SMI_EVENT_S("TEMP_JUNCTION", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_JUNCTION, 0, &_smi_wrapper_temp_metric_get ); + ADD_SMI_EVENT_S("TEMP_MEMORY", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, 
RSMI_TEMP_TYPE_MEMORY, 0, &_smi_wrapper_temp_metric_get ); + ADD_SMI_EVENT_S("VOLT_VDDGFX", "rsmi_dev_volt_metric_get", RSMI_VOLT_CURRENT, RSMI_VOLT_TYPE_VDDGFX, 0, &_smi_wrapper_volt_metric_get ); + ADD_SMI_EVENT_N("OVERDRIVE_LEVEL", "rsmi_dev_overdrive_level_get", 0, &_smi_wrapper_overdrive_level_get ); + ADD_SMI_EVENT_V("ECC_COUNT_UMC_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_UMC_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SDMA_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SDMA_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_GFX_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_GFX_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_HDP_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 0, 
&_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_HDP_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_DF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_DF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SMN_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SMN_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SEM_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SEM_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP0_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP0_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP1_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP1_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_FUSE_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_FUSE_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 1, &_smi_wrapper_ecc_count_get ); + 
ADD_SMI_EVENT_V("ECC_COUNT_LAST_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_LAST_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_N("PROCS_USING_GPU", "rsmi_compute_process_info_get", 0, &_smi_wrapper_compute_process_info_get ); + + return 0; +} + + +int +rocmon_v1_init(int numGpus, const int* gpuIds) +{ + hsa_status_t status; + + // check if already initialized + if (rocmon_initialized) + { + return 0; + } + if (rocmon_context != NULL) + { + return -EEXIST; + } + + // Validate arguments + if (numGpus <= 0) + { + ERROR_PRINT(Number of gpus must be greater than 0 but only %d given, numGpus); + return -EINVAL; + } + + // Initialize other parts + init_configuration(); + + // initialize libraries + int ret = _rocmon_link_libraries(); + if (ret < 0) + { + ERROR_PLAIN_PRINT(Failed to initialize libraries); + return ret; + } + + // Allocate memory for context + rocmon_context = (RocmonContext*) malloc(sizeof(RocmonContext)); + if (rocmon_context == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate Rocmon context); + return -ENOMEM; + } + rocmon_context->groups = NULL; + rocmon_context->numGroups = 0; + rocmon_context->numActiveGroups = 0; + + rocmon_context->devices = (RocmonDevice*) malloc(numGpus * sizeof(RocmonDevice)); + rocmon_context->numDevices = numGpus; + if (rocmon_context->devices == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate set of GPUs); + free(rocmon_context); + rocmon_context = NULL; + return -ENOMEM; + } + + // init hsa library + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing HSA); + ROCM_CALL(hsa_init, (), + { + ERROR_PLAIN_PRINT(Failed to init hsa library); + goto rocmon_init_hsa_failed; + }); + + // init rocm smi library + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RSMI); + RSMI_CALL(rsmi_init, (0), + { + ERROR_PLAIN_PRINT(Failed to init rocm_smi); + goto rocmon_init_rsmi_failed; + }); + + // Get hsa 
timestamp factor + uint64_t frequency_hz; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Getting HSA timestamp factor); + ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &frequency_hz), + { + ERROR_PLAIN_PRINT(Failed to get HSA timestamp factor); + goto rocmon_init_info_agents_failed; + }); + rocmon_context->hsa_timestamp_factor = (long double)1000000000 / (long double)frequency_hz; + + // initialize structures for specified devices (fetch ROCm specific info) + iterate_agents_cb_arg arg = { + .context = rocmon_context, + .numGpus = numGpus, + .gpuIds = gpuIds, + }; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Iterating through %d available agents, numGpus); + ROCM_CALL(hsa_iterate_agents, (_rocmon_iterate_agents_callback, &arg), + { + ERROR_PRINT(Error while iterating through available agents); + goto rocmon_init_info_agents_failed; + }); + + // Get available SMI events for devices + _rocmon_smi_init_events(); + for (int i = 0; i < rocmon_context->numDevices; i++) + { + if (_rocmon_smi_get_functions(&rocmon_context->devices[i]) < 0) + { + ERROR_PRINT(Failed to get SMI functions for device %d, rocmon_context->devices[i].deviceId); + goto rocmon_init_info_agents_failed; + } + } + + rocmon_initialized = TRUE; + return 0; +rocmon_init_info_agents_failed: + RSMI_CALL(rsmi_shut_down, (), { + // fall through + }); +rocmon_init_rsmi_failed: + ROCM_CALL(hsa_shut_down, (), { + // fall through + }); +rocmon_init_hsa_failed: + free(rocmon_context->devices); + free(rocmon_context); + rocmon_context = NULL; + return -1; +} + + +void +rocmon_v1_finalize(void) +{ + RocmonContext* context = rocmon_context; + + if (!rocmon_initialized) + { + return; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Finalize LIKWID ROCMON); + + if (context) + { + if (context->devices) + { + // Free each devices fields + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + FREE_IF_NOT_NULL(device->rocMetrics); + FREE_IF_NOT_NULL(device->activeRocEvents); + 
FREE_IF_NOT_NULL(device->activeSmiEvents); + if (device->groupResults) + { + // Free events of event result lists (index with the group counter j, not the device counter i) + for (int j = 0; j < device->numGroupResults; j++) + { + FREE_IF_NOT_NULL(device->groupResults[j].results); + } + // Free list + free(device->groupResults); + } + if (device->context) + { + ROCM_CALL(rocprofiler_close, (device->context),); + } + destroy_smap(device->smiMetrics); + } + + free(context->devices); + context->devices = NULL; + } + + FREE_IF_NOT_NULL(context->groups); + destroy_smap(context->smiEvents); + + free(context); + // Clear the global pointer as well, resetting only the local alias would leave it dangling + context = NULL; + rocmon_context = NULL; + } + + RSMI_CALL(rsmi_shut_down, (), { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown SMI); + // fall through + }); + ROCM_CALL(hsa_shut_down, (), { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown HSA); + // fall through + }); + + // Mark the module as finalized so later API calls fail early instead of using freed memory + rocmon_initialized = FALSE; +} + + +// Parse an event string into a new group and allocate one result list per device. +// On success the id of the new group is stored in *gid and 0 is returned, otherwise a negative errno value. +int +rocmon_v1_addEventSet(const char* eventString, int* gid) +{ + // Check arguments (gid is written on success, so it must not be NULL either) + if (!eventString || !gid) + { + return -EINVAL; + } + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Allocate memory for event group if necessary + if (rocmon_context->numActiveGroups == rocmon_context->numGroups) + { + GroupInfo* tmpInfo = (GroupInfo*) realloc(rocmon_context->groups, (rocmon_context->numGroups+1) * sizeof(GroupInfo)); + if (tmpInfo == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate additional group); + return -ENOMEM; + } + rocmon_context->groups = tmpInfo; + rocmon_context->numGroups++; + } + + // Parse event string + int err = _rocmon_parse_eventstring(eventString, &rocmon_context->groups[rocmon_context->numActiveGroups]); + if (err < 0) + { + return err; + } + + // Allocate memory for event results + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + + // Allocate memory for event results + int numEvents = rocmon_context->groups[rocmon_context->numActiveGroups].nevents; + RocmonEventResult* tmpResults = (RocmonEventResult*) malloc(numEvents * sizeof(RocmonEventResult)); 
+ if (tmpResults == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate event results); + return -ENOMEM; + } + + // Allocate memory for new event result list entry + RocmonEventResultList* tmpGroupResults = (RocmonEventResultList*) realloc(device->groupResults, (device->numGroupResults+1) * sizeof(RocmonEventResultList)); + if (tmpGroupResults == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate new event group result list); + // The result array is not yet owned by the device, release it here + free(tmpResults); + return -ENOMEM; + } + + device->groupResults = tmpGroupResults; + device->groupResults[device->numGroupResults].results = tmpResults; + device->groupResults[device->numGroupResults].numResults = numEvents; + device->numGroupResults++; + } + + *gid = rocmon_context->numActiveGroups; + rocmon_context->numActiveGroups++; + return 0; +} + + +// (Re-)create the rocprofiler context of a device for the given event names. +// Returns 0 on success (also if there are no events), -ENOMEM or -1 on failure. +static int +_rocmon_setupCounters_rocprofiler(RocmonDevice* device, const char** events, int numEvents) +{ + // Close previous rocprofiler context + if (device->context) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Closing previous rocprofiler context); + ROCM_CALL(rocprofiler_close, (device->context), return -1); + // Forget the closed handle so start/stop/finalize do not use it again + device->context = NULL; + } + + // Look if there are any events + if (numEvents <= 0) + { + // Drop the events of the previous group, the SMI results are indexed behind numActiveRocEvents + FREE_IF_NOT_NULL(device->activeRocEvents); + device->numActiveRocEvents = 0; + return 0; + } + + // Create feature array to monitor (zeroed, only kind and name are set explicitly below) + rocprofiler_feature_t* features = (rocprofiler_feature_t*) calloc(numEvents, sizeof(rocprofiler_feature_t)); + if (features == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate feature list); + return -ENOMEM; + } + for (int i = 0; i < numEvents; i++) + { + features[i].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[i].name = events[i]; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP EVENT %d %s, i, events[i]); + } + + // Free previous feature array if present + FREE_IF_NOT_NULL(device->activeRocEvents); + + device->numActiveRocEvents = numEvents; + device->activeRocEvents = features; + + // Open context + rocprofiler_properties_t properties = {}; + properties.queue_depth = 128; + uint32_t mode = ROCPROFILER_MODE_STANDALONE | ROCPROFILER_MODE_CREATEQUEUE | ROCPROFILER_MODE_SINGLEGROUP; + + // 
Important: only a single profiling group is supported at this time which limits the number of events that can be monitored at a time. + ROCM_CALL(rocprofiler_open, (device->hsa_agent, device->activeRocEvents, device->numActiveRocEvents, &device->context, mode, &properties), return -1); + + return 0; +} + + +// Resolve the given SMI event names (optionally with instance suffix like EVENT[0]) for a device +// and store them as the device's active SMI events. Returns 0 on success, -ENOMEM or -EINVAL on failure. +static int +_rocmon_setupCounters_smi(RocmonDevice* device, const char** events, int numEvents) +{ + int ret; + const int instanceNumLen = 5; + + // Delete previous events (free the old array, dropping only the pointer would leak it) + if (device->activeSmiEvents) + { + free(device->activeSmiEvents); + device->activeSmiEvents = NULL; + device->numActiveSmiEvents = 0; + } + + // Look if there are any events + if (numEvents <= 0) + { + return 0; + } + + // Create event array + RocmonSmiEvent* activeEvents = (RocmonSmiEvent*) malloc(numEvents * sizeof(RocmonSmiEvent)); + if (activeEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate active event list); + return -ENOMEM; + } + + for (int i = 0; i < numEvents; i++) + { + char eventName[membersize(RocmonSmiEvent, name)]; + int instance = -1; + + // Parse event name -> normal event vs one with multiple instances (EVENT[0]) + const char* event = events[i]; + char* instancePart = strrchr(event, '['); + if (instancePart != NULL) + { + char withoutBrackets[instanceNumLen+1]; // +1 is '\0' + int partlen = strlen(instancePart); + + // Reject a bracket without a number (EVENT[ or EVENT[]), the copy length below would be negative or zero + if (partlen - 2 < 1) + { + ERROR_PRINT(Failed to parse instance number in '%s', event); + free(activeEvents); + return -EINVAL; + } + + // Check if number fit in 'withoutBrackets' + if (partlen - 2 > instanceNumLen) + { + ERROR_PRINT(Instance number in '%s' is too large, event); + free(activeEvents); + return -EINVAL; + } + + // Copy instance number without brackets and terminate directly behind the copied digits + strncpy(withoutBrackets, instancePart+1, partlen-2); + withoutBrackets[partlen-2] = '\0'; + + // Parse instance as number + char* endParsed; + instance = strtol(withoutBrackets, &endParsed, 10); + + // Check if parsing was successful + char* endOfString = &withoutBrackets[partlen-2]; + if (endParsed != endOfString) + { + ERROR_PRINT(Failed to parse instance number in '%s', event); + free(activeEvents); + return -EINVAL; + } + + // Copy event name without 
instance + int eventNameLen = instancePart - event; + // The name part must fit into the local buffer including its terminator + if (eventNameLen >= (int) sizeof(eventName)) + { + ERROR_PRINT(Event name in '%s' is too long, event); + free(activeEvents); + return -EINVAL; + } + strncpy(eventName, event, eventNameLen); + eventName[eventNameLen] = '\0'; + } + else + { + // Copy entire event name, strncpy does not terminate when it truncates + strncpy(eventName, event, membersize(RocmonSmiEvent, name)); + eventName[sizeof(eventName) - 1] = '\0'; + } + + // Lookup event in available events + RocmonSmiEvent* metric = NULL; + ret = get_smap_by_key(device->smiMetrics, eventName, (void**)&metric); + if (ret < 0) + { + ERROR_PRINT(RSMI event '%s' not found for device %d, eventName, device->deviceId); + free(activeEvents); + return -EINVAL; + } + + // Copy event + RocmonSmiEvent* tmpEvent = &activeEvents[i]; + memcpy(tmpEvent, metric, sizeof(RocmonSmiEvent)); + + // Check if event supports instances + if (instance >= 0 && tmpEvent->type != ROCMON_SMI_EVENT_TYPE_INSTANCES) + { + ERROR_PRINT(Instance number given but event '%s' does not support one, eventName); + free(activeEvents); + return -EINVAL; + } + + // Check if event requires instances + if (instance < 0 && tmpEvent->type == ROCMON_SMI_EVENT_TYPE_INSTANCES) + { + ERROR_PRINT(No instance number given but event '%s' requires one, eventName); + free(activeEvents); + return -EINVAL; + } + + // Check if event has enough instances + if (instance >= 0 && instance >= metric->instances) + { + ERROR_PRINT(Instance %d seleced but event '%s' has only %d, instance, eventName, metric->instances); + free(activeEvents); + return -EINVAL; + } + + // Set instance number + if (instance >= 0) + { + tmpEvent->subvariant = instance; + } + } + + device->activeSmiEvents = activeEvents; + device->numActiveSmiEvents = numEvents; + + return 0; +} + + +int +rocmon_v1_setupCounters(int gid) +{ + int ret; + + // Ensure rocmon is initialized (checked first, the argument check below reads rocmon_context) + if (!rocmon_initialized || !rocmon_context) + { + return -EFAULT; + } + + // Check arguments + if (gid < 0 || gid >= rocmon_context->numActiveGroups) + { + return -EINVAL; + } + + // Get group info + GroupInfo* group = &rocmon_context->groups[gid]; + + // + // Separate rocprofiler and SMI events + // + const char **smiEvents = NULL, 
**rocEvents = NULL; + int numSmiEvents = 0, numRocEvents = 0; + + // Allocate memory for string arrays + smiEvents = (const char**) malloc(group->nevents * sizeof(const char*)); + if (smiEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate smiEvent name array); + return -ENOMEM; + } + rocEvents = (const char**) malloc(group->nevents * sizeof(const char*)); + if (rocEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate rocEvent name array); + free(smiEvents); + return -ENOMEM; + } + + // Go through each event and sort it + for (int i = 0; i < group->nevents; i++) + { + const char* name = group->events[i]; + if (strncmp(name, "RSMI_", 5) == 0) + { + // RSMI event + smiEvents[numSmiEvents] = name + 5; // +5 removes 'RSMI_' prefix + numSmiEvents++; + } + else if (strncmp(name, "ROCP_", 5) == 0) + { + // Rocprofiler event + rocEvents[numRocEvents] = name + 5; // +5 removes 'ROCP_' prefix + numRocEvents++; + } + else + { + // Unknown event, release the temporary name arrays before bailing out + ERROR_PRINT(Event '%s' has no prefix ('ROCP_' or 'RSMI_'), name); + free(smiEvents); + free(rocEvents); + return -EINVAL; + } + } + + // Add events to each device + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + + // Add rocprofiler events + ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCPROFILER WITH %d events, numRocEvents); + ret = _rocmon_setupCounters_rocprofiler(device, rocEvents, numRocEvents); + if (ret < 0) + { + free(smiEvents); + free(rocEvents); + return ret; + } + + // Add SMI events + ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCM SMI WITH %d events, numSmiEvents); + ret = _rocmon_setupCounters_smi(device, smiEvents, numSmiEvents); + if (ret < 0) + { + free(smiEvents); + free(rocEvents); + return ret; + } + } + rocmon_context->activeGroup = gid; + + // Cleanup (the arrays only hold pointers into the group's event strings) + free(smiEvents); + free(rocEvents); + + return 0; +} + + +static int +_rocmon_startCounters_rocprofiler(RocmonDevice* device) +{ + // Check if there are any counters to start + if (device->numActiveRocEvents <= 0) + { + return 0; + } + 
+ // Reset results + RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; + for (int i = 0; i < device->numActiveRocEvents; i++) + { + RocmonEventResult* result = &groupResult->results[i]; + result->lastValue = 0; + result->fullValue = 0; + } + + if (device->context) + { + ROCM_CALL(rocprofiler_start, (device->context, 0), return -1); + } + + return 0; +} + + +// Take a baseline measurement for every active SMI event of the device and reset its accumulated value. +static int +_rocmon_startCounters_smi(RocmonDevice* device) +{ + // Check if there are any counters to start + if (device->numActiveSmiEvents <= 0) + { + return 0; + } + + // Save baseline values, SMI results are stored behind the rocprofiler results of the group + RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; + for (int i = 0; i < device->numActiveSmiEvents; i++) + { + RocmonSmiEvent* event = &device->activeSmiEvents[i]; + RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; + + // Measure counter + if (event->measureFunc) + { + event->measureFunc(device->deviceId, event, result); + } + + // Reset accumulated value + result->fullValue = 0; + } + + return 0; +} + + +// Start all active rocprofiler and SMI counters on every device using one common start timestamp. +// Returns 0 on success, -EFAULT if rocmon is not initialized or the error of the failing step. +int +rocmon_v1_startCounters(void) +{ + int ret; + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Get timestamp + uint64_t timestamp; + ret = _rocmon_get_timestamp(&timestamp); + if (ret) + { + return ret; + } + + // Start counters on each device + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + device->time.start = timestamp; + device->time.read = timestamp; + + // Start rocprofiler events + ret = _rocmon_startCounters_rocprofiler(device); + if (ret < 0) return ret; + + // Start SMI events (keep the return value, otherwise the check below is dead) + ret = _rocmon_startCounters_smi(device); + if (ret < 0) return ret; + } + + return 0; +} + + +static int +_rocmon_stopCounters_rocprofiler(RocmonDevice* device) +{ + if (device->context) + { + // Stop counting, the context itself is not closed here + ROCM_CALL(rocprofiler_stop, (device->context, 0), return -1); + } + + return 0; +} + + +int 
+rocmon_v1_stopCounters(void) +{ + int ret; + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Read counters + ret = _rocmon_readCounters(&_rocmon_get_stop_time); + if (ret < 0) return ret; + + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + + // Stop rocprofiler events + ret = _rocmon_stopCounters_rocprofiler(device); + if (ret < 0) return ret; + + // Nothing to stop for SMI events + } + + return 0; +} + + +int +rocmon_v1_readCounters(void) +{ + int ret; + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Read counters + ret = _rocmon_readCounters(&_rocmon_get_read_time); + if (ret < 0) return ret; + + return 0; +} + + +double +rocmon_v1_getResult(int gpuIdx, int groupId, int eventId) +{ + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Validate gpuIdx + if (gpuIdx < 0 || gpuIdx >= rocmon_context->numDevices) + { + return -EFAULT; + } + + // Validate groupId + RocmonDevice* device = &rocmon_context->devices[gpuIdx]; + if (groupId < 0 || groupId >= device->numGroupResults) + { + return -EFAULT; + } + + // Validate eventId + RocmonEventResultList* groupResult = &device->groupResults[groupId]; + if (eventId < 0 || eventId >= groupResult->numResults) + { + return -EFAULT; + } + + // Return result + return groupResult->results[eventId].fullValue; +} + + +// TODO: multiple groups +double +rocmon_v1_getLastResult(int gpuIdx, int groupId, int eventId) +{ + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Validate gpuIdx + if (gpuIdx < 0 || gpuIdx >= rocmon_context->numDevices) + { + return -EFAULT; + } + + // Validate groupId + RocmonDevice* device = &rocmon_context->devices[gpuIdx]; + if (groupId < 0 || groupId >= device->numGroupResults) + { + return -EFAULT; + } + + // Validate eventId + RocmonEventResultList* groupResult = 
&device->groupResults[groupId]; + if (eventId < 0 || eventId >= groupResult->numResults) + { + return -EFAULT; + } + + // Return result + return groupResult->results[eventId].lastValue; +} + + +// Build a newly allocated list of all rocprofiler (ROCP_) and SMI (RSMI_) events of one GPU. +// The list is returned through *list and has to be released with rocmon_v1_freeEventsOfGpu. +// Returns 0 on success, -EFAULT if not initialized, -EINVAL for bad arguments, -ENOMEM on allocation failure. +int +rocmon_v1_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list) +{ + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Validate args (valid device indices are 0 .. numDevices-1) + if (gpuIdx < 0 || gpuIdx >= rocmon_context->numDevices) + { + return -EINVAL; + } + if (list == NULL) + { + return -EINVAL; + } + + RocmonDevice* device = &rocmon_context->devices[gpuIdx]; + + // Allocate list structure + EventList_rocm_t tmpList = (EventList_rocm_t) malloc(sizeof(EventList_rocm)); + if (tmpList == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate event list); + return -ENOMEM; + } + + // Get number of events + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Number of events %d + %d, device->numRocMetrics , get_map_size(device->smiMetrics)); + tmpList->numEvents = device->numRocMetrics + get_map_size(device->smiMetrics); + if (tmpList->numEvents == 0) + { + // No events -> return empty list + tmpList->events = NULL; + *list = tmpList; + return 0; + } + + // Allocate event array (zeroed, so entries that are skipped or fail to allocate hold NULL pointers that are safe to free) + tmpList->events = (Event_rocm_t*) calloc(tmpList->numEvents, sizeof(Event_rocm_t)); + if (tmpList->events == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate events for event list); + free(tmpList); + return -ENOMEM; + } + + // Copy rocprofiler event information + for (int i = 0; i < device->numRocMetrics; i++) + { + rocprofiler_info_data_t* event = &device->rocMetrics[i]; + Event_rocm_t* out = &tmpList->events[i]; + int len; + + // Copy name + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Name %s, event->metric.name); + len = strlen(event->metric.name) + 5 /* Prefix */ + 1 /* NULL byte */; + out->name = (char*) malloc(len); + if (out->name) + { + snprintf(out->name, len, "ROCP_%s", event->metric.name); + } + + // Copy description + len = strlen(event->metric.description) + 1 /* NULL byte */; + out->description = (char*) malloc(len); + if (out->description) + { + 
snprintf(out->description, len, "%s", event->metric.description); + } + + // Copy instances + out->instances = event->metric.instances; + } + + // Copy ROCm SMI metric information + for (int i = 0; i < get_map_size(device->smiMetrics); i++) + { + RocmonSmiEvent* event = NULL; + Event_rocm_t* out = &tmpList->events[device->numRocMetrics + i]; + int len; + + // Get event + if (get_smap_by_idx(device->smiMetrics, i, (void**)&event) < 0) + { + continue; + } + + // Copy name + len = strlen(event->name) + 5 /* Prefix */ + 1 /* NULL byte */; + out->name = (char*) malloc(len); + if (out->name) + { + snprintf(out->name, len, "RSMI_%s", event->name); + } + + // Copy description + char* description = "SMI Event"; // TODO: use real descriptions + len = strlen(description) + 1 /* NULL byte */; + out->description = (char*) malloc(len); + if (out->description) + { + snprintf(out->description, len, "%s", description); + } + + // Copy instances + out->instances = event->instances; + } + + *list = tmpList; + return 0; +} + +void +rocmon_v1_freeEventsOfGpu(EventList_rocm_t list) +{ +#define FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } + + // Check pointer + if (list == NULL) + { + return; + } + + if (list->events != NULL) + { + for (int i = 0; i < list->numEvents; i++) + { + Event_rocm_t* event = &list->events[i]; + FREE_IF_NOT_NULL(event->name); + FREE_IF_NOT_NULL(event->description); + } + free(list->events); + } + free(list); +} + + +int +rocmon_v1_switchActiveGroup(int newGroupId) +{ + int ret; + + ret = rocmon_stopCounters(); + if (ret < 0) + { + return ret; + } + + ret = rocmon_setupCounters(newGroupId); + if (ret < 0) + { + return ret; + } + + ret = rocmon_startCounters(); + if (ret < 0) + { + return ret; + } + + return 0; +} + + +int +rocmon_v1_getNumberOfGroups(void) +{ + if (!rocmon_context || !rocmon_initialized) + { + return -EFAULT; + } + return rocmon_context->numActiveGroups; +} + + +int +rocmon_v1_getIdOfActiveGroup(void) +{ + if (!rocmon_context || 
!rocmon_initialized) + { + return -EFAULT; + } + return rocmon_context->activeGroup; +} + + +int +rocmon_v1_getNumberOfGPUs(void) +{ + if (!rocmon_context || !rocmon_initialized) + { + return -EFAULT; + } + return rocmon_context->numDevices; +} + + +int +rocmon_v1_getNumberOfEvents(int groupId) +{ + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return -EFAULT; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + return ginfo->nevents; +} + + +// Number of derived metrics of a group, -EFAULT if uninitialized or groupId is out of range. +int +rocmon_v1_getNumberOfMetrics(int groupId) +{ + // Valid group ids are 0 .. numActiveGroups-1, groupId == numActiveGroups would read behind the last group + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return -EFAULT; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + return ginfo->nmetrics; +} + + +// Longest start-to-stop time over all devices, scaled by 1E-9 (-EFAULT if uninitialized or groupId is out of range). +double +rocmon_v1_getTimeOfGroup(int groupId) +{ + int i = 0; + double t = 0; + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return -EFAULT; + } + for (i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + t = MAX(t, (double)(device->time.stop - device->time.start)); + } + return t*1E-9; +} + + +// Longest read-to-stop time over all devices, scaled by 1E-9 (-EFAULT if uninitialized or groupId is out of range). +double +rocmon_v1_getLastTimeOfGroup(int groupId) +{ + int i = 0; + double t = 0; + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return -EFAULT; + } + for (i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + t = MAX(t, (double)(device->time.stop - device->time.read)); + } + return t*1E-9; +} + + +// Longest start-to-read time over all devices, scaled by 1E-9 (-EFAULT if uninitialized or groupId is out of range). +double +rocmon_v1_getTimeToLastReadOfGroup(int groupId) +{ + int i = 0; + double t = 0; + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return -EFAULT; + } + for (i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + t = MAX(t, (double)(device->time.read - 
device->time.start)); + } + return t*1E-9; +} + + +char* +rocmon_v1_getEventName(int groupId, int eventId) +{ + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + if ((eventId < 0) || (eventId >= ginfo->nevents)) + { + return NULL; + } + return ginfo->events[eventId]; +} + + +char* +rocmon_v1_getCounterName(int groupId, int eventId) +{ + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + if ((eventId < 0) || (eventId >= ginfo->nevents)) + { + return NULL; + } + return ginfo->counters[eventId]; +} + + +char* +rocmon_v1_getMetricName(int groupId, int metricId) +{ + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + if ((metricId < 0) || (metricId >= ginfo->nmetrics)) + { + return NULL; + } + return ginfo->metricnames[metricId]; +} + + +char* +rocmon_v1_getGroupName(int groupId) +{ + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + return ginfo->groupname; +} + + +char* +rocmon_v1_getGroupInfoShort(int groupId) +{ + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + return ginfo->shortinfo; +} + + +char* +rocmon_v1_getGroupInfoLong(int groupId) +{ + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + return ginfo->longinfo; +} + + +int +rocmon_v1_getGroups(char*** 
groups, char*** shortinfos, char*** longinfos) +{ + init_configuration(); + Configuration_t config = get_configuration(); + + return perfgroup_getGroups(config->groupPath, "amd_gpu_v1", groups, shortinfos, longinfos); +} + + +// Release the lists obtained from rocmon_v1_getGroups. Always returns 0. +int +rocmon_v1_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos) +{ + perfgroup_returnGroups(nrgroups, groups, shortinfos, longinfos); + // The function is declared int, flowing off the end would give callers an undefined value + return 0; +} + + + +#endif /* LIKWID_WITH_ROCMON */ From 42ef30ceeabb8941f47d3a2ee92d8f9f2147410e Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 8 Oct 2024 21:18:54 +0200 Subject: [PATCH 10/29] Filter files based on ROCM version check --- Makefile | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 3e2690421..294a34e4f 100644 --- a/Makefile +++ b/Makefile @@ -151,6 +151,12 @@ ifneq ($(ROCM_INTERFACE), true) OBJ := $(filter-out $(BUILD_DIR)/rocmon.o,$(OBJ)) OBJ := $(filter-out $(BUILD_DIR)/rocmon_marker.o,$(OBJ)) OBJ := $(filter-out $(BUILD_DIR)/topology_rocm.o,$(OBJ)) +else +ifeq ($(strip $(ROCM_SDK_CHECK)),0) +OBJ := $(filter-out $(BUILD_DIR)/rocmon_sdk.o,$(OBJ)) +else +OBJ := $(filter-out $(BUILD_DIR)/rocmon_v1.o,$(OBJ)) +endif endif ifeq ($(COMPILER),GCCPOWER) OBJ := $(filter-out $(BUILD_DIR)/topology_cpuid.o,$(OBJ)) @@ -353,10 +359,16 @@ $(BUILD_DIR)/%.o: %.c $(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@ $(Q)$(CC) $(DEBUG_FLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d -$(BUILD_DIR)/rocmon_marker.o: rocmon_marker.c - @echo "===> COMPILE $@" +$(BUILD_DIR)/rocmon_%.o: rocmon_%.c + @echo "===> COMPILE $@ with redefined symbol HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE" $(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@ - $(Q)objcopy --redefine-sym HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE=HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE2 $@ + $(Q)objcopy --redefine-sym 
HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE=HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE_$@ $@ + +$(BUILD_DIR)/rocmon.o: rocmon.c + @echo "===> COMPILE $@ with redefined symbol HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE" + $(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@ + $(Q)objcopy --redefine-sym HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE=HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE_$@ $@ + $(BUILD_DIR)/%.o: %.cc @echo "===> COMPILE $@" From bc1b8d0de47e0c1feec7fe96310d80ea8b2ca789 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 8 Oct 2024 21:29:26 +0200 Subject: [PATCH 11/29] Rename defines in rocmon_v1_types --- src/includes/rocmon_v1_types.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/includes/rocmon_v1_types.h b/src/includes/rocmon_v1_types.h index a126077de..5d06f85d3 100644 --- a/src/includes/rocmon_v1_types.h +++ b/src/includes/rocmon_v1_types.h @@ -1,11 +1,9 @@ /* * ======================================================================================= * - * Filename: nvmon_types.h + * Filename: rocmon_v1_types.h * - * Description: Header File of nvmon module. - * Configures and reads out performance counters - * on NVIDIA GPUs. Supports multi GPUs. + * Description: Header File of rocmon v1 module. 
* * Version: * Released: @@ -29,8 +27,8 @@ * * ======================================================================================= */ -#ifndef LIKWID_ROCMON_TYPES_H -#define LIKWID_ROCMON_TYPES_H +#ifndef LIKWID_ROCMON_V1_TYPES_H +#define LIKWID_ROCMON_V1_TYPES_H #include // #include @@ -148,4 +146,4 @@ typedef struct { int* gpulist; double** counters; } LikwidRocmResults; -#endif /* LIKWID_ROCMON_TYPES_H */ +#endif /* LIKWID_ROCMON_V1_TYPES_H */ From ffa9338d9c49ef8b22557382fbbb265334278c7f Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 8 Oct 2024 21:36:41 +0200 Subject: [PATCH 12/29] Rename groups for v1 and add groups for sdk --- groups/{amd_gpu => amd_gpu_sdk}/GDS.txt | 0 groups/{amd_gpu => amd_gpu_sdk}/MEM.txt | 0 groups/{amd_gpu => amd_gpu_sdk}/PCI.txt | 0 groups/{amd_gpu => amd_gpu_sdk}/POWER.txt | 0 groups/{amd_gpu => amd_gpu_sdk}/SALU.txt | 0 groups/{amd_gpu => amd_gpu_sdk}/SFETCH.txt | 0 groups/{amd_gpu => amd_gpu_sdk}/STALLED.txt | 0 groups/{amd_gpu => amd_gpu_sdk}/UTIL.txt | 0 groups/{amd_gpu => amd_gpu_sdk}/VALU.txt | 0 groups/{amd_gpu => amd_gpu_sdk}/WAVE.txt | 0 groups/amd_gpu_v1/GDS.txt | 15 ++++++++++++++ groups/amd_gpu_v1/MEM.txt | 18 ++++++++++++++++ groups/amd_gpu_v1/PCI.txt | 23 +++++++++++++++++++++ groups/amd_gpu_v1/POWER.txt | 21 +++++++++++++++++++ groups/amd_gpu_v1/SALU.txt | 15 ++++++++++++++ groups/amd_gpu_v1/SFETCH.txt | 15 ++++++++++++++ groups/amd_gpu_v1/STALLED.txt | 19 +++++++++++++++++ groups/amd_gpu_v1/UTIL.txt | 18 ++++++++++++++++ groups/amd_gpu_v1/VALU.txt | 15 ++++++++++++++ groups/amd_gpu_v1/WAVE.txt | 15 ++++++++++++++ 20 files changed, 174 insertions(+) rename groups/{amd_gpu => amd_gpu_sdk}/GDS.txt (100%) rename groups/{amd_gpu => amd_gpu_sdk}/MEM.txt (100%) rename groups/{amd_gpu => amd_gpu_sdk}/PCI.txt (100%) rename groups/{amd_gpu => amd_gpu_sdk}/POWER.txt (100%) rename groups/{amd_gpu => amd_gpu_sdk}/SALU.txt (100%) rename groups/{amd_gpu => amd_gpu_sdk}/SFETCH.txt (100%) rename groups/{amd_gpu => 
amd_gpu_sdk}/STALLED.txt (100%) rename groups/{amd_gpu => amd_gpu_sdk}/UTIL.txt (100%) rename groups/{amd_gpu => amd_gpu_sdk}/VALU.txt (100%) rename groups/{amd_gpu => amd_gpu_sdk}/WAVE.txt (100%) create mode 100644 groups/amd_gpu_v1/GDS.txt create mode 100644 groups/amd_gpu_v1/MEM.txt create mode 100644 groups/amd_gpu_v1/PCI.txt create mode 100644 groups/amd_gpu_v1/POWER.txt create mode 100644 groups/amd_gpu_v1/SALU.txt create mode 100644 groups/amd_gpu_v1/SFETCH.txt create mode 100644 groups/amd_gpu_v1/STALLED.txt create mode 100644 groups/amd_gpu_v1/UTIL.txt create mode 100644 groups/amd_gpu_v1/VALU.txt create mode 100644 groups/amd_gpu_v1/WAVE.txt diff --git a/groups/amd_gpu/GDS.txt b/groups/amd_gpu_sdk/GDS.txt similarity index 100% rename from groups/amd_gpu/GDS.txt rename to groups/amd_gpu_sdk/GDS.txt diff --git a/groups/amd_gpu/MEM.txt b/groups/amd_gpu_sdk/MEM.txt similarity index 100% rename from groups/amd_gpu/MEM.txt rename to groups/amd_gpu_sdk/MEM.txt diff --git a/groups/amd_gpu/PCI.txt b/groups/amd_gpu_sdk/PCI.txt similarity index 100% rename from groups/amd_gpu/PCI.txt rename to groups/amd_gpu_sdk/PCI.txt diff --git a/groups/amd_gpu/POWER.txt b/groups/amd_gpu_sdk/POWER.txt similarity index 100% rename from groups/amd_gpu/POWER.txt rename to groups/amd_gpu_sdk/POWER.txt diff --git a/groups/amd_gpu/SALU.txt b/groups/amd_gpu_sdk/SALU.txt similarity index 100% rename from groups/amd_gpu/SALU.txt rename to groups/amd_gpu_sdk/SALU.txt diff --git a/groups/amd_gpu/SFETCH.txt b/groups/amd_gpu_sdk/SFETCH.txt similarity index 100% rename from groups/amd_gpu/SFETCH.txt rename to groups/amd_gpu_sdk/SFETCH.txt diff --git a/groups/amd_gpu/STALLED.txt b/groups/amd_gpu_sdk/STALLED.txt similarity index 100% rename from groups/amd_gpu/STALLED.txt rename to groups/amd_gpu_sdk/STALLED.txt diff --git a/groups/amd_gpu/UTIL.txt b/groups/amd_gpu_sdk/UTIL.txt similarity index 100% rename from groups/amd_gpu/UTIL.txt rename to groups/amd_gpu_sdk/UTIL.txt diff --git 
a/groups/amd_gpu/VALU.txt b/groups/amd_gpu_sdk/VALU.txt similarity index 100% rename from groups/amd_gpu/VALU.txt rename to groups/amd_gpu_sdk/VALU.txt diff --git a/groups/amd_gpu/WAVE.txt b/groups/amd_gpu_sdk/WAVE.txt similarity index 100% rename from groups/amd_gpu/WAVE.txt rename to groups/amd_gpu_sdk/WAVE.txt diff --git a/groups/amd_gpu_v1/GDS.txt b/groups/amd_gpu_v1/GDS.txt new file mode 100644 index 000000000..39c3446be --- /dev/null +++ b/groups/amd_gpu_v1/GDS.txt @@ -0,0 +1,15 @@ +SHORT GDS Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_GDS +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU GDS rw insts per work-item ROCM0/ROCM1 + +LONG +Formulas: +GPU GDS rw insts per work-item = ROCP_SQ_INSTS_GDS/ROCP_SQ_WAVES +-- +The average number of GDS read or GDS write instructions executed +per work item (affected by flow control). diff --git a/groups/amd_gpu_v1/MEM.txt b/groups/amd_gpu_v1/MEM.txt new file mode 100644 index 000000000..acc63a627 --- /dev/null +++ b/groups/amd_gpu_v1/MEM.txt @@ -0,0 +1,18 @@ +SHORT Memory utilization + +EVENTSET +ROCM0 ROCP_TA_TA_BUSY +ROCM1 ROCP_GRBM_GUI_ACTIVE +ROCM2 ROCP_SE_NUM + +METRICS +GPU memory utilization 100*max(ROCM0,16)/ROCM1/ROCM2 + +LONG +Formulas: +GPU memory utilization = 100*max(ROCP_TA_TA_BUSY,16)/ROCP_GRBM_GUI_ACTIVE/ROCP_SE_NUM +-- +The percentage of GPUTime the memory unit is active. The result includes +the stall time (MemUnitStalled). This is measured with all extra fetches +and writes and any cache or memory effects taken into account. +Value range: 0% to 100% (fetch-bound). 
diff --git a/groups/amd_gpu_v1/PCI.txt b/groups/amd_gpu_v1/PCI.txt new file mode 100644 index 000000000..cefaf307d --- /dev/null +++ b/groups/amd_gpu_v1/PCI.txt @@ -0,0 +1,23 @@ +SHORT PCI Transfers + +EVENTSET +ROCM0 RSMI_PCI_THROUGHPUT_SENT +ROCM1 RSMI_PCI_THROUGHPUT_RECEIVED + + +METRICS +Runtime time +PCI sent ROCM0 +PCI received ROCM1 +PCI send bandwidth 1E-6*ROCM0/time +PCI recv bandwidth 1E-6*ROCM1/time + +LONG +Formulas: +PCI sent = RSMI_PCI_THROUGHPUT_SENT +PCI received = RSMI_PCI_THROUGHPUT_RECEIVED +PCI send bandwidth = 1E-6*RSMI_PCI_THROUGHPUT_SENT/runtime +PCI recv bandwidth = 1E-6*RSMI_PCI_THROUGHPUT_RECEIVED/runtime +-- +Currently not usable since the RSMI_PCI_THROUGHPUT_* events require +one second per call, so 2 seconds for both of them. diff --git a/groups/amd_gpu_v1/POWER.txt b/groups/amd_gpu_v1/POWER.txt new file mode 100644 index 000000000..49830efc0 --- /dev/null +++ b/groups/amd_gpu_v1/POWER.txt @@ -0,0 +1,21 @@ +SHORT Power, temperature and voltage + +EVENTSET +ROCM0 RSMI_POWER_AVE[0] +ROCM1 RSMI_TEMP_EDGE +ROCM2 RSMI_VOLT_VDDGFX + + +METRICS +Power average 1E-6*ROCM0 +Edge temperature 1E-3*ROCM1 +Voltage 1E-3*ROCM2 + +LONG +Formulas: +Power average = RSMI_POWER_AVE[0] +Edge temperature = 1E-3*RSMI_TEMP_EDGE +Voltage = 1E-3*RSMI_VOLT_VDDGFX +-- +Gets the current average power consumption in watts, the +temperature in celsius and the voltage in volts. diff --git a/groups/amd_gpu_v1/SALU.txt b/groups/amd_gpu_v1/SALU.txt new file mode 100644 index 000000000..a693421d1 --- /dev/null +++ b/groups/amd_gpu_v1/SALU.txt @@ -0,0 +1,15 @@ +SHORT SALU Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_SALU +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU SALU insts per work-item ROCM0/ROCM1 + +LONG +Formulas: +GPU SALU insts per work-item = ROCP_SQ_INSTS_SALU/ROCP_SQ_WAVES +-- +The average number of scalar ALU instructions executed per work-item +(affected by flow control). 
diff --git a/groups/amd_gpu_v1/SFETCH.txt b/groups/amd_gpu_v1/SFETCH.txt new file mode 100644 index 000000000..bd0dfc3ff --- /dev/null +++ b/groups/amd_gpu_v1/SFETCH.txt @@ -0,0 +1,15 @@ +SHORT SFetch Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_SMEM +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU SFETCH insts per work-item ROCM0/ROCM1 + +LONG +Formulas: +GPU SFETCH insts per work-item = ROCP_SQ_INSTS_SMEM/ROCP_SQ_WAVES +-- +The average number of scalar fetch instructions from the video memory +executed per work-item (affected by flow control). diff --git a/groups/amd_gpu_v1/STALLED.txt b/groups/amd_gpu_v1/STALLED.txt new file mode 100644 index 000000000..9d6dc42c4 --- /dev/null +++ b/groups/amd_gpu_v1/STALLED.txt @@ -0,0 +1,19 @@ +SHORT ALU stalled by LDS + +EVENTSET +ROCM0 ROCP_SQ_WAIT_INST_LDS +ROCM1 ROCP_SQ_WAVES +ROCM2 ROCP_GRBM_GUI_ACTIVE + +METRICS +GPU ALD stalled 100*ROCM0*4/ROCM1/ROCM2 + +LONG +Formulas: +GPU ALD stalled = 100*ROCP_SQ_WAIT_INST_LDS*4/ROCP_SQ_WAVES/ROCP_GRBM_GUI_ACTIVE +-- +The percentage of GPUTime ALU units are stalled by the LDS input queue +being full or the output queue being not ready. If there are LDS bank +conflicts, reduce them. Otherwise, try reducing the number of LDS +accesses if possible. +Value range: 0% (optimal) to 100% (bad). diff --git a/groups/amd_gpu_v1/UTIL.txt b/groups/amd_gpu_v1/UTIL.txt new file mode 100644 index 000000000..7d9271e11 --- /dev/null +++ b/groups/amd_gpu_v1/UTIL.txt @@ -0,0 +1,18 @@ +SHORT GPU utilization + +EVENTSET +ROCM0 ROCP_GRBM_COUNT +ROCM1 ROCP_GRBM_GUI_ACTIVE + + +METRICS +GPU utilization 100*ROCM1/ROCM0 + + +LONG +Formulas: +GPU utilization = 100*ROCP_GRBM_GUI_ACTIVE/ROCP_GRBM_COUNT +-- +This group resembles the 'GPUBusy' metric provided by RocProfiler. +We should add that we can select the GPUBusy metric directly and the +calculations are done internally in case the metric formula changes.
diff --git a/groups/amd_gpu_v1/VALU.txt b/groups/amd_gpu_v1/VALU.txt new file mode 100644 index 000000000..5d57b9b20 --- /dev/null +++ b/groups/amd_gpu_v1/VALU.txt @@ -0,0 +1,15 @@ +SHORT VALU Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_VALU +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU VALU insts per work-item ROCM0/ROCM1 + +LONG +Formulas: +GPU VALU insts per work-item = ROCP_SQ_INSTS_VALU/ROCP_SQ_WAVES +-- +The average number of vector ALU instructions executed per work-item +(affected by flow control). diff --git a/groups/amd_gpu_v1/WAVE.txt b/groups/amd_gpu_v1/WAVE.txt new file mode 100644 index 000000000..fe8914ae1 --- /dev/null +++ b/groups/amd_gpu_v1/WAVE.txt @@ -0,0 +1,15 @@ +SHORT Wavefronts + +EVENTSET +ROCM0 ROCP_SQ_WAVES + + +METRICS +GPU wavefronts ROCM0 + + +LONG +Formulas: +GPU wavefronts = ROCP_SQ_WAVES +-- +Total Wavefronts From 82c33e9de3774fece73d8d455ee6aef4badac945 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 8 Oct 2024 21:37:26 +0200 Subject: [PATCH 13/29] Add skeleton for rocmon sdk --- src/includes/rocmon_sdk.h | 64 ++++++++ src/includes/rocmon_sdk_types.h | 35 +++++ src/rocmon_sdk.c | 251 ++++++++++++++++++++++++++++++++ 3 files changed, 350 insertions(+) create mode 100644 src/includes/rocmon_sdk.h create mode 100644 src/includes/rocmon_sdk_types.h create mode 100644 src/rocmon_sdk.c diff --git a/src/includes/rocmon_sdk.h b/src/includes/rocmon_sdk.h new file mode 100644 index 000000000..82b15b3ff --- /dev/null +++ b/src/includes/rocmon_sdk.h @@ -0,0 +1,64 @@ +/* + * ======================================================================================= + * + * Filename: rocmon_sdk.h + * + * Description: Header File of rocmon module for ROCm >= 6.2. 
+ * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com + * Project: likwid + * + * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + * + * ======================================================================================= + */ +#ifndef LIKWID_ROCMON_SDK_H +#define LIKWID_ROCMON_SDK_H + +int rocmon_sdk_init(int numGpus, const int* gpuIds); +void rocmon_sdk_finalize(void); +int rocmon_sdk_addEventSet(const char* eventString, int* gid); +int rocmon_sdk_setupCounters(int gid); +int rocmon_sdk_startCounters(void); +int rocmon_sdk_stopCounters(void); +int rocmon_sdk_readCounters(void); +double rocmon_sdk_getResult(int gpuIdx, int groupId, int eventId); +double rocmon_sdk_getLastResult(int gpuIdx, int groupId, int eventId); +int rocmon_sdk_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list); +void rocmon_sdk_freeEventsOfGpu(EventList_rocm_t list); +int rocmon_sdk_switchActiveGroup(int newGroupId); +int rocmon_sdk_getNumberOfGroups(void); +int rocmon_sdk_getIdOfActiveGroup(void); +int rocmon_sdk_getNumberOfGPUs(void); +int rocmon_sdk_getNumberOfEvents(int groupId); +int rocmon_sdk_getNumberOfMetrics(int groupId); +double rocmon_sdk_getTimeOfGroup(int groupId); +double rocmon_sdk_getLastTimeOfGroup(int groupId); +double rocmon_sdk_getTimeToLastReadOfGroup(int groupId); +char* rocmon_sdk_getEventName(int 
groupId, int eventId); +char* rocmon_sdk_getCounterName(int groupId, int eventId); +char* rocmon_sdk_getMetricName(int groupId, int metricId); +char* rocmon_sdk_getGroupName(int groupId); +char* rocmon_sdk_getGroupInfoShort(int groupId); +char* rocmon_sdk_getGroupInfoLong(int groupId); +int rocmon_sdk_getGroups(char*** groups, char*** shortinfos, char*** longinfos); +int rocmon_sdk_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos); + + +#endif /* LIKWID_ROCMON_SDK_H */ + diff --git a/src/includes/rocmon_sdk_types.h b/src/includes/rocmon_sdk_types.h new file mode 100644 index 000000000..280edb6c5 --- /dev/null +++ b/src/includes/rocmon_sdk_types.h @@ -0,0 +1,35 @@ +/* + * ======================================================================================= + * + * Filename: rocmon_sdk_types.h + * + * Description: Header File of rocmon sdk module for ROCM >= 6.2 + * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com + * Project: likwid + * + * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ * + * ======================================================================================= + */ +#ifndef LIKWID_ROCMON_SDK_TYPES_H +#define LIKWID_ROCMON_SDK_TYPES_H + + + +#endif /* LIKWID_ROCMON_SDK_TYPES_H */ diff --git a/src/rocmon_sdk.c b/src/rocmon_sdk.c new file mode 100644 index 000000000..7e66a1402 --- /dev/null +++ b/src/rocmon_sdk.c @@ -0,0 +1,251 @@ + /* ======================================================================================= + * + * Filename: rocmon_sdk.c + * + * Description: Main implementation of the performance monitoring module + * for AMD GPUs with ROCm >= 6.2 + * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.roehl@googlemail.com + * Project: likwid + * + * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ * + * ======================================================================================= + */ +#ifdef LIKWID_WITH_ROCMON + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +static bool rocmon_initialized = FALSE; +int likwid_rocmon_verbosity = DEBUGLEV_ONLY_ERROR; + +int +rocmon_sdk_init(int numGpus, const int* gpuIds) +{ + return 0; +} + + +void +rocmon_sdk_finalize(void) +{ + return; +} + + +int +rocmon_sdk_addEventSet(const char* eventString, int* gid) +{ + return 0; +} + +int +rocmon_sdk_setupCounters(int gid) +{ + return 0; +} + + +int +rocmon_sdk_startCounters(void) +{ + return 0; +} + +int +rocmon_sdk_stopCounters(void) +{ + return 0; +} + + +int +rocmon_sdk_readCounters(void) +{ + return 0; +} + + +double +rocmon_sdk_getResult(int gpuIdx, int groupId, int eventId) +{ + return 0.0; +} + + +// TODO: multiple groups +double +rocmon_sdk_getLastResult(int gpuIdx, int groupId, int eventId) +{ + return 0.0; +} + + +int +rocmon_sdk_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list) +{ + return -EINVAL; +} + +void +rocmon_sdk_freeEventsOfGpu(EventList_rocm_t list) +{ + return; +} + + +int +rocmon_sdk_switchActiveGroup(int newGroupId) +{ + return 0; +} + + +int +rocmon_sdk_getNumberOfGroups(void) +{ + return 0; +} + + +int +rocmon_sdk_getIdOfActiveGroup(void) +{ + return 0; +} + + +int +rocmon_sdk_getNumberOfGPUs(void) +{ + return 0; +} + + +int +rocmon_sdk_getNumberOfEvents(int groupId) +{ + return 0; +} + + +int +rocmon_sdk_getNumberOfMetrics(int groupId) +{ + return 0; +} + + +double +rocmon_sdk_getTimeOfGroup(int groupId) +{ + return 0; +} + + +double +rocmon_sdk_getLastTimeOfGroup(int groupId) +{ + return 0; +} + + +double +rocmon_sdk_getTimeToLastReadOfGroup(int groupId) +{ + return 0; +} + + +char* +rocmon_sdk_getEventName(int groupId, int eventId) +{ + return NULL; +} + + +char* +rocmon_sdk_getCounterName(int groupId, int eventId) +{ + return 
NULL; +} + + +char* +rocmon_sdk_getMetricName(int groupId, int metricId) +{ + return NULL; +} + + +char* +rocmon_sdk_getGroupName(int groupId) +{ + return NULL; +} + + +char* +rocmon_sdk_getGroupInfoShort(int groupId) +{ + return NULL; +} + + +char* +rocmon_sdk_getGroupInfoLong(int groupId) +{ + return NULL; +} + + +int +rocmon_sdk_getGroups(char*** groups, char*** shortinfos, char*** longinfos) +{ + init_configuration(); + Configuration_t config = get_configuration(); + + return perfgroup_getGroups(config->groupPath, "amd_gpu_sdk", groups, shortinfos, longinfos); +} + + +int +rocmon_sdk_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos) +{ + perfgroup_returnGroups(nrgroups, groups, shortinfos, longinfos); +} + + +#endif /* LIKWID_WITH_ROCMON */ From eac1c6415426cda9319295c4dfd2c6dec7f1d718 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 22 Oct 2024 12:32:18 +0200 Subject: [PATCH 14/29] Update Rocprofiler SDK support. Not working yet --- src/includes/rocmon.h | 8 + src/includes/rocmon_common_types.h | 228 +++ src/includes/rocmon_sdk.h | 1183 ++++++++++++++- src/includes/rocmon_sdk_types.h | 35 + src/includes/rocmon_smi.h | 1181 +++++++++++++++ src/includes/rocmon_smi_types.h | 81 + src/includes/rocmon_v1.h | 985 +++++++++++- src/includes/rocmon_v1_types.h | 120 +- src/rocmon.c | 891 +++++++++-- src/rocmon_marker.c | 24 +- src/rocmon_sdk.c | 251 --- src/rocmon_v1.c | 2275 ---------------------------- test/test_rocmon.c | 72 + 13 files changed, 4484 insertions(+), 2850 deletions(-) create mode 100644 src/includes/rocmon.h create mode 100644 src/includes/rocmon_common_types.h create mode 100644 src/includes/rocmon_smi.h create mode 100644 src/includes/rocmon_smi_types.h delete mode 100644 src/rocmon_sdk.c delete mode 100644 src/rocmon_v1.c create mode 100644 test/test_rocmon.c diff --git a/src/includes/rocmon.h b/src/includes/rocmon.h new file mode 100644 index 000000000..896138a99 --- /dev/null +++ b/src/includes/rocmon.h @@ -0,0 
+1,8 @@ +#ifndef LIKWID_INTERNAL_ROCMON_H +#define LIKWID_INTERNAL_ROCMON_H + +#include + +GroupInfo* rocmon_get_group(int gid); + +#endif diff --git a/src/includes/rocmon_common_types.h b/src/includes/rocmon_common_types.h new file mode 100644 index 000000000..fe48bc866 --- /dev/null +++ b/src/includes/rocmon_common_types.h @@ -0,0 +1,228 @@ +/* + * ======================================================================================= + * + * Filename: rocmon_common_types.h + * + * Description: Header File of rocmon for v1 and sdk backend. + * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com + * Project: likwid + * + * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ * + * ======================================================================================= + */ +#ifndef LIKWID_ROCMON_COMMON_TYPES_H +#define LIKWID_ROCMON_COMMON_TYPES_H + +#include + +#include +#if AMDSMI_LIB_VERSION_YEAR == 23 && AMDSMI_LIB_VERSION_MAJOR == 4 && AMDSMI_LIB_VERSION_MINOR == 0 && AMDSMI_LIB_VERSION_RELEASE == 0 +typedef struct metrics_table_header_t metrics_table_header_t; +#endif +#include +#include +#include +#ifdef ROCPROFILER_EXPORT +#undef ROCPROFILER_EXPORT +#endif +#ifdef ROCPROFILER_IMPORT +#undef ROCPROFILER_IMPORT +#endif +#ifdef ROCPROFILER_VERSION_MAJOR +#undef ROCPROFILER_VERSION_MAJOR +#endif +#ifdef ROCPROFILER_VERSION_MINOR +#undef ROCPROFILER_VERSION_MINOR +#endif +#ifdef ROCPROFILER_API +#undef ROCPROFILER_API +#endif +#include +#ifdef LIKWID_ROCPROF_SDK +#ifdef ROCPROFILER_EXPORT +#undef ROCPROFILER_EXPORT +#endif +#ifdef ROCPROFILER_IMPORT +#undef ROCPROFILER_IMPORT +#endif +#ifdef ROCPROFILER_VERSION_MAJOR +#undef ROCPROFILER_VERSION_MAJOR +#endif +#ifdef ROCPROFILER_VERSION_MINOR +#undef ROCPROFILER_VERSION_MINOR +#endif +#ifdef ROCPROFILER_API +#undef ROCPROFILER_API +#endif +#include +/*#ifdef ROCPROFILER_EXPORT*/ +/*#undef ROCPROFILER_EXPORT*/ +/*#endif*/ +/*#ifdef ROCPROFILER_IMPORT*/ +/*#undef ROCPROFILER_IMPORT*/ +/*#endif*/ +/*#ifdef ROCPROFILER_VERSION_MAJOR*/ +/*#undef ROCPROFILER_VERSION_MAJOR*/ +/*#endif*/ +/*#ifdef ROCPROFILER_VERSION_MINOR*/ +/*#undef ROCPROFILER_VERSION_MINOR*/ +/*#endif*/ +/*#ifdef ROCPROFILER_API*/ +/*#undef ROCPROFILER_API*/ +/*#endif*/ +#include +#endif + + + +#ifndef ROCMWEAK +#define ROCMWEAK __attribute__(( weak )) +#endif +#ifndef FREE_IF_NOT_NULL +#define FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } +#endif +/*#ifndef ARRAY_COUNT*/ +/*#define ARRAY_COUNT(arr) (sizeof(arr) / sizeof((arr)[0]))*/ +/*#endif*/ +/*#ifndef SIZEOF_STRUCT_MEMBER*/ +/*#define SIZEOF_STRUCT_MEMBER(type, member) (sizeof(((type *) NULL)->member))*/ +/*#endif*/ + +typedef struct { + double 
lastValue; + double fullValue; +} RocmonEventResult; + +typedef struct { + RocmonEventResult* results; // First rocprofiler results, then SMI results + int numResults; +} RocmonEventResultList; + +#include +#include + +typedef struct { + bstring tag; + int groupID; + int gpuCount; + int eventCount; + double* time; + uint32_t* count; + int* gpulist; + double** counters; +} LikwidRocmResults; + +typedef struct { + int deviceId; // LIKWID device id + int rocprof_v1; + int activeGroup; + + // Rocprofiler V1 + hsa_agent_t hsa_agent; // HSA agent handle for this device + rocprofiler_t* v1_context; // Rocprofiler context (has activeEvents configured) +#ifdef LIKWID_ROCPROF_SDK + // Rocprofiler SDK + rocprofiler_agent_t agent; + rocprofiler_context_id_t sdk_context; // Rocprofiler context (has activeEvents configured) + rocprofiler_buffer_id_t buffer; + rocprofiler_callback_thread_t thread; +#endif + + // Available rocprofiler metrics + rocprofiler_info_data_t* v1_rocMetrics; +#ifdef LIKWID_ROCPROF_SDK + rocprofiler_counter_info_v0_t* sdk_rocMetrics; +#endif + int numRocMetrics; + + // Available ROCm SMI events + Map_t smiMetrics; + + // Currently configured rocprofiler events (bound to context) + rocprofiler_feature_t* v1_activeRocEvents; +#ifdef LIKWID_ROCPROF_SDK + rocprofiler_counter_info_v0_t* sdk_activeRocEvents; +#endif + int numActiveRocEvents; + + // Currently configured ROCm SMI events + RocmonSmiEvent* activeSmiEvents; + int numActiveSmiEvents; + + // Results for all events in all event sets + RocmonEventResultList* groupResults; + int numGroupResults; + +#ifdef LIKWID_ROCPROF_SDK + rocprofiler_profile_config_id_t* profiles; + int numProfiles; +#endif + + // Timestamps in ns + struct { + uint64_t start; + uint64_t read; + uint64_t stop; + } time; + + // buffer? 
+} RocmonDevice; + +typedef enum { + ROCMON_STATE_FINALIZED = 0, + ROCMON_STATE_INITIALIZED, + ROCMON_STATE_SETUP, + ROCMON_STATE_RUNNING, + ROCMON_STATE_STOPPED, + MAX_ROCMON_STATE, +} RocmonContextState; +#define MIN_ROCMON_STATE ROCMON_STATE_FINALIZED + +typedef struct { + int numGroups; // Number of allocated groups + int numActiveGroups; // Number of used groups + int activeGroup; // Currently active group + GroupInfo *groups; + + // Devices (HSA agents) + RocmonDevice *devices; + int numDevices; + + // System information + long double hsa_timestamp_factor; // hsa_timestamp * hsa_timestamp_factor = timestamp_in_ns + + // Rocprofiler SDK agents with buffers +#ifdef LIKWID_ROCPROF_SDK + int num_sdk_agents; + RocprofilerSdkAgentData* agents; +#endif + + // ROCm SMI events + Map_t smiEvents; + + // Use legacy rocprofiler v1 + int use_rocprofiler_v1:1; + RocmonContextState state; +} RocmonContext; + +//extern static RocmonContext* rocmon_context; + + +#endif /* LIKWID_ROCMON_COMMON_TYPES_H */ diff --git a/src/includes/rocmon_sdk.h b/src/includes/rocmon_sdk.h index 82b15b3ff..b2d32df87 100644 --- a/src/includes/rocmon_sdk.h +++ b/src/includes/rocmon_sdk.h @@ -30,34 +30,1161 @@ #ifndef LIKWID_ROCMON_SDK_H #define LIKWID_ROCMON_SDK_H -int rocmon_sdk_init(int numGpus, const int* gpuIds); -void rocmon_sdk_finalize(void); -int rocmon_sdk_addEventSet(const char* eventString, int* gid); -int rocmon_sdk_setupCounters(int gid); -int rocmon_sdk_startCounters(void); -int rocmon_sdk_stopCounters(void); -int rocmon_sdk_readCounters(void); -double rocmon_sdk_getResult(int gpuIdx, int groupId, int eventId); -double rocmon_sdk_getLastResult(int gpuIdx, int groupId, int eventId); -int rocmon_sdk_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list); -void rocmon_sdk_freeEventsOfGpu(EventList_rocm_t list); -int rocmon_sdk_switchActiveGroup(int newGroupId); -int rocmon_sdk_getNumberOfGroups(void); -int rocmon_sdk_getIdOfActiveGroup(void); -int rocmon_sdk_getNumberOfGPUs(void); -int 
rocmon_sdk_getNumberOfEvents(int groupId); -int rocmon_sdk_getNumberOfMetrics(int groupId); -double rocmon_sdk_getTimeOfGroup(int groupId); -double rocmon_sdk_getLastTimeOfGroup(int groupId); -double rocmon_sdk_getTimeToLastReadOfGroup(int groupId); -char* rocmon_sdk_getEventName(int groupId, int eventId); -char* rocmon_sdk_getCounterName(int groupId, int eventId); -char* rocmon_sdk_getMetricName(int groupId, int metricId); -char* rocmon_sdk_getGroupName(int groupId); -char* rocmon_sdk_getGroupInfoShort(int groupId); -char* rocmon_sdk_getGroupInfoLong(int groupId); -int rocmon_sdk_getGroups(char*** groups, char*** shortinfos, char*** longinfos); -int rocmon_sdk_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos); +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + + +static int rocmon_sdk_initialized = FALSE; + +static void *rocmon_sdk_dl_profiler_lib = NULL; +static void *rocmon_sdk_dl_hsa_lib = NULL; +//static void *rocmon_sdk_dl_rsmi_lib = NULL; + + +// setup function for rocprofiler sdk +//rocprofiler_tool_configure_result_t* rocprofiler_configure(uint32_t, const char*, uint32_t, rocprofiler_client_id_t*); + +#ifndef ROCM_CALL +#define ROCM_CALL( call, args, handleerror ) \ + do { \ + hsa_status_t _status = (*call##_ptr)args; \ + if (_status != HSA_STATUS_SUCCESS && _status != HSA_STATUS_INFO_BREAK) { \ + const char* err = NULL; \ + fprintf(stderr, "Error: function %s failed with error %d\n", #call, _status); \ + rocprofiler_error_string(&err); \ + fprintf(stderr, "Error: %s\n", err); \ + handleerror; \ + } \ + } while (0) +#endif + +#ifndef ROCPROFILER_CALL +#define ROCPROFILER_CALL( call, args, handleerror ) \ + do { \ + rocprofiler_status_t _status = (*call##_ptr)args; \ + if(_status != ROCPROFILER_STATUS_SUCCESS) \ + { \ + fprintf(stderr, "Error: function %s failed with error %d\n", #call, 
_status); \ + handleerror; \ + } \ + } while (0); +#endif +// fprintf(stderr, "Error: %s\n", (*rocprofiler_get_status_string_ptr)(_status)); \ + +#ifndef DECLARE_ROCPROFILER_SDK +#define DECLARE_ROCPROFILER_SDK(funcname, funcsig) rocprofiler_status_t ROCMWEAK funcname funcsig; rocprofiler_status_t ( *funcname##_ptr ) funcsig; +#endif + + +DECLARE_ROCPROFILER_SDK(rocprofiler_create_context, (rocprofiler_context_id_t*)) +DECLARE_ROCPROFILER_SDK(rocprofiler_create_buffer, (rocprofiler_context_id_t, size_t, size_t, rocprofiler_buffer_policy_t, rocprofiler_buffer_tracing_cb_t, void*, rocprofiler_buffer_id_t*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_query_available_agents, (rocprofiler_agent_version_t, rocprofiler_query_available_agents_cb_t, size_t, void*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_get_timestamp, (rocprofiler_timestamp_t* ts)); +DECLARE_ROCPROFILER_SDK(rocprofiler_query_counter_info, (rocprofiler_counter_id_t, rocprofiler_counter_info_version_id_t, void*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_start_context, (rocprofiler_context_id_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_stop_context, (rocprofiler_context_id_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_create_profile_config, (rocprofiler_agent_id_t, rocprofiler_counter_id_t *, size_t, rocprofiler_profile_config_id_t *)); +DECLARE_ROCPROFILER_SDK(rocprofiler_destroy_profile_config, (rocprofiler_profile_config_id_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_configure_agent_profile_counting_service, (rocprofiler_context_id_t, rocprofiler_buffer_id_t, rocprofiler_agent_id_t, rocprofiler_agent_profile_callback_t, void*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_sample_agent_profile_counting_service, (rocprofiler_context_id_t, rocprofiler_user_data_t, rocprofiler_counter_flag_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_iterate_agent_supported_counters, (rocprofiler_agent_id_t, rocprofiler_available_counters_cb_t, void*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_flush_buffer, (rocprofiler_buffer_id_t)); 
+DECLARE_ROCPROFILER_SDK(rocprofiler_force_configure, (rocprofiler_configure_func_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_destroy_buffer, (rocprofiler_buffer_id_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_context_is_active, (rocprofiler_context_id_t, int*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_create_callback_thread, (rocprofiler_callback_thread_t*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_assign_callback_thread, (rocprofiler_buffer_id_t, rocprofiler_callback_thread_t)); + +const char *rocprofiler_get_status_string(rocprofiler_status_t); +const char * (*rocprofiler_get_status_string_ptr)(rocprofiler_status_t); + +#ifndef DECLAREFUNC_HSA +#define DECLAREFUNC_HSA(funcname, funcsig) hsa_status_t ROCMWEAK funcname funcsig; hsa_status_t ( *funcname##_ptr ) funcsig; +#endif +DECLAREFUNC_HSA(hsa_init, ()); +DECLAREFUNC_HSA(hsa_shut_down, ()); + + +typedef struct { + rocprofiler_agent_t *agents; + int num_agents; +} _rocmon_sdk_count_agents_cb_data; + +rocprofiler_status_t _rocmon_sdk_count_agents_cb(rocprofiler_agent_version_t agents_ver, + const void** agents_arr, + size_t num_agents, + void* udata) +{ + int gpu_agents = 0; + RocmonContext **stat_context = (RocmonContext **)udata; + RocmonContext* context = *stat_context; + RocmonDevice* devices = malloc(num_agents * sizeof(RocmonDevice)); + if (!devices) + { + return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES; + } + memset(devices, 0, num_agents * sizeof(RocmonDevice)); + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Found %d ROCm agents, num_agents); + for(size_t i = 0; i < num_agents; ++i) + { + const rocprofiler_agent_t* in_agent = agents_arr[i]; + if (in_agent->type == ROCPROFILER_AGENT_TYPE_GPU) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding AMD GPU at index %d, gpu_agents); + RocmonDevice* device = &devices[gpu_agents]; + device->agent = (rocprofiler_agent_t)*in_agent; + device->deviceId = in_agent->logical_node_type_id; + gpu_agents++; + } + } + context->devices = devices; + context->numDevices = gpu_agents; + return 
ROCPROFILER_STATUS_SUCCESS; +} + + +typedef struct { + rocprofiler_counter_info_v0_t *counters; + int num_counters; +} _rocmon_sdk_fill_agent_counters_cb_data; + +static void +_rocmon_sdk_free_agent_counters_internal(int num_counters, rocprofiler_counter_info_v0_t* counters) +{ + if ((num_counters < 0) || (!counters)) + { + return; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Freeing %d counters, num_counters); + for (int i = 0; i < num_counters; i++) + { + rocprofiler_counter_info_v0_t* info = &counters[i]; + if (info) + { + if (info->name) free((char*)info->name); + if (info->description) free((char*)info->description); + if (info->block) free((char*)info->block); + if (info->expression) free((char*)info->expression); + } + } + free(counters); +} + + +rocprofiler_status_t +_rocmon_sdk_fill_agent_counters_cb(rocprofiler_agent_id_t agent, + rocprofiler_counter_id_t* counters, + size_t num_counters, + void* udata) +{ + _rocmon_sdk_fill_agent_counters_cb_data *data = (_rocmon_sdk_fill_agent_counters_cb_data*)udata; + + rocprofiler_counter_info_v0_t* out = malloc(num_counters * sizeof(rocprofiler_counter_info_v0_t)); + if (!out) + { + return -ENOMEM; + } + for (int i = 0; i < num_counters; i++) + { + rocprofiler_counter_info_v0_t info; + rocprofiler_status_t stat = (*rocprofiler_query_counter_info_ptr)(counters[i], (rocprofiler_counter_info_version_id_t)ROCPROFILER_COUNTER_INFO_VERSION_0, &info); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to query counter info for %d, i); + for (int j = 0; j < i; j++) + { + free((char*)out[j].name); + free((char*)out[j].description); + } + free(out); + return -EFAULT; + } + //ROCPROFILER_CALL(rocprofiler_query_counter_info, (counters[i], ROCPROFILER_COUNTER_INFO_VERSION_0, &info), + /*{ + free(out); + return -EFAULT; + });*/ + int namelen = strlen(info.name)+1; + int desclen = strlen(info.description)+1; + out[i].name = malloc(namelen * sizeof(char)); + if (!out[i].name) + { + 
_rocmon_sdk_free_agent_counters_internal(i, out); + return -ENOMEM; + } + out[i].description = malloc(desclen * sizeof(char)); + if (!out[i].description) + { + free((char*)out[i].name); + _rocmon_sdk_free_agent_counters_internal(i, out); + return -ENOMEM; + } + out[i].block = malloc((strlen(info.block)+1) * sizeof(char)); + if (!out[i].block) + { + free((char*)out[i].name); + free((char*)out[i].description); + _rocmon_sdk_free_agent_counters_internal(i, out); + return -ENOMEM; + } + out[i].expression = malloc((strlen(info.expression)+1) * sizeof(char)); + if (!out[i].expression) + { + free((char*)out[i].name); + free((char*)out[i].description); + free((char*)out[i].block); + _rocmon_sdk_free_agent_counters_internal(i, out); + return -ENOMEM; + } + int ret = 0; + ret = snprintf((char*)out[i].name, namelen-1, "%s", info.name); + ret = snprintf((char*)out[i].description, desclen-1, "%s", info.description); + out[i].id = info.id; + out[i].is_constant = info.is_constant; + out[i].is_derived = info.is_derived; + } + data->counters = out; + data->num_counters = num_counters; + return ROCPROFILER_STATUS_SUCCESS; +} + +int _rocmon_sdk_fill_agent_counters(RocmonDevice *device) +{ + _rocmon_sdk_fill_agent_counters_cb_data fill_data = { + .counters = NULL, + .num_counters = 0, + }; + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Getting counters for agent %d, device->deviceId); + rocprofiler_status_t _status = (rocprofiler_iterate_agent_supported_counters_ptr)(device->agent.id, _rocmon_sdk_fill_agent_counters_cb, &fill_data); + if (_status != ROCPROFILER_STATUS_SUCCESS) + { + return -EFAULT; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Agent %d provides %d counters, device->deviceId, fill_data.num_counters); + device->sdk_rocMetrics = fill_data.counters; + device->numRocMetrics = fill_data.num_counters; + + return ROCPROFILER_STATUS_SUCCESS; +} + + +static void +_rocmon_sdk_free_agent_counters(RocmonDevice *device) +{ + if (!device->sdk_rocMetrics) + { + return; + } + 
ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Freeing counters for agent %d, device->deviceId); + _rocmon_sdk_free_agent_counters_internal(device->numRocMetrics, device->sdk_rocMetrics); + device->sdk_rocMetrics = NULL; + device->numRocMetrics = 0; +} + + +typedef struct { + rocprofiler_context_id_t* context; + rocprofiler_agent_t agent; + RocmonEventResultList* result; +} rocmon_sdk_read_buffers_cb; + +static void +_rocmon_sdk_read_buffers(rocprofiler_context_id_t context, + rocprofiler_buffer_id_t buffer, + rocprofiler_record_header_t** headers, + size_t num_headers, + void* udata, + uint64_t) +{ + rocmon_sdk_read_buffers_cb* cbdata = (rocmon_sdk_read_buffers_cb*)udata; + +/* if (cbdata->result->numResults == 0)*/ +/* {*/ +/* cbdata->result->results = malloc(sizeof(RocmonEventResult))*/ +/* }*/ + printf("_rocmon_sdk_read_buffers\n"); + for (int i = 0; i < num_headers; i++) + { + rocprofiler_record_header_t* h = headers[i]; + if(h->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS && h->kind == ROCPROFILER_COUNTER_RECORD_VALUE) + { + rocprofiler_record_counter_t* r = h->payload; + printf("Counter ID %d Value %f Dispatch %ld\n", r->id, r->counter_value, r->dispatch_id); + } + } + + +/* RocmonContext* mycontext = *cbdata->context;*/ +/* for (int i = 0; i < mycontext->numDevices; i++)*/ +/* {*/ +/* RocmonDevice* device = &mycontext->devices[i];*/ +/* if (device->agent.id.handle == cbdata->agent.id.handle)*/ +/* {*/ +/* RocmonEventResultList* groupResults = &device->groupResults[device->activeGroup];*/ + +/* for(int i = 0; i < num_headers; ++i)*/ +/* {*/ +/* rocprofiler_record_header_t* h = headers[i];*/ +/* if(h->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS && h->kind == ROCPROFILER_COUNTER_RECORD_VALUE)*/ +/* {*/ +/* rocprofiler_record_counter_t* r = h->payload;*/ +/* if (r->id >= 0 && r->id < groupResults->numResults)*/ +/* {*/ +/* RocmonEventResult* eventResult = &cbdata->result->results[r->id];*/ +/* double diff = r->counter_value - eventResult->fullValue;*/ +/* 
eventResult->lastValue = eventResult->fullValue;*/ +/* eventResult->fullValue += diff;*/ +/* }*/ +/* }*/ +/* }*/ +/* }*/ +/* }*/ + + return; +} + + +int +tool_init(rocprofiler_client_finalize_t fini, void* udata) +{ + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + RocmonContext** stat_context = (RocmonContext**)udata; + RocmonContext* context = *stat_context; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Running tool_init); + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initialize HSA); + hsa_status_t hstat = (*hsa_init_ptr)(); + if (hstat != HSA_STATUS_SUCCESS) + { + return -EFAULT; + } + + //ROCPROFILER_CALL(rocprofiler_query_available_agents, (ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), &agent_count), return -EFAULT;); + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Querying available agents); + stat = (*rocprofiler_query_available_agents_ptr)(ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), udata); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + return -EFAULT; + } + if (context->numDevices == 0) + { + FREE_IF_NOT_NULL(context->devices); + return -1; + } + + for (int i = 0; i < context->numDevices; i++) + { + rocprofiler_context_id_t device_context; + rocprofiler_buffer_id_t buffer; + rocprofiler_callback_thread_t thread; + RocmonDevice* device = &context->devices[i]; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating context for device %d, device->deviceId); + stat = (*rocprofiler_create_context_ptr)(&device_context); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + errno = EFAULT; + ERROR_PRINT(Failed to create context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + FREE_IF_NOT_NULL(context->devices); + return -EFAULT; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating buffer for device %d, device->deviceId); + stat = (*rocprofiler_create_buffer_ptr)(device_context, 100, 50, ROCPROFILER_BUFFER_POLICY_LOSSLESS, _rocmon_sdk_read_buffers, udata, 
&buffer); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + errno = EFAULT; + ERROR_PRINT(Failed to create buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + FREE_IF_NOT_NULL(context->devices); + return -EFAULT; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating callback thread for device %d, device->deviceId); + stat = (*rocprofiler_create_callback_thread_ptr)(&thread); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + errno = EFAULT; + ERROR_PRINT(Failed to create callback thread for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + FREE_IF_NOT_NULL(context->devices); + return -EFAULT; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Assign callback thread to buffer for device %d, device->deviceId); + stat = (*rocprofiler_assign_callback_thread_ptr)(buffer, thread); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + errno = EFAULT; + ERROR_PRINT(Failed to create callback thread for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + FREE_IF_NOT_NULL(context->devices); + return -EFAULT; + } + + device->sdk_context = device_context; + device->buffer = buffer; + device->thread = thread; + } + return 0; +} + + +void +tool_fini(void* udata) +{ + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Running tool_fini); + RocmonContext** stat_context = (RocmonContext**)udata; + RocmonContext* context = *stat_context; + if ((!context) || (!context->devices) || (context->numDevices == 0)) + { + return; + } + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + int active = 0; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Checking context for device %d, device->deviceId); + stat = (*rocprofiler_context_is_active_ptr)(device->sdk_context, &active); + if (active) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Stopping context for device %d, device->deviceId); + stat = 
(*rocprofiler_stop_context_ptr)(device->sdk_context); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to stop context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + } + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Flushing buffer for device %d, device->deviceId); + stat = (*rocprofiler_flush_buffer_ptr)(device->buffer); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to flush buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Destroying buffer for device %d, device->deviceId); + stat = (*rocprofiler_destroy_buffer_ptr)(device->buffer); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to destroy buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + } + _rocmon_sdk_free_agent_counters(device); + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown HSA); + (*hsa_shut_down_ptr)(); +} + +void +_rocmon_sdk_set_profile(rocprofiler_context_id_t context_id, + rocprofiler_agent_id_t agent, + rocprofiler_agent_set_profile_callback_t set_config, + void* udata) +{ + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _rocmon_sdk_set_profile); + RocmonDevice* device = (RocmonDevice*) udata; + if (device->agent.id.handle == agent.handle) + { + if (device->activeGroup >= 0 && device->activeGroup < device->numProfiles) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Setting profile %d for device %d, device->activeGroup, device->deviceId); + set_config(context_id, device->profiles[device->activeGroup]); + } + else + { + ERROR_PRINT(Invalid active group for device %d, device->deviceId); + } + } + else + { + ERROR_PRINT(Mismatch between device %s agent and given agent, device->deviceId); + } + return; +} + + + +static int +_rocmon_sdk_link_libraries() +{ + #define DLSYM_AND_CHECK( dllib, name ) name##_ptr = dlsym( dllib, #name ); if ( dlerror() != NULL ) { ERROR_PRINT(Failed to link #name); return -1; } 
+ ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm SDK libraries); + dlerror(); + // Need to link in the ROCm HSA libraries + rocmon_sdk_dl_hsa_lib = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_sdk_dl_hsa_lib) + { + ERROR_PRINT(ROCm HSA library libhsa-runtime64.so not found: %s, dlerror()); + return -1; + } + + // Need to link in the Rocprofiler libraries + rocmon_sdk_dl_profiler_lib = dlopen("librocprofiler-sdk.so", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_sdk_dl_profiler_lib) + { + // Delete last error + dlerror(); + rocmon_sdk_dl_profiler_lib = dlopen("librocprofiler-sdk.so.1", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_sdk_dl_profiler_lib) + { + ERROR_PRINT(Rocprofiler library librocprofiler-sdk.so not found: %s, dlerror()); + return -1; + } + } + + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_context); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_get_status_string); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_buffer); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_query_available_agents); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_get_timestamp); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_start_context); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_stop_context); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_profile_config); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_destroy_profile_config); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_configure_agent_profile_counting_service); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_iterate_agent_supported_counters); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_flush_buffer); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_query_counter_info); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_sample_agent_profile_counting_service); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, 
rocprofiler_force_configure); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_destroy_buffer); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_context_is_active); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_callback_thread); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_assign_callback_thread); + + DLSYM_AND_CHECK(rocmon_sdk_dl_hsa_lib, hsa_init); + DLSYM_AND_CHECK(rocmon_sdk_dl_hsa_lib, hsa_shut_down); + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm SDK libraries done); + return 0; +} + + +rocprofiler_tool_configure_result_t* +rocprofiler_configure(uint32_t version, + const char* runtime_version, + uint32_t priority, + rocprofiler_client_id_t* client_id) +{ + client_id->name = "LIKWID"; + static rocprofiler_tool_configure_result_t config_result = { + .size = sizeof(rocprofiler_tool_configure_result_t), + .initialize = tool_init, + .finalize = tool_fini, + .tool_data = &rocmon_context, + }; + return &config_result; +} + +int +rocmon_sdk_init(RocmonContext* context, int numGpus, const int* gpuIds) +{ + int ret = 0; + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + if ((numGpus < 0) || (!gpuIds) || (!context)) + { + return -EINVAL; + } + if (rocmon_sdk_initialized) + { + return 0; + } + + // initialize libraries + ret = _rocmon_sdk_link_libraries(); + if (ret < 0) + { + //ERROR_PLAIN_PRINT(Failed to initialize libraries); + return ret; + } + + stat = (*rocprofiler_force_configure_ptr)(rocprofiler_configure); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + return -EFAULT; + } + + if (context->numDevices == 0) + { + errno = ENODEV; + ERROR_PRINT(Cannot ROCm GPUs); + return -ENODEV; + } + + RocmonDevice* devices = malloc(numGpus * sizeof(RocmonDevice)); + if (!devices) + { + return -ENOMEM; + } + memset(devices, 0, numGpus * sizeof(RocmonDevice)); + + for (int i = 0; i < numGpus; i++) + { + int idx = -1; + for (int j = 0; j < context->numDevices; j++) + { + RocmonDevice* device = 
&context->devices[j]; + if (gpuIds[i] == device->deviceId) + { + idx = j; + break; + } + } + if (idx >= 0) + { + memcpy(&devices[i], &context->devices[idx], sizeof(RocmonDevice)); + RocmonDevice* out = &devices[i]; +/* RocmonDevice* in = &context->devices[idx];*/ +/* out->agent = in->agent;*/ +/* printf("%d -> %d\n", in->agent.id.handle, out->agent.id.handle);*/ +/* out->thread = in->thread;*/ +/* out->buffer = in->buffer;*/ +/* printf("%d -> %d\n", in->buffer.handle, out->buffer.handle);*/ +/* out->sdk_context = in->sdk_context;*/ +/* printf("%d -> %d\n", in->sdk_context.handle, out->sdk_context.handle);*/ + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Fill agent counters for device %d, out->deviceId); + ret = _rocmon_sdk_fill_agent_counters(out); + if (ret < 0) + { + errno = -ret; + ERROR_PRINT(Failed to fill events for device %d: %s, out->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + } + } + else + { + errno = ENODEV; + ERROR_PRINT(Cannot find ROCm GPU %d, gpuIds[i]); + free(devices); + return -ENODEV; + } + } + free(context->devices); + context->devices = devices; + context->numDevices = numGpus; + + rocmon_sdk_initialized = TRUE; + return 0; +} + + +void +rocmon_sdk_finalize(RocmonContext* context) +{ + if (context) + { + if (context->devices) + { + for (int i = 0; i < context->numDevices; i++) + { + //free device i + RocmonDevice* dev = &context->devices[i]; + if (dev->sdk_activeRocEvents) + { + free(dev->sdk_activeRocEvents); + dev->sdk_activeRocEvents = NULL; + dev->numActiveRocEvents = 0; + } + if (dev->sdk_rocMetrics) + { + _rocmon_sdk_free_agent_counters_internal(dev->numRocMetrics, dev->sdk_rocMetrics); + dev->sdk_rocMetrics = NULL; + dev->numRocMetrics = 0; + } + if (dev->profiles) + { + for (int i = 0; i < dev->numProfiles; i++) + { + (*rocprofiler_destroy_profile_config_ptr)(dev->profiles[i]); + } + } + } + } +/* if (context->sdk_agents)*/ +/* {*/ +/* free(context->sdk_agents);*/ +/* context->sdk_agents = NULL;*/ +/* 
free(context->sdk_agent_buffers);*/ +/* context->sdk_agent_buffers = NULL;*/ +/* context->num_sdk_agents = 0;*/ +/* }*/ + } + rocmon_sdk_initialized = 0; + return; +} + + + +static int +_rocmon_setupCounters_rocprofiler_sdk(RocmonDevice* device, const char** events, int numEvents) +{ + rocprofiler_profile_config_id_t profile; + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + if ((!device) || (!events) || (numEvents <= 0)) + { + return -EINVAL; + } + + int num_counters = 0; + rocprofiler_counter_id_t* counters = malloc(numEvents * sizeof(rocprofiler_counter_id_t)); + if (!counters) + { + return -ENOMEM; + } + + for (int i = 0; i < numEvents; i++) + { + int found = -1; + for (int j = 0; j < device->numRocMetrics; j++) + { + rocprofiler_counter_info_v0_t* m = &device->sdk_rocMetrics[j]; + if (strncmp(events[i], m->name, strlen(m->name)) == 0) + { + found = j; + break; + } + } + if (found >= 0) + { + counters[num_counters++] = device->sdk_rocMetrics[found].id; + } + else + { + ERROR_PRINT(Unknown ROCm event %s, events[i]); + } + } + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating profile for %d event(s) for device %d, num_counters, device->deviceId); + stat = (*rocprofiler_create_profile_config_ptr)(device->agent.id, counters, num_counters * sizeof(rocprofiler_counter_id_t), &profile); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to create profile: %s, (*rocprofiler_get_status_string_ptr)(stat)); + FREE_IF_NOT_NULL(counters); + return -ENOMEM; + } + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Increasing profile space to %d for device %d, device->numProfiles + 1, device->deviceId); + rocprofiler_profile_config_id_t* profiles = realloc(device->profiles, (device->numProfiles+1) * sizeof(rocprofiler_profile_config_id_t)); + if (!profiles) + { + (*rocprofiler_destroy_profile_config_ptr)(profile); + FREE_IF_NOT_NULL(counters); + return -ENOMEM; + } + device->profiles = profiles; + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding profile %d at idx %d 
for device %d, device->numProfiles, device->numProfiles, device->deviceId); + device->profiles[device->numProfiles++] = profile; + FREE_IF_NOT_NULL(counters); + return 0; +} + +int +rocmon_sdk_setupCounters(RocmonContext* context, int gid) +{ + int ret = 0; + int numRocEvents = 0; + const char **rocEvents = NULL; + // Check arguments + if (gid < 0 || gid >= context->numActiveGroups) + { + return -EINVAL; + } + + // Ensure rocmon is initialized + if (!rocmon_sdk_initialized) + { + ERROR_PRINT(Rocmon SDK not initialized); + return -EFAULT; + } + + // Get group info + GroupInfo* group = &context->groups[gid]; + + // Allocate memory for string arrays + rocEvents = (const char**) malloc(group->nevents * sizeof(const char*)); + if (rocEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate rocEvent name array); + return -ENOMEM; + } + + // Go through each event and sort it + for (int i = 0; i < group->nevents; i++) + { + const char* name = group->events[i]; + if (strncmp(name, "ROCP_", 5) == 0) + { + // Rocprofiler event + rocEvents[numRocEvents] = name + 5; // +5 removes 'ROCP_' prefix + numRocEvents++; + } + } + if (numRocEvents == 0) + { + free(rocEvents); + return 0; + } + + // Add events to each device + //rocmon_context->activeGroup = gid; + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + + // Add rocprofiler events + ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCPROFILER WITH %d events, numRocEvents); + ret = _rocmon_setupCounters_rocprofiler_sdk(device, rocEvents, numRocEvents); + if (ret < 0) + { + if (rocEvents) free(rocEvents); + return ret; + } + + } + // Cleanup + free(rocEvents); + + return 0; +} + +static int _rocmon_sdk_get_timestamp(uint64_t* timestamp) +{ + rocprofiler_timestamp_t ts; + rocprofiler_status_t stat = (*rocprofiler_get_timestamp_ptr)(&ts); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to get timestamp: %s, (*rocprofiler_get_status_string_ptr)(stat)); + return -EFAULT; + } 
+ + + *timestamp = (uint64_t) ts; + return 0; +} + +static int +_rocmon_startCounters_rocprofiler_sdk(RocmonDevice* device) +{ + int active = 0; + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + //ROCPROFILER_CALL(rocprofiler_configure_agent_profile_counting_service, (device->sdk_context, device->buffer, device->agent.id, _rocmon_sdk_set_profile, NULL), \ + //ROCPROFILER_CALL(rocprofiler_destroy_profile_config, (profile), free(counters); return -EFAULT;); \ + free(counters); return -ENOMEM); + + // if not running + stat = (*rocprofiler_context_is_active)(device->sdk_context, &active); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to check ROCm context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + } + if (!active) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Configuring counting service for device %d, device->deviceId); + stat = (*rocprofiler_configure_agent_profile_counting_service_ptr)(device->sdk_context, device->buffer, device->agent.id, _rocmon_sdk_set_profile, device); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to configure counting service for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + return -EFAULT; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Starting context for device %d, device->deviceId); + stat = (*rocprofiler_start_context_ptr)(device->sdk_context); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to start ROCm context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + return -EFAULT; + } + } + return 0; +} + +int +rocmon_sdk_startCounters(RocmonContext* context) +{ + int ret = 0; + uint64_t timestamp = 0; + // Ensure rocmon is initialized + if (!rocmon_sdk_initialized) + { + ERROR_PRINT(Rocmon SDK not initialized); + return -EFAULT; + } + + // Get timestamp + if (ret = _rocmon_sdk_get_timestamp(×tamp)) + { + return ret; + } + + // Start counters on each device + for (int i = 
0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + device->time.start = timestamp; + device->time.read = timestamp; + + // Start rocprofiler events + ret = _rocmon_startCounters_rocprofiler_sdk(device); + if (ret < 0) return ret; + + } + + return 0; +} + + +static int +_rocmon_stopCounters_rocprofiler_sdk(RocmonDevice* device) +{ + int active = 0; + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Checking context for device %d, device->deviceId); + stat = (*rocprofiler_context_is_active)(device->sdk_context, &active); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to check ROCm context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + } + if (active) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Stopping context for device %d, device->deviceId); + stat = (*rocprofiler_stop_context_ptr)(device->sdk_context); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to stop ROCm context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + } +/* stat = (*rocprofiler_flush_buffer_ptr)(device->buffer);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* ERROR_PRINT(Failed to flush buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* }*/ + } + return 0; +} + +int +rocmon_sdk_stopCounters(RocmonContext* context) +{ + int ret = 0; + uint64_t t = 0; + // Ensure rocmon is initialized + if (!rocmon_sdk_initialized) + { + ERROR_PRINT(Rocmon SDK not initialized); + return -EFAULT; + } + // Read counters + ret = _rocmon_sdk_get_timestamp(&t); + if (ret < 0) + { + return ret; + } + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + + // Stop rocprofiler events + ret = _rocmon_stopCounters_rocprofiler_sdk(device); + if (ret < 0) return ret; + device->time.stop = t; + } + + return 0; +} + +static int 
+_rocmon_readCounters_rocprofiler_sdk(RocmonDevice* device) +{ + int active = 0; + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + // do read + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Checking context for device %d, device->deviceId); + stat = (*rocprofiler_context_is_active)(device->sdk_context, &active); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to check ROCm context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + } + if (active) + { + rocprofiler_user_data_t udata = { + .value = 0, + .ptr = NULL, + }; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Sampling counting service for device %d, device->deviceId); + stat = (*rocprofiler_sample_agent_profile_counting_service_ptr)(device->sdk_context, udata, ROCPROFILER_COUNTER_FLAG_NONE); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to sample counting service for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + return -EFAULT; + } + } +/* stat = (*rocprofiler_flush_buffer_ptr)(device->buffer);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* ERROR_PRINT(Failed to flush buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* return -EFAULT;*/ +/* }*/ + return 0; +} + + +int +rocmon_sdk_readCounters(RocmonContext* context) +{ + int ret = 0; + uint64_t t = 0; + // Ensure rocmon is initialized + if (!rocmon_sdk_initialized) + { + return -EFAULT; + } + ret = _rocmon_sdk_get_timestamp(&t); + if (ret < 0) + { + return ret; + } + + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + // Read counters + ret = _rocmon_readCounters_rocprofiler_sdk(device); + if (ret < 0) return ret; + device->time.read = t; + } + + return 0; +} + + + + +int +rocmon_sdk_getEventsOfGpu(RocmonContext* context, int gpuIdx, EventList_rocm_t* list) +{ + EventList_rocm_t tmpList = NULL; + Event_rocm_t* tmpEventList = NULL; + // Ensure rocmon is 
initialized + if (!rocmon_sdk_initialized) + { + return -EFAULT; + } + // Validate args + if ((gpuIdx < 0) || (gpuIdx > context->numDevices) || (!list)) + { + return -EINVAL; + } + + RocmonDevice* device = &context->devices[gpuIdx]; + + if (*list) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Reusing existing event list); + tmpList = *list; + } + else + { + // Allocate list structure + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Allocate new event list); + EventList_rocm_t tmpList = (EventList_rocm_t) malloc(sizeof(EventList_rocm)); + if (tmpList == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate event list); + return -ENOMEM; + } + tmpList->numEvents = 0; + tmpList->events = NULL; + } + + // Get number of events + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Add %d RocProfiler SDK events, device->numRocMetrics); + if (device->numRocMetrics == 0) + { + // No events -> return list + *list = tmpList; + return 0; + } + // (Re-)Allocate event array + tmpEventList = realloc(tmpList->events, (tmpList->numEvents + device->numRocMetrics) * sizeof(Event_rocm_t)); + if (!tmpEventList) + { + if (!*list) free(tmpList); + ERROR_PLAIN_PRINT(Cannot allocate events for event list); + return -ENOMEM; + } + tmpList->events = tmpEventList; + int startindex = tmpList->numEvents; + + // Copy rocprofiler event information + for (int i = 0; i < device->numRocMetrics; i++) + { + rocprofiler_counter_info_v0_t* event = &device->sdk_rocMetrics[i]; + Event_rocm_t* out = &tmpList->events[startindex + i]; + int len; + + // Copy name + len = strlen(event->name) + 5 /* Prefix */ + 1 /* NULL byte */; + out->name = (char*) malloc(len); + if (out->name) + { + snprintf(out->name, len, "ROCP_%s", event->name); + } + + // Copy description + len = strlen(event->description) + 1 /* NULL byte */; + out->description = (char*) malloc(len); + if (out->description) + { + snprintf(out->description, len, "%s", event->description); + } + tmpList->numEvents++; + } + *list = tmpList; + return 0; +} + + + + +int 
+rocmon_sdk_switchActiveGroup(RocmonContext* context, int newGroupId) +{ + int ret; + + ret = rocmon_sdk_stopCounters(context); + if (ret < 0) + { + return ret; + } + + ret = rocmon_sdk_setupCounters(context, newGroupId); + if (ret < 0) + { + return ret; + } + + ret = rocmon_sdk_startCounters(context); + if (ret < 0) + { + return ret; + } + + return 0; +} + + + #endif /* LIKWID_ROCMON_SDK_H */ diff --git a/src/includes/rocmon_sdk_types.h b/src/includes/rocmon_sdk_types.h index 280edb6c5..7c8da13fb 100644 --- a/src/includes/rocmon_sdk_types.h +++ b/src/includes/rocmon_sdk_types.h @@ -30,6 +30,41 @@ #ifndef LIKWID_ROCMON_SDK_TYPES_H #define LIKWID_ROCMON_SDK_TYPES_H +#include +/*#ifdef ROCPROFILER_EXPORT*/ +/*#undef ROCPROFILER_EXPORT*/ +/*#endif*/ +/*#ifdef ROCPROFILER_IMPORT*/ +/*#undef ROCPROFILER_IMPORT*/ +/*#endif*/ +/*#ifdef ROCPROFILER_VERSION_MAJOR*/ +/*#undef ROCPROFILER_VERSION_MAJOR*/ +/*#endif*/ +/*#ifdef ROCPROFILER_VERSION_MINOR*/ +/*#undef ROCPROFILER_VERSION_MINOR*/ +/*#endif*/ +/*#ifdef ROCPROFILER_API*/ +/*#undef ROCPROFILER_API*/ +/*#endif*/ +#include +/*#ifdef ROCPROFILER_API*/ +/*#undef ROCPROFILER_API*/ +/*#endif*/ +#include + + +typedef struct { + rocprofiler_agent_t* agent; + rocprofiler_buffer_id_t buffer; + rocprofiler_context_id_t context; + RocmonEventResultList *result; +} RocprofilerSdkAgentData; + +typedef struct { + int num_agents; + RocprofilerSdkAgentData* agents; +} RocprofilerSdkData; + #endif /* LIKWID_ROCMON_SDK_TYPES_H */ diff --git a/src/includes/rocmon_smi.h b/src/includes/rocmon_smi.h new file mode 100644 index 000000000..9c959a7fe --- /dev/null +++ b/src/includes/rocmon_smi.h @@ -0,0 +1,1181 @@ +/* + * ======================================================================================= + * + * Filename: rocmon_smi.h + * + * Description: Header File of rocmon module for ROCm SMI. 
+ * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com + * Project: likwid + * + * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + * + * ======================================================================================= + */ +#ifndef LIKWID_ROCMON_SMI_H +#define LIKWID_ROCMON_SMI_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +static void *rocmon_dl_rsmi_lib = NULL; + +static int rocmon_smi_initialized = 0; + +#ifndef RSMI_CALL +#define RSMI_CALL( call, args, handleerror ) \ + do { \ + rsmi_status_t _status = (*call##_ptr)args; \ + if (_status != RSMI_STATUS_SUCCESS) { \ + fprintf(stderr, "Error: function %s failed with error %d.\n", #call, _status); \ + handleerror; \ + } \ + } while (0) +#endif + +#ifndef DECLAREFUNC_SMI +#define DECLAREFUNC_SMI(funcname, funcsig) rsmi_status_t ROCMWEAK funcname funcsig; rsmi_status_t ( *funcname##_ptr ) funcsig; +#endif + +DECLAREFUNC_SMI(rsmi_init, (uint64_t flags)); +DECLAREFUNC_SMI(rsmi_shut_down, ()); +DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_open, (uint32_t dv_ind, rsmi_func_id_iter_handle_t* handle)); +DECLAREFUNC_SMI(rsmi_dev_supported_variant_iterator_open, (rsmi_func_id_iter_handle_t obj_h, 
rsmi_func_id_iter_handle_t* var_iter)); +DECLAREFUNC_SMI(rsmi_func_iter_value_get, (rsmi_func_id_iter_handle_t handle, rsmi_func_id_value_t* value )); +DECLAREFUNC_SMI(rsmi_func_iter_next, (rsmi_func_id_iter_handle_t handle)); +DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_close, (rsmi_func_id_iter_handle_t* handle)); +DECLAREFUNC_SMI(rsmi_dev_power_ave_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* power)); +DECLAREFUNC_SMI(rsmi_dev_pci_throughput_get, (uint32_t dv_ind, uint64_t* sent, uint64_t* received, uint64_t* max_pkt_sz)); +DECLAREFUNC_SMI(rsmi_dev_pci_replay_counter_get, (uint32_t dv_ind, uint64_t* counter)); +DECLAREFUNC_SMI(rsmi_dev_memory_total_get, (uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t* total)); +DECLAREFUNC_SMI(rsmi_dev_memory_usage_get, (uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t* used )); +DECLAREFUNC_SMI(rsmi_dev_memory_busy_percent_get, (uint32_t dv_ind, uint32_t* busy_percent)); +DECLAREFUNC_SMI(rsmi_dev_memory_reserved_pages_get, (uint32_t dv_ind, uint32_t* num_pages, rsmi_retired_page_record_t* records)); +DECLAREFUNC_SMI(rsmi_dev_fan_rpms_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); +DECLAREFUNC_SMI(rsmi_dev_fan_speed_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); +DECLAREFUNC_SMI(rsmi_dev_fan_speed_max_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* max_speed)); +DECLAREFUNC_SMI(rsmi_dev_temp_metric_get, (uint32_t dv_ind, uint32_t sensor_type, rsmi_temperature_metric_t metric, int64_t* temperature)); +DECLAREFUNC_SMI(rsmi_dev_volt_metric_get, (uint32_t dv_ind, rsmi_voltage_type_t sensor_type, rsmi_voltage_metric_t metric, int64_t* voltage)); +DECLAREFUNC_SMI(rsmi_dev_overdrive_level_get, (uint32_t dv_ind, uint32_t* od)); +DECLAREFUNC_SMI(rsmi_dev_ecc_count_get, (uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_error_count_t* ec)); +DECLAREFUNC_SMI(rsmi_compute_process_info_get, (rsmi_process_info_t* procs, uint32_t* num_items)); + + +// 
---------------------------------------------------- +// SMI event wrapper +// ---------------------------------------------------- + +static int +_smi_wrapper_pci_throughput_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t value; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _smi_wrapper_pci_throughput_get(%d, %d), deviceId, event->extra); + // Internal variant: 0 for sent, 1 for received bytes and 2 for max packet size + if (event->extra == 0) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, &value, NULL, NULL), return -1); + else if (event->extra == 1) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, &value, NULL), return -1); + else if (event->extra == 2) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, NULL, &value), return -1); + else return -1; + + result->fullValue += value; + result->lastValue = value; + + return 0; +} + + +static int +_smi_wrapper_pci_replay_counter_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t counter; + RSMI_CALL(rsmi_dev_pci_replay_counter_get, (deviceId, &counter), return -1); + result->fullValue += counter; + result->lastValue = counter; + + return 0; +} + + +static int +_smi_wrapper_power_ave_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t power; + RSMI_CALL(rsmi_dev_power_ave_get, (deviceId, event->subvariant, &power), return -1); + result->fullValue += power; + result->lastValue = power; + + return 0; +} + + +static int +_smi_wrapper_memory_total_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t total; + RSMI_CALL(rsmi_dev_memory_total_get, (deviceId, event->variant, &total), return -1); + result->fullValue += total; + result->lastValue = total; + + return 0; +} + + +static int +_smi_wrapper_memory_usage_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t used; + RSMI_CALL(rsmi_dev_memory_usage_get, (deviceId, event->variant, &used), return -1); + 
result->fullValue += used; + result->lastValue = used; + + return 0; +} + + +static int +_smi_wrapper_memory_busy_percent_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint32_t percent; + RSMI_CALL(rsmi_dev_memory_busy_percent_get, (deviceId, &percent), return -1); + result->fullValue += percent; + result->lastValue = percent; + + return 0; +} + + +static int +_smi_wrapper_memory_reserved_pages_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint32_t num_pages; + RSMI_CALL(rsmi_dev_memory_reserved_pages_get, (deviceId, &num_pages, NULL), return -1); + result->fullValue += num_pages; + result->lastValue = num_pages; + + return 0; +} + + +static int +_smi_wrapper_fan_rpms_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + int64_t speed; + RSMI_CALL(rsmi_dev_fan_rpms_get, (deviceId, event->subvariant, &speed), return -1); + result->fullValue += speed; + result->lastValue = speed; + + return 0; +} + + +static int +_smi_wrapper_fan_speed_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + int64_t speed; + RSMI_CALL(rsmi_dev_fan_speed_get, (deviceId, event->subvariant, &speed), return -1); + result->fullValue += speed; + result->lastValue = speed; + + return 0; +} + + +static int +_smi_wrapper_fan_speed_max_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + int64_t max_speed; + RSMI_CALL(rsmi_dev_fan_speed_max_get, (deviceId, event->subvariant, &max_speed), return -1); + result->fullValue += max_speed; + result->lastValue = max_speed; + + return 0; +} + + +static int +_smi_wrapper_temp_metric_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + int64_t temperature; + RSMI_CALL(rsmi_dev_temp_metric_get, (deviceId, event->subvariant, event->variant, &temperature), return -1); + result->fullValue += temperature; + result->lastValue = temperature; + + return 0; +} + + +static int +_smi_wrapper_volt_metric_get(int deviceId, RocmonSmiEvent* 
event, RocmonEventResult* result) +{ + int64_t voltage; + RSMI_CALL(rsmi_dev_volt_metric_get, (deviceId, event->subvariant, event->variant, &voltage), return -1); + result->fullValue += voltage; + result->lastValue = voltage; + + return 0; +} + + +static int +_smi_wrapper_overdrive_level_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint32_t overdrive; + RSMI_CALL(rsmi_dev_overdrive_level_get, (deviceId, &overdrive), return -1); + result->fullValue += overdrive; + result->lastValue = overdrive; + + return 0; +} + + +static int +_smi_wrapper_ecc_count_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + rsmi_error_count_t error_count; + RSMI_CALL(rsmi_dev_ecc_count_get, (deviceId, event->variant, &error_count), return -1); + + if (event->extra == 0) + { + result->lastValue = error_count.correctable_err - result->fullValue; + result->fullValue = error_count.correctable_err; + } + else if (event->extra == 1) + { + result->lastValue = error_count.uncorrectable_err - result->fullValue; + result->fullValue = error_count.uncorrectable_err; + } + else + { + return -1; + } + + return 0; +} + + +static int +_smi_wrapper_compute_process_info_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint32_t num_items; + RSMI_CALL(rsmi_compute_process_info_get, (NULL, &num_items), return -1); + result->fullValue += num_items; + result->lastValue = num_items; + + return 0; +} + + +static int +_rocmon_smi_link_libraries() +{ + #define DLSYM_AND_CHECK( dllib, name ) name##_ptr = dlsym( dllib, #name ); if ( dlerror() != NULL ) { ERROR_PRINT(Failed to link #name); return -1; } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD SMI libraries); + + // Need to link in the Rocprofiler libraries + rocmon_dl_rsmi_lib = dlopen("librocm_smi64.so", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_dl_rsmi_lib) + { + ERROR_PRINT(ROCm SMI library librocm_smi64.so not found: %s, dlerror()); + return -1; + } + + // Link SMI functions + 
DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_init); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_shut_down); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_supported_func_iterator_open); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_supported_variant_iterator_open); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_func_iter_value_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_func_iter_next); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_supported_func_iterator_close); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_power_ave_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_pci_throughput_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_pci_replay_counter_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_memory_total_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_memory_usage_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_memory_busy_percent_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_memory_reserved_pages_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_fan_rpms_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_fan_speed_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_fan_speed_max_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_temp_metric_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_volt_metric_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_overdrive_level_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_ecc_count_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_compute_process_info_get); + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm libraries done); + return 0; +} + + + + +// ---------------------------------------------------- +// Rocmon SMI helper functions +// ---------------------------------------------------- + +static bstring +_rocmon_smi_build_label(RocmonSmiEventType type, const char* funcname, uint64_t variant, uint64_t subvariant) +{ + switch (type) + { + case ROCMON_SMI_EVENT_TYPE_NORMAL: + return bfromcstr(funcname); + case ROCMON_SMI_EVENT_TYPE_VARIANT: + return bformat("%s|%" 
PRIu64, funcname, variant); + case ROCMON_SMI_EVENT_TYPE_SUBVARIANT: + return bformat("%s|%" PRIu64 "|%" PRIu64, funcname, variant, subvariant); + case ROCMON_SMI_EVENT_TYPE_INSTANCES: + return bfromcstr(funcname); + } +} + + +static int +_rocmon_smi_add_event_to_device(RocmonDevice* device, const char* funcname, RocmonSmiEventType type, int64_t variant, uint64_t subvariant) +{ + int ret; + + // Get event by label + RocmonSmiEventList* list = NULL; + bstring label = _rocmon_smi_build_label(type, funcname, variant, subvariant); + ret = get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list); + bdestroy(label); + if (ret < 0) + { + // Event not registered -> ignore + return 0; + } + + // For events with multiple sensor, only make one entry -> find if one exists + if (type == ROCMON_SMI_EVENT_TYPE_INSTANCES && subvariant > 0) + { + // Get list from map + for (int i = 0; i < list->numEntries; i++) + { + RocmonSmiEvent* event = &list->entries[i]; + RocmonSmiEvent* existingEvent = NULL; + ret = get_smap_by_key(device->smiMetrics, event->name, (void**)&existingEvent); + if (ret < 0) + { + ERROR_PRINT(Failed to find previous instance for event %s, event->name); + return -1; + } + + // Update instance information + existingEvent->instances++; + } + return 0; + } + + for (int i = 0; i < list->numEntries; i++) + { + RocmonSmiEvent* event = &list->entries[i]; + + // Allocate memory for device event description + RocmonSmiEvent* tmpEvent = (RocmonSmiEvent*) malloc(sizeof(RocmonSmiEvent)); + if (tmpEvent == NULL) + { + ERROR_PRINT(Failed to allocate memory for SMI event in device list %s, event->name); + return -ENOMEM; + } + + // Copy information from global description + memcpy(tmpEvent, event, sizeof(RocmonSmiEvent)); + tmpEvent->variant = variant; + tmpEvent->subvariant = subvariant; + tmpEvent->instances = 1; + + // Save event info to device event map + add_smap(device->smiMetrics, tmpEvent->name, tmpEvent); + } + + return 0; +} + + +static int 
+_rocmon_smi_get_function_subvariants(RocmonDevice* device, const char* funcname, uint64_t variant, rsmi_func_id_iter_handle_t var_iter) +{ + rsmi_func_id_iter_handle_t sub_var_iter; + rsmi_func_id_value_t value; + rsmi_status_t status; + int ret; + + // Get open subvariants iterator + status = (*rsmi_dev_supported_variant_iterator_open_ptr)(var_iter, &sub_var_iter); + if (status == RSMI_STATUS_NO_DATA) + { + // No subvariants + ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_VARIANT, variant, 0); + if (ret < 0) return -1; + return 0; + } + + // Subvariants available -> iterate them + do { + // Get subvariant information + (*rsmi_func_iter_value_get_ptr)(sub_var_iter, &value); + + // Process info + if (variant == RSMI_DEFAULT_VARIANT) + ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_INSTANCES, variant, value.id); + else + ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, variant, value.id); + if (ret < 0) return ret; + + // Advance iterator + status = (*rsmi_func_iter_next_ptr)(sub_var_iter); + } while (status != RSMI_STATUS_NO_DATA); + + // Close iterator + (*rsmi_dev_supported_func_iterator_close_ptr)(&sub_var_iter); + + return 0; +} + + +static int +_rocmon_smi_get_function_variants(RocmonDevice* device, const char* funcname, rsmi_func_id_iter_handle_t iter_handle) +{ + rsmi_func_id_iter_handle_t var_iter; + rsmi_func_id_value_t value; + rsmi_status_t status; + int ret; + + // Get open variants iterator + status = (*rsmi_dev_supported_variant_iterator_open_ptr)(iter_handle, &var_iter); + if (status == RSMI_STATUS_NO_DATA) + { + // No variants + ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0); + if (ret < 0) return -1; + return 0; + } + + // Variants available -> iterate them + do { + // Get variant information + (*rsmi_func_iter_value_get_ptr)(var_iter, &value); + + // Get function subvariants + ret = 
_rocmon_smi_get_function_subvariants(device, funcname, value.id, var_iter); + if (ret < 0) return -1; + + // Advance iterator + status = (*rsmi_func_iter_next_ptr)(var_iter); + } while (status != RSMI_STATUS_NO_DATA); + + // Close iterator + (*rsmi_dev_supported_func_iterator_close_ptr)(&var_iter); + + return 0; +} + + +static int +_rocmon_smi_get_functions(RocmonDevice* device) +{ + rsmi_func_id_iter_handle_t iter_handle; + rsmi_func_id_value_t value; + rsmi_status_t status; + int ret; + + // Open iterator + //(*rsmi_dev_supported_func_iterator_open_ptr)(device->deviceId, &iter_handle); + RSMI_CALL(rsmi_dev_supported_func_iterator_open, (device->deviceId, &iter_handle), { + return -1; + }); + + do + { + // Get function information + //(*rsmi_func_iter_value_get_ptr)(iter_handle, &value); + RSMI_CALL(rsmi_func_iter_value_get, (iter_handle, &value), { + ERROR_PRINT(Failed to get smi function value for device %d, device->deviceId); + RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); + return -1; + }); + + // Get function variants + ret = _rocmon_smi_get_function_variants(device, value.name, iter_handle); + if (ret < 0) + { + ERROR_PRINT(Failed to get smi function variants for device %d, device->deviceId); + RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); + return -1; + } + + // Advance iterator (cannot use RSMI_CALL macro here because we have an assignment, + // so we check that the function pointer exists to avoid segfaults.) 
+ if (rsmi_func_iter_next_ptr) { + status = (*rsmi_func_iter_next_ptr)(iter_handle); + } + } while (status != RSMI_STATUS_NO_DATA); + + // Close iterator + //(*rsmi_dev_supported_func_iterator_close_ptr)(&iter_handle); + RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); + + // Add device independent functions + ret = _rocmon_smi_add_event_to_device(device, "rsmi_compute_process_info_get", ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0); + if (ret < 0) return -1; + + return 0; +} + + + +static int +_rocmon_smi_add_event_to_map(char* name, RocmonSmiEventType type, char* smifunc, uint64_t variant, uint64_t subvariant, uint64_t extra, RocmonSmiMeasureFunc measureFunc) +{ + // Add new event list to map (if not already present) + bstring label = _rocmon_smi_build_label(type, smifunc, variant, subvariant); + RocmonSmiEventList* list; + if (get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list) < 0) + { + // Allocate memory for event list + list = (RocmonSmiEventList*) malloc(sizeof(RocmonSmiEventList)); + if (list == NULL) + { + ERROR_PRINT(Failed to allocate memory for SMI event list %s, name); + return -ENOMEM; + } + list->entries = NULL; + list->numEntries = 0; + + add_smap(rocmon_context->smiEvents, bdata(label), list); + } + bdestroy(label); + + // Allocate memory for another event in list + list->numEntries++; + list->entries = (RocmonSmiEvent*) realloc(list->entries, list->numEntries * sizeof(RocmonSmiEvent)); + if (list->entries == NULL) + { + ERROR_PRINT(Failed to allocate memory for SMI event %s, name); + return -ENOMEM; + } + + // Set event properties + RocmonSmiEvent* event = &list->entries[list->numEntries-1]; + strncpy(event->name, name, sizeof(event->name)); + event->name[sizeof(event->name)] = '\0'; + event->type = type; + event->variant = variant; + event->subvariant = subvariant; + event->extra = extra; + event->instances = 0; // gets set when scanning supported device functions + event->measureFunc = measureFunc; + + return 0; +} 
+ +#define ADD_SMI_EVENT(name, type, smifunc, variant, subvariant, extra, measurefunc) if (_rocmon_smi_add_event_to_map(name, type, smifunc, variant, subvariant, extra, measurefunc) < 0) { return -1; } +#define ADD_SMI_EVENT_N(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_NORMAL, smifunc, 0, 0, extra, measurefunc) +#define ADD_SMI_EVENT_V(name, smifunc, variant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_VARIANT, smifunc, variant, 0, extra, measurefunc) +#define ADD_SMI_EVENT_S(name, smifunc, variant, subvariant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, smifunc, variant, subvariant, extra, measurefunc) +#define ADD_SMI_EVENT_I(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_INSTANCES, smifunc, 0, 0, extra, measurefunc) + + +static void +_rcomon_smi_free_event_list(void* vlist) +{ + RocmonSmiEventList* list = (RocmonSmiEventList*)vlist; + if (list) + { + FREE_IF_NOT_NULL(list->entries); + free(list); + } +} + + +static int +_rocmon_smi_init_events(RocmonContext* context) +{ + int ret; + + // Init map + ret = init_map(&context->smiEvents, MAP_KEY_TYPE_STR, 0, &_rcomon_smi_free_event_list); + if (ret < 0) + { + ERROR_PRINT(Failed to create map for ROCm SMI events); + return ret; + } + + // Add events + ADD_SMI_EVENT_N("PCI_THROUGHPUT_SENT", "rsmi_dev_pci_throughput_get", 0, &_smi_wrapper_pci_throughput_get ); + ADD_SMI_EVENT_N("PCI_THROUGHPUT_RECEIVED", "rsmi_dev_pci_throughput_get", 1, &_smi_wrapper_pci_throughput_get ); + ADD_SMI_EVENT_N("PCI_THROUGHPUT_MAX_PKT_SZ", "rsmi_dev_pci_throughput_get", 2, &_smi_wrapper_pci_throughput_get ); + ADD_SMI_EVENT_N("PCI_REPLAY_COUNTER", "rsmi_dev_pci_replay_counter_get", 0, &_smi_wrapper_pci_replay_counter_get ); + ADD_SMI_EVENT_I("POWER_AVE", "rsmi_dev_power_ave_get", 0, &_smi_wrapper_power_ave_get ); + ADD_SMI_EVENT_V("MEMORY_TOTAL_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VRAM, 0, 
&_smi_wrapper_memory_total_get ); + ADD_SMI_EVENT_V("MEMORY_TOTAL_VIS_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_total_get ); + ADD_SMI_EVENT_V("MEMORY_TOTAL_GTT", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_total_get ); + ADD_SMI_EVENT_V("MEMORY_USAGE_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VRAM, 0, &_smi_wrapper_memory_usage_get ); + ADD_SMI_EVENT_V("MEMORY_USAGE_VIS_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_usage_get ); + ADD_SMI_EVENT_V("MEMORY_USAGE_GTT", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_usage_get ); + ADD_SMI_EVENT_N("MEMORY_BUSY_PERCENT", "rsmi_dev_memory_busy_percent_get", 0, &_smi_wrapper_memory_busy_percent_get ); + ADD_SMI_EVENT_N("MEMORY_NUM_RESERVED_PAGES", "rsmi_dev_memory_reserved_pages_get", 0, &_smi_wrapper_memory_reserved_pages_get ); + ADD_SMI_EVENT_I("FAN_RPMS", "rsmi_dev_fan_rpms_get", 0, &_smi_wrapper_fan_rpms_get ); + ADD_SMI_EVENT_I("FAN_SPEED", "rsmi_dev_fan_speed_get", 0, &_smi_wrapper_fan_speed_get ); + ADD_SMI_EVENT_I("FAN_SPEED_MAX", "rsmi_dev_fan_speed_max_get", 0, &_smi_wrapper_fan_speed_max_get ); + ADD_SMI_EVENT_S("TEMP_EDGE", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_EDGE, 0, &_smi_wrapper_temp_metric_get ); + ADD_SMI_EVENT_S("TEMP_JUNCTION", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_JUNCTION, 0, &_smi_wrapper_temp_metric_get ); + ADD_SMI_EVENT_S("TEMP_MEMORY", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_MEMORY, 0, &_smi_wrapper_temp_metric_get ); + ADD_SMI_EVENT_S("VOLT_VDDGFX", "rsmi_dev_volt_metric_get", RSMI_VOLT_CURRENT, RSMI_VOLT_TYPE_VDDGFX, 0, &_smi_wrapper_volt_metric_get ); + ADD_SMI_EVENT_N("OVERDRIVE_LEVEL", "rsmi_dev_overdrive_level_get", 0, &_smi_wrapper_overdrive_level_get ); + ADD_SMI_EVENT_V("ECC_COUNT_UMC_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 0, &_smi_wrapper_ecc_count_get ); + 
ADD_SMI_EVENT_V("ECC_COUNT_UMC_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SDMA_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SDMA_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_GFX_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_GFX_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_HDP_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_HDP_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 1, &_smi_wrapper_ecc_count_get ); + 
ADD_SMI_EVENT_V("ECC_COUNT_DF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_DF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SMN_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SMN_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SEM_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SEM_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP0_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP0_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP1_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP1_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_FUSE_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_FUSE_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_LAST_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_LAST_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_N("PROCS_USING_GPU", "rsmi_compute_process_info_get", 0, &_smi_wrapper_compute_process_info_get ); + + return 0; +} + +static int +_rocmon_setupCounters_smi(RocmonDevice* device, const 
char** events, int numEvents) +{ + int ret; + const int instanceNumLen = 5; + + // Delete previous events + if (device->activeSmiEvents) + { + free(device->activeSmiEvents); + device->activeSmiEvents = NULL; + device->numActiveSmiEvents = 0; + } + + // Look if the are any events + if (numEvents <= 0) + { + return 0; + } + + // Create event array + RocmonSmiEvent* activeEvents = (RocmonSmiEvent*) malloc(numEvents * sizeof(RocmonSmiEvent)); + if (activeEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate active event list); + return -ENOMEM; + } + + for (int i = 0; i < numEvents; i++) + { + char eventName[MAX_ROCMON_SMI_EVENT_NAME]; + int instance = -1; + + // Parse event name -> normal event vs one with multiple instances (EVENT[0]) + const char* event = events[i]; + char* instancePart = strrchr(event, '['); + if (instancePart != NULL) + { + char withoutBrackets[instanceNumLen+1]; // +1 is '\0' + int partlen = strlen(instancePart); + + // Check if number fit in 'withoutBrackets' + if (partlen - 2 > instanceNumLen) + { + ERROR_PRINT(Instance number in '%s' is too large, event); + free(activeEvents); + return -EINVAL; + } + + // Copy instance number without brackets + strncpy(withoutBrackets, instancePart+1, partlen-2); + withoutBrackets[instanceNumLen] = '\0'; + + // Parse instance as number + char* endParsed; + instance = strtol(withoutBrackets, &endParsed, 10); + + // Check if parsing was successful + char* endOfString = &withoutBrackets[partlen-2]; + if (endParsed != endOfString) + { + ERROR_PRINT(Failed to parse instance number in '%s', event); + free(activeEvents); + return -EINVAL; + } + + // Copy event name without instance + int eventNameLen = instancePart - event; + strncpy(eventName, event, eventNameLen); + eventName[eventNameLen] = '\0'; + } + else + { + // Copy entire event name + strncpy(eventName, event, MAX_ROCMON_SMI_EVENT_NAME); + } + + // Lookup event in available events + RocmonSmiEvent* metric = NULL; + ret = get_smap_by_key(device->smiMetrics, 
eventName, (void**)&metric); + if (ret < 0) + { + ERROR_PRINT(RSMI event '%s' not found for device %d, eventName, device->deviceId); + free(activeEvents); + return -EINVAL; + } + + // Copy event + RocmonSmiEvent* tmpEvent = &activeEvents[i]; + memcpy(tmpEvent, metric, sizeof(RocmonSmiEvent)); + + // Check if event supports instances + if (instance >= 0 && tmpEvent->type != ROCMON_SMI_EVENT_TYPE_INSTANCES) + { + ERROR_PRINT(Instance number given but event '%s' does not support one, eventName); + free(activeEvents); + return -EINVAL; + } + + // Check if event requires instances + if (instance < 0 && tmpEvent->type == ROCMON_SMI_EVENT_TYPE_INSTANCES) + { + ERROR_PRINT(No instance number given but event '%s' requires one, eventName); + free(activeEvents); + return -EINVAL; + } + + // Check if event has enough instances + if (instance >= 0 && instance >= metric->instances) + { + ERROR_PRINT(Instance %d seleced but event '%s' has only %d, instance, eventName, metric->instances); + free(activeEvents); + return -EINVAL; + } + + // Set instance number + if (instance >= 0) + { + tmpEvent->subvariant = instance; + } + } + + device->activeSmiEvents = activeEvents; + device->numActiveSmiEvents = numEvents; + + return 0; +} + + +int +rocmon_smi_setupCounters(RocmonContext* context, int gid) +{ + int ret = 0; + int numSmiEvents = 0; + const char **smiEvents = NULL; + // Check arguments + if (gid < 0 || gid >= context->numActiveGroups) + { + return -EINVAL; + } + + // Ensure rocmon is initialized + if (!rocmon_smi_initialized) + { + return -EFAULT; + } + + // Get group info + GroupInfo* group = &context->groups[gid]; + + // Allocate memory for string arrays + smiEvents = (const char**) malloc(group->nevents * sizeof(const char*)); + if (smiEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate smiEvents name array); + return -ENOMEM; + } + + // Go through each event and sort it + for (int i = 0; i < group->nevents; i++) + { + const char* name = group->events[i]; + if 
(strncmp(name, "RSMI_", 5) == 0) + { + // Rocprofiler event + smiEvents[numSmiEvents] = name + 5; // +5 removes 'ROCP_' prefix + numSmiEvents++; + } + } + if (numSmiEvents == 0) + { + free(smiEvents); + return 0; + } + + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + ret = _rocmon_setupCounters_smi(device, smiEvents, numSmiEvents); + if (ret < 0) + { + ERROR_PRINT(Failed to setup ROCMON SMI events for device %d, i); + } + } + free(smiEvents); + return 0; +} + +int +rocmon_smi_readCounters(RocmonContext* context) +{ + // Ensure rocmon is initialized + if (!rocmon_smi_initialized) + { + return -EFAULT; + } + if (context->activeGroup < 0) + { + return -EFAULT; + } + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + // Check if there are any counters to start + if (device->numActiveSmiEvents <= 0) + { + return 0; + } + + // Save baseline values + RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; + for (int i = 0; i < device->numActiveSmiEvents; i++) + { + double value = 0; + RocmonSmiEvent* event = &device->activeSmiEvents[i]; + RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; + // Measure counter + if (event->measureFunc) + { + event->measureFunc(device->deviceId, event, result); + } + } + } + return 0; +} + +int +rocmon_smi_startCounters(RocmonContext* context) +{ + // Ensure rocmon is initialized + if (!rocmon_smi_initialized) + { + return -EFAULT; + } + if (context->activeGroup < 0) + { + return -EFAULT; + } + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + // Check if there are any counters to start + if (device->numActiveSmiEvents <= 0) + { + return 0; + } + + // Save baseline values + RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; + for (int i = 0; i < device->numActiveSmiEvents; i++) + { + double 
value = 0; + RocmonSmiEvent* event = &device->activeSmiEvents[i]; + RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; + + // Measure counter + if (event->measureFunc) + { + event->measureFunc(device->deviceId, event, result); + } + + // Save value + result->fullValue = 0; + } + } + return 0; +} + +int +rocmon_smi_stopCounters(RocmonContext* context) +{ + int ret; + + // Ensure rocmon is initialized + if (!rocmon_smi_initialized) + { + return -EFAULT; + } + return 0; +} + + +static int +rocmon_smi_getEventsOfGpu(RocmonContext* context, int gpuIdx, EventList_rocm_t* list) +{ + EventList_rocm_t tmpList = NULL; + Event_rocm_t* tmpEventList = NULL; + // Ensure rocmon is initialized + if (!rocmon_smi_initialized) + { + return -EFAULT; + } + // Validate args + if ((gpuIdx < 0) || (gpuIdx > rocmon_context->numDevices) || (!list)) + { + return -EINVAL; + } + + RocmonDevice* device = &rocmon_context->devices[gpuIdx]; + + if (*list) + { + tmpList = *list; + } + else + { + // Allocate list structure + EventList_rocm_t tmpList = (EventList_rocm_t) malloc(sizeof(EventList_rocm)); + if (tmpList == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate event list); + return -ENOMEM; + } + memset(tmpList, 0, sizeof(EventList_rocm)); + } + + // Get number of events + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Add %d ROCm SMI events, get_map_size(device->smiMetrics)); + if (get_map_size(device->smiMetrics) == 0) + { + // No events -> return list + *list = tmpList; + return 0; + } + // (Re-)Allocate event array + tmpEventList = realloc(tmpList->events, (tmpList->numEvents + get_map_size(device->smiMetrics)) * sizeof(Event_rocm_t)); + if (!tmpEventList) + { + if (!*list) free(tmpList); + ERROR_PLAIN_PRINT(Cannot allocate events for event list); + return -ENOMEM; + } + tmpList->events = tmpEventList; + int startindex = tmpList->numEvents; + + // Copy ROCm SMI metric information + for (int i = 0; i < get_map_size(device->smiMetrics); i++) + { + RocmonSmiEvent* event = 
NULL; + Event_rocm_t* out = &tmpList->events[startindex + i]; + int len; + + // Get event + if (get_smap_by_idx(device->smiMetrics, i, (void**)&event) < 0) + { + continue; + } + + // Copy name + len = strlen(event->name) + 5 /* Prefix */ + 1 /* NULL byte */; + out->name = (char*) malloc(len); + if (out->name) + { + snprintf(out->name, len, "RSMI_%s", event->name); + } + + // Copy description + char* description = "SMI Event"; // TODO: use real descriptions + len = strlen(description) + 1 /* NULL byte */; + out->description = (char*) malloc(len); + if (out->description) + { + snprintf(out->description, len, "%s", description); + } + + // Copy instances + out->instances = event->instances; + tmpList->numEvents++; + } + + *list = tmpList; + return 0; +} + + +int rocmon_smi_init(RocmonContext* context, int numGpus, const int* gpuIds) +{ + int ret = 0; + if ((!context) || (numGpus <= 0) || (!gpuIds)) + { + return -EINVAL; + } + + ret = _rocmon_smi_link_libraries(); + if (ret < 0) + { + return -EFAULT; + } + + // init rocm smi library + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RSMI); + RSMI_CALL(rsmi_init, (0), + { + ERROR_PLAIN_PRINT(Failed to init rocm_smi); + goto rocmon_init_rsmi_failed; + }); + + // Get available SMI events for devices + _rocmon_smi_init_events(context); + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice *device = &context->devices[i]; + // Initialize SMI events map + if (init_map(&device->smiMetrics, MAP_KEY_TYPE_STR, 0, &free) < 0) + { + ERROR_PLAIN_PRINT(Cannot init smiMetrics map); + goto rocmon_init_rsmi_failed; + } + if (_rocmon_smi_get_functions(device) < 0) + { + ERROR_PRINT(Failed to get SMI functions for device %d, device->deviceId); + goto rocmon_init_rsmi_failed; + } + } + rocmon_smi_initialized = TRUE; + return 0; +rocmon_init_rsmi_failed: + RSMI_CALL(rsmi_shut_down, (), { + // fall through + }); + return 0; +} + + +void rocmon_smi_finalize(RocmonContext* context) +{ + if (!rocmon_smi_initialized) + { + 
return; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Finalize LIKWID ROCMON SMI); + if (context) + { + if (context->devices) + { + // Free each devices fields + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + if (device->activeSmiEvents) + { + free(device->activeSmiEvents); + device->activeSmiEvents = NULL; + device->numActiveSmiEvents = 0; + } + if (device->smiMetrics) + { + destroy_smap(device->smiMetrics); + device->smiMetrics = NULL; + } + } + } + } + + RSMI_CALL(rsmi_shut_down, (), { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown SMI); + // fall through + }); + rocmon_smi_initialized = FALSE; +} + +int +rocmon_smi_switchActiveGroup(RocmonContext* context, int newGroupId) +{ + int ret; + + ret = rocmon_smi_stopCounters(context); + if (ret < 0) + { + return ret; + } + + ret = rocmon_smi_setupCounters(context, newGroupId); + if (ret < 0) + { + return ret; + } + + ret = rocmon_smi_startCounters(context); + if (ret < 0) + { + return ret; + } + + return 0; +} + +#endif /* LIKWID_ROCMON_SMI_H */ diff --git a/src/includes/rocmon_smi_types.h b/src/includes/rocmon_smi_types.h new file mode 100644 index 000000000..cb6a5efae --- /dev/null +++ b/src/includes/rocmon_smi_types.h @@ -0,0 +1,81 @@ +/* + * ======================================================================================= + * + * Filename: rocmon_smi_types.h + * + * Description: Header File of rocmon for smi backend. + * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com + * Project: likwid + * + * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + * + * ======================================================================================= + */ +#ifndef LIKWID_ROCMON_SMI_TYPES_H +#define LIKWID_ROCMON_SMI_TYPES_H + +#include +#if AMDSMI_LIB_VERSION_YEAR == 23 && AMDSMI_LIB_VERSION_MAJOR == 4 && AMDSMI_LIB_VERSION_MINOR == 0 && AMDSMI_LIB_VERSION_RELEASE == 0 +typedef struct metrics_table_header_t metrics_table_header_t; +#endif +#include +#ifdef ROCPROFILER_EXPORT +#undef ROCPROFILER_EXPORT +#endif +#ifdef ROCPROFILER_IMPORT +#undef ROCPROFILER_IMPORT +#endif +#ifdef ROCPROFILER_VERSION_MAJOR +#undef ROCPROFILER_VERSION_MAJOR +#endif +#ifdef ROCPROFILER_VERSION_MINOR +#undef ROCPROFILER_VERSION_MINOR +#endif +#ifdef ROCPROFILER_API +#undef ROCPROFILER_API +#endif +#include + +struct RocmonSmiEvent_struct; +typedef int (*RocmonSmiMeasureFunc)(int deviceId, struct RocmonSmiEvent_struct* event, RocmonEventResult* result); + +typedef enum { + ROCMON_SMI_EVENT_TYPE_NORMAL = 0, + ROCMON_SMI_EVENT_TYPE_VARIANT, + ROCMON_SMI_EVENT_TYPE_SUBVARIANT, + ROCMON_SMI_EVENT_TYPE_INSTANCES +} RocmonSmiEventType; + +#define MAX_ROCMON_SMI_EVENT_NAME 40 +typedef struct RocmonSmiEvent_struct { + char name[MAX_ROCMON_SMI_EVENT_NAME]; + uint64_t variant; + uint64_t subvariant; + uint64_t extra; + int instances; + RocmonSmiEventType type; + RocmonSmiMeasureFunc measureFunc; +} RocmonSmiEvent; + +typedef struct { + RocmonSmiEvent* entries; + int numEntries; +} RocmonSmiEventList; + +#endif /* LIKWID_ROCMON_SMI_TYPES_H */ diff --git a/src/includes/rocmon_v1.h b/src/includes/rocmon_v1.h index 0ea8b70e0..2ee73d335 100644 --- a/src/includes/rocmon_v1.h +++ 
b/src/includes/rocmon_v1.h @@ -30,34 +30,963 @@ #ifndef LIKWID_ROCMON_V1_H #define LIKWID_ROCMON_V1_H -int rocmon_v1_init(int numGpus, const int* gpuIds); -void rocmon_v1_finalize(void); -int rocmon_v1_addEventSet(const char* eventString, int* gid); -int rocmon_v1_setupCounters(int gid); -int rocmon_v1_startCounters(void); -int rocmon_v1_stopCounters(void); -int rocmon_v1_readCounters(void); -double rocmon_v1_getResult(int gpuIdx, int groupId, int eventId); -double rocmon_v1_getLastResult(int gpuIdx, int groupId, int eventId); -int rocmon_v1_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list); -void rocmon_v1_freeEventsOfGpu(EventList_rocm_t list); -int rocmon_v1_switchActiveGroup(int newGroupId); -int rocmon_v1_getNumberOfGroups(void); -int rocmon_v1_getIdOfActiveGroup(void); -int rocmon_v1_getNumberOfGPUs(void); -int rocmon_v1_getNumberOfEvents(int groupId); -int rocmon_v1_getNumberOfMetrics(int groupId); -double rocmon_v1_getTimeOfGroup(int groupId); -double rocmon_v1_getLastTimeOfGroup(int groupId); -double rocmon_v1_getTimeToLastReadOfGroup(int groupId); -char* rocmon_v1_getEventName(int groupId, int eventId); -char* rocmon_v1_getCounterName(int groupId, int eventId); -char* rocmon_v1_getMetricName(int groupId, int metricId); -char* rocmon_v1_getGroupName(int groupId); -char* rocmon_v1_getGroupInfoShort(int groupId); -char* rocmon_v1_getGroupInfoLong(int groupId); -int rocmon_v1_getGroups(char*** groups, char*** shortinfos, char*** longinfos); -int rocmon_v1_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos); +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + + + +// #include +// #include +// #include + +// Variables +static void *rocmon_v1_dl_hsa_lib = NULL; +static void *rocmon_v1_dl_profiler_lib = NULL; + + +static bool rocmon_v1_initialized = FALSE; + +// Macros +#ifndef FREE_IF_NOT_NULL +#define 
FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } +#endif + +#ifndef ROCM_CALL +#define ROCM_CALL( call, args, handleerror ) \ + do { \ + hsa_status_t _status = (*call##_ptr)args; \ + if (_status != HSA_STATUS_SUCCESS && _status != HSA_STATUS_INFO_BREAK) { \ + fprintf(stderr, "Error: function %s failed with error %d\n", #call, _status); \ + const char* err = NULL; \ + rocprofiler_error_string(&err); \ + if (err) fprintf(stderr, "Error: %s\n", err); \ + handleerror; \ + } \ + } while (0) +#endif + + +// ROCm function declarations +#ifndef DECLAREFUNC_HSA +#define DECLAREFUNC_HSA(funcname, funcsig) hsa_status_t ROCMWEAK funcname funcsig; hsa_status_t ( *funcname##_ptr ) funcsig; +#endif + +DECLAREFUNC_HSA(hsa_init, ()); +DECLAREFUNC_HSA(hsa_shut_down, ()); +DECLAREFUNC_HSA(hsa_iterate_agents, (hsa_status_t (*callback)(hsa_agent_t agent, void* data), void* data)); +DECLAREFUNC_HSA(hsa_agent_get_info, (hsa_agent_t agent, hsa_agent_info_t attribute, void* value)); +DECLAREFUNC_HSA(hsa_system_get_info, (hsa_system_info_t attribute, void *value)); + +DECLAREFUNC_HSA(rocprofiler_iterate_info, (const hsa_agent_t* agent, rocprofiler_info_kind_t kind, hsa_status_t (*callback)(const rocprofiler_info_data_t, void* data), void* data)); +DECLAREFUNC_HSA(rocprofiler_close, (rocprofiler_t* context)); +DECLAREFUNC_HSA(rocprofiler_open, (hsa_agent_t agent, rocprofiler_feature_t* features, uint32_t feature_count, rocprofiler_t** context, uint32_t mode, rocprofiler_properties_t* properties)); +DECLAREFUNC_HSA(rocprofiler_error_string, ()); +DECLAREFUNC_HSA(rocprofiler_start, (rocprofiler_t* context, uint32_t group_index)); +DECLAREFUNC_HSA(rocprofiler_stop, (rocprofiler_t* context, uint32_t group_index)); +DECLAREFUNC_HSA(rocprofiler_read, (rocprofiler_t* context, uint32_t group_index)); +DECLAREFUNC_HSA(rocprofiler_get_data, (rocprofiler_t* context, uint32_t group_index)); +DECLAREFUNC_HSA(rocprofiler_get_metrics, (const rocprofiler_t* context)); + + + +// 
---------------------------------------------------- +// Rocmon helper functions +// ---------------------------------------------------- + +static int +_rocmon_v1_link_libraries() +{ + #define DLSYM_AND_CHECK( dllib, name ) name##_ptr = dlsym( dllib, #name ); if ( dlerror() != NULL ) { ERROR_PRINT(Failed to link #name); return -1; } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm V1 libraries); + + // Need to link in the ROCm HSA libraries + rocmon_v1_dl_hsa_lib = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_v1_dl_hsa_lib) + { + ERROR_PRINT(ROCm HSA library libhsa-runtime64.so not found: %s, dlerror()); + return -1; + } + + // Need to link in the Rocprofiler libraries + rocmon_v1_dl_profiler_lib = dlopen("librocprofiler64.so", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_v1_dl_profiler_lib) + { + rocmon_v1_dl_profiler_lib = dlopen("librocprofiler64.so.1", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_v1_dl_profiler_lib) + { + ERROR_PRINT(Rocprofiler library librocprofiler64.so not found: %s, dlerror()); + return -1; + } + } + + // Link HSA functions + DLSYM_AND_CHECK(rocmon_v1_dl_hsa_lib, hsa_init); + DLSYM_AND_CHECK(rocmon_v1_dl_hsa_lib, hsa_shut_down); + DLSYM_AND_CHECK(rocmon_v1_dl_hsa_lib, hsa_iterate_agents); + DLSYM_AND_CHECK(rocmon_v1_dl_hsa_lib, hsa_agent_get_info); + DLSYM_AND_CHECK(rocmon_v1_dl_hsa_lib, hsa_system_get_info); + + // Link Rocprofiler functions + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_iterate_info); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_close); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_open); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_error_string); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_start); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_stop); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_read); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_get_data); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, 
rocprofiler_get_metrics); + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm V1 libraries done); + return 0; +} + +typedef struct { + RocmonContext* context; + int numGpus; + const int* gpuIds; +} iterate_agents_cb_arg; + +typedef struct { + RocmonDevice* device; + int currIndex; +} iterate_info_cb_arg; + + +static hsa_status_t +_rocmon_v1_iterate_info_callback_count(const rocprofiler_info_data_t info, void* data) +{ + RocmonDevice* device = (RocmonDevice*) data; + if (device) { + device->numRocMetrics++; + } + return HSA_STATUS_SUCCESS; +} + +static void +_rocmon_v1_print_rocprofiler_info_data(const rocprofiler_info_data_t info) +{ + if (info.kind != ROCPROFILER_INFO_KIND_METRIC) + { + return; + } + printf("Name '%s':\n", info.metric.name); + printf("\tKind: '%s'\n", (info.kind == ROCPROFILER_INFO_KIND_METRIC ? "Metric" : "Trace")); + printf("\tInstances: %d\n", info.metric.instances); + printf("\tDescription: '%s'\n", info.metric.description); + printf("\tExpression: '%s'\n", info.metric.expr); + printf("\tBlockName: '%s'\n", info.metric.block_name); + printf("\tBlockCounters: %d\n", info.metric.block_counters); +} + +static hsa_status_t +_rocmon_v1_iterate_info_callback_add(const rocprofiler_info_data_t info, void* data) +{ + iterate_info_cb_arg* arg = (iterate_info_cb_arg*) data; + + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _rocmon_iterate_info_callback_add); + if (likwid_rocmon_verbosity == DEBUGLEV_DEVELOP) + { + _rocmon_v1_print_rocprofiler_info_data(info); + } + // Check info kind + if (info.kind != ROCPROFILER_INFO_KIND_METRIC) + { + ERROR_PRINT(Wrong info kind %u, info.kind); + return HSA_STATUS_ERROR; + } + + // Check index + if (arg->currIndex >= arg->device->numRocMetrics) + { + ERROR_PRINT(Metric index out of bounds: %d, arg->currIndex); + return HSA_STATUS_ERROR; + } + + // Copy info data + rocprofiler_info_data_t* target_info = &arg->device->v1_rocMetrics[arg->currIndex]; + memcpy(target_info, &info, sizeof(rocprofiler_info_data_t)); + 
arg->currIndex++; + + return HSA_STATUS_SUCCESS; +} + + +static hsa_status_t +_rocmon_v1_iterate_agents_callback(hsa_agent_t agent, void* argv) +{ + // Count number of callback invocations as the devices id + static int nextDeviceId = 0; + int deviceId = nextDeviceId; + bool noAgent = false; + + iterate_agents_cb_arg *arg = (iterate_agents_cb_arg*) argv; + + // Check if device is a GPU + hsa_device_type_t type; + ROCM_CALL(hsa_agent_get_info, (agent, HSA_AGENT_INFO_DEVICE, &type), return -1); + if (type != HSA_DEVICE_TYPE_GPU) + { + return HSA_STATUS_SUCCESS; + } + nextDeviceId++; + + // Check if device is includes in arg->gpuIds + int gpuIndex = -1; + for (int i = 0; i < arg->numGpus; i++) + { + if (deviceId == arg->gpuIds[i]) + { + gpuIndex = i; + break; + } + } + if (gpuIndex < 0) + { + return HSA_STATUS_SUCCESS; + } + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing agent %d, gpuIndex); + + // Add agent to context + RocmonDevice *device = &arg->context->devices[gpuIndex]; + device->deviceId = deviceId; + device->hsa_agent = agent; + device->v1_context = NULL; + device->numActiveRocEvents = 0; + device->v1_activeRocEvents = NULL; + device->numGroupResults = 0; + device->groupResults = NULL; + + // Get number of available metrics + device->numRocMetrics = 0; + ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_v1_iterate_info_callback_count, device), return HSA_STATUS_ERROR); + //ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, RocProfiler provides %d events, device->numRocMetrics); + + // workaround for bug in ROCm 5.4.0 + if(device->numRocMetrics == 0) { + ROCM_CALL(rocprofiler_iterate_info, (NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_v1_iterate_info_callback_count, device), return HSA_STATUS_ERROR); + noAgent = true; + } + + // Allocate memory for metrics + device->v1_rocMetrics = (rocprofiler_info_data_t*) malloc(device->numRocMetrics * sizeof(rocprofiler_info_data_t)); + if (device->v1_rocMetrics == NULL) + { + ERROR_PLAIN_PRINT(Cannot 
allocate set of v1_rocMetrics); + return HSA_STATUS_ERROR; + } + + // Fetch metric information + iterate_info_cb_arg info_arg = { + .device = device, + .currIndex = 0, + }; + //ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, Read %d RocProfiler events for device %d, device->numRocMetrics, device->deviceId); + + // If the call fails with agent, call rocprofiler_iterate_info without agent + if(noAgent) + { + ROCM_CALL(rocprofiler_iterate_info, (NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_v1_iterate_info_callback_add, &info_arg), return HSA_STATUS_ERROR); + } else { + ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_v1_iterate_info_callback_add, &info_arg), return HSA_STATUS_ERROR); + } + + return HSA_STATUS_SUCCESS; +} + + + + + +static int +_rocmon_v1_get_timestamp(uint64_t* timestamp_ns) +{ + uint64_t timestamp; + + // Get timestamp from system + ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP, &timestamp), return -1); /* raw HSA ticks are written into the local 'timestamp' */ + // Convert to nanoseconds + *timestamp_ns = (uint64_t)((long double)timestamp * rocmon_context->hsa_timestamp_factor); + + return 0; +} + + +static int +_rocmon_v1_getLastResult(RocmonDevice* device, int eventId, double* value) +{ + rocprofiler_data_t* data = &device->v1_activeRocEvents[eventId].data; + + switch (data->kind) + { + case ROCPROFILER_DATA_KIND_INT32: + *value = (double) data->result_int32; + break; + case ROCPROFILER_DATA_KIND_INT64: + *value = (double) data->result_int64; + break; + case ROCPROFILER_DATA_KIND_FLOAT: + *value = (double) data->result_float; + break; + case ROCPROFILER_DATA_KIND_DOUBLE: + *value = data->result_double; + break; + + case ROCPROFILER_DATA_KIND_BYTES: + case ROCPROFILER_DATA_KIND_UNINIT: + default: + return -1; + } + + return 0; +} + + +static int +_rocmon_readCounters_rocprofiler_v1(RocmonDevice* device) +{ + int ret; + + // Check if there are any counters to start + if (device->numActiveRocEvents <= 0) + { + return 0; + } + + if (!device->v1_context) + { + return 0; + } + +
ROCM_CALL(rocprofiler_read, (device->v1_context, 0), return -1); + ROCM_CALL(rocprofiler_get_data, (device->v1_context, 0), return -1); + ROCM_CALL(rocprofiler_get_metrics, (device->v1_context), return -1); + + // Update results + RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; + for (int i = 0; i < device->numActiveRocEvents; i++) + { + RocmonEventResult* result = &groupResult->results[i]; + + // Read value + ret = _rocmon_v1_getLastResult(device, i, &result->fullValue); + if (ret < 0) + { + return -1; + } + + // Calculate delta since last read + result->lastValue = result->fullValue - result->lastValue; + } + + return 0; +} + + + +int +_rocmon_v1_readCounters(RocmonContext* context, uint64_t* (*getDestTimestampFunc)(RocmonDevice* device)) +{ + int ret; + + // Get timestamp + uint64_t timestamp; + if ((ret = _rocmon_v1_get_timestamp(&timestamp)) != 0) /* any non-zero status aborts the read */ + { + return ret; + } + + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + if (!device->rocprof_v1) continue; + + // Save timestamp + if (getDestTimestampFunc) + { + uint64_t* timestampDest = getDestTimestampFunc(device); + if (timestampDest) + { + *timestampDest = timestamp; + } + } + + // Read rocprofiler counters + ret = _rocmon_readCounters_rocprofiler_v1(device); + if (ret < 0) return ret; + } + + return 0; +} + + +static uint64_t* +_rocmon_v1_get_read_time(RocmonDevice* device) +{ + return &device->time.read; +} + + +static uint64_t* +_rocmon_v1_get_stop_time(RocmonDevice* device) +{ + return &device->time.stop; +} + + +int +rocmon_v1_init(RocmonContext* context, int numGpus, const int* gpuIds) +{ + hsa_status_t status = 0; + RocmonDevice* devices = NULL; + int num_devices = 0; + + // check if already initialized + if (rocmon_v1_initialized) + { + return 0; + } + if (context == NULL) + { + return -EEXIST; + } + + // Validate arguments + if (numGpus <= 0) + { + ERROR_PRINT(Number of gpus must be greater than 0 but only %d given,
numGpus); + return -EINVAL; + } + + // Initialize other parts + init_configuration(); + + // initialize libraries + int ret = _rocmon_v1_link_libraries(); + if (ret < 0) + { + ERROR_PLAIN_PRINT(Failed to initialize libraries); + return ret; + } + + // init hsa library + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing HSA); + ROCM_CALL(hsa_init, (), + { + ERROR_PLAIN_PRINT(Failed to init hsa library); + goto rocmon_init_hsa_failed; + }); + + if (!context->devices) + { + context->devices = (RocmonDevice*) malloc(numGpus * sizeof(RocmonDevice)); + if (!context->devices) + { + ERROR_PLAIN_PRINT(Cannot allocate set of GPUs); + free(devices); + return -ENOMEM; + } + context->numDevices = numGpus; + } + // Get hsa timestamp factor + uint64_t frequency_hz; + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Getting HSA timestamp factor); + ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &frequency_hz), + { + ERROR_PLAIN_PRINT(Failed to get HSA timestamp factor); + goto rocmon_init_info_agents_failed; + }); + context->hsa_timestamp_factor = (long double)1000000000 / (long double)frequency_hz; + + // initialize structures for specified devices (fetch ROCm specific info) + iterate_agents_cb_arg arg = { + .context = context, + .numGpus = numGpus, + .gpuIds = gpuIds, + }; + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Iterating through %d available agents, numGpus); + ROCM_CALL(hsa_iterate_agents, (_rocmon_v1_iterate_agents_callback, &arg), + { + ERROR_PRINT(Error while iterating through available agents); + goto rocmon_init_info_agents_failed; + }); + + rocmon_v1_initialized = TRUE; + return 0; +rocmon_init_info_agents_failed: + ROCM_CALL(hsa_shut_down, (), { + // fall through + }); +rocmon_init_hsa_failed: + free(context->devices); + context->devices = NULL; + context->numDevices = 0; + return -1; +} + + +void +rocmon_v1_finalize(RocmonContext* context) +{ + + if (!rocmon_v1_initialized) + { + return; + } + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Finalize LIKWID 
ROCMON); + + if (context) + { + if (context->devices) + { + // Free each devices fields + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + if (device->rocprof_v1) + { + FREE_IF_NOT_NULL(device->v1_rocMetrics); + FREE_IF_NOT_NULL(device->v1_activeRocEvents); + } + if (device->groupResults) + { + // Free events of event result lists + for (int j = 0; j < device->numGroupResults; j++) + { + FREE_IF_NOT_NULL(device->groupResults[j].results); /* index by group (j), not by device (i) */ + } + // Free list + FREE_IF_NOT_NULL(device->groupResults); /* reset to NULL so rocmon_finalize() does not free it a second time */ + } + if (device->v1_context) + { + ROCM_CALL(rocprofiler_close, (device->v1_context),); + } + } + } + } + + ROCM_CALL(hsa_shut_down, (), { + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown HSA); + // fall through + }); +} + + +/*int*/ +/*rocmon_v1_addEventSet(const char* eventString, int* gid)*/ +/*{*/ +/* // Check arguments*/ +/* if (!eventString)*/ +/* {*/ +/* return -EINVAL;*/ +/* }*/ +/* */ +/* // Ensure rocmon is initialized*/ +/* if (!rocmon_v1_initialized)*/ +/* {*/ +/* return -EFAULT;*/ +/* }*/ + +/* // Allocate memory for event group if necessary*/ +/* if (rocmon_context->numActiveGroups == rocmon_context->numGroups)*/ +/* {*/ +/* GroupInfo* tmpInfo = (GroupInfo*) realloc(rocmon_context->groups, (rocmon_context->numGroups+1) * sizeof(GroupInfo));*/ +/* if (tmpInfo == NULL)*/ +/* {*/ +/* ERROR_PLAIN_PRINT(Cannot allocate additional group);*/ +/* return -ENOMEM;*/ +/* }*/ +/* rocmon_context->groups = tmpInfo;*/ +/* rocmon_context->numGroups++;*/ +/* }*/ + +/* // Parse event string*/ +/* int err = _rocmon_v1_parse_eventstring(eventString, &rocmon_context->groups[rocmon_context->numActiveGroups]);*/ +/* if (err < 0)*/ +/* {*/ +/* return err;*/ +/* }*/ + +/* */ + +/* *gid = rocmon_context->numActiveGroups;*/ +/* rocmon_context->numActiveGroups++;*/ +/* return 0;*/ +/*}*/ + + +int +_rocmon_setupCounters_rocprofiler_v1(RocmonDevice* device, const char** events, int numEvents) +{ + // Close previous rocprofiler context + if
(device->v1_context) + { + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Closing previous rocprofiler context); + ROCM_CALL(rocprofiler_close, (device->v1_context), return -1); + } + + // Look if the are any events + if (numEvents <= 0) + { + return 0; + } + + // Create feature array to monitor + rocprofiler_feature_t* features = (rocprofiler_feature_t*) malloc(numEvents * sizeof(rocprofiler_feature_t)); + if (features == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate feature list); + return -ENOMEM; + } + for (int i = 0; i < numEvents; i++) + { + features[i].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[i].name = events[i]; + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEBUG, Setup ROCMON rocprofiler_v1 counter %d %s, i, events[i]); + } + + // Free previous feature array if present + FREE_IF_NOT_NULL(device->v1_activeRocEvents); + + device->numActiveRocEvents = numEvents; + device->v1_activeRocEvents = features; + + // Open context + rocprofiler_properties_t properties = {}; + properties.queue_depth = 128; + uint32_t mode = ROCPROFILER_MODE_STANDALONE | ROCPROFILER_MODE_CREATEQUEUE | ROCPROFILER_MODE_SINGLEGROUP; + + // Important: only a single profiling group is supported at this time which limits the number of events that can be monitored at a time. 
+ ROCM_CALL(rocprofiler_open, (device->hsa_agent, device->v1_activeRocEvents, device->numActiveRocEvents, &device->v1_context, mode, &properties), return -1); + + return 0; +} + + +int +rocmon_v1_setupCounters(RocmonContext* context, int gid) +{ + int ret; + + // Check arguments + if (gid < 0 || gid >= context->numActiveGroups) + { + return -EINVAL; + } + + // Ensure rocmon is initialized + if (!rocmon_v1_initialized) + { + return -EFAULT; + } + + // Get group info + GroupInfo* group = &context->groups[gid]; + + // + // Separate rocprofiler and SMI events + // + const char **rocEvents = NULL; + int numRocEvents = 0; + + // Allocate memory for string arrays + rocEvents = (const char**) malloc(group->nevents * sizeof(const char*)); + if (rocEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate rocEvent name array); + return -ENOMEM; + } + + // Go through each event and sort it + for (int i = 0; i < group->nevents; i++) + { + const char* name = group->events[i]; + if (strncmp(name, "ROCP_", 5) == 0) + { + // Rocprofiler event + rocEvents[numRocEvents] = name + 5; // +5 removes 'ROCP_' prefix + numRocEvents++; + } + } + + // Add events to each device + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + + // Add rocprofiler events + //ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCPROFILER WITH %d events, numRocEvents); + ret = _rocmon_setupCounters_rocprofiler_v1(device, rocEvents, numRocEvents); + if (ret < 0) + { + free(rocEvents); + return ret; + } + } + // Cleanup + free(rocEvents); + + return 0; +} + + +static int +_rocmon_startCounters_rocprofiler_v1(RocmonDevice* device) +{ + // Check if there are any counters to start + if (device->numActiveRocEvents <= 0) + { + return 0; + } + + // Reset results + RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; + for (int i = 0; i < device->numActiveRocEvents; i++) + { + RocmonEventResult* result = &groupResult->results[i]; + 
result->lastValue = 0; + result->fullValue = 0; + } + + if (device->v1_context) + { + ROCM_CALL(rocprofiler_start, (device->v1_context, 0), return -1); + } + + return 0; +} + + + +int +rocmon_v1_startCounters(RocmonContext* context) +{ + int ret; + + // Ensure rocmon is initialized + if (!rocmon_v1_initialized) + { + return -EFAULT; + } + + // Get timestamp + uint64_t timestamp; + if ((ret = _rocmon_v1_get_timestamp(&timestamp)) != 0) /* any non-zero status aborts the start */ + { + return ret; + } + + // Start counters on each device + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + device->time.start = timestamp; + device->time.read = timestamp; + + // Start rocprofiler events + ret = _rocmon_startCounters_rocprofiler_v1(device); + if (ret < 0) return ret; + + // Start SMI events +/* _rocmon_startCounters_smi(device);*/ +/* if (ret < 0) return ret;*/ + } + + return 0; +} + + +static int +_rocmon_stopCounters_rocprofiler_v1(RocmonDevice* device) +{ + if (device->v1_context) + { + // Close context + ROCM_CALL(rocprofiler_stop, (device->v1_context, 0), return -1); + } + + return 0; +} + + +int +rocmon_v1_stopCounters(RocmonContext* context) +{ + int ret; + + // Ensure rocmon is initialized + if (!rocmon_v1_initialized) + { + return -EFAULT; + } + + // Read counters + ret = _rocmon_v1_readCounters(context, &_rocmon_v1_get_stop_time); + if (ret < 0) return ret; + + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + + // Stop rocprofiler events + ret = _rocmon_stopCounters_rocprofiler_v1(device); + if (ret < 0) return ret; + + // Nothing to stop for SMI events + } + + return 0; +} + + +int +rocmon_v1_readCounters(RocmonContext* context) +{ + int ret; + + // Ensure rocmon is initialized + if (!rocmon_v1_initialized) + { + return -EFAULT; + } + + // Read counters + ret = _rocmon_v1_readCounters(context, &_rocmon_v1_get_read_time); + if (ret < 0) return ret; + + return 0; +} + + +int +rocmon_v1_getEventsOfGpu(RocmonContext*
context, int gpuIdx, EventList_rocm_t* list) +{ + EventList_rocm_t tmpList = NULL; + Event_rocm_t* tmpEventList = NULL; + // Ensure rocmon is initialized + if (!rocmon_v1_initialized) + { + return -EFAULT; + } + // Validate args + if ((gpuIdx < 0) || (gpuIdx >= context->numDevices) || (!list)) /* valid indices are 0 .. numDevices-1 */ + { + return -EINVAL; + } + + RocmonDevice* device = &context->devices[gpuIdx]; + + if (*list) + { + tmpList = *list; + } + else + { + // Allocate list structure + tmpList = (EventList_rocm_t) malloc(sizeof(EventList_rocm)); /* assign the outer tmpList; a new declaration here would shadow it and leave it NULL */ + if (tmpList == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate event list); + return -ENOMEM; + } + memset(tmpList, 0, sizeof(EventList_rocm)); + } + + // Get number of events + printf("Number of events %d\n", device->numRocMetrics); + + if (device->numRocMetrics == 0) + { + // No events -> return list + *list = tmpList; + return 0; + } + // (Re-)Allocate event array + tmpEventList = realloc(tmpList->events, (tmpList->numEvents + device->numRocMetrics) * sizeof(Event_rocm_t)); + if (!tmpEventList) + { + if (!*list) free(tmpList); + ERROR_PLAIN_PRINT(Cannot allocate events for event list); + return -ENOMEM; + } + tmpList->events = tmpEventList; + int startindex = tmpList->numEvents; + + // Copy rocprofiler event information + for (int i = 0; i < device->numRocMetrics; i++) + { + rocprofiler_info_data_t* event = &device->v1_rocMetrics[i]; + Event_rocm_t* out = &tmpList->events[startindex + i]; + int len; + + // Copy name + printf("Name %s\n", event->metric.name); + len = strlen(event->metric.name) + 5 /* Prefix */ + 1 /* NULL byte */; + out->name = (char*) malloc(len); + if (out->name) + { + snprintf(out->name, len, "ROCP_%s", event->metric.name); + } + + // Copy description + len = strlen(event->metric.description) + 1 /* NULL byte */; + out->description = (char*) malloc(len); + if (out->description) + { + snprintf(out->description, len, "%s", event->metric.description); + } + tmpList->numEvents++; + } + *list = tmpList; + return 0; +} + +
+int +rocmon_v1_switchActiveGroup(RocmonContext* context, int newGroupId) +{ + int ret; + + ret = rocmon_v1_stopCounters(context); + if (ret < 0) + { + return ret; + } + + ret = rocmon_v1_setupCounters(context, newGroupId); + if (ret < 0) + { + return ret; + } + + ret = rocmon_v1_startCounters(context); + if (ret < 0) + { + return ret; + } + + return 0; +} + #endif /* LIKWID_ROCMON_V1_H */ diff --git a/src/includes/rocmon_v1_types.h b/src/includes/rocmon_v1_types.h index 5d06f85d3..22d588a90 100644 --- a/src/includes/rocmon_v1_types.h +++ b/src/includes/rocmon_v1_types.h @@ -32,118 +32,28 @@ #include // #include -#ifndef ROCPROFILER_VERSION_MAJOR #ifdef HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE #undef HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE #endif -#include +#ifdef ROCPROFILER_EXPORT +#undef ROCPROFILER_EXPORT #endif -#include -#if AMDSMI_LIB_VERSION_YEAR == 23 && AMDSMI_LIB_VERSION_MAJOR == 4 && AMDSMI_LIB_VERSION_MINOR == 0 && AMDSMI_LIB_VERSION_RELEASE == 0 -typedef struct metrics_table_header_t metrics_table_header_t; +#ifdef ROCPROFILER_IMPORT +#undef ROCPROFILER_IMPORT #endif -#include -#include - -typedef struct { - double lastValue; - double fullValue; -} RocmonEventResult; - -typedef struct { - RocmonEventResult* results; // First rocprofiler results, then SMI results - int numResults; -} RocmonEventResultList; - - - -struct RocmonSmiEvent_struct; -typedef int (*RocmonSmiMeasureFunc)(int deviceId, struct RocmonSmiEvent_struct* event, RocmonEventResult* result); - -typedef enum { - ROCMON_SMI_EVENT_TYPE_NORMAL = 0, - ROCMON_SMI_EVENT_TYPE_VARIANT, - ROCMON_SMI_EVENT_TYPE_SUBVARIANT, - ROCMON_SMI_EVENT_TYPE_INSTANCES -} RocmonSmiEventType; - -typedef struct RocmonSmiEvent_struct { - char name[40]; - uint64_t variant; - uint64_t subvariant; - uint64_t extra; - int instances; - RocmonSmiEventType type; - RocmonSmiMeasureFunc measureFunc; -} RocmonSmiEvent; - -typedef struct { - RocmonSmiEvent* entries; - int numEntries; -} RocmonSmiEventList; - 
-typedef struct { - int deviceId; // LIKWID device id - - hsa_agent_t hsa_agent; // HSA agent handle for this device - rocprofiler_t* context; // Rocprofiler context (has activeEvents configured) - - // Available rocprofiler metrics - rocprofiler_info_data_t* rocMetrics; - int numRocMetrics; - - // Available ROCm SMI events - Map_t smiMetrics; - - // Currently configured rocprofiler events (bound to context) - rocprofiler_feature_t* activeRocEvents; - int numActiveRocEvents; - - // Currently configured ROCm SMI events - RocmonSmiEvent* activeSmiEvents; - int numActiveSmiEvents; - - // Results for all events in all event sets - RocmonEventResultList* groupResults; - int numGroupResults; - - // Timestamps in ns - struct { - uint64_t start; - uint64_t read; - uint64_t stop; - } time; -} RocmonDevice; - -typedef struct { - // Event Groups - GroupInfo *groups; - int numGroups; // Number of allocated groups - int numActiveGroups; // Number of used groups - int activeGroup; // Currently active group - - // Devices (HSA agents) - RocmonDevice *devices; - int numDevices; - - // System information - long double hsa_timestamp_factor; // hsa_timestamp * hsa_timestamp_factor = timestamp_in_ns +#ifdef ROCPROFILER_VERSION_MAJOR +#undef ROCPROFILER_VERSION_MAJOR +#endif +#ifdef ROCPROFILER_VERSION_MINOR +#undef ROCPROFILER_VERSION_MINOR +#endif +#ifdef ROCPROFILER_API +#undef ROCPROFILER_API +#endif +#include - // ROCm SMI events - Map_t smiEvents; -} RocmonContext; -extern RocmonContext *rocmon_context; +#include -typedef struct { - bstring tag; - int groupID; - int gpuCount; - int eventCount; - double* time; - uint32_t* count; - int* gpulist; - double** counters; -} LikwidRocmResults; #endif /* LIKWID_ROCMON_V1_TYPES_H */ diff --git a/src/rocmon.c b/src/rocmon.c index 7e552f968..743b3b33c 100644 --- a/src/rocmon.c +++ b/src/rocmon.c @@ -44,51 +44,296 @@ #include #include -#include +#include #ifdef HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE #undef 
HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE #endif -#ifndef LIKWID_ROCPROF_SDK -#include -#include -#else + +#include +int likwid_rocmon_verbosity = DEBUGLEV_ONLY_ERROR; +static int rocmon_initialized = 0; +static RocmonContext* rocmon_context = NULL; + +// Include backends +#include +#include +#ifdef LIKWID_ROCPROF_SDK #include #include #endif +#include +#include -#include - +//#include +const char* rocprofiler_group_arch = "amd_gpu"; void rocmon_finalize(void) { -#ifndef LIKWID_ROCPROF_SDK - rocmon_v1_finalize(); -#else - rocmon_sdk_finalize(); + if ((!rocmon_initialized) || (rocmon_context == NULL)) + { + rocmon_context = NULL; + rocmon_initialized = 0; + return; + } + if (rocmon_context->use_rocprofiler_v1) + { + rocmon_v1_finalize(rocmon_context); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + rocmon_sdk_finalize(rocmon_context); + } #endif + + rocmon_smi_finalize(rocmon_context); + + if (rocmon_context->devices) + { + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* dev = &rocmon_context->devices[i]; + if (dev->groupResults) + { + if (dev->groupResults->results) + { + free(dev->groupResults->results); + dev->groupResults->results = NULL; + dev->groupResults->numResults = 0; + } + free(dev->groupResults); + dev->groupResults = NULL; + } + } + free(rocmon_context->devices); + rocmon_context->devices = NULL; + rocmon_context->numDevices = 0; + } + if (rocmon_context->groups) + { + free(rocmon_context->groups); + rocmon_context->groups = NULL; + rocmon_context->numGroups = 0; + rocmon_context->numActiveGroups = 0; + rocmon_context->activeGroup = -1; + } + + free(rocmon_context); + rocmon_context = NULL; + + rocmon_initialized = FALSE; return; } int rocmon_init(int numGpus, const int* gpuIds) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_init(numGpus, gpuIds); + int err = 0; + + // check if already initialized + if (rocmon_initialized) + { + return 0; + } + if (rocmon_context != NULL) + { + return -EEXIST; + } + // Validate arguments + if 
(numGpus <= 0) + { + ERROR_PRINT(Number of gpus must be greater than 0 but only %d given, numGpus); + return -EINVAL; + } + if (!gpuIds) + { + ERROR_PRINT(Invalid GPU list); + return -EINVAL; + } + + // Initialize other parts + init_configuration(); + + // Allocate memory for context + rocmon_context = (RocmonContext*) malloc(sizeof(RocmonContext)); + if (rocmon_context == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate Rocmon context); + return -ENOMEM; + } + memset(rocmon_context, 0, sizeof(RocmonContext)); + rocmon_context->groups = NULL; + rocmon_context->devices = NULL; + +#ifdef LIKWID_ROCPROF_SDK + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RocProfiler SDK); + err = rocmon_sdk_init(rocmon_context, numGpus, gpuIds); + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RocProfiler SDK returned %d, err); #else - return rocmon_sdk_init(numGpus, gpuIds); + err = -1; #endif + if (err != 0) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RocProfiler V1); + err = rocmon_v1_init(rocmon_context, numGpus, gpuIds); + if (err == 0) + { + rocmon_context->use_rocprofiler_v1 = 1; + } + else + { + ERROR_PRINT(Failed to initialize Rocprofiler v1 and SDK); + free(rocmon_context); + rocmon_context = NULL; + return err; + } + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing ROCm SMI); + err = rocmon_smi_init(rocmon_context, numGpus, gpuIds); + if (err != 0) + { + // Only fail if there are no devices -> neither v1 nor sdk added them + if (rocmon_context->devices == NULL) + { + ERROR_PRINT(Failed to initialize Rocprofiler SMI); + free(rocmon_context); + rocmon_context = NULL; + return err; + } + } + rocmon_context->state = ROCMON_STATE_INITIALIZED; + rocmon_initialized = TRUE; + return err; +} + +int find_colon(const char* str) +{ + for (int i = 0; i < strlen(str); i++) + { + if (str[i] == ':') + { + return 1; + } + } + return 0; +} + +static int +_rocmon_parse_eventstring(const char* eventString, const char* arch, GroupInfo* group) +{ + int err = 0; + const char 
colon = ':'; + Configuration_t config = get_configuration(); + + if ((strchr(eventString, colon) != NULL) || (find_colon(eventString))) /* strchr: 'colon' is a single char, not a NUL-terminated needle for strstr */ + { + // If custom group -> perfgroup_customGroup + err = perfgroup_customGroup(eventString, group); + if (err < 0) + { + ERROR_PRINT(Cannot transform %s to performance group, eventString); + return err; + } + } + else + { + // If performance group -> perfgroup_readGroup + err = perfgroup_readGroup(config->groupPath, arch, eventString, group); + if (err == -EACCES) + { + ERROR_PRINT(Access to performance group %s not allowed, eventString); + return err; + } + else if (err == -ENODEV) + { + ERROR_PRINT(Performance group %s only available with deactivated HyperThreading, eventString); + return err; + } + if (err < 0) + { + ERROR_PRINT(Cannot read performance group %s for %s, eventString, arch); + return err; + } + } + + return 0; } int rocmon_addEventSet(const char* eventString, int* gid) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_addEventSet(eventString, gid); -#else - return rocmon_sdk_addEventSet(eventString, gid); -#endif + int ret = 0; + GroupInfo group = {}; + // Check arguments + if ((!gid) || (!eventString)) + { + return -EINVAL; + } + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + ERROR_PRINT(ROCMON not initialized); + return -EFAULT; + } + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding Eventstring %s, eventString); + ret = _rocmon_parse_eventstring(eventString, rocprofiler_group_arch, &group); + if (ret < 0) + { + return ret; + } + + // Allocate memory for event group if necessary + if (rocmon_context->numActiveGroups == rocmon_context->numGroups) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Increasing group space to %d, rocmon_context->numGroups+1); + GroupInfo* tmpInfo = (GroupInfo*) realloc(rocmon_context->groups, (rocmon_context->numGroups+1) * sizeof(GroupInfo)); + if (tmpInfo == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate additional group); + return -ENOMEM; + } + rocmon_context->groups =
tmpInfo; + rocmon_context->numGroups++; + } + + // Allocate memory for event results + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Allocate result space); + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + + // Allocate memory for event results + int numEvents = group.nevents; + RocmonEventResult* tmpResults = (RocmonEventResult*) malloc(numEvents * sizeof(RocmonEventResult)); + if (tmpResults == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate event results); + return -ENOMEM; + } + + // Allocate memory for new event result list entry + RocmonEventResultList* tmpGroupResults = (RocmonEventResultList*) realloc(device->groupResults, (device->numGroupResults+1) * sizeof(RocmonEventResultList)); + if (tmpGroupResults == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate new event group result list); + return -ENOMEM; + } + + device->groupResults = tmpGroupResults; + device->groupResults[device->numGroupResults].results = tmpResults; + device->groupResults[device->numGroupResults].numResults = numEvents; + device->numGroupResults++; + } + + rocmon_context->groups[rocmon_context->numActiveGroups] = group; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Eventstring %s got GID %d, eventString, rocmon_context->numActiveGroups); + *gid = rocmon_context->numActiveGroups; + rocmon_context->numActiveGroups++; + return 0; } @@ -96,11 +341,119 @@ rocmon_addEventSet(const char* eventString, int* gid) int rocmon_setupCounters(int gid) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_setupCounters(gid); -#else - return rocmon_sdk_setupCounters(gid); + int ret; + + // Check arguments + if (gid < 0 || gid >= rocmon_context->numActiveGroups) + { + ERROR_PRINT(Invalid eventset ID %d, gid); + return -EINVAL; + } + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + ERROR_PRINT(Rocmon not initialized); + return -EFAULT; + } + if ((rocmon_context->state != ROCMON_STATE_STOPPED) && (rocmon_context->state != ROCMON_STATE_INITIALIZED)) + 
{ + ERROR_PRINT(Rocmon not in a valid state to setup -> %d, rocmon_context->state); + return -EFAULT; + } + + // Get group info + GroupInfo* group = &rocmon_context->groups[gid]; + + // + // Separate rocprofiler and SMI events + // + const char **smiEvents = NULL, **rocEvents = NULL; + int numSmiEvents = 0, numRocEvents = 0; + + // Allocate memory for string arrays + smiEvents = (const char**) malloc(group->nevents * sizeof(const char*)); + if (smiEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate smiEvent name array); + return -ENOMEM; + } + rocEvents = (const char**) malloc(group->nevents * sizeof(const char*)); + if (rocEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate rocEvent name array); + free(smiEvents); + return -ENOMEM; + } + + // Go through each event and sort it + for (int i = 0; i < group->nevents; i++) + { + const char* name = group->events[i]; + if (strncmp(name, "RSMI_", 5) == 0) + { + // RSMI event + smiEvents[numSmiEvents] = name + 5; // +5 removes 'RSMI_' prefix + numSmiEvents++; + } + else if (strncmp(name, "ROCP_", 5) == 0) + { + // Rocprofiler event + rocEvents[numRocEvents] = name + 5; // +5 removes 'ROCP_' prefix + numRocEvents++; + } + else + { + // Unknown event + ERROR_PRINT(Event '%s' has no prefix ('ROCP_' or 'RSMI_'), name); + return -EINVAL; + } + } + + // Add events to each device + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + + // Add rocprofiler events + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP ROCPROFILER WITH %d events, numRocEvents); + if (rocmon_context->use_rocprofiler_v1) + { + ret = rocmon_v1_setupCounters(rocmon_context, gid); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + ret = rocmon_sdk_setupCounters(rocmon_context, gid); + } #endif + if (ret < 0) + { + ERROR_PRINT(Setting up rocprofiler counters failed); + free(smiEvents); + free(rocEvents); + return ret; + } + + // Add SMI events + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP ROCM SMI WITH %d 
events, numSmiEvents); + ret = rocmon_smi_setupCounters(rocmon_context, gid); + if (ret < 0) + { + ERROR_PRINT(Setting up SMI counters failed); + free(smiEvents); + free(rocEvents); + return ret; + } + device->activeGroup = gid; + } + rocmon_context->activeGroup = gid; + rocmon_context->state = ROCMON_STATE_SETUP; + // Cleanup + free(smiEvents); + free(rocEvents); + + return 0; } @@ -108,273 +461,501 @@ rocmon_setupCounters(int gid) int rocmon_startCounters(void) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_startCounters(); -#else - return rocmon_sdk_startCounters(); + int ret = 0; + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + ERROR_PRINT(ROCMON not initialized); + return -EFAULT; + } + if ((rocmon_context->activeGroup < 0) || (rocmon_context->state != ROCMON_STATE_SETUP)) + { + ERROR_PRINT(No eventset configured for ROCMON); + return -EFAULT; + } + if (rocmon_context->use_rocprofiler_v1) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Starting ROCMON rocprofiler_v1 counters); + ret = rocmon_v1_startCounters(rocmon_context); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Starting ROCMON rocprofiler_sdk counters); + ret = rocmon_sdk_startCounters(rocmon_context); + } #endif + if (ret < 0) + { + return ret; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Starting ROCMON SMI counters); + ret = rocmon_smi_startCounters(rocmon_context); + if (ret < 0) + { + return ret; + } + rocmon_context->state = ROCMON_STATE_RUNNING; + return 0; } - int rocmon_stopCounters(void) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_stopCounters(); -#else - return rocmon_sdk_stopCounters(); + int ret = 0; + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + if ((rocmon_context->activeGroup < 0) || (rocmon_context->state != ROCMON_STATE_RUNNING)) + { + return -EFAULT; + } + if (rocmon_context->use_rocprofiler_v1) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Stopping ROCMON rocprofiler_v1 counters); + ret 
= rocmon_v1_stopCounters(rocmon_context); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Stopping ROCMON rocprofiler_sdk counters); + ret = rocmon_sdk_stopCounters(rocmon_context); + } #endif + if (ret < 0) + { + return ret; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Stopping ROCMON SMI counters); + ret = rocmon_smi_stopCounters(rocmon_context); + if (ret < 0) + { + return ret; + } + rocmon_context->state = ROCMON_STATE_STOPPED; + return 0; } - int rocmon_readCounters(void) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_readCounters(); -#else - return rocmon_sdk_readCounters(); + int ret = 0; + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + if ((rocmon_context->activeGroup < 0) || (rocmon_context->state != ROCMON_STATE_RUNNING)) + { + return -EFAULT; + } + if (rocmon_context->use_rocprofiler_v1) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Reading ROCMON rocprofiler_v1 counters); + ret = rocmon_v1_readCounters(rocmon_context); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Reading ROCMON rocprofiler_sdk counters); + ret = rocmon_sdk_readCounters(rocmon_context); + } #endif + if (ret < 0) + { + ERROR_PRINT(Failed to read ROCMON rocprofiler counters); + return ret; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Reading ROCMON SMI counters); + ret = rocmon_smi_readCounters(rocmon_context); + if (ret < 0) + { + ERROR_PRINT(Failed to read ROCMON SMI counters); + return ret; + } + return 0; } -double -rocmon_getResult(int gpuIdx, int groupId, int eventId) +int +rocmon_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getResult(gpuIdx, groupId, eventId); -#else - return rocmon_sdk_getResult(gpuIdx, groupId, eventId); + int ret = 0; + EventList_rocm_t l = malloc(sizeof(EventList_rocm)); + if (!l) + { + return -ENOMEM; + } + memset(l, 0, sizeof(EventList_rocm)); + if (rocmon_context->use_rocprofiler_v1) + { + 
ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding RocProfiler V1 events); + ret = rocmon_v1_getEventsOfGpu(rocmon_context, gpuIdx, &l); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding RocProfiler SDK events); + ret = rocmon_sdk_getEventsOfGpu(rocmon_context, gpuIdx, &l); + } #endif + if (ret < 0) + { + rocmon_freeEventsOfGpu(l); + return ret; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding ROCm SMI events); + ret = rocmon_smi_getEventsOfGpu(rocmon_context, gpuIdx, &l); + if (ret < 0) + { + rocmon_freeEventsOfGpu(l); + return ret; + } + *list = l; + return 0; } - -// TODO: multiple groups -double -rocmon_getLastResult(int gpuIdx, int groupId, int eventId) +void +rocmon_freeEventsOfGpu(EventList_rocm_t list) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getLastResult(gpuIdx, groupId, eventId); -#else - return rocmon_sdk_getLastResult(gpuIdx, groupId, eventId); -#endif + if (!list) + { + return; + } + if (list->events != NULL) + { + for (int i = 0; i < list->numEvents; i++) + { + Event_rocm_t* event = &list->events[i]; + if (event->name) { + free(event->name); + event->name = NULL; + } + if (event->description) { + free(event->description); + event->description = NULL; + } + } + free(list->events); + list->events = NULL; + } + free(list); + return; } int -rocmon_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list) +rocmon_switchActiveGroup(int newGroupId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getEventsOfGpu(gpuIdx, list); -#else - return rocmon_sdk_getEventsOfGpu(gpuIdx, list); + int ret = 0; + if (rocmon_context->use_rocprofiler_v1) + { + ret = rocmon_v1_switchActiveGroup(rocmon_context, newGroupId); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + ret = rocmon_sdk_switchActiveGroup(rocmon_context, newGroupId); + } #endif + if (ret < 0) + { + return ret; + } + ret = rocmon_smi_switchActiveGroup(rocmon_context, newGroupId); + if (ret < 0) + { + return ret; + } + return 0; } -void -rocmon_freeEventsOfGpu(EventList_rocm_t list) 
+ + +void rocmon_setVerbosity(int level) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_freeEventsOfGpu(list); -#else - return rocmon_sdk_freeEventsOfGpu(list); -#endif + if (level >= DEBUGLEV_ONLY_ERROR && level <= DEBUGLEV_DEVELOP) + { + likwid_rocmon_verbosity = level; + } } -int -rocmon_switchActiveGroup(int newGroupId) + +double +rocmon_getResult(int gpuIdx, int groupId, int eventId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_switchActiveGroup(newGroupId); -#else - return rocmon_sdk_switchActiveGroup(newGroupId); -#endif + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Validate gpuIdx + if (gpuIdx < 0 || gpuIdx >= rocmon_context->numDevices) + { + return -EFAULT; + } + + // Validate groupId + RocmonDevice* device = &rocmon_context->devices[gpuIdx]; + if (groupId < 0 || groupId >= device->numGroupResults) + { + return -EFAULT; + } + + // Validate eventId + RocmonEventResultList* groupResult = &device->groupResults[groupId]; + if (eventId < 0 || eventId >= groupResult->numResults) + { + return -EFAULT; + } + + // Return result + return groupResult->results[eventId].fullValue; +} + + +// TODO: multiple groups +double +rocmon_getLastResult(int gpuIdx, int groupId, int eventId) +{ + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + + // Validate gpuIdx + if (gpuIdx < 0 || gpuIdx >= rocmon_context->numDevices) + { + return -EFAULT; + } + + // Validate groupId + RocmonDevice* device = &rocmon_context->devices[gpuIdx]; + if (groupId < 0 || groupId >= device->numGroupResults) + { + return -EFAULT; + } + + // Validate eventId + RocmonEventResultList* groupResult = &device->groupResults[groupId]; + if (eventId < 0 || eventId >= groupResult->numResults) + { + return -EFAULT; + } + + // Return result + return groupResult->results[eventId].lastValue; } int rocmon_getNumberOfGroups(void) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getNumberOfGroups(); -#else - return 
rocmon_sdk_getNumberOfGroups(); -#endif + if (!rocmon_context || !rocmon_initialized) + { + return -EFAULT; + } + return rocmon_context->numActiveGroups; } int rocmon_getIdOfActiveGroup(void) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getIdOfActiveGroup(); -#else - return rocmon_sdk_getIdOfActiveGroup(); -#endif + if (!rocmon_context || !rocmon_initialized) + { + return -EFAULT; + } + return rocmon_context->activeGroup; } int rocmon_getNumberOfGPUs(void) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getNumberOfGPUs(); -#else - return rocmon_sdk_getNumberOfGPUs(); -#endif + if (!rocmon_context || !rocmon_initialized) + { + return -EFAULT; + } + return rocmon_context->numDevices; } int rocmon_getNumberOfEvents(int groupId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getNumberOfEvents(groupId); -#else - return rocmon_sdk_getNumberOfEvents(groupId); -#endif + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return -EFAULT; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + return ginfo->nevents; } int rocmon_getNumberOfMetrics(int groupId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getNumberOfMetrics(groupId); -#else - return rocmon_sdk_getNumberOfMetrics(groupId); -#endif + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) + { + return -EFAULT; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + return ginfo->nmetrics; } double rocmon_getTimeOfGroup(int groupId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getTimeOfGroup(groupId); -#else - return rocmon_sdk_getTimeOfGroup(groupId); -#endif + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + return -EFAULT; + } + return 0; } double rocmon_getLastTimeOfGroup(int groupId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getLastTimeOfGroup(groupId); -#else - return rocmon_sdk_getLastTimeOfGroup(groupId); -#endif + // Ensure rocmon is initialized + 
if (!rocmon_initialized) + { + return -EFAULT; + } + return 0; } double rocmon_getTimeToLastReadOfGroup(int groupId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getTimeToLastReadOfGroup(groupId); -#else - return rocmon_sdk_getTimeToLastReadOfGroup(groupId); -#endif + return 0; } char* rocmon_getEventName(int groupId, int eventId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getEventName(groupId, eventId); -#else - return rocmon_sdk_getEventName(groupId, eventId); -#endif + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + if ((eventId < 0) || (eventId >= ginfo->nevents)) + { + return NULL; + } + return ginfo->events[eventId]; } char* rocmon_getCounterName(int groupId, int eventId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getCounterName(groupId, eventId); -#else - return rocmon_sdk_getCounterName(groupId, eventId); -#endif + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + if ((eventId < 0) || (eventId >= ginfo->nevents)) + { + return NULL; + } + return ginfo->counters[eventId]; } char* rocmon_getMetricName(int groupId, int metricId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getMetricName(groupId, metricId); -#else - return rocmon_sdk_getMetricName(groupId, metricId); -#endif + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + if ((metricId < 0) || (metricId >= ginfo->nmetrics)) + { + return NULL; + } + return ginfo->metricnames[metricId]; } char* rocmon_getGroupName(int groupId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getGroupName(groupId); -#else - return rocmon_sdk_getGroupName(groupId); -#endif + if (!rocmon_context || 
!rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + return ginfo->groupname; } char* rocmon_getGroupInfoShort(int groupId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getGroupInfoShort(groupId); -#else - return rocmon_sdk_getGroupInfoShort(groupId); -#endif + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + return ginfo->shortinfo; } char* rocmon_getGroupInfoLong(int groupId) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getGroupInfoLong(groupId); -#else - return rocmon_sdk_getGroupInfoLong(groupId); -#endif + if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) + { + return NULL; + } + GroupInfo* ginfo = &rocmon_context->groups[groupId]; + return ginfo->longinfo; } - int rocmon_getGroups(char*** groups, char*** shortinfos, char*** longinfos) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_getGroups(groups, shortinfos, longinfos); -#else - return rocmon_sdk_getGroups(groups, shortinfos, longinfos); -#endif + init_configuration(); + Configuration_t config = get_configuration(); + + + return perfgroup_getGroups(config->groupPath, rocprofiler_group_arch, groups, shortinfos, longinfos); } int rocmon_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos) { -#ifndef LIKWID_ROCPROF_SDK - return rocmon_v1_returnGroups(nrgroups, groups, shortinfos, longinfos); -#else - return rocmon_sdk_returnGroups(nrgroups, groups, shortinfos, longinfos); -#endif + perfgroup_returnGroups(nrgroups, groups, shortinfos, longinfos); } -void rocmon_setVerbosity(int level) + + +// only used internally by the ROCMON MarkerAPI +GroupInfo* rocmon_get_group(int gid) { - if (level >= DEBUGLEV_ONLY_ERROR && level <= DEBUGLEV_DEVELOP) + if ((gid >= 0) && (gid < 
rocmon_context->numActiveGroups)) { - likwid_rocmon_verbosity = level; + return &rocmon_context->groups[gid]; } + return NULL; } + #endif /* LIKWID_WITH_ROCMON */ diff --git a/src/rocmon_marker.c b/src/rocmon_marker.c index 01e43ffac..dc7707022 100644 --- a/src/rocmon_marker.c +++ b/src/rocmon_marker.c @@ -39,11 +39,21 @@ #include #include -#ifndef LIKWID_ROCPROF_SDK +#include +#include #include +#ifdef LIKWID_ROCPROF_SDK +#include #endif +#include +#ifndef FREE_IF_NOT_NULL +#define FREE_IF_NOT_NULL(x) if (x != NULL) { free(x); x = NULL; } +#endif + +#ifndef gettid #define gettid() syscall(SYS_gettid) +#endif #ifndef NAN #define NAN (0.0/0.0) @@ -202,8 +212,6 @@ _rocmon_saveToFile(const char* markerfile) static void _rocmon_finalize(void) { -#define FREE_IF_NOT_NULL(x) if (x != NULL) { free(x); x = NULL; } - // Ensure markers were initialized if (!rocmon_marker_initialized) { @@ -316,7 +324,7 @@ rocmon_markerInit(void) ret = rocmon_addEventSet(bdata(gEventStrings->entry[i]), &gpu_groups[i]); if (ret < 0) { - fprintf(stderr,"Error setting up Rocmon Marker API.\n"); + fprintf(stderr,"Error setting up Rocmon Marker API: %d\n", ret); free(gpu_ids); free(gpu_maps); free(gpu_groups); @@ -337,7 +345,7 @@ rocmon_markerInit(void) ret = rocmon_setupCounters(gpu_groups[active_group]); if (ret) { - fprintf(stderr,"Error setting up Rocmon Marker API.\n"); + fprintf(stderr,"Error setting up Rocmon Marker API: %d\n", ret); free(gpu_ids); free(gpu_maps); free(gpu_groups); @@ -349,7 +357,7 @@ rocmon_markerInit(void) ret = rocmon_startCounters(); if (ret) { - fprintf(stderr,"Error starting up Rocmon Marker API.\n"); + fprintf(stderr,"Error starting up Rocmon Marker API: %d\n", ret); free(gpu_ids); free(gpu_maps); free(gpu_groups); @@ -1066,8 +1074,8 @@ rocmon_getMetricOfRegionGpu(int region, int metricId, int gpuId) { return NAN; } - GroupInfo* ginfo = &rocmon_context->groups[rocmMarkerResults[region].groupID]; - if (metricId < 0 || metricId >= ginfo->nmetrics) + GroupInfo* ginfo = 
rocmon_get_group(rocmMarkerResults[region].groupID); + if ((!ginfo) || (metricId < 0) || (metricId >= ginfo->nmetrics)) { return NAN; } diff --git a/src/rocmon_sdk.c b/src/rocmon_sdk.c deleted file mode 100644 index 7e66a1402..000000000 --- a/src/rocmon_sdk.c +++ /dev/null @@ -1,251 +0,0 @@ - /* ======================================================================================= - * - * Filename: rocmon_sdk.c - * - * Description: Main implementation of the performance monitoring module - * for AMD GPUs with ROCm >= 6.2 - * - * Version: - * Released: - * - * Author: Thomas Gruber (tg), thomas.roehl@googlemail.com - * Project: likwid - * - * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg - * - * This program is free software: you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free Software - * Foundation, either version 3 of the License, or (at your option) any later - * version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A - * PARTICULAR PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . 
- * - * ======================================================================================= - */ -#ifdef LIKWID_WITH_ROCMON - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include - -static bool rocmon_initialized = FALSE; -int likwid_rocmon_verbosity = DEBUGLEV_ONLY_ERROR; - -int -rocmon_sdk_init(int numGpus, const int* gpuIds) -{ - return 0; -} - - -void -rocmon_sdk_finalize(void) -{ - return; -} - - -int -rocmon_sdk_addEventSet(const char* eventString, int* gid) -{ - return 0; -} - -int -rocmon_sdk_setupCounters(int gid) -{ - return 0; -} - - -int -rocmon_sdk_startCounters(void) -{ - return 0; -} - -int -rocmon_sdk_stopCounters(void) -{ - return 0; -} - - -int -rocmon_sdk_readCounters(void) -{ - return 0; -} - - -double -rocmon_sdk_getResult(int gpuIdx, int groupId, int eventId) -{ - return 0.0; -} - - -// TODO: multiple groups -double -rocmon_sdk_getLastResult(int gpuIdx, int groupId, int eventId) -{ - return 0.0; -} - - -int -rocmon_sdk_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list) -{ - return -EINVAL; -} - -void -rocmon_sdk_freeEventsOfGpu(EventList_rocm_t list) -{ - return; -} - - -int -rocmon_sdk_switchActiveGroup(int newGroupId) -{ - return 0; -} - - -int -rocmon_sdk_getNumberOfGroups(void) -{ - return 0; -} - - -int -rocmon_sdk_getIdOfActiveGroup(void) -{ - return 0; -} - - -int -rocmon_sdk_getNumberOfGPUs(void) -{ - return 0; -} - - -int -rocmon_sdk_getNumberOfEvents(int groupId) -{ - return 0; -} - - -int -rocmon_sdk_getNumberOfMetrics(int groupId) -{ - return 0; -} - - -double -rocmon_sdk_getTimeOfGroup(int groupId) -{ - return 0; -} - - -double -rocmon_sdk_getLastTimeOfGroup(int groupId) -{ - return 0; -} - - -double -rocmon_sdk_getTimeToLastReadOfGroup(int groupId) -{ - return 0; -} - - -char* -rocmon_sdk_getEventName(int groupId, int eventId) -{ - return NULL; -} - - -char* -rocmon_sdk_getCounterName(int groupId, int eventId) -{ - return 
NULL; -} - - -char* -rocmon_sdk_getMetricName(int groupId, int metricId) -{ - return NULL; -} - - -char* -rocmon_sdk_getGroupName(int groupId) -{ - return NULL; -} - - -char* -rocmon_sdk_getGroupInfoShort(int groupId) -{ - return NULL; -} - - -char* -rocmon_sdk_getGroupInfoLong(int groupId) -{ - return NULL; -} - - -int -rocmon_sdk_getGroups(char*** groups, char*** shortinfos, char*** longinfos) -{ - init_configuration(); - Configuration_t config = get_configuration(); - - return perfgroup_getGroups(config->groupPath, "amd_gpu_sdk", groups, shortinfos, longinfos); -} - - -int -rocmon_sdk_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos) -{ - perfgroup_returnGroups(nrgroups, groups, shortinfos, longinfos); -} - - -#endif /* LIKWID_WITH_ROCMON */ diff --git a/src/rocmon_v1.c b/src/rocmon_v1.c deleted file mode 100644 index 31ff459e8..000000000 --- a/src/rocmon_v1.c +++ /dev/null @@ -1,2275 +0,0 @@ - /* ======================================================================================= - * - * Filename: rocmon_v1.c - * - * Description: Main implementation of the performance monitoring module - * for AMD GPUs with ROCm < 6.2 - * - * Version: - * Released: - * - * Author: Thomas Gruber (tg), thomas.roehl@googlemail.com - * Project: likwid - * - * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg - * - * This program is free software: you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free Software - * Foundation, either version 3 of the License, or (at your option) any later - * version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A - * PARTICULAR PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . 
- * - * ======================================================================================= - */ -#ifdef LIKWID_WITH_ROCMON - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include - - - - -// #include -// #include -// #include - -// Variables -static void *dl_hsa_lib = NULL; -static void *dl_profiler_lib = NULL; -static void *dl_rsmi_lib = NULL; - -RocmonContext *rocmon_context = NULL; -static bool rocmon_initialized = FALSE; -int likwid_rocmon_verbosity = DEBUGLEV_ONLY_ERROR; - -// Macros -#define membersize(type, member) sizeof(((type *) NULL)->member) -#define FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } -#define ROCM_CALL( call, args, handleerror ) \ - do { \ - hsa_status_t _status = (*call##_ptr)args; \ - if (_status != HSA_STATUS_SUCCESS && _status != HSA_STATUS_INFO_BREAK) { \ - const char* err = NULL; \ - fprintf(stderr, "Error: function %s failed with error %d\n", #call, _status); \ - rocprofiler_error_string(&err); \ - fprintf(stderr, "Error: %s\n", err); \ - handleerror; \ - } \ - } while (0) - -#define RSMI_CALL( call, args, handleerror ) \ - do { \ - rsmi_status_t _status = (*call##_ptr)args; \ - if (_status != RSMI_STATUS_SUCCESS) { \ - fprintf(stderr, "Error: function %s failed with error %d.\n", #call, _status); \ - handleerror; \ - } \ - } while (0) - -// ROCm function declarations -#define ROCMWEAK __attribute__(( weak )) -#define DECLAREFUNC_HSA(funcname, funcsig) hsa_status_t ROCMWEAK funcname funcsig; hsa_status_t ( *funcname##_ptr ) funcsig; -#define DECLAREFUNC_SMI(funcname, funcsig) rsmi_status_t ROCMWEAK funcname funcsig; rsmi_status_t ( *funcname##_ptr ) funcsig; - -DECLAREFUNC_HSA(hsa_init, ()); -DECLAREFUNC_HSA(hsa_shut_down, ()); -DECLAREFUNC_HSA(hsa_iterate_agents, (hsa_status_t (*callback)(hsa_agent_t agent, void* data), void* data)); -DECLAREFUNC_HSA(hsa_agent_get_info, (hsa_agent_t agent, hsa_agent_info_t 
attribute, void* value)); -DECLAREFUNC_HSA(hsa_system_get_info, (hsa_system_info_t attribute, void *value)); - -DECLAREFUNC_HSA(rocprofiler_iterate_info, (const hsa_agent_t* agent, rocprofiler_info_kind_t kind, hsa_status_t (*callback)(const rocprofiler_info_data_t, void* data), void* data)); -DECLAREFUNC_HSA(rocprofiler_close, (rocprofiler_t* context)); -DECLAREFUNC_HSA(rocprofiler_open, (hsa_agent_t agent, rocprofiler_feature_t* features, uint32_t feature_count, rocprofiler_t** context, uint32_t mode, rocprofiler_properties_t* properties)); -DECLAREFUNC_HSA(rocprofiler_error_string, ()); -DECLAREFUNC_HSA(rocprofiler_start, (rocprofiler_t* context, uint32_t group_index)); -DECLAREFUNC_HSA(rocprofiler_stop, (rocprofiler_t* context, uint32_t group_index)); -DECLAREFUNC_HSA(rocprofiler_read, (rocprofiler_t* context, uint32_t group_index)); -DECLAREFUNC_HSA(rocprofiler_get_data, (rocprofiler_t* context, uint32_t group_index)); -DECLAREFUNC_HSA(rocprofiler_get_metrics, (const rocprofiler_t* context)); - -DECLAREFUNC_SMI(rsmi_init, (uint64_t flags)); -DECLAREFUNC_SMI(rsmi_shut_down, ()); -DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_open, (uint32_t dv_ind, rsmi_func_id_iter_handle_t* handle)); -DECLAREFUNC_SMI(rsmi_dev_supported_variant_iterator_open, (rsmi_func_id_iter_handle_t obj_h, rsmi_func_id_iter_handle_t* var_iter)); -DECLAREFUNC_SMI(rsmi_func_iter_value_get, (rsmi_func_id_iter_handle_t handle, rsmi_func_id_value_t* value )); -DECLAREFUNC_SMI(rsmi_func_iter_next, (rsmi_func_id_iter_handle_t handle)); -DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_close, (rsmi_func_id_iter_handle_t* handle)); -DECLAREFUNC_SMI(rsmi_dev_power_ave_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* power)); -DECLAREFUNC_SMI(rsmi_dev_pci_throughput_get, (uint32_t dv_ind, uint64_t* sent, uint64_t* received, uint64_t* max_pkt_sz)); -DECLAREFUNC_SMI(rsmi_dev_pci_replay_counter_get, (uint32_t dv_ind, uint64_t* counter)); -DECLAREFUNC_SMI(rsmi_dev_memory_total_get, (uint32_t 
dv_ind, rsmi_memory_type_t mem_type, uint64_t* total)); -DECLAREFUNC_SMI(rsmi_dev_memory_usage_get, (uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t* used )); -DECLAREFUNC_SMI(rsmi_dev_memory_busy_percent_get, (uint32_t dv_ind, uint32_t* busy_percent)); -DECLAREFUNC_SMI(rsmi_dev_memory_reserved_pages_get, (uint32_t dv_ind, uint32_t* num_pages, rsmi_retired_page_record_t* records)); -DECLAREFUNC_SMI(rsmi_dev_fan_rpms_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); -DECLAREFUNC_SMI(rsmi_dev_fan_speed_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); -DECLAREFUNC_SMI(rsmi_dev_fan_speed_max_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* max_speed)); -DECLAREFUNC_SMI(rsmi_dev_temp_metric_get, (uint32_t dv_ind, uint32_t sensor_type, rsmi_temperature_metric_t metric, int64_t* temperature)); -DECLAREFUNC_SMI(rsmi_dev_volt_metric_get, (uint32_t dv_ind, rsmi_voltage_type_t sensor_type, rsmi_voltage_metric_t metric, int64_t* voltage)); -DECLAREFUNC_SMI(rsmi_dev_overdrive_level_get, (uint32_t dv_ind, uint32_t* od)); -DECLAREFUNC_SMI(rsmi_dev_ecc_count_get, (uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_error_count_t* ec)); -DECLAREFUNC_SMI(rsmi_compute_process_info_get, (rsmi_process_info_t* procs, uint32_t* num_items)); - - -// ---------------------------------------------------- -// SMI event wrapper -// ---------------------------------------------------- - -static int -_smi_wrapper_pci_throughput_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t value; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _smi_wrapper_pci_throughput_get(%d, %d), deviceId, event->extra); - // Internal variant: 0 for sent, 1 for received bytes and 2 for max packet size - if (event->extra == 0) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, &value, NULL, NULL), return -1); - else if (event->extra == 1) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, &value, NULL), return -1); - else if (event->extra == 2) 
RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, NULL, &value), return -1); - else return -1; - - result->fullValue += value; - result->lastValue = value; - - return 0; -} - - -static int -_smi_wrapper_pci_replay_counter_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t counter; - RSMI_CALL(rsmi_dev_pci_replay_counter_get, (deviceId, &counter), return -1); - result->fullValue += counter; - result->lastValue = counter; - - return 0; -} - - -static int -_smi_wrapper_power_ave_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t power; - RSMI_CALL(rsmi_dev_power_ave_get, (deviceId, event->subvariant, &power), return -1); - result->fullValue += power; - result->lastValue = power; - - return 0; -} - - -static int -_smi_wrapper_memory_total_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t total; - RSMI_CALL(rsmi_dev_memory_total_get, (deviceId, event->variant, &total), return -1); - result->fullValue += total; - result->lastValue = total; - - return 0; -} - - -static int -_smi_wrapper_memory_usage_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t used; - RSMI_CALL(rsmi_dev_memory_usage_get, (deviceId, event->variant, &used), return -1); - result->fullValue += used; - result->lastValue = used; - - return 0; -} - - -static int -_smi_wrapper_memory_busy_percent_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t percent; - RSMI_CALL(rsmi_dev_memory_busy_percent_get, (deviceId, &percent), return -1); - result->fullValue += percent; - result->lastValue = percent; - - return 0; -} - - -static int -_smi_wrapper_memory_reserved_pages_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t num_pages; - RSMI_CALL(rsmi_dev_memory_reserved_pages_get, (deviceId, &num_pages, NULL), return -1); - result->fullValue += num_pages; - result->lastValue = num_pages; - - return 0; -} - - -static int 
-_smi_wrapper_fan_rpms_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t speed; - RSMI_CALL(rsmi_dev_fan_rpms_get, (deviceId, event->subvariant, &speed), return -1); - result->fullValue += speed; - result->lastValue = speed; - - return 0; -} - - -static int -_smi_wrapper_fan_speed_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t speed; - RSMI_CALL(rsmi_dev_fan_speed_get, (deviceId, event->subvariant, &speed), return -1); - result->fullValue += speed; - result->lastValue = speed; - - return 0; -} - - -static int -_smi_wrapper_fan_speed_max_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t max_speed; - RSMI_CALL(rsmi_dev_fan_speed_max_get, (deviceId, event->subvariant, &max_speed), return -1); - result->fullValue += max_speed; - result->lastValue = max_speed; - - return 0; -} - - -static int -_smi_wrapper_temp_metric_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t temperature; - RSMI_CALL(rsmi_dev_temp_metric_get, (deviceId, event->subvariant, event->variant, &temperature), return -1); - result->fullValue += temperature; - result->lastValue = temperature; - - return 0; -} - - -static int -_smi_wrapper_volt_metric_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t voltage; - RSMI_CALL(rsmi_dev_volt_metric_get, (deviceId, event->subvariant, event->variant, &voltage), return -1); - result->fullValue += voltage; - result->lastValue = voltage; - - return 0; -} - - -static int -_smi_wrapper_overdrive_level_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t overdrive; - RSMI_CALL(rsmi_dev_overdrive_level_get, (deviceId, &overdrive), return -1); - result->fullValue += overdrive; - result->lastValue = overdrive; - - return 0; -} - - -static int -_smi_wrapper_ecc_count_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - rsmi_error_count_t error_count; - 
RSMI_CALL(rsmi_dev_ecc_count_get, (deviceId, event->variant, &error_count), return -1); - - if (event->extra == 0) - { - result->lastValue = error_count.correctable_err - result->fullValue; - result->fullValue = error_count.correctable_err; - } - else if (event->extra == 1) - { - result->lastValue = error_count.uncorrectable_err - result->fullValue; - result->fullValue = error_count.uncorrectable_err; - } - else - { - return -1; - } - - return 0; -} - - -static int -_smi_wrapper_compute_process_info_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t num_items; - RSMI_CALL(rsmi_compute_process_info_get, (NULL, &num_items), return -1); - result->fullValue += num_items; - result->lastValue = num_items; - - return 0; -} - - -// ---------------------------------------------------- -// Rocmon helper functions -// ---------------------------------------------------- - -static int -_rocmon_link_libraries() -{ - #define DLSYM_AND_CHECK( dllib, name ) name##_ptr = dlsym( dllib, #name ); if ( dlerror() != NULL ) { ERROR_PRINT(Failed to link #name); return -1; } - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm libraries); - - // Need to link in the ROCm HSA libraries - dl_hsa_lib = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL); - if (!dl_hsa_lib) - { - ERROR_PRINT(ROCm HSA library libhsa-runtime64.so not found: %s, dlerror()); - return -1; - } - - // Need to link in the Rocprofiler libraries - dl_profiler_lib = dlopen("librocprofiler64.so", RTLD_NOW | RTLD_GLOBAL); - if (!dl_profiler_lib) - { - dl_profiler_lib = dlopen("librocprofiler64.so.1", RTLD_NOW | RTLD_GLOBAL); - if (!dl_profiler_lib) - { - ERROR_PRINT(Rocprofiler library librocprofiler64.so not found: %s, dlerror()); - return -1; - } - } - - // Need to link in the Rocprofiler libraries - dl_rsmi_lib = dlopen("librocm_smi64.so", RTLD_NOW | RTLD_GLOBAL); - if (!dl_rsmi_lib) - { - ERROR_PRINT(ROCm SMI library librocm_smi64.so not found: %s, dlerror()); - return -1; - } - - // 
Link HSA functions - DLSYM_AND_CHECK(dl_hsa_lib, hsa_init); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_shut_down); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_iterate_agents); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_agent_get_info); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_system_get_info); - - // Link Rocprofiler functions - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_iterate_info); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_close); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_open); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_error_string); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_start); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_stop); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_read); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_get_data); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_get_metrics); - - // Link SMI functions - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_init); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_shut_down); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_func_iterator_open); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_variant_iterator_open); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_func_iter_value_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_func_iter_next); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_func_iterator_close); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_power_ave_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_pci_throughput_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_pci_replay_counter_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_total_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_usage_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_busy_percent_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_reserved_pages_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_rpms_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_speed_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_speed_max_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_temp_metric_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_volt_metric_get); - 
DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_overdrive_level_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_ecc_count_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_compute_process_info_get); - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm libraries done); - return 0; -} - -typedef struct { - RocmonContext* context; - int numGpus; - const int* gpuIds; -} iterate_agents_cb_arg; - -typedef struct { - RocmonDevice* device; - int currIndex; -} iterate_info_cb_arg; - - -static hsa_status_t -_rocmon_iterate_info_callback_count(const rocprofiler_info_data_t info, void* data) -{ - RocmonDevice* device = (RocmonDevice*) data; - if (device) { - device->numRocMetrics++; - } - return HSA_STATUS_SUCCESS; -} - -static void -_rocmon_print_rocprofiler_info_data(const rocprofiler_info_data_t info) -{ - if (info.kind != ROCPROFILER_INFO_KIND_METRIC) - { - return; - } - printf("Name '%s':\n", info.metric.name); - printf("\tKind: '%s'\n", (info.kind == ROCPROFILER_INFO_KIND_METRIC ? "Metric" : "Trace")); - printf("\tInstances: %d\n", info.metric.instances); - printf("\tDescription: '%s'\n", info.metric.description); - printf("\tExpression: '%s'\n", info.metric.expr); - printf("\tBlockName: '%s'\n", info.metric.block_name); - printf("\tBlockCounters: %d\n", info.metric.block_counters); -} - -static hsa_status_t -_rocmon_iterate_info_callback_add(const rocprofiler_info_data_t info, void* data) -{ - iterate_info_cb_arg* arg = (iterate_info_cb_arg*) data; - - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _rocmon_iterate_info_callback_add); - if (likwid_rocmon_verbosity == DEBUGLEV_DEVELOP) - { - _rocmon_print_rocprofiler_info_data(info); - } - // Check info kind - if (info.kind != ROCPROFILER_INFO_KIND_METRIC) - { - ERROR_PRINT(Wrong info kind %u, info.kind); - return HSA_STATUS_ERROR; - } - - // Check index - if (arg->currIndex >= arg->device->numRocMetrics) - { - ERROR_PRINT(Metric index out of bounds: %d, arg->currIndex); - return HSA_STATUS_ERROR; - } - - // Copy info data - 
rocprofiler_info_data_t* target_info = &arg->device->rocMetrics[arg->currIndex]; - memcpy(target_info, &info, sizeof(rocprofiler_info_data_t)); - arg->currIndex++; - - return HSA_STATUS_SUCCESS; -} - - -static hsa_status_t -_rocmon_iterate_agents_callback(hsa_agent_t agent, void* argv) -{ - // Count number of callback invocations as the devices id - static int nextDeviceId = 0; - int deviceId = nextDeviceId; - bool noAgent = false; - - iterate_agents_cb_arg *arg = (iterate_agents_cb_arg*) argv; - - // Check if device is a GPU - hsa_device_type_t type; - ROCM_CALL(hsa_agent_get_info, (agent, HSA_AGENT_INFO_DEVICE, &type), return -1); - if (type != HSA_DEVICE_TYPE_GPU) - { - return HSA_STATUS_SUCCESS; - } - nextDeviceId++; - - // Check if device is includes in arg->gpuIds - int gpuIndex = -1; - for (int i = 0; i < arg->numGpus; i++) - { - if (deviceId == arg->gpuIds[i]) - { - gpuIndex = i; - break; - } - } - if (gpuIndex < 0) - { - return HSA_STATUS_SUCCESS; - } - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing agent %d, gpuIndex); - - // Add agent to context - RocmonDevice *device = &arg->context->devices[gpuIndex]; - device->deviceId = deviceId; - device->hsa_agent = agent; - device->context = NULL; - device->numActiveRocEvents = 0; - device->activeRocEvents = NULL; - device->numGroupResults = 0; - device->groupResults = NULL; - - // Get number of available metrics - device->numRocMetrics = 0; - ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_count, device), return HSA_STATUS_ERROR); - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, RocProfiler provides %d events, device->numRocMetrics); - - // workaround for bug in ROCm 5.4.0 - if(device->numRocMetrics == 0) { - ROCM_CALL(rocprofiler_iterate_info, (NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_count, device), return HSA_STATUS_ERROR); - noAgent = true; - } - - // Allocate memory for metrics - device->rocMetrics = (rocprofiler_info_data_t*) 
malloc(device->numRocMetrics * sizeof(rocprofiler_info_data_t)); - if (device->rocMetrics == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate set of rocMetrics); - return HSA_STATUS_ERROR; - } - - // Initialize SMI events map - if (init_map(&device->smiMetrics, MAP_KEY_TYPE_STR, 0, &free) < 0) - { - ERROR_PLAIN_PRINT(Cannot init smiMetrics map); - return HSA_STATUS_ERROR; - } - - // Fetch metric informatino - iterate_info_cb_arg info_arg = { - .device = device, - .currIndex = 0, - }; - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, Read %d RocProfiler events for device %d, device->numRocMetrics, device->deviceId); - - // If the call fails with agent, call rocprofiler_iterate_info without agent - if(noAgent) - { - ROCM_CALL(rocprofiler_iterate_info, (NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_add, &info_arg), return HSA_STATUS_ERROR); - } else { - ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_add, &info_arg), return HSA_STATUS_ERROR); - } - - return HSA_STATUS_SUCCESS; -} - - -static int -_rocmon_parse_eventstring(const char* eventString, GroupInfo* group) -{ - int err = 0; - Configuration_t config = get_configuration(); - bstring eventBString = bfromcstr(eventString); - - if (bstrchrp(eventBString, ':', 0) != BSTR_ERR) - { - // If custom group -> perfgroup_customGroup - err = perfgroup_customGroup(eventString, group); - if (err < 0) - { - ERROR_PRINT(Cannot transform %s to performance group, eventString); - return err; - } - } - else - { - // If performance group -> perfgroup_readGroup - err = perfgroup_readGroup(config->groupPath, "amd_gpu", eventString, group); - if (err == -EACCES) - { - ERROR_PRINT(Access to performance group %s not allowed, eventString); - return err; - } - else if (err == -ENODEV) - { - ERROR_PRINT(Performance group %s only available with deactivated HyperThreading, eventString); - return err; - } - if (err < 0) - { - ERROR_PRINT(Cannot read performance group %s, eventString); 
- return err; - } - } - - return 0; -} - - -static int -_rocmon_get_timestamp(uint64_t* timestamp_ns) -{ - uint64_t timestamp; - - // Get timestamp from system - ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP, ×tamp), return -1); - // Convert to nanoseconds - *timestamp_ns = (uint64_t)((long double)timestamp * rocmon_context->hsa_timestamp_factor); - - return 0; -} - - -static int -_rocmon_getLastResult(RocmonDevice* device, int eventId, double* value) -{ - rocprofiler_data_t* data = &device->activeRocEvents[eventId].data; - - switch (data->kind) - { - case ROCPROFILER_DATA_KIND_INT32: - *value = (double) data->result_int32; - break; - case ROCPROFILER_DATA_KIND_INT64: - *value = (double) data->result_int64; - break; - case ROCPROFILER_DATA_KIND_FLOAT: - *value = (double) data->result_float; - break; - case ROCPROFILER_DATA_KIND_DOUBLE: - *value = data->result_double; - break; - - case ROCPROFILER_DATA_KIND_BYTES: - case ROCPROFILER_DATA_KIND_UNINIT: - default: - return -1; - } - - return 0; -} - - -static int -_rocmon_readCounters_rocprofiler(RocmonDevice* device) -{ - int ret; - - // Check if there are any counters to start - if (device->numActiveRocEvents <= 0) - { - return 0; - } - - if (!device->context) - { - return 0; - } - - ROCM_CALL(rocprofiler_read, (device->context, 0), return -1); - ROCM_CALL(rocprofiler_get_data, (device->context, 0), return -1); - ROCM_CALL(rocprofiler_get_metrics, (device->context), return -1); - - // Update results - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveRocEvents; i++) - { - RocmonEventResult* result = &groupResult->results[i]; - - // Read value - ret = _rocmon_getLastResult(device, i, &result->fullValue); - if (ret < 0) - { - return -1; - } - - // Calculate delta since last read - result->lastValue = result->fullValue - result->lastValue; - } - - return 0; -} - - -static int -_rocmon_readCounters_smi(RocmonDevice* device) -{ - // 
Check if there are any counters to start - if (device->numActiveSmiEvents <= 0) - { - return 0; - } - - // Save baseline values - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveSmiEvents; i++) - { - double value = 0; - RocmonSmiEvent* event = &device->activeSmiEvents[i]; - RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; - - // Measure counter - if (event->measureFunc) - { - event->measureFunc(device->deviceId, event, result); - } - } - - return 0; -} - - -static int -_rocmon_readCounters(uint64_t* (*getDestTimestampFunc)(RocmonDevice* device)) -{ - int ret; - - // Get timestamp - uint64_t timestamp; - if (ret = _rocmon_get_timestamp(×tamp)) - { - return ret; - } - - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - - // Save timestamp - if (getDestTimestampFunc) - { - uint64_t* timestampDest = getDestTimestampFunc(device); - if (timestampDest) - { - *timestampDest = timestamp; - } - } - - // Read rocprofiler counters - ret = _rocmon_readCounters_rocprofiler(device); - if (ret < 0) return ret; - - // Read SMI counters - ret = _rocmon_readCounters_smi(device); - if (ret < 0) return ret; - } - - return 0; -} - - -static uint64_t* -_rocmon_get_read_time(RocmonDevice* device) -{ - return &device->time.read; -} - - -static uint64_t* -_rocmon_get_stop_time(RocmonDevice* device) -{ - return &device->time.stop; -} - - -// ---------------------------------------------------- -// Rocmon SMI helper functions -// ---------------------------------------------------- - -static bstring -_rocmon_smi_build_label(RocmonSmiEventType type, const char* funcname, uint64_t variant, uint64_t subvariant) -{ - switch (type) - { - case ROCMON_SMI_EVENT_TYPE_NORMAL: - return bfromcstr(funcname); - case ROCMON_SMI_EVENT_TYPE_VARIANT: - return bformat("%s|%" PRIu64, funcname, variant); - case 
ROCMON_SMI_EVENT_TYPE_SUBVARIANT: - return bformat("%s|%" PRIu64 "|%" PRIu64, funcname, variant, subvariant); - case ROCMON_SMI_EVENT_TYPE_INSTANCES: - return bfromcstr(funcname); - } -} - - -static int -_rocmon_smi_add_event_to_device(RocmonDevice* device, const char* funcname, RocmonSmiEventType type, int64_t variant, uint64_t subvariant) -{ - int ret; - - // Get event by label - RocmonSmiEventList* list = NULL; - bstring label = _rocmon_smi_build_label(type, funcname, variant, subvariant); - ret = get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list); - bdestroy(label); - if (ret < 0) - { - // Event not registered -> ignore - return 0; - } - - // For events with multiple sensor, only make one entry -> find if one exists - if (type == ROCMON_SMI_EVENT_TYPE_INSTANCES && subvariant > 0) - { - // Get list from map - for (int i = 0; i < list->numEntries; i++) - { - RocmonSmiEvent* event = &list->entries[i]; - RocmonSmiEvent* existingEvent = NULL; - ret = get_smap_by_key(device->smiMetrics, event->name, (void**)&existingEvent); - if (ret < 0) - { - ERROR_PRINT(Failed to find previous instance for event %s, event->name); - return -1; - } - - // Update instance information - existingEvent->instances++; - } - return 0; - } - - for (int i = 0; i < list->numEntries; i++) - { - RocmonSmiEvent* event = &list->entries[i]; - - // Allocate memory for device event description - RocmonSmiEvent* tmpEvent = (RocmonSmiEvent*) malloc(sizeof(RocmonSmiEvent)); - if (tmpEvent == NULL) - { - ERROR_PRINT(Failed to allocate memory for SMI event in device list %s, event->name); - return -ENOMEM; - } - - // Copy information from global description - memcpy(tmpEvent, event, sizeof(RocmonSmiEvent)); - tmpEvent->variant = variant; - tmpEvent->subvariant = subvariant; - tmpEvent->instances = 1; - - // Save event info to device event map - add_smap(device->smiMetrics, tmpEvent->name, tmpEvent); - } - - return 0; -} - - -static int 
-_rocmon_smi_get_function_subvariants(RocmonDevice* device, const char* funcname, uint64_t variant, rsmi_func_id_iter_handle_t var_iter) -{ - rsmi_func_id_iter_handle_t sub_var_iter; - rsmi_func_id_value_t value; - rsmi_status_t status; - int ret; - - // Get open subvariants iterator - status = (*rsmi_dev_supported_variant_iterator_open_ptr)(var_iter, &sub_var_iter); - if (status == RSMI_STATUS_NO_DATA) - { - // No subvariants - ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_VARIANT, variant, 0); - if (ret < 0) return -1; - return 0; - } - - // Subvariants available -> iterate them - do { - // Get subvariant information - (*rsmi_func_iter_value_get_ptr)(sub_var_iter, &value); - - // Process info - if (variant == RSMI_DEFAULT_VARIANT) - ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_INSTANCES, variant, value.id); - else - ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, variant, value.id); - if (ret < 0) return ret; - - // Advance iterator - status = (*rsmi_func_iter_next_ptr)(sub_var_iter); - } while (status != RSMI_STATUS_NO_DATA); - - // Close iterator - (*rsmi_dev_supported_func_iterator_close_ptr)(&sub_var_iter); - - return 0; -} - - -static int -_rocmon_smi_get_function_variants(RocmonDevice* device, const char* funcname, rsmi_func_id_iter_handle_t iter_handle) -{ - rsmi_func_id_iter_handle_t var_iter; - rsmi_func_id_value_t value; - rsmi_status_t status; - int ret; - - // Get open variants iterator - status = (*rsmi_dev_supported_variant_iterator_open_ptr)(iter_handle, &var_iter); - if (status == RSMI_STATUS_NO_DATA) - { - // No variants - ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0); - if (ret < 0) return -1; - return 0; - } - - // Variants available -> iterate them - do { - // Get variant information - (*rsmi_func_iter_value_get_ptr)(var_iter, &value); - - // Get function subvariants - ret = 
_rocmon_smi_get_function_subvariants(device, funcname, value.id, var_iter); - if (ret < 0) return -1; - - // Advance iterator - status = (*rsmi_func_iter_next_ptr)(var_iter); - } while (status != RSMI_STATUS_NO_DATA); - - // Close iterator - (*rsmi_dev_supported_func_iterator_close_ptr)(&var_iter); - - return 0; -} - - -static int -_rocmon_smi_get_functions(RocmonDevice* device) -{ - rsmi_func_id_iter_handle_t iter_handle; - rsmi_func_id_value_t value; - rsmi_status_t status; - int ret; - - // Open iterator - //(*rsmi_dev_supported_func_iterator_open_ptr)(device->deviceId, &iter_handle); - RSMI_CALL(rsmi_dev_supported_func_iterator_open, (device->deviceId, &iter_handle), { - return -1; - }); - - do - { - // Get function information - //(*rsmi_func_iter_value_get_ptr)(iter_handle, &value); - RSMI_CALL(rsmi_func_iter_value_get, (iter_handle, &value), { - ERROR_PRINT(Failed to get smi function value for device %d, device->deviceId); - RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); - return -1; - }); - - // Get function variants - ret = _rocmon_smi_get_function_variants(device, value.name, iter_handle); - if (ret < 0) - { - ERROR_PRINT(Failed to get smi function variants for device %d, device->deviceId); - RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); - return -1; - } - - // Advance iterator (cannot use RSMI_CALL macro here because we have an assignment, - // so we check that the function pointer exists to avoid segfaults.) 
- if (rsmi_func_iter_next_ptr) { - status = (*rsmi_func_iter_next_ptr)(iter_handle); - } - } while (status != RSMI_STATUS_NO_DATA); - - // Close iterator - //(*rsmi_dev_supported_func_iterator_close_ptr)(&iter_handle); - RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); - - // Add device independent functions - ret = _rocmon_smi_add_event_to_device(device, "rsmi_compute_process_info_get", ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0); - if (ret < 0) return -1; - - return 0; -} - -#define ADD_SMI_EVENT(name, type, smifunc, variant, subvariant, extra, measurefunc) if (_rocmon_smi_add_event_to_map(name, type, smifunc, variant, subvariant, extra, measurefunc) < 0) { return -1; } -#define ADD_SMI_EVENT_N(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_NORMAL, smifunc, 0, 0, extra, measurefunc) -#define ADD_SMI_EVENT_V(name, smifunc, variant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_VARIANT, smifunc, variant, 0, extra, measurefunc) -#define ADD_SMI_EVENT_S(name, smifunc, variant, subvariant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, smifunc, variant, subvariant, extra, measurefunc) -#define ADD_SMI_EVENT_I(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_INSTANCES, smifunc, 0, 0, extra, measurefunc) - -static int -_rocmon_smi_add_event_to_map(char* name, RocmonSmiEventType type, char* smifunc, uint64_t variant, uint64_t subvariant, uint64_t extra, RocmonSmiMeasureFunc measureFunc) -{ - // Add new event list to map (if not already present) - bstring label = _rocmon_smi_build_label(type, smifunc, variant, subvariant); - RocmonSmiEventList* list; - if (get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list) < 0) - { - // Allocate memory for event list - list = (RocmonSmiEventList*) malloc(sizeof(RocmonSmiEventList)); - if (list == NULL) - { - ERROR_PRINT(Failed to allocate memory for SMI event list %s, name); - return -ENOMEM; - } - 
list->entries = NULL; - list->numEntries = 0; - - add_smap(rocmon_context->smiEvents, bdata(label), list); - } - bdestroy(label); - - // Allocate memory for another event in list - list->numEntries++; - list->entries = (RocmonSmiEvent*) realloc(list->entries, list->numEntries * sizeof(RocmonSmiEvent)); - if (list->entries == NULL) - { - ERROR_PRINT(Failed to allocate memory for SMI event %s, name); - return -ENOMEM; - } - - // Set event properties - RocmonSmiEvent* event = &list->entries[list->numEntries-1]; - strncpy(event->name, name, sizeof(event->name)); - event->name[sizeof(event->name)] = '\0'; - event->type = type; - event->variant = variant; - event->subvariant = subvariant; - event->extra = extra; - event->instances = 0; // gets set when scanning supported device functions - event->measureFunc = measureFunc; - - return 0; -} - - -static void -_rcomon_smi_free_event_list(void* vlist) -{ - RocmonSmiEventList* list = (RocmonSmiEventList*)vlist; - if (list) - { - FREE_IF_NOT_NULL(list->entries); - free(list); - } -} - - -static int -_rocmon_smi_init_events() -{ - int ret; - - // Init map - ret = init_map(&rocmon_context->smiEvents, MAP_KEY_TYPE_STR, 0, &_rcomon_smi_free_event_list); - if (ret < 0) - { - ERROR_PRINT(Failed to create map for ROCm SMI events); - return -1; - } - - // Add events - ADD_SMI_EVENT_N("PCI_THROUGHPUT_SENT", "rsmi_dev_pci_throughput_get", 0, &_smi_wrapper_pci_throughput_get ); - ADD_SMI_EVENT_N("PCI_THROUGHPUT_RECEIVED", "rsmi_dev_pci_throughput_get", 1, &_smi_wrapper_pci_throughput_get ); - ADD_SMI_EVENT_N("PCI_THROUGHPUT_MAX_PKT_SZ", "rsmi_dev_pci_throughput_get", 2, &_smi_wrapper_pci_throughput_get ); - ADD_SMI_EVENT_N("PCI_REPLAY_COUNTER", "rsmi_dev_pci_replay_counter_get", 0, &_smi_wrapper_pci_replay_counter_get ); - ADD_SMI_EVENT_I("POWER_AVE", "rsmi_dev_power_ave_get", 0, &_smi_wrapper_power_ave_get ); - ADD_SMI_EVENT_V("MEMORY_TOTAL_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VRAM, 0, &_smi_wrapper_memory_total_get ); - 
ADD_SMI_EVENT_V("MEMORY_TOTAL_VIS_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_total_get ); - ADD_SMI_EVENT_V("MEMORY_TOTAL_GTT", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_total_get ); - ADD_SMI_EVENT_V("MEMORY_USAGE_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VRAM, 0, &_smi_wrapper_memory_usage_get ); - ADD_SMI_EVENT_V("MEMORY_USAGE_VIS_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_usage_get ); - ADD_SMI_EVENT_V("MEMORY_USAGE_GTT", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_usage_get ); - ADD_SMI_EVENT_N("MEMORY_BUSY_PERCENT", "rsmi_dev_memory_busy_percent_get", 0, &_smi_wrapper_memory_busy_percent_get ); - ADD_SMI_EVENT_N("MEMORY_NUM_RESERVED_PAGES", "rsmi_dev_memory_reserved_pages_get", 0, &_smi_wrapper_memory_reserved_pages_get ); - ADD_SMI_EVENT_I("FAN_RPMS", "rsmi_dev_fan_rpms_get", 0, &_smi_wrapper_fan_rpms_get ); - ADD_SMI_EVENT_I("FAN_SPEED", "rsmi_dev_fan_speed_get", 0, &_smi_wrapper_fan_speed_get ); - ADD_SMI_EVENT_I("FAN_SPEED_MAX", "rsmi_dev_fan_speed_max_get", 0, &_smi_wrapper_fan_speed_max_get ); - ADD_SMI_EVENT_S("TEMP_EDGE", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_EDGE, 0, &_smi_wrapper_temp_metric_get ); - ADD_SMI_EVENT_S("TEMP_JUNCTION", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_JUNCTION, 0, &_smi_wrapper_temp_metric_get ); - ADD_SMI_EVENT_S("TEMP_MEMORY", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_MEMORY, 0, &_smi_wrapper_temp_metric_get ); - ADD_SMI_EVENT_S("VOLT_VDDGFX", "rsmi_dev_volt_metric_get", RSMI_VOLT_CURRENT, RSMI_VOLT_TYPE_VDDGFX, 0, &_smi_wrapper_volt_metric_get ); - ADD_SMI_EVENT_N("OVERDRIVE_LEVEL", "rsmi_dev_overdrive_level_get", 0, &_smi_wrapper_overdrive_level_get ); - ADD_SMI_EVENT_V("ECC_COUNT_UMC_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 0, &_smi_wrapper_ecc_count_get ); - 
ADD_SMI_EVENT_V("ECC_COUNT_UMC_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SDMA_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SDMA_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_GFX_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_GFX_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_HDP_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_HDP_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 1, &_smi_wrapper_ecc_count_get ); - 
ADD_SMI_EVENT_V("ECC_COUNT_DF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_DF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SMN_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SMN_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SEM_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SEM_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP0_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP0_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP1_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP1_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_FUSE_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_FUSE_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_LAST_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_LAST_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_N("PROCS_USING_GPU", "rsmi_compute_process_info_get", 0, &_smi_wrapper_compute_process_info_get ); - - return 0; -} - - -int -rocmon_v1_init(int numGpus, const int* gpuIds) -{ - 
hsa_status_t status; - - // check if already initialized - if (rocmon_initialized) - { - return 0; - } - if (rocmon_context != NULL) - { - return -EEXIST; - } - - // Validate arguments - if (numGpus <= 0) - { - ERROR_PRINT(Number of gpus must be greater than 0 but only %d given, numGpus); - return -EINVAL; - } - - // Initialize other parts - init_configuration(); - - // initialize libraries - int ret = _rocmon_link_libraries(); - if (ret < 0) - { - ERROR_PLAIN_PRINT(Failed to initialize libraries); - return ret; - } - - // Allocate memory for context - rocmon_context = (RocmonContext*) malloc(sizeof(RocmonContext)); - if (rocmon_context == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate Rocmon context); - return -ENOMEM; - } - rocmon_context->groups = NULL; - rocmon_context->numGroups = 0; - rocmon_context->numActiveGroups = 0; - - rocmon_context->devices = (RocmonDevice*) malloc(numGpus * sizeof(RocmonDevice)); - rocmon_context->numDevices = numGpus; - if (rocmon_context->devices == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate set of GPUs); - free(rocmon_context); - rocmon_context = NULL; - return -ENOMEM; - } - - // init hsa library - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing HSA); - ROCM_CALL(hsa_init, (), - { - ERROR_PLAIN_PRINT(Failed to init hsa library); - goto rocmon_init_hsa_failed; - }); - - // init rocm smi library - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RSMI); - RSMI_CALL(rsmi_init, (0), - { - ERROR_PLAIN_PRINT(Failed to init rocm_smi); - goto rocmon_init_rsmi_failed; - }); - - // Get hsa timestamp factor - uint64_t frequency_hz; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Getting HSA timestamp factor); - ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &frequency_hz), - { - ERROR_PLAIN_PRINT(Failed to get HSA timestamp factor); - goto rocmon_init_info_agents_failed; - }); - rocmon_context->hsa_timestamp_factor = (long double)1000000000 / (long double)frequency_hz; - - // initialize structures for specified devices 
(fetch ROCm specific info) - iterate_agents_cb_arg arg = { - .context = rocmon_context, - .numGpus = numGpus, - .gpuIds = gpuIds, - }; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Iterating through %d available agents, numGpus); - ROCM_CALL(hsa_iterate_agents, (_rocmon_iterate_agents_callback, &arg), - { - ERROR_PRINT(Error while iterating through available agents); - goto rocmon_init_info_agents_failed; - }); - - // Get available SMI events for devices - _rocmon_smi_init_events(); - for (int i = 0; i < rocmon_context->numDevices; i++) - { - if (_rocmon_smi_get_functions(&rocmon_context->devices[i]) < 0) - { - ERROR_PRINT(Failed to get SMI functions for device %d, rocmon_context->devices[i].deviceId); - goto rocmon_init_info_agents_failed; - } - } - - rocmon_initialized = TRUE; - return 0; -rocmon_init_info_agents_failed: - RSMI_CALL(rsmi_shut_down, (), { - // fall through - }); -rocmon_init_rsmi_failed: - ROCM_CALL(hsa_shut_down, (), { - // fall through - }); -rocmon_init_hsa_failed: - free(rocmon_context->devices); - free(rocmon_context); - rocmon_context = NULL; - return -1; -} - - -void -rocmon_v1_finalize(void) -{ - RocmonContext* context = rocmon_context; - - if (!rocmon_initialized) - { - return; - } - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Finalize LIKWID ROCMON); - - if (context) - { - if (context->devices) - { - // Free each devices fields - for (int i = 0; i < context->numDevices; i++) - { - RocmonDevice* device = &context->devices[i]; - FREE_IF_NOT_NULL(device->rocMetrics); - FREE_IF_NOT_NULL(device->activeRocEvents); - FREE_IF_NOT_NULL(device->activeSmiEvents); - if (device->groupResults) - { - // Free events of event result lists - for (int j = 0; j < device->numGroupResults; j++) - { - FREE_IF_NOT_NULL(device->groupResults[i].results); - } - // Free list - free(device->groupResults); - } - if (device->context) - { - ROCM_CALL(rocprofiler_close, (device->context),); - } - destroy_smap(device->smiMetrics); - } - - free(context->devices); - context->devices = 
NULL; - } - - FREE_IF_NOT_NULL(context->groups); - destroy_smap(context->smiEvents); - - free(context); - context = NULL; - } - - RSMI_CALL(rsmi_shut_down, (), { - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown SMI); - // fall through - }); - ROCM_CALL(hsa_shut_down, (), { - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown HSA); - // fall through - }); -} - - -int -rocmon_v1_addEventSet(const char* eventString, int* gid) -{ - // Check arguments - if (!eventString) - { - return -EINVAL; - } - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Allocate memory for event group if necessary - if (rocmon_context->numActiveGroups == rocmon_context->numGroups) - { - GroupInfo* tmpInfo = (GroupInfo*) realloc(rocmon_context->groups, (rocmon_context->numGroups+1) * sizeof(GroupInfo)); - if (tmpInfo == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate additional group); - return -ENOMEM; - } - rocmon_context->groups = tmpInfo; - rocmon_context->numGroups++; - } - - // Parse event string - int err = _rocmon_parse_eventstring(eventString, &rocmon_context->groups[rocmon_context->numActiveGroups]); - if (err < 0) - { - return err; - } - - // Allocate memory for event results - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - - // Allocate memory for event results - int numEvents = rocmon_context->groups[rocmon_context->numActiveGroups].nevents; - RocmonEventResult* tmpResults = (RocmonEventResult*) malloc(numEvents * sizeof(RocmonEventResult)); - if (tmpResults == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate event results); - return -ENOMEM; - } - - // Allocate memory for new event result list entry - RocmonEventResultList* tmpGroupResults = (RocmonEventResultList*) realloc(device->groupResults, (device->numGroupResults+1) * sizeof(RocmonEventResultList)); - if (tmpGroupResults == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate new event group result list); - return -ENOMEM; - } - - 
device->groupResults = tmpGroupResults; - device->groupResults[device->numGroupResults].results = tmpResults; - device->groupResults[device->numGroupResults].numResults = numEvents; - device->numGroupResults++; - } - - *gid = rocmon_context->numActiveGroups; - rocmon_context->numActiveGroups++; - return 0; -} - - -static int -_rocmon_setupCounters_rocprofiler(RocmonDevice* device, const char** events, int numEvents) -{ - // Close previous rocprofiler context - if (device->context) - { - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Closing previous rocprofiler context); - ROCM_CALL(rocprofiler_close, (device->context), return -1); - } - - // Look if the are any events - if (numEvents <= 0) - { - return 0; - } - - // Create feature array to monitor - rocprofiler_feature_t* features = (rocprofiler_feature_t*) malloc(numEvents * sizeof(rocprofiler_feature_t)); - if (features == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate feature list); - return -ENOMEM; - } - for (int i = 0; i < numEvents; i++) - { - features[i].kind = ROCPROFILER_FEATURE_KIND_METRIC; - features[i].name = events[i]; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP EVENT %d %s, i, events[i]); - } - - // Free previous feature array if present - FREE_IF_NOT_NULL(device->activeRocEvents); - - device->numActiveRocEvents = numEvents; - device->activeRocEvents = features; - - // Open context - rocprofiler_properties_t properties = {}; - properties.queue_depth = 128; - uint32_t mode = ROCPROFILER_MODE_STANDALONE | ROCPROFILER_MODE_CREATEQUEUE | ROCPROFILER_MODE_SINGLEGROUP; - - // Important: only a single profiling group is supported at this time which limits the number of events that can be monitored at a time. 
- ROCM_CALL(rocprofiler_open, (device->hsa_agent, device->activeRocEvents, device->numActiveRocEvents, &device->context, mode, &properties), return -1); - - return 0; -} - - -static int -_rocmon_setupCounters_smi(RocmonDevice* device, const char** events, int numEvents) -{ - int ret; - const int instanceNumLen = 5; - - // Delete previous events - if (device->activeSmiEvents) - { - device->activeSmiEvents = NULL; - device->numActiveSmiEvents = 0; - } - - // Look if the are any events - if (numEvents <= 0) - { - return 0; - } - - // Create event array - RocmonSmiEvent* activeEvents = (RocmonSmiEvent*) malloc(numEvents * sizeof(RocmonSmiEvent)); - if (activeEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate active event list); - return -ENOMEM; - } - - for (int i = 0; i < numEvents; i++) - { - char eventName[membersize(RocmonSmiEvent, name)]; - int instance = -1; - - // Parse event name -> normal event vs one with multiple instances (EVENT[0]) - const char* event = events[i]; - char* instancePart = strrchr(event, '['); - if (instancePart != NULL) - { - char withoutBrackets[instanceNumLen+1]; // +1 is '\0' - int partlen = strlen(instancePart); - - // Check if number fit in 'withoutBrackets' - if (partlen - 2 > instanceNumLen) - { - ERROR_PRINT(Instance number in '%s' is too large, event); - free(activeEvents); - return -EINVAL; - } - - // Copy instance number without brackets - strncpy(withoutBrackets, instancePart+1, partlen-2); - withoutBrackets[instanceNumLen] = '\0'; - - // Parse instance as number - char* endParsed; - instance = strtol(withoutBrackets, &endParsed, 10); - - // Check if parsing was successful - char* endOfString = &withoutBrackets[partlen-2]; - if (endParsed != endOfString) - { - ERROR_PRINT(Failed to parse instance number in '%s', event); - free(activeEvents); - return -EINVAL; - } - - // Copy event name without instance - int eventNameLen = instancePart - event; - strncpy(eventName, event, eventNameLen); - eventName[eventNameLen] = '\0'; - } 
- else - { - // Copy entire event name - strncpy(eventName, event, membersize(RocmonSmiEvent, name)); - } - - // Lookup event in available events - RocmonSmiEvent* metric = NULL; - ret = get_smap_by_key(device->smiMetrics, eventName, (void**)&metric); - if (ret < 0) - { - ERROR_PRINT(RSMI event '%s' not found for device %d, eventName, device->deviceId); - free(activeEvents); - return -EINVAL; - } - - // Copy event - RocmonSmiEvent* tmpEvent = &activeEvents[i]; - memcpy(tmpEvent, metric, sizeof(RocmonSmiEvent)); - - // Check if event supports instances - if (instance >= 0 && tmpEvent->type != ROCMON_SMI_EVENT_TYPE_INSTANCES) - { - ERROR_PRINT(Instance number given but event '%s' does not support one, eventName); - free(activeEvents); - return -EINVAL; - } - - // Check if event requires instances - if (instance < 0 && tmpEvent->type == ROCMON_SMI_EVENT_TYPE_INSTANCES) - { - ERROR_PRINT(No instance number given but event '%s' requires one, eventName); - free(activeEvents); - return -EINVAL; - } - - // Check if event has enough instances - if (instance >= 0 && instance >= metric->instances) - { - ERROR_PRINT(Instance %d seleced but event '%s' has only %d, instance, eventName, metric->instances); - free(activeEvents); - return -EINVAL; - } - - // Set instance number - if (instance >= 0) - { - tmpEvent->subvariant = instance; - } - } - - device->activeSmiEvents = activeEvents; - device->numActiveSmiEvents = numEvents; - - return 0; -} - - -int -rocmon_v1_setupCounters(int gid) -{ - int ret; - - // Check arguments - if (gid < 0 || gid >= rocmon_context->numActiveGroups) - { - return -EINVAL; - } - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Get group info - GroupInfo* group = &rocmon_context->groups[gid]; - - // - // Separate rocprofiler and SMI events - // - const char **smiEvents = NULL, **rocEvents = NULL; - int numSmiEvents = 0, numRocEvents = 0; - - // Allocate memory for string arrays - smiEvents = (const char**) 
malloc(group->nevents * sizeof(const char*)); - if (smiEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate smiEvent name array); - return -ENOMEM; - } - rocEvents = (const char**) malloc(group->nevents * sizeof(const char*)); - if (rocEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate rocEvent name array); - free(smiEvents); - return -ENOMEM; - } - - // Go through each event and sort it - for (int i = 0; i < group->nevents; i++) - { - const char* name = group->events[i]; - if (strncmp(name, "RSMI_", 5) == 0) - { - // RSMI event - smiEvents[numSmiEvents] = name + 5; // +5 removes 'RSMI_' prefix - numSmiEvents++; - } - else if (strncmp(name, "ROCP_", 5) == 0) - { - // Rocprofiler event - rocEvents[numRocEvents] = name + 5; // +5 removes 'ROCP_' prefix - numRocEvents++; - } - else - { - // Unknown event - ERROR_PRINT(Event '%s' has no prefix ('ROCP_' or 'RSMI_'), name); - return -EINVAL; - } - } - - // Add events to each device - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - - // Add rocprofiler events - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCPROFILER WITH %d events, numRocEvents); - ret = _rocmon_setupCounters_rocprofiler(device, rocEvents, numRocEvents); - if (ret < 0) - { - free(smiEvents); - free(rocEvents); - return ret; - } - - // Add SMI events - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCM SMI WITH %d events, numSmiEvents); - ret = _rocmon_setupCounters_smi(device, smiEvents, numSmiEvents); - if (ret < 0) - { - free(smiEvents); - free(rocEvents); - return ret; - } - } - rocmon_context->activeGroup = gid; - - // Cleanup - free(smiEvents); - free(rocEvents); - - return 0; -} - - -static int -_rocmon_startCounters_rocprofiler(RocmonDevice* device) -{ - // Check if there are any counters to start - if (device->numActiveRocEvents <= 0) - { - return 0; - } - - // Reset results - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < 
device->numActiveRocEvents; i++) - { - RocmonEventResult* result = &groupResult->results[i]; - result->lastValue = 0; - result->fullValue = 0; - } - - if (device->context) - { - ROCM_CALL(rocprofiler_start, (device->context, 0), return -1); - } - - return 0; -} - - -static int -_rocmon_startCounters_smi(RocmonDevice* device) -{ - // Check if there are any counters to start - if (device->numActiveSmiEvents <= 0) - { - return 0; - } - - // Save baseline values - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveSmiEvents; i++) - { - double value = 0; - RocmonSmiEvent* event = &device->activeSmiEvents[i]; - RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; - - // Measure counter - if (event->measureFunc) - { - event->measureFunc(device->deviceId, event, result); - } - - // Save value - result->fullValue = 0; - } - - return 0; -} - - -int -rocmon_v1_startCounters(void) -{ - int ret; - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Get timestamp - uint64_t timestamp; - if (ret = _rocmon_get_timestamp(×tamp)) - { - return ret; - } - - // Start counters on each device - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - device->time.start = timestamp; - device->time.read = timestamp; - - // Start rocprofiler events - ret = _rocmon_startCounters_rocprofiler(device); - if (ret < 0) return ret; - - // Start SMI events - _rocmon_startCounters_smi(device); - if (ret < 0) return ret; - } - - return 0; -} - - -static int -_rocmon_stopCounters_rocprofiler(RocmonDevice* device) -{ - if (device->context) - { - // Close context - ROCM_CALL(rocprofiler_stop, (device->context, 0), return -1); - } - - return 0; -} - - -int -rocmon_v1_stopCounters(void) -{ - int ret; - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Read 
counters - ret = _rocmon_readCounters(&_rocmon_get_stop_time); - if (ret < 0) return ret; - - for (int i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - - // Stop rocprofiler events - ret = _rocmon_stopCounters_rocprofiler(device); - if (ret < 0) return ret; - - // Nothing to stop for SMI events - } - - return 0; -} - - -int -rocmon_v1_readCounters(void) -{ - int ret; - - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Read counters - ret = _rocmon_readCounters(&_rocmon_get_read_time); - if (ret < 0) return ret; - - return 0; -} - - -double -rocmon_v1_getResult(int gpuIdx, int groupId, int eventId) -{ - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Validate gpuIdx - if (gpuIdx < 0 || gpuIdx >= rocmon_context->numDevices) - { - return -EFAULT; - } - - // Validate groupId - RocmonDevice* device = &rocmon_context->devices[gpuIdx]; - if (groupId < 0 || groupId >= device->numGroupResults) - { - return -EFAULT; - } - - // Validate eventId - RocmonEventResultList* groupResult = &device->groupResults[groupId]; - if (eventId < 0 || eventId >= groupResult->numResults) - { - return -EFAULT; - } - - // Return result - return groupResult->results[eventId].fullValue; -} - - -// TODO: multiple groups -double -rocmon_v1_getLastResult(int gpuIdx, int groupId, int eventId) -{ - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Validate gpuIdx - if (gpuIdx < 0 || gpuIdx >= rocmon_context->numDevices) - { - return -EFAULT; - } - - // Validate groupId - RocmonDevice* device = &rocmon_context->devices[gpuIdx]; - if (groupId < 0 || groupId >= device->numGroupResults) - { - return -EFAULT; - } - - // Validate eventId - RocmonEventResultList* groupResult = &device->groupResults[groupId]; - if (eventId < 0 || eventId >= groupResult->numResults) - { - return -EFAULT; - } - - // Return result - return 
groupResult->results[eventId].lastValue; -} - - -int -rocmon_v1_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list) -{ - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Validate args - if (gpuIdx < 0 || gpuIdx > rocmon_context->numDevices) - { - return -EINVAL; - } - if (list == NULL) - { - return -EINVAL; - } - - RocmonDevice* device = &rocmon_context->devices[gpuIdx]; - - // Allocate list structure - EventList_rocm_t tmpList = (EventList_rocm_t) malloc(sizeof(EventList_rocm)); - if (tmpList == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate event list); - return -ENOMEM; - } - - // Get number of events - printf("NUmber of events %d + %d\n", device->numRocMetrics , get_map_size(device->smiMetrics)); - tmpList->numEvents = device->numRocMetrics + get_map_size(device->smiMetrics); - if (tmpList->numEvents == 0) - { - // No events -> return empty list - tmpList->events = NULL; - *list = tmpList; - return 0; - } - - // Allocate event array - tmpList->events = (Event_rocm_t*) malloc(tmpList->numEvents * sizeof(Event_rocm_t)); - if (tmpList->events == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate events for event list); - free(tmpList); - return -ENOMEM; - } - - // Copy rocprofiler event information - for (int i = 0; i < device->numRocMetrics; i++) - { - rocprofiler_info_data_t* event = &device->rocMetrics[i]; - Event_rocm_t* out = &tmpList->events[i]; - int len; - - // Copy name - printf("Name %s\n", event->metric.name); - len = strlen(event->metric.name) + 5 /* Prefix */ + 1 /* NULL byte */; - out->name = (char*) malloc(len); - if (out->name) - { - snprintf(out->name, len, "ROCP_%s", event->metric.name); - } - - // Copy description - len = strlen(event->metric.description) + 1 /* NULL byte */; - out->description = (char*) malloc(len); - if (out->description) - { - snprintf(out->description, len, "%s", event->metric.description); - } - - // Copy instances - out->instances = event->metric.instances; - } - - // Copy ROCm SMI 
metric information - for (int i = 0; i < get_map_size(device->smiMetrics); i++) - { - RocmonSmiEvent* event = NULL; - Event_rocm_t* out = &tmpList->events[device->numRocMetrics + i]; - int len; - - // Get event - if (get_smap_by_idx(device->smiMetrics, i, (void**)&event) < 0) - { - continue; - } - - // Copy name - len = strlen(event->name) + 5 /* Prefix */ + 1 /* NULL byte */; - out->name = (char*) malloc(len); - if (out->name) - { - snprintf(out->name, len, "RSMI_%s", event->name); - } - - // Copy description - char* description = "SMI Event"; // TODO: use real descriptions - len = strlen(description) + 1 /* NULL byte */; - out->description = (char*) malloc(len); - if (out->description) - { - snprintf(out->description, len, "%s", description); - } - - // Copy instances - out->instances = event->instances; - } - - *list = tmpList; - return 0; -} - -void -rocmon_v1_freeEventsOfGpu(EventList_rocm_t list) -{ -#define FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } - - // Check pointer - if (list == NULL) - { - return; - } - - if (list->events != NULL) - { - for (int i = 0; i < list->numEvents; i++) - { - Event_rocm_t* event = &list->events[i]; - FREE_IF_NOT_NULL(event->name); - FREE_IF_NOT_NULL(event->description); - } - free(list->events); - } - free(list); -} - - -int -rocmon_v1_switchActiveGroup(int newGroupId) -{ - int ret; - - ret = rocmon_stopCounters(); - if (ret < 0) - { - return ret; - } - - ret = rocmon_setupCounters(newGroupId); - if (ret < 0) - { - return ret; - } - - ret = rocmon_startCounters(); - if (ret < 0) - { - return ret; - } - - return 0; -} - - -int -rocmon_v1_getNumberOfGroups(void) -{ - if (!rocmon_context || !rocmon_initialized) - { - return -EFAULT; - } - return rocmon_context->numActiveGroups; -} - - -int -rocmon_v1_getIdOfActiveGroup(void) -{ - if (!rocmon_context || !rocmon_initialized) - { - return -EFAULT; - } - return rocmon_context->activeGroup; -} - - -int -rocmon_v1_getNumberOfGPUs(void) -{ - if (!rocmon_context || 
!rocmon_initialized) - { - return -EFAULT; - } - return rocmon_context->numDevices; -} - - -int -rocmon_v1_getNumberOfEvents(int groupId) -{ - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return -EFAULT; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->nevents; -} - - -int -rocmon_v1_getNumberOfMetrics(int groupId) -{ - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) - { - return -EFAULT; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->nmetrics; -} - - -double -rocmon_v1_getTimeOfGroup(int groupId) -{ - int i = 0; - double t = 0; - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) - { - return -EFAULT; - } - for (i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - t = MAX(t, (double)(device->time.stop - device->time.start)); - } - return t*1E-9; -} - - -double -rocmon_v1_getLastTimeOfGroup(int groupId) -{ - int i = 0; - double t = 0; - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) - { - return -EFAULT; - } - for (i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - t = MAX(t, (double)(device->time.stop - device->time.read)); - } - return t*1E-9; -} - - -double -rocmon_v1_getTimeToLastReadOfGroup(int groupId) -{ - int i = 0; - double t = 0; - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) - { - return -EFAULT; - } - for (i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - t = MAX(t, (double)(device->time.read - device->time.start)); - } - return t*1E-9; -} - - -char* -rocmon_v1_getEventName(int groupId, int eventId) -{ - if (!rocmon_context || !rocmon_initialized || 
(groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - if ((eventId < 0) || (eventId >= ginfo->nevents)) - { - return NULL; - } - return ginfo->events[eventId]; -} - - -char* -rocmon_v1_getCounterName(int groupId, int eventId) -{ - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - if ((eventId < 0) || (eventId >= ginfo->nevents)) - { - return NULL; - } - return ginfo->counters[eventId]; -} - - -char* -rocmon_v1_getMetricName(int groupId, int metricId) -{ - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - if ((metricId < 0) || (metricId >= ginfo->nmetrics)) - { - return NULL; - } - return ginfo->metricnames[metricId]; -} - - -char* -rocmon_v1_getGroupName(int groupId) -{ - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->groupname; -} - - -char* -rocmon_v1_getGroupInfoShort(int groupId) -{ - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->shortinfo; -} - - -char* -rocmon_v1_getGroupInfoLong(int groupId) -{ - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId >= rocmon_context->numActiveGroups) - { - return NULL; - } - GroupInfo* ginfo = &rocmon_context->groups[groupId]; - return ginfo->longinfo; -} - - -int -rocmon_v1_getGroups(char*** groups, char*** shortinfos, char*** longinfos) -{ - init_configuration(); - Configuration_t config = get_configuration(); - - return 
perfgroup_getGroups(config->groupPath, "amd_gpu_v1", groups, shortinfos, longinfos); -} - - -int -rocmon_v1_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos) -{ - perfgroup_returnGroups(nrgroups, groups, shortinfos, longinfos); -} - - - -#endif /* LIKWID_WITH_ROCMON */ diff --git a/test/test_rocmon.c b/test/test_rocmon.c new file mode 100644 index 000000000..89df579b0 --- /dev/null +++ b/test/test_rocmon.c @@ -0,0 +1,72 @@ +#include +#include + + +#include + + + + + +int main(int argc, char* argv[]) +{ + int gpuId = 0; + int ret = 0; + int gid = -1; + rocmon_setVerbosity(DEBUGLEV_DEVELOP); + ret = rocmon_init(1, &gpuId); + if (ret < 0) + { + printf("rocmon_init failed with %d\n", ret); + return ret; + } + ret = rocmon_addEventSet("ROCP_SQ_WAVES:ROCM0", &gid); + if (ret < 0) + { + printf("rocmon_addEventSet failed with %d\n", ret); + rocmon_finalize(); + return ret; + } + printf("test_rocmon -- Event set ID %d\n", gid); + ret = rocmon_setupCounters(gid); + if (ret < 0) + { + printf("rocmon_setupCounters failed with %d\n", ret); + rocmon_finalize(); + return ret; + } + ret = rocmon_startCounters(); + if (ret < 0) + { + printf("rocmon_startCounters failed with %d\n", ret); + rocmon_finalize(); + return ret; + } + printf("test_rocmon -- Counters running\n"); + ret = rocmon_readCounters(); + if (ret < 0) + { + printf("rocmon_startCounters failed with %d\n", ret); + rocmon_finalize(); + return ret; + } + printf("test_rocmon -- Counters running\n"); + ret = rocmon_readCounters(); + if (ret < 0) + { + printf("rocmon_startCounters failed with %d\n", ret); + rocmon_finalize(); + return ret; + } + printf("test_rocmon -- Counters running\n"); + ret = rocmon_stopCounters(); + if (ret < 0) + { + printf("rocmon_stopCounters failed with %d\n", ret); + rocmon_finalize(); + return ret; + } + printf("test_rocmon -- Counters stopped\n"); + rocmon_finalize(); + return 0; +} From 09a94f23a815960ba9d92b57648587413b7db31d Mon Sep 17 00:00:00 2001 From: Thomas 
Roehl Date: Thu, 31 Oct 2024 17:54:45 +0100 Subject: [PATCH 15/29] Delete group directory for amd_gpu_sdk, no differentiation required --- groups/amd_gpu_sdk/GDS.txt | 15 --------------- groups/amd_gpu_sdk/MEM.txt | 18 ------------------ groups/amd_gpu_sdk/PCI.txt | 23 ----------------------- groups/amd_gpu_sdk/POWER.txt | 21 --------------------- groups/amd_gpu_sdk/SALU.txt | 15 --------------- groups/amd_gpu_sdk/SFETCH.txt | 15 --------------- groups/amd_gpu_sdk/STALLED.txt | 19 ------------------- groups/amd_gpu_sdk/UTIL.txt | 18 ------------------ groups/amd_gpu_sdk/VALU.txt | 15 --------------- groups/amd_gpu_sdk/WAVE.txt | 15 --------------- 10 files changed, 174 deletions(-) delete mode 100644 groups/amd_gpu_sdk/GDS.txt delete mode 100644 groups/amd_gpu_sdk/MEM.txt delete mode 100644 groups/amd_gpu_sdk/PCI.txt delete mode 100644 groups/amd_gpu_sdk/POWER.txt delete mode 100644 groups/amd_gpu_sdk/SALU.txt delete mode 100644 groups/amd_gpu_sdk/SFETCH.txt delete mode 100644 groups/amd_gpu_sdk/STALLED.txt delete mode 100644 groups/amd_gpu_sdk/UTIL.txt delete mode 100644 groups/amd_gpu_sdk/VALU.txt delete mode 100644 groups/amd_gpu_sdk/WAVE.txt diff --git a/groups/amd_gpu_sdk/GDS.txt b/groups/amd_gpu_sdk/GDS.txt deleted file mode 100644 index 39c3446be..000000000 --- a/groups/amd_gpu_sdk/GDS.txt +++ /dev/null @@ -1,15 +0,0 @@ -SHORT GDS Instructions - -EVENTSET -ROCM0 ROCP_SQ_INSTS_GDS -ROCM1 ROCP_SQ_WAVES - -METRICS -GPU GDS rw insts per work-item ROCM0/ROCM1 - -LONG -Formulas: -GPU GDS rw insts per work-item = ROCP_SQ_INSTS_GDS/ROCP_SQ_WAVES --- -The average number of GDS read or GDS write instructions executed -per work item (affected by flow control). 
diff --git a/groups/amd_gpu_sdk/MEM.txt b/groups/amd_gpu_sdk/MEM.txt deleted file mode 100644 index acc63a627..000000000 --- a/groups/amd_gpu_sdk/MEM.txt +++ /dev/null @@ -1,18 +0,0 @@ -SHORT Memory utilization - -EVENTSET -ROCM0 ROCP_TA_TA_BUSY -ROCM1 ROCP_GRBM_GUI_ACTIVE -ROCM2 ROCP_SE_NUM - -METRICS -GPU memory utilization 100*max(ROCM0,16)/ROCM1/ROCM2 - -LONG -Formulas: -GPU memory utilization = 100*max(ROCP_TA_TA_BUSY,16)/ROCP_GRBM_GUI_ACTIVE/ROCP_SE_NUM --- -The percentage of GPUTime the memory unit is active. The result includes -the stall time (MemUnitStalled). This is measured with all extra fetches -and writes and any cache or memory effects taken into account. -Value range: 0% to 100% (fetch-bound). diff --git a/groups/amd_gpu_sdk/PCI.txt b/groups/amd_gpu_sdk/PCI.txt deleted file mode 100644 index cefaf307d..000000000 --- a/groups/amd_gpu_sdk/PCI.txt +++ /dev/null @@ -1,23 +0,0 @@ -SHORT PCI Transfers - -EVENTSET -ROCM0 RSMI_PCI_THROUGHPUT_SENT -ROCM1 RSMI_PCI_THROUGHPUT_RECEIVED - - -METRICS -Runtime time -PCI sent ROCM0 -PCI received ROCM1 -PCI send bandwidth 1E-6*ROCM0/time -PCI recv bandwidth 1E-6*ROCM1/time - -LONG -Formulas: -PCI sent = RSMI_PCI_THROUGHPUT_SENT -PCI received = RSMI_PCI_THROUGHPUT_RECEIVED -PCI send bandwidth = 1E-6*RSMI_PCI_THROUGHPUT_SENT/runtime -PCI recv bandwidth = 1E-6*RSMI_PCI_THROUGHPUT_RECEIVED/runtime --- -Currently not usable since the RSMI_PCI_THROUGHPUT_* events require -one second per call, so 2 seconds for both of them. 
diff --git a/groups/amd_gpu_sdk/POWER.txt b/groups/amd_gpu_sdk/POWER.txt deleted file mode 100644 index 49830efc0..000000000 --- a/groups/amd_gpu_sdk/POWER.txt +++ /dev/null @@ -1,21 +0,0 @@ -SHORT Power, temperature and voltage - -EVENTSET -ROCM0 RSMI_POWER_AVE[0] -ROCM1 RSMI_TEMP_EDGE -ROCM2 RSMI_VOLT_VDDGFX - - -METRICS -Power average 1E-6*ROCM0 -Edge temperature 1E-3*ROCM1 -Voltage 1E-3*ROCM2 - -LONG -Formulas: -Power average = RSMI_POWER_AVE[0] -Edge temperature = 1E-3*RSMI_TEMP_EDGE -Voltage = 1E-3*RSMI_VOLT_VDDGFX --- -Gets the current average power consumption in watts, the -temperature in celsius and the voltage in volts. diff --git a/groups/amd_gpu_sdk/SALU.txt b/groups/amd_gpu_sdk/SALU.txt deleted file mode 100644 index a693421d1..000000000 --- a/groups/amd_gpu_sdk/SALU.txt +++ /dev/null @@ -1,15 +0,0 @@ -SHORT SALU Instructions - -EVENTSET -ROCM0 ROCP_SQ_INSTS_SALU -ROCM1 ROCP_SQ_WAVES - -METRICS -GPU SALU insts per work-item ROCM0/ROCM1 - -LONG -Formulas: -GPU SALU insts per work-item = ROCP_SQ_INSTS_SALU/ROCP_SQ_WAVES --- -The average number of scalar ALU instructions executed per work-item -(affected by flow control). diff --git a/groups/amd_gpu_sdk/SFETCH.txt b/groups/amd_gpu_sdk/SFETCH.txt deleted file mode 100644 index bd0dfc3ff..000000000 --- a/groups/amd_gpu_sdk/SFETCH.txt +++ /dev/null @@ -1,15 +0,0 @@ -SHORT SFetch Instructions - -EVENTSET -ROCM0 ROCP_SQ_INSTS_SMEM -ROCM1 ROCP_SQ_WAVES - -METRICS -GPU SFETCH insts per work-item ROCM0/ROCM1 - -LONG -Formulas: -GPU SFETCH insts per work-item = ROCP_SQ_INSTS_SMEM/ROCP_SQ_WAVES --- -The average number of scalar fetch instructions from the video memory -executed per work-item (affected by flow control). 
diff --git a/groups/amd_gpu_sdk/STALLED.txt b/groups/amd_gpu_sdk/STALLED.txt deleted file mode 100644 index 9d6dc42c4..000000000 --- a/groups/amd_gpu_sdk/STALLED.txt +++ /dev/null @@ -1,19 +0,0 @@ -SHORT ALU stalled by LDS - -EVENTSET -ROCM0 ROCP_SQ_WAIT_INST_LDS -ROCM1 ROCP_SQ_WAVES -ROCM2 ROCP_GRBM_GUI_ACTIVE - -METRICS -GPU ALD stalled 100*ROCM0*4/ROCM1/ROCM2 - -LONG -Formulas: -GPU ALD stalled = 100*ROCP_SQ_WAIT_INST_LDS*4/ROCP_SQ_WAVES/ROCP_GRBM_GUI_ACTIVE --- -The percentage of GPUTime ALU units are stalled by the LDS input queue -being full or the output queue being not ready. If there are LDS bank -conflicts, reduce them. Otherwise, try reducing the number of LDS -accesses if possible. -Value range: 0% (optimal) to 100% (bad). diff --git a/groups/amd_gpu_sdk/UTIL.txt b/groups/amd_gpu_sdk/UTIL.txt deleted file mode 100644 index 7d9271e11..000000000 --- a/groups/amd_gpu_sdk/UTIL.txt +++ /dev/null @@ -1,18 +0,0 @@ -SHORT GPU utilization - -EVENTSET -ROCM0 ROCP_GRBM_COUNT -ROCM1 ROCP_GRBM_GUI_ACTIVE - - -METRICS -GPU utilization 100*ROCM1/ROCM0 - - -LONG -Formulas: -GPU utilization = 100*ROCP_GRBM_GUI_ACTIVE/ROCP_GRBM_COUNT --- -This group reassembles the 'GPUBusy' metric provided by RocProfiler. -We should add, that we can select the GPUBusy metric directly and the -calculations are done internally in case the metric formula changes. diff --git a/groups/amd_gpu_sdk/VALU.txt b/groups/amd_gpu_sdk/VALU.txt deleted file mode 100644 index 5d57b9b20..000000000 --- a/groups/amd_gpu_sdk/VALU.txt +++ /dev/null @@ -1,15 +0,0 @@ -SHORT VALU Instructions - -EVENTSET -ROCM0 ROCP_SQ_INSTS_VALU -ROCM1 ROCP_SQ_WAVES - -METRICS -GPU VALU insts per work-item ROCM0/ROCM1 - -LONG -Formulas: -GPU VALU insts per work-item = ROCP_SQ_INSTS_VALU/ROCP_SQ_WAVES --- -The average number of vector ALU instructions executed per work-item -(affected by flow control). 
diff --git a/groups/amd_gpu_sdk/WAVE.txt b/groups/amd_gpu_sdk/WAVE.txt deleted file mode 100644 index fe8914ae1..000000000 --- a/groups/amd_gpu_sdk/WAVE.txt +++ /dev/null @@ -1,15 +0,0 @@ -SHORT Wavefronts - -EVENTSET -ROCM0 ROCP_SQ_WAVES - - -METRICS -GPU wavefronts ROCM0 - - -LONG -Formulas: -GPU wavefronts = ROCP_SQ_WAVES --- -Total Wavefronts From 100fb54d4a75f2d0c474d2908f9c8df5bb719b0a Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 31 Oct 2024 17:56:12 +0100 Subject: [PATCH 16/29] Check error code when initializing ROCm topology --- src/cpustring.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/cpustring.c b/src/cpustring.c index 63ca736e8..19167aa7e 100644 --- a/src/cpustring.c +++ b/src/cpustring.c @@ -1036,10 +1036,15 @@ int gpustr_to_gpulist_rocm(const char* gpustr, int* gpulist, int length) { int insert = 0; - topology_rocm_init(); + int ret = topology_rocm_init(); + if (ret < 0) + { + return ret; + } RocmTopology_t gpu_topology = get_rocmTopology(); bstring bgpustr = bfromcstr(gpustr); struct bstrList* commalist = bsplit(bgpustr, ','); + bdestroy(bgpustr); for (int i = 0; i < commalist->qty; i++) { if (bstrchrp(commalist->entry[i], '-', 0) != BSTR_ERR) From a4eb7ee2e06ad8d892faaefb10a0664c653c6664 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 31 Oct 2024 17:56:55 +0100 Subject: [PATCH 17/29] Fix ERROR_PRINTS --- src/frequency_cpu.c | 24 ++++++++++++------------ src/frequency_uncore.c | 10 +++++----- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/frequency_cpu.c b/src/frequency_cpu.c index 30c28bd56..3dd0fe693 100644 --- a/src/frequency_cpu.c +++ b/src/frequency_cpu.c @@ -632,7 +632,7 @@ static int getAMDTurbo(const int cpu_id) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -641,7 +641,7 @@ static int getAMDTurbo(const int cpu_id) err = HPMaddThread(cpu_id); if (err != 0) { - 
ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -680,7 +680,7 @@ static int setAMDTurbo(const int cpu_id, const int turbo) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -689,7 +689,7 @@ static int setAMDTurbo(const int cpu_id, const int turbo) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -741,7 +741,7 @@ static int getIntelTurbo(const int cpu_id) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -750,7 +750,7 @@ static int getIntelTurbo(const int cpu_id) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -789,7 +789,7 @@ static int setIntelTurbo(const int cpu_id, const int turbo) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -798,7 +798,7 @@ static int setIntelTurbo(const int cpu_id, const int turbo) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -858,7 +858,7 @@ static int getIntelHWP(const int cpu_id) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -867,7 +867,7 @@ static int getIntelHWP(const int cpu_id) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -904,7 +904,7 @@ static int getBaseFreq(const int cpu_id) err = 
HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -913,7 +913,7 @@ static int getBaseFreq(const int cpu_id) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } diff --git a/src/frequency_uncore.c b/src/frequency_uncore.c index d4667fa74..5e5de03f6 100644 --- a/src/frequency_uncore.c +++ b/src/frequency_uncore.c @@ -198,7 +198,7 @@ int freq_setUncoreFreqMin(const int socket_id, const uint64_t freq) err = HPMaddThread(cpuId); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return 0; } @@ -267,7 +267,7 @@ uint64_t freq_getUncoreFreqMin(const int socket_id) err = HPMaddThread(cpuId); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return 0; } @@ -329,7 +329,7 @@ int freq_setUncoreFreqMax(const int socket_id, const uint64_t freq) err = HPMaddThread(cpuId); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return 0; } @@ -396,7 +396,7 @@ uint64_t freq_getUncoreFreqMax(const int socket_id) err = HPMaddThread(cpuId); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return 0; } @@ -454,7 +454,7 @@ uint64_t freq_getUncoreFreqCur(const int socket_id) err = HPMaddThread(cpuId); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return 0; } } From 3a782a49d8d606abda7582a734db7766983bdb6d Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 31 Oct 2024 18:00:32 +0100 Subject: [PATCH 18/29] Fix ERROR_PRINTS --- src/access_client.c | 4 ++-- src/access_x86_msr.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/access_client.c b/src/access_client.c 
index 9cb1b4715..5f558319d 100644 --- a/src/access_client.c +++ b/src/access_client.c @@ -255,7 +255,7 @@ access_client_startDaemon_bridge(int cpu_id, const char *bridge_path, struct soc io_count = send(socket_fd, (char*) &io_buf, sizeof(io_buf), 0); if (io_count != sizeof(io_buf)) { - ERROR_PRINT(Failed to send msg to the bridge socket) + ERROR_PRINT(Failed to send msg to the bridge socket); close(socket_fd); return -1; } @@ -263,7 +263,7 @@ access_client_startDaemon_bridge(int cpu_id, const char *bridge_path, struct soc io_count = recv(socket_fd, (char*) &io_buf, sizeof(io_buf), 0); if (io_count != sizeof(io_buf)) { - ERROR_PRINT(Failed to recv msg from the bridge socket) + ERROR_PRINT(Failed to recv msg from the bridge socket); close(socket_fd); return -1; } diff --git a/src/access_x86_msr.c b/src/access_x86_msr.c index d023b1082..aff42b940 100644 --- a/src/access_x86_msr.c +++ b/src/access_x86_msr.c @@ -122,7 +122,7 @@ access_x86_msr_init(const int cpu_id) fd = open(msr_file_name, O_RDWR); if (fd < 0) { - ERROR_PRINT(Cannot access MSR device file %s: %s.,msr_file_name , strerror(errno)) + ERROR_PRINT(Cannot access MSR device file %s: %s.,msr_file_name , strerror(errno)); ERROR_PLAIN_PRINT(Please check if 'msr' module is loaded and device files have correct permissions); ERROR_PLAIN_PRINT(Alternatively you might want to look into (sys)daemonmode); free(msr_file_name); From 0fff85fb67a114e226a610ad8648b9a54e8ba2a1 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 31 Oct 2024 18:01:04 +0100 Subject: [PATCH 19/29] Fix ERROR_PRINTS --- src/perfmon.c | 2 +- src/power.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/perfmon.c b/src/perfmon.c index 2322b50a5..129fcd71b 100644 --- a/src/perfmon.c +++ b/src/perfmon.c @@ -771,7 +771,7 @@ perfmon_check_counter_map(int cpu_id) HPMinit(); if (HPMaddThread(cpu_id) != 0) { - ERROR_PLAIN_PRINT(Cannot check counters without access to performance counters) + ERROR_PLAIN_PRINT(Cannot check counters 
without access to performance counters); return; } own_hpm = 1; diff --git a/src/power.c b/src/power.c index b223925fa..c8e37eda6 100644 --- a/src/power.c +++ b/src/power.c @@ -249,7 +249,7 @@ power_init(int cpuId) err = HPMaddThread(cpuId); if (err != 0) { - ERROR_PRINT(Cannot get access to RAPL counters) + ERROR_PRINT(Cannot get access to RAPL counters); return err; } } From df147a1d2cdec4d221ce0d97161fe8ecc51952a3 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 31 Oct 2024 18:01:20 +0100 Subject: [PATCH 20/29] Fix ERROR_PRINTS --- src/includes/error.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/includes/error.h b/src/includes/error.h index 739ab8cef..8982ff003 100644 --- a/src/includes/error.h +++ b/src/includes/error.h @@ -43,10 +43,10 @@ exit(EXIT_FAILURE) #define ERROR_PLAIN_PRINT(msg) \ - fprintf(stderr, "ERROR - [%s:%s:%d] " str(msg) "\n", __FILE__, __func__,__LINE__); + fprintf(stderr, "ERROR - [%s:%s:%d] " str(msg) "\n", __FILE__, __func__,__LINE__) #define ERROR_PRINT(fmt, ...) 
\ - fprintf(stderr, "ERROR - [%s:%s:%d] %s.\n" str(fmt) "\n", __FILE__, __func__,__LINE__, strerror(errno), ##__VA_ARGS__); + fprintf(stderr, "ERROR - [%s:%s:%d] %s.\n" str(fmt) "\n", __FILE__, __func__,__LINE__, strerror(errno), ##__VA_ARGS__) #define CHECK_ERROR(func, msg) \ if ((func) < 0) { \ @@ -65,6 +65,19 @@ exit(EXIT_FAILURE); \ } +#ifndef DEBUGLEV_ONLY_ERROR +#define DEBUGLEV_ONLY_ERROR 0 +#endif +#ifndef DEBUGLEV_INFO +#define DEBUGLEV_INFO 1 +#endif +#ifndef DEBUGLEV_DETAIL +#define DEBUGLEV_DETAIL 2 +#endif +#ifndef DEBUGLEV_DEVELOP +#define DEBUGLEV_DEVELOP 3 +#endif + #define VERBOSEPRINTREG(cpuid,reg,flags,msg) \ if (perfmon_verbosity >= DEBUGLEV_DETAIL) \ { \ From 3a54fe6b91782ba7eb58f332e344b4e9a5d7510d Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 31 Oct 2024 18:02:37 +0100 Subject: [PATCH 21/29] Guard debug levels with ifdefs --- src/includes/likwid.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/includes/likwid.h b/src/includes/likwid.h index 146a226ae..3a6a1608a 100644 --- a/src/includes/likwid.h +++ b/src/includes/likwid.h @@ -38,10 +38,18 @@ #include +#ifndef DEBUGLEV_ONLY_ERROR #define DEBUGLEV_ONLY_ERROR 0 +#endif +#ifndef DEBUGLEV_INFO #define DEBUGLEV_INFO 1 +#endif +#ifndef DEBUGLEV_DETAIL #define DEBUGLEV_DETAIL 2 +#endif +#ifndef DEBUGLEV_DEVELOP #define DEBUGLEV_DEVELOP 3 +#endif #define LIKWID_VERSION "VERSION.RELEASE.MINORVERSION" #define LIKWID_COMMIT GITCOMMIT From 0af128ed87bde5da7a00a0472dd3d39a3a258b08 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 31 Oct 2024 18:03:14 +0100 Subject: [PATCH 22/29] Rename groups again to amd_gpu --- groups/{amd_gpu_v1 => amd_gpu}/GDS.txt | 0 groups/{amd_gpu_v1 => amd_gpu}/MEM.txt | 0 groups/{amd_gpu_v1 => amd_gpu}/PCI.txt | 0 groups/{amd_gpu_v1 => amd_gpu}/POWER.txt | 0 groups/{amd_gpu_v1 => amd_gpu}/SALU.txt | 0 groups/{amd_gpu_v1 => amd_gpu}/SFETCH.txt | 0 groups/{amd_gpu_v1 => amd_gpu}/STALLED.txt | 0 groups/{amd_gpu_v1 => amd_gpu}/UTIL.txt | 0 
groups/{amd_gpu_v1 => amd_gpu}/VALU.txt | 0 groups/{amd_gpu_v1 => amd_gpu}/WAVE.txt | 0 10 files changed, 0 insertions(+), 0 deletions(-) rename groups/{amd_gpu_v1 => amd_gpu}/GDS.txt (100%) rename groups/{amd_gpu_v1 => amd_gpu}/MEM.txt (100%) rename groups/{amd_gpu_v1 => amd_gpu}/PCI.txt (100%) rename groups/{amd_gpu_v1 => amd_gpu}/POWER.txt (100%) rename groups/{amd_gpu_v1 => amd_gpu}/SALU.txt (100%) rename groups/{amd_gpu_v1 => amd_gpu}/SFETCH.txt (100%) rename groups/{amd_gpu_v1 => amd_gpu}/STALLED.txt (100%) rename groups/{amd_gpu_v1 => amd_gpu}/UTIL.txt (100%) rename groups/{amd_gpu_v1 => amd_gpu}/VALU.txt (100%) rename groups/{amd_gpu_v1 => amd_gpu}/WAVE.txt (100%) diff --git a/groups/amd_gpu_v1/GDS.txt b/groups/amd_gpu/GDS.txt similarity index 100% rename from groups/amd_gpu_v1/GDS.txt rename to groups/amd_gpu/GDS.txt diff --git a/groups/amd_gpu_v1/MEM.txt b/groups/amd_gpu/MEM.txt similarity index 100% rename from groups/amd_gpu_v1/MEM.txt rename to groups/amd_gpu/MEM.txt diff --git a/groups/amd_gpu_v1/PCI.txt b/groups/amd_gpu/PCI.txt similarity index 100% rename from groups/amd_gpu_v1/PCI.txt rename to groups/amd_gpu/PCI.txt diff --git a/groups/amd_gpu_v1/POWER.txt b/groups/amd_gpu/POWER.txt similarity index 100% rename from groups/amd_gpu_v1/POWER.txt rename to groups/amd_gpu/POWER.txt diff --git a/groups/amd_gpu_v1/SALU.txt b/groups/amd_gpu/SALU.txt similarity index 100% rename from groups/amd_gpu_v1/SALU.txt rename to groups/amd_gpu/SALU.txt diff --git a/groups/amd_gpu_v1/SFETCH.txt b/groups/amd_gpu/SFETCH.txt similarity index 100% rename from groups/amd_gpu_v1/SFETCH.txt rename to groups/amd_gpu/SFETCH.txt diff --git a/groups/amd_gpu_v1/STALLED.txt b/groups/amd_gpu/STALLED.txt similarity index 100% rename from groups/amd_gpu_v1/STALLED.txt rename to groups/amd_gpu/STALLED.txt diff --git a/groups/amd_gpu_v1/UTIL.txt b/groups/amd_gpu/UTIL.txt similarity index 100% rename from groups/amd_gpu_v1/UTIL.txt rename to groups/amd_gpu/UTIL.txt diff --git 
a/groups/amd_gpu_v1/VALU.txt b/groups/amd_gpu/VALU.txt similarity index 100% rename from groups/amd_gpu_v1/VALU.txt rename to groups/amd_gpu/VALU.txt diff --git a/groups/amd_gpu_v1/WAVE.txt b/groups/amd_gpu/WAVE.txt similarity index 100% rename from groups/amd_gpu_v1/WAVE.txt rename to groups/amd_gpu/WAVE.txt From edb835d499c046577272278f6d88b310eec0beb8 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 31 Oct 2024 18:24:55 +0100 Subject: [PATCH 23/29] Update Rocmon code --- make/config_checks.mk | 7 +- src/applications/likwid.lua | 4 +- src/includes/rocmon_sdk.h | 188 ++++++++++++++++++++++++------------ src/includes/rocmon_smi.h | 12 ++- src/includes/rocmon_v1.h | 18 +--- src/includes/types.h | 2 + src/rocmon.c | 24 +++-- src/rocmon_marker.c | 8 +- 8 files changed, 165 insertions(+), 98 deletions(-) diff --git a/make/config_checks.mk b/make/config_checks.mk index 949e38330..3d6825ef1 100644 --- a/make/config_checks.mk +++ b/make/config_checks.mk @@ -85,13 +85,10 @@ endif ifeq ($(strip $(ROCM_INTERFACE)), true) ROCM_SDK_CHECK := $(shell which rocprofv3 2>/dev/null | wc -l) -ifeq ($(strip $(ROCM_SDK_CHECK)),0) # HSA includes 'hsa/xxx.h' and rocprofiler 'xxx.h' DEFINES += -D__HIP_PLATFORM_AMD__ -INCLUDES += -I$(HIPINCLUDE) -I$(HSAINCLUDE) -I$(HSAINCLUDE)/hsa -I$(ROCPROFILERINCLUDE) -I$(RSMIINCLUDE) -else -$(info Compile for ROCm >= 6.2) +INCLUDES += -I$(HIPINCLUDE) -I$(HSAINCLUDE) -I$(HSAINCLUDE)/hsa -I$(RSMIINCLUDE) +ifeq ($(strip $(ROCM_SDK_CHECK)),1) DEFINES += -DLIKWID_ROCPROF_SDK -INCLUDES += -I$(ROCPROFILERINCLUDE) -I$(RSMIINCLUDE) endif endif diff --git a/src/applications/likwid.lua b/src/applications/likwid.lua index 638d32d40..0654932e2 100644 --- a/src/applications/likwid.lua +++ b/src/applications/likwid.lua @@ -1576,7 +1576,7 @@ end likwid.getMarkerResultsCuda = getMarkerResultsCuda local function getMarkerResultsRocm(filename, gpulist, nan2value) - local gputopo = likwid.getGpuTopology_rocm() + local gputopo = likwid.getRocmTopology() local ret = 
likwid.readMarkerFileRocm(filename) if ret < 0 then return nil, nil @@ -1627,7 +1627,7 @@ likwid.getMarkerResultsRocm = getMarkerResultsRocm local function printOutputRocm(results, metrics, gpulist, region, stats) local maxLineFields = 0 - local gputopo = likwid.getGpuTopology_rocm() + local gputopo = likwid.getRocmTopology() local regionName = likwid.markerRegionTagRocm(region) local regionGPUs = likwid.markerRegionGpusRocm(region) local cur_gpulist = gpulist diff --git a/src/includes/rocmon_sdk.h b/src/includes/rocmon_sdk.h index b2d32df87..187a04f63 100644 --- a/src/includes/rocmon_sdk.h +++ b/src/includes/rocmon_sdk.h @@ -113,6 +113,8 @@ DECLARE_ROCPROFILER_SDK(rocprofiler_destroy_buffer, (rocprofiler_buffer_id_t)); DECLARE_ROCPROFILER_SDK(rocprofiler_context_is_active, (rocprofiler_context_id_t, int*)); DECLARE_ROCPROFILER_SDK(rocprofiler_create_callback_thread, (rocprofiler_callback_thread_t*)); DECLARE_ROCPROFILER_SDK(rocprofiler_assign_callback_thread, (rocprofiler_buffer_id_t, rocprofiler_callback_thread_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_query_record_counter_id, (rocprofiler_counter_instance_id_t id, rocprofiler_counter_id_t* counter_id)); +DECLARE_ROCPROFILER_SDK(rocprofiler_is_initialized, (int*)) const char *rocprofiler_get_status_string(rocprofiler_status_t); const char * (*rocprofiler_get_status_string_ptr)(rocprofiler_status_t); @@ -124,6 +126,69 @@ DECLAREFUNC_HSA(hsa_init, ()); DECLAREFUNC_HSA(hsa_shut_down, ()); +static int +_rocmon_sdk_link_libraries() +{ + if (rocmon_sdk_dl_hsa_lib && rocmon_sdk_dl_profiler_lib) + { + return 0; + } + #define DLSYM_AND_CHECK( dllib, name ) name##_ptr = dlsym( dllib, #name ); if ( dlerror() != NULL ) { ERROR_PRINT(Failed to link #name); return -1; } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm SDK libraries); + dlerror(); + // Need to link in the ROCm HSA libraries + rocmon_sdk_dl_hsa_lib = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_sdk_dl_hsa_lib) + { + 
ERROR_PRINT(ROCm HSA library libhsa-runtime64.so not found: %s, dlerror()); + return -1; + } + + // Need to link in the Rocprofiler libraries + rocmon_sdk_dl_profiler_lib = dlopen("librocprofiler-sdk.so", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_sdk_dl_profiler_lib) + { + // Delete last error + dlerror(); + rocmon_sdk_dl_profiler_lib = dlopen("librocprofiler-sdk.so.1", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_sdk_dl_profiler_lib) + { + ERROR_PRINT(Rocprofiler library librocprofiler-sdk.so not found: %s, dlerror()); + return -1; + } + } + + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_context); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_get_status_string); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_buffer); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_query_available_agents); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_get_timestamp); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_start_context); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_stop_context); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_profile_config); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_destroy_profile_config); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_configure_agent_profile_counting_service); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_iterate_agent_supported_counters); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_flush_buffer); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_query_counter_info); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_sample_agent_profile_counting_service); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_force_configure); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_destroy_buffer); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_context_is_active); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_callback_thread); + 
DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_assign_callback_thread); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_query_record_counter_id); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_is_initialized); + + DLSYM_AND_CHECK(rocmon_sdk_dl_hsa_lib, hsa_init); + DLSYM_AND_CHECK(rocmon_sdk_dl_hsa_lib, hsa_shut_down); + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm SDK libraries done); + return 0; +} + + + typedef struct { rocprofiler_agent_t *agents; int num_agents; @@ -304,13 +369,13 @@ _rocmon_sdk_free_agent_counters(RocmonDevice *device) typedef struct { - rocprofiler_context_id_t* context; - rocprofiler_agent_t agent; - RocmonEventResultList* result; + RocmonContext** context; + rocprofiler_context_id_t devcontext; + int devid; } rocmon_sdk_read_buffers_cb; static void -_rocmon_sdk_read_buffers(rocprofiler_context_id_t context, +_rocmon_sdk_read_buffers(rocprofiler_context_id_t device_context, rocprofiler_buffer_id_t buffer, rocprofiler_record_header_t** headers, size_t num_headers, @@ -318,6 +383,8 @@ _rocmon_sdk_read_buffers(rocprofiler_context_id_t context, uint64_t) { rocmon_sdk_read_buffers_cb* cbdata = (rocmon_sdk_read_buffers_cb*)udata; + RocmonContext** stat_context = (RocmonContext**)udata; + RocmonContext* context = *stat_context; /* if (cbdata->result->numResults == 0)*/ /* {*/ @@ -331,6 +398,27 @@ _rocmon_sdk_read_buffers(rocprofiler_context_id_t context, { rocprofiler_record_counter_t* r = h->payload; printf("Counter ID %d Value %f Dispatch %ld\n", r->id, r->counter_value, r->dispatch_id); + rocprofiler_counter_id_t cid = {.handle = 0}; + (*rocprofiler_query_record_counter_id_ptr)(r->id, &cid); + for (int j = 0; j < context->numDevices; j++) + { + RocmonDevice *dev = &context->devices[j]; + if (dev->deviceId == cbdata->devid) + { + for (int k = 0; k < dev->numActiveRocEvents; k++) + { + rocprofiler_counter_info_v0_t* cinfo = &dev->sdk_activeRocEvents[k]; + if (cinfo->id.handle == cid.handle) + { + 
RocmonEventResultList* resultlist = &dev->groupResults[dev->activeGroup]; + resultlist->results[k].fullValue += r->counter_value; + resultlist->results[k].lastValue = resultlist->results[k].fullValue - resultlist->results[k].lastValue; + break; + } + } + break; + } + } } } @@ -373,18 +461,27 @@ tool_init(rocprofiler_client_finalize_t fini, void* udata) RocmonContext* context = *stat_context; ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Running tool_init); - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initialize HSA); - hsa_status_t hstat = (*hsa_init_ptr)(); - if (hstat != HSA_STATUS_SUCCESS) + // initialize libraries + if (_rocmon_sdk_link_libraries() < 0) { + ERROR_PLAIN_PRINT(Failed to initialize libraries); return -EFAULT; } +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initialize HSA);*/ +/* hsa_status_t hstat = (*hsa_init_ptr)();*/ +/* if (hstat != HSA_STATUS_SUCCESS)*/ +/* {*/ +/* ERROR_PRINT(Failed to initialize HSA);*/ +/* return -EFAULT;*/ +/* }*/ + //ROCPROFILER_CALL(rocprofiler_query_available_agents, (ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), &agent_count), return -EFAULT;); ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Querying available agents); stat = (*rocprofiler_query_available_agents_ptr)(ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), udata); if (stat != ROCPROFILER_STATUS_SUCCESS) { + ERROR_PRINT(Failed to query available agents); return -EFAULT; } if (context->numDevices == 0) @@ -409,7 +506,12 @@ tool_init(rocprofiler_client_finalize_t fini, void* udata) return -EFAULT; } ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating buffer for device %d, device->deviceId); - stat = (*rocprofiler_create_buffer_ptr)(device_context, 100, 50, ROCPROFILER_BUFFER_POLICY_LOSSLESS, _rocmon_sdk_read_buffers, udata, &buffer); + rocmon_sdk_read_buffers_cb devdata = { + .context = stat_context, + .devid = device->deviceId, + .devcontext = device_context + }; + stat = 
(*rocprofiler_create_buffer_ptr)(device_context, 100, 50, ROCPROFILER_BUFFER_POLICY_LOSSLESS, _rocmon_sdk_read_buffers, &devdata, &buffer); if (stat != ROCPROFILER_STATUS_SUCCESS) { errno = EFAULT; @@ -517,60 +619,6 @@ _rocmon_sdk_set_profile(rocprofiler_context_id_t context_id, -static int -_rocmon_sdk_link_libraries() -{ - #define DLSYM_AND_CHECK( dllib, name ) name##_ptr = dlsym( dllib, #name ); if ( dlerror() != NULL ) { ERROR_PRINT(Failed to link #name); return -1; } - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm SDK libraries); - dlerror(); - // Need to link in the ROCm HSA libraries - rocmon_sdk_dl_hsa_lib = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL); - if (!rocmon_sdk_dl_hsa_lib) - { - ERROR_PRINT(ROCm HSA library libhsa-runtime64.so not found: %s, dlerror()); - return -1; - } - - // Need to link in the Rocprofiler libraries - rocmon_sdk_dl_profiler_lib = dlopen("librocprofiler-sdk.so", RTLD_NOW | RTLD_GLOBAL); - if (!rocmon_sdk_dl_profiler_lib) - { - // Delete last error - dlerror(); - rocmon_sdk_dl_profiler_lib = dlopen("librocprofiler-sdk.so.1", RTLD_NOW | RTLD_GLOBAL); - if (!rocmon_sdk_dl_profiler_lib) - { - ERROR_PRINT(Rocprofiler library librocprofiler-sdk.so not found: %s, dlerror()); - return -1; - } - } - - DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_context); - DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_get_status_string); - DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_buffer); - DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_query_available_agents); - DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_get_timestamp); - DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_start_context); - DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_stop_context); - DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_profile_config); - DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_destroy_profile_config); - 
DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_configure_agent_profile_counting_service); - DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_iterate_agent_supported_counters); - DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_flush_buffer); - DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_query_counter_info); - DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_sample_agent_profile_counting_service); - DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_force_configure); - DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_destroy_buffer); - DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_context_is_active); - DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_callback_thread); - DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_assign_callback_thread); - - DLSYM_AND_CHECK(rocmon_sdk_dl_hsa_lib, hsa_init); - DLSYM_AND_CHECK(rocmon_sdk_dl_hsa_lib, hsa_shut_down); - - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm SDK libraries done); - return 0; -} rocprofiler_tool_configure_result_t* @@ -580,12 +628,22 @@ rocprofiler_configure(uint32_t version, rocprofiler_client_id_t* client_id) { client_id->name = "LIKWID"; + if (!rocmon_context) + { + rocmon_context = malloc(sizeof(RocmonContext)); + if (!rocmon_context) + { + return NULL; + } + memset(rocmon_context, 0, sizeof(RocmonContext)); + } static rocprofiler_tool_configure_result_t config_result = { .size = sizeof(rocprofiler_tool_configure_result_t), .initialize = tool_init, .finalize = tool_fini, .tool_data = &rocmon_context, }; + DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing Rocprofiler SDK); return &config_result; } @@ -600,6 +658,7 @@ rocmon_sdk_init(RocmonContext* context, int numGpus, const int* gpuIds) } if (rocmon_sdk_initialized) { + return 0; } @@ -607,20 +666,21 @@ rocmon_sdk_init(RocmonContext* context, int numGpus, const int* gpuIds) ret = _rocmon_sdk_link_libraries(); if (ret < 0) { - //ERROR_PLAIN_PRINT(Failed to initialize 
libraries); + ERROR_PLAIN_PRINT(Failed to initialize libraries); return ret; } stat = (*rocprofiler_force_configure_ptr)(rocprofiler_configure); if (stat != ROCPROFILER_STATUS_SUCCESS) { + ERROR_PLAIN_PRINT(Failed to configure rocprofiler); return -EFAULT; } if (context->numDevices == 0) { errno = ENODEV; - ERROR_PRINT(Cannot ROCm GPUs); + ERROR_PRINT(Cannot find any ROCm GPUs); return -ENODEV; } diff --git a/src/includes/rocmon_smi.h b/src/includes/rocmon_smi.h index 9c959a7fe..686d8e92a 100644 --- a/src/includes/rocmon_smi.h +++ b/src/includes/rocmon_smi.h @@ -1104,6 +1104,8 @@ int rocmon_smi_init(RocmonContext* context, int numGpus, const int* gpuIds) ERROR_PRINT(Failed to get SMI functions for device %d, device->deviceId); goto rocmon_init_rsmi_failed; } + device->activeSmiEvents = NULL; + device->smiMetrics = NULL; } rocmon_smi_initialized = TRUE; return 0; @@ -1132,19 +1134,27 @@ void rocmon_smi_finalize(RocmonContext* context) RocmonDevice* device = &context->devices[i]; if (device->activeSmiEvents) { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Freeing active SMI events for device %d, device->deviceId); free(device->activeSmiEvents); device->activeSmiEvents = NULL; device->numActiveSmiEvents = 0; } if (device->smiMetrics) { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Freeing SMI event list for device %d, device->deviceId); destroy_smap(device->smiMetrics); device->smiMetrics = NULL; } } } + if (context->smiEvents) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Freeing SMI event list); + destroy_smap(context->smiEvents); + context->smiEvents = NULL; + } } - + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown RSMI); RSMI_CALL(rsmi_shut_down, (), { ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown SMI); // fall through diff --git a/src/includes/rocmon_v1.h b/src/includes/rocmon_v1.h index 2ee73d335..bf0fe7e03 100644 --- a/src/includes/rocmon_v1.h +++ b/src/includes/rocmon_v1.h @@ -203,10 +203,10 @@ _rocmon_v1_iterate_info_callback_add(const rocprofiler_info_data_t info, void* d 
iterate_info_cb_arg* arg = (iterate_info_cb_arg*) data; //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _rocmon_iterate_info_callback_add); - if (likwid_rocmon_verbosity == DEBUGLEV_DEVELOP) - { - _rocmon_v1_print_rocprofiler_info_data(info); - } +/* if (likwid_rocmon_verbosity == DEBUGLEV_DEVELOP)*/ +/* {*/ +/* _rocmon_v1_print_rocprofiler_info_data(info);*/ +/* }*/ // Check info kind if (info.kind != ROCPROFILER_INFO_KIND_METRIC) { @@ -566,16 +566,6 @@ rocmon_v1_finalize(RocmonContext* context) FREE_IF_NOT_NULL(device->v1_rocMetrics); FREE_IF_NOT_NULL(device->v1_activeRocEvents); } - if (device->groupResults) - { - // Free events of event result lists - for (int j = 0; j < device->numGroupResults; j++) - { - FREE_IF_NOT_NULL(device->groupResults[i].results); - } - // Free list - free(device->groupResults); - } if (device->v1_context) { ROCM_CALL(rocprofiler_close, (device->v1_context),); diff --git a/src/includes/types.h b/src/includes/types.h index 1c45306e8..4d43ce887 100644 --- a/src/includes/types.h +++ b/src/includes/types.h @@ -87,4 +87,6 @@ typedef struct { #define ARRAY_COUNT(arr) (sizeof(arr) / sizeof((arr)[0])) +typedef int bool; + #endif /*TYPES_H*/ diff --git a/src/rocmon.c b/src/rocmon.c index 743b3b33c..cde753b78 100644 --- a/src/rocmon.c +++ b/src/rocmon.c @@ -97,29 +97,37 @@ rocmon_finalize(void) RocmonDevice* dev = &rocmon_context->devices[i]; if (dev->groupResults) { - if (dev->groupResults->results) + for (int j = 0; j < dev->numGroupResults; j++) { - free(dev->groupResults->results); - dev->groupResults->results = NULL; - dev->groupResults->numResults = 0; + RocmonEventResultList* l = &dev->groupResults[j]; + if (l->results) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Destroy group result %d for device %d, j, dev->deviceId); + free(l->results); + l->results = NULL; + l->numResults = 0; + } } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Destroy group results for device %d, dev->deviceId); free(dev->groupResults); dev->groupResults = NULL; } } + 
ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Destroy devices); free(rocmon_context->devices); rocmon_context->devices = NULL; rocmon_context->numDevices = 0; } if (rocmon_context->groups) { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Destroy groups); free(rocmon_context->groups); rocmon_context->groups = NULL; rocmon_context->numGroups = 0; rocmon_context->numActiveGroups = 0; rocmon_context->activeGroup = -1; } - + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Destroy context); free(rocmon_context); rocmon_context = NULL; @@ -137,10 +145,6 @@ rocmon_init(int numGpus, const int* gpuIds) { return 0; } - if (rocmon_context != NULL) - { - return -EEXIST; - } // Validate arguments if (numGpus <= 0) { @@ -314,6 +318,7 @@ rocmon_addEventSet(const char* eventString, int* gid) ERROR_PLAIN_PRINT(Cannot allocate event results); return -ENOMEM; } + memset(tmpResults, 0, numEvents * sizeof(RocmonEventResult)); // Allocate memory for new event result list entry RocmonEventResultList* tmpGroupResults = (RocmonEventResultList*) realloc(device->groupResults, (device->numGroupResults+1) * sizeof(RocmonEventResultList)); @@ -322,7 +327,6 @@ rocmon_addEventSet(const char* eventString, int* gid) ERROR_PLAIN_PRINT(Cannot allocate new event group result list); return -ENOMEM; } - device->groupResults = tmpGroupResults; device->groupResults[device->numGroupResults].results = tmpResults; device->groupResults[device->numGroupResults].numResults = numEvents; diff --git a/src/rocmon_marker.c b/src/rocmon_marker.c index dc7707022..976e3ce10 100644 --- a/src/rocmon_marker.c +++ b/src/rocmon_marker.c @@ -241,6 +241,7 @@ rocmon_markerInit(void) { return; } + printf("rocmon_markerInit\n"); // Get environment variables char* eventStr = getenv("LIKWID_ROCMON_EVENTS"); @@ -252,7 +253,7 @@ rocmon_markerInit(void) // Validate environment variables are set if ((eventStr == NULL) || (gpuStr == NULL) || (gpuFileStr == NULL)) { - fprintf(stderr, "Running without GPU Marker API. 
Activate GPU Marker API with -m, -G and -W on commandline.\n"); + fprintf(stderr, "Running without Rocmon Marker API. Activate Rocmon Marker API with -m, -I and -R on commandline.\n"); return; } if (verbosityStr != NULL) { @@ -309,7 +310,7 @@ rocmon_markerInit(void) ret = rocmon_init(num_gpus, gpu_ids); if (ret < 0) { - fprintf(stderr,"Error init Rocmon Marker API.\n"); + fprintf(stderr,"Error initializing Rocmon Marker API with %d\n", ret); free(gpu_ids); free(gpu_maps); free(gpu_groups); @@ -396,6 +397,7 @@ rocmon_markerClose(void) } else { + printf("Saving ROCMON MarkerAPI results to %s\n", markerfile); _rocmon_saveToFile(markerfile); } @@ -728,6 +730,7 @@ rocmon_readMarkerFile(const char* filename) fprintf(stderr, "Error opening file %s\n", filename); } ptr = fgets(buf, sizeof(buf), fp); + printf("# %s\n", buf); ret = sscanf(buf, "%d %d %d", &gpus, ®ions, &groups); if (ret != 3) { @@ -778,6 +781,7 @@ rocmon_readMarkerFile(const char* filename) } while (fgets(buf, sizeof(buf), fp)) { + printf("# %s\n", buf); if (strchr(buf,':')) { int regionid = 0, groupid = -1; From 647b60773a3bba50c7a803e51d18c9dffa77c4d1 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 31 Oct 2024 18:25:31 +0100 Subject: [PATCH 24/29] Always compile rocprofiler v1 support --- Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Makefile b/Makefile index 294a34e4f..06f92b1c9 100644 --- a/Makefile +++ b/Makefile @@ -154,8 +154,6 @@ OBJ := $(filter-out $(BUILD_DIR)/topology_rocm.o,$(OBJ)) else ifeq ($(strip $(ROCM_SDK_CHECK)),0) OBJ := $(filter-out $(BUILD_DIR)/rocmon_sdk.o,$(OBJ)) -else -OBJ := $(filter-out $(BUILD_DIR)/rocmon_v1.o,$(OBJ)) endif endif ifeq ($(COMPILER),GCCPOWER) From b60e2f6ed14e5f46ecc9d60c92213b195be77c8d Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 31 Oct 2024 18:25:48 +0100 Subject: [PATCH 25/29] Fix uninitialized variable warnings --- ext/GOTCHA/src/libc_wrappers.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git 
a/ext/GOTCHA/src/libc_wrappers.c b/ext/GOTCHA/src/libc_wrappers.c index 30608e1bb..cb5532d85 100644 --- a/ext/GOTCHA/src/libc_wrappers.c +++ b/ext/GOTCHA/src/libc_wrappers.c @@ -426,8 +426,8 @@ int gotcha_int_printf(int fd, const char *format, ...) { } if (*str == 'd' || *str == 'i') { - signed long val; - char numstr[64]; + signed long val = 0; + char numstr[64] = {'\0'}; if (char_width) val = (signed long)(signed char)va_arg(args, signed int); else if (short_width) @@ -444,8 +444,8 @@ int gotcha_int_printf(int fd, const char *format, ...) { add_to_buffer(numstr, fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, 1); } else if (*str == 'u') { - unsigned long val; - char numstr[64]; + unsigned long val = 0; + char numstr[64] = {'\0'}; if (char_width) val = (unsigned long)(unsigned char)va_arg(args, unsigned int); else if (short_width) @@ -462,8 +462,8 @@ int gotcha_int_printf(int fd, const char *format, ...) { add_to_buffer(numstr, fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, 1); } else if (*str == 'x' || *str == 'X' || *str == 'p') { - unsigned long val; - char numstr[64]; + unsigned long val = 0; + char numstr[64] = {'\0'}; if (*str != 'p') { if (char_width) val = (unsigned long)(unsigned char)va_arg(args, unsigned int); @@ -486,7 +486,7 @@ int gotcha_int_printf(int fd, const char *format, ...) { add_to_buffer(numstr, fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, 1); } else if (*str == 'c') { - char cbuf[2]; + char cbuf[2] = {'\0'}; cbuf[0] = (unsigned char)va_arg(args, unsigned int); cbuf[1] = '\0'; add_to_buffer(cbuf, fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, @@ -499,7 +499,7 @@ int gotcha_int_printf(int fd, const char *format, ...) { add_to_buffer("%", fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, 1); } else { - char s[3]; + char s[3] = {'\0'}; s[0] = '%'; s[1] = *str; s[2] = '\0'; @@ -517,7 +517,7 @@ int gotcha_int_printf(int fd, const char *format, ...) 
{ } void *gotcha_memset(void *s, int c, size_t n) { - size_t i; + size_t i = 0; unsigned char byte = (unsigned char)c; for (i = 0; i < n; i++) { ((unsigned char *)s)[i] = byte; From 5a1de25b717a9b0bae4c725b4c6c06df74a1295d Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 31 Oct 2024 18:27:18 +0100 Subject: [PATCH 26/29] Update build config file --- config.mk | 1 - 1 file changed, 1 deletion(-) diff --git a/config.mk b/config.mk index 76b473c4a..69c1a88ae 100644 --- a/config.mk +++ b/config.mk @@ -197,6 +197,5 @@ BUILDAPPDAEMON=false # to be in the LD_LIBRARY_PATH to dynamically load the libraries. # Include directory for ROCm headers HSAINCLUDE = $(ROCM_HOME)/include -ROCPROFILERINCLUDE = $(ROCM_HOME)/include/rocprofiler HIPINCLUDE = $(ROCM_HOME)/include RSMIINCLUDE = $(ROCM_HOME)/include From d3e1eb829e1dbeb1af8da7602cde0fc7ed2a94f6 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Sun, 10 Nov 2024 17:24:40 +0100 Subject: [PATCH 27/29] Update code to work again but only v1 and smi, sdk still fails to init --- src/includes/rocmon_sdk.h | 156 ++++++++++++++++++++++++++++++-------- src/includes/rocmon_smi.h | 9 ++- src/includes/rocmon_v1.h | 2 +- src/rocmon.c | 83 ++++++++++---------- 4 files changed, 171 insertions(+), 79 deletions(-) diff --git a/src/includes/rocmon_sdk.h b/src/includes/rocmon_sdk.h index 76e3b42f0..9aa6820f6 100644 --- a/src/includes/rocmon_sdk.h +++ b/src/includes/rocmon_sdk.h @@ -179,6 +179,7 @@ _rocmon_sdk_link_libraries() DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_assign_callback_thread); DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_query_record_counter_id); DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_is_initialized); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_get_status_string); DLSYM_AND_CHECK(rocmon_sdk_dl_hsa_lib, hsa_init); DLSYM_AND_CHECK(rocmon_sdk_dl_hsa_lib, hsa_shut_down); @@ -452,32 +453,12 @@ _rocmon_sdk_read_buffers(rocprofiler_context_id_t device_context, } -int 
-tool_init(rocprofiler_client_finalize_t fini, void* udata) +static int _rocmon_sdk_create_devices(RocmonContext** stat_context) { rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; - RocmonContext** stat_context = (RocmonContext**)udata; RocmonContext* context = *stat_context; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Running tool_init); - - // initialize libraries - if (_rocmon_sdk_link_libraries() < 0) - { - ERROR_PLAIN_PRINT(Failed to initialize libraries); - return -EFAULT; - } - -/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initialize HSA);*/ -/* hsa_status_t hstat = (*hsa_init_ptr)();*/ -/* if (hstat != HSA_STATUS_SUCCESS)*/ -/* {*/ -/* ERROR_PRINT(Failed to initialize HSA);*/ -/* return -EFAULT;*/ -/* }*/ - - //ROCPROFILER_CALL(rocprofiler_query_available_agents, (ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), &agent_count), return -EFAULT;); ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Querying available agents); - stat = (*rocprofiler_query_available_agents_ptr)(ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), udata); + stat = (*rocprofiler_query_available_agents_ptr)(ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), stat_context); if (stat != ROCPROFILER_STATUS_SUCCESS) { ERROR_PRINT(Failed to query available agents); @@ -488,7 +469,6 @@ tool_init(rocprofiler_client_finalize_t fini, void* udata) FREE_IF_NOT_NULL(context->devices); return -1; } - for (int i = 0; i < context->numDevices; i++) { rocprofiler_context_id_t device_context; @@ -544,6 +524,99 @@ tool_init(rocprofiler_client_finalize_t fini, void* udata) return 0; } +int +tool_init(rocprofiler_client_finalize_t fini, void* udata) +{ + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + RocmonContext** stat_context = (RocmonContext**)udata; + RocmonContext* context = *stat_context; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Running tool_init); + + // initialize libraries + if 
(_rocmon_sdk_link_libraries() < 0) + { + ERROR_PLAIN_PRINT(Failed to initialize libraries); + return -EFAULT; + } + return _rocmon_sdk_create_devices(stat_context); + +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initialize HSA);*/ +/* hsa_status_t hstat = (*hsa_init_ptr)();*/ +/* if (hstat != HSA_STATUS_SUCCESS)*/ +/* {*/ +/* ERROR_PRINT(Failed to initialize HSA);*/ +/* return -EFAULT;*/ +/* }*/ + + //ROCPROFILER_CALL(rocprofiler_query_available_agents, (ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), &agent_count), return -EFAULT;); +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Querying available agents);*/ +/* stat = (*rocprofiler_query_available_agents_ptr)(ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), udata);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* ERROR_PRINT(Failed to query available agents);*/ +/* return -EFAULT;*/ +/* }*/ +/* if (context->numDevices == 0)*/ +/* {*/ +/* FREE_IF_NOT_NULL(context->devices);*/ +/* return -1;*/ +/* }*/ + +/* for (int i = 0; i < context->numDevices; i++)*/ +/* {*/ +/* rocprofiler_context_id_t device_context;*/ +/* rocprofiler_buffer_id_t buffer;*/ +/* rocprofiler_callback_thread_t thread;*/ +/* RocmonDevice* device = &context->devices[i];*/ +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating context for device %d, device->deviceId);*/ +/* stat = (*rocprofiler_create_context_ptr)(&device_context);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* errno = EFAULT;*/ +/* ERROR_PRINT(Failed to create context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* FREE_IF_NOT_NULL(context->devices);*/ +/* return -EFAULT;*/ +/* }*/ +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating buffer for device %d, device->deviceId);*/ +/* rocmon_sdk_read_buffers_cb devdata = {*/ +/* .context = stat_context,*/ +/* .devid = device->deviceId,*/ +/* .devcontext = device_context*/ +/* };*/ +/* stat = 
(*rocprofiler_create_buffer_ptr)(device_context, 100, 50, ROCPROFILER_BUFFER_POLICY_LOSSLESS, _rocmon_sdk_read_buffers, &devdata, &buffer);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* errno = EFAULT;*/ +/* ERROR_PRINT(Failed to create buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* FREE_IF_NOT_NULL(context->devices);*/ +/* return -EFAULT;*/ +/* }*/ +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating callback thread for device %d, device->deviceId);*/ +/* stat = (*rocprofiler_create_callback_thread_ptr)(&thread);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* errno = EFAULT;*/ +/* ERROR_PRINT(Failed to create callback thread for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* FREE_IF_NOT_NULL(context->devices);*/ +/* return -EFAULT;*/ +/* }*/ +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Assign callback thread to buffer for device %d, device->deviceId);*/ +/* stat = (*rocprofiler_assign_callback_thread_ptr)(buffer, thread);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* errno = EFAULT;*/ +/* ERROR_PRINT(Failed to create callback thread for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* FREE_IF_NOT_NULL(context->devices);*/ +/* return -EFAULT;*/ +/* }*/ +/* */ +/* device->sdk_context = device_context;*/ +/* device->buffer = buffer;*/ +/* device->thread = thread;*/ +/* }*/ +/* return 0;*/ +} + void tool_fini(void* udata) @@ -648,6 +721,7 @@ int rocmon_sdk_init(RocmonContext* context, int numGpus, const int* gpuIds) { int ret = 0; + rocprofiler_context_id_t text_context; rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; if ((numGpus < 0) || (!gpuIds) || (!context)) { @@ -666,11 +740,23 @@ rocmon_sdk_init(RocmonContext* context, int numGpus, const int* gpuIds) return ret; } - stat = (*rocprofiler_force_configure_ptr)(rocprofiler_configure); - if (stat != ROCPROFILER_STATUS_SUCCESS) +/* stat = 
(*rocprofiler_force_configure_ptr)(rocprofiler_configure);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* ERROR_PRINT(Failed to configure rocprofiler: %s, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* return -EFAULT;*/ +/* }*/ +/* stat = (*rocprofiler_create_context_ptr)(&text_context);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* ERROR_PRINT(Failed to create test context: %s, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* return -EFAULT;*/ +/* }*/ + ret = _rocmon_sdk_create_devices(&rocmon_context); + if (ret < 0) { - ERROR_PLAIN_PRINT(Failed to configure rocprofiler); - return -EFAULT; + ERROR_PRINT(Failed to create SDK devices); + return ret; } if (context->numDevices == 0) @@ -1085,12 +1171,16 @@ _rocmon_readCounters_rocprofiler_sdk(RocmonDevice* device) return -EFAULT; } } -/* stat = (*rocprofiler_flush_buffer_ptr)(device->buffer);*/ -/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ -/* {*/ -/* ERROR_PRINT(Failed to flush buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/ -/* return -EFAULT;*/ -/* }*/ + else + { + ERROR_PRINT(Device context for device %d not active, device->deviceId); + } + stat = (*rocprofiler_flush_buffer_ptr)(device->buffer); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to flush buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + return -EFAULT; + } return 0; } diff --git a/src/includes/rocmon_smi.h b/src/includes/rocmon_smi.h index bfc9ce156..d40990a64 100644 --- a/src/includes/rocmon_smi.h +++ b/src/includes/rocmon_smi.h @@ -932,6 +932,7 @@ rocmon_smi_startCounters(RocmonContext* context) for (int i = 0; i < context->numDevices; i++) { RocmonDevice* device = &context->devices[i]; + fprintf(stderr, "Device %d with %d SMI events\n", device->deviceId, device->numActiveSmiEvents); // Check if there are any counters to start if (device->numActiveSmiEvents <= 0) { @@ -940,11 +941,11 @@ 
rocmon_smi_startCounters(RocmonContext* context) // Save baseline values RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveSmiEvents; i++) + for (int j = 0; j < device->numActiveSmiEvents; j++) { double value = 0; - RocmonSmiEvent* event = &device->activeSmiEvents[i]; - RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; + RocmonSmiEvent* event = &device->activeSmiEvents[j]; + RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+j]; // Measure counter if (event->measureFunc) @@ -1156,7 +1157,7 @@ void rocmon_smi_finalize(RocmonContext* context) } ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown RSMI); RSMI_CALL(rsmi_shut_down, (), { - ERROR_PRINT(DEBUGLEV_DEVELOP, Shutdown SMI failed); + ERROR_PRINT(Shutdown SMI failed); // fall through }); rocmon_smi_initialized = FALSE; diff --git a/src/includes/rocmon_v1.h b/src/includes/rocmon_v1.h index 2cae677dd..3fe05b0c7 100644 --- a/src/includes/rocmon_v1.h +++ b/src/includes/rocmon_v1.h @@ -575,7 +575,7 @@ rocmon_v1_finalize(RocmonContext* context) } ROCM_CALL(hsa_shut_down, (), { - ERROR_PRINT(DEBUGLEV_DEVELOP, Shutdown HSA failed); + ERROR_PRINT(Shutdown HSA failed); // fall through }); } diff --git a/src/rocmon.c b/src/rocmon.c index cde753b78..f767b29dc 100644 --- a/src/rocmon.c +++ b/src/rocmon.c @@ -372,23 +372,8 @@ rocmon_setupCounters(int gid) // // Separate rocprofiler and SMI events // - const char **smiEvents = NULL, **rocEvents = NULL; int numSmiEvents = 0, numRocEvents = 0; - // Allocate memory for string arrays - smiEvents = (const char**) malloc(group->nevents * sizeof(const char*)); - if (smiEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate smiEvent name array); - return -ENOMEM; - } - rocEvents = (const char**) malloc(group->nevents * sizeof(const char*)); - if (rocEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate rocEvent name array); - free(smiEvents); - return 
-ENOMEM; - } // Go through each event and sort it for (int i = 0; i < group->nevents; i++) @@ -397,13 +382,11 @@ rocmon_setupCounters(int gid) if (strncmp(name, "RSMI_", 5) == 0) { // RSMI event - smiEvents[numSmiEvents] = name + 5; // +5 removes 'RSMI_' prefix numSmiEvents++; } else if (strncmp(name, "ROCP_", 5) == 0) { // Rocprofiler event - rocEvents[numRocEvents] = name + 5; // +5 removes 'ROCP_' prefix numRocEvents++; } else @@ -414,48 +397,66 @@ rocmon_setupCounters(int gid) } } - // Add events to each device for (int i = 0; i < rocmon_context->numDevices; i++) { RocmonDevice* device = &rocmon_context->devices[i]; + device->numActiveSmiEvents = 0; + device->numActiveRocEvents = 0; + } - // Add rocprofiler events - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP ROCPROFILER WITH %d events, numRocEvents); - if (rocmon_context->use_rocprofiler_v1) - { - ret = rocmon_v1_setupCounters(rocmon_context, gid); - } + // Add rocprofiler events + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP ROCPROFILER WITH %d events, numRocEvents); + if (rocmon_context->use_rocprofiler_v1) + { + ret = rocmon_v1_setupCounters(rocmon_context, gid); + } #ifdef LIKWID_ROCPROF_SDK - else - { - ret = rocmon_sdk_setupCounters(rocmon_context, gid); - } + else + { + ret = rocmon_sdk_setupCounters(rocmon_context, gid); + } #endif - if (ret < 0) - { - ERROR_PRINT(Setting up rocprofiler counters failed); - free(smiEvents); - free(rocEvents); - return ret; - } + if (ret < 0) + { + ERROR_PRINT(Setting up rocprofiler counters failed); +/* free(smiEvents);*/ +/* free(rocEvents);*/ + return ret; + } - // Add SMI events + // Add SMI events + if (numSmiEvents > 0) + { ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP ROCM SMI WITH %d events, numSmiEvents); ret = rocmon_smi_setupCounters(rocmon_context, gid); if (ret < 0) { ERROR_PRINT(Setting up SMI counters failed); - free(smiEvents); - free(rocEvents); +/* free(smiEvents);*/ +/* free(rocEvents);*/ return ret; } + } + else + { + for (int i = 0; i < 
rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + device->numActiveSmiEvents = 0; + } + } + + // Add events to each device + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; device->activeGroup = gid; } rocmon_context->activeGroup = gid; rocmon_context->state = ROCMON_STATE_SETUP; - // Cleanup - free(smiEvents); - free(rocEvents); +/* // Cleanup*/ +/* free(smiEvents);*/ +/* free(rocEvents);*/ return 0; } From 1f4f9e184905ecdaef75ea0675d3ed7d4429d88d Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Sun, 10 Nov 2024 23:08:11 +0100 Subject: [PATCH 28/29] Use typedef for bool only if not C99+ --- src/includes/types.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/includes/types.h b/src/includes/types.h index 4d43ce887..802caad44 100644 --- a/src/includes/types.h +++ b/src/includes/types.h @@ -87,6 +87,11 @@ typedef struct { #define ARRAY_COUNT(arr) (sizeof(arr) / sizeof((arr)[0])) + +#if __STDC_VERSION__ <= 199901L typedef int bool; +#else +#include +#endif #endif /*TYPES_H*/ From 6277c48a500ef3c4bef830d25cd14c5cc3a8451d Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Sun, 10 Nov 2024 23:12:28 +0100 Subject: [PATCH 29/29] Remove typedef for bool --- src/includes/types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/includes/types.h b/src/includes/types.h index 802caad44..1461fcb14 100644 --- a/src/includes/types.h +++ b/src/includes/types.h @@ -88,10 +88,10 @@ typedef struct { #define ARRAY_COUNT(arr) (sizeof(arr) / sizeof((arr)[0])) -#if __STDC_VERSION__ <= 199901L +/*#if __STDC_VERSION__ <= 199901L typedef int bool; #else #include -#endif +#endif*/ #endif /*TYPES_H*/