From 86f21f364fcef1a0c52a8b6dbb4910d59800c222 Mon Sep 17 00:00:00 2001 From: Jonathan Bernhard Date: Tue, 3 Dec 2024 17:41:02 +0100 Subject: [PATCH 1/8] Fix LIKWID_VERSION and LIKWID_COMMIT macros in likwid.h not being set correctly during installation --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a34d37234..bb108cb2b 100644 --- a/Makefile +++ b/Makefile @@ -617,7 +617,7 @@ install: install_daemon install_freq install_appdaemon install_container_helper @mkdir -p $(PREFIX)/include @chmod 755 $(PREFIX)/include @install -m 644 $(SRC_DIR)/includes/likwid.h $(PREFIX)/include/ - @sed -i -e "s##$(VERSION)#g" -e "s##$(DATE)#g" -e "s##$(GITCOMMIT)#g" -e "s##$(MINOR)#g" $(PREFIX)/include/likwid.h + @sed -i -e "s##$(VERSION)#g" -e "s##$(DATE)#g" -e "s##$(GITCOMMIT)#g" -e "s##$(MINOR)#g" -e "s#VERSION.RELEASE.MINORVERSION#$(VERSION).$(RELEASE).$(MINOR)#g" -e "s#LIKWID_COMMIT GITCOMMIT#LIKWID_COMMIT \"$(GITCOMMIT)\"#g" $(PREFIX)/include/likwid.h @install -m 644 $(SRC_DIR)/includes/likwid-marker.h $(PREFIX)/include/ $(FORTRAN_INSTALL) @echo "===> INSTALL groups to $(PREFIX)/share/likwid/perfgroups" From 1a76091bfb6e2e882af642646575df21ef468eef Mon Sep 17 00:00:00 2001 From: Michael Panzlaff Date: Wed, 4 Dec 2024 16:26:49 +0100 Subject: [PATCH 2/8] Add temperature reading for AMD to sysfeatures --- src/includes/sysFeatures_amd_thermal.h | 8 + src/sysFeatures_amd.c | 4 + src/sysFeatures_amd_thermal.c | 421 +++++++++++++++++++++++++ 3 files changed, 433 insertions(+) create mode 100644 src/includes/sysFeatures_amd_thermal.h create mode 100644 src/sysFeatures_amd_thermal.c diff --git a/src/includes/sysFeatures_amd_thermal.h b/src/includes/sysFeatures_amd_thermal.h new file mode 100644 index 000000000..783b8521b --- /dev/null +++ b/src/includes/sysFeatures_amd_thermal.h @@ -0,0 +1,8 @@ +#ifndef SYSFEATURES_AMD_THERMAL_H +#define SYSFEATURES_AMD_THERMAL_H + +#include + +extern const _SysFeatureList
likwid_sysft_amd_k10_cpu_thermal_feature_list; + +#endif //SYSFEATURES_AMD_THERMAL_H diff --git a/src/sysFeatures_amd.c b/src/sysFeatures_amd.c index bf9176b45..ea13b678f 100644 --- a/src/sysFeatures_amd.c +++ b/src/sysFeatures_amd.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -282,6 +283,7 @@ static const _SysFeatureList* amd_k17_cpu_feature_inputs[] = { &amd_k17_cpu_speculation_feature_list, //&amd_k19_cpu_l1dflush_feature_list, &amd_k17_cpu_hwconfig_feature_list, + &likwid_sysft_amd_k10_cpu_thermal_feature_list, NULL, }; @@ -289,6 +291,7 @@ static const _SysFeatureList* amd_k19_zen3_cpu_feature_inputs[] = { &amd_k19_cpu_prefetch_feature_list, &amd_k19_cpu_speculation_feature_list, &amd_k17_cpu_hwconfig_feature_list, + &likwid_sysft_amd_k10_cpu_thermal_feature_list, NULL, }; @@ -297,6 +300,7 @@ static const _SysFeatureList* amd_k19_zen4_cpu_feature_inputs[] = { &amd_k19_cpu_speculation_feature_list, &amd_k19_cpu_l1dflush_feature_list, &amd_k17_cpu_hwconfig_feature_list, + &likwid_sysft_amd_k10_cpu_thermal_feature_list, NULL, }; diff --git a/src/sysFeatures_amd_thermal.c b/src/sysFeatures_amd_thermal.c new file mode 100644 index 000000000..1739eb8d2 --- /dev/null +++ b/src/sysFeatures_amd_thermal.c @@ -0,0 +1,421 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +struct sysfs_ccd { + /* e.g. /sys/bus/pci/drivers/k10temp/0000:12:3.4/hwmon/hwmon5/temp3_input */ + bstring temp_path; + /* e.g. Tccd8 */ + bstring label; +}; + +struct sysfs_socket { + /* 'socket' may be slightly misleading, as we actually refer to a PCI device. + * However, we will later associate each socket with one PCI device. */ + struct sysfs_ccd *ccds; + size_t count; + /* e.g. /sys/bus/pci/drivers/k10temp/0000:12:3.4 */ + bstring pci_path; + /* e.g. /sys/bus/pci/drivers/k10temp/0000:12:3.4/hwmon/hwmon5 */ + bstring hwmon_path; + /* e.g.
/sys/bus/pci/drivers/k10temp/0000:12:3.4/hwmon/hwmon5/temp1_input */ + bstring temp_path; + /* e.g. Tctl */ + bstring label; +}; + +static struct sysfs_info { + struct sysfs_socket *sockets; + size_t count; + bool init; +} info; + +__attribute__((destructor)) static void free_paths(void) +{ + for (size_t i = 0; i < info.count; i++) + { + struct sysfs_socket *socket = &info.sockets[i]; + + for (size_t j = 0; j < socket->count; j++) + { + struct sysfs_ccd *ccd = &socket->ccds[j]; + + bdestroy(ccd->temp_path); + bdestroy(ccd->label); + } + + free(socket->ccds); + bdestroy(socket->pci_path); + bdestroy(socket->hwmon_path); + bdestroy(socket->temp_path); + bdestroy(socket->label); + } + + free(info.sockets); + memset(&info, 0, sizeof(info)); +} + +static int read_sysfs_file(const char *path, char *buf, size_t size) +{ + if (size == 0 || !buf || !path) + return -EINVAL; + + FILE *file = fopen(path, "r"); + if (!file) + return -errno; + const size_t len = fread(buf, sizeof(buf[0]), size - 1, file); + buf[len] = '\0'; + fclose(file); + return 0; +} + +static int ccd_sort(const void *a, const void *b) +{ + const struct sysfs_ccd *ca = a; + const struct sysfs_ccd *cb = b; + + const char *sa = bdata(ca->label); + const char *sb = bdata(cb->label); + + static const size_t digit_pos = sizeof("Tccd") - 1; + assert(strncmp(sa, "Tccd", digit_pos) == 0); + assert(strncmp(sb, "Tccd", digit_pos) == 0); + + const int ia = atoi(&sa[digit_pos]); + const int ib = atoi(&sb[digit_pos]); + if (ia < ib) + return -1; + if (ia > ib) + return 1; + return 0; +} + +static int socket_sort(const void *a, const void *b) +{ + const struct sysfs_socket *sa = a; + const struct sysfs_socket *sb = b; + return bstrcmp(sa->pci_path, sb->pci_path); +} + +static int create_paths(void) +{ + if (info.init) + return 0; + + /* Enumerate PCI devices associated with k10temp driver.
*/ + const char *k10temp_base = "/sys/bus/pci/drivers/k10temp"; + DIR *k10temp_dir = opendir(k10temp_base); + if (!k10temp_dir) + { + DEBUG_PRINT(DEBUGLEV_DEVELOP, %s not found. Not initializing k10temp, k10temp_base); + return -errno; + } + + struct dirent *pcidevice_file; + while (errno = 0, (pcidevice_file = readdir(k10temp_dir))) + { + char d_name_tokenized[sizeof(pcidevice_file->d_name)]; + snprintf(d_name_tokenized, sizeof(d_name_tokenized), "%s", pcidevice_file->d_name); // <-- manual strlcpy + // + /* Read all entries in the k10temp directory. + * Find all entries, which look like a PCI address in order to determine + * which devices are associated with the k10temp driver. + * A PCI address looks like 0000:00:00.0 */ + char *saveptr = NULL; + const char *domain = strtok_r(d_name_tokenized, ":", &saveptr); + const char *bus = strtok_r(NULL, ":", &saveptr); + const char *dev = strtok_r(NULL, ".", &saveptr); + const char *func = strtok_r(NULL, "", &saveptr); + + /* In case not all tokens are in the file name, just continue to next file. */ + if (!domain || !bus || !dev || !func) + continue; + + void *new_sockets = realloc(info.sockets, (info.count + 1) * sizeof(info.sockets[0])); + if (!new_sockets) + break; + + info.sockets = new_sockets; + struct sysfs_socket *s = &info.sockets[info.count]; + info.count += 1; + + memset(s, 0, sizeof(*s)); + s->pci_path = bformat("%s/%s", k10temp_base, pcidevice_file->d_name); + if (!s->pci_path) + { + errno = ENOMEM; + break; + } + } + + if (errno != 0) + { + const int errno_save = errno; + closedir(k10temp_dir); + free_paths(); + return -errno_save; + } + + closedir(k10temp_dir); + + /* Sort enumerated PCI devices in ascending order. + * + * IMPORTANT: This is an attempt to hopefully match the device with + * the lowest device ID to socket 0, and so on. + * There is no guarantee this is actually correct, but we otherwise do not + * have the ability to know which socket the temperature PCI device belongs to.
*/ + qsort(info.sockets, info.count, sizeof(info.sockets[0]), socket_sort); + + /* Populate hwmon_path for each socket. */ + for (size_t socket_id = 0; socket_id < info.count; socket_id++) + { + /* Determine hwmon path */ + struct sysfs_socket *s = &info.sockets[socket_id]; + + char hwmon_base[PATH_MAX]; + snprintf(hwmon_base, sizeof(hwmon_base), "%s/hwmon", bdata(s->pci_path)); + DIR *hwmon_base_dir = opendir(hwmon_base); + if (!hwmon_base_dir) + { + const int errno_save = errno; + DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, k10temp: Unable to read dir %s, hwmon_base); + free_paths(); + return -errno_save; + } + + struct dirent *hwmon_candidate_dir; + while (errno = 0, (hwmon_candidate_dir = readdir(hwmon_base_dir))) + { + /* only allow hwmon subdirectories */ + if (strncmp(hwmon_candidate_dir->d_name, "hwmon", 5) != 0) + continue; + + /* Build the full candidate path; note that it is not yet checked to be a directory. */ + char hwmon_candidate_path[PATH_MAX]; + snprintf( + hwmon_candidate_path, + sizeof(hwmon_candidate_path), + "%s/%s", + hwmon_base, + hwmon_candidate_dir->d_name + ); + + s->hwmon_path = bformat("%s/%s", hwmon_base, hwmon_candidate_dir->d_name); + if (!s->hwmon_path) + errno = ENOMEM; + break; + } + + if (errno != 0 || !s->hwmon_path) + { + const int errno_save = errno != 0 ? errno : ENODEV; + closedir(hwmon_base_dir); + free_paths(); + return -errno_save; + } + + closedir(hwmon_base_dir); + + /* Crawl hwmon dir for tempX_input and tempX_label and populate arrays accordingly. */ + DIR *hwmon_dir = opendir(bdata(s->hwmon_path)); + if (!hwmon_dir) + { + const int errno_save = errno; + DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, k10temp: Unable to read dir %s, bdata(s->hwmon_path)); + free_paths(); + return -errno_save; + } + + struct dirent *temp_dirent; + while (errno = 0, (temp_dirent = readdir(hwmon_dir))) + { + /* check if file name is of form 'temp\d+_label'.
*/ + if (strncmp(temp_dirent->d_name, "temp", strlen("temp")) != 0) + continue; + + const char *numstart = &temp_dirent->d_name[strlen("temp")]; + char *numend; + errno = 0; + unsigned long no = strtoul(numstart, &numend, 10); + if (numstart == numend || errno != 0) + continue; + + if (strcmp(numend, "_label") != 0) + continue; + + /* Okay, now we have made sure our file name is of the right format. + * Next, we read the label name from the file to decide if it's a CCD + * temperature or a Tctl temperature. */ + + /* temporarily store path to e.g. /sys/...../temp3_label */ + char label_path[PATH_MAX]; + snprintf(label_path, sizeof(label_path), "%s/%s", bdata(s->hwmon_path), temp_dirent->d_name); + + /* read e.g. temp3_label to string and store it. */ + char label_string[64]; + int err = read_sysfs_file(label_path, label_string, sizeof(label_string)); + if (err < 0) + { + errno = -err; + break; + } + + /* temporarily store path to e.g. /sys/...../temp3_input */ + char temp_path[PATH_MAX]; + snprintf(temp_path, sizeof(temp_path), "%s/temp%lu_input", bdata(s->hwmon_path), no); + + /* We now have to differentiate between CCD temperature sensors and CTL temperature + * sensors. The CCD temperatures are stored in the 'ccds' array, while we should + * hopefully only find a single Tctl temperature. The latter one will be stored + * only once per socket. */ + if (strncmp(label_string, "Tccd", strlen("Tccd")) == 0) + { + void *new_ccds = realloc(s->ccds, (s->count + 1) * sizeof(s->ccds[0])); + if (!new_ccds) + break; + + s->ccds = new_ccds; + struct sysfs_ccd *ccd = &s->ccds[s->count]; + s->count += 1; + + memset(ccd, 0, sizeof(*ccd)); + ccd->temp_path = bfromcstr(temp_path); + ccd->label = bfromcstr(label_string); + if (!ccd->temp_path || !ccd->label) + { + errno = ENOMEM; + break; + } + } else { + if (s->label) + { + /* If s->label is already set, we have encountered more than one non-CCD temperature.
+ * We only support one sensor per socket, so issue a warning but continue regardless. */ + DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Found more than one non-Tccd. current=%s new=%s, bdata(s->label), label_string); + bdestroy(s->label); + bdestroy(s->temp_path); + } + + s->label = bfromcstr(label_string); + s->temp_path = bfromcstr(temp_path); + if (!s->label || !s->temp_path) + { + errno = ENOMEM; + break; + } + } + } + + if (errno != 0) + { + const int errno_save = errno; + closedir(hwmon_dir); + free_paths(); + return -errno_save; + } + + closedir(hwmon_dir); + + /* Fail if no socket sensor has been found. CCD sensors are not mandatory, since + * some CPUs do not have any. Drop the partial state so a later call starts from scratch. */ + if (!s->label) + { + free_paths(); + return -ENODEV; + } + + /* Last, we have to sort each CCD entry according to their label. */ + qsort(s->ccds, s->count, sizeof(s->ccds[0]), ccd_sort); + } + + info.init = true; + return 0; +} + +static int temp_getter(const char *file, char **value) +{ + /* read temperature */ + char temp_data[64]; + int err = read_sysfs_file(file, temp_data, sizeof(temp_data)); + if (err < 0) + return err; + + /* parse to value */ + char *endptr; + errno = 0; + long temp = strtol(temp_data, &endptr, 10); + if (temp_data == endptr) + return -EIO; + if (errno != 0) + return -errno; + + return likwid_sysft_double_to_string((double)temp / 1000.0, value); +} + +static int amd_thermal_temperature_ccd_getter(LikwidDevice_t device, char **value) +{ + int err = create_paths(); + if (err < 0) + return err; + + err = topology_init(); + if (err < 0) + return err; + + /* determine CCD to read from */ + CpuTopology_t topo = get_cpuTopology(); + + const uint32_t local_die_id = device->id.simple.id % (topo->numDies / topo->numSockets); + const uint32_t socket_id = device->id.simple.id / (topo->numDies / topo->numSockets); + + if (socket_id >= info.count) + return -EINVAL; + + if (local_die_id >= info.sockets[socket_id].count) + return 
-EINVAL; + + return
temp_getter(bdata(info.sockets[socket_id].ccds[local_die_id].temp_path), value); +} + +static int amd_thermal_temperature_ctl_getter(LikwidDevice_t device, char **value) +{ + int err = create_paths(); + if (err < 0) + return err; + + if (device->id.simple.id >= info.count) + return -EINVAL; + + return temp_getter(bdata(info.sockets[device->id.simple.id].temp_path), value); +} + +static int amd_thermal_tester(void) +{ + int err = create_paths(); + if (err < 0) + return 0; + + /* We need at least one socket in order to detect the thermal sensor as valid. */ + return info.count > 0; +} + +static _SysFeature amd_thermal_features[] = { + {"ccd_temp", "thermal", "Current CPU CCD temperature", amd_thermal_temperature_ccd_getter, NULL, DEVICE_TYPE_DIE, NULL, "degrees C"}, + {"ctl_temp", "thermal", "Current CPU CTL temperature (cooling temperature)", amd_thermal_temperature_ctl_getter, NULL, DEVICE_TYPE_SOCKET, NULL, "degrees C"}, +}; + +const _SysFeatureList likwid_sysft_amd_k10_cpu_thermal_feature_list = { + .num_features = ARRAY_COUNT(amd_thermal_features), + .tester = amd_thermal_tester, + .features = amd_thermal_features, +}; From c98e60c36bee23aa7a0f26c5613ff114c426a896 Mon Sep 17 00:00:00 2001 From: Michael Panzlaff Date: Wed, 4 Dec 2024 15:28:23 +0000 Subject: [PATCH 3/8] Rename AMD thermal sysfeatures to be more in line with Intel --- src/sysFeatures_amd_thermal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sysFeatures_amd_thermal.c b/src/sysFeatures_amd_thermal.c index 1739eb8d2..9459e3c86 100644 --- a/src/sysFeatures_amd_thermal.c +++ b/src/sysFeatures_amd_thermal.c @@ -410,8 +410,8 @@ static int amd_thermal_tester(void) } static _SysFeature amd_thermal_features[] = { - {"ccd_temp", "thermal", "Current CPU CCD temperature", amd_thermal_temperature_ccd_getter, NULL, DEVICE_TYPE_DIE, NULL, "degrees C"}, - {"ctl_temp", "thermal", "Current CPU CTL temperature (cooling temperature)", amd_thermal_temperature_ctl_getter, NULL,
DEVICE_TYPE_SOCKET, NULL, "degrees C"}, + {"ccd_temp", "thermal", "Current CPU CCD temperature (Tccd)", amd_thermal_temperature_ccd_getter, NULL, DEVICE_TYPE_DIE, NULL, "degrees C"}, + {"pkg_temp", "thermal", "Current CPU socket temperature (Tctl)", amd_thermal_temperature_ctl_getter, NULL, DEVICE_TYPE_SOCKET, NULL, "degrees C"}, }; const _SysFeatureList likwid_sysft_amd_k10_cpu_thermal_feature_list = { From cf782c8bb2779a2d5479a4d04ce39a857f0adcd5 Mon Sep 17 00:00:00 2001 From: Michael Panzlaff Date: Wed, 4 Dec 2024 16:15:30 +0000 Subject: [PATCH 4/8] Fix nvml init error when libnvidia-ml.so is not found This fixes sysfeatures, where likwid was compiled with Nvidia support, but is run on a system without Nvidia runtime libraries or GPU. --- src/sysFeatures.c | 5 +---- src/sysFeatures_nvml.c | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/sysFeatures.c b/src/sysFeatures.c index a237eb909..b4c5dd196 100644 --- a/src/sysFeatures.c +++ b/src/sysFeatures.c @@ -186,10 +186,7 @@ int likwid_sysft_init(void) #ifdef LIKWID_WITH_NVMON err = likwid_sysft_init_nvml(&_feature_list); if (err < 0) - { - ERROR_PRINT(Failed to initialize SysFeatures nvml module); - return err; - } + DEBUG_PRINT(DEBUGLEV_INFO, Failed to initialize SysFeatures nvml module); #endif DEBUG_PRINT(DEBUGLEV_DEVELOP, Initialized %d features, _feature_list.num_features); diff --git a/src/sysFeatures_nvml.c b/src/sysFeatures_nvml.c index f1745c895..c5d88ba42 100644 --- a/src/sysFeatures_nvml.c +++ b/src/sysFeatures_nvml.c @@ -147,7 +147,7 @@ int likwid_sysft_init_nvml(_SysFeatureList *list) if (!dl_nvml) { - ERROR_PRINT(dlopen(libnvidia-ml.so) failed: %s, dlerror()); + DEBUG_PRINT(DEBUGLEV_INFO, dlopen(libnvidia-ml.so) failed: %s, dlerror()); return -ELIBACC; } From 598d5293130ea5f02cb6b0a39e63e10e1037e31c Mon Sep 17 00:00:00 2001 From: Michael Panzlaff Date: Wed, 4 Dec 2024 17:24:01 +0100 Subject: [PATCH 5/8] Remove surplus line in sysFeatures_amd_thermal ---
src/sysFeatures_amd_thermal.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sysFeatures_amd_thermal.c b/src/sysFeatures_amd_thermal.c index 9459e3c86..933ab7dd2 100644 --- a/src/sysFeatures_amd_thermal.c +++ b/src/sysFeatures_amd_thermal.c @@ -126,7 +126,6 @@ static int create_paths(void) { char d_name_tokenized[sizeof(pcidevice_file->d_name)]; snprintf(d_name_tokenized, sizeof(d_name_tokenized), "%s", pcidevice_file->d_name); // <-- manual strlcpy - // /* Read all entries in the k10temp directory. * Find all entries, which look like a PCI address in order to determine * which devices are associated with the k10temp driver. From 5567045e48cc1c6789b3d48611229f105c103c54 Mon Sep 17 00:00:00 2001 From: Michael Panzlaff Date: Mon, 9 Dec 2024 10:51:18 +0100 Subject: [PATCH 6/8] Remove old commented out code in likwid-sysfeatures --- src/applications/likwid-sysfeatures.lua | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/applications/likwid-sysfeatures.lua b/src/applications/likwid-sysfeatures.lua index 46df20d6d..87bada0f1 100644 --- a/src/applications/likwid-sysfeatures.lua +++ b/src/applications/likwid-sysfeatures.lua @@ -184,12 +184,6 @@ if #devList == 0 then end end ---[[local access_mode = likwid.getAccessClientMode() -if access_mode < 0 or access_mode > 1 then - print_stderr("Manipulation of HW features only for access mode 'direct' or 'accessdaemon'") - os.exit(1) -end]] - -- set verbosity if verbose > 0 and verbose <= 3 then likwid.setVerbosity(verbose) From ff192f73f93eb778d0f5a11d50a76e5875a8861d Mon Sep 17 00:00:00 2001 From: Michael Panzlaff Date: Mon, 9 Dec 2024 13:49:20 +0000 Subject: [PATCH 7/8] Add likwid-sysfeatures --saveall and --loadall --- src/applications/likwid-sysfeatures.lua | 124 ++++++++++++++++++++---- src/applications/likwid.lua | 1 + src/luawid.c | 29 +++++- 3 files changed, 133 insertions(+), 21 deletions(-) diff --git a/src/applications/likwid-sysfeatures.lua b/src/applications/likwid-sysfeatures.lua index
87bada0f1..ff426d922 100644 --- a/src/applications/likwid-sysfeatures.lua +++ b/src/applications/likwid-sysfeatures.lua @@ -61,6 +61,8 @@ local function usage() print_stdout("-s, --set \t Set feature(s) to the given value") print_stdout(" \t format: .= or just = if unique") print_stdout(" \t can be a comma-separated list of features") + print_stdout(" --saveall \t Save all available rw-features to file") + print_stdout(" --loadall \t Load all features from file") print_stdout("-O\t\t\t Output results in CSV") print_stdout("-V, --verbose \t Set verbosity\n") end @@ -121,7 +123,7 @@ local verbose = 0 local output_csv = false -- parse the command line -for opt,arg in likwid.getopt(arg, {"h","v","l","p","d:","g:","s:","a", "O","help","version","list", "print", "set:", "get:","all", "cpus:", "V:", "verbose:"}) do +for opt,arg in likwid.getopt(arg, {"h","v","l","p","d:","g:","s:","a", "O","help","version","list", "print", "set:", "get:","all", "cpus:", "V:", "verbose:", "saveall:", "loadall:"}) do if (type(arg) == "string") then local s,e = arg:find("-"); if s == 1 then @@ -152,6 +154,10 @@ for opt,arg in likwid.getopt(arg, {"h","v","l","p","d:","g:","s:","a", "O","help getList = likwid.stringsplit(arg, ",") elseif opt == "s" or opt == "set" then setList = likwid.stringsplit(arg, ",") + elseif opt == "saveall" then + saveFeatures = arg + elseif opt == "loadall" then + loadFeatures = arg elseif opt == "?"
then print_stderr("Invalid commandline option -"..arg) os.exit(1) @@ -163,12 +169,12 @@ end -- validate command line input -if (not printDevices) and (not listFeatures) and (not allFeatures) and (#getList == 0) and (#setList == 0) then +if (not printDevices) and (not listFeatures) and (not allFeatures) and (not saveFeatures) and (not loadFeatures) and (#getList == 0) and (#setList == 0) then print_stderr("No operations specified, exiting...") os.exit(1) end -if (printDevices or listFeatures or allFeatures) and (#getList > 0 or #setList > 0) then - print_stderr("Cannot list features and get/set at the same time") +if (printDevices or listFeatures or allFeatures or saveFeatures or loadFeatures) and (#getList > 0 or #setList > 0) then + print_stderr("Cannot list features and get/set/load/save at the same time") os.exit(1) end if #devList == 0 then @@ -200,20 +206,21 @@ end local ft_list = likwid.sysFeatures_list() -- print available devices +device_types = {} +device_types[likwid.hwthread] = "HWThread (T)" +device_types[likwid.core] = "Core (C)" +device_types[likwid.numa] = "NUMA (M)" +device_types[likwid.die] = "Die (D)" +device_types[likwid.socket] = "Socket (S)" +device_types[likwid.node] = "Node (N)" +if likwid.nvSupported() then + device_types[likwid.nvidia_gpu] = "Nvidia GPU (GN)" +end +if likwid.rocmSupported() then + device_types[likwid.amd_gpu] = "AMD GPU (GA)" +end + if printDevices then - device_types = {} - device_types[likwid.hwthread] = "HWThread (T)" - device_types[likwid.core] = "Core (C)" - device_types[likwid.numa] = "NUMA (M)" - device_types[likwid.die] = "Die (D)" - device_types[likwid.socket] = "Socket (S)" - device_types[likwid.node] = "Node (N)" - if likwid.nvSupported() then - device_types[likwid.nvidia_gpu] = "Nvidia GPU (GN)" - end - if likwid.rocmSupported() then - device_types[likwid.amd_gpu] = "AMD GPU (GA)" - end for devtype, name in pairs(device_types) do print(string.format("%s:", name)) devices = likwid.getAvailableDevices(devtype) @@
-384,5 +391,88 @@ if #setList > 0 and #devList > 0 then os.exit(0) end +-- save all read/write features to file +if saveFeatures then + local file = assert(io.open(saveFeatures, "w")) + -- iterate over all device types + for devtype, _ in pairs(device_types) do + -- iterate over all features + for _,f in pairs(ft_list) do + local full_name = string.format("%s.%s", f.Category, f.Name) + -- only allow matching device types and if feature is readable and writable + if f.TypeID ~= devtype or f.ReadOnly or f.WriteOnly then + goto next_feat + end + + -- actually read the features + for _, dev in pairs(likwid.getAllDevices(devtype)) do + local lw_dev = likwid.createDevice(devtype, dev) + local v, err = likwid.sysFeatures_get(full_name, lw_dev) + if err then + print_stderr(string.format("Failed to get feature '%s' on device %s:%d: %s", full_name, lw_dev:typeName(), lw_dev:id(), err)) + goto next_feat + end + file:write(string.format("%s.%s@%s=%s\n", f.Category, f.Name, lw_dev:id(), v)) + end + + ::next_feat:: + end + end + + file:close() + + likwid.finalizeSysFeatures() + os.exit(0) +end + +-- load all features from file +if loadFeatures then + for line in io.lines(loadFeatures) do + -- split string like the following: cpu_freq.governor@5=schedutil + local part1 = likwid.stringsplit(line, "=") + if #part1 ~= 2 then + print_stderr("Invalid line: " .. line) + os.exit(1) + end + local part2 = likwid.stringsplit(part1[1], "@") + if #part2 ~= 2 then + print_stderr("Invalid line: " .. line) + os.exit(1) + end + local part3 = likwid.stringsplit(part2[1], ".") + if #part3 ~= 2 then + print_stderr("Invalid line: " ..
line) + os.exit(1) + end + local feat_cat = part3[1] + local feat_name = part3[2] + local dev_id = part2[2] + local value = part1[2] + + local full_name = string.format("%s.%s", feat_cat, feat_name) + + -- get device type of this particular feature + local devtype = nil + for _, f in pairs(ft_list) do + if f.Name == feat_name and f.Category == feat_cat then + devtype = f.TypeID + break + end + end + if not devtype then + print_stderr(string.format("Unknown feature: '%s'", full_name)) + os.exit(1) + end + local lw_dev = likwid.createDevice(devtype, dev_id) + local success, err = likwid.sysFeatures_set(full_name, lw_dev, value) + if not success then + print_stderr(string.format("Failed to set feature '%s' on device %s:%d to %s: %s", full_name, lw_dev:typeName(), lw_dev:id(), value, err)) + end + end + + likwid.finalizeSysFeatures() + os.exit(0) +end + likwid.finalizeSysFeatures() os.exit(0) diff --git a/src/applications/likwid.lua b/src/applications/likwid.lua index b777646ca..db6dd6aaf 100644 --- a/src/applications/likwid.lua +++ b/src/applications/likwid.lua @@ -189,6 +189,7 @@ likwid.finalizeSysFeatures = likwid_finalizeSysFeatures likwid.createDevice = likwid_createDevice likwid.createDevicesFromString = likwid_createDevicesFromString likwid.getAvailableDevices = likwid_getAvailableDevices +likwid.getAllDevices = likwid_getAllDevices likwid.getCudaTopology = likwid_getCudaTopology likwid.putCudaTopology = likwid_putCudaTopology diff --git a/src/luawid.c b/src/luawid.c index 58d28a1f5..73f1a3a5a 100644 --- a/src/luawid.c +++ b/src/luawid.c @@ -3828,14 +3828,24 @@ static int lua_likwid_destroyDevice(lua_State *L) return 0; } -static int lua_likwid_getAvailableDevices(lua_State *L) +static int lua_likwid_getDevices(lua_State *L, bool all) { const LikwidDeviceType type = luaL_checknumber(L, 1); char **id_list = NULL; size_t id_list_len = 0; - int err = likwid_device_get_available(type, &id_list, &id_list_len); - if (err < 0) - luaL_error(L,
"likwid_device_get_available failed: %s", strerror(-err)); + int err; + if (all) + { + err = likwid_device_get_all(type, &id_list, &id_list_len); + if (err < 0) + luaL_error(L, "likwid_device_get_all failed: %s", strerror(-err)); + } + else + { + err = likwid_device_get_available(type, &id_list, &id_list_len); + if (err < 0) + luaL_error(L, "likwid_device_get_available failed: %s", strerror(-err)); + } lua_newtable(L); @@ -3850,6 +3860,16 @@ static int lua_likwid_getAvailableDevices(lua_State *L) return 1; } +static int lua_likwid_getAvailableDevices(lua_State *L) +{ + return lua_likwid_getDevices(L, false); +} + +static int lua_likwid_getAllDevices(lua_State *L) +{ + return lua_likwid_getDevices(L, true); +} + static int lua_likwiddevice_get_typeId(lua_State *L) { const LikwidDevice_t dev = *(LikwidDevice_t *)luaL_checkudata(L, 1, "LikwidDevice_t"); @@ -4249,6 +4269,7 @@ int __attribute__((visibility("default"))) luaopen_liblikwid(lua_State *L) { lua_register(L, "likwid_createDevicesFromString",lua_likwid_createDevicesFromString); lua_register(L, "likwid_createDevice",lua_likwid_createDevice); lua_register(L, "likwid_getAvailableDevices",lua_likwid_getAvailableDevices); + lua_register(L, "likwid_getAllDevices",lua_likwid_getAllDevices); #endif /* LIKWID_WITH_SYSFEATURES */ #ifdef __MIC__ setuid(0); From 786052e451f7bf098be8aa623c4b76830ec8f872 Mon Sep 17 00:00:00 2001 From: Michael Panzlaff Date: Mon, 9 Dec 2024 14:56:32 +0100 Subject: [PATCH 8/8] Add better explanation for --saveall --loadall --- src/applications/likwid-sysfeatures.lua | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/applications/likwid-sysfeatures.lua b/src/applications/likwid-sysfeatures.lua index ff426d922..c51d8a575 100644 --- a/src/applications/likwid-sysfeatures.lua +++ b/src/applications/likwid-sysfeatures.lua @@ -61,8 +61,8 @@ local function usage() print_stdout("-s, --set \t Set feature(s) to the given value") print_stdout(" \t format: .= or just = if unique")
print_stdout(" \t can be a comma-separated list of features") - print_stdout(" --saveall \t Save all available rw-features to file") - print_stdout(" --loadall \t Load all features from file") + print_stdout(" --saveall \t Save all available rw-features to file") + print_stdout(" --loadall \t Load all features from file") print_stdout("-O\t\t\t Output results in CSV") print_stdout("-V, --verbose \t Set verbosity\n") end