From 310234ab7c5eac11bbeae73e888ca45acb106d3a Mon Sep 17 00:00:00 2001 From: Ron Lieberman Date: Fri, 1 Mar 2024 13:12:44 -0600 Subject: [PATCH] SWDEV-445751 [MI300] gpurun : Observing Memory access fault in OpenMP Change-Id: I7021b339a682f2bedb56c38d0bc5e23aeae19a56 --- utils/bin/gpurun | 280 ++++++++++++++++++++++++++++------------------- 1 file changed, 169 insertions(+), 111 deletions(-) diff --git a/utils/bin/gpurun b/utils/bin/gpurun index e9b433f..b051209 100755 --- a/utils/bin/gpurun +++ b/utils/bin/gpurun @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright(C) 2021 Advanced Micro Devices, Inc. All rights reserved. +# Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -39,14 +39,11 @@ # mpirun -np 55 -hostfile $_host_file gpurun $_appbin $_appargs # # TODO: -# - Limit HSA queues with GPU_MAX_HW_QUEUES when multiple ranks per GPU -# - Warn if HSA_CU_MASK is preset and use that instead of calculated MASK. -# Note that rocminfo does not recognize ROCR_VISIBLE_DEVICES or HSA_CU_MASK. # - Add support for cuda. -# - Add support for other mpi launchers besides openmpi's mpirun. # - If gpurun becomes popular, convert this script to a program # +# PROGVERSION string is updated by cmake when component is installed PROGVERSION=X.Y-Z function version(){ echo $0 version $PROGVERSION @@ -55,20 +52,24 @@ function version(){ function verbosity() { export GPURUN_VERBOSE=0 } +function verbose2() { + export GPURUN_VERBOSE=2 +} function usage(){ /bin/cat 2>&1 <<"EOF" gpurun: Application process launch utility for GPUs. - This utility launches an application binary with the - Linux 'taskset' utility to limit the application process to - execute only on CPU cores in the same NUMA domain as - the selected GPU to improve efficiency of memory transfers. + This utility ensures the process will run on a single GPU. 
+ It launches the application binary with the 'taskset' utility + so the process only runs on CPU cores in the same NUMA domain + as the selected GPU. This utility sets environment variable ROCR_VISIBLE_DEVICES - if the number of visible GPUs is greater than 1. It sets - it to limit the application binary to a single GPU. - This utility sets environment variable HSA_CU_MASK - to control which CUs are available to the process only - when more than one OpenMPI rank will utilize the same GPU + to the selected GPU ONLY if it was not already set by the + callers environment AND the number of GPUs is not 1. + This utility also sets environment variable HSA_CU_MASK + to control which CUs are available to the process. + HSA_CU_MASK is set only when more than one OpenMPI process + (rank) will utilize the same GPU. Lastly, it sets env variable OMPX_TARGET_TEAM_PROCS to the number of CUs available to the process after masking. @@ -81,30 +82,31 @@ function usage(){ -s suppress output, often useful in benchmarking --version Print version of gpurun and exit - Generated (output) Environment Variables: - OMPX_TARGET_TEAM_PROCS - Number of CUs available to process - ROCR_VISIBLE_DEVICES - The logical device number for selected device - HSA_CU_MASK - The CU mask for the device. 
- - Optional input environment variables: - GPURUN_VERBOSE= + Optional Input environment variables: + GPURUN_VERBOSE "" (or unset) print 1 line trace to stdout, format: RANK: D: PCI: NN: 0: for silent operation, nothing is added to stdout 1: prints trace and other diagnostics to stdout 2: prints trace, other diagnostics, and taskset command ROCMINFO_BINARY Set location of rocminfo binary - AOMP location of AOMP or ROCM - - Input environment variables set by OpenMP mpirun - OMPI_COMM_WORLD_LOCAL_SIZE Number of ranks on this node - OMPI_COMM_WORLD_LOCAL_RANK The local rank number 0-(nranks-1) - This script also checks for MPI_LOCALNRANKS/MPI_LOCALRANKID + AOMP: location of AOMP or ROCM + GPURUN_DEVICE_BIAS: amount to shift device number to avoid dev 0 + ROCR_VISIBLE_DEVICES: See description above + OMPI_COMM_WORLD_LOCAL_SIZE Number of ranks on this node set by openmpi + OMPI_COMM_WORLD_LOCAL_RANK The local rank number 0-(nranks-1) from openmpi + This also checks for MPI_LOCALNRANKS/MPI_LOCALRANKID and MPI_COMM_WORLD_LOCAL_SIZE/MPI_COMM_WORLD_LOCAL_RANK + Generated (output) Environment Variables: + OMPX_TARGET_TEAM_PROCS - Number of CUs available to process + ROCR_VISIBLE_DEVICES - The logical device number for selected device + Not changed if it was preset. + HSA_CU_MASK - The CU mask for the device. + Limitations: - This utility assigns no more than one GPU to the application process. - That is, the OpenMP API omp_get_num_devices() will always return 1. + Therefore, the OpenMP API omp_get_num_devices() will always return 1. - Currently, gpurun creates masks that are mutually exclusive of each other. That is, the MPI processes will not share CUs. If number of ranks is not perfectly divisible by number of CUs or number of GPUs, some resources @@ -133,7 +135,7 @@ function usage(){ GPU is available for the process and OMPX_TARGET_TEAM_PROCS is set to the total number of CUs on the GPU. - Copyright (c) 2022 ADVANCED MICRO DEVICES, INC. 
+ Copyright (c) 2024 ADVANCED MICRO DEVICES, INC. EOF exit 0 @@ -144,6 +146,7 @@ EOF [ "$1" == "-version" ] && version [ "$1" == "-h" ] && usage [ "$1" == "-s" ] && verbosity && shift +[ "$1" == "-v" ] && verbose2 && shift [ "$1" == "-help" ] && usage [ "$1" == "--help" ] && usage @@ -195,35 +198,62 @@ if [ ! -f $ROCMINFO_BINARY ] ; then exit 1 fi -# Find number of GPUs and number of CUs per GPU +# Scan amdgpu devs and store info (bdfid, cpus, and numanode) in 3 arrays +# indexed by _device_num. This is cleaner than parsing rocminfo bdfid. +# Eventually we want to get rid of all rocminfo parsing. +_sysdevdir="/sys/bus/pci/devices" +_scanned_num_devices=0 +_cpulist=() +_long_bdfid=() +_numanode=() +for _devid in `ls $_sysdevdir` ; do + if [ -f $_sysdevdir/$_devid/device ] ; then + _driver_name=`cat $_sysdevdir/$_devid/uevent | grep DRIVER | awk '{print $1}'` + if [ ! -z $_driver_name ] ; then + if [ $_driver_name == "DRIVER=amdgpu" ] ; then + _numa_node=`cat $_sysdevdir/$_devid/numa_node` + [ "$_numa_node" == "-1" ] && _numa_node=0 + _this_cpulist=`cat $_sysdevdir/$_devid/local_cpulist` + _long_bdfid+=( $_devid ) + _numanode+=( $_numa_node ) + _cpulist+=( $_this_cpulist ) + _scanned_num_devices=$(( $_scanned_num_devices + 1 )) + fi + fi + fi +done + +if [[ $_scanned_num_devices -lt 1 ]] ; then + >&2 echo "ERROR: No amdgpu devices found in $_sysdevdir" + >&2 echo " num_devices=$_scanned_num_devices" + exit 1 +fi + +# Use rocminfo to find the number of CUs per GPU _available_CUs_per_device=0 -_available_devices=0 -_bdfids=() _tfile="/tmp/rinfo_out$$" -$ROCMINFO_BINARY | grep -E "Compute Unit:| Device Type:|BDFID:" >$_tfile +$ROCMINFO_BINARY | grep -E "Compute Unit:| Device Type:" >$_tfile while read _linepair ; do - last_cu_count=`echo $_linepair | cut -d":" -f2` + _fieldvalue=`echo $_linepair | cut -d":" -f2` _fieldtype=`echo $_linepair | cut -d":" -f1` - if [ $last_cu_count == "CPU" ] ; then + if [ $_fieldvalue == "CPU" ] ; then _last_device_type_was_gpu=0 - elif 
[ $last_cu_count == "GPU" ] ; then + elif [ $_fieldvalue == "GPU" ] ; then _last_device_type_was_gpu=1 else + # else the _fieldvalue was the number of CUs or GCPUs if [[ $_last_device_type_was_gpu == 1 ]] ; then - if [ "$_fieldtype" == "BDFID" ] ; then - bdfids+=( $last_cu_count ) - else - _available_devices=$(( $_available_devices + 1 )) - if [[ $_available_CUs_per_device == 0 ]] ; then - _available_CUs_per_device=$last_cu_count - else - if [[ $_available_CUs_per_device != $last_cu_count ]] ; then - >&2 echo "ERROR: Defective node! The cu_count for each GPU must be identical" - >&2 echo " Last CU count : $last_cu_count" - >&2 echo " Previous CU count : $_available_CUs_per_device" - >&2 echo " Number of GPUs : $_available_devices" - exit 1 - fi + if [[ $_available_CUs_per_device == 0 ]] ; then + # set number of CUs from the first GPU agent + _available_CUs_per_device=$_fieldvalue + else + # ensure all subsequent GPU agents have same number of CUs + if [[ $_available_CUs_per_device != $_fieldvalue ]] ; then + >&2 echo "ERROR: Defective node! 
The cu_count for each GPU must be identical" + >&2 echo " Last CU count : $_fieldvalue" + >&2 echo " First CU count : $_available_CUs_per_device" + >&2 echo " Scanned Number of GPUs : $_scanned_num_devices" + exit 1 fi fi fi @@ -231,22 +261,41 @@ while read _linepair ; do done < $_tfile rm $_tfile -if [[ $_available_devices -lt 1 ]] ; then - >&2 echo "ERROR: Local rank $_local_rank_num found no GPUS available" - >&2 echo " available_devices=$_available_devices" - exit 1 +if [[ -z $ROCR_VISIBLE_DEVICES ]] ; then + _num_devices=$_scanned_num_devices +else + if [[ "$ROCR_VISIBLE_DEVICES" =~ .*",".* ]] ; then + >&2 echo "ERROR: preset ROCR_VISIBLE_DEVICES '$ROCR_VISIBLE_DEVICES' exposes more than one GPU" + >&2 echo " gpurun assumes each process will get a single GPU" + >&2 echo " use a single index between 0 and $(( $_scanned_num_devices - 1 ))" + exit 1 + fi + if [[ "$ROCR_VISIBLE_DEVICES" =~ .*"-".* ]] ; then + >&2 echo "ERROR: preset ROCR_VISIBLE_DEVICES '$ROCR_VISIBLE_DEVICES' is invalid" + >&2 echo " use a single index between 0 and $(( $_scanned_num_devices - 1 ))" + exit 1 + fi + if [[ $ROCR_VISIBLE_DEVICES -ge $_scanned_num_devices ]] ; then + >&2 echo "ERROR: preset ROCR_VISIBLE_DEVICES '$ROCR_VISIBLE_DEVICES' is invalid" + >&2 echo " use a single index between 0 and $(( $_scanned_num_devices - 1 ))" + exit 1 + fi + _num_devices=1 + if [[ $_scanned_num_devices != 1 ]] && [[ "$GPURUN_VERBOSE" != "0" ]] ; then + >&2 echo "WARNING: preset ROCR_VISIBLE_DEVICES, process $_local_rank_num of $_num_local_ranks to use dev $ROCR_VISIBLE_DEVICES" + fi fi -_node_cus=$(( $_available_devices * $_available_CUs_per_device )) +_node_cus=$(( $_num_devices * $_available_CUs_per_device )) if [ $_num_local_ranks -gt $_node_cus ] ; then - >&2 echo "ERROR: Not enough node CUs ($_node_cus) for $_num_local_ranks ranks " + >&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks " exit 1 fi -if [ $_available_devices -gt $_num_local_ranks ] ; then +if [ $_num_devices 
-gt $_num_local_ranks ] ; then _utilized_devices=$_num_local_ranks else - _utilized_devices=$_available_devices + _utilized_devices=$_num_devices fi # Calculate number of GPUs to use to evenly spread ranks across GPUs. @@ -259,7 +308,18 @@ if [ $_uncovered_ranks != 0 ] ; then # then add an extra rplace per GPU to make room for remainder. _number_of_rplaces_per_GPU=$(( $_number_of_rplaces_per_GPU + 1 )) fi -_device_num=$(( $_local_rank_num / $_number_of_rplaces_per_GPU + $GPURUN_DEVICE_BIAS )) + +if [[ -z $ROCR_VISIBLE_DEVICES ]] ; then + _device_num=$(( $_local_rank_num / $_number_of_rplaces_per_GPU )) + # Some users want to shift selected device to avoid dev 0 + _device_num=$(( ( $_device_num + $GPURUN_DEVICE_BIAS ) % $_num_devices )) + _relative_device_num=$_device_num +else + # if caller set ROCR_VISIBLE_DEVICES, that becomes the device_num + _device_num=$ROCR_VISIBLE_DEVICES + _relative_device_num=0 +fi + _utilized_CUs_per_device=$_available_CUs_per_device _rem2=$(( $_utilized_CUs_per_device % $_number_of_rplaces_per_GPU )) # Lower utilized CUs till divisible by number of rplaces per GPU @@ -272,22 +332,23 @@ _CUs_per_rplace=$(( $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU )) # Diagnostics: if [ $_local_rank_num == 0 ] && [[ "$GPURUN_VERBOSE" == "1" || "$GPURUN_VERBOSE" == "2" ]]; then _wasted_CUs_on_each_GPU=$(( $_available_CUs_per_device - $_utilized_CUs_per_device )) - _total_GPU_rplaces=$(( $_number_of_rplaces_per_GPU * $_available_devices )) + _total_GPU_rplaces=$(( $_number_of_rplaces_per_GPU * $_num_devices )) _total_wasted_rplaces=$(( $_total_GPU_rplaces - $_num_local_ranks )) _wasted_GPUs=$(( $_total_wasted_rplaces / $_number_of_rplaces_per_GPU )) _used_cus=$(( $_num_local_ranks * $_CUs_per_rplace )) _utilization=$(( ( $_used_cus * 100 ) / $_node_cus )) - if ! [ $_available_devices -gt $_num_local_ranks ] ; then + if ! 
[ $_num_devices -gt $_num_local_ranks ] ; then if [ $_wasted_CUs_on_each_GPU != 0 ] || [ $_total_wasted_rplaces != 0 ] ; then _extra_diags=true fi fi >&2 echo "- ROCMINFO LOCATION: $ROCMINFO_BINARY" - >&2 echo "- OPENMPI RANKS: $_num_local_ranks (OMPI_COMM_WORLD_LOCAL_SIZE)" + >&2 echo "- PROCESSES: $_num_local_ranks (OMPI_COMM_WORLD_LOCAL_SIZE)" [ $_extra_diags ] && echo - >&2 echo "- AVAILALBLE GPUS: $_available_devices" + >&2 echo "- SCANNED GPUS: $_scanned_num_devices ($_sysdevdir)" + >&2 echo "- AVAILABLE GPUS: $_num_devices" [ $_extra_diags ] && \ - >&2 echo "-- USED GPUS: $(( $_available_devices - $_wasted_GPUs ))" + >&2 echo "-- USED GPUS: $(( $_num_devices - $_wasted_GPUs ))" [ $_extra_diags ] && \ >&2 echo "-- UNUSED GPUS: $(( $_total_wasted_rplaces / $_number_of_rplaces_per_GPU )) " [ $_extra_diags ] && echo @@ -299,21 +360,24 @@ if [ $_local_rank_num == 0 ] && [[ "$GPURUN_VERBOSE" == "1" || "$GPURUN_VERBOSE" >&2 echo "-- UNUSED RPLACEs: $_total_wasted_rplaces" ; \ #&2 echo "- LAST GPU UNUSED RPLACES: $(( $_total_wasted_rplaces % $_number_of_rplaces_per_GPU )) " [ $_extra_diags ] && echo - >&2 echo "- CUs PER GPU: $_available_CUs_per_device" + >&2 echo "- CUs PER GPU: $_available_CUs_per_device" [ $_extra_diags ] && \ >&2 echo "-- USED CUs PER GPU: $_utilized_CUs_per_device" [ $_extra_diags ] && \ >&2 echo "-- UNUSED CUs PER GPU:$_wasted_CUs_on_each_GPU" >&2 echo "- CUs PER RPLACE: $_CUs_per_rplace (OMPX_TARGET_TEAM_PROCS)" >&2 echo "- FORMULA: OMPX_TARGET_TEAM_PROCS = $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU" - >&2 echo "- NODE UTILIZATION: $_utilization %" + if [[ -z "$HSA_CU_MASK" ]] ; then + # node utilization could be incorrect with preset mask. 
+ >&2 echo "- NODE UTILIZATION: $_utilization %" + fi fi if [ $_CUs_per_rplace != $_available_CUs_per_device ] ; then # Build the CU mask for this rank, bits_to_set = _CUs_per_rplace _bits_to_set=$_CUs_per_rplace # This formula keeps adjacent ranks on same GPU which should be preferred - _bits_to_shift=$(( ( $_local_rank_num * $_bits_to_set) - ( _device_num * $_utilized_CUs_per_device) )) + _bits_to_shift=$(( ( $_local_rank_num * $_bits_to_set) - ( _relative_device_num * $_utilized_CUs_per_device) )) # use bc because these values can be very large _unshifted_bits=`echo "(2 ^ $_bits_to_set) - 1" | bc` _mask=`echo "obase=16; $_unshifted_bits * (2 ^ $_bits_to_shift)" | bc` @@ -325,57 +389,51 @@ if [ $_CUs_per_rplace != $_available_CUs_per_device ] ; then _mask="0x$_mask" fi -# Get NUMANODE and cpuset for this GPU identified by BDFID -_bdfid=${bdfids[$_device_num]} -_bdfidstr=`echo "obase=16; $_bdfid" | bc | tr '[:upper:]' '[:lower:]'` -if [ ${#_bdfidstr} == 3 ] ; then - _bdfidstrc="0${_bdfidstr:0:1}:${_bdfidstr:1:2}" -else - _bdfidstrc="${_bdfidstr:0:2}:${_bdfidstr:2:2}" -fi -#NUMANODE=`lspci -vmm -s $_bdfidstrc | grep -m 1 NUMANode | cut -d":" -f2` -NUMANODE=`cat /sys/bus/pci/devices/0000:$_bdfidstrc.0/numa_node` -# Sometimes lscpu shows 0 for the node, when /sys/bus/pci/.. 
shows -1 -[[ $NUMANODE == -1 ]] && NUMANODE=0 -[[ "$NUMANODE" == "" ]] && NUMANODE=0 -_taskset_cmd="taskset -c " -lscpu --extended=cpu,node >$_tfile -while read _linepair ; do - _nn=`echo $_linepair | awk '{print $2}'` - if [ $_nn == $NUMANODE ]; then - _cpu=`echo $_linepair | awk '{print $1}'` - if [ $_notfirstitem ] ; then - _taskset_cmd+=",$_cpu" - else - _taskset_cmd+="$_cpu" - _notfirstitem=1 - fi - fi -done < $_tfile -rm $_tfile +# retrieve scanned info +_bdfidstrc=${_long_bdfid[$_device_num]} +NUMANODE=${_numanode[$_device_num]} +_taskset_cmd="taskset -c ${_cpulist[$_device_num]}" +# If gpurun was not given command to execute, then don't run taskset_cmd [ "$*" == "" ] && _taskset_cmd="" -# FIXME: What if existing ROCR_VISIBLE_DEVICES has multiple non-sequential -# devices not starting at 0. -[[ $_available_devices != 1 ]] && export ROCR_VISIBLE_DEVICES=$_device_num + +# only set ROCR_VISIBLE_DEVICES if not already set and multiple devices available +[[ -z $ROCR_VISIBLE_DEVICES ]] && [[ $_num_devices != 1 ]] && export ROCR_VISIBLE_DEVICES=$_device_num export OMPX_TARGET_TEAM_PROCS=$_CUs_per_rplace + +# - Limit HSA queues when multiple ranks per GPU +if [ $_number_of_rplaces_per_GPU != 1 ] ; then + # Only set these env controls if not set by caller + [[ -z "$GPU_MAX_HW_QUEUES" ]] && export GPU_MAX_HW_QUEUES=1 + [[ -z "$LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES" ]] && export LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES=1 +fi + +if [[ ! 
-z "$HSA_CU_MASK" ]] && [[ "$GPURUN_VERBOSE" != "0" ]] ; then + >&2 echo "WARNING: preset HSA_CU_MASK:$HSA_CU_MASK" +fi if [ $_CUs_per_rplace == $_available_CUs_per_device ] ; then - # Do not set HSA_CU_MASK when using all CUs - [[ "$GPURUN_VERBOSE" == "1" || "$GPURUN_VERBOSE" == "" ]] && \ + # Do not modify HSA_CU_MASK when using all CUs + if [ "$GPURUN_VERBOSE" == "1" ] || [ "$GPURUN_VERBOSE" == "" ] ; then printf "RANK:%02d D:%d PCI:%5s NN:%d \n" $_local_rank_num $_device_num $_bdfidstrc $NUMANODE >&2 - [ "$GPURUN_VERBOSE" == "2" ] && \ - printf "RANK:%02d D:%d PCI:%5s NN:%d \n taskset cmd: %s \n " \ - $_local_rank_num $_device_num $_bdfidstrc $NUMANODE "$_taskset_cmd" >&2 + fi + if [ "$GPURUN_VERBOSE" == "2" ] ; then + printf "RANK:%02d D:%d PCI:%5s NN:%d CMD:%s %s \n" $_local_rank_num $_device_num $_bdfidstrc $NUMANODE "$_taskset_cmd" "$*" >&2 + fi $_taskset_cmd $* else - # Since ROCR_VISIBLE_DEVICES only enables 1 GPU, HSA_CU_MASK starts with 0: - export HSA_CU_MASK=0:$_mask - [[ "$GPURUN_VERBOSE" == "1" || "$GPURUN_VERBOSE" == "" ]] && \ - printf "RANK:%02d D:%d PCI:%5s NN:%d CUMASK:0:%s \n" \ - $_local_rank_num $_device_num $_bdfidstrc $NUMANODE $_mask >&2 - [ "$GPURUN_VERBOSE" == "2" ] && \ - printf "RANK:%02d D:%d PCI:%5s NN:%d CUMASK:0:%s \n taskset cmd: %s \n" \ - $_local_rank_num $_device_num $_bdfidstrc $NUMANODE $_mask "$_taskset_cmd" &>2 + if [[ -z "$HSA_CU_MASK" ]] ; then + # Since ROCR_VISIBLE_DEVICES only enables 1 GPU, HSA_CU_MASK starts with 0: + export HSA_CU_MASK=0:$_mask + else + # use preset mask; drop its leading "0:" because the launch line below prepends "0:" again + _mask=${HSA_CU_MASK#0:} + fi + if [ "$GPURUN_VERBOSE" == "1" ] || [ "$GPURUN_VERBOSE" == "" ] ; then + printf "RANK:%02d D:%d PCI:%5s NN:%d CUMASK:%s \n" $_local_rank_num $_device_num $_bdfidstrc $NUMANODE "$_mask" >&2 + fi + if [ "$GPURUN_VERBOSE" == "2" ] ; then + printf "RANK:%02d D:%d PCI:%5s NN:%d CUMASK:%s CMD:%s %s \n" $_local_rank_num $_device_num $_bdfidstrc $NUMANODE "$_mask" "$_taskset_cmd" "$*" >&2 + fi HSA_CU_MASK=0:$_mask \ $_taskset_cmd $* fi