Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[likwid-mpirun] Update for better interaction with SLURM #616

Merged
merged 3 commits into from
Sep 9, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 81 additions & 20 deletions src/applications/likwid-mpirun.lua
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ if os.getenv("LIKWID_FORCE") ~= nil then
force = true
end

local LIKWID_PIN="<INSTALLED_PREFIX>/bin/likwid-pin"
local LIKWID_PIN="likwid-pin"
local LIKWID_PERFCTR="<INSTALLED_PREFIX>/bin/likwid-perfctr"

local readHostfile = nil
Expand Down Expand Up @@ -242,6 +242,10 @@ local function writeHostfileOpenMPI(hostlist, filename)
print_stderr("ERROR: Cannot open hostfile "..filename)
mpirun_exit(1)
end
if debug then
print_stdout("DEBUG: New Hostfile for OpenMPI:")
print_stdout("DEBUG: " .. string.rep("-", 30))
end
for i, hostcontent in pairs(hostlist) do
str = hostcontent["hostname"]
if hostcontent["slots"] then
Expand All @@ -250,8 +254,14 @@ local function writeHostfileOpenMPI(hostlist, filename)
if hostcontent["maxslots"] then
str = str .. string.format(" max-slots=%d", hostcontent["maxslots"])
end
if debug then
print_stdout("DEBUG: " .. str)
end
f:write(str .. "\n")
end
if debug then
print_stdout("DEBUG: " .. string.rep("-", 30))
end
f:close()
end

Expand Down Expand Up @@ -341,6 +351,8 @@ local function writeHostfileIntelMPI(hostlist, filename)
str = hostcontent["hostname"]
if hostcontent["ppn"] then
str = str .. string.format(":%d", hostcontent["ppn"])
elseif hostcontent["slots"] then
str = str .. string.format(":%d", hostcontent["slots"])
end
f:write(str .. "\n")
end
Expand Down Expand Up @@ -638,6 +650,9 @@ local function _srun_get_mpi_types()
if line:match("srun: ([^%s]+)") then
local mpitype = line:match("srun: ([^%s]+)")
t[mpitype] = true
elseif line:match("%s+([^%s]+)") then
local mpitype = line:match("%s+([^%s]+)")
t[mpitype] = true
end
end
end
Expand All @@ -656,8 +671,33 @@ local function executeSlurm(wrapperscript, hostfile, env, nrNodes)
if not slurm_no_tasks_per_node then
opts["ntasks-per-node"] = string.format("%d", ppn)
end
opts["cpu_bind"] = "none"
opts["cpus-per-task"] = string.format("%d", tpp)
local cpumasks = {}
for _, c in pairs(cpuexprs) do
local cmask = {0, 0, 0, 0, 0, 0, 0, 0}
for _, cpu in pairs(c) do
local icpu = tonumber(cpu)
local offset = icpu % 64
local cmask_idx = math.tointeger(icpu // 64)
cmask[cmask_idx+1] = cmask[cmask_idx+1] | (1<<offset)
end
s = ""
for _, cm in pairs(cmask) do
s = string.format("%0.16x", cm) .. s
end
local idx = -1
for i=1, #s do
if string.sub(s, i, i) ~= "0" then
idx = i
break
end
end
if idx > 0 then
s = string.sub(s, idx)
end
table.insert(cpumasks, string.format("0x%s", s))
end
opts["cpu-bind"] = "mask_cpu:"..table.concat(cpumasks, ",")
--opts["cpus-per-task"] = string.format("%d", tpp)
supported_types = _srun_get_mpi_types()
if supported_types["pmi2"] then
opts["mpi"] = "pmi2"
Expand Down Expand Up @@ -907,6 +947,12 @@ local function getOmpType()
elseif line:match("libiomp%d*.so") then
omptype = "intel"
break
elseif line:match("libnvomp.so") then
omptype = "nvidia"
break
elseif line:match("libomp.so") then
omptype = "llvm"
break
elseif line:match("not a dynamic executable") then
omptype = "none"
dyn_linked = false
Expand Down Expand Up @@ -962,7 +1008,6 @@ local function assignHosts(hosts, np, ppn, tpp)
if host["maxslots"] and host["maxslots"] < ppn*tpp then
table.insert(newhosts, {hostname=host["hostname"],
slots=host["maxslots"],
ppn=ppn,
maxslots=host["maxslots"],
interface=host["interface"]})
if debug then
Expand All @@ -973,7 +1018,6 @@ local function assignHosts(hosts, np, ppn, tpp)
else
table.insert(newhosts, {hostname=host["hostname"],
slots=ppn*tpp,
ppn=ppn,
maxslots=host["slots"],
interface=host["interface"]})
if debug then
Expand Down Expand Up @@ -1018,7 +1062,6 @@ local function assignHosts(hosts, np, ppn, tpp)
mpirun_exit(1)
else
table.insert(newhosts, {hostname=host["hostname"],
ppn=ppn,
slots=ppn*tpp,
maxslots=host["slots"],
interface=host["interface"]})
Expand Down Expand Up @@ -1413,22 +1456,21 @@ local function setPerfStrings(perflist, cpuexprs)
end

local function checkLikwid()
local f = io.popen("which likwid-pin 2>/dev/null", "r")
if f ~= nil then
local s = f:read("*line")
if s ~= nil and s ~= LIKWID_PIN then
LIKWID_PIN = s
if string.sub(LIKWID_PIN, 1,1) ~= "/" then
local before = LIKWID_PIN
LIKWID_PIN = abspath(LIKWID_PIN)
if debug then
print_stdout(string.format("DEBUG: Resolved %s to %s", before, LIKWID_PIN))
end
f:close()
end
f = io.popen("which likwid-perfctr 2>/dev/null", "r")
if f ~= nil then
local s = f:read("*line")
if s ~= nil and s ~= LIKWID_PERFCTR then
LIKWID_PERFCTR = s
if string.sub(LIKWID_PERFCTR, 1,1) ~= "/" then
local before = LIKWID_PERFCTR
LIKWID_PERFCTR = abspath(LIKWID_PERFCTR)
if debug then
print_stdout(string.format("DEBUG: Resolved %s to %s", before, LIKWID_PERFCTR))
end
f:close()
end

end

local function writeWrapperScript(scriptname, execStr, hosts, envsettings, outputname)
Expand Down Expand Up @@ -1513,7 +1555,7 @@ local function writeWrapperScript(scriptname, execStr, hosts, envsettings, outpu
cpuexpr_opt = "-C"
else
table.insert(cmd, LIKWID_PIN)
table.insert(cmd,"-q")
--table.insert(cmd,"-q")
if #perf > 0 then
table.insert(only_pinned_processes, i)
end
Expand All @@ -1526,7 +1568,11 @@ local function writeWrapperScript(scriptname, execStr, hosts, envsettings, outpu
end
table.insert(cmd, skipStr)
table.insert(cmd, cpuexpr_opt)
table.insert(cmd, table.concat(cpuexprs[i], ","))
if mpitype == "slurm" then
table.insert(cmd, string.format("L:N:0-%d", #cpuexprs[i] - 1))
else
table.insert(cmd, table.concat(cpuexprs[i], ","))
end
if use_perfctr then
for j, expr in pairs(perfexprs) do
table.insert(cmd, "-g")
Expand Down Expand Up @@ -2495,6 +2541,14 @@ elseif np < #cpuexprs then
ppn = #cpuexprs
end

if debug then
print_stdout("DEBUG: Detected environment")
print_stdout("DEBUG: MPI " .. mpitype)
if omptype then
print_stdout("DEBUG: OpenMP " .. omptype)
end
end

if skipStr == "" then
if mpitype == "intelmpi" then
maj, min = getMpiVersion()
Expand Down Expand Up @@ -2545,6 +2599,13 @@ if skipStr == "" then
else
skipStr = '-s 0x3'
end
elseif omptype == "nvidia" then
if nrNodes == 1 and tpp == 2 then
skipStr = '-s 0x1'
--else
--skipStr = '-s 0x3'
end

end
end
end
Expand Down
Loading