From d8316f7d9245973ab2c8467d85da06a58d566131 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Tue, 27 Nov 2018 23:03:14 +0100 Subject: [PATCH] OpenCL: thread interleaving If two threads are using the same GPU device the start time of each hash round is optimized based on the average time needed to calculate a bunch of hashes. This way to optimize the hash rate was first introduced by @SChernykh. This implementation is based on the implementation in xmrig but differs in the details. - introduce a new config option `interleave` - implement thread interleaving --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 116 ++++++++++++++++++++++++++++ xmrstak/backend/amd/amd_gpu/gpu.hpp | 19 ++++- xmrstak/backend/amd/autoAdjust.hpp | 2 +- xmrstak/backend/amd/config.tpl | 10 ++- xmrstak/backend/amd/jconf.cpp | 21 ++++- xmrstak/backend/amd/jconf.hpp | 1 + xmrstak/backend/amd/minethd.cpp | 5 ++ 7 files changed, 168 insertions(+), 6 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 58902400d..42a461fea 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -18,6 +18,7 @@ #include "xmrstak/picosha2/picosha2.hpp" #include "xmrstak/params.hpp" #include "xmrstak/version.hpp" +#include "xmrstak/net/msgstruct.hpp" #include #include @@ -34,6 +35,7 @@ #include #include #include +#include #if defined _MSC_VER #include @@ -730,11 +732,21 @@ std::vector getAMDDevices(int index) continue; } + std::vector openCLDriverVer(1024); + if((clStatus = clGetDeviceInfo(device_list[k], CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(clStatus), k); + continue; + } + + bool isHSAOpenCL = std::string(openCLDriverVer.data()).find("HSA") != std::string::npos; + // if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full 
memory ctx.deviceIdx = k; ctx.freeMem = std::min(ctx.freeMem, maxMem); ctx.name = std::string(devNameVec.data()); ctx.DeviceID = device_list[k]; + ctx.interleave = 40; printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str()); ctxVec.push_back(ctx); } @@ -936,8 +948,27 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) // create a directory for the OpenCL compile cache create_directory(get_home() + "/.openclcache"); + std::vector> interleaveData(num_gpus, nullptr); + for(int i = 0; i < num_gpus; ++i) { + const size_t devIdx = ctx[i].deviceIdx; + if(interleaveData.size() <= devIdx) + { + interleaveData.resize(devIdx + 1u, nullptr); + } + if(!interleaveData[devIdx]) + { + interleaveData[devIdx].reset(new InterleaveData{}); + interleaveData[devIdx]->lastRunTimeStamp = get_timestamp_ms(); + + } + ctx[i].idWorkerOnDevice=interleaveData[devIdx]->numThreadsOnGPU; + ++interleaveData[devIdx]->numThreadsOnGPU; + ctx[i].interleaveData = interleaveData[devIdx]; + ctx[i].interleaveData->adjustThreshold = static_cast(ctx[i].interleave)/100.0; + ctx[i].interleaveData->startAdjustThreshold = ctx[i].interleaveData->adjustThreshold; + const std::string backendName = xmrstak::params::inst().openCLVendor; if( (ctx[i].stridedIndex == 2 || ctx[i].stridedIndex == 3) && (ctx[i].rawIntensity % ctx[i].workSize) != 0) { @@ -1126,6 +1157,91 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar return ERR_SUCCESS; } +void updateTimings(GpuContext* ctx, const uint64_t t) +{ + // averagingBias = 1.0 - only the last delta time is taken into account + // averagingBias = 0.5 - the last delta time has the same weight as all the previous ones combined + // averagingBias = 0.1 - the last delta time has 10% weight of all the previous ones combined + const double averagingBias = 0.1; + + { + int64_t t2 = get_timestamp_ms(); + std::lock_guard g(ctx->interleaveData->mutex); + // 20000 means that something went wrong and we reset the average 
+ if(ctx->interleaveData->avgKernelRuntime == 0.0 || ctx->interleaveData->avgKernelRuntime > 20000.0) + ctx->interleaveData->avgKernelRuntime = (t2 - t); + else + ctx->interleaveData->avgKernelRuntime = ctx->interleaveData->avgKernelRuntime * (1.0 - averagingBias) + (t2 - t) * averagingBias; + } +} + +uint64_t interleaveAdjustDelay(GpuContext* ctx) +{ + uint64_t t0 = get_timestamp_ms(); + + if(ctx->interleaveData->numThreadsOnGPU > 1 && ctx->interleaveData->adjustThreshold > 0.0) + { + t0 = get_timestamp_ms(); + std::unique_lock g(ctx->interleaveData->mutex); + + int64_t delay = 0; + double dt = 0.0; + + if(t0 > ctx->interleaveData->lastRunTimeStamp) + dt = static_cast(t0 - ctx->interleaveData->lastRunTimeStamp); + + const double avgRuntime = ctx->interleaveData->avgKernelRuntime; + const double optimalTimeOffset = avgRuntime * ctx->interleaveData->adjustThreshold; + + // threshold where the auto adjustment is disabled + constexpr uint32_t maxDelay = 10; + constexpr double maxAutoAdjust = 0.05; + + if((dt > 0) && (dt < optimalTimeOffset)) + { + delay = static_cast((optimalTimeOffset - dt)); + if(ctx->lastDelay == delay && delay > maxDelay) + ctx->interleaveData->adjustThreshold -= 0.001; + // if the delay doubled then increase the adjustThreshold + else if(delay > 1 && ctx->lastDelay * 2 < delay) + ctx->interleaveData->adjustThreshold += 0.001; + ctx->lastDelay = delay; + + // this is std::clamp which is available in c++17 + ctx->interleaveData->adjustThreshold = std::max( + std::max(ctx->interleaveData->adjustThreshold, ctx->interleaveData->startAdjustThreshold - maxAutoAdjust), + std::min(ctx->interleaveData->adjustThreshold, ctx->interleaveData->startAdjustThreshold + maxAutoAdjust) + ); + // avoid that the auto adjustment disables interleaving + ctx->interleaveData->adjustThreshold = std::max( + ctx->interleaveData->adjustThreshold, + 0.001 + ); + } + delay = std::max(int64_t(0), delay); + + ctx->interleaveData->lastRunTimeStamp = t0 + delay; + + 
g.unlock(); + if(delay > 0) + { + // do not notify the user anymore if we reach a good delay + if(delay > maxDelay) + printer::inst()->print_msg(L1,"OpenCL Interleave %u|%u: %u/%.2lf ms - %.1lf", + ctx->deviceIdx, + ctx->idWorkerOnDevice, + static_cast(delay), + avgRuntime, + ctx->interleaveData->adjustThreshold * 100. + ); + + std::this_thread::sleep_for(std::chrono::milliseconds(delay)); + } + } + + return t0; +} + size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) { // switch to the kernel storage diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index 63c5029d7..a23a4b2a3 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -12,12 +12,23 @@ #include #include #include +#include +#include #define ERR_SUCCESS (0) #define ERR_OCL_API (2) #define ERR_STUPID_PARAMS (1) +struct InterleaveData +{ + std::mutex mutex; + double adjustThreshold = 0.4; + double startAdjustThreshold = 0.4; + double avgKernelRuntime = 0.0; + uint64_t lastRunTimeStamp = 0; + uint32_t numThreadsOnGPU = 0; +}; struct GpuContext { @@ -42,6 +53,10 @@ struct GpuContext size_t freeMem; int computeUnits; std::string name; + std::shared_ptr interleaveData; + uint32_t idWorkerOnDevice = 0u; + int interleave = 40; + uint64_t lastDelay = 0; uint32_t Nonce; @@ -54,5 +69,5 @@ std::vector getAMDDevices(int index); size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx); size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo); size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo); - - +uint64_t interleaveAdjustDelay(GpuContext* ctx); +void updateTimings(GpuContext* ctx, const uint64_t t); diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index 387e84c38..dd6928eeb 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -181,7 +181,7 @@ 
class autoAdjust conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" - " \"unroll\" : 8, \"comp_mode\" : true\n" + + " \"unroll\" : 8, \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" + " },\n"; } else diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index b43faa640..50ce3e27e 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -22,10 +22,16 @@ R"===(// generated by XMRSTAK_VERSION * to use a intensity which is not the multiple of the worksize. * If you set false and the intensity is not multiple of the worksize the miner can crash: * in this case set the intensity to a multiple of the worksize or activate comp_mode. + * interleave - Controls the starting point in time between two threads on the same GPU device relative to the last started thread. 
+ * This option has only an effect if two compute threads are using the same GPU device: valid range [0;100] + * 0 = disable thread interleaving + * 40 = each working thread waits until 40% of the hash calculation of the previously started thread is finished * "gpu_threads_conf" : * [ - * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, - * "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true }, + * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + * "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true, + * "interleave" : 40 + * }, * ], * If you do not wish to mine with your AMD GPU(s) then use: * "gpu_threads_conf" : diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index ee9c4ea0e..e4da91ca1 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -106,7 +106,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode; + const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode, *interleave; idx = GetObjectMember(oThdConf, "index"); intensity = GetObjectMember(oThdConf, "intensity"); w_size = GetObjectMember(oThdConf, "worksize"); @@ -115,11 +115,30 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) memChunk = GetObjectMember(oThdConf, "mem_chunk"); unroll = GetObjectMember(oThdConf, "unroll"); compMode = GetObjectMember(oThdConf, "comp_mode"); + interleave = GetObjectMember(oThdConf, "interleave"); if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr || stridedIndex == nullptr || unroll == nullptr || compMode == nullptr) return false; + // interleave is optional + if(interleave == nullptr) + cfg.interleave = 50; + else if(!interleave->IsUint64()) + { + printer::inst()->print_msg(L0, "ERROR: interleave must be a 
number"); + return false; + } + else if((int)interleave->GetInt64() < 0 || (int)interleave->GetInt64() > 100) + { + printer::inst()->print_msg(L0, "ERROR: interleave must be in range [0;100]"); + return false; + } + else + { + cfg.interleave = (int)interleave->GetInt64(); + } + if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64()) return false; diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp index b852c5940..61494d910 100644 --- a/xmrstak/backend/amd/jconf.hpp +++ b/xmrstak/backend/amd/jconf.hpp @@ -27,6 +27,7 @@ class jconf size_t w_size; long long cpu_aff; int stridedIndex; + int interleave = 40; int memChunk; int unroll; bool compMode; diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index 5e70f25a6..5e0d67889 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -100,6 +100,7 @@ bool minethd::init_gpus() vGpuData[i].memChunk = cfg.memChunk; vGpuData[i].compMode = cfg.compMode; vGpuData[i].unroll = cfg.unroll; + vGpuData[i].interleave = cfg.interleave; } return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS; @@ -242,6 +243,7 @@ void minethd::work_main() break; } + uint64_t t0 = interleaveAdjustDelay(pGpuCtx); cl_uint results[0x100]; memset(results,0,sizeof(cl_uint)*(0x100)); @@ -269,6 +271,9 @@ void minethd::work_main() uint64_t iStamp = get_timestamp_ms(); iHashCount.store(iCount, std::memory_order_relaxed); iTimestamp.store(iStamp, std::memory_order_relaxed); + + updateTimings(pGpuCtx, t0); + std::this_thread::yield(); }