OpenCL: thread interleaving
If two threads are using the same GPU device, the start time of each hash round is staggered based on the average time needed to calculate a batch of hashes.

This way of optimizing the hash rate was first introduced by @SChernykh. This implementation is based on the one in xmrig but differs in the details.

- introduce a new config option `interleave`
- implement thread interleaving
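
The idea boils down to a per-round delay calculation. The sketch below is only illustrative: it uses the InterleaveData fields introduced in gpu.hpp further down, but leaves out the mutex locking and the automatic tuning of adjustThreshold that the real interleaveAdjustDelay() performs; the function name is not part of the patch.

#include <cstdint>

// Minimal sketch of the interleave delay, assuming the InterleaveData struct
// added in gpu.hpp below. The real code additionally locks the shared state
// and auto-tunes adjustThreshold.
uint64_t sketchInterleaveDelay(InterleaveData& data, const uint64_t now_ms)
{
    // e.g. interleave = 40 -> adjustThreshold = 0.4 -> start roughly 40% of one
    // average kernel runtime after the previously started thread on this device
    const double optimalTimeOffset = data.avgKernelRuntime * data.adjustThreshold;

    double dt = 0.0;
    if(now_ms > data.lastRunTimeStamp)
        dt = static_cast<double>(now_ms - data.lastRunTimeStamp);

    uint64_t delay = 0;
    if(dt > 0.0 && dt < optimalTimeOffset)
        delay = static_cast<uint64_t>(optimalTimeOffset - dt);

    data.lastRunTimeStamp = now_ms + delay; // reserve this thread's start slot
    return delay;                           // caller sleeps `delay` milliseconds before the kernel
}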
psychocrypt committed Nov 27, 2018
1 parent 76f0de7 commit 5263c99
Showing 7 changed files with 169 additions and 7 deletions.
116 changes: 116 additions & 0 deletions xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -18,6 +18,7 @@
#include "xmrstak/picosha2/picosha2.hpp"
#include "xmrstak/params.hpp"
#include "xmrstak/version.hpp"
#include "xmrstak/net/msgstruct.hpp"

#include <stdio.h>
#include <string.h>
@@ -34,6 +35,7 @@
#include <vector>
#include <string>
#include <iostream>
#include <thread>

#if defined _MSC_VER
#include <direct.h>
@@ -730,11 +732,21 @@ std::vector<GpuContext> getAMDDevices(int index)
continue;
}

std::vector<char> openCLDriverVer(1024);
if((clStatus = clGetDeviceInfo(device_list[k], CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(clStatus), k);
continue;
}

bool isHSAOpenCL = std::string(openCLDriverVer.data()).find("HSA") != std::string::npos;

// if the environment variable GPU_SINGLE_ALLOC_PERCENT is not set we cannot allocate the full memory
ctx.deviceIdx = k;
ctx.freeMem = std::min(ctx.freeMem, maxMem);
ctx.name = std::string(devNameVec.data());
ctx.DeviceID = device_list[k];
ctx.interleave = 40;
printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str());
ctxVec.push_back(ctx);
}
@@ -936,8 +948,27 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
// create a directory for the OpenCL compile cache
create_directory(get_home() + "/.openclcache");

std::vector<std::shared_ptr<InterleaveData>> interleaveData(num_gpus, nullptr);

for(int i = 0; i < num_gpus; ++i)
{
const size_t devIdx = ctx[i].deviceIdx;
if(interleaveData.size() <= devIdx)
{
interleaveData.resize(devIdx + 1u, nullptr);
}
if(!interleaveData[devIdx])
{
interleaveData[devIdx].reset(new InterleaveData{});
interleaveData[devIdx]->lastRunTimeStamp = get_timestamp_ms();

}
ctx[i].idWorkerOnDevice=interleaveData[devIdx]->numThreadsOnGPU;
++interleaveData[devIdx]->numThreadsOnGPU;
ctx[i].interleaveData = interleaveData[devIdx];
ctx[i].interleaveData->adjustThreshold = static_cast<double>(ctx[i].interleave)/100.0;
ctx[i].interleaveData->startAdjustThreshold = ctx[i].interleaveData->adjustThreshold;

const std::string backendName = xmrstak::params::inst().openCLVendor;
if( (ctx[i].stridedIndex == 2 || ctx[i].stridedIndex == 3) && (ctx[i].rawIntensity % ctx[i].workSize) != 0)
{
@@ -1126,6 +1157,91 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target
return ERR_SUCCESS;
}

void updateTimings(GpuContext* ctx, const uint64_t t)
{
// averagingBias = 1.0 - only the last delta time is taken into account
// averagingBias = 0.5 - the last delta time has the same weight as all the previous ones combined
// averagingBias = 0.1 - the last delta time has 10% weight of all the previous ones combined
const double averagingBias = 0.1;

{
int64_t t2 = get_timestamp_ms();
std::lock_guard<std::mutex> g(ctx->interleaveData->mutex);
// 20000 means that something went wrong and we reset the average
if(ctx->interleaveData->avgKernelRuntime == 0.0 || ctx->interleaveData->avgKernelRuntime > 20000.0)
ctx->interleaveData->avgKernelRuntime = (t2 - t);
else
ctx->interleaveData->avgKernelRuntime = ctx->interleaveData->avgKernelRuntime * (1.0 - averagingBias) + (t2 - t) * averagingBias;
}
}

uint64_t interleaveAdjustDelay(GpuContext* ctx)
{
uint64_t t0 = get_timestamp_ms();

if(ctx->interleaveData->numThreadsOnGPU > 1 && ctx->interleaveData->adjustThreshold > 0.0)
{
t0 = get_timestamp_ms();
std::unique_lock<std::mutex> g(ctx->interleaveData->mutex);

int64_t delay = 0;
double dt = 0.0;

if(t0 > ctx->interleaveData->lastRunTimeStamp)
dt = static_cast<double>(t0 - ctx->interleaveData->lastRunTimeStamp);

const double avgRuntime = ctx->interleaveData->avgKernelRuntime;
const double optimalTimeOffset = avgRuntime * ctx->interleaveData->adjustThreshold;

// threshold where the auto adjustment is disabled
constexpr uint32_t maxDelay = 10;
constexpr double maxAutoAdjust = 0.05;

if((dt > 0) && (dt < optimalTimeOffset))
{
delay = static_cast<int64_t>((optimalTimeOffset - dt));
if(ctx->lastDelay == delay && delay > maxDelay)
ctx->interleaveData->adjustThreshold -= 0.001;
// if the delay doubled then increase the adjustThreshold
else if(delay > 1 && ctx->lastDelay * 2 < delay)
ctx->interleaveData->adjustThreshold += 0.001;
ctx->lastDelay = delay;

// this is std::clamp which is available in c++17
ctx->interleaveData->adjustThreshold = std::max(
std::max(ctx->interleaveData->adjustThreshold, ctx->interleaveData->startAdjustThreshold - maxAutoAdjust),
std::min(ctx->interleaveData->adjustThreshold, ctx->interleaveData->startAdjustThreshold + maxAutoAdjust)
);
// avoid that the auto adjustment disables interleaving
ctx->interleaveData->adjustThreshold = std::max(
ctx->interleaveData->adjustThreshold,
0.001
);
}
delay = std::max(int64_t(0), delay);

ctx->interleaveData->lastRunTimeStamp = t0 + delay;

g.unlock();
if(delay > 0)
{
// do not notify the user anymore if we reach a good delay
if(delay > maxDelay)
printer::inst()->print_msg(L1,"OpenCL Interleave %u|%u: %u/%.2lf ms - %.1lf",
ctx->deviceIdx,
ctx->idWorkerOnDevice,
static_cast<uint32_t>(delay),
avgRuntime,
ctx->interleaveData->adjustThreshold * 100.
);

std::this_thread::sleep_for(std::chrono::milliseconds(delay));
}
}

return t0;
}

size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
{
// switch to the kernel storage
19 changes: 17 additions & 2 deletions xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -12,12 +12,23 @@
#include <stdint.h>
#include <string>
#include <vector>
#include <mutex>
#include <memory>

#define ERR_SUCCESS (0)
#define ERR_OCL_API (2)
#define ERR_STUPID_PARAMS (1)

struct InterleaveData
{
std::mutex mutex;

double adjustThreshold = 0.4;
double startAdjustThreshold = 0.4;
double avgKernelRuntime = 0.0;
uint64_t lastRunTimeStamp = 0;
uint32_t numThreadsOnGPU = 0;
};

struct GpuContext
{
@@ -42,6 +53,10 @@ struct GpuContext
size_t freeMem;
int computeUnits;
std::string name;
std::shared_ptr<InterleaveData> interleaveData;
uint32_t idWorkerOnDevice = 0u;
int interleave = 40;
uint64_t lastDelay = 0;

uint32_t Nonce;

@@ -54,5 +69,5 @@ std::vector<GpuContext> getAMDDevices(int index);
size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx);
size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo);
size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo);


uint64_t interleaveAdjustDelay(GpuContext* ctx);
void updateTimings(GpuContext* ctx, const uint64_t t);
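
A worker thread is expected to bracket every kernel run with these two new functions. A hypothetical helper illustrating that call pattern (mirroring the wiring added to minethd.cpp further below; nonce handling and error checks omitted) could look like this:

// hypothetical wrapper, not part of the patch
void runOneInterleavedRound(GpuContext* ctx, cl_uint* results, xmrstak_algo algo)
{
    const uint64_t t0 = interleaveAdjustDelay(ctx); // may sleep so this thread starts staggered
    XMRRunJob(ctx, results, algo);                  // execute the hashing kernels
    updateTimings(ctx, t0);                         // fold the measured round time into the shared average
}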
2 changes: 1 addition & 1 deletion xmrstak/backend/amd/autoAdjust.hpp
@@ -181,7 +181,7 @@ class autoAdjust
conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
" \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
" \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
" \"unroll\" : 8, \"comp_mode\" : true\n" +
" \"unroll\" : 8, \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" +
" },\n";
}
else
10 changes: 8 additions & 2 deletions xmrstak/backend/amd/config.tpl
@@ -22,10 +22,16 @@ R"===(// generated by XMRSTAK_VERSION
* to use an intensity which is not a multiple of the worksize.
* If you set false and the intensity is not a multiple of the worksize the miner can crash:
* in this case set the intensity to a multiple of the worksize or activate comp_mode.
* interleave - Controls the starting point in time between two threads on the same GPU device relative to the last started thread.
* This option only has an effect if two compute threads are using the same GPU device; valid range [0;100]
* 0 = disable thread interleaving
* 40 = each worker thread waits until 40% of the hash calculation of the previously started thread is finished
* "gpu_threads_conf" :
* [
* { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
* "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true },
* { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
* "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true,
* "interleave" : 40
* },
* ],
* If you do not wish to mine with your AMD GPU(s) then use:
* "gpu_threads_conf" :
21 changes: 20 additions & 1 deletion xmrstak/backend/amd/jconf.cpp
@@ -106,7 +106,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(!oThdConf.IsObject())
return false;

const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode;
const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode, *interleave;
idx = GetObjectMember(oThdConf, "index");
intensity = GetObjectMember(oThdConf, "intensity");
w_size = GetObjectMember(oThdConf, "worksize");
@@ -115,11 +115,30 @@
memChunk = GetObjectMember(oThdConf, "mem_chunk");
unroll = GetObjectMember(oThdConf, "unroll");
compMode = GetObjectMember(oThdConf, "comp_mode");
interleave = GetObjectMember(oThdConf, "interleave");

if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr ||
stridedIndex == nullptr || unroll == nullptr || compMode == nullptr)
return false;

// interleave is optional
if(interleave == nullptr)
cfg.interleave = 50;
else if(!interleave->IsUint64())
{
printer::inst()->print_msg(L0, "ERROR: interleave must be a number");
return false;
}
else if((int)interleave->GetInt64() < 0 || (int)interleave->GetInt64() > 100)
{
printer::inst()->print_msg(L0, "ERROR: interleave must be in range [0;100]");
return false;
}
else
{
cfg.interleave = (int)interleave->GetInt64();
}

if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
return false;

1 change: 1 addition & 0 deletions xmrstak/backend/amd/jconf.hpp
@@ -27,6 +27,7 @@ class jconf
size_t w_size;
long long cpu_aff;
int stridedIndex;
int interleave = 40;
int memChunk;
int unroll;
bool compMode;
7 changes: 6 additions & 1 deletion xmrstak/backend/amd/minethd.cpp
@@ -100,6 +100,7 @@ bool minethd::init_gpus()
vGpuData[i].memChunk = cfg.memChunk;
vGpuData[i].compMode = cfg.compMode;
vGpuData[i].unroll = cfg.unroll;
vGpuData[i].interleave = cfg.interleave;
}

return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;
@@ -242,6 +243,7 @@ void minethd::work_main()
break;
}

uint64_t t0 = interleaveAdjustDelay(pGpuCtx);

cl_uint results[0x100];
memset(results,0,sizeof(cl_uint)*(0x100));
@@ -269,6 +271,9 @@
uint64_t iStamp = get_timestamp_ms();
iHashCount.store(iCount, std::memory_order_relaxed);
iTimestamp.store(iStamp, std::memory_order_relaxed);

updateTimings(pGpuCtx, t0);

std::this_thread::yield();
}

@@ -277,4 +282,4 @@ void minethd::work_main()
}

} // namespace amd
} // namespace xmrstak
} // namespace xmrstak
