From d8316f7d9245973ab2c8467d85da06a58d566131 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Tue, 27 Nov 2018 23:03:14 +0100 Subject: [PATCH] OpenCL: thread interleaving If two threads are using the same GPU device the start time of each hash round is optimized based on the average time needed to calculate a bunch of hashes. This way to optimize the hash rate was first introduced by @SChernykh. This implementation is based on the implementation in xmrig but differs in the details. - introduce a new config option `interleave` - implement thread interleaving --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 116 ++++++++++++++++++++++++++++ xmrstak/backend/amd/amd_gpu/gpu.hpp | 19 ++++- xmrstak/backend/amd/autoAdjust.hpp | 2 +- xmrstak/backend/amd/config.tpl | 10 ++- xmrstak/backend/amd/jconf.cpp | 21 ++++- xmrstak/backend/amd/jconf.hpp | 1 + xmrstak/backend/amd/minethd.cpp | 5 ++ 7 files changed, 168 insertions(+), 6 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 58902400d..42a461fea 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -18,6 +18,7 @@ #include "xmrstak/picosha2/picosha2.hpp" #include "xmrstak/params.hpp" #include "xmrstak/version.hpp" +#include "xmrstak/net/msgstruct.hpp" #include #include @@ -34,6 +35,7 @@ #include #include #include +#include #if defined _MSC_VER #include @@ -730,11 +732,21 @@ std::vector getAMDDevices(int index) continue; } + std::vector openCLDriverVer(1024); + if((clStatus = clGetDeviceInfo(device_list[k], CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(clStatus), k); + continue; + } + + bool isHSAOpenCL = std::string(openCLDriverVer.data()).find("HSA") != std::string::npos; + // if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full 
memory ctx.deviceIdx = k; ctx.freeMem = std::min(ctx.freeMem, maxMem); ctx.name = std::string(devNameVec.data()); ctx.DeviceID = device_list[k]; + ctx.interleave = 40; printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str()); ctxVec.push_back(ctx); } @@ -936,8 +948,27 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) // create a directory for the OpenCL compile cache create_directory(get_home() + "/.openclcache"); + std::vector> interleaveData(num_gpus, nullptr); + for(int i = 0; i < num_gpus; ++i) { + const size_t devIdx = ctx[i].deviceIdx; + if(interleaveData.size() <= devIdx) + { + interleaveData.resize(devIdx + 1u, nullptr); + } + if(!interleaveData[devIdx]) + { + interleaveData[devIdx].reset(new InterleaveData{}); + interleaveData[devIdx]->lastRunTimeStamp = get_timestamp_ms(); + + } + ctx[i].idWorkerOnDevice=interleaveData[devIdx]->numThreadsOnGPU; + ++interleaveData[devIdx]->numThreadsOnGPU; + ctx[i].interleaveData = interleaveData[devIdx]; + ctx[i].interleaveData->adjustThreshold = static_cast(ctx[i].interleave)/100.0; + ctx[i].interleaveData->startAdjustThreshold = ctx[i].interleaveData->adjustThreshold; + const std::string backendName = xmrstak::params::inst().openCLVendor; if( (ctx[i].stridedIndex == 2 || ctx[i].stridedIndex == 3) && (ctx[i].rawIntensity % ctx[i].workSize) != 0) { @@ -1126,6 +1157,91 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar return ERR_SUCCESS; } +void updateTimings(GpuContext* ctx, const uint64_t t) +{ + // averagingBias = 1.0 - only the last delta time is taken into account + // averagingBias = 0.5 - the last delta time has the same weight as all the previous ones combined + // averagingBias = 0.1 - the last delta time has 10% weight of all the previous ones combined + const double averagingBias = 0.1; + + { + int64_t t2 = get_timestamp_ms(); + std::lock_guard g(ctx->interleaveData->mutex); + // 20000 means that something went wrong and we reset the average 
+ if(ctx->interleaveData->avgKernelRuntime == 0.0 || ctx->interleaveData->avgKernelRuntime > 20000.0) + ctx->interleaveData->avgKernelRuntime = (t2 - t); + else + ctx->interleaveData->avgKernelRuntime = ctx->interleaveData->avgKernelRuntime * (1.0 - averagingBias) + (t2 - t) * averagingBias; + } +} + +uint64_t interleaveAdjustDelay(GpuContext* ctx) +{ + uint64_t t0 = get_timestamp_ms(); + + if(ctx->interleaveData->numThreadsOnGPU > 1 && ctx->interleaveData->adjustThreshold > 0.0) + { + t0 = get_timestamp_ms(); + std::unique_lock g(ctx->interleaveData->mutex); + + int64_t delay = 0; + double dt = 0.0; + + if(t0 > ctx->interleaveData->lastRunTimeStamp) + dt = static_cast(t0 - ctx->interleaveData->lastRunTimeStamp); + + const double avgRuntime = ctx->interleaveData->avgKernelRuntime; + const double optimalTimeOffset = avgRuntime * ctx->interleaveData->adjustThreshold; + + // threshold where the auto adjustment is disabled + constexpr uint32_t maxDelay = 10; + constexpr double maxAutoAdjust = 0.05; + + if((dt > 0) && (dt < optimalTimeOffset)) + { + delay = static_cast((optimalTimeOffset - dt)); + if(ctx->lastDelay == delay && delay > maxDelay) + ctx->interleaveData->adjustThreshold -= 0.001; + // if the delay doubled then increase the adjustThreshold + else if(delay > 1 && ctx->lastDelay * 2 < delay) + ctx->interleaveData->adjustThreshold += 0.001; + ctx->lastDelay = delay; + + // this is std::clamp which is available in c++17 + ctx->interleaveData->adjustThreshold = std::max( + std::max(ctx->interleaveData->adjustThreshold, ctx->interleaveData->startAdjustThreshold - maxAutoAdjust), + std::min(ctx->interleaveData->adjustThreshold, ctx->interleaveData->startAdjustThreshold + maxAutoAdjust) + ); + // avoid that the auto adjustment disables interleaving + ctx->interleaveData->adjustThreshold = std::max( + ctx->interleaveData->adjustThreshold, + 0.001 + ); + } + delay = std::max(int64_t(0), delay); + + ctx->interleaveData->lastRunTimeStamp = t0 + delay; + + 
g.unlock(); + if(delay > 0) + { + // do not notify the user anymore if we reach a good delay + if(delay > maxDelay) + printer::inst()->print_msg(L1,"OpenCL Interleave %u|%u: %u/%.2lf ms - %.1lf", + ctx->deviceIdx, + ctx->idWorkerOnDevice, + static_cast(delay), + avgRuntime, + ctx->interleaveData->adjustThreshold * 100. + ); + + std::this_thread::sleep_for(std::chrono::milliseconds(delay)); + } + } + + return t0; +} + size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) { // switch to the kernel storage diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index 63c5029d7..a23a4b2a3 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -12,12 +12,23 @@ #include #include #include +#include +#include #define ERR_SUCCESS (0) #define ERR_OCL_API (2) #define ERR_STUPID_PARAMS (1) +struct InterleaveData +{ + std::mutex mutex; + double adjustThreshold = 0.4; + double startAdjustThreshold = 0.4; + double avgKernelRuntime = 0.0; + uint64_t lastRunTimeStamp = 0; + uint32_t numThreadsOnGPU = 0; +}; struct GpuContext { @@ -42,6 +53,10 @@ struct GpuContext size_t freeMem; int computeUnits; std::string name; + std::shared_ptr interleaveData; + uint32_t idWorkerOnDevice = 0u; + int interleave = 40; + uint64_t lastDelay = 0; uint32_t Nonce; @@ -54,5 +69,5 @@ std::vector getAMDDevices(int index); size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx); size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo); size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo); - - +uint64_t interleaveAdjustDelay(GpuContext* ctx); +void updateTimings(GpuContext* ctx, const uint64_t t); diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index 387e84c38..dd6928eeb 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -181,7 +181,7 @@ 
class autoAdjust conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" - " \"unroll\" : 8, \"comp_mode\" : true\n" + + " \"unroll\" : 8, \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" + " },\n"; } else diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index b43faa640..50ce3e27e 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -22,10 +22,16 @@ R"===(// generated by XMRSTAK_VERSION * to use a intensity which is not the multiple of the worksize. * If you set false and the intensity is not multiple of the worksize the miner can crash: * in this case set the intensity to a multiple of the worksize or activate comp_mode. + * interleave - Controls the starting point in time between two threads on the same GPU device relative to the last started thread. 
+ * This option has only an effect if two compute threads are using the same GPU device: valid range [0;100] + * 0 = disable thread interleaving + * 40 = each working thread waits until 40% of the hash calculation of the previously started thread is finished * "gpu_threads_conf" : * [ - * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, - * "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true }, + * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + * "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true, + * "interleave" : 40 + * }, * ], * If you do not wish to mine with your AMD GPU(s) then use: * "gpu_threads_conf" : diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index ee9c4ea0e..e4da91ca1 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -106,7 +106,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode; + const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode, *interleave; idx = GetObjectMember(oThdConf, "index"); intensity = GetObjectMember(oThdConf, "intensity"); w_size = GetObjectMember(oThdConf, "worksize"); @@ -115,11 +115,30 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) memChunk = GetObjectMember(oThdConf, "mem_chunk"); unroll = GetObjectMember(oThdConf, "unroll"); compMode = GetObjectMember(oThdConf, "comp_mode"); + interleave = GetObjectMember(oThdConf, "interleave"); if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr || stridedIndex == nullptr || unroll == nullptr || compMode == nullptr) return false; + // interleave is optional + if(interleave == nullptr) + cfg.interleave = 50; + else if(!interleave->IsUint64()) + { + printer::inst()->print_msg(L0, "ERROR: interleave must be a 
number"); + return false; + } + else if((int)interleave->GetInt64() < 0 || (int)interleave->GetInt64() > 100) + { + printer::inst()->print_msg(L0, "ERROR: interleave must be in range [0;100]"); + return false; + } + else + { + cfg.interleave = (int)interleave->GetInt64(); + } + if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64()) return false; diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp index b852c5940..61494d910 100644 --- a/xmrstak/backend/amd/jconf.hpp +++ b/xmrstak/backend/amd/jconf.hpp @@ -27,6 +27,7 @@ class jconf size_t w_size; long long cpu_aff; int stridedIndex; + int interleave = 40; int memChunk; int unroll; bool compMode; diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index 5e70f25a6..5e0d67889 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -100,6 +100,7 @@ bool minethd::init_gpus() vGpuData[i].memChunk = cfg.memChunk; vGpuData[i].compMode = cfg.compMode; vGpuData[i].unroll = cfg.unroll; + vGpuData[i].interleave = cfg.interleave; } return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS; @@ -242,6 +243,7 @@ void minethd::work_main() break; } + uint64_t t0 = interleaveAdjustDelay(pGpuCtx); cl_uint results[0x100]; memset(results,0,sizeof(cl_uint)*(0x100)); @@ -269,6 +271,9 @@ void minethd::work_main() uint64_t iStamp = get_timestamp_ms(); iHashCount.store(iCount, std::memory_order_relaxed); iTimestamp.store(iStamp, std::memory_order_relaxed); + + updateTimings(pGpuCtx, t0); + std::this_thread::yield(); }