/**********************************************************/
/* Name: Vic Utilities for CUDA aka Vutils CUDA */
/* Version: 1.0 */
/* Platform: Windows 32-bit & 64-bit */
/* Type: C++ Library for CUDA */
/* Author: Vic P. aka vic4key */
/* Mail: vic4key[at]gmail.com */
/* Website: https://vic.onl/ */
/**********************************************************/
#ifndef VUTILS_CUDA_H
#define VUTILS_CUDA_H
/* Vutils Version */
#define VU_CUDA_VERSION 0x01000000 // Version 01.00.0000
/* The Conditions of Vutils */
#if !defined(_WIN32) && !defined(_WIN64)
#error Vutils CUDA requires a Windows 32-bit/64-bit platform
#endif
#ifndef __cplusplus
#error Vutils CUDA requires a C++ compiler
#endif
#include <string>
#include <utility>
#include <algorithm>
#include <cmath> // std::sqrt used by calculate_execution_configuration_3d
#include <cuda_runtime.h>
#ifdef max
#define vu_cuda_max max
#define vu_cuda_min min
#else // std::max/min
#define vu_cuda_max std::max
#define vu_cuda_min std::min
#endif // max/min
namespace vu
{
#ifndef VUTILS_H
#include "inline/defs.inl"
#include "inline/types.inl"
#include "inline/spechrs.inl"
#endif // VUTILS_H
/**
* For both Host and Device
*/
// Convert a flat row-major element index into a 2D position.
// The height parameter is accepted for symmetry but is not used.
static __host__ __device__ void convert_index_to_position_2d(
dim3& position, int index, int width, int height = 0)
{
position.x = index % width;
position.y = index / width;
position.z = 1;
}
// Convert a 2D position back into a flat row-major element index.
// The height parameter is accepted for symmetry but is not used.
static __host__ __device__ int convert_position_to_index_2d(
const dim3& position, int width, int height = 0)
{
return position.y * width + position.x;
}
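// Example (a minimal sketch): for a 4x3 grid, flat index 5 maps to position
// (x = 1, y = 1) and back again:
//
//   dim3 pos;
//   vu::convert_index_to_position_2d(pos, 5, 4); // pos = {1, 1, 1}
//   int index = vu::convert_position_to_index_2d(pos, 4); // index = 5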
/**
* For Host
*/
namespace host
{
static __host__ int device_count()
{
int count = 0;
cudaGetDeviceCount(&count);
return count;
}
static __host__ int device_id()
{
int id = -1;
cudaGetDevice(&id);
return id;
}
static __host__ std::string device_name(int id)
{
cudaDeviceProp prop = { 0 };
cudaGetDeviceProperties(&prop, id);
std::string name(prop.name);
return name;
}
static __host__ void device_synchronize()
{
cudaDeviceSynchronize();
}
static __host__ void device_reset()
{
cudaDeviceReset();
}
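// Example (a minimal sketch): enumerate the visible CUDA devices by name.
//
//   for (int id = 0; id < vu::host::device_count(); id++)
//   {
//     printf("Device %d: %s\n", id, vu::host::device_name(id).c_str());
//   }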
// Theoretical occupancy of `fn` at the given block size, i.e. the ratio of
// active warps per multiprocessor to the maximum number of warps supported.
template <typename Fn>
__host__ float calculate_occupancy(int block_size, Fn fn)
{
int max_active_blocks = 0;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks, fn, block_size, 0);
int device = 0;
cudaDeviceProp props;
cudaGetDevice(&device);
cudaGetDeviceProperties(&props, device);
float occupancy = (max_active_blocks * block_size / props.warpSize) /
float(props.maxThreadsPerMultiProcessor / props.warpSize);
return occupancy;
}
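// Example (a minimal sketch, assuming a kernel named `my_kernel`):
//
//   float occupancy = vu::host::calculate_occupancy(256, my_kernel);
//   printf("Theoretical occupancy: %.2f\n", occupancy);
//
// Block-size policy for the execution-configuration helpers below: `_auto`
// asks the occupancy API to suggest a block size, `_fixed` uses 256 threads
// per block; either way the result is rounded up to a multiple of the warp size.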
enum block_size_t : int
{
_auto = 0,
_fixed = 256,
};
template <typename Fn>
__host__ std::pair<dim3, dim3> calculate_execution_configuration_3d(
int width, int height, int depth, Fn fn, block_size_t threads_per_block = block_size_t::_fixed)
{
int min_grid_size = 0;
int num_threads_per_block = block_size_t::_fixed;
if (threads_per_block == block_size_t::_auto)
{
cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &num_threads_per_block, fn);
num_threads_per_block = static_cast<int>(std::sqrt(num_threads_per_block));
}
cudaDeviceProp prop = { 0 };
cudaGetDeviceProperties(&prop, host::device_id());
num_threads_per_block = VU_ALIGN_UP(num_threads_per_block, prop.warpSize);
int num_elements = width * height * depth;
int num_blocks_per_grid = num_elements / num_threads_per_block + 1;
num_blocks_per_grid = vu_cuda_max(min_grid_size, num_blocks_per_grid);
// dim3 block_size(num_threads_per_block, num_threads_per_block, 1);
// dim3 grid_size(width / num_threads_per_block + 1, height / num_threads_per_block + 1, depth);
dim3 grid_size(num_blocks_per_grid);
dim3 block_size(num_threads_per_block);
return { grid_size, block_size };
}
template <typename Fn>
__host__ std::pair<dim3, dim3> calculate_execution_configuration_2d(
int width, int height, Fn fn, block_size_t threads_per_block = block_size_t::_fixed)
{
return calculate_execution_configuration_3d(width, height, 1, fn, threads_per_block);
}
template <typename Fn>
__host__ std::pair<dim3, dim3> calculate_execution_configuration_1d(
int num_elements, Fn fn, block_size_t threads_per_block = block_size_t::_fixed)
{
return calculate_execution_configuration_3d(num_elements, 1, 1, fn, threads_per_block);
}
template <typename Fn>
__host__ std::pair<dim3, dim3> calculate_execution_configuration_1d(
int width, int height, Fn fn, block_size_t threads_per_block = block_size_t::_fixed)
{
return calculate_execution_configuration_1d(width * height, fn, threads_per_block);
}
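// Example (a minimal sketch, assuming a kernel `my_kernel(int* data, int width, int height)`
// and a device buffer `d_data` holding width x height elements):
//
//   auto config = vu::host::calculate_execution_configuration_2d(width, height, my_kernel);
//   my_kernel<<<config.first, config.second>>>(d_data, width, height);
//   vu::host::device_synchronize();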
} // namespace host
/**
* For Device
*/
namespace device
{
#ifdef __CUDACC__
__device__ void current_element_position_1d(dim3& position)
{
position.x = blockIdx.x * blockDim.x + threadIdx.x;
position.y = 1;
position.z = 1;
}
__device__ int current_element_index_1d()
{
return blockIdx.x * blockDim.x + threadIdx.x;
}
__device__ void current_element_position_2d(dim3& position)
{
position.x = blockIdx.x * blockDim.x + threadIdx.x;
position.y = blockIdx.y * blockDim.y + threadIdx.y;
position.z = 1;
}
__device__ int current_element_index_2d(int width, int height = 0)
{
return (blockIdx.y * blockDim.y + threadIdx.y) * width + (blockIdx.x * blockDim.x + threadIdx.x);
}
__device__ int current_element_index_3d(int width, int height, int depth)
{
// unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
// unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
// unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
// return x + (width * y) + (width * height * z);
return
(blockDim.x * blockIdx.x + threadIdx.x) +
(width * (blockDim.y * blockIdx.y + threadIdx.y)) +
(width * height * (blockDim.z * blockIdx.z + threadIdx.z));
}
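// Example (a minimal sketch of a hypothetical kernel built on these helpers):
//
//   __global__ void my_kernel(int* data, int width, int height)
//   {
//     int index = vu::device::current_element_index_2d(width, height);
//     if (index < width * height) // guard the extra threads in the last block
//     {
//       data[index] += 1;
//     }
//   }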
#endif // __CUDACC__
} // namespace device
} // namespace vu
#endif // VUTILS_CUDA_H