-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlapl_cuda.cu
141 lines (135 loc) · 4.12 KB
/
lapl_cuda.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#include <stdio.h>
#include <cuda.h>
#include <defs.h>
static int idx; /* holds the current iteration index; its parity selects which dev_arr[] is input vs output */
static float *dev_arr[2]; /* GPU arrays: ping-pong buffers of Lx*Ly floats each */
__constant__ float delta; /* stencil neighbor weight sigma/(1+4*sigma); written by init_lapl_cuda() */
__constant__ float norm; /* stencil center weight 1/(1+4*sigma); written by init_lapl_cuda() */
/*
* Naive implementation of a single iteration of the lapl
* equation. Each thread takes one site of the output array
*/
/*
 * Naive single iteration of the lapl equation: one thread per output
 * site, all five stencil reads straight from global memory.
 * Grid is assumed to tile the lattice exactly (Lx = gridDim.x*NTX,
 * Ly = gridDim.y*NTY); boundaries wrap periodically.
 */
__global__ void
dev_lapl_iter(float *out, float *in)
{
  /* Global lattice extent and this thread's site coordinate */
  int lat_x = gridDim.x*NTX;
  int lat_y = gridDim.y*NTY;
  int sx = blockIdx.x*NTX + threadIdx.x;
  int sy = blockIdx.y*NTY + threadIdx.y;
  /* Linear indices of the site and its four periodic neighbors */
  int here  = sy*lat_x + sx;
  int east  = sy*lat_x + (sx + 1)%lat_x;
  int west  = sy*lat_x + (lat_x + sx - 1)%lat_x;
  int north = ((sy + 1)%lat_y)*lat_x + sx;
  int south = ((lat_y + sy - 1)%lat_y)*lat_x + sx;
  /* Five-point stencil with constant-memory weights */
  out[here] = norm*in[here]
    + delta*(in[east] + in[west] + in[north] + in[south]);
  return;
}
/*
* Optimized implementation of a single iteration of the lapl
* equation. Uses shared memory for input array with halos. Subset of
* threads used to fill-in halo data for input array
*/
/*
 * Optimized implementation of a single iteration of the lapl
 * equation. Uses shared memory for input array with halos. Subset of
 * threads used to fill-in halo data for input array.
 *
 * Assumes blockDim == (NTX, NTY) so that the shared tile matches the
 * block, and that the grid tiles the lattice exactly (boundaries wrap
 * periodically). Corner halo sites are never loaded, which is fine
 * for the five-point stencil below since corners are never read.
 */
__global__ void
dev_lapl_iter_optim(float *out, float *in)
{
int itx = threadIdx.x, ity = threadIdx.y;
int ibx = blockIdx.x, iby = blockIdx.y;
int nbx = gridDim.x, nby = gridDim.y;
/* Global site coordinate of this thread and global lattice extent */
int x = itx + ibx*NTX;
int y = ity + iby*NTY;
int Lx = nbx*NTX;
int Ly = nby*NTY;
/* Block-local input tile, padded by one halo site on each side */
__shared__ float in_sub[(NTX+2)*(NTX+2) ? (NTX+2)*(NTY+2) : (NTX+2)*(NTY+2)];
/* Fill in bulk (inner volume) sites: each thread copies its own site
   into the (+1,+1)-shifted interior of the tile */
in_sub[(itx+1)+(ity+1)*(NTX+2)] = in[y*Lx + x];
/* Flat thread index and this block's global tile origin */
int ithr = itx + NTX*ity;
int xoff = NTX*ibx;
int yoff = NTY*iby;
/* Use sub set of available threads to fill in halo data: the first
   NTX threads load the two y-halo rows, the last NTY threads load
   the two x-halo columns.
   NOTE(review): because this is an else-if, the two subsets must not
   overlap -- this needs (NTX-1)*NTY >= NTX, which holds for the
   typical NTX,NTY >= 2 but not for degenerate tiles; confirm against
   the values in defs.h */
if (ithr < NTX) {
/* y = -1 row (periodic wrap below the tile) */
int ix = ithr;
int ii = ix+1;
int vv = ix+xoff + ((Ly+yoff-1)%Ly)*Lx;
in_sub[ii] = in[vv];
/* y = NTY row (periodic wrap above the tile) */
ii = (ix+1) + (NTY+1)*(NTX+2);
vv = (ix+xoff) + ((NTY+yoff)%Ly)*Lx;
in_sub[ii] = in[vv];
} else if ((NTX-1)*NTY <= ithr) {
/* x = -1 column (periodic wrap left of the tile) */
int iy = ithr - (NTX-1)*NTY;
int ii = (iy+1)*(NTX+2);
int vv = (Lx+xoff-1)%Lx + (iy+yoff)*Lx;
in_sub[ii] = in[vv];
/* x = NTX column (periodic wrap right of the tile) */
ii = (NTX+1) + (iy+1)*(NTX+2);
vv = (NTX + xoff)%Lx + (yoff+iy)*Lx;
in_sub[ii] = in[vv];
}
/* Sync before continuing, as the next step relies on all threads
   having filled in the input shared array; barrier is outside the
   divergent halo branches so all threads reach it */
__syncthreads();
/* Heat iteration. The output array is the array in global
   memory. The input array is the subarray, with the halo data */
{
int v00 = y*Lx + x;
/* Tile-local indices of the site and its four neighbors */
int x00 = (ity+1)*(NTX+2) + (itx+1);
int x0p = (ity+1)*(NTX+2) + (itx+2);
int x0m = (ity+1)*(NTX+2) + itx;
int xp0 = (ity+2)*(NTX+2) + itx+1;
int xm0 = (ity)*(NTX+2) + itx+1;
out[v00] = norm*in_sub[x00]
+ delta*(in_sub[x0p] + in_sub[x0m] + in_sub[xp0] + in_sub[xm0]);
}
return;
}
/*
* Initialize by writing norm and delta to constant memory,
* allocating device arrays and copying input array to device
*/
/*
 * Initialize by writing norm and delta to constant memory,
 * allocating device arrays and copying input array to device.
 *
 * arr:   host input lattice of Lx*Ly floats (row-major, y*Lx + x)
 * sigma: diffusion parameter; weights are norm = 1/(1+4*sigma),
 *        delta = sigma/(1+4*sigma)
 */
extern "C" void
init_lapl_cuda(float *arr, float sigma)
{
  /* NOTE(review): device 1 is hard-coded; confirm a second GPU exists
     on the target machine */
  cudaSetDevice(1);
  size_t bytes = sizeof(float)*Lx*Ly;
  cudaMalloc((void **)&dev_arr[0], bytes);
  cudaMalloc((void **)&dev_arr[1], bytes);
  cudaMemcpy(dev_arr[0], arr, bytes, cudaMemcpyHostToDevice);
  /* Float literals avoid silent promotion of the arithmetic to double */
  float xdelta = sigma / (1.0f + 4.0f*sigma);
  float xnorm = 1.0f / (1.0f + 4.0f*sigma);
  /* Pass the __constant__ symbols themselves, not their names as
     strings: the string form of cudaMemcpyToSymbol was removed in
     CUDA 5.0 and fails with cudaErrorInvalidSymbol on later toolkits */
  cudaMemcpyToSymbol(norm, &xnorm, sizeof(float), 0, cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(delta, &xdelta, sizeof(float), 0, cudaMemcpyHostToDevice);
  /* Surface any setup error now rather than mysteriously later */
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
    fprintf(stderr, "init_lapl_cuda: %s\n", cudaGetErrorString(err));
  idx = 0;
  return;
}
/*
* This function drives the CUDA kernel. It also blocks such that
* returning from this function guarantees completion of kernel. Each
* call will increment in the lapl iteration index "idx"
*/
/*
 * This function drives the CUDA kernel. It also blocks such that
 * returning from this function guarantees completion of kernel. Each
 * call will increment in the lapl iteration index "idx".
 *
 * block_dims/thread_dims: launch geometry; element [2] maps to the
 * CUDA x dimension and [0] to z (caller passes dims reversed).
 * NOTE(review): thread_dims should give (NTX, NTY) in x,y so the
 * optimized kernel's shared tile matches the block -- confirm at the
 * call site.
 */
extern "C" void
lapl_iter_cuda(int block_dims[], int thread_dims[])
{
  dim3 thr(thread_dims[2], thread_dims[1], thread_dims[0]);
  dim3 blk(block_dims[2], block_dims[1], block_dims[0]);
  dev_lapl_iter_optim <<< blk, thr >>> (dev_arr[(idx+1)%2], dev_arr[idx%2]);
  /* Launch-configuration errors are reported here, not by the launch
     statement itself */
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
    fprintf(stderr, "lapl_iter_cuda: launch failed: %s\n",
            cudaGetErrorString(err));
  idx++;
  /* Block so callers may assume the iteration is complete on return;
     this also surfaces asynchronous execution errors */
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess)
    fprintf(stderr, "lapl_iter_cuda: kernel failed: %s\n",
            cudaGetErrorString(err));
  return;
}
/*
* Finalize by copying the output array to main memory and freeing
* arrays on device
*/
/*
 * Finalize: copy the current result (selected by iteration parity)
 * back to the host buffer, then release both device ping-pong arrays.
 */
extern "C" void
fini_lapl_cuda(float *arr)
{
  /* Blocking copy: data is valid on the host once this returns */
  cudaMemcpy(arr, dev_arr[idx % 2], sizeof(float)*Lx*Ly,
             cudaMemcpyDeviceToHost);
  for (int i = 0; i < 2; i++)
    cudaFree(dev_arr[i]);
  return;
}