Skip to content

Commit

Permalink
[maxpool3d] update the coding style slightly; the dimension of an input image must be a multiple of 16; the omp version previously ran 100 times and its timing was not reported
Browse files Browse the repository at this point in the history
  • Loading branch information
Jin Z committed Feb 14, 2023
1 parent 2d9ddba commit 0420f78
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 66 deletions.
27 changes: 15 additions & 12 deletions maxpool3d-cuda/main.cu
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ int main(int argc, char** argv)
}
int i_img_width = atoi(argv[1]);
int i_img_height = atoi(argv[2]);

if (i_img_width % 16 != 0 || i_img_height % 16 != 0) {
printf("image dimension is a multiple of 16\n");
return 1;
}

int i_img_count = atoi(argv[3]);
int repeat = atoi(argv[4]);

Expand All @@ -66,10 +72,8 @@ int main(int argc, char** argv)

srand(2);

for(int j=0;j<i_img_count;j++)
{
for(int i=0;i<size_image;i++)
{
for(int j=0;j<i_img_count;j++) {
for(int i=0;i<size_image;i++) {
h_image[(j*size_image)+i] = rand()%256 / (DTYPE)255;
}
}
Expand Down Expand Up @@ -101,19 +105,19 @@ int main(int argc, char** argv)

for (int n = 0; n < repeat; n++) {
maxpool3d<<<grid_dim, block_dim>>>(d_image, d_result, Hstride, Vstride,
pool_width, pool_height, i_img_width, i_img_height, o_img_width, o_img_height);
pool_width, pool_height, i_img_width, i_img_height, o_img_width, o_img_height);
}

cudaDeviceSynchronize();
auto end = std::chrono::steady_clock::now();
auto time = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
printf("Average kernel execution time: %f (s)\n", (time * 1e-9f) / repeat);

// verification using the CPU results
cudaMemcpy(d_output, d_result, mem_size_output*i_img_count, cudaMemcpyDeviceToHost);

// verification using the CPU results
for (int z = 0; z < i_img_count; z++)
for (int y = 0; y < o_img_height; y++)
for (int z = 0; z < i_img_count; z++) {
for (int y = 0; y < o_img_height; y++) {
for (int x = 0; x < o_img_width; x++) {
const int xidx = Hstride*x;
const int yidx = Vstride*y;
Expand All @@ -129,12 +133,11 @@ int main(int argc, char** argv)
}
h_output[(((z*o_img_height)+y)*o_img_width)+x] = maxval;
}
}
}

int status = memcmp(h_output, d_output, sizeof(DTYPE)*i_img_count*o_img_width*o_img_height);
if (status == 0)
printf("PASS\n");
else
printf("FAIL\n");
printf("%s\n", (status == 0) ? "PASS" : "FAIL");

free(h_image);
free(h_output);
Expand Down
29 changes: 16 additions & 13 deletions maxpool3d-hip/main.cu
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@ int main(int argc, char** argv)
int i_img_width = atoi(argv[1]);
int i_img_height = atoi(argv[2]);
int i_img_count = atoi(argv[3]);

if (i_img_width % 16 != 0 || i_img_height % 16 != 0) {
printf("image dimension is a multiple of 16\n");
return 1;
}

int repeat = atoi(argv[4]);

int Hstride=2, Vstride=2;
Expand All @@ -66,10 +72,8 @@ int main(int argc, char** argv)

srand(2);

for(int j=0;j<i_img_count;j++)
{
for(int i=0;i<size_image;i++)
{
for(int j=0;j<i_img_count;j++) {
for(int i=0;i<size_image;i++) {
h_image[(j*size_image)+i] = rand()%256 / (DTYPE)255;
}
}
Expand Down Expand Up @@ -100,20 +104,20 @@ int main(int argc, char** argv)
auto start = std::chrono::steady_clock::now();

for (int n = 0; n < repeat; n++) {
hipLaunchKernelGGL(maxpool3d, grid_dim, block_dim, 0, 0, d_image, d_result, Hstride, Vstride,
pool_width, pool_height, i_img_width, i_img_height, o_img_width, o_img_height);
maxpool3d<<<grid_dim, block_dim>>>(d_image, d_result, Hstride, Vstride,
pool_width, pool_height, i_img_width, i_img_height, o_img_width, o_img_height);
}

hipDeviceSynchronize();
auto end = std::chrono::steady_clock::now();
auto time = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
printf("Average kernel execution time: %f (s)\n", (time * 1e-9f) / repeat);

// verification using the CPU results
hipMemcpy(d_output, d_result, mem_size_output*i_img_count, hipMemcpyDeviceToHost);

// verification using the CPU results
for (int z = 0; z < i_img_count; z++)
for (int y = 0; y < o_img_height; y++)
for (int z = 0; z < i_img_count; z++) {
for (int y = 0; y < o_img_height; y++) {
for (int x = 0; x < o_img_width; x++) {
const int xidx = Hstride*x;
const int yidx = Vstride*y;
Expand All @@ -129,12 +133,11 @@ int main(int argc, char** argv)
}
h_output[(((z*o_img_height)+y)*o_img_width)+x] = maxval;
}
}
}

int status = memcmp(h_output, d_output, sizeof(DTYPE)*i_img_count*o_img_width*o_img_height);
if (status == 0)
printf("PASS\n");
else
printf("FAIL\n");
printf("%s\n", (status == 0) ? "PASS" : "FAIL");

free(h_image);
free(h_output);
Expand Down
74 changes: 43 additions & 31 deletions maxpool3d-omp/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <chrono>
#include <omp.h>

typedef float DTYPE;

Expand All @@ -13,6 +15,13 @@ int main(int argc, char** argv)
}
int i_img_width = atoi(argv[1]);
int i_img_height = atoi(argv[2]);

// not required for the omp version
if (i_img_width % 16 != 0 || i_img_height % 16 != 0) {
printf("image dimension is a multiple of 16\n");
return 1;
}

int i_img_count = atoi(argv[3]);
int repeat = atoi(argv[4]);

Expand All @@ -32,10 +41,8 @@ int main(int argc, char** argv)

srand(2);

for(int j=0;j<i_img_count;j++)
{
for(int i=0;i<size_image;i++)
{
for(int j=0;j<i_img_count;j++) {
for(int i=0;i<size_image;i++) {
h_image[(j*size_image)+i] = rand()%256 / (DTYPE)255;
}
}
Expand All @@ -50,36 +57,42 @@ int main(int argc, char** argv)
const int pool_width = Hstride;
const int pool_height = Vstride;

#pragma omp target data map(to: h_image[0:size_image*i_img_count]) \
map(from: d_output[0:size_output*i_img_count])
{
for (int n = 0; n < 100; n++) {
#pragma omp target teams distribute parallel for collapse(3) thread_limit(256)
for (int z = 0; z < i_img_count; z++) {
for (int y = 0; y < o_img_height; y++) {
for (int x = 0; x < o_img_width; x++) {
const int xidx = Hstride*x;
const int yidx = Vstride*y;
DTYPE maxval = (DTYPE)0;
for (int r = 0; r < pool_height; r++)
{
const int idxIntmp = ((z*i_img_height + yidx + r) * i_img_width) + xidx;
for(int c = 0; c < pool_width; c++)
{
const int idxIn = idxIntmp + c;
maxval = fmaxf(maxval,h_image[idxIn]);
#pragma omp target data map(to: h_image[0:size_image*i_img_count]) \
map(from: d_output[0:size_output*i_img_count])
{
auto start = std::chrono::steady_clock::now();

for (int n = 0; n < repeat; n++) {
#pragma omp target teams distribute parallel for collapse(3) thread_limit(256)
for (int z = 0; z < i_img_count; z++) {
for (int y = 0; y < o_img_height; y++) {
for (int x = 0; x < o_img_width; x++) {
const int xidx = Hstride*x;
const int yidx = Vstride*y;
DTYPE maxval = (DTYPE)0;
for (int r = 0; r < pool_height; r++)
{
const int idxIntmp = ((z*i_img_height + yidx + r) * i_img_width) + xidx;
for(int c = 0; c < pool_width; c++)
{
const int idxIn = idxIntmp + c;
maxval = fmaxf(maxval,h_image[idxIn]);
}
}
d_output[(((z*o_img_height)+y)*o_img_width)+x] = maxval;
}
d_output[(((z*o_img_height)+y)*o_img_width)+x] = maxval;
}
}
}
}
}

auto end = std::chrono::steady_clock::now();
auto time = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
printf("Average kernel execution time: %f (s)\n", (time * 1e-9f) / repeat);
}

// verification using the CPU results
for (int z = 0; z < i_img_count; z++)
for (int y = 0; y < o_img_height; y++)
for (int z = 0; z < i_img_count; z++) {
for (int y = 0; y < o_img_height; y++) {
for (int x = 0; x < o_img_width; x++) {
const int xidx = Hstride*x;
const int yidx = Vstride*y;
Expand All @@ -95,12 +108,11 @@ int main(int argc, char** argv)
}
h_output[(((z * o_img_height) + y) * o_img_width) + x] = maxval;
}
}
}

int status = memcmp(h_output, d_output, sizeof(DTYPE)*i_img_count*o_img_height*o_img_width);
if (status == 0)
printf("PASS\n");
else
printf("FAIL\n");
printf("%s\n", (status == 0) ? "PASS" : "FAIL");

free(h_image);
free(h_output);
Expand Down
23 changes: 13 additions & 10 deletions maxpool3d-sycl/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ int main(int argc, char** argv)
}
int i_img_width = atoi(argv[1]);
int i_img_height = atoi(argv[2]);

if (i_img_width % 16 != 0 || i_img_height % 16 != 0) {
printf("image dimension is a multiple of 16\n");
return 1;
}

int i_img_count = atoi(argv[3]);
int repeat = atoi(argv[4]);

Expand All @@ -34,10 +40,8 @@ int main(int argc, char** argv)

srand(2);

for(int j=0;j<i_img_count;j++)
{
for(int i=0;i<size_image;i++)
{
for(int j=0;j<i_img_count;j++) {
for(int i=0;i<size_image;i++) {
h_image[(j*size_image)+i] = rand()%256 / (DTYPE)255;
}
}
Expand Down Expand Up @@ -105,8 +109,8 @@ int main(int argc, char** argv)
q.memcpy(d_output, d_result, mem_size_output*i_img_count).wait();

// verification using the CPU results
for (int z = 0; z < i_img_count; z++)
for (int y = 0; y < o_img_height; y++)
for (int z = 0; z < i_img_count; z++) {
for (int y = 0; y < o_img_height; y++) {
for (int x = 0; x < o_img_width; x++) {
const int xidx = Hstride*x;
const int yidx = Vstride*y;
Expand All @@ -122,12 +126,11 @@ int main(int argc, char** argv)
}
h_output[(((z*o_img_height)+y)*o_img_width)+x] = maxval;
}
}
}

int status = memcmp(h_output, d_output, sizeof(DTYPE)*i_img_count*o_img_width*o_img_height);
if (status == 0)
printf("PASS\n");
else
printf("FAIL\n");
printf("%s\n", (status == 0) ? "PASS" : "FAIL");

free(h_image);
free(h_output);
Expand Down

0 comments on commit 0420f78

Please sign in to comment.