diff --git a/README.md b/README.md
index d63a6a1..d1f5bcc 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,33 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-
-### (TODO: Your README)
-
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+* Yan Wu
+  * [LinkedIn](https://www.linkedin.com/in/yan-wu-a71270159/)
+* Tested on: Windows 10, i7-8750H @ 2.20GHz, 16.0GB, GTX 1060 6GB (Personal Laptop)
+* [Repo Link](https://github.com/wuyan33/Project1-CUDA-Flocking)
+### Program Results
+* Result GIFs on 5,000 boids:
+  * Naive (by LICEcap):
+    ![naive flocking, 5,000 boids](images/naive.gif)
+  * Uniform grid (by ScreenToGif):
+    ![scattered uniform grid, 5,000 boids](images/uniformGrid.gif)
+  * Coherent search (by ScreenToGif):
+    ![coherent search, 5,000 boids](images/coherentSearch.gif)
+### Performance Analysis
+  * Framerate change with increasing # of boids for naive, scattered uniform grid, and coherent uniform grid (with visualization):
+    ![FPS vs. boid count, with visualization](images/visualized_boids.PNG)
+ * Framerate change with increasing # of boids for naive, scattered uniform grid, and coherent uniform grid (without visualization):
+    ![FPS vs. boid count, without visualization](images/nonvisualized_boids.PNG)
+  As the number of boids increases, the FPS of all three methods drops. The naive method clearly performs worst at higher boid counts.
+  * Framerate change with increasing block size:
+    ![FPS vs. block size](images/increaseblocksize.PNG)
+  This test was run with visualization enabled and 10,000 boids.
+### Q & A
+* For each implementation, how does changing the number of boids affect performance? Why do you think this is?
+  * For the naive method, increasing the number of boids leads to a significant drop in FPS, because brute force checks every boid against every other boid, an O(N^2) algorithm. The other two methods also slow down, but at a much lower rate, because both have roughly linear time complexity.
+* For each implementation, how does changing the block count and block size affect performance? Why do you think this is?
+  * In my results, where I varied the block size from 32 to 512, changing it had no significant impact on performance, possibly because these kernels are bound by memory access rather than by how threads are grouped into blocks.
+* For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+  * Not much. I expected the coherent method to win, but the two algorithms turned out to perform about the same. I ran the test several times and the results varied noticeably; twice the FPS of the uniform grid method differed by over 100 between runs. That run-to-run variance may be part of the reason the expected gap did not show up.
+* Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check!
+  * In my case, performance with 27 neighboring cells is almost the same as with 8 when the boid count is below 10,000. As the boid count increases, my results favor 8 neighboring cells. While checking 27 neighbors does require more cell lookups per thread, those cells are half as wide, so each one contains fewer boids; the two effects largely offset each other at low boid counts.
diff --git a/images/coherentSearch.gif b/images/coherentSearch.gif
new file mode 100644
index 0000000..831512e
Binary files /dev/null and b/images/coherentSearch.gif differ
diff --git a/images/increaseblocksize.PNG b/images/increaseblocksize.PNG
new file mode 100644
index 0000000..9c4f9a8
Binary files /dev/null and b/images/increaseblocksize.PNG differ
diff --git a/images/naive.gif b/images/naive.gif
new file mode 100644
index 0000000..6a645e4
Binary files /dev/null and b/images/naive.gif differ
diff --git a/images/nonvisualized_boids.PNG b/images/nonvisualized_boids.PNG
new file mode 100644
index 0000000..c75fd83
Binary files /dev/null and b/images/nonvisualized_boids.PNG differ
diff --git a/images/uniformGrid.gif b/images/uniformGrid.gif
new file mode 100644
index 0000000..1a8edc5
Binary files /dev/null and b/images/uniformGrid.gif differ
diff --git a/images/visualized_boids.PNG b/images/visualized_boids.PNG
new file mode 100644
index 0000000..49a17fe
Binary files /dev/null and b/images/visualized_boids.PNG differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fdd636d..b737097 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,5 +10,5 @@ set(SOURCE_FILES
 cuda_add_library(src
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_61
     )
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..fa46809 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -5,6 +5,7 @@
 #include <glm/glm.hpp>
 #include "utilityCore.hpp"
 #include "kernel.h"
+#include "device_launch_parameters.h"
 
 // LOOK-2.1 potentially useful for doing grid-based neighbor search
 #ifndef imax
@@ -21,14 +22,14 @@
 * Check for CUDA errors; print and exit if there was a problem.
 */
void checkCUDAError(const char *msg, int line = -1) {
-  cudaError_t err = cudaGetLastError();
-  if (cudaSuccess != err) {
-    if (line >= 0) {
-      fprintf(stderr, "Line %d: ", line);
-    }
-    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
-    exit(EXIT_FAILURE);
-  }
+    cudaError_t err = cudaGetLastError();
+    if (cudaSuccess != err) {
+        if (line >= 0) {
+            fprintf(stderr, "Line %d: ", line);
+        }
+        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
 }
 
@@ -76,18 +77,18 @@ glm::vec3 *dev_vel2;
 // For efficient sorting and the uniform grid. These should always be parallel.
 int *dev_particleArrayIndices; // What index in dev_pos and dev_velX represents this particle?
 int *dev_particleGridIndices; // What grid cell is this particle in?
-// needed for use with thrust
+ // needed for use with thrust
 thrust::device_ptr<int> dev_thrust_particleArrayIndices;
 thrust::device_ptr<int> dev_thrust_particleGridIndices;
 
 int *dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs
 int *dev_gridCellEndIndices;   // to this cell?
 
-// TODO-2.3 - consider what additional buffers you might need to reshuffle
-// the position and velocity data to be coherent within cells.
+glm::vec3 *c_pos; // TODO-2.3 - position data reshuffled to be coherent within cells
+glm::vec3 *c_vel; // TODO-2.3 - velocity data reshuffled to be coherent within cells
 
-// LOOK-2.1 - Grid parameters based on simulation parameters.
-// These are automatically computed for you in Boids::initSimulation
+ // LOOK-2.1 - Grid parameters based on simulation parameters.
+ // These are automatically computed for you in Boids::initSimulation
 int gridCellCount;
 int gridSideCount;
 float gridCellWidth;
@@ -99,13 +100,13 @@ glm::vec3 gridMinimum;
 ******************/
 
__host__ __device__ unsigned int hash(unsigned int a) {
-  a = (a + 0x7ed55d16) + (a << 12);
-  a = (a ^ 0xc761c23c) ^ (a >> 19);
-  a = (a + 0x165667b1) + (a << 5);
-  a = (a + 0xd3a2646c) ^ (a << 9);
-  a = (a + 0xfd7046c5) + (a << 3);
-  a = (a ^ 0xb55a4f09) ^ (a >> 16);
-  return a;
+    a = (a + 0x7ed55d16) + (a << 12);
+    a = (a ^ 0xc761c23c) ^ (a >> 19);
+    a = (a + 0x165667b1) + (a << 5);
+    a = (a + 0xd3a2646c) ^ (a << 9);
+    a = (a + 0xfd7046c5) + (a << 3);
+    a = (a ^ 0xb55a4f09) ^ (a >> 16);
+    return a;
 }
 
 /**
@@ -113,10 +114,10 @@ __host__ __device__ unsigned int hash(unsigned int a) {
 * Function for generating a random vec3.
 */
__host__ __device__ glm::vec3 generateRandomVec3(float time, int index) {
-  thrust::default_random_engine rng(hash((int)(index * time)));
-  thrust::uniform_real_distribution<float> unitDistrib(-1, 1);
+    thrust::default_random_engine rng(hash((int)(index * time)));
+    thrust::uniform_real_distribution<float> unitDistrib(-1, 1);
 
-  return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng));
+    return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng));
 }
 
 /**
@@ -124,52 +125,62 @@ __host__ __device__ glm::vec3 generateRandomVec3(float time, int index) {
 * CUDA kernel for generating boids with a specified mass randomly around the star.
 */
__global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, float scale) {
-  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (index < N) {
-    glm::vec3 rand = generateRandomVec3(time, index);
-    arr[index].x = scale * rand.x;
-    arr[index].y = scale * rand.y;
-    arr[index].z = scale * rand.z;
-  }
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index < N) {
+        glm::vec3 rand = generateRandomVec3(time, index);
+        arr[index].x = scale * rand.x;
+        arr[index].y = scale * rand.y;
+        arr[index].z = scale * rand.z;
+    }
 }
 
 /**
 * Initialize memory, update some globals
 */
void Boids::initSimulation(int N) {
-  numObjects = N;
-  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
-
-  // LOOK-1.2 - This is basic CUDA memory management and error checking.
-  // Don't forget to cudaFree in Boids::endSimulation.
-  cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
-
-  cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
-
-  cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
-
-  // LOOK-1.2 - This is a typical CUDA kernel invocation.
-  kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
-    dev_pos, scene_scale);
-  checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
-
-  // LOOK-2.1 computing grid params
-  gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
-  int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
-  gridSideCount = 2 * halfSideCount;
-
-  gridCellCount = gridSideCount * gridSideCount * gridSideCount;
-  gridInverseCellWidth = 1.0f / gridCellWidth;
-  float halfGridWidth = gridCellWidth * halfSideCount;
-  gridMinimum.x -= halfGridWidth;
-  gridMinimum.y -= halfGridWidth;
-  gridMinimum.z -= halfGridWidth;
-
-  // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
-  cudaDeviceSynchronize();
+    numObjects = N;
+    dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+
+    // LOOK-1.2 - This is basic CUDA memory management and error checking.
+    // Don't forget to cudaFree in Boids::endSimulation.
+    cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
+
+    cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
+
+    cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
+
+    // LOOK-1.2 - This is a typical CUDA kernel invocation.
+    kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
+        dev_pos, scene_scale);
+    checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
+
+    // LOOK-2.1 computing grid params
+    gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+    int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
+    gridSideCount = 2 * halfSideCount;
+
+    gridCellCount = gridSideCount * gridSideCount * gridSideCount;
+    gridInverseCellWidth = 1.0f / gridCellWidth;
+    float halfGridWidth = gridCellWidth * halfSideCount;
+    gridMinimum.x -= halfGridWidth;
+    gridMinimum.y -= halfGridWidth;
+    gridMinimum.z -= halfGridWidth;
+
+    // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
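+    // The extra buffers allocated below support the uniform grid (2.1) and the coherent grid (2.3):
+    // dev_particleGridIndices holds each boid's grid cell index and dev_particleArrayIndices its boid index;
+    // after sorting by cell, dev_gridCellStartIndices/dev_gridCellEndIndices record each cell's span in the
+    // sorted array, and c_pos/c_vel receive copies of pos/vel reshuffled so boids in the same cell sit
+    // contiguously in memory.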
+    cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+    cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+    cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+    cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+    dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+    dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+    cudaMalloc((void**)&c_pos, N * sizeof(glm::vec3));
+    cudaMalloc((void**)&c_vel, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc additional buffers failed!");
+
+    cudaDeviceSynchronize();
 }
 
@@ -181,41 +192,41 @@ void Boids::initSimulation(int N) {
 
 /**
 * Copy the boid positions into the VBO so that they can be drawn by OpenGL.
 */
__global__ void kernCopyPositionsToVBO(int N, glm::vec3 *pos, float *vbo, float s_scale) {
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
 
-  float c_scale = -1.0f / s_scale;
+    float c_scale = -1.0f / s_scale;
 
-  if (index < N) {
-    vbo[4 * index + 0] = pos[index].x * c_scale;
-    vbo[4 * index + 1] = pos[index].y * c_scale;
-    vbo[4 * index + 2] = pos[index].z * c_scale;
-    vbo[4 * index + 3] = 1.0f;
-  }
+    if (index < N) {
+        vbo[4 * index + 0] = pos[index].x * c_scale;
+        vbo[4 * index + 1] = pos[index].y * c_scale;
+        vbo[4 * index + 2] = pos[index].z * c_scale;
+        vbo[4 * index + 3] = 1.0f;
+    }
 }
 
__global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float s_scale) {
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
-
-  if (index < N) {
-    vbo[4 * index + 0] = vel[index].x + 0.3f;
-    vbo[4 * index + 1] = vel[index].y + 0.3f;
-    vbo[4 * index + 2] = vel[index].z + 0.3f;
-    vbo[4 * index + 3] = 1.0f;
-  }
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+    if (index < N) {
+        vbo[4 * index + 0] = vel[index].x + 0.3f;
+        vbo[4 * index + 1] = vel[index].y + 0.3f;
+        vbo[4 * index + 2] = vel[index].z + 0.3f;
+        vbo[4 * index + 3] = 1.0f;
+    }
 }
 
 /**
 * Wrapper for call to the kernCopyboidsToVBO CUDA kernel.
 */
void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) {
-  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
 
-  kernCopyPositionsToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_pos, vbodptr_positions, scene_scale);
-  kernCopyVelocitiesToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
+    kernCopyPositionsToVBO<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, vbodptr_positions, scene_scale);
+    kernCopyVelocitiesToVBO<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
 
-  checkCUDAErrorWithLine("copyBoidsToVBO failed!");
+    checkCUDAErrorWithLine("copyBoidsToVBO failed!");
 
-  cudaDeviceSynchronize();
+    cudaDeviceSynchronize();
 }
 
@@ -230,10 +241,34 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
 * in the `pos` and `vel` arrays.
 */
__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
-  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
-  // Rule 2: boids try to stay a distance d away from each other
-  // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+    // initialize velocity changes generated by 3 rules.
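+    // Brute-force neighbor pass: each boid scans all N boids and accumulates three influences
+    // within their rule radii: cohesion (steer toward the perceived center of mass of neighbors),
+    // separation (move away from boids closer than rule2Distance), and alignment (match neighbors'
+    // velocity). One O(N) scan per boid is what gives this method its O(N^2) total cost.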
+    glm::vec3 self_pos = pos[iSelf];
+    glm::vec3 perceived_center(0.f), c(0.f), perceived_velocity(0.f);
+    float count_rule1 = 0, count_rule3 = 0;
+    for (int i = 0; i < N; i++) {
+        if (i == iSelf) continue;
+        float distance = glm::distance(self_pos, pos[i]);
+        // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+        if (distance < rule1Distance) {
+            perceived_center += pos[i];
+            count_rule1++;
+        }
+        // Rule 2: boids try to stay a distance d away from each other
+        if (distance < rule2Distance) {
+            c -= (pos[i] - self_pos);
+        }
+        // Rule 3: boids try to match the speed of surrounding boids
+        if (distance < rule3Distance) {
+            perceived_velocity += vel[i];
+            count_rule3++;
+        }
+    }
+    // Only average (and apply) rules 1 and 3 when a neighbor was in range, to avoid dividing by zero.
+    glm::vec3 v1(0.f), v3(0.f);
+    if (count_rule1 > 0) {
+        perceived_center /= count_rule1;
+        v1 = (perceived_center - self_pos) * rule1Scale;
+    }
+    if (count_rule3 > 0) {
+        perceived_velocity /= count_rule3;
+        v3 = perceived_velocity * rule3Scale;
+    }
+    glm::vec3 v2 = c * rule2Scale;
+    return v1 + v2 + v3;
 }
 
 /**
@@ -241,10 +276,15 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
 * For each of the `N` bodies, update its position based on its current velocity.
 */
__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
-  glm::vec3 *vel1, glm::vec3 *vel2) {
-  // Compute a new velocity based on pos and vel1
-  // Clamp the speed
-  // Record the new velocity into vel2. Question: why NOT vel1?
+    glm::vec3 *vel1, glm::vec3 *vel2) {
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N) return;
+    // Compute a new velocity based on pos and vel1
+    glm::vec3 new_vel = vel1[index] + computeVelocityChange(N, index, pos, vel1);
+    // Clamp the speed (cap new_vel at the maximum allowed speed)
+    new_vel = (glm::length(new_vel) > maxSpeed) ? glm::normalize(new_vel) * maxSpeed : new_vel;
+    // Record the new velocity into vel2. Question: why NOT vel1?
+    // (Other threads are still reading vel1 this step; writing it here would be a race.)
+    vel2[index] = new_vel;
 }
 
 /**
@@ -252,24 +292,24 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
 * For each of the `N` bodies, update its position based on its current velocity.
 */
__global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
-  // Update position by velocity
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
-  if (index >= N) {
-    return;
-  }
-  glm::vec3 thisPos = pos[index];
-  thisPos += vel[index] * dt;
-
-  // Wrap the boids around so we don't lose them
-  thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x;
-  thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y;
-  thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z;
-
-  thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x;
-  thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y;
-  thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z;
-
-  pos[index] = thisPos;
+    // Update position by velocity
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N) {
+        return;
+    }
+    glm::vec3 thisPos = pos[index];
+    thisPos += vel[index] * dt;
+
+    // Wrap the boids around so we don't lose them
+    thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x;
+    thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y;
+    thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z;
+
+    thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x;
+    thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y;
+    thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z;
+
+    pos[index] = thisPos;
 }
 
 // LOOK-2.1 Consider this method of computing a 1D index from a 3D grid index.
@@ -279,179 +319,365 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
 // for(y)
 // for(z)? Or some other order?
__device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
-  return x + y * gridResolution + z * gridResolution * gridResolution;
+    return x + y * gridResolution + z * gridResolution * gridResolution;
 }
 
__global__ void kernComputeIndices(int N, int gridResolution,
-  glm::vec3 gridMin, float inverseCellWidth,
-  glm::vec3 *pos, int *indices, int *gridIndices) {
-    // TODO-2.1
-    // - Label each boid with the index of its grid cell.
-    // - Set up a parallel array of integer indices as pointers to the actual
-    //   boid data in pos and vel1/vel2
+    glm::vec3 gridMin, float inverseCellWidth,
+    glm::vec3 *pos, int *indices, int *gridIndices) {
+    // TODO-2.1
+    // - Label each boid with the index of its grid cell.
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) return;
+    glm::ivec3 grid_index = (pos[index] - gridMin) * inverseCellWidth;
+    gridIndices[index] = gridIndex3Dto1D(grid_index.x, grid_index.y, grid_index.z, gridResolution);
+    // - Set up a parallel array of integer indices as pointers to the actual
+    //   boid data in pos and vel1/vel2
+    indices[index] = index;
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
 // does not enclose any boids
__global__ void kernResetIntBuffer(int N, int *intBuffer, int value) {
-  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (index < N) {
-    intBuffer[index] = value;
-  }
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index < N) {
+        intBuffer[index] = value;
+    }
 }
 
__global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
-  int *gridCellStartIndices, int *gridCellEndIndices) {
-  // TODO-2.1
-  // Identify the start point of each cell in the gridIndices array.
-  // This is basically a parallel unrolling of a loop that goes
-  // "this index doesn't match the one before it, must be a new cell!"
+    int *gridCellStartIndices, int *gridCellEndIndices) {
+    // TODO-2.1
+    // Identify the start point of each cell in the gridIndices array.
+    // This is basically a parallel unrolling of a loop that goes
+    // "this index doesn't match the one before it, must be a new cell!"
+
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) return;
+
+    int grid_cell_index = particleGridIndices[index];
+
+    if (index == 0) {
+        gridCellStartIndices[grid_cell_index] = index;
+    } else {
+        int prev_cell_index = particleGridIndices[index - 1];
+        if (prev_cell_index != grid_cell_index) {
+            gridCellStartIndices[grid_cell_index] = index;
+            gridCellEndIndices[prev_cell_index] = index - 1;
+        }
+    }
+    if (index == N - 1) gridCellEndIndices[grid_cell_index] = N - 1;
 }
 
__global__ void kernUpdateVelNeighborSearchScattered(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  int *particleArrayIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
-  // the number of boids that need to be checked.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+    int N, int gridResolution, glm::vec3 gridMin,
+    float inverseCellWidth, float cellWidth,
+    int *gridCellStartIndices, int *gridCellEndIndices,
+    int *particleArrayIndices,
+    glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+    // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
+    // the number of boids that need to be checked.
+    // - Identify the grid cell that this particle is in
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) return;
+    glm::ivec3 grid_index = (pos[index] - gridMin) * inverseCellWidth;
+    glm::vec3 boid = pos[index];
+    // - Identify which cells may contain neighbors. This isn't always 8.
+    // - For each cell, read the start/end indices in the boid pointer array.
+    // - Access each boid in the cell and compute velocity change from
+    //   the boids rules, if this boid is within the neighborhood distance.
+
+    glm::vec3 perceived_center(0.f), c(0.f), perceived_velocity(0.f);
+    float count_rule1 = 0, count_rule3 = 0;
+
+    for (int i = 0; i < 2; i++) {
+        int x = imax(grid_index.x - i, 0);
+        for (int j = 0; j < 2; j++) {
+            int y = imax(grid_index.y - j, 0);
+            for (int k = 0; k < 2; k++) {
+                int z = imax(grid_index.z - k, 0);
+                int cur_grid_index = gridIndex3Dto1D(x, y, z, gridResolution);
+                if (gridCellStartIndices[cur_grid_index] < 0) continue;
+                int start = gridCellStartIndices[cur_grid_index];
+                int end = gridCellEndIndices[cur_grid_index];
+                for (int b = start; b <= end; b++) {
+                    int particle_index = particleArrayIndices[b];
+                    if (particle_index == index) continue;
+                    float distance = glm::distance(pos[particle_index], boid);
+                    // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+                    if (distance < rule1Distance) {
+                        perceived_center += pos[particle_index];
+                        count_rule1++;
+                    }
+                    // Rule 2: boids try to stay a distance d away from each other
+                    if (distance < rule2Distance) {
+                        c -= (pos[particle_index] - boid);
+                    }
+                    // Rule 3: boids try to match the speed of surrounding boids
+                    if (distance < rule3Distance) {
+                        perceived_velocity += vel1[particle_index];
+                        count_rule3++;
+                    }
+                }
+            }
+        }
+    }
+    glm::vec3 new_vel = vel1[index];
+    if (count_rule1 > 0) {
+        perceived_center /= count_rule1;
+        new_vel += (perceived_center - boid) * rule1Scale;
+    }
+    if (count_rule3 > 0) {
+        perceived_velocity /= count_rule3;
+        new_vel += perceived_velocity * rule3Scale;
+    }
+    new_vel += c * rule2Scale;
+    // - Clamp the speed change before putting the new speed in vel2
+    new_vel = (glm::length(new_vel) > maxSpeed) ? glm::normalize(new_vel) * maxSpeed : new_vel;
+    vel2[index] = new_vel;
 }
 
__global__ void kernUpdateVelNeighborSearchCoherent(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
-  // except with one less level of indirection.
-  // This should expect gridCellStartIndices and gridCellEndIndices to refer
-  // directly to pos and vel1.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  //   DIFFERENCE: For best results, consider what order the cells should be
-  //   checked in to maximize the memory benefits of reordering the boids data.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+    int N, int gridResolution, glm::vec3 gridMin,
+    float inverseCellWidth, float cellWidth,
+    int *gridCellStartIndices, int *gridCellEndIndices,
+    glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+    // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
+    // except with one less level of indirection.
+    // This should expect gridCellStartIndices and gridCellEndIndices to refer
+    // directly to pos and vel1.
+    // - Identify the grid cell that this particle is in
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) return;
+    glm::ivec3 grid_index = (pos[index] - gridMin) * inverseCellWidth;
+    glm::vec3 boid = pos[index];
+    glm::vec3 perceived_center(0.f), c(0.f), perceived_velocity(0.f);
+    float count_rule1 = 0, count_rule3 = 0;
+
+    // - Identify which cells may contain neighbors. This isn't always 8.
+    for (int i = 0; i < 2; i++) {
+        int x = imax(grid_index.x - i, 0);
+        for (int j = 0; j < 2; j++) {
+            int y = imax(grid_index.y - j, 0);
+            for (int k = 0; k < 2; k++) {
+                int z = imax(grid_index.z - k, 0);
+                // renamed from grid_index to avoid shadowing the boid's cell coordinates above
+                int cell_index = gridIndex3Dto1D(x, y, z, gridResolution);
+                if (gridCellStartIndices[cell_index] == -1) continue;
+                // - For each cell, read the start/end indices in the boid pointer array.
+                int start = gridCellStartIndices[cell_index];
+                int end = gridCellEndIndices[cell_index];
+                for (int b = start; b <= end; b++) {
+                    // DIFFERENCE: For best results, consider what order the cells should be
+                    // checked in to maximize the memory benefits of reordering the boids data.
+                    // - Access each boid in the cell and compute velocity change from
+                    //   the boids rules, if this boid is within the neighborhood distance.
+                    if (b == index) continue;
+                    float distance = glm::distance(pos[b], boid);
+                    // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+                    if (distance < rule1Distance) {
+                        perceived_center += pos[b];
+                        count_rule1++;
+                    }
+                    // Rule 2: boids try to stay a distance d away from each other
+                    if (distance < rule2Distance) {
+                        c -= (pos[b] - boid);
+                    }
+                    // Rule 3: boids try to match the speed of surrounding boids
+                    if (distance < rule3Distance) {
+                        perceived_velocity += vel1[b];
+                        count_rule3++;
+                    }
+                }
+            }
+        }
+    }
+    glm::vec3 new_vel = vel1[index];
+    if (count_rule1 > 0) {
+        perceived_center /= count_rule1;
+        new_vel += (perceived_center - boid) * rule1Scale;
+    }
+    if (count_rule3 > 0) {
+        perceived_velocity /= count_rule3;
+        new_vel += perceived_velocity * rule3Scale;
+    }
+    new_vel += c * rule2Scale;
+    // - Clamp the speed change before putting the new speed in vel2
+    new_vel = (glm::length(new_vel) > maxSpeed) ? glm::normalize(new_vel) * maxSpeed : new_vel;
+    vel2[index] = new_vel;
+}
+
+__global__ void kernRearrangeArray(
+    int N, int *particleArrayIndices,
+    glm::vec3 *pos, glm::vec3 *vel, glm::vec3 *c_pos, glm::vec3 *c_vel) {
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) return;
+    int boid = particleArrayIndices[index];
+    c_pos[index] = pos[boid];
+    c_vel[index] = vel[boid];
 }
 
 /**
 * Step the entire N-body simulation by `dt` seconds.
 */
void Boids::stepSimulationNaive(float dt) {
-  // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
-  // TODO-1.2 ping-pong the velocity buffers
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
+    kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+    checkCUDAErrorWithLine("Function kernUpdateVelocityBruteForce failed!!!");
+    kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+    checkCUDAErrorWithLine("Function kernUpdatePos failed!!!");
+    // TODO-1.2 ping-pong the velocity buffers
+    std::swap(dev_vel1, dev_vel2);
 }
 
void Boids::stepSimulationScatteredGrid(float dt) {
-  // TODO-2.1
-  // Uniform Grid Neighbor search using Thrust sort.
-  // In Parallel:
-  // - label each particle with its array index as well as its grid index.
-  //   Use 2x width grids.
-  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
-  //   are welcome to do a performance comparison.
-  // - Naively unroll the loop for finding the start and end indices of each
-  //   cell's data pointers in the array of boid indices
-  // - Perform velocity updates using neighbor search
-  // - Update positions
-  // - Ping-pong buffers as needed
+    // TODO-2.1
+    // Uniform Grid Neighbor search using Thrust sort.
+    // In Parallel:
+    // - label each particle with its array index as well as its grid index.
+    //   Use 2x width grids.
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    // Reset one entry per grid cell (gridCellCount of them, not numObjects) so empty cells read -1.
+    dim3 cellBlocksPerGrid((gridCellCount + blockSize - 1) / blockSize);
+    kernResetIntBuffer<<<cellBlocksPerGrid, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+    kernResetIntBuffer<<<cellBlocksPerGrid, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+    kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(
+        numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+    dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+    dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+    // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
+    //   are welcome to do a performance comparison.
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+    // - Naively unroll the loop for finding the start and end indices of each
+    //   cell's data pointers in the array of boid indices
+    kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(
+        numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+    // - Perform velocity updates using neighbor search
+    kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(
+        numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+        dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+
+    // - Update positions
+    kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+    // - Ping-pong buffers as needed
+    std::swap(dev_vel1, dev_vel2);
 }
 
void Boids::stepSimulationCoherentGrid(float dt) {
-  // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
-  // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
-  // In Parallel:
-  // - Label each particle with its array index as well as its grid index.
-  //   Use 2x width grids
-  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
-  //   are welcome to do a performance comparison.
-  // - Naively unroll the loop for finding the start and end indices of each
-  //   cell's data pointers in the array of boid indices
-  // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
-  //   the particle data in the simulation array.
-  //   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
-  // - Perform velocity updates using neighbor search
-  // - Update positions
-  // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+    // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
+    // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
+    // In Parallel:
+    // - Label each particle with its array index as well as its grid index.
+    //   Use 2x width grids
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    dim3 cellBlocksPerGrid((gridCellCount + blockSize - 1) / blockSize);
+    kernResetIntBuffer<<<cellBlocksPerGrid, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+    kernResetIntBuffer<<<cellBlocksPerGrid, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+    kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(
+        numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+    dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+    dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+    // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
+    //   are welcome to do a performance comparison.
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+    // - Naively unroll the loop for finding the start and end indices of each
+    //   cell's data pointers in the array of boid indices
+    kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(
+        numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+    // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
+    //   the particle data in the simulation array.
+    //   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
+    kernRearrangeArray<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleArrayIndices, dev_pos, dev_vel1, c_pos, c_vel);
+    // - Perform velocity updates using neighbor search
+    kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid, blockSize>>>(
+        numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+        dev_gridCellStartIndices, dev_gridCellEndIndices, c_pos, c_vel, dev_vel2);
+    // - Update positions
+    kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, c_pos, dev_vel2);
+    // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE:
+    //   the updated positions and velocities live in the reshuffled buffers, so swap both.
+    std::swap(dev_vel1, dev_vel2);
+    std::swap(dev_pos, c_pos);
 }
 
void Boids::endSimulation() {
-  cudaFree(dev_vel1);
-  cudaFree(dev_vel2);
-  cudaFree(dev_pos);
-
-  // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+    cudaFree(dev_vel1);
+    cudaFree(dev_vel2);
+    cudaFree(dev_pos);
+
+    // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+    cudaFree(dev_gridCellEndIndices);
+    cudaFree(dev_gridCellStartIndices);
+    cudaFree(dev_particleArrayIndices);
+    cudaFree(dev_particleGridIndices);
+    cudaFree(c_pos);
+    cudaFree(c_vel);
 }
 
void Boids::unitTest() {
-  // LOOK-1.2 Feel free to write additional tests here.
-
-  // test unstable sort
-  int *dev_intKeys;
-  int *dev_intValues;
-  int N = 10;
-
-  std::unique_ptr<int[]> intKeys{ new int[N] };
-  std::unique_ptr<int[]> intValues{ new int[N] };
-
-  intKeys[0] = 0; intValues[0] = 0;
-  intKeys[1] = 1; intValues[1] = 1;
-  intKeys[2] = 0; intValues[2] = 2;
-  intKeys[3] = 3; intValues[3] = 3;
-  intKeys[4] = 0; intValues[4] = 4;
-  intKeys[5] = 2; intValues[5] = 5;
-  intKeys[6] = 2; intValues[6] = 6;
-  intKeys[7] = 0; intValues[7] = 7;
-  intKeys[8] = 5; intValues[8] = 8;
-  intKeys[9] = 6; intValues[9] = 9;
-
-  cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
-  checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
-
-  cudaMalloc((void**)&dev_intValues, N * sizeof(int));
-  checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
-
-  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
-
-  std::cout << "before unstable sort: " << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << "  key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
-  }
-
-  // How to copy data to the GPU
-  cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
-  cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
-
-  // Wrap device vectors in thrust iterators for use with thrust.
-  thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
-  thrust::device_ptr<int> dev_thrust_values(dev_intValues);
-  // LOOK-2.1 Example for using thrust::sort_by_key
-  thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
-
-  // How to copy data back to the CPU side from the GPU
-  cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  checkCUDAErrorWithLine("memcpy back failed!");
-
-  std::cout << "after unstable sort: " << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << "  key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
-  }
-
-  // cleanup
-  cudaFree(dev_intKeys);
-  cudaFree(dev_intValues);
-  checkCUDAErrorWithLine("cudaFree failed!");
-  return;
+    // LOOK-1.2 Feel free to write additional tests here.
+
+    // test unstable sort
+    int *dev_intKeys;
+    int *dev_intValues;
+    int N = 10;
+
+    std::unique_ptr<int[]> intKeys{ new int[N] };
+    std::unique_ptr<int[]> intValues{ new int[N] };
+
+    intKeys[0] = 0; intValues[0] = 0;
+    intKeys[1] = 1; intValues[1] = 1;
+    intKeys[2] = 0; intValues[2] = 2;
+    intKeys[3] = 3; intValues[3] = 3;
+    intKeys[4] = 0; intValues[4] = 4;
+    intKeys[5] = 2; intValues[5] = 5;
+    intKeys[6] = 2; intValues[6] = 6;
+    intKeys[7] = 0; intValues[7] = 7;
+    intKeys[8] = 5; intValues[8] = 8;
+    intKeys[9] = 6; intValues[9] = 9;
+
+    cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
+
+    cudaMalloc((void**)&dev_intValues, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
+
+    dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+
+    std::cout << "before unstable sort: " << std::endl;
+    for (int i = 0; i < N; i++) {
+        std::cout << "  key: " << intKeys[i];
+        std::cout << " value: " << intValues[i] << std::endl;
+    }
+
+    // How to copy data to the GPU
+    cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
+    cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
+
+    // Wrap device vectors in thrust iterators for use with thrust.
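+    // thrust::device_ptr<int> adapts a raw device pointer (from cudaMalloc) so thrust
+    // algorithms such as sort_by_key below can operate on it in place, without an extra copy.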
+    thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
+    thrust::device_ptr<int> dev_thrust_values(dev_intValues);
+    // LOOK-2.1 Example for using thrust::sort_by_key
+    thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
+
+    // How to copy data back to the CPU side from the GPU
+    cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
+    cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
+    checkCUDAErrorWithLine("memcpy back failed!");
+
+    std::cout << "after unstable sort: " << std::endl;
+    for (int i = 0; i < N; i++) {
+        std::cout << "  key: " << intKeys[i];
+        std::cout << " value: " << intValues[i] << std::endl;
+    }
+
+    // cleanup
+    cudaFree(dev_intKeys);
+    cudaFree(dev_intValues);
+    checkCUDAErrorWithLine("cudaFree failed!");
+    return;
 }
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..ddd0e3b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,8 +14,8 @@
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
 const int N_FOR_VIS = 5000;