diff --git a/README.md b/README.md
index d63a6a1..d1f5bcc 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,33 @@
**University of Pennsylvania, CIS 565: GPU Programming and Architecture,
Project 1 - Flocking**
-* (TODO) YOUR NAME HERE
- * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-
-### (TODO: Your README)
-
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+* Yan Wu
+ * [LinkedIn](https://www.linkedin.com/in/yan-wu-a71270159/)
+* Tested on: Windows 10, i7-8750H @ 2.20GHz 16.0GB, GTX 1060 6GB (Personal Laptop)
+* [Repo Link](https://github.com/wuyan33/Project1-CUDA-Flocking)
+### Program Results
+* Result GIFs on 5,000 boids:
+ * Naive (recorded with LICEcap):
+ ![](/images/naive.gif)
+ * Uniform grid (recorded with ScreenToGif):
+ ![](/images/uniformGrid.gif)
+ * Coherent search (recorded with ScreenToGif):
+ ![](/images/coherentSearch.gif)
+### Performance Analysis
+ * Framerate change with increasing # of boids for naive, scattered uniform grid, and coherent uniform grid (with visualization):
+ ![](/images/visualized_boids.PNG)
+ * Framerate change with increasing # of boids for naive, scattered uniform grid, and coherent uniform grid (without visualization):
+ ![](/images/nonvisualized_boids.PNG)
+ As the number of boids increases, the FPS of all three methods drops; the naive method clearly performs worst at higher boid counts.
+ * Framerate change with increasing block size:
+ ![](/images/increaseblocksize.PNG)
+ This test was run with visualization enabled and 10,000 boids.
+### Q & As
+* For each implementation, how does changing the number of boids affect performance? Why do you think this is?
+ * For the naive method, increasing the number of boids leads to a significant drop in FPS because the brute-force search is O(N^2): every boid checks every other boid. The other two methods also slow down, but far more gradually, because each boid only examines the boids stored in a handful of neighboring cells rather than all N of them (see the note after this Q&A list).
+* For each implementation, how does changing the block count and block size affect performance? Why do you think this is?
+ * In my results, where the block size ranged from 32 to 512 (with the block count adjusted to cover all boids), changing it did not have a significant impact on performance, possibly because these kernels are limited more by memory accesses than by how threads are grouped into blocks, so the GPU stays similarly occupied across this range.
+* For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+ * Not much. I expected the coherent method to win, but the two approaches ended up with roughly the same performance. I ran the tests several times and the results varied noticeably between runs; in two of them the FPS of the uniform grid method differed by more than 100. This run-to-run variance may be part of the reason the expected improvement is not visible.
+* Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check!
+ * In my case, performance with 27 neighboring cells is almost the same as with 8 when the boid count is below 10,000; as the boid count grows, my results favor the 8-cell search. Checking 27 cells does cost more loop iterations per thread, but the cells themselves are smaller (one rule distance wide instead of two), so each cell holds fewer boids; a back-of-the-envelope volume comparison is given below.
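+ * Back-of-the-envelope comparison (taking the maximum rule distance as d): the 27-cell search uses cells of width d and scans a volume of (3d)^3 = 27d^3, while the 8-cell search uses cells of width 2d and scans (4d)^3 = 64d^3. So although the 27-cell search loops over more cells per thread, it can actually touch fewer candidate boids; which configuration wins in practice depends on boid density and on how the extra loop overhead and memory accesses balance out.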
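+
+To make the scaling argument from the first question concrete, here is a small, self-contained back-of-the-envelope sketch. It is not part of the project code, and the boid count and average cell occupancy below are made-up illustrative values:
+```cpp
+#include <cstdio>
+
+// Rough cost model: number of pairwise distance checks in one simulation step
+// for the brute-force search vs. a uniform-grid search.
+int main() {
+    const long long N = 50000;              // number of boids (example value)
+    const long long boidsPerCell = 12;      // assumed average cell occupancy, for illustration
+    const long long cellsChecked = 8;       // 8-cell grid configuration
+
+    long long naiveChecks = N * (N - 1);                     // every boid tests every other boid
+    long long gridChecks  = N * cellsChecked * boidsPerCell; // each boid tests a few cells' worth
+
+    std::printf("naive: %lld checks per step\n", naiveChecks);
+    std::printf("grid : %lld checks per step\n", gridChecks);
+    return 0;
+}
+```
+With these illustrative numbers the naive search does roughly 2.5 billion checks per step while the grid search does a few million, which matches the gap between the curves in the plots above.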
diff --git a/images/coherentSearch.gif b/images/coherentSearch.gif
new file mode 100644
index 0000000..831512e
Binary files /dev/null and b/images/coherentSearch.gif differ
diff --git a/images/increaseblocksize.PNG b/images/increaseblocksize.PNG
new file mode 100644
index 0000000..9c4f9a8
Binary files /dev/null and b/images/increaseblocksize.PNG differ
diff --git a/images/naive.gif b/images/naive.gif
new file mode 100644
index 0000000..6a645e4
Binary files /dev/null and b/images/naive.gif differ
diff --git a/images/nonvisualized_boids.PNG b/images/nonvisualized_boids.PNG
new file mode 100644
index 0000000..c75fd83
Binary files /dev/null and b/images/nonvisualized_boids.PNG differ
diff --git a/images/uniformGrid.gif b/images/uniformGrid.gif
new file mode 100644
index 0000000..1a8edc5
Binary files /dev/null and b/images/uniformGrid.gif differ
diff --git a/images/visualized_boids.PNG b/images/visualized_boids.PNG
new file mode 100644
index 0000000..49a17fe
Binary files /dev/null and b/images/visualized_boids.PNG differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fdd636d..b737097 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,5 +10,5 @@ set(SOURCE_FILES
cuda_add_library(src
${SOURCE_FILES}
- OPTIONS -arch=sm_20
+ OPTIONS -arch=sm_61
)
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..fa46809 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -5,6 +5,7 @@
#include <glm/glm.hpp>
#include "utilityCore.hpp"
#include "kernel.h"
+#include "device_launch_parameters.h"
// LOOK-2.1 potentially useful for doing grid-based neighbor search
#ifndef imax
@@ -21,14 +22,14 @@
* Check for CUDA errors; print and exit if there was a problem.
*/
void checkCUDAError(const char *msg, int line = -1) {
- cudaError_t err = cudaGetLastError();
- if (cudaSuccess != err) {
- if (line >= 0) {
- fprintf(stderr, "Line %d: ", line);
- }
- fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
+ cudaError_t err = cudaGetLastError();
+ if (cudaSuccess != err) {
+ if (line >= 0) {
+ fprintf(stderr, "Line %d: ", line);
+ }
+ fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
+ exit(EXIT_FAILURE);
+ }
}
@@ -76,18 +77,18 @@ glm::vec3 *dev_vel2;
// For efficient sorting and the uniform grid. These should always be parallel.
int *dev_particleArrayIndices; // What index in dev_pos and dev_velX represents this particle?
int *dev_particleGridIndices; // What grid cell is this particle in?
-// needed for use with thrust
+ // needed for use with thrust
thrust::device_ptr<int> dev_thrust_particleArrayIndices;
thrust::device_ptr<int> dev_thrust_particleGridIndices;
int *dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs
int *dev_gridCellEndIndices; // to this cell?
-// TODO-2.3 - consider what additional buffers you might need to reshuffle
-// the position and velocity data to be coherent within cells.
+// TODO-2.3 - additional buffers used to reshuffle the position and velocity
+// data so that it is coherent within cells.
+glm::vec3 *c_pos;
+glm::vec3 *c_vel;
-// LOOK-2.1 - Grid parameters based on simulation parameters.
-// These are automatically computed for you in Boids::initSimulation
+ // LOOK-2.1 - Grid parameters based on simulation parameters.
+ // These are automatically computed for you in Boids::initSimulation
int gridCellCount;
int gridSideCount;
float gridCellWidth;
@@ -99,13 +100,13 @@ glm::vec3 gridMinimum;
******************/
__host__ __device__ unsigned int hash(unsigned int a) {
- a = (a + 0x7ed55d16) + (a << 12);
- a = (a ^ 0xc761c23c) ^ (a >> 19);
- a = (a + 0x165667b1) + (a << 5);
- a = (a + 0xd3a2646c) ^ (a << 9);
- a = (a + 0xfd7046c5) + (a << 3);
- a = (a ^ 0xb55a4f09) ^ (a >> 16);
- return a;
+ a = (a + 0x7ed55d16) + (a << 12);
+ a = (a ^ 0xc761c23c) ^ (a >> 19);
+ a = (a + 0x165667b1) + (a << 5);
+ a = (a + 0xd3a2646c) ^ (a << 9);
+ a = (a + 0xfd7046c5) + (a << 3);
+ a = (a ^ 0xb55a4f09) ^ (a >> 16);
+ return a;
}
/**
@@ -113,10 +114,10 @@ __host__ __device__ unsigned int hash(unsigned int a) {
* Function for generating a random vec3.
*/
__host__ __device__ glm::vec3 generateRandomVec3(float time, int index) {
- thrust::default_random_engine rng(hash((int)(index * time)));
- thrust::uniform_real_distribution<float> unitDistrib(-1, 1);
+ thrust::default_random_engine rng(hash((int)(index * time)));
+ thrust::uniform_real_distribution<float> unitDistrib(-1, 1);
- return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng));
+ return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng));
}
/**
@@ -124,52 +125,62 @@ __host__ __device__ glm::vec3 generateRandomVec3(float time, int index) {
* CUDA kernel for generating boids with a specified mass randomly around the star.
*/
__global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, float scale) {
- int index = (blockIdx.x * blockDim.x) + threadIdx.x;
- if (index < N) {
- glm::vec3 rand = generateRandomVec3(time, index);
- arr[index].x = scale * rand.x;
- arr[index].y = scale * rand.y;
- arr[index].z = scale * rand.z;
- }
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index < N) {
+ glm::vec3 rand = generateRandomVec3(time, index);
+ arr[index].x = scale * rand.x;
+ arr[index].y = scale * rand.y;
+ arr[index].z = scale * rand.z;
+ }
}
/**
* Initialize memory, update some globals
*/
void Boids::initSimulation(int N) {
- numObjects = N;
- dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
-
- // LOOK-1.2 - This is basic CUDA memory management and error checking.
- // Don't forget to cudaFree in Boids::endSimulation.
- cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3));
- checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
-
- cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3));
- checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
-
- cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3));
- checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
-
- // LOOK-1.2 - This is a typical CUDA kernel invocation.
- kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
- dev_pos, scene_scale);
- checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
-
- // LOOK-2.1 computing grid params
- gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
- int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
- gridSideCount = 2 * halfSideCount;
-
- gridCellCount = gridSideCount * gridSideCount * gridSideCount;
- gridInverseCellWidth = 1.0f / gridCellWidth;
- float halfGridWidth = gridCellWidth * halfSideCount;
- gridMinimum.x -= halfGridWidth;
- gridMinimum.y -= halfGridWidth;
- gridMinimum.z -= halfGridWidth;
-
- // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
- cudaDeviceSynchronize();
+ numObjects = N;
+ dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+
+ // LOOK-1.2 - This is basic CUDA memory management and error checking.
+ // Don't forget to cudaFree in Boids::endSimulation.
+ cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3));
+ checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
+
+ cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3));
+ checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
+
+ cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3));
+ checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
+
+ // LOOK-1.2 - This is a typical CUDA kernel invocation.
+ kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
+ dev_pos, scene_scale);
+ checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
+
+ // LOOK-2.1 computing grid params
+ gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+ int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
+ gridSideCount = 2 * halfSideCount;
+
+ gridCellCount = gridSideCount * gridSideCount * gridSideCount;
+ gridInverseCellWidth = 1.0f / gridCellWidth;
+ float halfGridWidth = gridCellWidth * halfSideCount;
+ gridMinimum.x -= halfGridWidth;
+ gridMinimum.y -= halfGridWidth;
+ gridMinimum.z -= halfGridWidth;
+
+ // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+ cudaMalloc((void**)&dev_particleArrayIndices, N *sizeof(int));
+ cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+ cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+ cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+ dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+ dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+ cudaMalloc((void**)&c_pos, N * sizeof(glm::vec3));
+ cudaMalloc((void**)&c_vel, N * sizeof(glm::vec3));
+
+
+ cudaDeviceSynchronize();
}
@@ -181,41 +192,41 @@ void Boids::initSimulation(int N) {
* Copy the boid positions into the VBO so that they can be drawn by OpenGL.
*/
__global__ void kernCopyPositionsToVBO(int N, glm::vec3 *pos, float *vbo, float s_scale) {
- int index = threadIdx.x + (blockIdx.x * blockDim.x);
+ int index = threadIdx.x + (blockIdx.x * blockDim.x);
- float c_scale = -1.0f / s_scale;
+ float c_scale = -1.0f / s_scale;
- if (index < N) {
- vbo[4 * index + 0] = pos[index].x * c_scale;
- vbo[4 * index + 1] = pos[index].y * c_scale;
- vbo[4 * index + 2] = pos[index].z * c_scale;
- vbo[4 * index + 3] = 1.0f;
- }
+ if (index < N) {
+ vbo[4 * index + 0] = pos[index].x * c_scale;
+ vbo[4 * index + 1] = pos[index].y * c_scale;
+ vbo[4 * index + 2] = pos[index].z * c_scale;
+ vbo[4 * index + 3] = 1.0f;
+ }
}
__global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float s_scale) {
- int index = threadIdx.x + (blockIdx.x * blockDim.x);
-
- if (index < N) {
- vbo[4 * index + 0] = vel[index].x + 0.3f;
- vbo[4 * index + 1] = vel[index].y + 0.3f;
- vbo[4 * index + 2] = vel[index].z + 0.3f;
- vbo[4 * index + 3] = 1.0f;
- }
+ int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+ if (index < N) {
+ vbo[4 * index + 0] = vel[index].x + 0.3f;
+ vbo[4 * index + 1] = vel[index].y + 0.3f;
+ vbo[4 * index + 2] = vel[index].z + 0.3f;
+ vbo[4 * index + 3] = 1.0f;
+ }
}
/**
* Wrapper for call to the kernCopyboidsToVBO CUDA kernel.
*/
void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) {
- dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+ dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
- kernCopyPositionsToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_pos, vbodptr_positions, scene_scale);
- kernCopyVelocitiesToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
+ kernCopyPositionsToVBO <<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, vbodptr_positions, scene_scale);
+ kernCopyVelocitiesToVBO <<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
- checkCUDAErrorWithLine("copyBoidsToVBO failed!");
+ checkCUDAErrorWithLine("copyBoidsToVBO failed!");
- cudaDeviceSynchronize();
+ cudaDeviceSynchronize();
}
@@ -230,10 +241,34 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
* in the `pos` and `vel` arrays.
*/
__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
- // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
- // Rule 2: boids try to stay a distance d away from each other
- // Rule 3: boids try to match the speed of surrounding boids
- return glm::vec3(0.0f, 0.0f, 0.0f);
+ // initialize velocity changes generated by 3 rules.
+ glm::vec3 self_pos = pos[iSelf];
+ glm::vec3 perceived_center(0.f), c(0.f), perceived_velocity(0.f);
+ float count_rule1 = 0, count_rule3 = 0;
+ for (int i = 0; i < N; i++) {
+ if (i == iSelf) continue;
+ float distance = glm::distance(self_pos, pos[i]);
+ // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+ if (distance < rule1Distance) {
+ perceived_center += pos[i];
+ count_rule1++;
+ }
+ // Rule 2: boids try to stay a distance d away from each other
+ if (distance < rule2Distance) {
+ c -= (pos[i] - self_pos);
+ }
+ // Rule 3: boids try to match the speed of surrounding boids
+ if (distance < rule3Distance) {
+ perceived_velocity += vel[i];
+ count_rule3++;
+ }
+ }
+ // Guard against dividing by zero when a boid has no neighbors for a rule.
+ glm::vec3 v1(0.f), v3(0.f);
+ if (count_rule1 > 0) {
+ perceived_center /= count_rule1;
+ v1 = (perceived_center - self_pos) * rule1Scale;
+ }
+ if (count_rule3 > 0) {
+ perceived_velocity /= count_rule3;
+ v3 = perceived_velocity * rule3Scale;
+ }
+ glm::vec3 v2 = c * rule2Scale;
+ return v1 + v2 + v3;
}
/**
@@ -241,10 +276,15 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
* For each of the `N` bodies, update its position based on its current velocity.
*/
__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
- glm::vec3 *vel1, glm::vec3 *vel2) {
- // Compute a new velocity based on pos and vel1
- // Clamp the speed
- // Record the new velocity into vel2. Question: why NOT vel1?
+ glm::vec3 *vel1, glm::vec3 *vel2) {
+ int index = threadIdx.x + (blockIdx.x * blockDim.x);
+ if (index >= N) return;
+ // Compute a new velocity based on pos and vel1
+ glm::vec3 new_vel = vel1[index] + computeVelocityChange(N, index, pos, vel1);
+ // Clamp the speed(check if new_vel exceeds max allowed speed)
+ new_vel = (glm::length(new_vel) > maxSpeed) ? glm::normalize(new_vel) * maxSpeed : new_vel;
+ // Record the new velocity into vel2. Question: why NOT vel1?
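+ // (Other threads in this same step still read the old velocities from vel1,
+ // so writing into vel2 avoids a read/write race; the buffers are swapped afterwards.)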
+ vel2[index] = new_vel;
}
/**
@@ -252,24 +292,24 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
* For each of the `N` bodies, update its position based on its current velocity.
*/
__global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
- // Update position by velocity
- int index = threadIdx.x + (blockIdx.x * blockDim.x);
- if (index >= N) {
- return;
- }
- glm::vec3 thisPos = pos[index];
- thisPos += vel[index] * dt;
-
- // Wrap the boids around so we don't lose them
- thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x;
- thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y;
- thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z;
-
- thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x;
- thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y;
- thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z;
-
- pos[index] = thisPos;
+ // Update position by velocity
+ int index = threadIdx.x + (blockIdx.x * blockDim.x);
+ if (index >= N) {
+ return;
+ }
+ glm::vec3 thisPos = pos[index];
+ thisPos += vel[index] * dt;
+
+ // Wrap the boids around so we don't lose them
+ thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x;
+ thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y;
+ thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z;
+
+ thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x;
+ thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y;
+ thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z;
+
+ pos[index] = thisPos;
}
// LOOK-2.1 Consider this method of computing a 1D index from a 3D grid index.
@@ -279,179 +319,365 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
// for(y)
// for(z)? Or some other order?
__device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
- return x + y * gridResolution + z * gridResolution * gridResolution;
+ return x + y * gridResolution + z * gridResolution * gridResolution;
}
__global__ void kernComputeIndices(int N, int gridResolution,
- glm::vec3 gridMin, float inverseCellWidth,
- glm::vec3 *pos, int *indices, int *gridIndices) {
- // TODO-2.1
- // - Label each boid with the index of its grid cell.
- // - Set up a parallel array of integer indices as pointers to the actual
- // boid data in pos and vel1/vel2
+ glm::vec3 gridMin, float inverseCellWidth,
+ glm::vec3 *pos, int *indices, int *gridIndices) {
+ // TODO-2.1
+ // - Label each boid with the index of its grid cell.
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= N) return;
+ glm::ivec3 grid_index = (pos[index] - gridMin) * inverseCellWidth;
+ // - Set up a parallel array of integer indices as pointers to the actual
+ gridIndices[index] = gridIndex3Dto1D(grid_index.x, grid_index.y, grid_index.z, gridResolution);
+ // boid data in pos and vel1/vel2
+ indices[index] = index;
}
// LOOK-2.1 Consider how this could be useful for indicating that a cell
// does not enclose any boids
__global__ void kernResetIntBuffer(int N, int *intBuffer, int value) {
- int index = (blockIdx.x * blockDim.x) + threadIdx.x;
- if (index < N) {
- intBuffer[index] = value;
- }
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index < N) {
+ intBuffer[index] = value;
+ }
}
__global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
- int *gridCellStartIndices, int *gridCellEndIndices) {
- // TODO-2.1
- // Identify the start point of each cell in the gridIndices array.
- // This is basically a parallel unrolling of a loop that goes
- // "this index doesn't match the one before it, must be a new cell!"
+ int *gridCellStartIndices, int *gridCellEndIndices) {
+ // TODO-2.1
+ // Identify the start point of each cell in the gridIndices array.
+ // This is basically a parallel unrolling of a loop that goes
+ // "this index doesn't match the one before it, must be a new cell!"
+
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= N) return;
+
+ int grid_cell_index = particleGridIndices[index];
+
+ if (index == 0) {
+ gridCellStartIndices[grid_cell_index] = index;
+ }else {
+ int prev_cell_index = particleGridIndices[index - 1];
+ if (prev_cell_index != grid_cell_index) {
+ gridCellStartIndices[grid_cell_index] = index;
+ gridCellEndIndices[prev_cell_index] = index - 1;
+ }
+ }
+ if (index == N - 1) gridCellEndIndices[grid_cell_index] = N - 1;
}
__global__ void kernUpdateVelNeighborSearchScattered(
- int N, int gridResolution, glm::vec3 gridMin,
- float inverseCellWidth, float cellWidth,
- int *gridCellStartIndices, int *gridCellEndIndices,
- int *particleArrayIndices,
- glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
- // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
- // the number of boids that need to be checked.
- // - Identify the grid cell that this particle is in
- // - Identify which cells may contain neighbors. This isn't always 8.
- // - For each cell, read the start/end indices in the boid pointer array.
- // - Access each boid in the cell and compute velocity change from
- // the boids rules, if this boid is within the neighborhood distance.
- // - Clamp the speed change before putting the new speed in vel2
+ int N, int gridResolution, glm::vec3 gridMin,
+ float inverseCellWidth, float cellWidth,
+ int *gridCellStartIndices, int *gridCellEndIndices,
+ int *particleArrayIndices,
+ glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+ // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
+ // the number of boids that need to be checked.
+ // - Identify the grid cell that this particle is in
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= N) return;
+ glm::ivec3 grid_index = (pos[index] - gridMin) * inverseCellWidth;
+ glm::vec3 boid = pos[index];
+ // - Identify which cells may contain neighbors. This isn't always 8.
+ // - For each cell, read the start/end indices in the boid pointer array.
+ // - Access each boid in the cell and compute velocity change from
+ // the boids rules, if this boid is within the neighborhood distance.
+
+ glm::vec3 perceived_center(0.f), c(0.f), perceived_velocity(0.f);
+ float count_rule1 = 0, count_rule3 = 0;
+
+ for (int i = 0; i < 2; i++) {
+ int x = imax(grid_index.x - i, 0);
+ for (int j = 0; j < 2; j++) {
+ int y = imax(grid_index.y - j, 0);
+ for (int k = 0; k < 2; k++) {
+ int z = imax(grid_index.z - k, 0);
+ int cur_grid_index = gridIndex3Dto1D(x, y, z, gridResolution);
+ if (gridCellStartIndices[cur_grid_index] < 0) continue;
+ int start = gridCellStartIndices[cur_grid_index];
+ int end = gridCellEndIndices[cur_grid_index];
+ for (int b = start; b <= end; b++) {
+ int particle_index = particleArrayIndices[b];
+ if (particle_index == index) continue; // skip this boid itself
+ float distance = glm::distance(pos[particle_index], boid);
+ // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+ if (distance < rule1Distance) {
+ perceived_center += pos[particle_index];
+ count_rule1++;
+ }
+ // Rule 2: boids try to stay a distance d away from each other
+ if (distance < rule2Distance) {
+ c -= (pos[particle_index] - boid);
+ }
+ // Rule 3: boids try to match the speed of surrounding boids
+ if (distance < rule3Distance) {
+ perceived_velocity += vel1[particle_index];
+ count_rule3++;
+ }
+ }
+ }
+ }
+ }
+ glm::vec3 new_vel = vel1[index];
+ if (count_rule1 > 0) {
+ perceived_center /= count_rule1;
+ new_vel += (perceived_center - boid) * rule1Scale;
+
+ }
+ if (count_rule3 > 0) {
+ perceived_velocity /= count_rule3;
+ new_vel += perceived_velocity * rule3Scale;
+ }
+ new_vel += c * rule2Scale;
+ // - Clamp the speed change before putting the new speed in vel2
+ new_vel = (glm::length(new_vel) > maxSpeed) ? glm::normalize(new_vel) * maxSpeed : new_vel;
+ vel2[index] = new_vel;
}
__global__ void kernUpdateVelNeighborSearchCoherent(
- int N, int gridResolution, glm::vec3 gridMin,
- float inverseCellWidth, float cellWidth,
- int *gridCellStartIndices, int *gridCellEndIndices,
- glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
- // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
- // except with one less level of indirection.
- // This should expect gridCellStartIndices and gridCellEndIndices to refer
- // directly to pos and vel1.
- // - Identify the grid cell that this particle is in
- // - Identify which cells may contain neighbors. This isn't always 8.
- // - For each cell, read the start/end indices in the boid pointer array.
- // DIFFERENCE: For best results, consider what order the cells should be
- // checked in to maximize the memory benefits of reordering the boids data.
- // - Access each boid in the cell and compute velocity change from
- // the boids rules, if this boid is within the neighborhood distance.
- // - Clamp the speed change before putting the new speed in vel2
+ int N, int gridResolution, glm::vec3 gridMin,
+ float inverseCellWidth, float cellWidth,
+ int *gridCellStartIndices, int *gridCellEndIndices,
+ glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+ // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
+ // except with one less level of indirection.
+ // This should expect gridCellStartIndices and gridCellEndIndices to refer
+ // directly to pos and vel1.
+ // - Identify the grid cell that this particle is in
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= N) return;
+ glm::ivec3 grid_index = (pos[index] - gridMin) * inverseCellWidth;
+ glm::vec3 boid = pos[index];
+ glm::vec3 perceived_center(0.f), c(0.f), perceived_velocity(0.f);
+ float count_rule1 = 0, count_rule3 = 0;
+
+ // - Identify which cells may contain neighbors. This isn't always 8.
+ for (int i = 0; i < 2; i++) {
+ int x = imax(grid_index.x - i, 0);
+ for (int j = 0; j < 2; j++) {
+ int y = imax(grid_index.y - j, 0);
+ for (int k = 0; k < 2; k++) {
+ int z = imax(grid_index.z - k, 0);
+ int cur_grid_index = gridIndex3Dto1D(x, y, z, gridResolution);
+ if (gridCellStartIndices[cur_grid_index] == -1) continue;
+ // - For each cell, read the start/end indices in the boid pointer array.
+ int start = gridCellStartIndices[cur_grid_index];
+ int end = gridCellEndIndices[cur_grid_index];
+ for (int b = start; b <= end; b++) {
+ // DIFFERENCE: For best results, consider what order the cells should be
+ // checked in to maximize the memory benefits of reordering the boids data.
+ // - Access each boid in the cell and compute velocity change from
+ // the boids rules, if this boid is within the neighborhood distance.
+ float distance = glm::distance(pos[b], boid);
+ if (b == index) continue;
+ // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+ if (distance < rule1Distance) {
+ perceived_center += pos[b];
+ count_rule1++;
+ }
+ // Rule 2: boids try to stay a distance d away from each other
+ if (distance < rule2Distance) {
+ c -= (pos[b] - boid);
+ }
+ // Rule 3: boids try to match the speed of surrounding boids
+ if (distance < rule3Distance) {
+ perceived_velocity += vel1[b];
+ count_rule3++;
+ }
+ }
+ }
+ }
+ }
+ // - Clamp the speed change before putting the new speed in vel2
+ glm::vec3 new_vel = vel1[index];
+ if (count_rule1 > 0) {
+ perceived_center /= count_rule1;
+ new_vel += (perceived_center - boid) * rule1Scale;
+
+ }
+ if (count_rule3 > 0) {
+ perceived_velocity /= count_rule3;
+ new_vel += perceived_velocity * rule3Scale;
+ }
+ new_vel += c * rule2Scale;
+ // - Clamp the speed change before putting the new speed in vel2
+ new_vel = (glm::length(new_vel) > maxSpeed) ? glm::normalize(new_vel) * maxSpeed : new_vel;
+ vel2[index] = new_vel;
+}
+
+__global__ void kernRearrangeArray(
+ int N, int *particleArrayIndices,
+ glm::vec3 *pos, glm::vec3 *vel, glm::vec3 *c_pos, glm::vec3 *c_vel) {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= N) return;
+ int boid = particleArrayIndices[index];
+ c_pos[index] = pos[boid];
+ c_vel[index] = vel[boid];
}
/**
* Step the entire N-body simulation by `dt` seconds.
*/
void Boids::stepSimulationNaive(float dt) {
- // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
- // TODO-1.2 ping-pong the velocity buffers
+ dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+ // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
+ kernUpdateVelocityBruteForce<<< fullBlocksPerGrid, blockSize >>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+ checkCUDAErrorWithLine("Function kernUpdateVelocityBruteForce failed!!!");
+ kernUpdatePos<<< fullBlocksPerGrid, blockSize >>>(numObjects, dt, dev_pos, dev_vel2);
+ checkCUDAErrorWithLine("Function kernUpdatePos failed!!!");
+ // TODO-1.2 ping-pong the velocity buffers
+ std::swap(dev_vel1, dev_vel2);
}
void Boids::stepSimulationScatteredGrid(float dt) {
- // TODO-2.1
- // Uniform Grid Neighbor search using Thrust sort.
- // In Parallel:
- // - label each particle with its array index as well as its grid index.
- // Use 2x width grids.
- // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
- // are welcome to do a performance comparison.
- // - Naively unroll the loop for finding the start and end indices of each
- // cell's data pointers in the array of boid indices
- // - Perform velocity updates using neighbor search
- // - Update positions
- // - Ping-pong buffers as needed
+ // TODO-2.1
+ // Uniform Grid Neighbor search using Thrust sort.
+ // In Parallel:
+ // - label each particle with its array index as well as its grid index.
+ // Use 2x width grids.
+ dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+ // The cell start/end buffers are gridCellCount long, so reset every cell, not just numObjects entries.
+ dim3 cellBlocksPerGrid((gridCellCount + blockSize - 1) / blockSize);
+ kernResetIntBuffer<<< cellBlocksPerGrid, blockSize >>>(gridCellCount, dev_gridCellStartIndices, -1);
+ kernResetIntBuffer<<< cellBlocksPerGrid, blockSize >>>(gridCellCount, dev_gridCellEndIndices, -1);
+ kernComputeIndices<<< fullBlocksPerGrid, blockSize >>>(
+ numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+ dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+ dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+ // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
+ // are welcome to do a performance comparison.
+ thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+ // - Naively unroll the loop for finding the start and end indices of each
+ // cell's data pointers in the array of boid indices
+ kernIdentifyCellStartEnd<<< fullBlocksPerGrid, blockSize >>>(
+ numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+ // - Perform velocity updates using neighbor search
+ kernUpdateVelNeighborSearchScattered<<< fullBlocksPerGrid, blockSize >>>(
+ numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+ dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+
+ // - Update positions
+ kernUpdatePos<<< fullBlocksPerGrid, blockSize >>>(numObjects, dt, dev_pos, dev_vel2);
+ // - Ping-pong buffers as needed
+ std::swap(dev_vel1, dev_vel2);
+
}
void Boids::stepSimulationCoherentGrid(float dt) {
- // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
- // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
- // In Parallel:
- // - Label each particle with its array index as well as its grid index.
- // Use 2x width grids
- // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
- // are welcome to do a performance comparison.
- // - Naively unroll the loop for finding the start and end indices of each
- // cell's data pointers in the array of boid indices
- // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
- // the particle data in the simulation array.
- // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
- // - Perform velocity updates using neighbor search
- // - Update positions
- // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+ // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
+ // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
+ // In Parallel:
+ // - Label each particle with its array index as well as its grid index.
+ // Use 2x width grids
+ dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+ // The cell start/end buffers are gridCellCount long, so reset every cell, not just numObjects entries.
+ dim3 cellBlocksPerGrid((gridCellCount + blockSize - 1) / blockSize);
+ kernResetIntBuffer <<< cellBlocksPerGrid, blockSize >>>(gridCellCount, dev_gridCellStartIndices, -1);
+ kernResetIntBuffer <<< cellBlocksPerGrid, blockSize >>>(gridCellCount, dev_gridCellEndIndices, -1);
+ kernComputeIndices <<< fullBlocksPerGrid, blockSize >>>(
+ numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+ dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+ dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+ // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
+ // are welcome to do a performance comparison.
+ thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+ // - Naively unroll the loop for finding the start and end indices of each
+ // cell's data pointers in the array of boid indices
+ kernIdentifyCellStartEnd <<< fullBlocksPerGrid, blockSize >>>(
+ numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+ // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
+ // the particle data in the simulation array.
+ // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
+ kernRearrangeArray<<< fullBlocksPerGrid, blockSize >>>(numObjects, dev_particleArrayIndices, dev_pos, dev_vel1, c_pos, c_vel);
+ // - Perform velocity updates using neighbor search
+ kernUpdateVelNeighborSearchCoherent<<< fullBlocksPerGrid, blockSize >>>(
+ numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+ dev_gridCellStartIndices, dev_gridCellEndIndices, c_pos, c_vel, dev_vel2);
+ // - Update positions
+ kernUpdatePos<<< fullBlocksPerGrid, blockSize >>>(numObjects, dt, c_pos, dev_vel2);
+ // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+ std::swap(dev_vel1, dev_vel2);
+ std::swap(dev_pos, c_pos);
}
void Boids::endSimulation() {
- cudaFree(dev_vel1);
- cudaFree(dev_vel2);
- cudaFree(dev_pos);
-
- // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+ cudaFree(dev_vel1);
+ cudaFree(dev_vel2);
+ cudaFree(dev_pos);
+
+ // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+ cudaFree(dev_gridCellEndIndices);
+ cudaFree(dev_gridCellStartIndices);
+ cudaFree(dev_particleArrayIndices);
+ cudaFree(dev_particleGridIndices);
+ cudaFree(c_pos);
+ cudaFree(c_vel);
}
void Boids::unitTest() {
- // LOOK-1.2 Feel free to write additional tests here.
-
- // test unstable sort
- int *dev_intKeys;
- int *dev_intValues;
- int N = 10;
-
- std::unique_ptr<int[]> intKeys{ new int[N] };
- std::unique_ptr<int[]> intValues{ new int[N] };
-
- intKeys[0] = 0; intValues[0] = 0;
- intKeys[1] = 1; intValues[1] = 1;
- intKeys[2] = 0; intValues[2] = 2;
- intKeys[3] = 3; intValues[3] = 3;
- intKeys[4] = 0; intValues[4] = 4;
- intKeys[5] = 2; intValues[5] = 5;
- intKeys[6] = 2; intValues[6] = 6;
- intKeys[7] = 0; intValues[7] = 7;
- intKeys[8] = 5; intValues[8] = 8;
- intKeys[9] = 6; intValues[9] = 9;
-
- cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
- checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
-
- cudaMalloc((void**)&dev_intValues, N * sizeof(int));
- checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
-
- dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
-
- std::cout << "before unstable sort: " << std::endl;
- for (int i = 0; i < N; i++) {
- std::cout << " key: " << intKeys[i];
- std::cout << " value: " << intValues[i] << std::endl;
- }
-
- // How to copy data to the GPU
- cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
-
- // Wrap device vectors in thrust iterators for use with thrust.
- thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
- thrust::device_ptr<int> dev_thrust_values(dev_intValues);
- // LOOK-2.1 Example for using thrust::sort_by_key
- thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
-
- // How to copy data back to the CPU side from the GPU
- cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
- cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
- checkCUDAErrorWithLine("memcpy back failed!");
-
- std::cout << "after unstable sort: " << std::endl;
- for (int i = 0; i < N; i++) {
- std::cout << " key: " << intKeys[i];
- std::cout << " value: " << intValues[i] << std::endl;
- }
-
- // cleanup
- cudaFree(dev_intKeys);
- cudaFree(dev_intValues);
- checkCUDAErrorWithLine("cudaFree failed!");
- return;
+ // LOOK-1.2 Feel free to write additional tests here.
+
+ // test unstable sort
+ int *dev_intKeys;
+ int *dev_intValues;
+ int N = 10;
+
+ std::unique_ptr<int[]> intKeys{ new int[N] };
+ std::unique_ptr<int[]> intValues{ new int[N] };
+
+ intKeys[0] = 0; intValues[0] = 0;
+ intKeys[1] = 1; intValues[1] = 1;
+ intKeys[2] = 0; intValues[2] = 2;
+ intKeys[3] = 3; intValues[3] = 3;
+ intKeys[4] = 0; intValues[4] = 4;
+ intKeys[5] = 2; intValues[5] = 5;
+ intKeys[6] = 2; intValues[6] = 6;
+ intKeys[7] = 0; intValues[7] = 7;
+ intKeys[8] = 5; intValues[8] = 8;
+ intKeys[9] = 6; intValues[9] = 9;
+
+ cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
+
+ cudaMalloc((void**)&dev_intValues, N * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
+
+ dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+
+ std::cout << "before unstable sort: " << std::endl;
+ for (int i = 0; i < N; i++) {
+ std::cout << " key: " << intKeys[i];
+ std::cout << " value: " << intValues[i] << std::endl;
+ }
+
+ // How to copy data to the GPU
+ cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
+
+ // Wrap device vectors in thrust iterators for use with thrust.
+ thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
+ thrust::device_ptr<int> dev_thrust_values(dev_intValues);
+ // LOOK-2.1 Example for using thrust::sort_by_key
+ thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
+
+ // How to copy data back to the CPU side from the GPU
+ cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
+ cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
+ checkCUDAErrorWithLine("memcpy back failed!");
+
+ std::cout << "after unstable sort: " << std::endl;
+ for (int i = 0; i < N; i++) {
+ std::cout << " key: " << intKeys[i];
+ std::cout << " value: " << intValues[i] << std::endl;
+ }
+
+ // cleanup
+ cudaFree(dev_intKeys);
+ cudaFree(dev_intValues);
+ checkCUDAErrorWithLine("cudaFree failed!");
+ return;
}
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..ddd0e3b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,8 +14,8 @@
// LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
#define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
// LOOK-1.2 - change this to adjust particle count in the simulation
const int N_FOR_VIS = 5000;