diff --git a/README.md b/README.md index d63a6a1..1c162e4 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,27 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Yichen Shou + * [LinkedIn](https://www.linkedin.com/in/yichen-shou-68023455/), [personal website](http://www.yichenshou.com/) +* Tested on: Windows 10, i7-6500U @ 2.50GHz 12GB RAM, GeForce 940M 8GB (Personal Laptop) -### (TODO: Your README) +## Project Overview +This project is a flocking simulation that implements the [Reynolds Boids algorithm](http://www.vergenet.net/~conrad/boids/pseudocode.html). CUDA is used to increase simulation efficiency. I tested three different implementations and analyzed their performance. -Include screenshots, analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +![](images/Boids.gif) + +## Performance Analysis +### Boids vs. Performance +![](images/boidNumber.PNG) +* Performance for the naive implementation decreases as expected as the boid number increases, since there are simply more boids to check. +* Performance shows an interesting dip for both Scatter and Coherent grids as the number of simulated boids increases. Framerate drops past the 1000 boid mark, and doesn't come back up until past 9000. My theory is that in the middle range, there are a lot of individual boid groups, not close enough to form big groups but not small enough to be trivial, that end up slowing down the framerate. + +### Blocks vs. Performance +![](images/blockNumber.PNG) + +* Naive grids are not affected by block size as expected, since it always just checks every other boid. 
+* Scatter and Coherent grids actually suffer a framerate drop once block size is increased, which is a bit unexpected since increased block size usually increases performance. I suspect that this might have something to do with memory. One hypothesis is that the increased block size is causing more memory to be loaded than needed, thus wasting time copying memory. + +### Grids vs. Performance +* Coherent performs faster than Scatter, as expected due to the improved algorithm. Compared to the scatter grid, the coherent grid does a little more preprocessing (using a buffer array to reorder data), but yields a good chunk of performance increase especially at high boid numbers. +* Surprisingly, changing the cell width and checking 27 as opposed to 8 neighboring cells did not affect performance all that much. One would expect that as there are more cells to check, it would be slower, but the result was not so. My theory is that the process of checking boid distance turned out to be quite trivial, and it is actually calculating the rules' effects that take time. Thus simply checking with more boids does not affect the framerate too much. 
\ No newline at end of file diff --git a/images/Boids.gif b/images/Boids.gif new file mode 100644 index 0000000..c1acdce Binary files /dev/null and b/images/Boids.gif differ diff --git a/images/blockNumber.PNG b/images/blockNumber.PNG new file mode 100644 index 0000000..362101b Binary files /dev/null and b/images/blockNumber.PNG differ diff --git a/images/boidNumber.PNG b/images/boidNumber.PNG new file mode 100644 index 0000000..12e2e57 Binary files /dev/null and b/images/boidNumber.PNG differ diff --git a/performanceAnalysis.xlsx b/performanceAnalysis.xlsx new file mode 100644 index 0000000..b602929 Binary files /dev/null and b/performanceAnalysis.xlsx differ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fdd636d..dff0113 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,5 +10,5 @@ set(SOURCE_FILES cuda_add_library(src ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_50 ) diff --git a/src/kernel.cu b/src/kernel.cu index 74dffcb..bd5027e 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -37,7 +37,7 @@ void checkCUDAError(const char *msg, int line = -1) { *****************/ /*! Block size used for CUDA kernel launch. */ -#define blockSize 128 +#define blockSize 1024 // LOOK-1.2 Parameters for the boids algorithm. // These worked well in our reference implementation. @@ -47,7 +47,7 @@ void checkCUDAError(const char *msg, int line = -1) { #define rule1Scale 0.01f #define rule2Scale 0.1f -#define rule3Scale 0.1f +#define rule3Scale 0.2f #define maxSpeed 1.0f @@ -85,6 +85,7 @@ int *dev_gridCellEndIndices; // to this cell? // TODO-2.3 - consider what additional buffers you might need to reshuffle // the position and velocity data to be coherent within cells. +glm::vec3 *dev_buffer; // LOOK-2.1 - Grid parameters based on simulation parameters. 
// These are automatically computed for you in Boids::initSimulation @@ -157,7 +158,7 @@ void Boids::initSimulation(int N) { checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!"); // LOOK-2.1 computing grid params - gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); + gridCellWidth = 2.0 * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); int halfSideCount = (int)(scene_scale / gridCellWidth) + 1; gridSideCount = 2 * halfSideCount; @@ -169,6 +170,24 @@ void Boids::initSimulation(int N) { gridMinimum.z -= halfGridWidth; // TODO-2.1 TODO-2.3 - Allocate additional buffers here. + cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + + cudaMalloc((void**)&dev_buffer, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_buffer failed!"); + + dev_thrust_particleGridIndices = thrust::device_ptr(dev_particleGridIndices); + dev_thrust_particleArrayIndices = thrust::device_ptr(dev_particleArrayIndices); + cudaDeviceSynchronize(); } @@ -230,10 +249,53 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) * in the `pos` and `vel` arrays. 
*/ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { - // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves - // Rule 2: boids try to stay a distance d away from each other - // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + glm::vec3 center(0.0f, 0.0f, 0.0f); + int centerNeighbors = 0; + + // Rule 2: boids try to stay a distance d away from each other + glm::vec3 separation(0.0f, 0.0f, 0.0f); + + // Rule 3: boids try to match the speed of surrounding boids + glm::vec3 cohesion(0.0f, 0.0f, 0.0f); + int cohesionNeighbors = 0; + + glm::vec3 myPos = pos[iSelf]; + glm::vec3 myVel = vel[iSelf]; + + for (int i = 0; i < N; i++) { + if (i != iSelf) { + glm::vec3 otherPos = pos[i]; + glm::vec3 otherVel = vel[i]; + + float distance = glm::length(myPos - otherPos); + + if (distance < rule1Distance) { + center += otherPos; + centerNeighbors++; + } + if (distance < rule2Distance) { + separation -= (otherPos - myPos); + } + if (distance < rule3Distance) { + cohesion += otherVel; + cohesionNeighbors++; + } + } + } + + if (centerNeighbors > 0) { + center /= centerNeighbors; + myVel += (center - myPos) * rule1Scale; + } + if (cohesionNeighbors > 0) { + cohesion /= cohesionNeighbors; + myVel += cohesion * rule3Scale; + } + + myVel += separation * rule2Scale; + + return myVel; } /** @@ -241,12 +303,35 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po * For each of the `N` bodies, update its position based on its current velocity. */ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, - glm::vec3 *vel1, glm::vec3 *vel2) { - // Compute a new velocity based on pos and vel1 - // Clamp the speed - // Record the new velocity into vel2. Question: why NOT vel1? 
+ glm::vec3 *vel1, glm::vec3 *vel2) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + // Compute a new velocity based on pos and vel1 + glm::vec3 newVelocity = computeVelocityChange(N, index, pos, vel1); + + // Clamp the speed + float speed = glm::length(newVelocity); + if (speed > maxSpeed) { + newVelocity = newVelocity / speed * maxSpeed; + } + + // Record the new velocity into vel2. not vel1 b/c we're using ping-pong buffers + vel2[index] = newVelocity; + } +} + + +/** +* Updates vel2 into vel1 +*/ +__global__ void kernPingPongVelocity(int N, glm::vec3 *vel1, glm::vec3 *vel2) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + vel1[index] = vel2[index]; + } } + /** * LOOK-1.2 Since this is pretty trivial, we implemented it for you. * For each of the `N` bodies, update its position based on its current velocity. @@ -277,18 +362,29 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) { // order for iterating over neighboring grid cells? // for(x) // for(y) -// for(z)? Or some other order? +// for(z)? Or some other order? zyx to take advantage of cache lines __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) { - return x + y * gridResolution + z * gridResolution * gridResolution; + return x + y * gridResolution + z * gridResolution * gridResolution;; } __global__ void kernComputeIndices(int N, int gridResolution, glm::vec3 gridMin, float inverseCellWidth, glm::vec3 *pos, int *indices, int *gridIndices) { // TODO-2.1 - // - Label each boid with the index of its grid cell. - // - Set up a parallel array of integer indices as pointers to the actual - // boid data in pos and vel1/vel2 + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + // - Label each boid with the index of its grid cell. 
+ int x = (int)((pos[index].x - gridMin.x) * inverseCellWidth); + int y = (int)((pos[index].y - gridMin.y) * inverseCellWidth); + int z = (int)((pos[index].z - gridMin.z) * inverseCellWidth); + int gridIndex = gridIndex3Dto1D(x, y, z, gridResolution); + //printf("input is %f, %f, %f, invW is %f, xyz is %d, %d, %d, output is %d\n", pos[index].x, pos[index].y, pos[index].z, inverseCellWidth, x, y, z, gridIndex); + gridIndices[index] = gridIndex; + + // - Set up a parallel array of integer indices as pointers to the actual + // boid data in pos and vel1/vel2 + indices[index] = index; + } } // LOOK-2.1 Consider how this could be useful for indicating that a cell @@ -302,86 +398,333 @@ __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) { __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, int *gridCellStartIndices, int *gridCellEndIndices) { - // TODO-2.1 - // Identify the start point of each cell in the gridIndices array. - // This is basically a parallel unrolling of a loop that goes - // "this index doesn't match the one before it, must be a new cell!" + // TODO-2.1 + // Identify the start point of each cell in the gridIndices array. + // This is basically a parallel unrolling of a loop that goes + // "this index doesn't match the one before it, must be a new cell!" 
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index < N) { + int gridIndex = particleGridIndices[index]; + + if (index == 0 || gridIndex != particleGridIndices[index - 1]) { + gridCellStartIndices[gridIndex] = index; + if (gridCellEndIndices[gridIndex] == -1) { + gridCellEndIndices[gridIndex] = index; + } + } + else if (gridIndex == particleGridIndices[index - 1]) { + gridCellEndIndices[gridIndex] = imax(index, gridCellEndIndices[gridIndex]); + } + } } +__global__ void kernReorderAndCopyData(int N, int *particleArrayIndices, glm::vec3* original, glm::vec3* target) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index < N) { + int gridIndex = particleArrayIndices[index]; + target[index] = original[gridIndex]; + } +} + + __global__ void kernUpdateVelNeighborSearchScattered( int N, int gridResolution, glm::vec3 gridMin, float inverseCellWidth, float cellWidth, - int *gridCellStartIndices, int *gridCellEndIndices, - int *particleArrayIndices, + const int *gridCellStartIndices, const int *gridCellEndIndices, + const int *particleArrayIndices, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce - // the number of boids that need to be checked. - // - Identify the grid cell that this particle is in - // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. - // - Clamp the speed change before putting the new speed in vel2 + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index < N) { + // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce the number of boids that need to be checked. 
+ // - Identify the grid cell that this particle is in + glm::vec3 myPos = pos[index]; + glm::vec3 myVel = vel1[index]; + int x = (int)((myPos.x - gridMin.x) * inverseCellWidth); + int y = (int)((myPos.y - gridMin.y) * inverseCellWidth); + int z = (int)((myPos.z - gridMin.z) * inverseCellWidth); + int myCellIndex = gridIndex3Dto1D(x, y, z, gridResolution); + + // - Identify which cells (including this cell) may contain neighbors. This isn't always 9. + int cellsToCheck[9]; + for (int i = 0; i < 9; i++) { + cellsToCheck[i] = -1; // setting everything to -1 for sanity check + } + + cellsToCheck[0] = myCellIndex; + int neighborSize = 1; + if (x > 0) cellsToCheck[neighborSize++] = gridIndex3Dto1D(x - 1, y, z, gridResolution); + if (x < gridResolution - 1) cellsToCheck[neighborSize++] = gridIndex3Dto1D(x + 1, y, z, gridResolution); + if (y > 0) cellsToCheck[neighborSize++] = gridIndex3Dto1D(x, y - 1, z, gridResolution); + if (y < gridResolution - 1) cellsToCheck[neighborSize++] = gridIndex3Dto1D(x, y + 1, z, gridResolution); + if (z > 0) cellsToCheck[neighborSize++] = gridIndex3Dto1D(x, y, z - 1, gridResolution); + if (z < gridResolution - 1) cellsToCheck[neighborSize++] = gridIndex3Dto1D(x, y, z + 1, gridResolution); + + // set up variables + glm::vec3 center(0.0f, 0.0f, 0.0f); // rule 1 + int centerNeighbors = 0; + glm::vec3 separation(0.0f, 0.0f, 0.0f); // rule 2 + glm::vec3 cohesion(0.0f, 0.0f, 0.0f); // rule 3 + int cohesionNeighbors = 0; + + // find the 3 rules' forces by checking its neighbor cells for neighbor boids + for (int i = 0; i < neighborSize; i++) { + // - For each cell, read the start/end indices in the boid pointer array. + int currentCell = cellsToCheck[i]; + if (currentCell == -1) continue; // snaity check + + int startIndex = gridCellStartIndices[currentCell]; + int endIndex = gridCellEndIndices[currentCell]; + + // - Access each boid in the cell and compute velocity change from the boids rules, if this boid is within the neighborhood distance. 
+ for (int j = startIndex; j <= endIndex; j++) { + if (j == -1) continue; // just to make sure te index is valid + + int otherIndex = particleArrayIndices[j]; + if (otherIndex != -1 && otherIndex != index ) { + glm::vec3 otherPos = pos[otherIndex]; + glm::vec3 otherVel = vel1[otherIndex]; + + float distance = glm::length(myPos - otherPos); + + if (distance < rule1Distance) { + center += otherPos; + centerNeighbors++; + } + if (distance < rule2Distance) { + separation -= (otherPos - myPos); + } + if (distance < rule3Distance) { + cohesion += otherVel; + cohesionNeighbors++; + } + } + } + } + + if (centerNeighbors > 0) { + center /= centerNeighbors; + myVel += (center - myPos) * rule1Scale; + } + if (cohesionNeighbors > 0) { + cohesion /= cohesionNeighbors; + myVel += cohesion * rule3Scale; + } + + myVel += separation * rule2Scale; + + // - Clamp the speed change before putting the new speed in vel2 + float speed = glm::length(myVel); + if (speed > maxSpeed) { + myVel = glm::normalize(myVel) * maxSpeed; + } + + vel2[index] = myVel; + } } __global__ void kernUpdateVelNeighborSearchCoherent( - int N, int gridResolution, glm::vec3 gridMin, - float inverseCellWidth, float cellWidth, - int *gridCellStartIndices, int *gridCellEndIndices, - glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, - // except with one less level of indirection. - // This should expect gridCellStartIndices and gridCellEndIndices to refer - // directly to pos and vel1. - // - Identify the grid cell that this particle is in - // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. - // DIFFERENCE: For best results, consider what order the cells should be - // checked in to maximize the memory benefits of reordering the boids data. 
- // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. - // - Clamp the speed change before putting the new speed in vel2 + int N, int gridResolution, glm::vec3 gridMin, + float inverseCellWidth, float cellWidth, + int *gridCellStartIndices, int *gridCellEndIndices, + glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { + // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, + // except with one less level of indirection. + // This should expect gridCellStartIndices and gridCellEndIndices to refer + // directly to pos and vel1. + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index < N) { + // - Identify the grid cell that this particle is in + glm::vec3 myPos = pos[index]; + glm::vec3 myVel = vel1[index]; + glm::vec3 newVel(0.0f, 0.0f, 0.0f); + int x = (int)((myPos.x - gridMin.x) * inverseCellWidth); + int y = (int)((myPos.y - gridMin.y) * inverseCellWidth); + int z = (int)((myPos.z - gridMin.z) * inverseCellWidth); + int myCellIndex = gridIndex3Dto1D(x, y, z, gridResolution); + + // - Identify which cells (including this cell) may contain neighbors. This isn't always 9. 
+ int cellsToCheck[9]; + for (int i = 0; i < 9; i++) { + cellsToCheck[i] = -1; // setting everything to -1 for sanity check + } + + cellsToCheck[0] = myCellIndex; + int neighborSize = 1; + if (x > 0) cellsToCheck[neighborSize++] = gridIndex3Dto1D(x - 1, y, z, gridResolution); + if (x < gridResolution - 1) cellsToCheck[neighborSize++] = gridIndex3Dto1D(x + 1, y, z, gridResolution); + if (y > 0) cellsToCheck[neighborSize++] = gridIndex3Dto1D(x, y - 1, z, gridResolution); + if (y < gridResolution - 1) cellsToCheck[neighborSize++] = gridIndex3Dto1D(x, y + 1, z, gridResolution); + if (z > 0) cellsToCheck[neighborSize++] = gridIndex3Dto1D(x, y, z - 1, gridResolution); + if (z < gridResolution - 1) cellsToCheck[neighborSize++] = gridIndex3Dto1D(x, y, z + 1, gridResolution); + + // set up variables + glm::vec3 center(0.0f, 0.0f, 0.0f); // rule 1 + int centerNeighbors = 0; + glm::vec3 separation(0.0f, 0.0f, 0.0f); // rule 2 + glm::vec3 cohesion(0.0f, 0.0f, 0.0f); // rule 3 + int cohesionNeighbors = 0; + + // - For each cell, read the start/end indices in the boid pointer array. + // DIFFERENCE: For best results, consider what order the cells should be + // checked in to maximize the memory benefits of reordering the boids data. + for (int i = 0; i < neighborSize; i++) { + int currentCell = cellsToCheck[i]; + if (currentCell == -1) continue; // snaity check + + int startIndex = gridCellStartIndices[currentCell]; + int endIndex = gridCellEndIndices[currentCell]; + + // - Access each boid in the cell and compute velocity change from the boids rules, if this boid is within the neighborhood distance. 
+ for (int j = startIndex; j <= endIndex; j++) { + if (j == -1 || j == index) continue; // just to make sure te index is valid and not the same as this bod + + glm::vec3 otherPos = pos[j]; + glm::vec3 otherVel = vel1[j]; + + float distance = glm::length(myPos - otherPos); + + if (distance < rule1Distance) { + center += otherPos; + centerNeighbors++; + } + if (distance < rule2Distance) { + separation -= (otherPos - myPos); + } + if (distance < rule3Distance) { + cohesion += otherVel; + cohesionNeighbors++; + } + } // end startIndex to endIndex for loop + } // end check each neighbor cell for loop + + if (centerNeighbors > 0) { + center /= centerNeighbors; + myVel += (center - myPos) * rule1Scale; + } + if (cohesionNeighbors > 0) { + cohesion /= cohesionNeighbors; + myVel += cohesion * rule3Scale; + } + + myVel += separation * rule2Scale; + + // - Clamp the speed change before putting the new speed in vel2 + float speed = glm::length(myVel); + if (speed > maxSpeed) { + myVel = glm::normalize(myVel) * maxSpeed; + } + + vel2[index] = myVel; + } // end if (index < N) } /** * Step the entire N-body simulation by `dt` seconds. */ void Boids::stepSimulationNaive(float dt) { - // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. - // TODO-1.2 ping-pong the velocity buffers + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. 
+ kernUpdatePos << > > (numObjects, dt, dev_pos, dev_vel1); + checkCUDAErrorWithLine("kernUpdatePos failed!"); + + // calculate vel2 + kernUpdateVelocityBruteForce << > > (numObjects, dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!"); + + // TODO-1.2 ping-pong the velocity buffers + kernPingPongVelocity << > > (numObjects, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernPingPongVelocity failed!"); + } void Boids::stepSimulationScatteredGrid(float dt) { - // TODO-2.1 - // Uniform Grid Neighbor search using Thrust sort. - // In Parallel: - // - label each particle with its array index as well as its grid index. - // Use 2x width grids. - // - Unstable key sort using Thrust. A stable sort isn't necessary, but you - // are welcome to do a performance comparison. - // - Naively unroll the loop for finding the start and end indices of each - // cell's data pointers in the array of boid indices - // - Perform velocity updates using neighbor search - // - Update positions - // - Ping-pong buffers as needed + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + // TODO-2.1: Uniform Grid Neighbor search using Thrust sort in Parallel: + // - label each particle with its array index as well as its grid index. Use 2x width grids. + kernComputeIndices << < fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, + gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); + + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you are welcome to do a performance comparison. 
+ thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + + // - Naively unroll the loop for finding the start and end indices of each cell's data pointers in the array of boid indices + kernResetIntBuffer << < fullBlocksPerGrid, blockSize >> > (gridCellCount, dev_gridCellStartIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer for start indices failed!"); + kernResetIntBuffer << < fullBlocksPerGrid, blockSize >> > (gridCellCount, dev_gridCellEndIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer for end indices failed!"); + kernIdentifyCellStartEnd << < fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleGridIndices, + dev_gridCellStartIndices, dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); + + // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchScattered << < fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, + gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, + dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!"); + + // - Update positions + kernUpdatePos << < fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel1); + checkCUDAErrorWithLine("kernUpdatePos failed!"); + + // - Ping-pong buffers as needed + kernPingPongVelocity << > > (numObjects, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernPingPongVelocity failed!"); } void Boids::stepSimulationCoherentGrid(float dt) { - // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid - // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. - // In Parallel: - // - Label each particle with its array index as well as its grid index. - // Use 2x width grids - // - Unstable key sort using Thrust. A stable sort isn't necessary, but you - // are welcome to do a performance comparison. 
- // - Naively unroll the loop for finding the start and end indices of each - // cell's data pointers in the array of boid indices - // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all - // the particle data in the simulation array. - // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED - // - Perform velocity updates using neighbor search - // - Update positions - // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + // TODO-2.3 - Uniform Grid Neighbor search using Thrust sort on cell-coherent data. In Parallel: + // - Label each particle with its array index as well as its grid index. Use 2x width grids + kernComputeIndices << < fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, + gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); + + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you are welcome to do a performance comparison. 
+ thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + + // - Naively unroll the loop for finding the start and end indices of each cell's data pointers in the array of boid indices + kernResetIntBuffer << < fullBlocksPerGrid, blockSize >> > (gridCellCount, dev_gridCellStartIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer for start indices failed!"); + kernResetIntBuffer << < fullBlocksPerGrid, blockSize >> > (gridCellCount, dev_gridCellEndIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer for end indices failed!"); + kernIdentifyCellStartEnd << < fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleGridIndices, + dev_gridCellStartIndices, dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); + + // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all the particle data in the simulation array. + // reorder and copy position + kernReorderAndCopyData << < fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleArrayIndices, dev_pos, dev_buffer); + checkCUDAErrorWithLine("kernReorderAndCopyData from pos to buffer failed!"); + std::swap(dev_pos, dev_buffer); + // reorder and copy the velocities + kernReorderAndCopyData << < fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleArrayIndices, dev_vel1, dev_buffer); // now buffer is vel1, vel1 is useless + checkCUDAErrorWithLine("kernReorderAndCopyData from vel1 to buffer failed!"); + kernReorderAndCopyData << < fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleArrayIndices, dev_vel2, dev_vel1); // now vel1 has vel2's data, vel2 is useless + checkCUDAErrorWithLine("kernReorderAndCopyData from vel1 to buffer failed!"); + std::swap(dev_vel2, dev_vel1); // now vel2 has vel2's data back, vel1 is useless + std::swap(dev_vel1, dev_buffer); // now vel1 has vel1's data back, buffer is useless + + // - Perform velocity updates using neighbor search + 
kernUpdateVelNeighborSearchCoherent << < fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, + gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, + dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelNeighborSearchCoherent failed!"); + + // - Update positions + kernUpdatePos << < fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel1); + checkCUDAErrorWithLine("kernUpdatePos failed!"); + + // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + kernPingPongVelocity << > > (numObjects, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernPingPongVelocity failed!"); } void Boids::endSimulation() { @@ -390,68 +733,81 @@ void Boids::endSimulation() { cudaFree(dev_pos); // TODO-2.1 TODO-2.3 - Free any additional buffers here. + cudaFree(dev_particleArrayIndices); + cudaFree(dev_particleGridIndices); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_gridCellEndIndices); + + cudaFree(dev_buffer); } void Boids::unitTest() { // LOOK-1.2 Feel free to write additional tests here. 
// test unstable sort - int *dev_intKeys; - int *dev_intValues; - int N = 10; - - std::unique_ptrintKeys{ new int[N] }; - std::unique_ptrintValues{ new int[N] }; - - intKeys[0] = 0; intValues[0] = 0; - intKeys[1] = 1; intValues[1] = 1; - intKeys[2] = 0; intValues[2] = 2; - intKeys[3] = 3; intValues[3] = 3; - intKeys[4] = 0; intValues[4] = 4; - intKeys[5] = 2; intValues[5] = 5; - intKeys[6] = 2; intValues[6] = 6; - intKeys[7] = 0; intValues[7] = 7; - intKeys[8] = 5; intValues[8] = 8; - intKeys[9] = 6; intValues[9] = 9; - - cudaMalloc((void**)&dev_intKeys, N * sizeof(int)); + glm::vec3 *dev_testPos; + int *dev_testGrid; + int *dev_testArray; + int N = 4; + float width = 1; + float invWidth = 1.0f / width; + int resolution = 3; + int cellCount = resolution * resolution * resolution; + + std::unique_ptrtestPos{ new glm::vec3[N] }; + std::unique_ptrtestGrid{ new int[cellCount] }; + std::unique_ptrtestArray{ new int[cellCount] }; + + + testPos[0] = glm::vec3(0, 0.2f, 1.2f); + testPos[1] = glm::vec3(0, 1.1f, 2.2f); + testPos[2] = glm::vec3(2.5f, 0.1f, 0); + testPos[3] = glm::vec3(2.5f, 2.5f, 2.5f); + + + cudaMalloc((void**)&dev_testPos, N * sizeof(glm::vec3)); checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!"); - cudaMalloc((void**)&dev_intValues, N * sizeof(int)); + cudaMalloc((void**)&dev_testGrid, cellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!"); + + cudaMalloc((void**)&dev_testArray, cellCount * sizeof(int)); checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!"); dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); - std::cout << "before unstable sort: " << std::endl; - for (int i = 0; i < N; i++) { - std::cout << " key: " << intKeys[i]; - std::cout << " value: " << intValues[i] << std::endl; + std::cout << "before compute indeces " << std::endl; + for (int i = 0; i < cellCount; i++) { + testGrid[i] = 0; + testArray[i] = 0; + + std::cout << " gridIndex: " << testGrid[i]; + std::cout << " arrayIndex: " << 
testArray[i] << std::endl; } // How to copy data to the GPU - cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice); - cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice); + cudaMemcpy(dev_testPos, testPos.get(), sizeof(glm::vec3) * N, cudaMemcpyHostToDevice); + cudaMemcpy(dev_testGrid, testGrid.get(), sizeof(int) * cellCount, cudaMemcpyHostToDevice); + cudaMemcpy(dev_testArray, testArray.get(), sizeof(int) * cellCount, cudaMemcpyHostToDevice); - // Wrap device vectors in thrust iterators for use with thrust. - thrust::device_ptr dev_thrust_keys(dev_intKeys); - thrust::device_ptr dev_thrust_values(dev_intValues); - // LOOK-2.1 Example for using thrust::sort_by_key - thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values); + kernComputeIndices << < fullBlocksPerGrid, blockSize >> > (N, resolution, + glm::vec3(0,0,0), invWidth, dev_testPos, dev_testArray, dev_testGrid); // How to copy data back to the CPU side from the GPU - cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost); - cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost); + cudaMemcpy(testGrid.get(), dev_testGrid, sizeof(int) * cellCount, cudaMemcpyDeviceToHost); + cudaMemcpy(testArray.get(), dev_testArray, sizeof(int) * cellCount, cudaMemcpyDeviceToHost); checkCUDAErrorWithLine("memcpy back failed!"); - std::cout << "after unstable sort: " << std::endl; - for (int i = 0; i < N; i++) { - std::cout << " key: " << intKeys[i]; - std::cout << " value: " << intValues[i] << std::endl; + std::cout << "\nafter compute indeces " << std::endl; + for (int i = 0; i < cellCount; i++) { + std::cout << " gridIndex: " << testGrid[i]; + std::cout << " arrayIndex: " << testArray[i] << std::endl; } // cleanup - cudaFree(dev_intKeys); - cudaFree(dev_intValues); + cudaFree(dev_testPos); + cudaFree(dev_testGrid); + cudaFree(dev_testArray); checkCUDAErrorWithLine("cudaFree failed!"); return; } 
diff --git a/src/main.cpp b/src/main.cpp index b82c8c6..ddd0e3b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,8 +14,8 @@ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID #define VISUALIZE 1 -#define UNIFORM_GRID 0 -#define COHERENT_GRID 0 +#define UNIFORM_GRID 1 +#define COHERENT_GRID 1 // LOOK-1.2 - change this to adjust particle count in the simulation const int N_FOR_VIS = 5000;