diff --git a/README.md b/README.md
index d63a6a1..8f0c2cf 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,49 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Yu Sun
+* [LinkedIn](https://www.linkedin.com/in/yusun3/)
+* Tested on: Windows 10 , i7-6700HQ CPU @ 2.60GHz × 8 , GeForce GTX 960M/PCIe/SSE2, 7.7GB Memory (Personal Laptop)
 
-### (TODO: Your README)
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+Below is a demo run of the flocking algorithm @ 10000 boids and 128 as the blocksize
+![](images/demo.gif)
+
+The main algorithm behind is fairly straight-forward, the crowd of boids will follow three rules:
+* The boids should move toward the perceived center of mass of other boids around
+* The boids will move in the direction of the perceived average movement of other boids around
+* The boids will keep some distance between each other and other groups
+
+## Performance Analysis (Number of Bloids)
+There are three implements done in this project, the naive solution checks for all the other boid in the system so one can
+easily tell this is an O(N^2) algorithm, and not utilizing much of GPU's computation power. The second solution separate the 
+searching areas into grids and check for only the neighoring grid where there might be influence. The third solution place the 
+index in order to eliminate the additional indirection for pointer lookup. Below shows an analysis of how these three approaches 
+perform with different number of bloids in the simulation.
+
+![](images/fps_N.png)
+
+One can easily tell the performance gain while avoiding the naive approach. This is because the check can now be done in parallel and finished much faster
+by the GPU. While for the naive solutions, every thread have to do the same computation and wait for others to finish. Generally speaking, the coherent approach 
+perform better than the scatter one, this is expected because by sorting the index essentially each thread has more likelyhood of accessing continuous blocks of
+memory and utilizing caches. As I would expect if I changed to code to check a 27-cell neighborhood the gain of coherent approach is even more obvious since there 
+are more memory locations that needs to be accessed, which are in general scattered in different places while using the scattered approach.
+
+Also if the number of bloids keep growing, the naive approach would fail quickly because of the way it depletes the hardware capabilities of the GPU.
+
+## Performance Analysis (Block Size)
+
+Another question that would be interesting to ask is how changing the blocksize would affect the computation performance. Below shows how blocksize changes 
+affect the overall performance. 
+![](images/fps_blocksize.png)
+
+One needs to do some experiments when selecting the optimal blocksize because different code and hardware would lead to different optimal blocksizes. On my laptop I found that the performance first increase and then decrease with increasing number of blocksize. If the blocksize is too small, the gain of parallel computation(number of active blocks) is not maximized, while if the blocksize is too big the processor won't have enough resources to handle that many threads.
+
+## Performance Anaylsis (Cell Size)
+By changing the number of neighbors checked from 8 to 27 with 10000 boids and 128 block size, the fps is reduced from 1300 to 450 for scattered approach, 
+and 1350 to 900 for coherent approach. The lowered performance is expected as you need to check more neighbours and therefore you need to do more memory accesses. 
+However, notice that the number of neighbors being check is tripled while the performance for coherent approach is not even cut off by half. This is because neighoring 
+cell checks still means there is large probability that the content would be located in the shared memory so the lookup can be quick. Depending on the radius that
+the programmer set the algorithm to check and how the memory are accessed. Increasing number of neighbors to check doesn't necessarily means the performance would 
+reduce a lot.
+
diff --git a/images/demo.gif b/images/demo.gif
new file mode 100644
index 0000000..e61305b
Binary files /dev/null and b/images/demo.gif differ
diff --git a/images/fps_N.png b/images/fps_N.png
new file mode 100644
index 0000000..ea29b9a
Binary files /dev/null and b/images/fps_N.png differ
diff --git a/images/fps_blocksize.png b/images/fps_blocksize.png
new file mode 100644
index 0000000..0277b12
Binary files /dev/null and b/images/fps_blocksize.png differ
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..9d3fb53 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -85,6 +85,8 @@ int *dev_gridCellEndIndices;   // to this cell?
 
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3 *dev_coherent_pos;
+glm::vec3 *dev_coherent_vel;
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -169,6 +171,24 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed");
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed");
+
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices faield");
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed");
+
+  cudaMalloc((void**)&dev_coherent_vel, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_coherent_vel failed");
+
+  cudaMalloc((void**)&dev_coherent_pos, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_coherent_pos failed");
+
   cudaDeviceSynchronize();
 }
 
@@ -233,7 +253,48 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
   // Rule 2: boids try to stay a distance d away from each other
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 perceived_center = glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 c = glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 perceived_vel = glm::vec3(0.0f, 0.0f, 0.0f);
+  float distance = 0.0f;
+  int neighborCount1 = 0;
+  int neighborCount3 = 0; 
+
+  for (int i = 0; i < N; i++){
+    if (i != iSelf ){
+      distance = glm::distance(pos[i], pos[iSelf]);
+      // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+      if (distance < rule1Distance){
+        perceived_center += pos[i];
+        ++neighborCount1;
+      }
+      // Rule 2: boids try to stay a distance d away from each other
+      if (distance < rule2Distance){
+          c -= (pos[i]- pos[iSelf]);
+      }
+      // Rule 3: boids try to match the speed of surrounding boids
+      if (distance < rule3Distance){
+        perceived_vel += vel[i];
+        ++neighborCount3;
+      }
+      
+    }
+  }
+
+  glm::vec3 dVel = glm::vec3(0.0f, 0.0f, 0.0f);
+  if (neighborCount1 > 0){
+    perceived_center /= neighborCount1;
+    dVel += (perceived_center - pos[iSelf]) * rule1Scale;
+  }
+
+  dVel += c * rule2Scale;
+
+  if (neighborCount3 > 0){
+    perceived_vel /= neighborCount3;
+    dVel += perceived_vel * rule3Scale;
+  }
+
+  return dVel;
 }
 
 /**
@@ -245,6 +306,17 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
   // Compute a new velocity based on pos and vel1
   // Clamp the speed
   // Record the new velocity into vel2. Question: why NOT vel1?
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N){
+    return;
+  }
+  glm::vec3 newVel = vel1[index] + computeVelocityChange(N, index, pos, vel1);
+
+  if (glm::length(newVel) > maxSpeed ){
+    newVel = maxSpeed * glm::normalize(newVel);
+  }
+
+  vel2[index] = newVel;
 }
 
 /**
@@ -289,6 +361,12 @@ __global__ void kernComputeIndices(int N, int gridResolution,
     // - Label each boid with the index of its grid cell.
     // - Set up a parallel array of integer indices as pointers to the actual
     //   boid data in pos and vel1/vel2
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if(index < N){
+    glm::ivec3 boid_i = (pos[index] - gridMin) * inverseCellWidth;
+    gridIndices[index] = gridIndex3Dto1D(boid_i.x, boid_i.y, boid_i.z, gridResolution);
+    indices[index] = index;
+  }
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +384,16 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) return;
+  // corner cases
+  if (index == 0) gridCellStartIndices[particleGridIndices[index]] = index;
+  else if (index == N - 1) gridCellEndIndices[particleGridIndices[index]] = index;
+
+  else if (particleGridIndices[index] != particleGridIndices[index + 1]){
+    gridCellEndIndices[particleGridIndices[index]] = index;
+    gridCellStartIndices[particleGridIndices[index + 1]] = index + 1;
+  }
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
@@ -317,11 +405,76 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
   // the number of boids that need to be checked.
   // - Identify the grid cell that this particle is in
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) return;
+  // placeholders for flock rule
+  glm::vec3 perceived_center = glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 c = glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 perceived_vel = glm::vec3(0.0f, 0.0f, 0.0f);
+  float distance = 0.0f;
+  int neighborCount1 = 0;
+  int neighborCount3 = 0; 
+
+  glm::ivec3 boid_i = (pos[index] - gridMin) * inverseCellWidth;
   // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+  for (int i = boid_i.z - 1; i <= boid_i.z + 1; i++){
+    for (int j = boid_i.y - 1; j <= boid_i.y + 1; j++){
+      for (int k = boid_i.x - 1; k <= boid_i.x + 1; k++){
+        int _x = imax(k, 0);
+        int _y = imax(j, 0);
+        int _z = imax(i, 0);
+        _x = imin(_x, gridResolution - 1);
+        _y = imin(_y, gridResolution - 1);
+        _z = imin(_z, gridResolution - 1);
+        int i_flatten = gridIndex3Dto1D(_x, _y, _z, gridResolution);
+        if (gridCellStartIndices[i_flatten] != -1 && gridCellEndIndices[i_flatten] !=-1){
+          // - For each cell, read the start/end indices in the boid pointer array.
+          // - Access each boid in the cell and compute velocity change from
+          for (int s = gridCellStartIndices[i_flatten]; s <= gridCellEndIndices[i_flatten]; s++){
+            if (particleArrayIndices[s] != index){
+              distance = glm::distance(pos[particleArrayIndices[s]], pos[index]);
+              //   the boids rules, if this boid is within the neighborhood distance.
+              // - Clamp the speed change before putting the new speed in vel2
+              if (distance < rule1Distance){
+                perceived_center += pos[particleArrayIndices[s]];
+                neighborCount1++;
+              }
+              if (distance < rule2Distance) c -= (pos[particleArrayIndices[s]] - pos[index]);
+              if (distance < rule3Distance){
+                perceived_vel += vel1[particleArrayIndices[s]];
+                neighborCount3++;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  glm::vec3 newVel = vel1[index];
+  if (neighborCount1 > 0){
+    perceived_center /= neighborCount1;
+    newVel += (perceived_center - pos[index]) * rule1Scale;
+  }
+
+  newVel += c * rule2Scale;
+
+  if (neighborCount3 > 0){
+    perceived_vel /= neighborCount3;
+    newVel += perceived_vel * rule3Scale;
+  }
+
+  if (glm::length(newVel) > maxSpeed) newVel = glm::normalize(newVel) * maxSpeed;
+
+  vel2[index] = newVel;
+}
+
+__global__ void kernGetCoherentVal(int N, int *particleArrayIndices,
+  glm::vec3 *pos, glm::vec3 *vel, glm::vec3 *coherentPos, glm::vec3 *coherentVel) {
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) return;
+  coherentPos[index] = pos[particleArrayIndices[index]];
+  coherentVel[index] = vel[particleArrayIndices[index]]; 
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
@@ -329,18 +482,74 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   float inverseCellWidth, float cellWidth,
   int *gridCellStartIndices, int *gridCellEndIndices,
   glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
+    // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
   // except with one less level of indirection.
   // This should expect gridCellStartIndices and gridCellEndIndices to refer
   // directly to pos and vel1.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  //   DIFFERENCE: For best results, consider what order the cells should be
-  //   checked in to maximize the memory benefits of reordering the boids data.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N) return;
+    // placeholders for flock rule
+    glm::vec3 perceived_center = glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 c = glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 perceived_vel = glm::vec3(0.0f, 0.0f, 0.0f);
+    float distance = 0.0f;
+    int neighborCount1 = 0;
+    int neighborCount3 = 0; 
+
+    glm::ivec3 boid_i = (pos[index] - gridMin) * inverseCellWidth;
+    // - Identify which cells may contain neighbors. This isn't always 8.
+    //   DIFFERENCE: For best results, consider what order the cells should be
+    //   checked in to maximize the memory benefits of reordering the boids data.
+    for (int i = boid_i.z - 1; i <= boid_i.z + 1; i++){
+      for (int j = boid_i.y - 1; j <= boid_i.y + 1; j++){
+        for (int k = boid_i.x - 1; k <= boid_i.x + 1; k++){
+          int _x = imax(k, 0);
+          int _y = imax(j, 0);
+          int _z = imax(i, 0);
+          _x = imin(_x, gridResolution - 1);
+          _y = imin(_y, gridResolution - 1);
+          _z = imin(_z, gridResolution - 1);
+          int i_flatten = gridIndex3Dto1D(_x, _y, _z, gridResolution);
+          if (gridCellStartIndices[i_flatten] != -1){
+            // - For each cell, read the start/end indices in the boid pointer array.
+            // - Access each boid in the cell and compute velocity change from
+            for (int s = gridCellStartIndices[i_flatten]; s <= gridCellEndIndices[i_flatten]; s++){
+              if (s != index){
+                distance = glm::distance(pos[s], pos[index]);
+                //   the boids rules, if this boid is within the neighborhood distance.
+                // - Clamp the speed change before putting the new speed in vel2
+                if (distance < rule1Distance){
+                  perceived_center += pos[s];
+                  neighborCount1++;
+                }
+                if (distance < rule2Distance) c -= (pos[s] - pos[index]);
+                if (distance < rule3Distance){
+                  perceived_vel += vel1[s];
+                  neighborCount3++;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    glm::vec3 newVel = vel1[index];
+    if (neighborCount1 > 0){
+      perceived_center /= neighborCount1;
+      newVel += (perceived_center - pos[index]) * rule1Scale;
+    }
+
+    newVel += c * rule2Scale;
+
+    if (neighborCount3 > 0){
+      perceived_vel /= neighborCount3;
+      newVel += perceived_vel * rule3Scale;
+    }
+
+    if (glm::length(newVel) > maxSpeed) newVel = glm::normalize(newVel) * maxSpeed;
+
+    vel2[index] = newVel;
 }
 
 /**
@@ -349,6 +558,16 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
   // TODO-1.2 ping-pong the velocity buffers
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  // we need to update the pos based on current vel, so need at least to store one of vel/pos
+  kernUpdateVelocityBruteForce <<<fullBlocksPerGrid, blockSize >>> (numObjects, dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdateVelocityBruteForce Failed");
+
+  kernUpdatePos <<< fullBlocksPerGrid, blockSize >>> (numObjects, dt, dev_pos, dev_vel1);
+  checkCUDAErrorWithLine("kernUpdatePos Failed");
+
+  std::swap(dev_vel1, dev_vel2);
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +583,39 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+  dim3 fullBlocksPerGrid_boids((numObjects + blockSize - 1) / blockSize);
+  dim3 fullBlocksPerGrid_gridCell((gridCellCount + blockSize -1) / blockSize);
+
+  kernResetIntBuffer<<<fullBlocksPerGrid_boids, blockSize>>> (gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<fullBlocksPerGrid_boids, blockSize>>> (gridCellCount, dev_gridCellEndIndices, -1);
+  checkCUDAErrorWithLine("kernResetIntBuffer Failed");
+
+  kernComputeIndices <<<fullBlocksPerGrid_boids, blockSize>>> (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, 
+    dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+  checkCUDAErrorWithLine("kernComputeIndices Failed");
+
+  dev_thrust_particleGridIndices =  thrust::device_ptr<int>(dev_particleGridIndices);
+  dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+  // sort inplace
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  // cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
+  // cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
+  // checkCUDAErrorWithLine("memcpy back failed!");
+
+  kernIdentifyCellStartEnd <<<fullBlocksPerGrid_boids, blockSize >>> (numObjects, dev_particleGridIndices, 
+    dev_gridCellStartIndices, dev_gridCellEndIndices);
+  checkCUDAErrorWithLine("kernIdentifyCellStartEnd Failed");
+
+  kernUpdateVelNeighborSearchScattered <<< fullBlocksPerGrid_boids, blockSize >>> (numObjects, gridSideCount, gridMinimum,
+    gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices,
+    dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered Failed");
+
+  kernUpdatePos <<< fullBlocksPerGrid_boids, blockSize >>> (numObjects, dt, dev_pos, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdatePos Failed");
+
+  std::swap(dev_vel1, dev_vel2);
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,12 +634,51 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+  dim3 fullBlocksPerGrid_boids((numObjects + blockSize - 1) / blockSize);
+  dim3 fullBlocksPerGrid_gridCell((gridCellCount + blockSize -1) / blockSize);
+
+  kernResetIntBuffer<<<fullBlocksPerGrid_boids, blockSize>>> (gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<fullBlocksPerGrid_boids, blockSize>>> (gridCellCount, dev_gridCellEndIndices, -1);
+  checkCUDAErrorWithLine("kernResetIntBuffer Failed");
+
+  kernComputeIndices <<<fullBlocksPerGrid_boids, blockSize>>> (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, 
+    dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+  checkCUDAErrorWithLine("kernComputeIndices Failed");
+
+  dev_thrust_particleGridIndices =  thrust::device_ptr<int>(dev_particleGridIndices);
+  dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+  // sort inplace
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  kernIdentifyCellStartEnd <<<fullBlocksPerGrid_boids, blockSize >>> (numObjects, dev_particleGridIndices, 
+    dev_gridCellStartIndices, dev_gridCellEndIndices);
+  checkCUDAErrorWithLine("kernIdentifyCellStartEnd Failed");
+
+  kernGetCoherentVal <<<fullBlocksPerGrid_boids, blockSize >>> (numObjects, dev_particleArrayIndices, dev_pos, dev_vel1, dev_coherent_pos, dev_coherent_vel);
+  checkCUDAErrorWithLine("kernGetCoherentVal Failed");
+
+  kernUpdateVelNeighborSearchCoherent <<< fullBlocksPerGrid_boids, blockSize >>> (numObjects, gridSideCount, gridMinimum,
+    gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_coherent_pos, dev_coherent_vel, dev_vel1);
+  checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered Failed");
+
+  kernUpdatePos <<< fullBlocksPerGrid_boids, blockSize >>> (numObjects, dt, dev_coherent_pos, dev_vel1);
+  checkCUDAErrorWithLine("kernUpdatePos Failed");
+
+  // don't forget to put the pos & vel back!!!
+  std::swap(dev_pos, dev_coherent_pos);
+  // std::swap(dev_vel1, dev_vel2);
 }
 
 void Boids::endSimulation() {
   cudaFree(dev_vel1);
   cudaFree(dev_vel2);
   cudaFree(dev_pos);
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+  cudaFree(dev_coherent_pos);
+  cudaFree(dev_coherent_vel);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
 }
@@ -454,4 +745,4 @@ void Boids::unitTest() {
   cudaFree(dev_intValues);
   checkCUDAErrorWithLine("cudaFree failed!");
   return;
-}
+}
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..a59949d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,12 +13,12 @@
 // ================
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
-#define VISUALIZE 1
-#define UNIFORM_GRID 0
+#define VISUALIZE 0
+#define UNIFORM_GRID 1
 #define COHERENT_GRID 0
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 10000;
 const float DT = 0.2f;
 
 /**
@@ -194,7 +194,7 @@ void initShaders(GLuint * program) {
 
     cudaGLMapBufferObject((void**)&dptrVertPositions, boidVBO_positions);
     cudaGLMapBufferObject((void**)&dptrVertVelocities, boidVBO_velocities);
-
+    
     // execute the kernel
     #if UNIFORM_GRID && COHERENT_GRID
     Boids::stepSimulationCoherentGrid(DT);