diff --git a/README.md b/README.md
index d63a6a1..5c5886c 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,44 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Shineng Tang
+  * [LinkedIn](https://www.linkedin.com/in/shineng-t-224192195/)
+* Tested on: Windows 11, i9-10900K @ 3.7GHz 32GB, RTX 3090 24GB
 
-### (TODO: Your README)
+# Screenshots
+![](images/plain.jpg)
+
+*5,000 boids*
+![](images/5000.gif)
+*50,000 boids*
+![](images/50000.gif)
+*500,000 boids*
+![](images/500000.gif)
+
+# Performance Charts
+![](images/1.png)
+![](images/2.png)
+![](images/3.png)
+
+# Introduction
+In this CUDA project, I implemented three methods for finding each boid's neighbors and updating boid positions and velocities to produce flocking behavior.
+* Naive method: for every boid, check every other boid in the simulation space and apply the three flocking rules.
+* Scattered uniform grid: by binning boids into a uniform grid, each boid only examines boids in nearby grid cells, which removes a large number of unnecessary distance checks.
+* Coherent uniform grid: building on the scattered uniform grid, I reshuffled the position and velocity buffers so that boids in the same cell are contiguous in memory, reducing scattered global memory accesses.
+
+### For each implementation, how does changing the number of boids affect performance? Why do you think this is?
+
+Increasing the number of boids causes a noticeable frame rate drop. Past a certain point the number of boids exceeds the number of threads the GPU can run concurrently, so the extra work has to wait for free hardware, and each boid also has more neighbors to evaluate.
+
+### For each implementation, how does changing the block count and block size affect performance? Why do you think this is?
+
+The performance difference is noticeable when comparing block size 1 against larger sizes. Beyond a certain block size, however, performance stops improving: the GPU can only keep a limited number of threads resident at once, so once the launch is large enough to occupy the hardware, bigger blocks add no extra parallelism.
+
+### For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+
+I did see performance improvements, as the charts above show. This was the outcome I expected: reshuffling the velocity and position buffers in a kernel makes each cell's boid data contiguous, which reduces scattered global memory accesses and therefore boosts performance.
+
+### Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check!
+
+Yes, it did. Checking 27 neighboring cells gives a higher frame rate, especially with larger numbers of boids. I was not expecting this result. The main reason is that checking 27 cells with half the cell width actually shrinks the searched volume: in units of the largest rule distance, 3 * 3 * 3 = 27 for the 27-cell search versus (2 * 2) ^ 3 = 64 for the 8-cell search.
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
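As a back-of-the-envelope check of that last volume argument (a sketch, assuming d is the largest of the three rule distances, so the 8-cell search uses cells of width 2d and the 27-cell search uses cells of width d):

```
8-cell search  (cell width 2d): (2 * 2d)^3 = 64 d^3 of space examined per boid
27-cell search (cell width 1d): (3 * 1d)^3 = 27 d^3 of space examined per boid
```

A smaller examined volume generally means fewer candidate boids to distance-test per thread, which is consistent with the higher frame rates reported above for the 27-cell configuration.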
diff --git a/images/1.png b/images/1.png new file mode 100644 index 0000000..804a88b Binary files /dev/null and b/images/1.png differ diff --git a/images/2.png b/images/2.png new file mode 100644 index 0000000..18d92f8 Binary files /dev/null and b/images/2.png differ diff --git a/images/3.png b/images/3.png new file mode 100644 index 0000000..ac1e885 Binary files /dev/null and b/images/3.png differ diff --git a/images/5000.gif b/images/5000.gif new file mode 100644 index 0000000..f4b2dd0 Binary files /dev/null and b/images/5000.gif differ diff --git a/images/50000.gif b/images/50000.gif new file mode 100644 index 0000000..099a2ff Binary files /dev/null and b/images/50000.gif differ diff --git a/images/500000.gif b/images/500000.gif new file mode 100644 index 0000000..c49f19d Binary files /dev/null and b/images/500000.gif differ diff --git a/images/plain.jpg b/images/plain.jpg new file mode 100644 index 0000000..aa04795 Binary files /dev/null and b/images/plain.jpg differ diff --git a/src/kernel.cu b/src/kernel.cu index 74dffcb..4a249e4 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -3,9 +3,11 @@ #include #include #include + #include "utilityCore.hpp" #include "kernel.h" + // LOOK-2.1 potentially useful for doing grid-based neighbor search #ifndef imax #define imax( a, b ) ( ((a) > (b)) ? (a) : (b) ) @@ -54,6 +56,8 @@ void checkCUDAError(const char *msg, int line = -1) { /*! Size of the starting area in simulation space. */ #define scene_scale 100.0f +#define DOUBLE_WIDTH 1 // 1: grid cells are twice the max rule distance (8-cell search); 0: cells equal the max rule distance (27-cell search) + /*********************************************** * Kernel state (pointers are device pointers) * ***********************************************/ @@ -85,7 +89,8 @@ int *dev_gridCellEndIndices; // to this cell? // TODO-2.3 - consider what additional buffers you might need to reshuffle // the position and velocity data to be coherent within cells. - +glm::vec3* dev_coherentPos; +glm::vec3* dev_coherentVel; // LOOK-2.1 - Grid parameters based on simulation parameters. // These are automatically computed for you in Boids::initSimulation int gridCellCount; @@ -157,7 +162,11 @@ void Boids::initSimulation(int N) { checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!"); // LOOK-2.1 computing grid params +#if DOUBLE_WIDTH gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); +#else + gridCellWidth = std::max(std::max(rule1Distance, rule2Distance), rule3Distance); +#endif int halfSideCount = (int)(scene_scale / gridCellWidth) + 1; gridSideCount = 2 * halfSideCount; @@ -169,7 +178,24 @@ void Boids::initSimulation(int N) { gridMinimum.z -= halfGridWidth; // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+ cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + cudaMalloc((void**)&dev_coherentPos, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_coherentPos failed!"); + cudaMalloc((void**)&dev_coherentVel, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_coherentVel failed!"); + dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices); + dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices); + cudaDeviceSynchronize(); + + } @@ -231,9 +257,47 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) */ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + glm::vec3 perceivedCenter(0.0f); + glm::vec3 res(0.0f); + + int numNeighbors = 0; + for (int i = 0; i < N; i++) { + if (i != iSelf && glm::distance(pos[i], pos[iSelf]) < rule1Distance) { + perceivedCenter += pos[i]; + numNeighbors++; + } + } + if (numNeighbors > 0) { + perceivedCenter /= numNeighbors; + res += (perceivedCenter - pos[iSelf]) * rule1Scale; + } + // Rule 2: boids try to stay a distance d away from each other + glm::vec3 c(0.0f); + for (int i = 0; i < N; i++) { + if (i != iSelf && glm::distance(pos[i], pos[iSelf]) < rule2Distance) { + c -= pos[i] - pos[iSelf]; + } + } + res += c * rule2Scale; + // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + glm::vec3 perceivedVelocity(0.0f); + int numNeighbors3 = 0; + for (int i = 0; i < N; i++) { + if (i != iSelf && glm::distance(pos[i], pos[iSelf]) < rule3Distance) { + perceivedVelocity += vel[i]; + numNeighbors3++; + } + + } + + if (numNeighbors3 > 0) { + perceivedVelocity /= numNeighbors3; + res += (perceivedVelocity - vel[iSelf]) * rule3Scale; + } + + return res; } /** @@ -245,6 +309,23 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, // Compute a new velocity based on pos and vel1 // Clamp the speed // Record the new velocity into vel2. Question: why NOT vel1? + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + glm::vec3 newVel = vel1[index] + computeVelocityChange(N, index, pos, vel1); + + float speed = glm::length(newVel); + + if (speed > maxSpeed) { + newVel = glm::normalize(newVel) * maxSpeed; + } + + vel2[index] = newVel; + + } /** @@ -289,6 +370,20 @@ __global__ void kernComputeIndices(int N, int gridResolution, // - Label each boid with the index of its grid cell.
// - Set up a parallel array of integer indices as pointers to the actual // boid data in pos and vel1/vel2 + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { + return; + } + + glm::vec3 thisPos = pos[index]; + int gridIndexX = glm::floor((thisPos.x - gridMin.x) * inverseCellWidth); + int gridIndexY = glm::floor((thisPos.y - gridMin.y) * inverseCellWidth); + int gridIndexZ = glm::floor((thisPos.z - gridMin.z) * inverseCellWidth); + int gridIndex1D = gridIndex3Dto1D(gridIndexX, gridIndexY, gridIndexZ, gridResolution); + gridIndices[index] = gridIndex1D; + + indices[index] = index; } // LOOK-2.1 Consider how this could be useful for indicating that a cell @@ -306,6 +401,37 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, // Identify the start point of each cell in the gridIndices array. // This is basically a parallel unrolling of a loop that goes // "this index doesn't match the one before it, must be a new cell!" + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + int cur = particleGridIndices[index]; + // 0 5 5 5 6 7 + if (index == 0 || index > 0 && cur != particleGridIndices[index - 1]) { + gridCellStartIndices[cur] = index; + } + + if (index == N - 1 || index < N - 1 && cur != particleGridIndices[index + 1]) { + gridCellEndIndices[cur] = index; + } +} + +__global__ void kernReshuffleBoidProperties(int N, int* particleArrayIndices, + glm::vec3* pos, glm::vec3* vel, glm::vec3* coPos, glm::vec3* coVel) { + // TODO-2.1 + // Identify the start point of each cell in the gridIndices array. + // This is basically a parallel unrolling of a loop that goes + // "this index doesn't match the one before it, must be a new cell!" + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + int cur = particleArrayIndices[index]; + coPos[index] = pos[cur]; + coVel[index] = vel[cur]; + } __global__ void kernUpdateVelNeighborSearchScattered( @@ -322,6 +448,80 @@ __global__ void kernUpdateVelNeighborSearchScattered( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. 
// - Clamp the speed change before putting the new speed in vel2 + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + glm::vec3 thisPos = pos[index]; + + float gridIndexX = (thisPos.x - gridMin.x) * inverseCellWidth; + float gridIndexY = (thisPos.y - gridMin.y) * inverseCellWidth; + float gridIndexZ = (thisPos.z - gridMin.z) * inverseCellWidth; + + glm::vec3 perceivedCenter(0.0f); + glm::vec3 perceivedVelocity(0.0f); + glm::vec3 velChange(0.0f); + glm::vec3 c(0.0f); + int numNeighbors = 0; + int numNeighbors3 = 0; +#if DOUBLE_WIDTH + for (int z = gridIndexZ - 0.5f; z <= int(gridIndexZ + 0.5f); z++) { + for (int y = gridIndexY - 0.5f; y <= int(gridIndexY + 0.5f); y++) { + for (int x = gridIndexX - 0.5f; x <= int(gridIndexX + 0.5f); x++) { +#else + for (int z = gridIndexZ - 1; z <= gridIndexZ + 1; z++) { + for (int y = gridIndexY - 1; y <= gridIndexY + 1; y++) { + for (int x = gridIndexX - 1; x <= gridIndexX + 1; x++) { +#endif + if (x < 0 || y < 0 || z < 0 || x >= gridResolution || y >= gridResolution || z >= gridResolution) continue; + int curGridIndex1D = gridIndex3Dto1D(x, y, z, gridResolution); + int startIndex = gridCellStartIndices[curGridIndex1D]; + int endIndex = gridCellEndIndices[curGridIndex1D]; + if (startIndex == -1 || endIndex == -1) continue; + for (int i = startIndex; i <= endIndex; i++) { + int curBoid = particleArrayIndices[i]; + float distance = glm::distance(pos[curBoid], pos[index]); + //compute pos, vel of neighbor boids + if (curBoid != index) { + if (distance < rule1Distance) { + perceivedCenter += pos[curBoid]; + numNeighbors++; + } + + if (distance < rule2Distance) { + c -= pos[curBoid] - pos[index]; + } + + if (distance < rule3Distance) { + perceivedVelocity += vel1[curBoid]; + numNeighbors3++; + } + + } + } + } + } + } + if (numNeighbors > 0) { + perceivedCenter /= numNeighbors; + velChange += (perceivedCenter - pos[index]) * rule1Scale; + } + velChange += c * rule2Scale; + + if (numNeighbors3 > 0) { + perceivedVelocity /= numNeighbors3; + velChange += perceivedVelocity * rule3Scale; + } + + glm::vec3 newVel = vel1[index] + velChange; + float speed = glm::length(newVel); + + if (speed > maxSpeed) { + newVel = glm::normalize(newVel) * maxSpeed; + } + + vel2[index] = newVel; } __global__ void kernUpdateVelNeighborSearchCoherent( @@ -341,6 +541,81 @@ __global__ void kernUpdateVelNeighborSearchCoherent( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance.
// - Clamp the speed change before putting the new speed in vel2 + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + glm::vec3 thisPos = pos[index]; + + float gridIndexX = (thisPos.x - gridMin.x) * inverseCellWidth; + float gridIndexY = (thisPos.y - gridMin.y) * inverseCellWidth; + float gridIndexZ = (thisPos.z - gridMin.z) * inverseCellWidth; + + glm::vec3 perceivedCenter(0.0f); + glm::vec3 perceivedVelocity(0.0f); + glm::vec3 velChange(0.0f); + glm::vec3 c(0.0f); + int numNeighbors = 0; + int numNeighbors3 = 0; +#if DOUBLE_WIDTH + for (int z = gridIndexZ - 0.5f; z <= int(gridIndexZ + 0.5f); z++) { + for (int y = gridIndexY - 0.5f; y <= int(gridIndexY + 0.5f); y++) { + for (int x = gridIndexX - 0.5f; x <= int(gridIndexX + 0.5f); x++) { +#else + for (int z = gridIndexZ - 1; z <= gridIndexZ + 1; z++) { + for (int y = gridIndexY - 1; y <= gridIndexY + 1; y++) { + for (int x = gridIndexX - 1; x <= gridIndexX + 1; x++) { +#endif + if (x < 0 || y < 0 || z < 0 || x >= gridResolution || y >= gridResolution || z >= gridResolution) continue; + int curGridIndex1D = gridIndex3Dto1D(x, y, z, gridResolution); + int startIndex = gridCellStartIndices[curGridIndex1D]; + int endIndex = gridCellEndIndices[curGridIndex1D]; + if (startIndex == -1 || endIndex == -1) continue; + for (int i = startIndex; i <= endIndex; i++) { + //int curBoid = particleArrayIndices[i]; + float distance = glm::distance(pos[i], pos[index]); + //compute pos, vel of neighbor boids + if (i != index) { + if (distance < rule1Distance) { + perceivedCenter += pos[i]; + numNeighbors++; + } + + if (distance < rule2Distance) { + c -= pos[i] - pos[index]; + } + + if (distance < rule3Distance) { + perceivedVelocity += vel1[i]; + numNeighbors3++; + } + + } + } + } + } + } + if (numNeighbors > 0) { + perceivedCenter /= numNeighbors; + velChange += (perceivedCenter - pos[index]) * rule1Scale; + } + velChange += c * rule2Scale; + + if (numNeighbors3 > 0) { + perceivedVelocity /= numNeighbors3; + velChange += perceivedVelocity * rule3Scale; + } + + glm::vec3 newVel = vel1[index] + velChange; + float speed = glm::length(newVel); + + if (speed > maxSpeed) { + newVel = glm::normalize(newVel) * maxSpeed; + } + + vel2[index] = newVel; + + } /** @@ -349,8 +624,18 @@ __global__ void kernUpdateVelNeighborSearchCoherent( void Boids::stepSimulationNaive(float dt) { // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. // TODO-1.2 ping-pong the velocity buffers + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + kernUpdateVelocityBruteForce <<<fullBlocksPerGrid, blockSize>>> (numObjects, dev_pos, dev_vel1, dev_vel2); + kernUpdatePos <<<fullBlocksPerGrid, blockSize>>> (numObjects, dt, dev_pos, dev_vel2); + + std::swap(dev_vel1, dev_vel2); + + + } + void Boids::stepSimulationScatteredGrid(float dt) { // TODO-2.1 // Uniform Grid Neighbor search using Thrust sort.
@@ -364,6 +649,29 @@ void Boids::stepSimulationScatteredGrid(float dt) { // - Perform velocity updates using neighbor search // - Update positions // - Ping-pong buffers as needed + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + dim3 fullCellsPerBlock((gridCellCount + blockSize - 1) / blockSize); + + kernComputeIndices <<<fullBlocksPerGrid, blockSize>>> (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + + //thrust + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + + kernResetIntBuffer <<<fullCellsPerBlock, blockSize>>> (gridCellCount, dev_gridCellStartIndices, -1); + kernResetIntBuffer <<<fullCellsPerBlock, blockSize>>> (gridCellCount, dev_gridCellEndIndices, -1); + + kernIdentifyCellStartEnd <<<fullBlocksPerGrid, blockSize>>> (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + + //neighbor search + kernUpdateVelNeighborSearchScattered <<<fullBlocksPerGrid, blockSize>>> (numObjects, gridSideCount, gridMinimum, + gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, + dev_pos, dev_vel1, dev_vel2); + + //updatePos + kernUpdatePos <<<fullBlocksPerGrid, blockSize>>> (numObjects, dt, dev_pos, dev_vel2); + + std::swap(dev_vel1, dev_vel2); + } void Boids::stepSimulationCoherentGrid(float dt) { @@ -382,76 +690,108 @@ void Boids::stepSimulationCoherentGrid(float dt) { // - Perform velocity updates using neighbor search // - Update positions // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + dim3 fullCellsPerBlock((gridCellCount + blockSize - 1) / blockSize); + + kernComputeIndices <<<fullBlocksPerGrid, blockSize>>> (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + + //thrust + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + + + kernResetIntBuffer <<<fullCellsPerBlock, blockSize>>> (gridCellCount, dev_gridCellStartIndices, -1); + kernResetIntBuffer <<<fullCellsPerBlock, blockSize>>> (gridCellCount, dev_gridCellEndIndices, -1); + kernIdentifyCellStartEnd <<<fullBlocksPerGrid, blockSize>>> (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + + kernReshuffleBoidProperties <<<fullBlocksPerGrid, blockSize>>> (numObjects, dev_particleArrayIndices, + dev_pos, dev_vel1, dev_coherentPos, dev_coherentVel); + //neighbor search + kernUpdateVelNeighborSearchCoherent <<<fullBlocksPerGrid, blockSize>>> (numObjects, gridSideCount, gridMinimum, + gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_coherentPos, dev_coherentVel, dev_vel2); + //updatePos + kernUpdatePos <<<fullBlocksPerGrid, blockSize>>> (numObjects, dt, dev_coherentPos, dev_vel2); + + std::swap(dev_vel1, dev_vel2); + std::swap(dev_pos, dev_coherentPos); } void Boids::endSimulation() { cudaFree(dev_vel1); cudaFree(dev_vel2); cudaFree(dev_pos); - // TODO-2.1 TODO-2.3 - Free any additional buffers here. + cudaFree(dev_particleArrayIndices); + cudaFree(dev_particleGridIndices); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_gridCellEndIndices); + cudaFree(dev_coherentPos); + cudaFree(dev_coherentVel); + } void Boids::unitTest() { // LOOK-1.2 Feel free to write additional tests here.
// test unstable sort - int *dev_intKeys; - int *dev_intValues; - int N = 10; - - std::unique_ptrintKeys{ new int[N] }; - std::unique_ptrintValues{ new int[N] }; - - intKeys[0] = 0; intValues[0] = 0; - intKeys[1] = 1; intValues[1] = 1; - intKeys[2] = 0; intValues[2] = 2; - intKeys[3] = 3; intValues[3] = 3; - intKeys[4] = 0; intValues[4] = 4; - intKeys[5] = 2; intValues[5] = 5; - intKeys[6] = 2; intValues[6] = 6; - intKeys[7] = 0; intValues[7] = 7; - intKeys[8] = 5; intValues[8] = 8; - intKeys[9] = 6; intValues[9] = 9; - - cudaMalloc((void**)&dev_intKeys, N * sizeof(int)); - checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!"); - - cudaMalloc((void**)&dev_intValues, N * sizeof(int)); - checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!"); - - dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); - - std::cout << "before unstable sort: " << std::endl; - for (int i = 0; i < N; i++) { - std::cout << " key: " << intKeys[i]; - std::cout << " value: " << intValues[i] << std::endl; - } - - // How to copy data to the GPU - cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice); - cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice); - - // Wrap device vectors in thrust iterators for use with thrust. - thrust::device_ptr dev_thrust_keys(dev_intKeys); - thrust::device_ptr dev_thrust_values(dev_intValues); - // LOOK-2.1 Example for using thrust::sort_by_key - thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values); + //int *dev_intKeys; + //int *dev_intValues; + //int N = 10; + + //std::unique_ptrintKeys{ new int[N] }; + //std::unique_ptrintValues{ new int[N] }; + + //intKeys[0] = 0; intValues[0] = 0; + //intKeys[1] = 1; intValues[1] = 1; + //intKeys[2] = 0; intValues[2] = 2; + //intKeys[3] = 3; intValues[3] = 3; + //intKeys[4] = 0; intValues[4] = 4; + //intKeys[5] = 2; intValues[5] = 5; + //intKeys[6] = 2; intValues[6] = 6; + //intKeys[7] = 0; intValues[7] = 7; + //intKeys[8] = 5; intValues[8] = 8; + //intKeys[9] = 6; intValues[9] = 9; + + //cudaMalloc((void**)&dev_intKeys, N * sizeof(int)); + //checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!"); + + //cudaMalloc((void**)&dev_intValues, N * sizeof(int)); + //checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!"); + + //dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + //std::cout << "before unstable sort: " << std::endl; + //for (int i = 0; i < N; i++) { + // std::cout << " key: " << intKeys[i]; + // std::cout << " value: " << intValues[i] << std::endl; + //} + + //// How to copy data to the GPU + //cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice); + //cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice); + + //// Wrap device vectors in thrust iterators for use with thrust. 
+ //thrust::device_ptr dev_thrust_keys(dev_intKeys); + //thrust::device_ptr dev_thrust_values(dev_intValues); + //// LOOK-2.1 Example for using thrust::sort_by_key + //thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values); + + //// How to copy data back to the CPU side from the GPU + //cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost); + //cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost); + //checkCUDAErrorWithLine("memcpy back failed!"); + + //std::cout << "after unstable sort: " << std::endl; + //for (int i = 0; i < N; i++) { + // std::cout << " key: " << intKeys[i]; + // std::cout << " value: " << intValues[i] << std::endl; + //} + + //// cleanup + //cudaFree(dev_intKeys); + //cudaFree(dev_intValues); + //checkCUDAErrorWithLine("cudaFree failed!"); + - // How to copy data back to the CPU side from the GPU - cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost); - cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost); - checkCUDAErrorWithLine("memcpy back failed!"); - - std::cout << "after unstable sort: " << std::endl; - for (int i = 0; i < N; i++) { - std::cout << " key: " << intKeys[i]; - std::cout << " value: " << intValues[i] << std::endl; - } - // cleanup - cudaFree(dev_intKeys); - cudaFree(dev_intValues); - checkCUDAErrorWithLine("cudaFree failed!"); return; } diff --git a/src/main.cpp b/src/main.cpp index b82c8c6..9ee7693 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,12 +14,12 @@ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID #define VISUALIZE 1 -#define UNIFORM_GRID 0 -#define COHERENT_GRID 0 +#define UNIFORM_GRID 1 +#define COHERENT_GRID 1 // LOOK-1.2 - change this to adjust particle count in the simulation const int N_FOR_VIS = 5000; -const float DT = 0.2f; +const float DT = 0.05f; /** * C main function.