diff --git a/README.md b/README.md
index d63a6a1..9594abb 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,48 @@
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture,
-Project 1 - Flocking**
+Project 1 Flocking
+====================
+
+**University of Pennsylvania, CIS 565: GPU Programming and Architecture**
+
+**Anantha Srinivas**
+[LinkedIn](https://www.linkedin.com/in/anantha-srinivas-00198958/), [Twitter](https://twitter.com/an2tha)
+
+**Tested on:**
+* Windows 10, i7-8700 @ 3.20GHz 16GB, GTX 1080 8097MB (Personal)
+* Built for Visual Studio 2017 using the v140 toolkit
+
+Implementation
+---
+
+This is a simulation of 100K boids running at 350 frames per second, using the semi-coherent memory access (coherent uniform grid) method.
+
+![](images/boid9.gif)
+
+---
+
+__Performance Graph__
+![](images/performance_graph.PNG)
+
+Experimental setup
+---
+* Tested on a GTX 1080 with compute capability 6.1.
+* NVIDIA Vertical Sync was disabled (which actually increased the frame rate).
+* Visualization was turned off and the program was run in Release mode.
+
+Interesting Insights
+---
+* Even though semi-coherent memory access involves an extra step to make the position and velocity data contiguous, it still runs faster than the regular uniform grid search method. This can probably be attributed to caching and faster memory access on the GPU.
+
+* Uniform grid search is not the best performer throughout. Although it performs far better than brute-force searching, the frame rate does not scale linearly with the number of boids. The frame rate actually peaks somewhere near 10k boids (which is, surprisingly, faster than 5k boids), suggesting that this search method needs some level of saturation in the data before it pays off.
+
+Further thoughts
+---
+* **For each implementation, how does changing the number of boids affect performance? Why do you think this is?** As a general rule, increasing the number of boids decreases the frame rate across all methods.
However, there were some anomalies, as noted above: for the uniform and coherent memory access methods, the frame rate actually peaked at 10k boids. The frame rate generally decreases because the number of threads to be launched increases, which in turn increases the GPU load. As a general rule of thumb, only around 60-80% of the GPU should be loaded to get the best performance.
+
+* **For each implementation, how does changing the block count and block size affect performance? Why do you think this is?** This did not affect performance much. This may be because the default block size of 128 is much smaller than the number of boids. Also, a block is a logical construct, so it should technically not matter, as long as all the threads are running the same instructions.
+
+* **For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?** Yes, the coherent grid resulted in slightly better performance. This could be due to easier/faster memory access by the GPU. The GPU may also be able to cache the data, since it is contiguous.
+
+* **Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check!** Up to a certain extent, the cell width does not matter: all the neighboring boids have to be covered in some iteration either way.
-* (TODO) YOUR NAME HERE
- * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-### (TODO: Your README)
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
diff --git a/images/boid9.gif b/images/boid9.gif
new file mode 100644
index 0000000..8d6547b
Binary files /dev/null and b/images/boid9.gif differ
diff --git a/images/performance_graph.PNG b/images/performance_graph.PNG
new file mode 100644
index 0000000..9b5bd92
Binary files /dev/null and b/images/performance_graph.PNG differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fdd636d..b737097 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,5 +10,5 @@ set(SOURCE_FILES
cuda_add_library(src
${SOURCE_FILES}
- OPTIONS -arch=sm_20
+ OPTIONS -arch=sm_61
)
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..38d5d69 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -3,6 +3,7 @@
#include
#include
#include
+
#include "utilityCore.hpp"
#include "kernel.h"
@@ -76,6 +77,7 @@ glm::vec3 *dev_vel2;
// For efficient sorting and the uniform grid. These should always be parallel.
int *dev_particleArrayIndices; // What index in dev_pos and dev_velX represents this particle?
int *dev_particleGridIndices; // What grid cell is this particle in?
+
// needed for use with thrust
thrust::device_ptr<int> dev_thrust_particleArrayIndices;
thrust::device_ptr<int> dev_thrust_particleGridIndices;
@@ -85,6 +87,9 @@ int *dev_gridCellEndIndices; // to this cell?
// TODO-2.3 - consider what additional buffers you might need to reshuffle
// the position and velocity data to be coherent within cells.
+glm::vec3 *dev_reshuffle_pos;
+glm::vec3 *dev_reshuffle_vel1;
+glm::vec3 *dev_reshuffle_vel2;
// LOOK-2.1 - Grid parameters based on simulation parameters.
// These are automatically computed for you in Boids::initSimulation
@@ -169,10 +174,33 @@ void Boids::initSimulation(int N) {
gridMinimum.z -= halfGridWidth;
// TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+ cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+ cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+ cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+ cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+ dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+ dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+
+ cudaMalloc((void**)&dev_reshuffle_pos, N * sizeof(glm::vec3));
+ checkCUDAErrorWithLine("cudaMalloc dev_reshuffle_pos failed!");
+
+ cudaMalloc((void**)&dev_reshuffle_vel1, N * sizeof(glm::vec3));
+ checkCUDAErrorWithLine("cudaMalloc dev_reshuffle_vel1 failed!");
+
+ cudaMalloc((void**)&dev_reshuffle_vel2, N * sizeof(glm::vec3));
+ checkCUDAErrorWithLine("cudaMalloc dev_reshuffle_vel2 failed!");
+
cudaDeviceSynchronize();
}
-
/******************
* copyBoidsToVBO *
******************/
@@ -230,10 +258,61 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
* in the `pos` and `vel` arrays.
*/ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { - // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves - // Rule 2: boids try to stay a distance d away from each other - // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + + glm::vec3 rule1Velocity(0.f, 0.f, 0.f); + glm::vec3 rule2Velocity(0.f, 0.f, 0.f); + glm::vec3 rule3Velocity(0.f, 0.f, 0.f); + + const glm::vec3 selBoidPos = pos[iSelf]; + + int count1 = 0; + int count3 = 0; + + for(int i = 0; i < N; ++i) + { + if(i == iSelf) + { + continue; + } + + float distanceToBoid = glm::length(pos[i] - selBoidPos); + glm::vec3 currBoidPos = pos[i]; + glm::vec3 currBoidVel = vel[i]; + + if(distanceToBoid < rule1Distance) + { + rule1Velocity += currBoidPos; + count1++; + } + + if(distanceToBoid < rule2Distance) + { + rule2Velocity -= (currBoidPos - selBoidPos); + } + + if(distanceToBoid < rule3Distance) + { + rule3Velocity += currBoidVel; + count3++; + } + } + + if(count1 > 0) + { + rule1Velocity /= float(count1); + rule1Velocity = (rule1Velocity - selBoidPos) * rule1Scale; + } + + rule2Velocity *= rule2Scale; + + if(count3 > 0) + { + rule3Velocity /= float(count3); + rule3Velocity *= rule3Scale; + } + + + return (rule1Velocity + rule2Velocity + rule3Velocity); } /** @@ -245,6 +324,22 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, // Compute a new velocity based on pos and vel1 // Clamp the speed // Record the new velocity into vel2. Question: why NOT vel1? 
+ + // Fetch the index + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) + { + return; + } + glm::vec3 newVelocity = vel1[index] + computeVelocityChange(N, index, pos, vel1); + + // Clamp the velocity + float speed = glm::length(newVelocity); + if (speed > maxSpeed) { + newVelocity = glm::normalize(newVelocity) * maxSpeed; + } + + vel2[index] = newVelocity; } /** @@ -289,6 +384,18 @@ __global__ void kernComputeIndices(int N, int gridResolution, // - Label each boid with the index of its grid cell. // - Set up a parallel array of integer indices as pointers to the actual // boid data in pos and vel1/vel2 + + int currIndex = (blockIdx.x * blockDim.x) + threadIdx.x; + + if(currIndex >= N) + { + return; + } + indices[currIndex] = currIndex; + + // Grid Index + glm::vec3 gridIndex = glm::floor((pos[currIndex] - gridMin) * inverseCellWidth); + gridIndices[currIndex] = gridIndex3Dto1D(gridIndex.x, gridIndex.y, gridIndex.z, gridResolution); } // LOOK-2.1 Consider how this could be useful for indicating that a cell @@ -301,11 +408,56 @@ __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) { } __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, - int *gridCellStartIndices, int *gridCellEndIndices) { - // TODO-2.1 - // Identify the start point of each cell in the gridIndices array. - // This is basically a parallel unrolling of a loop that goes - // "this index doesn't match the one before it, must be a new cell!" + int *gridCellStartIndices, int *gridCellEndIndices) { + // TODO-2.1 + // Identify the start point of each cell in the gridIndices array. + // This is basically a parallel unrolling of a loop that goes + // "this index doesn't match the one before it, must be a new cell!" 
+
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= N)
+ {
+ return;
+ }
+
+ int currGrid = particleGridIndices[index];
+
+ // the first boid always starts its cell
+ if (index == 0)
+ {
+ gridCellStartIndices[currGrid] = 0;
+ }
+
+ // the last boid always ends its cell
+ if (index == (N - 1))
+ {
+ gridCellEndIndices[currGrid] = index;
+ }
+ else
+ {
+ int nextGrid = particleGridIndices[index + 1];
+
+ // if the grid cell changes here, this boid ends the current cell
+ // and the next boid starts a new one
+ if(currGrid != nextGrid)
+ {
+ gridCellStartIndices[nextGrid] = index + 1;
+ gridCellEndIndices[currGrid] = index;
+ }
+ }
+}
+
+__global__ void kernReshufflePosAndVel(int N, int *particleArrayIndices, glm::vec3 *unalignedPos, glm::vec3 *unalignedVel1, glm::vec3 *unalignedVel2, glm::vec3 *alignedPos, glm::vec3 *alignedVel1, glm::vec3 *alignedVel2) {
+ // TODO-2.3
+
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= N)
+ {
+ return;
+ }
+
+ // gather: particleArrayIndices maps each sorted slot to the boid's
+ // original index in the unsorted buffers
+ int srcIndex = particleArrayIndices[index];
+
+ alignedPos[index] = unalignedPos[srcIndex];
+ alignedVel1[index] = unalignedVel1[srcIndex];
+ alignedVel2[index] = unalignedVel2[srcIndex];
}

__global__ void kernUpdateVelNeighborSearchScattered(
@@ -322,6 +474,106 @@ __global__ void kernUpdateVelNeighborSearchScattered(
// - Access each boid in the cell and compute velocity change from
// the boids rules, if this boid is within the neighborhood distance.
// - Clamp the speed change before putting the new speed in vel2
+
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= N)
+ {
+ return;
+ }
+
+ float maxDistanceToSearch = glm::max(rule1Distance, glm::max(rule2Distance, rule3Distance));
+ glm::vec3 minGridPos = glm::floor((pos[index] - gridMin - glm::vec3(maxDistanceToSearch)) * inverseCellWidth);
+ glm::vec3 maxGridPos = glm::floor((pos[index] - gridMin + glm::vec3(maxDistanceToSearch)) * inverseCellWidth);
+
+ // clamp to boundary
+ minGridPos = glm::clamp(minGridPos, glm::vec3(0.f), glm::vec3(gridResolution - 1));
+ maxGridPos = glm::clamp(maxGridPos, glm::vec3(0.f), glm::vec3(gridResolution - 1));
+
+ // Rule accumulators (same as the brute-force kernel)
+ glm::vec3 rule1Velocity(0.f, 0.f, 0.f);
+ glm::vec3 rule2Velocity(0.f, 0.f, 0.f);
+ glm::vec3 rule3Velocity(0.f, 0.f, 0.f);
+
+ glm::vec3 selBoidPos = pos[index];
+
+ int count1 = 0;
+ int count3 = 0;
+
+ for(int x = minGridPos.x; x <= maxGridPos.x; ++x)
+ {
+ for (int y = minGridPos.y; y <= maxGridPos.y; ++y)
+ {
+ for (int z = minGridPos.z; z <= maxGridPos.z; ++z)
+ {
+ // Current Grid Id
+ int gridId = gridIndex3Dto1D(x, y, z, gridResolution);
+
+ // Start and Stop positions
+ int gridStart = gridCellStartIndices[gridId];
+ int gridEnd = gridCellEndIndices[gridId];
+
+ if(gridStart == -1 || gridEnd == -1)
+ {
+ continue;
+ }
+
+ // Same per-boid rule checks as the brute-force kernel
+ for(int boid = gridStart; boid <= gridEnd; ++boid)
+ {
+ int posId = particleArrayIndices[boid];
+
+ // skip self: compare the original boid index, not the sorted slot
+ if(posId == index)
+ {
+ continue;
+ }
+
+ float distanceToBoid = glm::length(pos[posId] - selBoidPos);
+ glm::vec3 currBoidPos = pos[posId];
+ glm::vec3 currBoidVel = vel1[posId];
+
+ if (distanceToBoid < rule1Distance)
+ {
+ rule1Velocity += currBoidPos;
+ count1++;
+ }
+
+ if (distanceToBoid < rule2Distance)
+ {
+ rule2Velocity -= (currBoidPos - selBoidPos);
+ }
+
+ if (distanceToBoid < rule3Distance)
+ {
+ rule3Velocity += currBoidVel;
+ count3++;
+ }
+ }
+ }
+ }
+ }
+
+ if (count1 > 0)
+ {
+ rule1Velocity /= float(count1);
+
rule1Velocity = (rule1Velocity - selBoidPos) * rule1Scale; + } + + rule2Velocity *= rule2Scale; + + if (count3 > 0) + { + rule3Velocity /= float(count3); + rule3Velocity *= rule3Scale; + } + + glm::vec3 newVel = vel1[index] + (rule1Velocity + rule2Velocity + rule3Velocity); + + // clamp it! + float speed = glm::length(newVel); + if (speed > maxSpeed) { + newVel = glm::normalize(newVel) * maxSpeed; + } + vel2[index] = newVel; } __global__ void kernUpdateVelNeighborSearchCoherent( @@ -341,6 +593,106 @@ __global__ void kernUpdateVelNeighborSearchCoherent( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. // - Clamp the speed change before putting the new speed in vel2 + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) + { + return; + } + + float maxDistanceToSearch = glm::max(rule1Distance, glm::max(rule2Distance, rule3Distance)); + glm::vec3 minGridPos = glm::floor((pos[index] - gridMin - glm::vec3(maxDistanceToSearch)) * inverseCellWidth); + glm::vec3 maxGridPos = glm::floor((pos[index] - gridMin + glm::vec3(maxDistanceToSearch)) * inverseCellWidth); + + // clamp to boundary + minGridPos = glm::clamp(minGridPos, glm::vec3(0.f), glm::vec3(gridResolution - 1)); + maxGridPos = glm::clamp(maxGridPos, glm::vec3(0.f), glm::vec3(gridResolution - 1)); + + // Brute code + glm::vec3 rule1Velocity(0.f, 0.f, 0.f); + glm::vec3 rule2Velocity(0.f, 0.f, 0.f); + glm::vec3 rule3Velocity(0.f, 0.f, 0.f); + + glm::vec3 selBoidPos = pos[index]; + + int count1 = 0; + int count3 = 0; + + for (int x = minGridPos.x; x <= maxGridPos.x; ++x) + { + for (int y = minGridPos.y; y <= maxGridPos.y; ++y) + { + for (int z = minGridPos.z; z <= maxGridPos.z; ++z) + { + // Current Grid Id + int gridId = gridIndex3Dto1D(x, y, z, gridResolution); + + // Start and Stop positions + int gridStart = gridCellStartIndices[gridId]; + int gridEnd = gridCellEndIndices[gridId]; + + if (gridStart == -1 || 
gridEnd == -1)
+ {
+ continue;
+ }
+
+ // Same per-boid rule checks as the brute-force kernel
+ for (int boid = gridStart; boid <= gridEnd; ++boid)
+ {
+ int posId = boid; // data is already coherent, so no indirection through particleArrayIndices is needed
+
+ if (boid == index)
+ {
+ continue;
+ }
+
+ float distanceToBoid = glm::length(pos[posId] - selBoidPos);
+ glm::vec3 currBoidPos = pos[posId];
+ glm::vec3 currBoidVel = vel1[posId];
+
+ if (distanceToBoid < rule1Distance)
+ {
+ rule1Velocity += currBoidPos;
+ count1++;
+ }
+
+ if (distanceToBoid < rule2Distance)
+ {
+ rule2Velocity -= (currBoidPos - selBoidPos);
+ }
+
+ if (distanceToBoid < rule3Distance)
+ {
+ rule3Velocity += currBoidVel;
+ count3++;
+ }
+ }
+ }
+ }
+ }
+
+ if (count1 > 0)
+ {
+ rule1Velocity /= float(count1);
+ rule1Velocity = (rule1Velocity - selBoidPos) * rule1Scale;
+ }
+
+ rule2Velocity *= rule2Scale;
+
+ if (count3 > 0)
+ {
+ rule3Velocity /= float(count3);
+ rule3Velocity *= rule3Scale;
+ }
+
+ glm::vec3 newVel = vel1[index] + (rule1Velocity + rule2Velocity + rule3Velocity);
+
+ // clamp it!
+ float speed = glm::length(newVel);
+ if (speed > maxSpeed) {
+ newVel = glm::normalize(newVel) * maxSpeed;
+ }
+ vel2[index] = newVel;
}

/**
@@ -348,7 +700,19 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
*/
void Boids::stepSimulationNaive(float dt) {
// TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
- // TODO-1.2 ping-pong the velocity buffers
+
+ dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+ // 1. Calculate new Velocities
+ kernUpdateVelocityBruteForce << < fullBlocksPerGrid, blockSize >> > (numObjects, dev_pos, dev_vel1, dev_vel2);
+
+ // 2. Interchange (ping-pong) the velocity buffers
+ glm::vec3* tempVel = dev_vel1;
+ dev_vel1 = dev_vel2;
+ dev_vel2 = tempVel;
+
+ // 3.
Update the new positions + kernUpdatePos << < fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel1); } void Boids::stepSimulationScatteredGrid(float dt) { @@ -364,6 +728,42 @@ void Boids::stepSimulationScatteredGrid(float dt) { // - Perform velocity updates using neighbor search // - Update positions // - Ping-pong buffers as needed + + // 0. Block size + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + dim3 gridsCellBlockSize((gridCellCount + blockSize - 1) / blockSize); + + // 1. Compute the array and grid indices + kernComputeIndices << < fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); + + // 2. Sort the indices - thrust unstable key sort + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + checkCUDAErrorWithLine("thrust::sort_by_key failed!"); + + // 3. Reset Start and End. + kernResetIntBuffer << < gridsCellBlockSize, blockSize >> > (gridCellCount, dev_gridCellStartIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer start failed!"); + kernResetIntBuffer << < gridsCellBlockSize, blockSize >> > (gridCellCount, dev_gridCellEndIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer end failed!"); + + // 4. Find Start and End indices + kernIdentifyCellStartEnd << < fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); + + // 5. 
Update Velocity + kernUpdateVelNeighborSearchScattered << < fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, + dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!"); + + // 6. Swap vel1 and vel2 + glm::vec3* tempVel = dev_vel1; + dev_vel1 = dev_vel2; + dev_vel2 = tempVel; + + // 7. Update Positions + kernUpdatePos << < fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel1); + checkCUDAErrorWithLine("kernUpdatePos failed!"); } void Boids::stepSimulationCoherentGrid(float dt) { @@ -382,6 +782,60 @@ void Boids::stepSimulationCoherentGrid(float dt) { // - Perform velocity updates using neighbor search // - Update positions // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + + // 0. Block size + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + dim3 gridsCellBlockSize((gridCellCount + blockSize - 1) / blockSize); + + // 1. Compute the array and grid indices + kernComputeIndices << < fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); + + // 2. Sort the indices - thrust unstable key sort + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + checkCUDAErrorWithLine("thrust::sort_by_key failed!"); + + // 3. Reset Start and End. + kernResetIntBuffer << < gridsCellBlockSize, blockSize >> > (gridCellCount, dev_gridCellStartIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer start failed!"); + kernResetIntBuffer << < gridsCellBlockSize, blockSize >> > (gridCellCount, dev_gridCellEndIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer end failed!"); + + // 4. 
Find Start and End indices
+ kernIdentifyCellStartEnd << < fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+ checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+
+ // 5. Reshuffle pos & vel into contiguous, cell-ordered buffers
+ kernReshufflePosAndVel << < fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2, dev_reshuffle_pos, dev_reshuffle_vel1, dev_reshuffle_vel2);
+ checkCUDAErrorWithLine("kernReshufflePosAndVel failed!");
+
+ // 6. Swap the dev pos/vel pointers with the contiguous buffers
+ glm::vec3* tempRef[3];
+ tempRef[0] = dev_pos;
+ tempRef[1] = dev_vel1;
+ tempRef[2] = dev_vel2;
+
+ dev_pos = dev_reshuffle_pos;
+ dev_vel1 = dev_reshuffle_vel1;
+ dev_vel2 = dev_reshuffle_vel2;
+
+ dev_reshuffle_pos = tempRef[0];
+ dev_reshuffle_vel1 = tempRef[1];
+ dev_reshuffle_vel2 = tempRef[2];
+
+ // 7. Update Velocity
+ kernUpdateVelNeighborSearchCoherent << < fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices,
+ dev_gridCellEndIndices, dev_pos, dev_vel1, dev_vel2);
+ checkCUDAErrorWithLine("kernUpdateVelNeighborSearchCoherent failed!");
+
+ // 8. Swap vel1 and vel2
+ glm::vec3* tempVel = dev_vel1;
+ dev_vel1 = dev_vel2;
+ dev_vel2 = tempVel;
+
+ // 9. Update Positions
+ kernUpdatePos << < fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel1);
+ checkCUDAErrorWithLine("kernUpdatePos failed!");
}

void Boids::endSimulation() {
@@ -390,6 +844,16 @@ void Boids::endSimulation() {
cudaFree(dev_pos);

// TODO-2.1 TODO-2.3 - Free any additional buffers here.
+
+ cudaFree(dev_particleArrayIndices); // What index in dev_pos and dev_velX represents this particle?
+ cudaFree(dev_particleGridIndices); // What grid cell is this particle in?
+
+ cudaFree(dev_gridCellStartIndices); // What part of dev_particleArrayIndices belongs to this cell?
+ cudaFree(dev_gridCellEndIndices);
+
+ cudaFree(dev_reshuffle_pos);
+ cudaFree(dev_reshuffle_vel1);
+ cudaFree(dev_reshuffle_vel2);
}

void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..ddd0e3b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,8 +14,8 @@
// LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
#define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1

// LOOK-1.2 - change this to adjust particle count in the simulation
const int N_FOR_VIS = 5000;