diff --git a/Project1 Performance.xlsx b/Project1 Performance.xlsx
new file mode 100644
index 0000000..cdbcc82
Binary files /dev/null and b/Project1 Performance.xlsx differ
diff --git a/README.md b/README.md
index d63a6a1..0c225ab 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,55 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Angelina Risi
+  * [LinkedIn](https://www.linkedin.com/in/angelina-risi)
+* Tested on: Windows 10, i7-6700HQ @ 2.60GHz 8GB, GTX 960M 4096MB (Personal Laptop)
-### (TODO: Your README)
+**Images:**
+Uniform Coherent Grid Under Default Conditions:
+ ![Early in Simulation](/images/CoherentGridSim1.PNG)
+ ![Late in Simulation](/images/CoherentGridSim3.PNG)
+
+Animation at 15000 Boids (for increased FPS)
+ ![Animated Coherent Grid](/images/uniformcoherentgrid.gif)
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+**Features:**
+* Naive flocking, iterating over all other boids to adjust each boid's velocity
+* Grid-based search for influencing boids
+  - The region is divided into cells, and before each search the boids are sorted by containing cell using Thrust
+  - The search dynamically adjusts its cell bounds based on the search radius (see the cell search bounds sketch at the end of this README)
+    * The search is not limited to a fixed number of cells when rule distances, grid resolution, etc. are modified
+* Coherent grid-based search
+  - In addition to sorting the boids by cell, their position and velocity data are reshuffled into the same order
+    * This allows more direct access to the data, potentially improving performance
+
+**Known Issues:**
+* ~~Boid counts in the range N = 5153 through N = 10112 cause CUDA launch failures or Thrust bad_alloc errors~~
+  - ~~Cause unknown, as boid counts above and below this range behave correctly~~
+  - ~~May be some sort of setup issue; needs testing on another machine, but the code appears correct~~
+    * ~~Tried changing the architecture to sm_50 (which should be supported by the graphics card) and re-running CMake, but the CUDA launch failure persisted~~
+  - ~~Does NOT occur in the naive simulation, probably because it doesn't use Thrust~~
+* ~~After decreasing the cell width to the maximum of the rule distances, the program crashed with an illegal memory access at 5000 objects (triggered at the buffer reset?) and with a runtime error at higher counts, but 4000 objects works fine and is used for the performance analysis~~
+
+As of the most recent commit these errors are resolved; they were caused by using the wrong variable to size the grid cell start/end buffers.
+
+**Performance Analysis:**
+
+Performance was tested on all three solutions by averaging the FPS over a run of approximately one minute. The default configuration for comparison is 5000 boids, a block size of 128, and a cell width of twice the search radius. Visualization was disabled to get a more accurate measure of simulation performance, especially when a large number of particles would otherwise need rendering.
+
+* Changing # of Boids:
+  Increasing the number of boids generally decreases performance. This is especially true for the naive implementation, whose processing time grows as N^2 because each boid checks every other boid.
+  However, the grid-based searches show an anomaly: performance dips at low boid counts, then recovers and falls off more gradually as the count grows. From 10000 boids upward, the FPS drops by a roughly constant amount each time the boid count doubles, a far gentler trend than the naive implementation's N^2 growth. Because of the errors described above, I had difficulty collecting performance data for some ranges of boid counts.
+ ![FPS Graph w/ Change in Boid Count](/images/defaultFPS.PNG)
+
+* Changing Block Size:
+  Changing the block size from the default of 128 to 256 and then 1024 changes how many threads share a block and its resources (such as shared memory). Surprisingly, testing showed a slight decrease in performance as the block size increased; why this occurs is uncertain.
+ ![FPS at Block Size 256](/images/block256FPS.PNG)
+ ![FPS at Block Size 1024](/images/block1024FPS.PNG)
+
+* Changing Cell Width:
+  At smaller boid counts, performance drops when the cell width decreases because each boid must check more cells: halving the cell width from twice the search radius to the search radius raises the worst case from 2x2x2 = 8 neighboring cells to 3x3x3 = 27. I was unable to test higher boid counts because of crashes that were still unresolved when this data was collected. Performance was measured at both the default cell width (twice the search radius) and a cell width equal to the search radius.
+ ![FPS Graph w/ Change in CellWidth](/images/cellWidthFPS.PNG)
+
+  The coherent grid performed significantly better than the scattered grid. While both greatly improve on the naive implementation at high boid counts, at the highest counts tested the coherent grid delivered twice the FPS of the scattered grid. At low boid counts the naive solution performed better, possibly because the overhead of the extra sorting, buffers, and array accesses outweighs the savings from looping over fewer boids. This still does not explain why the grid solutions' FPS at 5000 boids is so much lower than at considerably higher boid counts, though.
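+
+**Cell Search Bounds Sketch:**
+
+The dynamic cell-bound computation mentioned under Features converts the boid's position, offset by the largest rule distance, into grid-cell indices and clamps them to the grid. Below is a minimal sketch mirroring the logic in `kernUpdateVelNeighborSearchScattered`; `neighborCellRange` is an illustrative helper name, not a function that exists in `kernel.cu`.
+
+```cuda
+// Illustrative helper: computes the inclusive range of grid cells that can
+// contain neighbors of a boid at position p.
+__device__ void neighborCellRange(glm::vec3 p, glm::vec3 gridMin,
+                                  float inverseCellWidth, int gridResolution,
+                                  int &xMin, int &xMax, int &yMin, int &yMax,
+                                  int &zMin, int &zMax) {
+  // The neighborhood radius is the largest of the three rule distances.
+  float searchR = fmaxf(rule1Distance, fmaxf(rule2Distance, rule3Distance));
+  // Cells overlapped by the cube [p - searchR, p + searchR], clamped to the grid.
+  xMin = max(0, (int)((p.x - gridMin.x - searchR) * inverseCellWidth));
+  xMax = min(gridResolution - 1, (int)((p.x - gridMin.x + searchR) * inverseCellWidth));
+  yMin = max(0, (int)((p.y - gridMin.y - searchR) * inverseCellWidth));
+  yMax = min(gridResolution - 1, (int)((p.y - gridMin.y + searchR) * inverseCellWidth));
+  zMin = max(0, (int)((p.z - gridMin.z - searchR) * inverseCellWidth));
+  zMax = min(gridResolution - 1, (int)((p.z - gridMin.z + searchR) * inverseCellWidth));
+}
+```
+
+The neighbor-search loops then cover `[xMin..xMax] x [yMin..yMax] x [zMin..zMax]`, which is where the 8-cell versus 27-cell difference noted in the cell-width analysis comes from.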
+ + diff --git a/images/CoherentGridSim1.PNG b/images/CoherentGridSim1.PNG new file mode 100644 index 0000000..232b9df Binary files /dev/null and b/images/CoherentGridSim1.PNG differ diff --git a/images/CoherentGridSim2.PNG b/images/CoherentGridSim2.PNG new file mode 100644 index 0000000..824ee3b Binary files /dev/null and b/images/CoherentGridSim2.PNG differ diff --git a/images/CoherentGridSim3.PNG b/images/CoherentGridSim3.PNG new file mode 100644 index 0000000..d6dbe4c Binary files /dev/null and b/images/CoherentGridSim3.PNG differ diff --git a/images/CoherentGridSim4.PNG b/images/CoherentGridSim4.PNG new file mode 100644 index 0000000..e699322 Binary files /dev/null and b/images/CoherentGridSim4.PNG differ diff --git a/images/block1024FPS.PNG b/images/block1024FPS.PNG new file mode 100644 index 0000000..bab0e71 Binary files /dev/null and b/images/block1024FPS.PNG differ diff --git a/images/block256FPS.PNG b/images/block256FPS.PNG new file mode 100644 index 0000000..99ba001 Binary files /dev/null and b/images/block256FPS.PNG differ diff --git a/images/cellWidthFPS.PNG b/images/cellWidthFPS.PNG new file mode 100644 index 0000000..5ad3213 Binary files /dev/null and b/images/cellWidthFPS.PNG differ diff --git a/images/defaultFPS.PNG b/images/defaultFPS.PNG new file mode 100644 index 0000000..b5e8213 Binary files /dev/null and b/images/defaultFPS.PNG differ diff --git a/images/uniformcoherentgrid.gif b/images/uniformcoherentgrid.gif new file mode 100644 index 0000000..cb1fc87 Binary files /dev/null and b/images/uniformcoherentgrid.gif differ diff --git a/src/kernel.cu b/src/kernel.cu index 74dffcb..8909228 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -70,22 +70,21 @@ glm::vec3 *dev_pos; glm::vec3 *dev_vel1; glm::vec3 *dev_vel2; +glm::vec3 *dev_swapBuffer; + // LOOK-2.1 - these are NOT allocated for you. You'll have to set up the thrust // pointers on your own too. // For efficient sorting and the uniform grid. These should always be parallel. int *dev_particleArrayIndices; // What index in dev_pos and dev_velX represents this particle? int *dev_particleGridIndices; // What grid cell is this particle in? -// needed for use with thrust + thrust::device_ptr dev_thrust_particleArrayIndices; thrust::device_ptr dev_thrust_particleGridIndices; int *dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs int *dev_gridCellEndIndices; // to this cell? -// TODO-2.3 - consider what additional buffers you might need to reshuffle -// the position and velocity data to be coherent within cells. - // LOOK-2.1 - Grid parameters based on simulation parameters. // These are automatically computed for you in Boids::initSimulation int gridCellCount; @@ -151,6 +150,9 @@ void Boids::initSimulation(int N) { cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3)); checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!"); + cudaMalloc((void**)&dev_swapBuffer, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_swapBuffer failed!"); + // LOOK-1.2 - This is a typical CUDA kernel invocation. 
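+  // Launch configuration: the standard triple-chevron form,
+  // kernel<<<gridDim, blockDim>>>, with a grid of ceil(N / blockSize) blocks of
+  // blockSize threads so that one thread is launched per boid.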
kernGenerateRandomPosArray<<>>(1, numObjects, dev_pos, scene_scale); @@ -158,6 +160,7 @@ void Boids::initSimulation(int N) { // LOOK-2.1 computing grid params gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); + //gridCellWidth = 1.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); int halfSideCount = (int)(scene_scale / gridCellWidth) + 1; gridSideCount = 2 * halfSideCount; @@ -169,6 +172,23 @@ void Boids::initSimulation(int N) { gridMinimum.z -= halfGridWidth; // TODO-2.1 TODO-2.3 - Allocate additional buffers here. + // Allocate boid data pointer array + cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + + //Alloc. grid to boid mapping array + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + + // Alloc. pointers to grid-boid index + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + + dev_thrust_particleArrayIndices = thrust::device_ptr(dev_particleArrayIndices); + dev_thrust_particleGridIndices = thrust::device_ptr(dev_particleGridIndices); + cudaDeviceSynchronize(); } @@ -230,10 +250,51 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) * in the `pos` and `vel` arrays. */ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { - // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves - // Rule 2: boids try to stay a distance d away from each other - // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + + int n1 = 0; // Counter for rule 1 + int n3 = 0; // Counter for rule 3 + glm::vec3 rule1(0.0f); // rule 1 vel. change + glm::vec3 rule2(0.0f); // rule 2 vel. change + glm::vec3 rule3(0.0f); // rule 3 vel. change + float dist; + + // For all non-self boids + for (int i = 0; i < N ; i++) { + if (i != iSelf) { + dist = distance(pos[i], pos[iSelf]); + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (dist < rule1Distance) { + // Sum positions + rule1 += pos[i]; + n1++; + } + // Rule 2: boids try to stay a distance d away from each other + if (dist < rule2Distance) { + rule2 += pos[iSelf] - pos[i]; + } + // Rule 3: boids try to match the speed of surrounding boids + if (dist < rule3Distance) { + // Sum velocities + rule3 += vel[i]; + n3++; + } + } + } + // Average position (center of mass) + if (n1 > 0) { // realized I was dividing by zero + rule1 = rule1 / float(n1); + rule1 = (rule1 - pos[iSelf]) * rule1Scale; + } + + rule2 = rule2 * rule2Scale; + + // Average velocity + if (n3 > 0) { + rule3 = rule3 / float(n3); + rule3 = (rule3 - vel[iSelf]) * rule3Scale; + } + + return (rule1 + rule2 + rule3); } /** @@ -242,9 +303,18 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po */ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // Compute a new velocity based on pos and vel1 - // Clamp the speed - // Record the new velocity into vel2. Question: why NOT vel1? 
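+  // Writing the result into vel2 rather than vel1 keeps this update parallel-safe:
+  // every thread reads neighbor velocities from vel1 during this step, so updating
+  // vel1 in place could race with those reads. The two buffers are ping-ponged on
+  // the host after the kernel finishes.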
+ int index = threadIdx.x + (blockIdx.x * blockDim.x); + + // Compute a new velocity based on pos and vel1 + glm::vec3 new_vel = computeVelocityChange(N, index, pos, vel1) + vel1[index]; + + // Clamp the speed + float speed = length(new_vel); // calc. scalar speed + if (speed > maxSpeed) { + new_vel = new_vel * maxSpeed / speed; // reduce vel. proportionately to clamp speed to max + } + // Record the new velocity into vel2. Question: why NOT vel1? + vel2[index] = new_vel; } /** @@ -285,27 +355,54 @@ __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) { __global__ void kernComputeIndices(int N, int gridResolution, glm::vec3 gridMin, float inverseCellWidth, glm::vec3 *pos, int *indices, int *gridIndices) { - // TODO-2.1 + // - Label each boid with the index of its grid cell. + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + // 3D grid index + int gX = (pos[index].x - gridMin.x) * inverseCellWidth; + int gY = (pos[index].y - gridMin.y) * inverseCellWidth; + int gZ = (pos[index].z - gridMin.z) * inverseCellWidth; + + int gIdx = gridIndex3Dto1D(gX, gY, gZ, gridResolution); // 1D grid index + gridIndices[index] = gIdx; // set boid's grid index in array + // - Set up a parallel array of integer indices as pointers to the actual // boid data in pos and vel1/vel2 + indices[index] = index; + } // LOOK-2.1 Consider how this could be useful for indicating that a cell // does not enclose any boids __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) { - int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index < N) { - intBuffer[index] = value; - } + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + intBuffer[index] = value; } __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, int *gridCellStartIndices, int *gridCellEndIndices) { - // TODO-2.1 + // Identify the start point of each cell in the gridIndices array. // This is basically a parallel unrolling of a loop that goes // "this index doesn't match the one before it, must be a new cell!" + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + int gIdx = particleGridIndices[index]; + if (index == 0) { + gridCellStartIndices[gIdx] = 0; + } + else if (gIdx != particleGridIndices[index - 1]) { + gridCellEndIndices[particleGridIndices[index - 1]] = index - 1; + gridCellStartIndices[gIdx] = index; + } + if (index == N - 1) { + gridCellEndIndices[gIdx] = N - 1; + } } __global__ void kernUpdateVelNeighborSearchScattered( @@ -314,14 +411,103 @@ __global__ void kernUpdateVelNeighborSearchScattered( int *gridCellStartIndices, int *gridCellEndIndices, int *particleArrayIndices, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce + // Update a boid's velocity using the uniform grid to reduce // the number of boids that need to be checked. + // - Identify the grid cell that this particle is in + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + // - Identify which cells may contain neighbors. This isn't always 8. 
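+  // The neighborhood radius is the largest of the three rule distances, so the
+  // candidate cells are those overlapped by a cube of half-width search_r centered
+  // on this boid. With the default cell width of twice the search radius this is
+  // at most 8 cells; narrower cells widen the range computed below.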
+ // calculate search bounds + float search_r = rule1Distance; + if (rule2Distance > search_r) search_r = rule2Distance; + if (rule3Distance > search_r) search_r = rule3Distance; + + int xMax = (pos[index].x - gridMin.x + search_r) * inverseCellWidth; + int xMin = (pos[index].x - gridMin.x - search_r) * inverseCellWidth; + int yMax = (pos[index].y - gridMin.y + search_r) * inverseCellWidth; + int yMin = (pos[index].y - gridMin.y - search_r) * inverseCellWidth; + int zMax = (pos[index].z - gridMin.z + search_r) * inverseCellWidth; + int zMin = (pos[index].z - gridMin.z - search_r) * inverseCellWidth; + + // clamp bounds + if (xMin < 0) xMin = 0; + if (xMax >= gridResolution) xMax = gridResolution - 1; + if (yMin < 0) yMin = 0; + if (yMax >= gridResolution) yMax = gridResolution - 1; + if (zMin < 0) zMin = 0; + if (zMax >= gridResolution) zMax = gridResolution - 1; + + int start; // index first boid listed in grid cell + int end; // index last boid listed in grid cell + int p_Idx; // neighboring particle index + glm::vec3 new_vel(0.0f); + + int n1 = 0; // rule 1 counter + int n3 = 0; // rule 3 counter + glm::vec3 rule1(0.0f); // rule 1 vel. change + // rule 2 vel. change is in new_vel + glm::vec3 rule3(0.0f); // rule 3 vel. change + + float dist; + int gIdx; + // - For each cell, read the start/end indices in the boid pointer array. // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. + + for (int z = zMin; z <= zMax; z++) { + for (int y = yMin; y <= yMax; y++) { + for (int x = xMin; x <= xMax; x++) { + gIdx = gridIndex3Dto1D(x, y, z, gridResolution); + start = gridCellStartIndices[gIdx]; + end = gridCellEndIndices[gIdx]; + if (start != -1) { // if cell not empty + for (int i = start; i < end; i++) { + p_Idx = particleArrayIndices[i]; + if (p_Idx != index) { + dist = distance(pos[p_Idx], pos[index]); + if (dist < rule1Distance) { + rule1 += pos[p_Idx]; + n1++; + } + if (dist < rule2Distance) { + new_vel += pos[index] - pos[p_Idx]; + } + if (dist < rule3Distance) { + // Sum velocities + rule3 += vel1[p_Idx]; + n3++; + } + } + } + } + } + } + } + + // Average position (center of mass) + if (n1 > 0) { // realized I was dividing by zero + rule1 = rule1 / float(n1); + rule1 = (rule1 - pos[index]) * rule1Scale; + } + new_vel = new_vel * rule2Scale; + // Average velocity + if (n3 > 0) { + rule3 = rule3 / float(n3); + rule3 = (rule3 - vel1[index]) * rule3Scale; + } + new_vel += rule1 + rule3 + vel1[index]; + // - Clamp the speed change before putting the new speed in vel2 + if (length(new_vel) > maxSpeed) { + new_vel = new_vel * maxSpeed / length(new_vel); // reduce vel. proportionately to clamp speed to max + } + + // Record the new velocity into vel2 + vel2[index] = new_vel; + } __global__ void kernUpdateVelNeighborSearchCoherent( @@ -341,31 +527,167 @@ __global__ void kernUpdateVelNeighborSearchCoherent( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. // - Clamp the speed change before putting the new speed in vel2 + + // - Identify the grid cell that this particle is in + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + // - Identify which cells may contain neighbors. This isn't always 8. 
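+  // Unlike the scattered version, pos and vel1 have already been reshuffled into
+  // cell order (kernSortData), so the boids of a cell occupy the contiguous index
+  // range [start, end] and are read directly, without indirection through
+  // dev_particleArrayIndices.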
+ // calculate search bounds + float search_r = rule1Distance; + if (rule2Distance > search_r) search_r = rule2Distance; + if (rule3Distance > search_r) search_r = rule3Distance; + + glm::vec3 max; + glm::vec3 min; + + max.x = (pos[index].x - gridMin.x + search_r) * inverseCellWidth; + min.x = (pos[index].x - gridMin.x - search_r) * inverseCellWidth; + max.y = (pos[index].y - gridMin.y + search_r) * inverseCellWidth; + min.y = (pos[index].y - gridMin.y - search_r) * inverseCellWidth; + max.z = (pos[index].z - gridMin.z + search_r) * inverseCellWidth; + min.z = (pos[index].z - gridMin.z - search_r) * inverseCellWidth; + + // clamp bounds + if (min.x < 0) min.x = 0; + if (max.x >= gridResolution) max.x = gridResolution - 1; + if (min.y < 0) min.y = 0; + if (max.y >= gridResolution) max.y = gridResolution - 1; + if (min.z < 0) min.y = 0; + if (max.z >= gridResolution) max.z = gridResolution - 1; + + int start; // index first boid listed in grid cell + int end; // index last boid listed in grid cell + glm::vec3 new_vel(0.0f); + + int n1 = 0; // rule 1 counter + int n3 = 0; // rule 3 counter + glm::vec3 rule1(0.0f); // rule 1 vel. change + // rule 2 vel. change is in new_vel + glm::vec3 rule3(0.0f); // rule 3 vel. change + float dist; + + int gIdx; + + // - For each cell, read the start/end indices in the boid pointer array. + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. + + for (int z = min.z; z <= (int)(max.z); z++) { + for (int y = min.y; y <= (int)(max.y); y++) { + for (int x = min.x; x <= (int)(max.x); x++) { + gIdx = gridIndex3Dto1D(x, y, z, gridResolution); + start = gridCellStartIndices[gIdx]; + end = gridCellEndIndices[gIdx]; + if (start != -1 && end != -1) { // if cell not empty, & error checking + for (int i = start; i <= end; i++) { + if (i != index) { + dist = distance(pos[i], pos[index]); + if (dist < rule1Distance) { + rule1 += pos[i]; + n1++; + } + if (dist < rule2Distance) { + new_vel += pos[index] - pos[i]; + } + if (dist < rule3Distance) { + // Sum velocities + rule3 += vel1[i]; + n3++; + } + } + } + } + } + } + } + // Average position (center of mass) + if (n1 > 0) { // realized I was dividing by zero + rule1 = rule1 / float(n1); + rule1 = (rule1 - pos[index]) * rule1Scale; + } + new_vel = new_vel * rule2Scale; + // Average velocity + if (n3 > 0) { + rule3 = rule3 / float(n3); + rule3 = (rule3 - vel1[index]) * rule3Scale; + } + new_vel += rule1 + rule3 + vel1[index]; + + // - Clamp the speed change before putting the new speed in vel2 + if (length(new_vel) > maxSpeed) { + new_vel = new_vel * maxSpeed / length(new_vel); // reduce vel. proportionately to clamp speed to max + } + // Record the new velocity into vel2 + vel2[index] = new_vel; } /** * Step the entire N-body simulation by `dt` seconds. */ void Boids::stepSimulationNaive(float dt) { - // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. 
- // TODO-1.2 ping-pong the velocity buffers + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + // Update boid position with velocity + kernUpdatePos << > > (numObjects, dt, dev_pos, dev_vel1); + + // Update boid velocity + kernUpdateVelocityBruteForce <<>>(numObjects, dev_pos, dev_vel1, dev_vel2); + + // ping-pong the velocity buffers + glm::vec3 *swap = dev_vel1; + dev_vel1 = dev_vel2; + dev_vel2 = swap; } void Boids::stepSimulationScatteredGrid(float dt) { - // TODO-2.1 + + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); // Uniform Grid Neighbor search using Thrust sort. // In Parallel: // - label each particle with its array index as well as its grid index. // Use 2x width grids. - // - Unstable key sort using Thrust. A stable sort isn't necessary, but you - // are welcome to do a performance comparison. + kernResetIntBuffer<<>>(gridCellCount, dev_gridCellStartIndices, -1); // label empty cells + checkCUDAErrorWithLine("reset start buffer failed!"); + kernResetIntBuffer<<>>(gridCellCount, dev_gridCellEndIndices, -1); + checkCUDAErrorWithLine("reset end buffer failed!"); + + // Create array of boid indices in parallel with array of respective grid indices + kernComputeIndices<<>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("conputeIndices failed!"); + + // sort w/ thrust + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + checkCUDAErrorWithLine("thrust sort failed!"); + // - Naively unroll the loop for finding the start and end indices of each // cell's data pointers in the array of boid indices - // - Perform velocity updates using neighbor search + kernIdentifyCellStartEnd<<>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); + // - Update positions + kernUpdatePos << > > (numObjects, dt, dev_pos, dev_vel1); + checkCUDAErrorWithLine("kernUpdatePos failed!"); + // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchScattered<<>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("updating velocity failed!"); + // - Ping-pong buffers as needed + glm::vec3 *swap = dev_vel1; + dev_vel1 = dev_vel2; + dev_vel2 = swap; +} + +__global__ void kernSortData(int N, int* particleArrayIndices, glm::vec3 *vel1, glm::vec3 *vel2, glm::vec3 *pos, glm::vec3 *buffer) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + int boid = particleArrayIndices[index]; + vel2[index] = vel1[boid]; // sort vel1 data into vel2 + buffer[index] = pos[boid]; // sort pos data into buffer } + void Boids::stepSimulationCoherentGrid(float dt) { // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. @@ -382,6 +704,54 @@ void Boids::stepSimulationCoherentGrid(float dt) { // - Perform velocity updates using neighbor search // - Update positions // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. 
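+  // The reshuffle (kernSortData) runs after the Thrust sort and the start/end pass,
+  // writing pos and vel into cell order; the extra swap of dev_pos with dev_swapBuffer
+  // then makes the reshuffled positions the live buffer before the coherent neighbor
+  // search and the position integration run.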
+ + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + glm::vec3 *swap; + + // reset cell start & end indices + kernResetIntBuffer << > >(gridCellCount, dev_gridCellStartIndices, -1); // label empty cells + checkCUDAErrorWithLine("reset start buffer failed!"); + kernResetIntBuffer << > >(gridCellCount, dev_gridCellEndIndices, -1); + checkCUDAErrorWithLine("reset end buffer failed!"); + + // Create array of boid indices in parallel with array of respective grid indices + kernComputeIndices << > >(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("reset start buffer failed!"); + + // sort w/ thrust + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + checkCUDAErrorWithLine("thrust sort failed!"); + + // - Naively unroll the loop for finding the start and end indices of each + // cell's data pointers in the array of boid indices + kernIdentifyCellStartEnd << > >(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); + + // sort pos & vel data + kernSortData << > >(numObjects, dev_particleArrayIndices, dev_vel1, dev_vel2, dev_pos, dev_swapBuffer); + checkCUDAErrorWithLine("kernSortData failed!"); + + // - Ping-pong buffers as needed + swap = dev_vel1; + dev_vel1 = dev_vel2; + dev_vel2 = swap; + + swap = dev_pos; + dev_pos = dev_swapBuffer; + dev_swapBuffer = swap; + + + // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchCoherent << > >(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("updating velocity failed!"); + + // - Update positions + kernUpdatePos << > > (numObjects, dt, dev_pos, dev_vel2); + checkCUDAErrorWithLine("kernUpdatePos failed!"); + // - Ping-pong buffers as needed + swap = dev_vel1; + dev_vel1 = dev_vel2; + dev_vel2 = swap; } void Boids::endSimulation() { @@ -390,6 +760,12 @@ void Boids::endSimulation() { cudaFree(dev_pos); // TODO-2.1 TODO-2.3 - Free any additional buffers here. + cudaFree(dev_gridCellEndIndices); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_particleArrayIndices); + cudaFree(dev_particleGridIndices); + + cudaFree(dev_swapBuffer); } void Boids::unitTest() { diff --git a/src/main.cpp b/src/main.cpp index b82c8c6..291232e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,8 +14,8 @@ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID #define VISUALIZE 1 -#define UNIFORM_GRID 0 -#define COHERENT_GRID 0 +#define UNIFORM_GRID 1 +#define COHERENT_GRID 1 // LOOK-1.2 - change this to adjust particle count in the simulation const int N_FOR_VIS = 5000; @@ -217,6 +217,9 @@ void initShaders(GLuint * program) { double timebase = 0; int frame = 0; + double avg_fps = 0; + double cnt = 0; + Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure // your CUDA development setup is ready to go. @@ -234,6 +237,9 @@ void initShaders(GLuint * program) { runCUDA(); + avg_fps += fps; + cnt++; + std::ostringstream ss; ss << "["; ss.precision(1); @@ -256,6 +262,10 @@ void initShaders(GLuint * program) { glfwSwapBuffers(window); #endif } + + avg_fps = avg_fps / cnt; + printf("avg FPS: %f", avg_fps); + glfwDestroyWindow(window); glfwTerminate(); }