diff --git a/CMakeLists.txt b/CMakeLists.txt
index 150664e..07aaea8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -61,6 +61,10 @@ set(GLM_ROOT_DIR "external")
 find_package(GLM REQUIRED)
 include_directories(${GLM_INCLUDE_DIRS})
 
+# set project dir
+file(TO_CMAKE_PATH ${CMAKE_SOURCE_DIR} PROJ_BASE_PATH_NORMALIZED)
+add_definitions(-DPROJ_BASE_PATH=${PROJ_BASE_PATH_NORMALIZED})
+
 set(headers
   src/cudaMat4.hpp
   src/glslUtility.hpp
diff --git a/CMakeSettings.json b/CMakeSettings.json
new file mode 100644
index 0000000..d2bcfce
--- /dev/null
+++ b/CMakeSettings.json
@@ -0,0 +1,26 @@
+{
+  "configurations": [
+    {
+      "name": "x64-Debug",
+      "generator": "Ninja",
+      "configurationType": "Debug",
+      "inheritEnvironments": [ "msvc_x64_x64" ],
+      "buildRoot": "${projectDir}\\out\\build\\${name}",
+      "installRoot": "${projectDir}\\out\\install\\${name}",
+      "cmakeCommandArgs": "",
+      "buildCommandArgs": "",
+      "ctestCommandArgs": ""
+    },
+    {
+      "name": "x64-Release",
+      "generator": "Ninja",
+      "configurationType": "Release",
+      "buildRoot": "${projectDir}\\out\\build\\${name}",
+      "installRoot": "${projectDir}\\out\\install\\${name}",
+      "cmakeCommandArgs": "",
+      "buildCommandArgs": "",
+      "ctestCommandArgs": "",
+      "inheritEnvironments": [ "msvc_x64_x64" ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index d63a6a1..c72a0fe 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,89 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Mengxuan Huang
+  * [LinkedIn](https://www.linkedin.com/in/mengxuan-huang-52881624a/)
+* Tested on: Windows 11, i9-13980HX @ 2.22GHz, 64.0 GB, RTX 4090 Laptop 16384 MB
-### (TODO: Your README)
+## Outcome Summary
+**Recorded in coherent grid mode**
+||boids = 5000|boids = 100000|boids = 10000000|
+|-------------|-------------|-------------|-------------|
+|scene scale = 100.f|![](./gif/b_5000_s_100.gif)|![](./gif/b_100000_s_100.gif)|![](./gif/b_10000000_s_100.gif)|
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+
+||boids = 100000|boids = 1000000|boids = 10000000|
+|-------------|-------------|-------------|-------------|
+|scene scale = 500.f|![](./gif/b_100000_s_500.gif)|![](./gif/b_1000000_s_500.gif)|![](./gif/b_10000000_s_500.gif)|
+
+## Performance Analysis
+In general, the frame rate drops as the number of boids increases. When the number of boids is large (e.g. 10000000), increasing the scene scale (and thus the number of grid cells) improves the frame rate.
+
+- ### For each implementation, how does changing the number of boids affect performance? Why do you think this is?
+|Block Size|Scene Scale|
+|-----------|-----------|
+|128|100.0|
+
+|With Visualization|Without Visualization|
+|-----------|-----------|
+|![](./images/fps_wrt_boids.png)|![](./images/fps_wrt_boids_nv.png)|
+
+According to the measured results,
+- **Naive method**: the FPS drops as the number of boids increases.
+- **Uniform Grid method**: the FPS drops as the number of boids increases, except at 25000 boids.
+- **Coherent Grid method**: the FPS drops as the number of boids increases, except at 25000 boids.
+
+For the **Naive method**, each boid must traverse all boids when updating its velocity.
+With the **Uniform Grid** and **Coherent Grid** methods, by contrast, each boid only traverses the boids located in its neighboring grid cells. Increasing the number of boids therefore causes a significant FPS drop for the **Naive method**, whereas the **Uniform Grid** and **Coherent Grid** methods spread the additional boids across many cells, so their FPS drops far more moderately.
+
+Besides, compared to the **Coherent Grid** method, the **Uniform Grid** method reads boid data through a "middleman" index array, which adds an extra level of indirect addressing. Therefore, in terms of performance, **Coherent Grid** > **Uniform Grid** > **Naive method**.
+
+Finally, the anomalous FPS dip at **boids = 25000** might be because there are not enough boids in the neighboring cells, so there is not enough computation to hide the memory latency.
+
+- ### For each implementation, how does changing the block count and block size affect performance? Why do you think this is?
+|Scene Scale|Visualization|
+|-----------|-----------|
+|100.0|Off|
+
+To record a stable FPS, I tested the **Naive method** with 100000 boids, and the **Uniform Grid** and **Coherent Grid** methods with 1000000 boids.
+
+|1000000 Boids|100000 Boids|
+|-----------|-----------|
+|![](./images/fps_wrt_blocksize_1.png)|![](./images/fps_wrt_blocksize_2.png)|
+
+According to the measured results, in all three methods the FPS first increases with the block size (from 8 to 32), then stays roughly stable (from 32 to 512), and finally drops (from 512 to 1024).
+
+![](./images/nsight_block.png)
+
+According to the Nsight Compute profiling result, when the block size is smaller than 32, ***some threads in a warp are masked off and those hardware resources go unused***. Thus, the FPS increases with the block size while it is below 32 (the warp size).
+
+![](./images/nsight_occupancy.png)
+According to the Nsight Compute profiling result, when the block size is large, the limited number of registers per SM caps the number of active threads (occupancy), which leads to the performance drop.
+
+- ### For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+
+|Block Size|# of Boids|
+|-----------|-----------|
+|128|10000000|
+
+|scene scale = 100.0|scene scale = 500.0|
+|-----------|-----------|
+|![](./gif/b_10000000_s_100.gif)|![](./gif/b_10000000_s_500.gif)|
+
+As shown in the GIFs, with **10000000 boids** there is a significant FPS improvement when the scene scale changes from 100 to 500. A larger scene scale produces more grid cells and fewer boids per cell, and therefore less neighbor-traversal time for each boid (see the cell-count sketch at the end of this section).
+
+- ### Did changing cell width and checking 27 vs 8 neighboring cells affect performance?
+|Block Size|# of Boids|Scene Scale|Method|
+|-----------|-----------|-----------|-----------|
+|128|2000000|100.0|Coherent Grid|
+
+||8 neighboring cells|27 neighboring cells|
+|-----------|-----------|-----------|
+|FPS|80.1|131.2|
+
+According to the test results, checking 27 neighboring cells gives a significant FPS improvement! This is likely because we ***halved the grid cell width***:
+![](./images/demo_1.png)
+
+And therefore,
+
+![](./images/demo_2.png)
+
+The search volume becomes smaller, so each boid has fewer candidate boids to traverse and the neighbor search becomes more efficient, as the back-of-the-envelope comparison below shows.
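+
+The gain can be sanity-checked with a quick volume comparison. The snippet below is only an illustrative host-side sketch, assuming the neighborhood radius `d` is the largest of the three rule distances, cells of width `2d` for the 8-cell search, and cells of width `d` for the 27-cell search:
+
+```cpp
+#include <cstdio>
+
+int main() {
+  // d = max(rule1Distance, rule2Distance, rule3Distance); its actual value only
+  // scales both volumes, so any positive number gives the same ratio.
+  const float d = 5.0f;
+
+  // 8-cell search: a 2x2x2 block of cells, each cell (2*d) wide.
+  const float vol8 = 8.0f * (2.0f * d) * (2.0f * d) * (2.0f * d);   // 64 * d^3
+  // 27-cell search: a 3x3x3 block of cells, each cell d wide.
+  const float vol27 = 27.0f * d * d * d;                            // 27 * d^3
+
+  std::printf("8-cell search volume : %.0f (64 d^3)\n", vol8);
+  std::printf("27-cell search volume: %.0f (27 d^3)\n", vol27);
+  std::printf("ratio 8-cell / 27-cell: %.2f\n", vol8 / vol27);      // ~2.37
+  return 0;
+}
+```
+
+With roughly uniform boid density, the 27-cell search therefore scans less than half the volume of the 8-cell search (27/64 ≈ 42%), which is consistent with the FPS improvement measured above.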
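+
+The scene-scale observation earlier in this analysis follows the same logic: the grid resolution is derived from the scene scale and the cell width, so a larger scene at a fixed cell width spreads the same number of boids over many more cells. The sketch below is illustrative only; it mirrors the `halfSideCount` / `gridSideCount` computation in `initSimulation` and assumes a cell width of 10 (2x the largest rule distance):
+
+```cpp
+#include <cstdio>
+
+// Grid side count as a function of scene scale, following kernel.cu's
+// halfSideCount / gridSideCount computation.
+int cellsPerSide(float sceneScale, float cellWidth) {
+  int halfSideCount = (int)(sceneScale / cellWidth) + 1;
+  return 2 * halfSideCount;
+}
+
+int main() {
+  const float cellWidth = 10.0f;              // assumed: 2 * max rule distance
+  const float scales[] = { 100.0f, 500.0f };
+  for (float scale : scales) {
+    int side = cellsPerSide(scale, cellWidth);
+    std::printf("scene scale %.0f -> %d^3 = %d cells\n",
+                scale, side, side * side * side);
+  }
+  return 0;
+}
+```
+
+Going from scene scale 100 to 500 increases the cell count by roughly two orders of magnitude (22^3 = 10648 cells vs. 102^3 = 1061208 cells), so at a fixed boid count each cell holds far fewer boids and each neighbor search touches far less data.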
diff --git a/gif/b_10000000_s_100.gif b/gif/b_10000000_s_100.gif new file mode 100644 index 0000000..2b778cb Binary files /dev/null and b/gif/b_10000000_s_100.gif differ diff --git a/gif/b_10000000_s_500.gif b/gif/b_10000000_s_500.gif new file mode 100644 index 0000000..f82ff4f Binary files /dev/null and b/gif/b_10000000_s_500.gif differ diff --git a/gif/b_1000000_s_100.gif b/gif/b_1000000_s_100.gif new file mode 100644 index 0000000..f495364 Binary files /dev/null and b/gif/b_1000000_s_100.gif differ diff --git a/gif/b_1000000_s_500.gif b/gif/b_1000000_s_500.gif new file mode 100644 index 0000000..0776188 Binary files /dev/null and b/gif/b_1000000_s_500.gif differ diff --git a/gif/b_100000_s_100.gif b/gif/b_100000_s_100.gif new file mode 100644 index 0000000..4c3c9bd Binary files /dev/null and b/gif/b_100000_s_100.gif differ diff --git a/gif/b_100000_s_500.gif b/gif/b_100000_s_500.gif new file mode 100644 index 0000000..59a6597 Binary files /dev/null and b/gif/b_100000_s_500.gif differ diff --git a/gif/b_5000_s_100.gif b/gif/b_5000_s_100.gif new file mode 100644 index 0000000..3e6b423 Binary files /dev/null and b/gif/b_5000_s_100.gif differ diff --git a/images/demo_1.png b/images/demo_1.png new file mode 100644 index 0000000..f4adb6b Binary files /dev/null and b/images/demo_1.png differ diff --git a/images/demo_2.png b/images/demo_2.png new file mode 100644 index 0000000..c93b73e Binary files /dev/null and b/images/demo_2.png differ diff --git a/images/fps_wrt_blocksize_1.png b/images/fps_wrt_blocksize_1.png new file mode 100644 index 0000000..b7b29fd Binary files /dev/null and b/images/fps_wrt_blocksize_1.png differ diff --git a/images/fps_wrt_blocksize_2.png b/images/fps_wrt_blocksize_2.png new file mode 100644 index 0000000..cb45356 Binary files /dev/null and b/images/fps_wrt_blocksize_2.png differ diff --git a/images/fps_wrt_boids.png b/images/fps_wrt_boids.png new file mode 100644 index 0000000..d73c555 Binary files /dev/null and b/images/fps_wrt_boids.png differ diff --git a/images/fps_wrt_boids_nv.png b/images/fps_wrt_boids_nv.png new file mode 100644 index 0000000..33bb8f4 Binary files /dev/null and b/images/fps_wrt_boids_nv.png differ diff --git a/images/nsight_block.png b/images/nsight_block.png new file mode 100644 index 0000000..5867c1c Binary files /dev/null and b/images/nsight_block.png differ diff --git a/images/nsight_occupancy.png b/images/nsight_occupancy.png new file mode 100644 index 0000000..6edd1b5 Binary files /dev/null and b/images/nsight_occupancy.png differ diff --git a/src/kernel.cu b/src/kernel.cu index 74dffcb..d3f22bc 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -54,6 +54,8 @@ void checkCUDAError(const char *msg, int line = -1) { /*! Size of the starting area in simulation space. */ #define scene_scale 100.0f +#define SEARCH_27 1 + /*********************************************** * Kernel state (pointers are device pointers) * ***********************************************/ @@ -70,6 +72,8 @@ glm::vec3 *dev_pos; glm::vec3 *dev_vel1; glm::vec3 *dev_vel2; +glm::vec3* dev_pos_reshuffled; + // LOOK-2.1 - these are NOT allocated for you. You'll have to set up the thrust // pointers on your own too. 
@@ -157,7 +161,11 @@ void Boids::initSimulation(int N) { checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!"); // LOOK-2.1 computing grid params +#if SEARCH_27 + gridCellWidth = std::max(std::max(rule1Distance, rule2Distance), rule3Distance); +#else gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); +#endif int halfSideCount = (int)(scene_scale / gridCellWidth) + 1; gridSideCount = 2 * halfSideCount; @@ -169,10 +177,28 @@ void Boids::initSimulation(int N) { gridMinimum.z -= halfGridWidth; // TODO-2.1 TODO-2.3 - Allocate additional buffers here. + cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + + dev_thrust_particleArrayIndices = thrust::device_ptr(dev_particleArrayIndices); + dev_thrust_particleGridIndices = thrust::device_ptr(dev_particleGridIndices); + + // Part 2.3 additional buffers + cudaMalloc((void**)&dev_pos_reshuffled, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_pos_reshuffled failed!"); + cudaDeviceSynchronize(); } - /****************** * copyBoidsToVBO * ******************/ @@ -230,21 +256,74 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) * in the `pos` and `vel` arrays. */ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { - // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves - // Rule 2: boids try to stay a distance d away from each other - // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + // Rule 2: boids try to stay a distance d away from each other + // Rule 3: boids try to match the speed of surrounding boids + glm::vec3 v1(0.f, 0.f, 0.f), v2(0.f), v3(0.f); + glm::vec3 perceived_center(0.f); + int neighbor_count_1(0), neighbor_count_3(0); + for (int i = 0; i < N; ++i) + { + if (i != iSelf) + { + float dist2 = glm::dot(pos[i] - pos[iSelf], pos[i] - pos[iSelf]); + if (dist2 < rule1Distance * rule1Distance) + { + perceived_center += pos[i]; + ++neighbor_count_1; + } + if (dist2 < rule2Distance * rule2Distance) + { + v2 -= (pos[i] - pos[iSelf]); + } + if (dist2 < rule3Distance * rule3Distance) + { + v3 += vel[i]; + ++neighbor_count_3; + } + } + } + if (neighbor_count_1 > 0) + { + perceived_center /= static_cast(neighbor_count_1); + v1 = (perceived_center - pos[iSelf]); + } + if (neighbor_count_3 > 0) + { + v3 /= static_cast(neighbor_count_3); + } + + v1 *= rule1Scale; + v2 *= rule2Scale; + v3 *= rule3Scale; + //printf("v1: [%f, %f, %f]\n", v1.x, v2.y, v3.z); + return v1 + v2 + v3; } /** * TODO-1.2 implement basic flocking * For each of the `N` bodies, update its position based on its current velocity. 
*/ -__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, - glm::vec3 *vel1, glm::vec3 *vel2) { +__global__ void kernUpdateVelocityBruteForce(int N, + glm::vec3 *pos, + glm::vec3 *vel1, + glm::vec3 *vel2) +{ // Compute a new velocity based on pos and vel1 // Clamp the speed - // Record the new velocity into vel2. Question: why NOT vel1? + // Record the new velocity into vel2. Question: why NOT vel1? // write/read conflict + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) + { + return; + } + glm::vec3 velocity = vel1[index] + computeVelocityChange(N, index, pos, vel1); + float speed = glm::length(velocity); + if (speed > maxSpeed) + { + velocity = (velocity / speed) * maxSpeed; + } + vel2[index] = velocity; } /** @@ -282,6 +361,15 @@ __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) { return x + y * gridResolution + z * gridResolution * gridResolution; } +__device__ glm::vec4 positionToGridId(const glm::vec3& pos, const glm::vec3& grid_min, + float inverse_width, int grid_res) +{ + glm::vec3 local_pos = pos - grid_min; + glm::ivec3 grid_id_3d = glm::floor(local_pos * inverse_width); + + return { gridIndex3Dto1D(grid_id_3d.x, grid_id_3d.y, grid_id_3d.z, grid_res), grid_id_3d.x, grid_id_3d.y, grid_id_3d.z }; +} + __global__ void kernComputeIndices(int N, int gridResolution, glm::vec3 gridMin, float inverseCellWidth, glm::vec3 *pos, int *indices, int *gridIndices) { @@ -289,6 +377,15 @@ __global__ void kernComputeIndices(int N, int gridResolution, // - Label each boid with the index of its grid cell. // - Set up a parallel array of integer indices as pointers to the actual // boid data in pos and vel1/vel2 + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) return; + + int grid_id = positionToGridId(pos[index], gridMin, inverseCellWidth, gridResolution).x; + //printf("%d:[%f, %f, %f, %d, %d, %d]\n", grid_id, + // pos[index].x, pos[index].y, pos[index].z, + // grid_id_3d.x , grid_id_3d.y, grid_id_3d.z); + indices[index] = index; + gridIndices[index] = grid_id; } // LOOK-2.1 Consider how this could be useful for indicating that a cell @@ -301,87 +398,415 @@ __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) { } __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, - int *gridCellStartIndices, int *gridCellEndIndices) { - // TODO-2.1 - // Identify the start point of each cell in the gridIndices array. - // This is basically a parallel unrolling of a loop that goes - // "this index doesn't match the one before it, must be a new cell!" + int *gridCellStartIndices, int *gridCellEndIndices) +{ + // TODO-2.1 + // Identify the start point of each cell in the gridIndices array. + // This is basically a parallel unrolling of a loop that goes + // "this index doesn't match the one before it, must be a new cell!" 
+ int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index >= N) return; + + if (index == 0 || particleGridIndices[index] != particleGridIndices[index - 1]) + { + gridCellStartIndices[particleGridIndices[index]] = index; + //printf("gridCellStartIndices[%d] = %d\n", particleGridIndices[index], index); + } + + + if (index == N - 1 || particleGridIndices[index] != particleGridIndices[index + 1]) + gridCellEndIndices[particleGridIndices[index]] = index; } -__global__ void kernUpdateVelNeighborSearchScattered( - int N, int gridResolution, glm::vec3 gridMin, - float inverseCellWidth, float cellWidth, - int *gridCellStartIndices, int *gridCellEndIndices, - int *particleArrayIndices, - glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce - // the number of boids that need to be checked. - // - Identify the grid cell that this particle is in - // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. - // - Clamp the speed change before putting the new speed in vel2 +__device__ void UpdateVelNeighborSearchScatteredHelp(int grid, int gridCount, int iSelf, + int* gridCellStartIndices, int* gridCellEndIndices, + int* particleArrayIndices, + glm::vec3* pos, glm::vec3* vel, + glm::vec3& perceived_center, glm::vec3& v2, glm::vec3& v3, + int& neighbor_count_1, int& neighbor_count_3) +{ + if (grid < 0 || grid >= gridCount || gridCellStartIndices[grid] < 0) return; + + // foreach cell check particles + for (int i = gridCellStartIndices[grid]; i <= gridCellEndIndices[grid]; ++i) + { + if (i != iSelf) + { + float dist2 = glm::dot(pos[particleArrayIndices[i]] - pos[particleArrayIndices[iSelf]], + pos[particleArrayIndices[i]] - pos[particleArrayIndices[iSelf]]); + if (dist2 < rule1Distance * rule1Distance) + { + perceived_center += pos[particleArrayIndices[i]]; + ++neighbor_count_1; + } + if (dist2 < rule2Distance * rule2Distance) + { + v2 -= (pos[particleArrayIndices[i]] - pos[particleArrayIndices[iSelf]]); + } + if (dist2 < rule3Distance * rule3Distance) + { + v3 += vel[particleArrayIndices[i]]; + ++neighbor_count_3; + } + } + } } -__global__ void kernUpdateVelNeighborSearchCoherent( - int N, int gridResolution, glm::vec3 gridMin, - float inverseCellWidth, float cellWidth, - int *gridCellStartIndices, int *gridCellEndIndices, - glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, - // except with one less level of indirection. - // This should expect gridCellStartIndices and gridCellEndIndices to refer - // directly to pos and vel1. - // - Identify the grid cell that this particle is in - // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. - // DIFFERENCE: For best results, consider what order the cells should be - // checked in to maximize the memory benefits of reordering the boids data. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. 
- // - Clamp the speed change before putting the new speed in vel2 +__global__ void kernUpdateVelNeighborSearchScattered(int N, int gridCount, int gridResolution, glm::vec3 gridMin, + float inverseCellWidth, float cellWidth, + int *gridCellStartIndices, int *gridCellEndIndices, + int *particleArrayIndices, + glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) +{ + // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce + // the number of boids that need to be checked. + // - Identify the grid cell that this particle is in + // - Identify which cells may contain neighbors. This isn't always 8. + // - For each cell, read the start/end indices in the boid pointer array. + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. + // - Clamp the speed change before putting the new speed in vel2 + int index = threadIdx.x + (blockDim.x * blockIdx.x); + if (index >= N) return; + + glm::vec4 ids = positionToGridId(pos[particleArrayIndices[index]], gridMin, inverseCellWidth, gridResolution); + int grid_id = ids.x; + glm::vec3 grid_id_3d(ids.y, ids.z, ids.a); + glm::vec3 grid_center = gridMin + grid_id_3d * cellWidth + 0.5f * cellWidth; + + glm::vec3 offset = (pos[particleArrayIndices[index]] - grid_center) * inverseCellWidth * 2.f; + offset = glm::sign(offset) * glm::ceil(glm::abs(offset)); + glm::ivec3 ioffset = offset; + + glm::vec3 v1(0.f, 0.f, 0.f), v2(0.f), v3(0.f); + glm::vec3 perceived_center(0.f); + int neighbor_count_1(0), neighbor_count_3(0); + + for (int z = 0; z < 2; ++z) + { + for (int y = 0; y < 2; ++y) + { + for (int x = 0; x < 2; ++x) + { + int grid = grid_id + x * ioffset[0] + + y * gridResolution * ioffset[1] + + z * gridResolution * gridResolution * ioffset[2]; + + UpdateVelNeighborSearchScatteredHelp(grid, gridCount, index, + gridCellStartIndices, gridCellEndIndices, + particleArrayIndices, pos, vel1, + perceived_center, v2, v3, + neighbor_count_1, neighbor_count_3); + } + } + } + + if (neighbor_count_1 > 0) + { + perceived_center /= static_cast(neighbor_count_1); + v1 = (perceived_center - pos[particleArrayIndices[index]]); + } + if (neighbor_count_3 > 0) + { + v3 /= static_cast(neighbor_count_3); + } + + v1 *= rule1Scale; + v2 *= rule2Scale; + v3 *= rule3Scale; + + glm::vec3 velocity = vel1[particleArrayIndices[index]] + v1 + v2 + v3; + //glm::vec3 velocity = vel1[particleArrayIndices[index]] + computeVelocityChange(N, particleArrayIndices[index], pos, vel1); + float speed = glm::length(velocity); + if (speed > maxSpeed) + { + velocity = (velocity / speed) * maxSpeed; + } + vel2[particleArrayIndices[index]] = velocity; +} + +__global__ void kernReshuffledPosVel(int N, int* particleArrayIndices, + glm::vec3* pos, glm::vec3* vel, + glm::vec3* pos_reshuffled, glm::vec3* vel_reshuffled) +{ + // compute global id + int index = threadIdx.x + (blockDim.x * blockIdx.x); + if (index >= N) return; + + // reshuffle position and velocity based on sorted result + pos_reshuffled[index] = pos[particleArrayIndices[index]]; + vel_reshuffled[index] = vel[particleArrayIndices[index]]; +} + +__device__ void UpdateVelNeighborSearchCoherentHelp(int grid, int gridCount, int iSelf, const glm::vec3& position, + int* gridCellStartIndices, int* gridCellEndIndices, + glm::vec3* pos, glm::vec3* vel, + glm::vec3& perceived_center, glm::vec3& v2, glm::vec3& v3, + int& neighbor_count_1, int& neighbor_count_3) +{ + if (grid < 0 || grid >= gridCount || gridCellStartIndices[grid] < 0) return; + + // foreach cell 
check particles + for (int i = gridCellStartIndices[grid]; i <= gridCellEndIndices[grid]; ++i) + { + if (i != iSelf) + { + float dist2 = glm::dot(pos[i] - position, pos[i] - position); + if (dist2 < rule1Distance * rule1Distance) + { + perceived_center += pos[i]; + ++neighbor_count_1; + } + if (dist2 < rule2Distance * rule2Distance) + { + v2 -= (pos[i] - position); + } + if (dist2 < rule3Distance * rule3Distance) + { + v3 += vel[i]; + ++neighbor_count_3; + } + } + } +} + +__global__ void kernUpdateVelNeighborSearchCoherent(int N, int gridCount, int gridResolution, glm::vec3 gridMin, + float inverseCellWidth, float cellWidth, + int *gridCellStartIndices, int *gridCellEndIndices, + glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) +{ + // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, + // except with one less level of indirection. + // This should expect gridCellStartIndices and gridCellEndIndices to refer + // directly to pos and vel1. + // - Identify the grid cell that this particle is in + // - Identify which cells may contain neighbors. This isn't always 8. + // - For each cell, read the start/end indices in the boid pointer array. + // DIFFERENCE: For best results, consider what order the cells should be + // checked in to maximize the memory benefits of reordering the boids data. + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. + // - Clamp the speed change before putting the new speed in vel2 + + int index = threadIdx.x + (blockDim.x * blockIdx.x); + if (index >= N) return; + + const glm::vec3 position = pos[index]; + + glm::vec4 ids = positionToGridId(position, gridMin, inverseCellWidth, gridResolution); + int grid_id = ids.x; + glm::vec3 grid_id_3d(ids.y, ids.z, ids.a); + glm::vec3 grid_center = gridMin + grid_id_3d * cellWidth + 0.5f * cellWidth; + + glm::vec3 offset = (position - grid_center) * inverseCellWidth * 2.f; + offset = glm::sign(offset) * glm::ceil(glm::abs(offset)); + glm::ivec3 ioffset = offset; + + glm::vec3 v1(0.f, 0.f, 0.f), v2(0.f), v3(0.f); + glm::vec3 perceived_center(0.f); + int neighbor_count_1(0), neighbor_count_3(0); + +#if SEARCH_27 + for (int z = -1; z < 2; ++z) + { + for (int y = -1; y < 2; ++y) + { + for (int x = -1; x < 2; ++x) + { + int grid = grid_id + x * ioffset[0] + + y * gridResolution * ioffset[1] + + z * gridResolution * gridResolution * ioffset[2]; + + UpdateVelNeighborSearchCoherentHelp(grid, gridCount, index, position, + gridCellStartIndices, gridCellEndIndices, + pos, vel1, + perceived_center, v2, v3, + neighbor_count_1, neighbor_count_3); + } + } + } +#else + for (int z = 0; z < 2; ++z) + { + for (int y = 0; y < 2; ++y) + { + for (int x = 0; x < 2; ++x) + { + int grid = grid_id + x * ioffset[0] + + y * gridResolution * ioffset[1] + + z * gridResolution * gridResolution * ioffset[2]; + + UpdateVelNeighborSearchCoherentHelp(grid, gridCount, index, position, + gridCellStartIndices, gridCellEndIndices, + pos, vel1, + perceived_center, v2, v3, + neighbor_count_1, neighbor_count_3); + } + } + } +#endif + if (neighbor_count_1 > 0) + { + perceived_center /= static_cast(neighbor_count_1); + v1 = (perceived_center - position); + } + if (neighbor_count_3 > 0) + { + v3 /= static_cast(neighbor_count_3); + } + + v1 *= rule1Scale; + v2 *= rule2Scale; + v3 *= rule3Scale; + + glm::vec3 velocity = vel1[index] + v1 + v2 + v3; + + float speed = glm::length(velocity); + if (speed > maxSpeed) + { + velocity = (velocity / speed) * maxSpeed; 
+  }
+  vel2[index] = velocity;
 }
 
 /**
 * Step the entire N-body simulation by `dt` seconds.
 */
 void Boids::stepSimulationNaive(float dt) {
-  // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
-  // TODO-1.2 ping-pong the velocity buffers
+  // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("Naive Update Velocity failed!");
+  cudaDeviceSynchronize();
+
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+  checkCUDAErrorWithLine("Update Position failed!");
+  cudaDeviceSynchronize();
+
+  // TODO-1.2 ping-pong the velocity buffers
+  std::swap(dev_vel1, dev_vel2);
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
-  // TODO-2.1
-  // Uniform Grid Neighbor search using Thrust sort.
-  // In Parallel:
-  // - label each particle with its array index as well as its grid index.
-  //   Use 2x width grids.
-  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
-  //   are welcome to do a performance comparison.
-  // - Naively unroll the loop for finding the start and end indices of each
-  //   cell's data pointers in the array of boid indices
-  // - Perform velocity updates using neighbor search
-  // - Update positions
-  // - Ping-pong buffers as needed
+  // TODO-2.1
+  // Uniform Grid Neighbor search using Thrust sort.
+  // In Parallel:
+  // - label each particle with its array index as well as its grid index.
+  //   Use 2x width grids.
+  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
+  //   are welcome to do a performance comparison.
+  // - Naively unroll the loop for finding the start and end indices of each
+  //   cell's data pointers in the array of boid indices
+  // - Perform velocity updates using neighbor search
+  // - Update positions
+  // - Ping-pong buffers as needed
+
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  kernResetIntBuffer<<<(gridCellCount + blockSize - 1) / blockSize, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<(gridCellCount + blockSize - 1) / blockSize, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+  checkCUDAErrorWithLine("Reset Start, End failed!");
+  cudaDeviceSynchronize();
+
+  // label particles based on position
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum,
+                                                       gridInverseCellWidth,
+                                                       dev_pos,
+                                                       dev_particleArrayIndices,
+                                                       dev_particleGridIndices);
+  checkCUDAErrorWithLine("Label Particles failed!");
+  cudaDeviceSynchronize();
+
+  // unstable sort
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+  checkCUDAErrorWithLine("IdentifyCellStartEnd failed!");
+  cudaDeviceSynchronize();
+
+  kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridCellCount, gridSideCount, gridMinimum,
+                                                                         gridInverseCellWidth, gridCellWidth,
+                                                                         dev_gridCellStartIndices, dev_gridCellEndIndices,
+                                                                         dev_particleArrayIndices,
+                                                                         dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("Scattered Grid Update Velocity failed!");
+  cudaDeviceSynchronize();
+
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+  checkCUDAErrorWithLine("Update Position failed!");
+
cudaDeviceSynchronize(); + + // TODO-1.2 ping-pong the velocity buffers + std::swap(dev_vel1, dev_vel2); } void Boids::stepSimulationCoherentGrid(float dt) { - // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid - // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. - // In Parallel: - // - Label each particle with its array index as well as its grid index. - // Use 2x width grids - // - Unstable key sort using Thrust. A stable sort isn't necessary, but you - // are welcome to do a performance comparison. - // - Naively unroll the loop for finding the start and end indices of each - // cell's data pointers in the array of boid indices - // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all - // the particle data in the simulation array. - // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED - // - Perform velocity updates using neighbor search - // - Update positions - // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid + // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. + // In Parallel: + // - Label each particle with its array index as well as its grid index. + // Use 2x width grids + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you + // are welcome to do a performance comparison. + // - Naively unroll the loop for finding the start and end indices of each + // cell's data pointers in the array of boid indices + // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all + // the particle data in the simulation array. + // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED + // - Perform velocity updates using neighbor search + // - Update positions + // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. 
+
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  kernResetIntBuffer<<<(gridCellCount + blockSize - 1) / blockSize, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<(gridCellCount + blockSize - 1) / blockSize, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+  checkCUDAErrorWithLine("Reset Start, End failed!");
+  cudaDeviceSynchronize();
+
+  // label particles based on position
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum,
+                                                       gridInverseCellWidth,
+                                                       dev_pos,
+                                                       dev_particleArrayIndices,
+                                                       dev_particleGridIndices);
+  checkCUDAErrorWithLine("Label Particles failed!");
+  cudaDeviceSynchronize();
+
+  // unstable sort
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  // TODO: test stable sort
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+  checkCUDAErrorWithLine("IdentifyCellStartEnd failed!");
+  cudaDeviceSynchronize();
+
+  // reshuffle position and velocity
+  kernReshuffledPosVel<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleArrayIndices,
+                                                         dev_pos, dev_vel1,
+                                                         dev_pos_reshuffled, dev_vel2);
+  checkCUDAErrorWithLine("Reshuffle position and velocity failed!");
+  cudaDeviceSynchronize();
+
+  std::swap(dev_pos, dev_pos_reshuffled);
+  std::swap(dev_vel1, dev_vel2);
+
+  kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridCellCount, gridSideCount, gridMinimum,
+                                                                        gridInverseCellWidth, gridCellWidth,
+                                                                        dev_gridCellStartIndices, dev_gridCellEndIndices,
+                                                                        dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("Coherent Grid Update Velocity failed!");
+  cudaDeviceSynchronize();
+
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+  checkCUDAErrorWithLine("Update Position failed!");
+  cudaDeviceSynchronize();
+
+  std::swap(dev_vel1, dev_vel2);
 }
 
 void Boids::endSimulation() {
@@ -390,6 +815,12 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+
+  cudaFree(dev_pos_reshuffled);
 }
 
 void Boids::unitTest() {
@@ -435,6 +866,7 @@ void Boids::unitTest() {
   // Wrap device vectors in thrust iterators for use with thrust.
   thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
   thrust::device_ptr<int> dev_thrust_values(dev_intValues);
+  // LOOK-2.1 Example for using thrust::sort_by_key
   thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
 
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..fd57c66 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,8 +14,14 @@
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
+
+// String helper macros
+#define JOIN(a, b) a##b
+#define JOIN2(a, b) JOIN(a, b)
+#define STR(a) #a
+#define STR2(a) STR(a)
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
 const int N_FOR_VIS = 5000;
@@ -42,6 +48,7 @@ int main(int argc, char* argv[]) {
 std::string deviceName;
 GLFWwindow *window;
+bool pause = false;
 
 /**
 * Initialization of CUDA and GLFW.
@@ -92,6 +99,7 @@ bool init(int argc, char **argv) { glfwSetKeyCallback(window, keyCallback); glfwSetCursorPosCallback(window, mousePositionCallback); glfwSetMouseButtonCallback(window, mouseButtonCallback); + glfwSetWindowSizeCallback(window, windowResizeCallback); glewExperimental = GL_TRUE; if (glewInit() != GLEW_OK) { @@ -167,9 +175,9 @@ void initShaders(GLuint * program) { GLint location; program[PROG_BOID] = glslUtility::createProgram( - "shaders/boid.vert.glsl", - "shaders/boid.geom.glsl", - "shaders/boid.frag.glsl", attributeLocations, 2); + STR2(JOIN2(PROJ_BASE_PATH, /shaders/boid.vert.glsl)), + STR2(JOIN2(PROJ_BASE_PATH, /shaders/boid.geom.glsl)), + STR2(JOIN2(PROJ_BASE_PATH, /shaders/boid.frag.glsl)), attributeLocations, 2); glUseProgram(program[PROG_BOID]); if ((location = glGetUniformLocation(program[PROG_BOID], "u_projMatrix")) != -1) { @@ -217,9 +225,9 @@ void initShaders(GLuint * program) { double timebase = 0; int frame = 0; - Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure - // your CUDA development setup is ready to go. - + //Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure + // // your CUDA development setup is ready to go. + while (!glfwWindowShouldClose(window)) { glfwPollEvents(); @@ -231,9 +239,11 @@ void initShaders(GLuint * program) { timebase = time; frame = 0; } - - runCUDA(); - + if (!pause) + { + runCUDA(); + } + std::ostringstream ss; ss << "["; ss.precision(1); @@ -269,6 +279,10 @@ void initShaders(GLuint * program) { if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS) { glfwSetWindowShouldClose(window, GL_TRUE); } + if (key == GLFW_KEY_P && action == GLFW_PRESS) + { + pause = !pause; + } } void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods) { @@ -294,6 +308,14 @@ void initShaders(GLuint * program) { lastY = ypos; } + void windowResizeCallback(GLFWwindow* window, int w, int h) + { + width = w; + height = h; + updateCamera(); + glViewport(0, 0, width, height); + } + void updateCamera() { cameraPosition.x = zoom * sin(phi) * sin(theta); cameraPosition.z = zoom * cos(theta); diff --git a/src/main.hpp b/src/main.hpp index 88e9df7..df91a3b 100644 --- a/src/main.hpp +++ b/src/main.hpp @@ -69,6 +69,7 @@ void errorCallback(int error, const char *description); void keyCallback(GLFWwindow* window, int key, int scancode, int action, int mods); void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods); void mousePositionCallback(GLFWwindow* window, double xpos, double ypos); +void windowResizeCallback(GLFWwindow* window, int width, int height); void updateCamera(); void runCUDA();