diff --git a/README.md b/README.md
index d63a6a1..a51b5b9 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,59 @@
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture,
-Project 1 - Flocking**
+![](images/top_image.gif)
+# Project 1 - Flocking
+**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Zheyuan Xie
+  * [LinkedIn](https://www.linkedin.com/in/zheyuan-xie)
+  * [GitHub](https://github.com/ZheyuanXie)
+  * [Personal Website](http://errorspace.cn)
+* Tested on: Windows 10 Pro, i7-7700HQ @ 2.80GHz, 16GB, GTX 1050 2GB (Dell XPS 15 9560)
 
-### (TODO: Your README)
+## Screenshots
+| 5,000 Boids | 100,000 Boids |
+|--|--|
+|![Number of boids: 5k](images/5k_compressed.gif) | ![Number of boids: 100k](images/100k_compressed.gif) |
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+## Performance Analysis
+### 1. Number of Boids
+The figure below shows the relationship between performance and boid population. We can observe:
+ - For all three methods, performance decreases as the number of boids increases.
+ - The naive method has the best performance when the number of boids is small (below roughly 1.5k), but its framerate decays rapidly as the population grows.
+ - The coherent uniform grid has the best performance when the number of boids is large.
+ - Visualization generally decreases the framerate.
+ - When the number of boids is small, visualization is the performance bottleneck; when the number of boids is large, the simulation computation takes over.
+
+ ![](images/population_fps.jpg)
+
+ In the naive method, each boid checks all N-1 other boids, so the time complexity is `O(N^2)`. The uniform grid methods are less sensitive to the number of boids because each boid only checks the boids in neighboring cells within the rule distances.
+
+### 2. Block Size
+The figure below shows the performance vs. block size curve with visualization turned off. We can observe:
+
+ - For all three methods, a jump in performance is observed when the block size is increased from 16 to 32.
+ - Further increasing the block size from 32 to 512 has no significant impact on performance.
+
+ ![](images/blocksize_fps.jpg)
+
+ This is because the warp size is 32: a block with fewer than 32 threads leaves lanes in each warp inactive. Once the block size reaches 32, every warp is fully occupied, so increasing the block size further does not increase the degree of parallelism.
+
+### 3. Coherent Uniform Grid
+Compared with the scattered uniform grid, the coherent uniform grid has better performance, and the gap widens as the number of boids increases. This outcome is expected since:
+ - There is one less level of indirection, and therefore one less global-memory access (the lookup through `dev_particleArrayIndices`).
+ - Reshuffling `dev_pos` and `dev_vel1` lets boids in the same cell occupy contiguous memory, increasing the chance of cache hits.
+
+### 4. Cell Width & Number of Neighboring Cells
+Using half the cell width slightly increases performance.
+
+| |8 Cells, Full Width|27 Cells, Half Width|
+|--|--|--|
+|5k Boids|1256 FPS|1366 FPS|
+|10k Boids|798 FPS|1001 FPS|
+|50k Boids|139 FPS|151 FPS|
+
+Though the number of cells that needs to be checked is larger, the actual volume that is checked is smaller, and therefore fewer boids are checked. Say the neighborhood distance is 1 unit:
+ - Checking 8 cells of 2-unit width covers a volume of 4\*4\*4 = 64 cubic units.
+ - Checking 27 cells of 1-unit width covers a volume of 3\*3\*3 = 27 cubic units.
+
+ Therefore, checking 27 cells with half the cell width means checking fewer boids and faster computation; the sketch below illustrates the arithmetic.
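+
+The cell-volume arithmetic can be sanity-checked with a few lines of standalone C++ (an illustrative sketch, not part of the simulation code):
+
+```cpp
+// Compare the volume searched per boid for the two grid configurations,
+// given a neighborhood radius R (1 unit in the discussion above).
+#include <cstdio>
+
+int main() {
+  const float R = 1.0f;
+  float fullWidthVolume = (4 * R) * (4 * R) * (4 * R);  // 8 cells of width 2R
+  float halfWidthVolume = (3 * R) * (3 * R) * (3 * R);  // 27 cells of width R
+  std::printf("8 full-width cells:  %.0f cubic units\n", fullWidthVolume);   // 64
+  std::printf("27 half-width cells: %.0f cubic units\n", halfWidthVolume);   // 27
+  return 0;
+}
+```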
diff --git a/images/100k.gif b/images/100k.gif
new file mode 100644
index 0000000..4fac3fe
Binary files /dev/null and b/images/100k.gif differ
diff --git a/images/100k_compressed.gif b/images/100k_compressed.gif
new file mode 100644
index 0000000..cd9b49c
Binary files /dev/null and b/images/100k_compressed.gif differ
diff --git a/images/5k_compressed.gif b/images/5k_compressed.gif
new file mode 100644
index 0000000..13f10f8
Binary files /dev/null and b/images/5k_compressed.gif differ
diff --git a/images/blocksize_curve.m b/images/blocksize_curve.m
new file mode 100644
index 0000000..3488344
--- /dev/null
+++ b/images/blocksize_curve.m
@@ -0,0 +1,11 @@
+blocksize = [16 32 64 128 256 512];
+naive = [16.2377 31.1301 39.1084 36.5137 38.2966 38.1733];
+scatter = [390.202 437.727 433.237 427.975 435.234 430.058];
+coherent = [670.605 913.866 920.439 940.009 928.573 916.015];
+plot(blocksize, naive, '.-', 'lineWidth', 1); hold on
+plot(blocksize, scatter, '.-', 'lineWidth', 1);
+plot(blocksize, coherent, '.-', 'lineWidth', 1);
+legend('naive method', 'scatter grid', 'coherent grid')
+xlabel('Blocksize')
+ylabel('FPS')
+title('FPS vs. Blocksize')
\ No newline at end of file
diff --git a/images/blocksize_fps.jpg b/images/blocksize_fps.jpg
new file mode 100644
index 0000000..b4e62b9
Binary files /dev/null and b/images/blocksize_fps.jpg differ
diff --git a/images/population_curve.m b/images/population_curve.m
new file mode 100644
index 0000000..9ad0fcb
--- /dev/null
+++ b/images/population_curve.m
@@ -0,0 +1,19 @@
+number_of_boids = [1 5 10 25 50 75 100];
+naivev = [154.764 147.047 72.72 15.5 4.01 1.85 1.00];
+scatterv = [133.819 152.306 152.116 150.294 103.368 60.0672 43.5686];
+coherentv = [150.787 150.473 166.157 150.463 170.284 123.162 157.798];
+naive = [1725 448 118 23.5 6.5 3 1.6];
+scatter = [1470 1172.5 800 360 133.225 70.5 47.332];
+coherent = [1441 1210 1164 760 402 275 212.1];
+
+plot(number_of_boids, naivev, '--b', 'lineWidth', 1, 'Color', '#0072BD'); hold on
+plot(number_of_boids, naive, '-b', 'lineWidth', 1, 'Color', '#0072BD');
+plot(number_of_boids, scatterv, '--m', 'lineWidth', 1, 'Color', '#D95319');
+plot(number_of_boids, scatter, '-m', 'lineWidth', 1, 'Color', '#D95319');
+plot(number_of_boids, coherentv, '--r', 'lineWidth', 1, 'Color', '#77AC30');
+plot(number_of_boids, coherent, '-r', 'lineWidth', 1, 'Color', '#77AC30');
+
+legend('naive (vis)', 'naive (no vis)', 'scatter (vis)', 'scatter (no vis)', 'coherent (vis)', 'coherent (no vis)')
+xlabel('Number of Boids (Thousands)')
+ylabel('FPS')
+title('FPS vs.
Boid Population') \ No newline at end of file diff --git a/images/population_fps.jpg b/images/population_fps.jpg new file mode 100644 index 0000000..ac3ca00 Binary files /dev/null and b/images/population_fps.jpg differ diff --git a/images/top_image.gif b/images/top_image.gif new file mode 100644 index 0000000..913a096 Binary files /dev/null and b/images/top_image.gif differ diff --git a/src/kernel.cu b/src/kernel.cu index 74dffcb..ff54c92 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -54,6 +54,9 @@ void checkCUDAError(const char *msg, int line = -1) { /*! Size of the starting area in simulation space. */ #define scene_scale 100.0f +// Use half cell width? +//#define USE_HALF_CELL_WIDTH + /*********************************************** * Kernel state (pointers are device pointers) * ***********************************************/ @@ -85,6 +88,8 @@ int *dev_gridCellEndIndices; // to this cell? // TODO-2.3 - consider what additional buffers you might need to reshuffle // the position and velocity data to be coherent within cells. +glm::vec3 *dev_shuffled_pos; +glm::vec3 *dev_shuffled_vel; // LOOK-2.1 - Grid parameters based on simulation parameters. // These are automatically computed for you in Boids::initSimulation @@ -157,7 +162,11 @@ void Boids::initSimulation(int N) { checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!"); // LOOK-2.1 computing grid params +#ifdef USE_HALF_CELL_WIDTH + gridCellWidth = std::max(std::max(rule1Distance, rule2Distance), rule3Distance); +#else gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); +#endif int halfSideCount = (int)(scene_scale / gridCellWidth) + 1; gridSideCount = 2 * halfSideCount; @@ -169,6 +178,24 @@ void Boids::initSimulation(int N) { gridMinimum.z -= halfGridWidth; // TODO-2.1 TODO-2.3 - Allocate additional buffers here. + cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + + cudaMalloc((void**)&dev_shuffled_pos, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_shuffled_pos failed!"); + + cudaMalloc((void**)&dev_shuffled_vel, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_shuffled_vel failed!"); + cudaDeviceSynchronize(); } @@ -230,10 +257,50 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) * in the `pos` and `vel` arrays. 
*/ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { + glm::vec3 self_position = pos[iSelf]; + + int rule1_cnt = 0, rule3_cnt = 0; + glm::vec3 perceived_center(0.0f, 0.0f, 0.0f); + glm::vec3 c(0.0f, 0.0f, 0.0f); + glm::vec3 perceived_velocity(0.0f, 0.0f, 0.0f); + + for (int i = 0; i < N; i++) { + if (i == iSelf) { + continue; + } + glm::vec3 target_position = pos[i]; + float distance = glm::distance(self_position, target_position); + if (distance < rule1Distance) { + perceived_center += target_position; + rule1_cnt++; + } + if (distance < rule2Distance) { + c -= (target_position - self_position); + } + if (distance < rule3Distance) { + perceived_velocity += vel[i]; + rule3_cnt++; + } + } + + glm::vec3 velocity_change(0.0f, 0.0f, 0.0f); + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (rule1_cnt > 0) { + perceived_center /= rule1_cnt; + velocity_change += (perceived_center - self_position) * rule1Scale; + } + // Rule 2: boids try to stay a distance d away from each other + velocity_change += c * rule2Scale; + // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + if (rule3_cnt > 0) { + perceived_velocity /= rule3_cnt; + velocity_change += perceived_velocity * rule3Scale; + } + + return velocity_change; } /** @@ -242,9 +309,22 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po */ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // Compute a new velocity based on pos and vel1 - // Clamp the speed - // Record the new velocity into vel2. Question: why NOT vel1? + // Compute a new velocity based on pos and vel1 + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + glm::vec3 velocity_change = computeVelocityChange(N, index, pos, vel1); + glm::vec3 new_velocity = vel1[index] + velocity_change; + + // Clamp the speed + float speed = glm::length(new_velocity); + if (speed > maxSpeed) { + new_velocity = new_velocity / speed * maxSpeed; + } + + // Record the new velocity into vel2. Question: why NOT vel1? + vel2[index] = new_velocity; } /** @@ -287,8 +367,17 @@ __global__ void kernComputeIndices(int N, int gridResolution, glm::vec3 *pos, int *indices, int *gridIndices) { // TODO-2.1 // - Label each boid with the index of its grid cell. - // - Set up a parallel array of integer indices as pointers to the actual + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + glm::vec3 gridIndex3D = glm::floor((pos[index] - gridMin) * inverseCellWidth); + int grid_index = gridIndex3Dto1D(gridIndex3D.x, gridIndex3D.y, gridIndex3D.z, gridResolution); + gridIndices[index] = grid_index; + + // - Set up a parallel array of integer indices as pointers to the actual // boid data in pos and vel1/vel2 + indices[index] = index; } // LOOK-2.1 Consider how this could be useful for indicating that a cell @@ -306,6 +395,25 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, // Identify the start point of each cell in the gridIndices array. // This is basically a parallel unrolling of a loop that goes // "this index doesn't match the one before it, must be a new cell!" 
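+  // Worked example (illustrative): with sorted particleGridIndices
+  // [2, 2, 5, 5, 7, 7], thread 0 writes gridCellStartIndices[2] = 0, thread 1
+  // sees the key change at the next slot and writes gridCellEndIndices[2] = 1,
+  // thread 2 sees the key change at the previous slot and writes
+  // gridCellStartIndices[5] = 2, and so on, so every non-empty cell records the
+  // first and last slot it occupies in the sorted boid array.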
+ int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + int gridIndex = particleGridIndices[index]; + if (index == 0) { + gridCellStartIndices[gridIndex] = index; + return; + } + if (index == N - 1) { + gridCellEndIndices[gridIndex] = index; + return; + } + if (gridIndex != particleGridIndices[index - 1]) { + gridCellStartIndices[gridIndex] = index; + } + if (gridIndex != particleGridIndices[index + 1]) { + gridCellEndIndices[gridIndex] = index; + } } __global__ void kernUpdateVelNeighborSearchScattered( @@ -317,11 +425,118 @@ __global__ void kernUpdateVelNeighborSearchScattered( // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce // the number of boids that need to be checked. // - Identify the grid cell that this particle is in + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + glm::vec3 relative_pos = (pos[index] - gridMin) * inverseCellWidth; + glm::vec3 gridIndex3D = glm::floor(relative_pos); + // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. + +#ifndef USE_HALF_CELL_WIDTH + int x_direction = (glm::round(relative_pos.x - gridIndex3D.x) == 1) ? 1 : -1; + int y_direction = (glm::round(relative_pos.y - gridIndex3D.y) == 1) ? 1 : -1; + int z_direction = (glm::round(relative_pos.z - gridIndex3D.z) == 1) ? 1 : -1; +#endif + + glm::vec3 self_position = pos[index]; + glm::vec3 perceived_center(0.0f, 0.0f, 0.0f); + glm::vec3 c(0.0f, 0.0f, 0.0f); + glm::vec3 perceived_velocity(0.0f, 0.0f, 0.0f); + int rule1_cnt = 0, rule3_cnt = 0; + +#ifdef USE_HALF_CELL_WIDTH + for (int dx = -1; dx <= 1; dx++) { + int x_index = gridIndex3D.x + dx; + if (x_index < 0 || x_index > gridResolution) { + continue; + } + for (int dy = -1; dy <= 1; dy++) { + int y_index = gridIndex3D.y + dy; + if (y_index < 0 || y_index > gridResolution) { + continue; + } + for (int dz = -1; dz <= 1; dz++) { + int z_index = gridIndex3D.z + dz; + if (z_index < 0 || z_index > gridResolution) { + continue; + } +#else + for (int dx = 0; dx <= 1; dx++) { + int x_index = gridIndex3D.x + dx * x_direction; + if (x_index < 0 || x_index > gridResolution) { + continue; + } + for (int dy = 0; dy <= 1; dy++) { + int y_index = gridIndex3D.y + dy * y_direction; + if (y_index < 0 || y_index > gridResolution) { + continue; + } + for (int dz = 0; dz <= 1; dz++) { + int z_index = gridIndex3D.z + dz * z_direction; + if (z_index < 0 || z_index > gridResolution) { + continue; + } +#endif + int gridIndex1D = gridIndex3Dto1D(x_index, y_index, z_index, gridResolution); + + // - For each cell, read the start/end indices in the boid pointer array. 
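+        // A start index of -1 is the sentinel written by kernResetIntBuffer,
+        // meaning no boid was binned into this cell, so it can be skipped.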
+ int startIndex = gridCellStartIndices[gridIndex1D]; + if (startIndex == -1) { + continue; + } + int endIndex = gridCellEndIndices[gridIndex1D]; + + for (int SortedIndex = startIndex; SortedIndex <= endIndex; SortedIndex++) { + int targetIndex = particleArrayIndices[SortedIndex]; + if (targetIndex == index) { + continue; + } + glm::vec3 target_position = pos[targetIndex]; + float distance = glm::distance(self_position, target_position); + if (distance < rule1Distance) { + perceived_center += target_position; + rule1_cnt++; + } + if (distance < rule2Distance) { + c -= (target_position - self_position); + } + if (distance < rule3Distance) { + perceived_velocity += vel1[targetIndex]; + rule3_cnt++; + } + } + } + } + } + // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. + glm::vec3 velocity_change(0.0f, 0.0f, 0.0f); + + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (rule1_cnt > 0) { + perceived_center /= rule1_cnt; + velocity_change += (perceived_center - self_position) * rule1Scale; + } + + // Rule 2: boids try to stay a distance d away from each other + velocity_change += c * rule2Scale; + + // Rule 3: boids try to match the speed of surrounding boids + if (rule3_cnt > 0) { + perceived_velocity /= rule3_cnt; + velocity_change += perceived_velocity * rule3Scale; + } + // - Clamp the speed change before putting the new speed in vel2 + glm::vec3 new_velocity = vel1[index] + velocity_change; + float speed = glm::length(new_velocity); + if (speed > maxSpeed) { + new_velocity = new_velocity / speed * maxSpeed; + } + vel2[index] = new_velocity; } __global__ void kernUpdateVelNeighborSearchCoherent( @@ -334,13 +549,128 @@ __global__ void kernUpdateVelNeighborSearchCoherent( // This should expect gridCellStartIndices and gridCellEndIndices to refer // directly to pos and vel1. // - Identify the grid cell that this particle is in + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + glm::vec3 relative_pos = (pos[index] - gridMin) * inverseCellWidth; + glm::vec3 gridIndex3D = glm::floor(relative_pos); + // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. - // DIFFERENCE: For best results, consider what order the cells should be - // checked in to maximize the memory benefits of reordering the boids data. + +#ifndef USE_HALF_CELL_WIDTH + int x_direction = (glm::round(relative_pos.x - gridIndex3D.x) == 1) ? 1 : -1; + int y_direction = (glm::round(relative_pos.y - gridIndex3D.y) == 1) ? 1 : -1; + int z_direction = (glm::round(relative_pos.z - gridIndex3D.z) == 1) ? 
1 : -1; +#endif + + glm::vec3 self_position = pos[index]; + glm::vec3 perceived_center(0.0f, 0.0f, 0.0f); + glm::vec3 c(0.0f, 0.0f, 0.0f); + glm::vec3 perceived_velocity(0.0f, 0.0f, 0.0f); + int rule1_cnt = 0, rule3_cnt = 0; + +#ifdef USE_HALF_CELL_WIDTH + for (int dx = -1; dx <= 1; dx++) { + int x_index = gridIndex3D.x + dx; + if (x_index < 0 || x_index > gridResolution) { + continue; + } + for (int dy = -1; dy <= 1; dy++) { + int y_index = gridIndex3D.y + dy; + if (y_index < 0 || y_index > gridResolution) { + continue; + } + for (int dz = -1; dz <= 1; dz++) { + int z_index = gridIndex3D.z + dz; + if (z_index < 0 || z_index > gridResolution) { + continue; + } +#else + for (int dx = 0; dx <= 1; dx++) { + int x_index = gridIndex3D.x + dx * x_direction; + if (x_index < 0 || x_index > gridResolution) { + continue; + } + for (int dy = 0; dy <= 1; dy++) { + int y_index = gridIndex3D.y + dy * y_direction; + if (y_index < 0 || y_index > gridResolution) { + continue; + } + for (int dz = 0; dz <= 1; dz++) { + int z_index = gridIndex3D.z + dz * z_direction; + if (z_index < 0 || z_index > gridResolution) { + continue; + } + int gridIndex1D = gridIndex3Dto1D(x_index, y_index, z_index, gridResolution); +#endif + // - For each cell, read the start/end indices in the boid pointer array. + int startIndex = gridCellStartIndices[gridIndex1D]; + if (startIndex == -1) { + continue; + } + int endIndex = gridCellEndIndices[gridIndex1D]; + + for (int targetIndex = startIndex; targetIndex <= endIndex; targetIndex++) { + if (targetIndex == index) { + continue; + } + glm::vec3 target_position = pos[targetIndex]; + float distance = glm::distance(self_position, target_position); + if (distance < rule1Distance) { + perceived_center += target_position; + rule1_cnt++; + } + if (distance < rule2Distance) { + c -= (target_position - self_position); + } + if (distance < rule3Distance) { + perceived_velocity += vel1[targetIndex]; + rule3_cnt++; + } + } + } + } + } + // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. + glm::vec3 velocity_change(0.0f, 0.0f, 0.0f); + + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (rule1_cnt > 0) { + perceived_center /= rule1_cnt; + velocity_change += (perceived_center - self_position) * rule1Scale; + } + + // Rule 2: boids try to stay a distance d away from each other + velocity_change += c * rule2Scale; + + // Rule 3: boids try to match the speed of surrounding boids + if (rule3_cnt > 0) { + perceived_velocity /= rule3_cnt; + velocity_change += perceived_velocity * rule3Scale; + } + // - Clamp the speed change before putting the new speed in vel2 + glm::vec3 new_velocity = vel1[index] + velocity_change; + float speed = glm::length(new_velocity); + if (speed > maxSpeed) { + new_velocity = new_velocity / speed * maxSpeed; + } + vel2[index] = new_velocity; +} + +__global__ void kernReshuffleParticleData( + int N, int *particleArrayIndices, glm::vec3 *pos, glm::vec3 *shuffled_pos, + glm::vec3 *vel, glm::vec3 *shuffled_vel) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + int pIndex = particleArrayIndices[index]; + shuffled_pos[index] = pos[pIndex]; + shuffled_vel[index] = vel[pIndex]; } /** @@ -348,7 +678,12 @@ __global__ void kernUpdateVelNeighborSearchCoherent( */ void Boids::stepSimulationNaive(float dt) { // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. 
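+  // New velocities are written to dev_vel2 so that every thread reads an
+  // unmodified snapshot of dev_vel1; dev_vel2 is copied back into dev_vel1
+  // after the update (ping-pong), keeping the result independent of thread order.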
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
   // TODO-1.2 ping-pong the velocity buffers
+  cudaMemcpy(dev_vel1, dev_vel2, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -357,31 +692,76 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // TODO-2.1
   // Uniform Grid Neighbor search using Thrust sort.
   // In Parallel:
   // - label each particle with its array index as well as its grid index.
   //   Use 2x width grids.
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>
+    (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
   // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
   //   are welcome to do a performance comparison.
+  thrust::device_ptr<int> dev_thrust_particleArrayIndices(dev_particleArrayIndices);
+  thrust::device_ptr<int> dev_thrust_particleGridIndices(dev_particleGridIndices);
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
   // - Naively unroll the loop for finding the start and end indices of each
   //   cell's data pointers in the array of boid indices
+  dim3 fullBlocksPerGrid2((gridCellCount + blockSize - 1) / blockSize);
+  kernResetIntBuffer<<<fullBlocksPerGrid2, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<fullBlocksPerGrid2, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
   // - Perform velocity updates using neighbor search
+  kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>
+    (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices,
+     dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+
   // - Update positions
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
   // - Ping-pong buffers as needed
+  cudaMemcpy(dev_vel1, dev_vel2, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
   // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
   // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
   // In Parallel:
-  // - Label each particle with its array index as well as its grid index.
-  //   Use 2x width grids
+  // - label each particle with its array index as well as its grid index.
+  //   Use 2x width grids.
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>
+    (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
   // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
   //   are welcome to do a performance comparison.
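+  // thrust::sort_by_key sorts dev_particleGridIndices in place and applies the
+  // same permutation to dev_particleArrayIndices, so boids that share a grid
+  // cell end up adjacent in the sorted index arrays.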
+  thrust::device_ptr<int> dev_thrust_particleArrayIndices(dev_particleArrayIndices);
+  thrust::device_ptr<int> dev_thrust_particleGridIndices(dev_particleGridIndices);
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
   // - Naively unroll the loop for finding the start and end indices of each
   //   cell's data pointers in the array of boid indices
+  dim3 fullBlocksPerGrid2((gridCellCount + blockSize - 1) / blockSize);
+  kernResetIntBuffer<<<fullBlocksPerGrid2, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<fullBlocksPerGrid2, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
   // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
   //   the particle data in the simulation array.
   //   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
+  kernReshuffleParticleData<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleArrayIndices,
+    dev_pos, dev_shuffled_pos, dev_vel1, dev_shuffled_vel);
+  cudaMemcpy(dev_pos, dev_shuffled_pos, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
+  cudaMemcpy(dev_vel1, dev_shuffled_vel, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
+
   // - Perform velocity updates using neighbor search
+  kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid, blockSize>>>
+    (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices,
+     dev_gridCellEndIndices, dev_pos, dev_vel1, dev_vel2);
+
   // - Update positions
-  // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+  // - Ping-pong buffers as needed
+  cudaMemcpy(dev_vel1, dev_vel2, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
 }
 
 void Boids::endSimulation() {
@@ -390,6 +770,12 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+  cudaFree(dev_shuffled_pos);
+  cudaFree(dev_shuffled_vel);
 }
 
 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..c94fe89 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,8 +14,8 @@
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
 const int N_FOR_VIS = 5000;
@@ -216,6 +216,10 @@ void initShaders(GLuint * program) {
 double fps = 0;
 double timebase = 0;
 int frame = 0;
+
+#define FPS_BUFFER_SIZE 5
+int fps_cnt = 0;
+double fps_buffer[FPS_BUFFER_SIZE];
 
 Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure
                    // your CUDA development setup is ready to go.
@@ -230,6 +234,15 @@ void initShaders(GLuint * program) {
      fps = frame / (time - timebase);
      timebase = time;
      frame = 0;
+
+      fps_buffer[fps_cnt % FPS_BUFFER_SIZE] = fps;
+      fps_cnt++;
+      double fps_sum = 0.0;
+      for (int i = 0; i < FPS_BUFFER_SIZE; i++) {
+        fps_sum += fps_buffer[i];
+      }
+      double fps_avg = (fps_cnt > FPS_BUFFER_SIZE) ?
fps_sum / FPS_BUFFER_SIZE : fps_sum / fps_cnt; + std::cout << "CNT:" << fps_cnt << " AVG:" << fps_avg << std::endl; } runCUDA(); diff --git a/src/main.hpp b/src/main.hpp index 88e9df7..37bde40 100644 --- a/src/main.hpp +++ b/src/main.hpp @@ -36,8 +36,8 @@ const float fovy = (float) (PI / 4); const float zNear = 0.10f; const float zFar = 10.0f; // LOOK-1.2: for high DPI displays, you may want to double these settings. -int width = 1280; -int height = 720; +int width = 1280 * 2; +int height = 720 * 2; int pointSize = 2; // For camera controls