diff --git a/README.md b/README.md
index c3912f9..dabbda3 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-CUDA Rasterizer
+Tile-based CUDA Rasterizer
 ===============
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
@@ -6,11 +6,25 @@ CUDA Rasterizer
 * Tongbo Sui
 * Tested on: Windows 10, i5-3320M @ 2.60GHz 8GB, NVS 5400M 2GB (Personal)
 
+## Overview
+An efficient CUDA rasterizer with two pipeline options. By default it uses tile-based rendering, but also supports scanline rendering (`L` to switch between).
+
 ## Video Demo
 
+###### Tile-based render demo
+[![](img/splash-tile.png)](https://youtu.be/xNBfuONQN48)
+
+###### Scanline demo
 [![](img/Splash.png)](https://vimeo.com/141638182)
 
-## Pipeline Overview
+## Quick Jump List  
+* [Scanline pipeline](#pipeline-overview-scanline)
+* [Tile-based pipeline](#pipeline-overview-tile-based)
+* [Misc features](#misc-features)
+* [Performance analysis](#performance-analysis)
+* [References](#references)
+
+## Pipeline Overview (Scanline)
 
 * Vertex shading
   * Vertex shader with perspective transformation. Takes in vertices and transform the coordinates to window coordinates
@@ -84,6 +98,45 @@ CUDA Rasterizer
 ###### Lambert shading with barycentric interpolation. Two lights are used to better demonstrate the effect
 ![](img/lambert.png)
 
+## Pipeline Overview (Tile-based) 
+
+* Use `L` to switch between pipelines
+* Concept
+  * Divide-and-conquer technique applied to the pixel array, instead of primitive array
+  * Image is first divided into fix-sized bins. Such bins are relatively big in pixel size (e.g. 128x128 px). A kernel is launched to examine bounding box overlapping for each primitive against all bins
+    * Each bin maintains a queue of overlapping primitives
+  * Each bin is then divided into smaller sized tiles (e.g. 8x8 px). A kernel is launched to further examine bounding box overlapping for each bin for each primitive *in the bin's queue*
+    * Each tile maintains a similar queue
+  * Then a kernel is launched to find exact intersections for each pixel. In this case each pixel only needs to examine the primitives in the tile's queue
+* **NOTE**
+  * The lightings are different for the two pipelines to make it easier distinguishing the current pipeline. Tile-based render has major lighting coming from top left. Scanline render has major lighting coming from top right
+  * LIMITATION: current tile-based implementation imposes limitation that there can be at most 1024 triangles covering each bin at the same time. More triangles will result in undefined clipping
+    * Current workaround:
+      * Adjust bin size so that less triangles will cover one single bin
+      * Zoom in more so that triangles get spaced out instead of clutching within several bins
+    * Possible solution: all it needed is a thread-safe, dynamically resizable queue for each bin. This is not included in the implementation and hence the limitation. Theoretical limitation with such data structure is the memory limit of the hardware 
+* Vertex shading: same as scanline
+* Primitive assembly
+  * Assembles triangles from the input vertex list
+  * Initialize various shading property
+  * Flip coordinates to offset coordinate difference in OpenGL
+  * Cache minimum depth. This is the minimum depth of the 3 vertices of each triangle
+  * Pre-calculate signed area, and remove backfaces and degenerate faces
+  * Coarse window & scissor clipping. Removes those that are completely outside of the clipping box
+* Primitive compaction
+  * A simple stream compaction step to actually remove the marked primitives from previous stage
+* Geometry shader
+  * Same as scanline, with slight change that also calculates bounding box of the newly added geometries
+* Bin rasterizer
+  * Take the primitives as input, each thread is responsible for one single primitive. The thread will check coverage with all bins in the image, and push its primitive ID to that bin's coverage list, should there be any overlapping
+* Tile rasterizer
+  * Take the previous coverage list as input. Each thread is responsible for one tile, and will check coverage in the bin's list
+* Fine rasterizer
+  * Take the previous coverage list as input. Each thread is responsible for one pixel, and will check coverage in the tile's list
+* Fragment shader
+  * Same as scanline. Only that it now distinguishes a fragment that isn't overlapping any primitives with one that does
+  * It will directly shade a non-overlapping fragment to black, while shade the "good" fragments with correct lighting
+
 ## Misc Features
 * Mouse-based interactive camera support
   * Controls
@@ -104,14 +157,25 @@ CUDA Rasterizer
 ###### Point shading
 ![](img/point-shading.png)
 
-### Performance Analysis
+## Performance Analysis
 
 * Camera properties
   * Position `(0,0,3)`
   * LookAt `(0,0,0)`
   * FOV = 45.0 degrees
 
-* Performance breakdown
+* Tile-based performance breakdown
+  * Fragement shader time is almost fixed. Only dependent on window size
+  * Breakdown are core pipeline only
+  * Unlike scanline (below), FPS doesn't have obvious changes when changing camera distance. The performance depends more on # of overlapping primitives in each tile. Thus scenes of high depth complexity would have the highest negative impact on performance
+
+###### `cow.obj` performance breakdown
+![](img/cow-perf-tile.png)
+
+###### `cow.obj` FPS comparison by camera distance and pipeline. Positive means camera moving away
+![](img/fps-compare.png)
+
+* Scanline performance breakdown
   * Fragement shader time is almost fixed, since it's only dependent on the pixel count of the output window
   * Breakdown are core pipeline only
   * For the exact same camera properties described above, frame rate largely depends on the transformed size of the primitives, due to the current rasterization implementation
@@ -121,19 +185,19 @@ CUDA Rasterizer
 ###### `cow.obj` performance breakdown
 ![](img/cow-perf.png)
 
-###### `cow.obj` FPS by camera distance
+###### `cow.obj` FPS by camera distance, camera moving away
 ![](img/cow-dist.png)
 
 ###### `suzanne.obj` performance breakdown
 ![](img/suzanne-perf.png)
 
-###### `suzanne.obj` FPS by camera distance
+###### `suzanne.obj` FPS by camera distance, camera moving away
 ![](img/suzanne-dist.png)
 
 ###### `flower.obj` performance breakdown
 ![](img/flower-perf.png)
 
-###### `flower.obj` FPS by camera distance
+###### `flower.obj` FPS by camera distance, camera moving away
 ![](img/flower-dist.png)
 
 * Optimization (`cow.obj`)
@@ -141,7 +205,8 @@ CUDA Rasterizer
     * Alter block size for different kernels to achieve higher warp count
     * Substitute fixed divisions with corresponding multiplications for marginal performance gain
     * Cache repetitive calculations; reorder executions to reduce execution dependency
-  * Backface culling
+    * Optimize calculations to reduce register usage
+  * Backface culling (for scanline)
     * Only useful when the object is big in window
       * Reduces rasterization time
     * Stream compaction overhead might be more significant and cancel out the benefit
@@ -157,3 +222,5 @@ CUDA Rasterizer
   * http://www.songho.ca/opengl/gl_transform.html
 * Bresenham's line algorithm
   * https://en.wikipedia.org/wiki/Bresenham%27s_line_algorithm
+* Tile-based rendering
+  * https://mediatech.aalto.fi/~samuli/publications/laine2011hpg_paper.pdf
diff --git a/img/cow-perf-tile.png b/img/cow-perf-tile.png
new file mode 100644
index 0000000..7540d54
Binary files /dev/null and b/img/cow-perf-tile.png differ
diff --git a/img/fps-compare.png b/img/fps-compare.png
new file mode 100644
index 0000000..7b696f0
Binary files /dev/null and b/img/fps-compare.png differ
diff --git a/img/splash-tile.png b/img/splash-tile.png
new file mode 100644
index 0000000..b31bdaa
Binary files /dev/null and b/img/splash-tile.png differ
diff --git a/src/main.cpp b/src/main.cpp
index a566f73..7372b79 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -16,6 +16,7 @@
 //-------------------------------
 
 MVP mvp;
+bool useScanline = false;
 
 int main(int argc, char **argv) {
     if (argc != 2) {
@@ -83,7 +84,12 @@ void runCuda() {
     dptr = NULL;
 
     cudaGLMapBufferObject((void **)&dptr, pbo);
-    rasterize(dptr);
+	if (useScanline){
+		rasterize(dptr);
+	}
+	else {
+		rasterizeTile(dptr);
+	}
     cudaGLUnmapBufferObject(pbo);
 
     frame++;
@@ -238,9 +244,11 @@ bool init(obj *mesh) {
         0.0, 0.0, 1.0,
         1.0, 0.0, 0.0
     };
-    rasterizeSetBuffers(mesh->getBufIdxsize(), mesh->getBufIdx(),
-            mesh->getBufPossize() / 3,
-            mesh->getBufPos(), mesh->getBufNor(), mesh->getBufCol());
+	rasterizeSetBuffers(mesh->getBufIdxsize(), mesh->getBufIdx(),
+		mesh->getBufPossize() / 3,
+		mesh->getBufPos(), mesh->getBufNor(), mesh->getBufCol());
+
+	rasterizeTileInit();
 
     GLuint passthroughProgram;
     passthroughProgram = initShader();
@@ -273,7 +281,7 @@ void initCuda() {
     // Use device with highest Gflops/s
     cudaGLSetGLDevice(0);
 
-    rasterizeInit(width, height, &mvp);
+	rasterizeInit(width, height, &mvp);
 
     // Clean up on program exit
     atexit(cleanupCuda);
@@ -367,7 +375,7 @@ void deleteTexture(GLuint *tex) {
 }
 
 void shut_down(int return_code) {
-    rasterizeFree();
+	rasterizeFree();
     cudaDeviceReset();
 #ifdef __APPLE__
     glfwTerminate();
@@ -420,6 +428,10 @@ void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods
 			mvp.culling = !mvp.culling;
 			flushDepthBuffer();
 			break;
+		case GLFW_KEY_L:	// Pipeline switch
+			useScanline = !useScanline;
+			flushDepthBuffer();
+			break;
 		}
 	}
 }
\ No newline at end of file
diff --git a/src/rasterize.cu b/src/rasterize.cu
index 4d07e5d..d65677e 100644
--- a/src/rasterize.cu
+++ b/src/rasterize.cu
@@ -13,13 +13,41 @@
 #include <stream_compaction/efficient.h>
 #include "sceneStructs.h"
 #include <util/checkCUDAError.h>
+#include <vector>
 #include <glm/gtx/transform.hpp>
 #include "rasterize.h"
 #include "rasterizeTools.h"
 
+#define BINSIDE_LEN 4
+#define TILESIDE_LEN 8
+#define BIN_SIZE BINSIDE_LEN*BINSIDE_LEN	// this many tiles
+#define TILE_SIZE TILESIDE_LEN*TILESIDE_LEN	// this many pixels
+
+#define BINRASTER_BLOCK 256
 #define VERTSHADER_BLOCK 128
 #define FRAGSHADER_BLOCK 256
 
+#define QSEG_SIZE 1024
+
+// Data structure for rasterization filter
+namespace Queue {
+	struct Segment {
+		int queueSize = 0;
+		int queue[QSEG_SIZE];
+	};
+
+	__device__ void push(Segment &seg, int triId){
+		int writeIdx = atomicAdd(&(seg.queueSize), 1);
+		if (writeIdx < QSEG_SIZE){
+			seg.queue[writeIdx] = triId;
+		}
+	}
+
+	__device__ void clear(Segment &seg){
+		atomicExch(&(seg.queueSize), 0);
+	}
+}
+
 static int width = 0;
 static int height = 0;
 __constant__ static int *dev_bufIdx = NULL;
@@ -50,28 +78,22 @@ static glm::vec3 lightCol1 = glm::vec3(0.95f, 0.95f, 1.0f);
 static glm::vec3 light2 = light1 * glm::vec3(-1.0f, 1.0f, -1.0f);
 static glm::vec3 lightCol2 = glm::vec3(1.0f, 0.725f, 0.494f);
 
-/**
- * Kernel that writes the image to the OpenGL PBO directly.
- */
-/*
-__global__ void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-    int index = x + (y * w);
-
-    if (x < w && y < h) {
-        glm::vec3 color;
-        color.x = glm::clamp(image[index].x, 0.0f, 1.0f) * 255.0;
-        color.y = glm::clamp(image[index].y, 0.0f, 1.0f) * 255.0;
-        color.z = glm::clamp(image[index].z, 0.0f, 1.0f) * 255.0;
-        // Each thread writes one pixel location in the texture (textel)
-        pbo[index].w = 0;
-        pbo[index].x = color.x;
-        pbo[index].y = color.y;
-        pbo[index].z = color.z;
-    }
-}
-*/
+// Rasterization filtering
+int rowWidth;
+int columnHeight;
+int binGridWidth, binGridHeight;
+__constant__ static Queue::Segment *binVsTriangle;
+__constant__ static Queue::Segment *tileVsTriangle;
+
+/****************************************************************************************************************************************
+****************************************************************************************************************************************
+****************************************************************************************************************************************
+****************************************************************************************************************************************
+* Scanline pipeline below
+****************************************************************************************************************************************
+****************************************************************************************************************************************
+****************************************************************************************************************************************
+*****************************************************************************************************************************************/
 
 __global__ void sendImageToPBO(uchar4 *pbo, int w, int h, Fragment *image) {
 	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
@@ -80,30 +102,16 @@ __global__ void sendImageToPBO(uchar4 *pbo, int w, int h, Fragment *image) {
 
 	if (x < w && y < h) {
 		Fragment f = image[index];
+		pbo[index].w = 0;
 		glm::vec3 color = glm::vec3(255.0f);
 
-		color.x = color.x * glm::clamp(f.col.x, 0.0f, 1.0f);
-		color.y = color.y * glm::clamp(f.col.y, 0.0f, 1.0f);
-		color.z = color.z * glm::clamp(f.col.z, 0.0f, 1.0f);
 		// Each thread writes one pixel location in the texture (textel)
-		pbo[index].w = 0;
-		pbo[index].x = color.x;
-		pbo[index].y = color.y;
-		pbo[index].z = color.z;
+		pbo[index].x = color.x*glm::clamp(f.col.x, 0.0f, 1.0f);
+		pbo[index].y = color.y*glm::clamp(f.col.y, 0.0f, 1.0f);
+		pbo[index].z = color.z*glm::clamp(f.col.z, 0.0f, 1.0f);
 	}
 }
 
-// Writes fragment colors to the framebuffer
-__global__ void render(int w, int h, Fragment *depthbuffer, glm::vec3 *framebuffer) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-    int index = x + (y * w);
-
-    if (x < w && y < h) {
-        framebuffer[index] = depthbuffer[index].col;
-    }
-}
-
 /**
  * Called once at the beginning of the program to allocate memory.
  */
@@ -127,6 +135,8 @@ void flushDepthBuffer(){
 	cudaMemset(dev_depth, mvp->farPlane * 10000, width * height * sizeof(int));
 	cudaMemset(dev_depthbuffer, 0, width * height * sizeof(Fragment));
 	cudaMemset(dev_primitives, 0, triCount * geomShaderLimit * sizeof(Triangle));
+	cudaMemset(binVsTriangle, 0, binGridHeight*binGridWidth*sizeof(Queue::Segment));
+	cudaMemset(tileVsTriangle, 0, binGridHeight*binGridWidth*BIN_SIZE*sizeof(Queue::Segment));
 	checkCUDAError("rasterize flush");
 }
 
@@ -172,24 +182,6 @@ void rasterizeSetBuffers(
     checkCUDAError("rasterizeSetBuffers");
 }
 
-/*
-__device__ void findIntersect(glm::vec3 &i, glm::vec3 p1, glm::vec3 p2, glm::vec3 p3, glm::vec3 p4){
-	// http://paulbourke.net/geometry/pointlineplane/
-	float d = (p4.y - p3.y)*(p2.x - p1.x) - (p4.x - p3.x)*(p2.y - p1.y);
-	if (abs(d) < ZERO_ABSORPTION_EPSILON){
-		// Parallel
-		i = glm::vec3(0.0f);
-	}
-	else {
-		float n = (p4.x - p3.x)*(p1.y - p3.y) - (p4.y - p3.y)*(p1.x - p3.x);
-		float ua = n / d;
-		i.x = p1.x + ua*(p2.x - p1.x);
-		i.y = p1.y + ua*(p2.y - p1.y);
-		i.z = 1;
-	}
-}
-*/
-
 __global__ void shadeVertex(VertexOut *vOut, VertexIn *vIn, const int vertCount, const int width, const int height, const glm::mat4 mvp, const float near, const float far){
 	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
 	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
@@ -519,11 +511,6 @@ void rasterize(uchar4 *pbo) {
 		checkCUDAError("Frag shader");
 	}
 
-    // Copy depthbuffer colors into framebuffer
-    //render<<<blockCount2d, blockSize2d>>>(width, height, dev_depthbuffer, dev_framebuffer);
-    // Copy framebuffer into OpenGL buffer for OpenGL previewing
-	//sendImageToPBO << <blockCount2d, blockSize2d >> >(pbo, width, height, dev_framebuffer);
-
 	dim3 blockSize2d2(16, 16);
 
 	dim3 blockCount2d2((width + blockSize2d2.x - 1) / blockSize2d2.x,
@@ -550,5 +537,360 @@ void rasterizeFree() {
 	cudaFree(dv_out_tmp);
 	cudaFree(dv_c_tmp);
 
+	cudaFree(binVsTriangle);
+	cudaFree(tileVsTriangle);
+
     checkCUDAError("rasterizeFree");
 }
+
+
+/****************************************************************************************************************************************
+****************************************************************************************************************************************
+****************************************************************************************************************************************
+****************************************************************************************************************************************
+* Tile-based pipeline below
+****************************************************************************************************************************************
+****************************************************************************************************************************************
+****************************************************************************************************************************************
+*****************************************************************************************************************************************/
+
+void rasterizeTileInit(){
+	// Initialize tile arrays
+	rowWidth = triCount * geomShaderLimit * sizeof(bool);
+	binGridWidth = (width + BINSIDE_LEN*TILESIDE_LEN - 1) / (BINSIDE_LEN*TILESIDE_LEN);
+	binGridHeight = (height + BINSIDE_LEN*TILESIDE_LEN - 1) / (BINSIDE_LEN*TILESIDE_LEN);
+	cudaMalloc((void**)&binVsTriangle, binGridHeight*binGridWidth*sizeof(Queue::Segment));
+	checkCUDAError("Bin array");
+	cudaMalloc((void**)&tileVsTriangle, binGridHeight*binGridWidth*BIN_SIZE*sizeof(Queue::Segment));
+	checkCUDAError("Tile array");
+}
+
+__global__ void assemblePrimitiveT(Triangle *pOut, VertexOut *vIn, int *triIdx, const int triCount, const int width, const int height, const glm::vec3 camPos, const bool doScissor, const Scissor scissor){
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	if (index < triCount) {
+		Triangle t;
+		// Assemble vertices
+		t.v[0] = vIn[triIdx[3 * index]];
+		t.v[1] = vIn[triIdx[3 * index + 1]];
+		t.v[2] = vIn[triIdx[3 * index + 2]];
+		// Set rasterization property
+		t.isPoint = false; t.isLine = false; t.isValidGeom = true;
+		// Backface culling & degenerate (zero area)
+		// Calculate signed area for later use also
+		t.signedArea = calculateSignedArea(t.v);
+		// Revert coordinates to fix OpenGL coord quirks
+		t.v[0].pos = glm::vec3((float)width - t.v[0].pos.x, (float)height - t.v[0].pos.y, t.v[0].pos.z);
+		t.v[1].pos = glm::vec3((float)width - t.v[1].pos.x, (float)height - t.v[1].pos.y, t.v[1].pos.z);
+		t.v[2].pos = glm::vec3((float)width - t.v[2].pos.x, (float)height - t.v[2].pos.y, t.v[2].pos.z);
+		// Find bounding box
+		t.box = getAABBForTriangle(t);
+		if (t.signedArea >= 0){
+			t.isValidGeom = false;
+		}
+		// Coarse window & scissor clipping
+		if (doScissor){
+			if (t.box.min.x > scissor.max.x || t.box.max.x < scissor.min.x || t.box.min.y > scissor.max.y || t.box.max.y < scissor.min.y){
+				t.isValidGeom = false;
+			}
+		}
+		if (t.box.min.x > width || t.box.max.x < 0 || t.box.min.y > height || t.box.max.y < 0){
+			t.isValidGeom = false;
+		}
+		// Minimum Z of all 3 vertices; for quick depth test
+		t.minDepth = -t.box.max.z;
+		pOut[index] = t;
+	}
+}
+
+__global__ void assemblePrimitivePointT(Triangle *pOut, VertexOut *vIn, int *triIdx, const int triCount, const int width, const int height){
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	if (index < triCount) {
+		Triangle t;
+		t.v[0] = vIn[triIdx[3 * index + 0]];
+		t.v[0].pos = glm::vec3(width - ceil(t.v[0].pos.x), height - ceil(t.v[0].pos.y), t.v[0].pos.z);
+		t.box = getAABB1D(t);
+		t.isPoint = true;
+		t.isValidGeom = true;
+		pOut[index] = t;
+	}
+}
+
+__global__ void simpleShadeGeomT(Triangle *pArr, const int triCount, const int width, const int limit, const int height, const glm::mat4 mvp, const float near, const float far){
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int index = x + (y * width);
+
+	if (index < triCount && pArr[index].isValidGeom) {
+		Triangle t = pArr[index];
+		Triangle tN = t;
+		// Calculate a line that represents the vertex normal
+		// Since normal is not MVP-transformed, need to do MVP here for the model-space normal line
+		glm::vec4 clip = mvp*glm::vec4(t.v[0].mpos + t.v[0].nor*0.1f, 1.0f);
+		glm::vec3 ndc = glm::vec3(clip.x / clip.w, clip.y / clip.w, clip.z / clip.w);
+		// Rounding
+		tN.v[1].pos = glm::vec3(
+			width / 2 * (ndc.x + 1),
+			height / 2 * (ndc.y + 1),
+			(far - near) / 2 * ndc.z + (far + near) / 2
+			);
+		tN.v[1].pos = glm::vec3(width - ceil(tN.v[1].pos.x), height - ceil(tN.v[1].pos.y), tN.v[1].pos.z);
+		tN.box = getAABB2D(tN);
+		tN.isLine = true;
+		tN.isValidGeom = true;
+		pArr[index + triCount] = tN;
+	}
+}
+
+__global__ void binCover(Queue::Segment* binVsTriangle, Triangle *dev_primitives, const int primCount, const int width, const int height){
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	if (index < primCount){
+		Triangle t = dev_primitives[index];
+		for (int b = 0; b < width*height; b++){
+			int binX = b % width, binY = (b - binX) / width;
+			AABB bin;
+			bin.min.x = binX*BINSIDE_LEN*TILESIDE_LEN, bin.max.x = bin.min.x + BINSIDE_LEN*TILESIDE_LEN;
+			bin.min.y = binY*BINSIDE_LEN*TILESIDE_LEN, bin.max.y = bin.min.y + BINSIDE_LEN*TILESIDE_LEN;
+			if (boxOverlapTest(t.box, bin)){
+				Queue::push(binVsTriangle[b], index);
+			}
+		}
+	}
+}
+
+__global__ void tileCover(Queue::Segment *tileVsTriangle, Queue::Segment *binVsTriangle, Triangle *dev_primitives, const int width){
+	int binId = blockIdx.x;
+	int binX = binId % width, binY = (binId - binX) / width;
+	int baseTileX = binX * BINSIDE_LEN, baseTileY = binY * BINSIDE_LEN;
+	int tileId = baseTileX + threadIdx.x + (baseTileY + threadIdx.y)*width*BINSIDE_LEN;
+	int tileMinX = (baseTileX + threadIdx.x)*TILESIDE_LEN, tileMaxX = tileMinX + TILESIDE_LEN;
+	int tileMinY = (baseTileY + threadIdx.y)*TILESIDE_LEN, tileMaxY = tileMinY + TILESIDE_LEN;
+
+	AABB tile;
+	tile.min.x = tileMinX; tile.max.x = tileMaxX; tile.min.y = tileMinY; tile.max.y = tileMaxY;
+
+	int bound = binVsTriangle[binId].queueSize > QSEG_SIZE ? QSEG_SIZE : binVsTriangle[binId].queueSize;
+
+	for (int i = 0; i < bound; i++){
+		Triangle t = dev_primitives[binVsTriangle[binId].queue[i]];
+		if (boxOverlapTest(t.box, tile)){
+			Queue::push(tileVsTriangle[tileId], binVsTriangle[binId].queue[i]);
+		}
+	}
+	Queue::clear(binVsTriangle[binId]);
+}
+
+__global__ void pixCover(Fragment *dev_depthbuffer, Queue::Segment *tileVsTriangle, Triangle *dev_primitives, const int width, const int height, const int tileGridWidth, const bool doScissor, const Scissor scissor){
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int tileIdx = x/TILESIDE_LEN + (y/TILESIDE_LEN)*tileGridWidth;
+	int index = x + (y * width);
+
+	if (x < width && y < height) {
+		bool covered = true;
+		if (doScissor){
+			if (x < scissor.min.x || x > scissor.max.x || y < scissor.min.y || y > scissor.max.y){
+				covered = false;
+			}
+		}
+		if (tileVsTriangle[tileIdx].queueSize == 0){
+			covered = false;
+		}
+		if (covered){
+			float depth = 100;
+			int bound = glm::min(tileVsTriangle[tileIdx].queueSize, QSEG_SIZE);
+			int tIdx;
+			for (int i = 0; i < bound; i++){
+				tIdx = tileVsTriangle[tileIdx].queue[i];
+
+				if (!dev_primitives[tIdx].isLine && !dev_primitives[tIdx].isPoint) {
+					// General triangle
+					if (dev_primitives[tIdx].minDepth <= depth){
+						glm::vec3 bcc = calculateBarycentricCoordinate(dev_primitives[tIdx], glm::vec2((float)x, (float)y));
+						if (isBarycentricCoordInBounds(bcc)){
+							float dp = getZAtCoordinate(bcc, dev_primitives[tIdx]);
+							if (dp <= depth) {
+								// Shallowest
+								dev_depthbuffer[index].pos = bcc.x * dev_primitives[tIdx].v[0].pos + bcc.y*dev_primitives[tIdx].v[1].pos + bcc.z*dev_primitives[tIdx].v[2].pos;
+								dev_depthbuffer[index].nor = bcc.x * dev_primitives[tIdx].v[0].nor + bcc.y*dev_primitives[tIdx].v[1].nor + bcc.z*dev_primitives[tIdx].v[2].nor;
+								dev_depthbuffer[index].col = bcc.x * dev_primitives[tIdx].v[0].col + bcc.y*dev_primitives[tIdx].v[1].col + bcc.z*dev_primitives[tIdx].v[2].col;
+								depth = dp;
+							}
+						}
+					}
+				}
+				else if (dev_primitives[tIdx].isPoint && dev_primitives[tIdx].v[0].pos.x == x && dev_primitives[tIdx].v[0].pos.y == y){
+					if (-dev_primitives[tIdx].v[0].pos.z <= depth) {
+						// Shallowest
+						dev_depthbuffer[index].col = dev_primitives[tIdx].v[0].col;
+						dev_depthbuffer[index].nor = dev_primitives[tIdx].v[0].nor;
+						dev_depthbuffer[index].pos = dev_primitives[tIdx].v[0].pos;
+						depth = -dev_primitives[tIdx].v[0].pos.z;
+					}
+				}
+				else if (dev_primitives[tIdx].isLine){
+					glm::vec3 min = dev_primitives[tIdx].v[0].pos, max = dev_primitives[tIdx].v[1].pos;
+					if (min.x > max.x){
+						min = dev_primitives[tIdx].v[1].pos; max = dev_primitives[tIdx].v[0].pos;
+					}
+					AABB pointBox = getAABB1D(glm::vec3(x, y, 1.0f));
+
+					if (boxOverlapTest(pointBox, dev_primitives[tIdx].box)){
+						float ratio, dp;
+						if (min.x == max.x){
+							// Straight vertical line
+							ratio = __fdividef(y - min.y, max.y - min.y);
+							//float ratio = (y - min.y) / (max.y - min.y);
+							//float dp = -(ratio*min.z + (1 - ratio)*max.z);
+							dp = -max.z - ratio*(min.z -max.z);
+							if (dp <= depth) {
+								// Shallowest
+								dev_depthbuffer[index].pos = glm::vec3(x, y, dp);
+								dev_depthbuffer[index].nor = dev_primitives[tIdx].v[0].nor;
+								dev_depthbuffer[index].col = glm::vec3(1.0f);
+								depth = dp;
+							}
+						}
+						else {
+							// Bresenham
+							//float slope = (max.y - min.y) / (max.x - min.x);
+							//float ratio = (x - min.x) / (max.x - min.x);
+							float slope = __fdividef(max.y - min.y, max.x - min.x);
+							int assumedY = slope * (x - min.x) + min.y;
+							if (assumedY == y){
+								ratio = __fdividef(x - min.x, max.x - min.x);
+								//float dp = -(ratio*min.z + (1 - ratio)*max.z);
+								dp = -max.z - ratio*(min.z - max.z);
+								if (dp <= depth) {
+									// Shallowest
+									dev_depthbuffer[index].pos = glm::vec3(x, y, dp);
+									dev_depthbuffer[index].nor = dev_primitives[tIdx].v[0].nor;
+									dev_depthbuffer[index].col = glm::vec3(1.0f);
+									depth = dp;
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+		dev_depthbuffer[index].isCovered = covered;
+		Queue::clear(tileVsTriangle[tileIdx]);
+	}
+}
+
+__global__ void shadeFragmentT(Fragment *fBuf, const int pxCount, const int width, const glm::vec3 light1, const glm::vec3 lightCol1, const glm::vec3 light2, const glm::vec3 lightCol2){
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	if (index < pxCount) {
+		if (fBuf[index].isCovered){
+			// Add the two lights and do Lambert shading
+			glm::vec3 L1 = glm::normalize(light1 - fBuf[index].pos);
+			glm::vec3 L2 = glm::normalize(light2 - fBuf[index].pos);
+			glm::vec3 C1 = lightCol1*glm::dot(L1, fBuf[index].nor);
+			glm::vec3 C2 = lightCol2*glm::dot(L2, fBuf[index].nor);
+			fBuf[index].col = (C1 + C2)*fBuf[index].col;
+		}
+		else {
+			fBuf[index].col = glm::vec3(0.0f);
+		}
+	}
+}
+
+__global__ void shadeFragmentNormalT(Fragment *fBuf, const int pxCount, const int width){
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	if (index < pxCount) {
+		fBuf[index].col = fBuf[index].nor;
+	}
+}
+
+/**
+* Perform rasterization.
+*/
+void rasterizeTile(uchar4 *pbo) {
+	int sideLength2d = 8;
+	dim3 blockSize2d(sideLength2d, sideLength2d);
+
+	dim3 blockCount2d((width + blockSize2d.x - 1) / blockSize2d.x,
+		(height + blockSize2d.y - 1) / blockSize2d.y);
+
+	int vertGridSize = (width*height + VERTSHADER_BLOCK - 1) / VERTSHADER_BLOCK;
+
+	// Vertex shading
+	shadeVertex << <vertGridSize, VERTSHADER_BLOCK >> >(dev_bufShadedVert, dev_bufVertex, vertCount, width, height, mvp->mvp, mvp->nearPlane, mvp->farPlane);
+	checkCUDAError("Vert shader");
+
+	// Primitive assembly
+	if (mvp->pointShading){
+		assemblePrimitivePointT << <vertGridSize, VERTSHADER_BLOCK >> >(dev_primitives, dev_bufShadedVert, dev_bufIdx, triCount, width, height);
+		checkCUDAError("Prim assembly");
+	}
+	else {
+		assemblePrimitiveT << <vertGridSize, VERTSHADER_BLOCK >> >(dev_primitives, dev_bufShadedVert, dev_bufIdx, triCount, width, height, mvp->camPosition, mvp->doScissor, mvp->scissor);
+		checkCUDAError("Prim assembly");
+	}
+
+	int primCount = triCount;
+
+	// Primitive compaction
+	StreamCompaction::Efficient::compact(triCount*geomShaderLimit, dv_f_tmp, dv_idx_tmp, dv_out_tmp, dev_primitives, dv_c_tmp);
+	checkCUDAError("Primitive compact");
+	cudaMemcpy(&primCount, dv_c_tmp, sizeof(int), cudaMemcpyDeviceToHost);
+	cudaMemcpy(dev_primitives, dv_out_tmp, primCount * sizeof(Triangle), cudaMemcpyDeviceToDevice);
+	checkCUDAError("Primitive copy");
+
+	// Geometry shading
+	if (mvp->geomShading){
+		simpleShadeGeomT << <blockCount2d, blockSize2d >> >(dev_primitives, primCount, width, geomShaderLimit, height, mvp->mvp, mvp->nearPlane, mvp->farPlane);
+		checkCUDAError("Geom shader");
+		StreamCompaction::Efficient::compact(triCount*geomShaderLimit, dv_f_tmp, dv_idx_tmp, dv_out_tmp, dev_primitives, dv_c_tmp);
+		checkCUDAError("Geom shader compact");
+		cudaMemcpy(&primCount, dv_c_tmp, sizeof(int), cudaMemcpyDeviceToHost);
+		cudaMemcpy(dev_primitives, dv_out_tmp, primCount * sizeof(Triangle), cudaMemcpyDeviceToDevice);
+		checkCUDAError("Geom shader copy");
+	}
+
+	// Rasterization
+	// Input to bin raster
+	int binCoverGridSize = (primCount + BINRASTER_BLOCK - 1) / BINRASTER_BLOCK;
+	binCover << <binCoverGridSize, BINRASTER_BLOCK>> >(binVsTriangle, dev_primitives, primCount, binGridWidth, binGridHeight);
+	checkCUDAError("Bin cover test");
+
+	// Bin to tile raster
+	dim3 binSize2d(BINSIDE_LEN, BINSIDE_LEN);
+	tileCover << <binGridHeight*binGridWidth, binSize2d >> >(tileVsTriangle, binVsTriangle, dev_primitives, binGridWidth);
+	checkCUDAError("Tile cover test");
+
+	// Tile to fragment raster
+
+	dim3 blockSize2d3(8, 16);
+
+	dim3 blockCount2d3((width + blockSize2d3.x - 1) / blockSize2d3.x,
+		(height + blockSize2d3.y - 1) / blockSize2d3.y);
+	pixCover << <blockCount2d3, blockSize2d3 >> >(dev_depthbuffer, tileVsTriangle, dev_primitives, width, height, binGridWidth*BINSIDE_LEN, mvp->doScissor, mvp->scissor);
+	checkCUDAError("Pixel cover test");
+
+	// Fragment shading
+	int fragGridSize = (width*height + FRAGSHADER_BLOCK - 1) / FRAGSHADER_BLOCK;
+
+	if (mvp->shadeMode == 0){
+		shadeFragmentT << <fragGridSize, FRAGSHADER_BLOCK >> >(dev_depthbuffer, height*width, width, light1, lightCol1, light2, lightCol2);
+		checkCUDAError("Frag shader");
+	}
+	else if (mvp->shadeMode == 1){
+		shadeFragmentNormalT << <fragGridSize, FRAGSHADER_BLOCK >> >(dev_depthbuffer, height*width, width);
+		checkCUDAError("Frag shader");
+	}
+
+	// Render to frame
+	dim3 blockSize2d2(16, 16);
+
+	dim3 blockCount2d2((width + blockSize2d2.x - 1) / blockSize2d2.x,
+		(height + blockSize2d2.y - 1) / blockSize2d2.y);
+
+	sendImageToPBO << <blockCount2d2, blockSize2d2 >> >(pbo, width, height, dev_depthbuffer);
+	checkCUDAError("rasterize");
+}
\ No newline at end of file
diff --git a/src/rasterize.h b/src/rasterize.h
index 6f16eb5..c99c40a 100644
--- a/src/rasterize.h
+++ b/src/rasterize.h
@@ -11,10 +11,12 @@
 #include <glm/glm.hpp>
 #include <util/utilityCore.hpp>
 
+void rasterizeTileInit();
 void rasterizeInit(int width, int height, MVP *mvp);
 void rasterizeSetBuffers(
         int bufIdxSize, int *bufIdx,
         int vertCount, float *bufPos, float *bufNor, float *bufCol);
 void rasterize(uchar4 *pbo);
+void rasterizeTile(uchar4 *pbo);
 void rasterizeFree();
 void flushDepthBuffer();
diff --git a/src/rasterizeTools.h b/src/rasterizeTools.h
index ccf17c9..60c4e95 100644
--- a/src/rasterizeTools.h
+++ b/src/rasterizeTools.h
@@ -55,6 +55,36 @@ AABB getAABBForTriangle(const Triangle tri) {
 	return aabb;
 }
 
+__host__ __device__ static
+AABB getAABB2D(const Triangle tri) {
+	AABB aabb;
+	aabb.min = glm::vec3(
+		glm::min(tri.v[0].pos.x, tri.v[1].pos.x),
+		glm::min(tri.v[0].pos.y, tri.v[1].pos.y),
+		glm::min(tri.v[0].pos.z, tri.v[1].pos.z));
+	aabb.max = glm::vec3(
+		glm::max(tri.v[0].pos.x, tri.v[1].pos.x),
+		glm::max(tri.v[0].pos.y, tri.v[1].pos.y),
+		glm::max(tri.v[0].pos.z, tri.v[1].pos.z));
+	return aabb;
+}
+
+__host__ __device__ static
+AABB getAABB1D(const Triangle tri) {
+	AABB aabb;
+	aabb.min = tri.v[0].pos;
+	aabb.max = tri.v[0].pos;
+	return aabb;
+}
+
+__host__ __device__ static
+AABB getAABB1D(const glm::vec3 point) {
+	AABB aabb;
+	aabb.min = point;
+	aabb.max = point;
+	return aabb;
+}
+
 // CHECKITOUT
 /**
  * Calculate the signed area of a given triangle.
@@ -64,6 +94,12 @@ float calculateSignedArea(const glm::vec3 tri[3]) {
     return 0.5 * ((tri[2].x - tri[0].x) * (tri[1].y - tri[0].y) - (tri[1].x - tri[0].x) * (tri[2].y - tri[0].y));
 }
 
+__host__ __device__ static
+float calculateSignedArea(const VertexOut tri[3]) {
+	return 0.5 * ((tri[2].pos.x - tri[0].pos.x) * (tri[1].pos.y - tri[0].pos.y) - (tri[1].pos.x - tri[0].pos.x) * (tri[2].pos.y - tri[0].pos.y));
+}
+
+
 // CHECKITOUT
 /**
  * Helper function for calculating barycentric coordinates.
@@ -77,6 +113,18 @@ float calculateBarycentricCoordinateValue(glm::vec2 a, glm::vec2 b, glm::vec2 c,
     return calculateSignedArea(baryTri) / calculateSignedArea(tri);
 }
 
+/**
+* Helper function for calculating barycentric coordinates.
+*/
+__device__ static
+float calculateBarycentricCoordinateValue(glm::vec2 a, glm::vec2 b, glm::vec2 c, const float triArea) {
+	glm::vec3 baryTri[3];
+	baryTri[0] = glm::vec3(a, 0);
+	baryTri[1] = glm::vec3(b, 0);
+	baryTri[2] = glm::vec3(c, 0);
+	return __fdividef(calculateSignedArea(baryTri), triArea);
+}
+
 // CHECKITOUT
 /**
  * Calculate barycentric coordinates.
@@ -89,6 +137,14 @@ glm::vec3 calculateBarycentricCoordinate(const glm::vec3 tri[3], glm::vec2 point
     return glm::vec3(alpha, beta, gamma);
 }
 
+__device__ static
+glm::vec3 calculateBarycentricCoordinate(const Triangle tri, glm::vec2 point) {
+	float beta = calculateBarycentricCoordinateValue(glm::vec2(tri.v[0].pos.x, tri.v[0].pos.y), point, glm::vec2(tri.v[2].pos.x, tri.v[2].pos.y), tri.signedArea);
+	float gamma = calculateBarycentricCoordinateValue(glm::vec2(tri.v[0].pos.x, tri.v[0].pos.y), glm::vec2(tri.v[1].pos.x, tri.v[1].pos.y), point, tri.signedArea);
+	float alpha = 1.0 - beta - gamma;
+	return glm::vec3(alpha, beta, gamma);
+}
+
 // CHECKITOUT
 /**
  * Check if a barycentric coordinate is within the boundaries of a triangle.
@@ -111,3 +167,30 @@ float getZAtCoordinate(const glm::vec3 barycentricCoord, const glm::vec3 tri[3])
            + barycentricCoord.y * tri[1].z
            + barycentricCoord.z * tri[2].z);
 }
+
+__host__ __device__ static
+float getZAtCoordinate(const glm::vec3 barycentricCoord, const Triangle tri) {
+	return -(barycentricCoord.x * tri.v[0].pos.z
+		+ barycentricCoord.y * tri.v[1].pos.z
+		+ barycentricCoord.z * tri.v[2].pos.z);
+}
+
+__device__ bool boxOverlapTest(AABB a, AABB b){
+	bool result;
+	if (a.max.x < b.min.x) {
+		result = false;
+	}
+	else if (a.min.x > b.max.x){
+		result = false;
+	}
+	else if (a.max.y < b.min.y){
+		result = false;
+	}
+	else if (a.min.y > b.max.y) {
+		result = false;
+	}
+	else {
+		result = true;
+	}
+	return result;
+}
\ No newline at end of file
diff --git a/src/sceneStructs.h b/src/sceneStructs.h
index 22ca8aa..feb3950 100644
--- a/src/sceneStructs.h
+++ b/src/sceneStructs.h
@@ -6,8 +6,6 @@
 #include "glm/glm.hpp"
 #include <glm/gtx/transform.hpp>
 
-#pragma once
-
 struct Scissor {
 	glm::vec2 min;
 	glm::vec2 max;
@@ -81,11 +79,14 @@ struct Triangle {
 	bool isPoint;
 	bool isLine;
 	bool isValidGeom;
+	float signedArea;
+	float minDepth;
 };
 struct Fragment {
 	glm::vec3 pos;
 	glm::vec3 nor;
 	glm::vec3 col;
+	bool isCovered;
 };
 
 struct is_invalid{