CIS565-Fall-2015 · likangning93 · Oct 7, 2015 · Oct 7, 2015 · Oct 8, 2015 · Oct 10, 2015
diff --git a/README.md b/README.md
diff --git a/img/AAAAAAAAAAAAAAA.png b/img/AAAAAAAAAAAAAAA.png
diff --git a/img/AA_spread.png b/img/AA_spread.png
diff --git a/img/AA_spread_comparison.png b/img/AA_spread_comparison.png
diff --git a/img/FSAA.png b/img/FSAA.png
diff --git a/img/MSAA.png b/img/MSAA.png
diff --git a/img/aliased.png b/img/aliased.png
diff --git a/img/charts/antialiasing/antialiasing.csv b/img/charts/antialiasing/antialiasing.csv
@@ -0,0 +1,4 @@
+"";"scanline rasterization";"fragment shading";"render to frame buffer"
+"no antialiasing";"2799.816";"277.787";"284.307"
+"FSAA";"13887.599";"2159.613";"2985.455"
+"MSAA";"13910.298";"1142.535";"2883.819"
diff --git a/img/charts/antialiasing/antialiasing.pdf b/img/charts/antialiasing/antialiasing.pdf
diff --git a/img/charts/antialiasing/antialiasing.png b/img/charts/antialiasing/antialiasing.png
diff --git a/img/charts/instancing/many_cows_host.csv b/img/charts/instancing/many_cows_host.csv
@@ -0,0 +1,7 @@
+"clearDepthBuffer";"408.746";"#ff6633";"0"
+"computeVertexTFs";"2.750";"#99ff66";"0"
+"vertexShader";"186.007";"#ccaa00";"0"
+"primitiveAssembly";"1053.543";"#aa99cc";"0"
+"scanlineRasterization";"1355.387";"#33aa66";"0"
+"fragmentShader";"249.116";"#aaff33";"0"
+"host -> device memcpy";"2899.678";"#ff9933";"0"
diff --git a/img/charts/instancing/many_cows_host.pdf b/img/charts/instancing/many_cows_host.pdf
diff --git a/img/charts/instancing/many_cows_host.png b/img/charts/instancing/many_cows_host.png
diff --git a/img/charts/instancing/many_cows_instanced.csv b/img/charts/instancing/many_cows_instanced.csv
@@ -0,0 +1,7 @@
+"clearDepthBuffer";"407.226";"#ff6633";"0"
+"computeVertexTFs";"2.795";"#99ff66";"0"
+"vertexShader";"160.891";"#ccaa00";"0"
+"primitiveAssembly";"1044.682";"#aa99cc";"0"
+"scanlineRasterization";"1481.845";"#33aa66";"0"
+"fragmentShader";"256.672";"#aaff33";"0"
+"host -> device memcpy";"50.192";"#ff9933";"0"
diff --git a/img/charts/instancing/many_cows_instanced.pdf b/img/charts/instancing/many_cows_instanced.pdf
diff --git a/img/charts/instancing/many_cows_instanced.png b/img/charts/instancing/many_cows_instanced.png
diff --git a/img/charts/instancing/pies.png b/img/charts/instancing/pies.png
diff --git a/img/charts/instancing/single_cow.csv b/img/charts/instancing/single_cow.csv
@@ -0,0 +1,7 @@
+"clearDepthBuffer";"407.638";"#ff6633";"0"
+"computeVertexTFs";"2.751";"#99ff66";"0"
+"vertexShader";"34.2";"#ccaa00";"0"
+"primitiveAssembly";"49.181";"#aa99cc";"0"
+"scanlineRasterization";"1847.084";"#33aa66";"0"
+"fragmentShader";"236.989";"#aaff33";"0"
+"host -> device memcpy";"60.225";"#ff9933";"0"
diff --git a/img/charts/instancing/single_cow.pdf b/img/charts/instancing/single_cow.pdf
diff --git a/img/charts/instancing/single_cow.png b/img/charts/instancing/single_cow.png
diff --git a/img/charts/instancing/stack_comparison.csv b/img/charts/instancing/stack_comparison.csv
@@ -0,0 +1,4 @@
+"";"clearDepthBuffer";"computeVertexTFs";"vertexShader";"primitiveAssembly";"scanlineRasterization";"fragmentShader";"host -> device memcpy"
+"SIngle Cow";"407.638";"2.751";"34.2";"49.181";"1847.084";"236.989";"60.225"
+"Geometry Cows";"408.746";"2.75";"186.007";"1053.543";"1355.387";"249.116";"2899.678"
+"Instanced Cows";"407.226";"2.795";"160.891";"1044.682";"1481.845";"256.672";"50.192"
diff --git a/img/charts/instancing/stack_comparison.pdf b/img/charts/instancing/stack_comparison.pdf
diff --git a/img/charts/instancing/stack_comparison.png b/img/charts/instancing/stack_comparison.png
diff --git a/img/charts/tiling/tiling.csv b/img/charts/tiling/tiling.csv
@@ -0,0 +1,5 @@
+"";"clear tiles/fragment buffer";"compute vertex transforms";"vertex shade";"primitive assembly";"bin primitives (tiling only)";"scanline rasterization"
+"cow";"407.724";"2.790";"33.997";"48.961";"0.0";"2803.701"
+"cow (tiled)";"5.793";"3.970";"33.211";"47.463";"4982.626";"1772.194"
+"zoomed cube";"409.288";"2.0768";"4.691";"10.299";"0.0";"56579.231"
+"zoomed cube (tiled)";"5.854";"3.952";"4.701";"10.212";"18.913";"1546.637"
diff --git a/img/charts/tiling/tiling.pdf b/img/charts/tiling/tiling.pdf
diff --git a/img/charts/tiling/tiling.png b/img/charts/tiling/tiling.png
diff --git a/img/instanced_cows.png b/img/instanced_cows.png
diff --git a/img/instancing_spread.png b/img/instancing_spread.png
diff --git a/img/one_cow.png b/img/one_cow.png
diff --git a/img/tiled_cow.png b/img/tiled_cow.png
diff --git a/img/tiled_cube.png b/img/tiled_cube.png
diff --git a/img/tiling_spread.png b/img/tiling_spread.png
diff --git a/img/untiled_cow.png b/img/untiled_cow.png
diff --git a/img/untiled_cube.png b/img/untiled_cube.png
diff --git a/notes.txt b/notes.txt
@@ -0,0 +1,147 @@
+add:
+-anti-aliasing
+-tessellation shader + wireframe shading OR instancing
+-tile based pipeline
+
+TESSELATION
+-every triangle gets an "innter" value and 4 "outer" values
+-wat
+
+INSTANCING [working]
+-we'll do it two ways
+	-serial instancing: multiple draws between buffer wipes
+	-parallel instancing: allocates more memory every time number of instances changes
+		-more memory in the primitives buffer
+		-add new kernels for parallel instancing
+
+-btw there's a problem where the fragment shader doesn't work on lab machines. wtf.
+
+ANTIALIASING
+-seems like the way MSAA works is:
+	-compute 5 samples per fragment
+	-but only compute fragment shading on the center
+	-use that value for each occluded sample
+	-and then average the samples for a color
+	-it's cheating antialiasing!
+	-for frags with more than one triangle... bad luck, must compute more times. worst case is 5x per frag, so worst case is like FSAA
+
+-TODO:
+-add a struct for AAfrag [done]
+-add a new AAdepth_buffer [done]
+-add a new AArender [done]
+-add a new AAscanline [done]
+-add fragment shaders [done]
+-add antialiasing to rasterize() [done]
+
+TILING
+-need to tile scanline and fragment shading
+-so need to make shared memory versions of each
+-need to use syncthreads()
+-go look at how the work efficient version was done
+
+TODO:
+-add a spatial datastructure
+	-we'll need one for each tile
+	-each needs room to hold a list of every primitive (yikes)
+	-so spatial datastructure size and allocation all needs to be done in one go
+	-also need a fast kernel to "dump" transformed primitives into each structure
+	-also will need to add a special destruction handler to destroy each list (yikes)
+-add a new tiled scanline -> AA version if we can get to it. doubtful.
+	-basically, uses shared memory
+	-so needs to clip to smaller coordinate area, use lower resolution
+	-fractional tiles are OK - be sure to handle this
+	-how to parallelize?
+		-over prims AND tiles? -> how to even do this? b/c need to ensure each tile's threads stay together.
+		-serial tiles, parallelize over prims in tile? -> just run a 250x for loop?
+			-this seems more likely.
+			-but then need to memcpy tile's primitive count over to hst?
+			-only other option is to launch kernel assuming full primitives, early-retire those that don't have a prim. <- ok. doable. slow?
+			-but again: problem is sharing a chunk of depth buffer.
+
+-ditto with fragment shading
+
+-what if...
+-we allocate primitives * [tiles to tesselate screen] (yikes, but similar to above)
+-for each primitive, compute its AABB coords and determine a list of tiles that it covers
+-some are occupied, some are not. tiles no longer need to store a list of internal prims
+-then when we scanline each primitive, we parallelize per tile
+-blocksize must be [tiles to tesselate screen]. share a chunk of depth buffer
+-similar memory requirements, in theory. improvement in that there's potentially less fetching from global memory
+-this idea makes a "tile" kind of like a giant fragment
+-in case of cow at 800x800, that's 5803 * 50 * 50 fragment_tiles.
+-this also allows stream compaction to reduce number of threads to launch
+-hmmm... but idea is to allow a block to share a chunk of the depth buffer. possibly impossible with this implementation
+
+-FUNDAMENTAL PROBLEM: idea is to share a chunk of the depth buffer, which limits the block size
+-this CAN'T BE DONE if we parallelize by primitive!
+	-unless block size is dynamic to number of primitives. potentially bad idea.
+	-otherwise complicates things a lot
+	-b/c suddenly if we have multiple blocks trying to write to the same chunk of screen space
+	-then we have to do another atomic write comparison at the writeback stage.
+	-which we CAN do, it's just slow.
+	-but I suppose... if the goal is just to access the global frame buffer LESS...
+	-this could be doable
+-the only hope is that not all the primitives are in the same chunk of screen space!
+
+-the polygon list implementation (fewer tiles, more data per tile) appears to be how it's done. ok.
+-risk: low occupancy. unless we can figure out how to parallelize over both tiles and polygon lists-just may need stream compaction to cull lists that don't do stuff.
+
+-scratch that: FUNDAMENTAL PROBLEM: the idea is to share a chunk of the depth buffer, which limits the block size
+-assume we try to parallelize across all tiles and all primitives
+	-so, in case of cow at 800x800, grid with 5803 * 50 * 50 kernel instances.
+	-each block however can only assess one tile. that's the whole point of using shared memory.
+	-possible to resolve blocks that end up with primitives spread across tiles, but ruins the point.
+		-this requires thrasing global memory all over again for threads that aren't in the "current tile"
+	-buuuuuut wait. maybe this is only a problem if we do a stream compaction run!
+	-otherwise, primitives can be buffered apart by threads that don't do anything!
+	-potentially lots of wasted threads, but it should work without having to thrash global framebuffer!
+	-so when we allocate list for each tile, must allocate a multiple of block size.
+	-well. or we could handle this in indexing. probably better that way.
+	-theoretically similar to serializing though
+		-low occupancy in serializing a result of not a lot of prims case, though
+		-so this should perform better in case of, say, few prims at a large screen size
+		-and should perform similarly in other cases, b/c effectively serialized by thread controller
+
+REVISED TODO
+-add a define for shared memory size. [done]
+-add a struct for TILE [done]
+	-holds pointer to a list of triangle indices. I suspect copying these over isn't going ot help
+	-holds int of number of triangles (aka point at which to "insert")
+	-also holds either NDC min/max or world min/max. check what rasterize does again
+-add a buffer of TILEs [done]
+-add a TILES setup function [done]
+-add a RASTERIZE TILE PRIMITIVE function <- the biggie. the realllll biggie.
+	-needs to use a shared memory chunk
+	-needs to correctly compute index of tile and index of primitive within tile (yikes)
+	-needs to use atomics both in scanline portion and in writeback portion
+
+-add tiled fragment shading (shouldn't be hard)
+
+-so: according to http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0363d/CJAEEJCF.html,
+it's OKAY to serialize tiles?
+-where does this get us?
+	-so fully parallelizing (running all tiles at once) imroves occupancy but also leads to a lot of wasted threads
+	-but bottom line problem: a tile may be spread across multiple blocks if there's a lot of primitives
+	-in which case the tile's writeback is no better than normal writeback
+	-how to force a single tile into all shared memory? -> might not be possible
+	-unless the block is just really really huge and we run each tile individually
+	-but this constrains the number of polygons we can use
+
+-how about hybrid parallel/serial approach with parallelization per tile? <- THIS. THIS MAKES THE MOST SENSE
+	-each block gets one tile
+	-each thread processes primitives in serial, using atomics to write to the block memory
+	-entire output is written out to the frame buffer; no need for atomics here
+
+TODO README:
+-describe features
+	-instancing
+	-MSAA, FSAA
+	-Tiling
+		-cows scene
+		-zoomed in cube scene
+
+ANALYSIS - INSTANCING (no AA)
+-single cow: 5804 triangles, 2903 vertices	
+-6 cows: 34824 triangles, 17418 verts
+-single cow, host multicow, and device multicow all run at 60 fps
+-cube scene: on the range of 18 fps untiled