diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e3eb28..79a4d70 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,7 +77,7 @@ find_package(CUDA REQUIRED) set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON) set(CUDA_SEPARABLE_COMPILATION ON) -#add_subdirectory(stream_compaction) # TODO: uncomment if using your own stream compaction +add_subdirectory(stream_compaction) add_subdirectory(src) add_subdirectory(util) @@ -89,7 +89,7 @@ cuda_add_executable(${CMAKE_PROJECT_NAME} target_link_libraries(${CMAKE_PROJECT_NAME} src util - #stream_compaction # TODO: uncomment if using your own stream compaction + stream_compaction ${CORELIBS} ) diff --git a/README.md b/README.md index 22d2f34..7ef1553 100644 --- a/README.md +++ b/README.md @@ -3,346 +3,111 @@ CUDA Rasterizer **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) - -### (TODO: Your README) - -*DO NOT* leave the README to the last minute! It is a crucial part of the -project, and we will not be able to grade you without a good README. - - -Instructions (delete me) -======================== - -This is due Sunday, October 11, evening at midnight. - -**Summary:** -In this project, you will use CUDA to implement a simplified -rasterized graphics pipeline, similar to the OpenGL pipeline. You will -implement vertex shading, primitive assembly, rasterization, fragment shading, -and a framebuffer. More information about the rasterized graphics pipeline can -be found in the class slides and in the CIS 560 lecture notes. - -The base code provided includes an OBJ loader and much of the I/O and -bookkeeping code. It also includes some functions that you may find useful, -described below. The core rasterization pipeline is left for you to implement. - -You are not required to use this base code if you don't want -to. You may also change any part of the base code as you please. -**This is YOUR project.** - -**Recommendation:** -Every image you save should automatically get a different -filename. Don't delete all of them! For the benefit of your README, keep a -bunch of them around so you can pick a few to document your progress. - - -### Contents - -* `src/` C++/CUDA source files. -* `util/` C++ utility files. -* `objs/` Example OBJ test files (# verts, # tris in buffers after loading) - * `tri.obj` (3v, 1t): The simplest possible geometric object. - * `cube.obj` (36v, 12t): A small model with low depth-complexity. - * `suzanne.obj` (2904 verts, 968 tris): A medium model with low depth-complexity. - * `suzanne_smooth.obj` (2904 verts, 968 tris): A medium model with low depth-complexity. - This model has normals which must be interpolated. - * `cow.obj` (17412 verts, 5804 tris): A large model with low depth-complexity. - * `cow_smooth.obj` (17412 verts, 5804 tris): A large model with low depth-complexity. - This model has normals which must be interpolated. - * `flower.obj` (1920 verts, 640 tris): A medium model with very high depth-complexity. - * `sponza.obj` (837,489 verts, 279,163 tris): A huge model with very high depth-complexity. -* `renders/` Debug render of an example OBJ. -* `external/` Includes and static libraries for 3rd party libraries. - -### Running the code - -The main function requires a scene description file. Call the program with -one as an argument: `cis565_rasterizer objs/cow.obj`. -(In Visual Studio, `../objs/cow.obj`.) - -If you are using Visual Studio, you can set this in the Debugging > Command -Arguments section in the Project properties. Note that this value is different -for every different configuration type. Make sure you get the path right; read -the console for errors. - -## Requirements - -**Ask on the mailing list for any clarifications.** - -In this project, you are given the following code: - -* A library for loading standard Alias/Wavefront `.obj` format mesh - files and converting them to OpenGL-style buffers of index and vertex data. - * This library does NOT read materials, and provides all colors as white by - default. You can use another library if you wish. -* Simple structs for some parts of the pipeline. -* Depth buffer to framebuffer copy. -* CUDA-GL interop. - -You will need to implement the following features/pipeline stages: - -* Vertex shading. -* (Vertex shader) perspective transformation. -* Primitive assembly with support for triangles read from buffers of index and - vertex data. -* Rasterization. -* Fragment shading. -* A depth buffer for storing and depth testing fragments. -* Fragment to depth buffer writing (**with** atomics for race avoidance). -* (Fragment shader) simple lighting scheme, such as Lambert or Blinn-Phong. - -See below for more guidance. - -You are also required to implement at least "3.0" points in extra features. -(the parenthesized numbers must add to 3.0 or more): - -* (1.0) Tile-based pipeline. -* Additional pipeline stages. - * (1.0) Tessellation shader. - * (1.0) Geometry shader, able to output a variable number of primitives per - input primitive, optimized using stream compaction (thrust allowed). - * (0.5 **if not doing geometry shader**) Backface culling, optimized using - stream compaction (thrust allowed). - * (1.0) Transform feedback. - * (0.5) Scissor test. - * (0.5) Blending (when writing into framebuffer). -* (1.0) Instancing: draw one set of vertex data multiple times, each run - through the vertex shader with a different ID. -* (0.5) Correct color interpolation between points on a primitive. -* (1.0) UV texture mapping with bilinear texture filtering and perspective - correct texture coordinates. -* Support for rasterizing additional primitives: - * (0.5) Lines or line strips. - * (0.5) Points. -* (1.0) Anti-aliasing. -* (1.0) Occlusion queries. -* (1.0) Order-independent translucency using a k-buffer. -* (0.5) **Mouse**-based interactive camera support. - -This extra feature list is not comprehensive. If you have a particular idea -you would like to implement, please **contact us first**. - -**IMPORTANT:** -For each extra feature, please provide the following brief analysis: - -* Concise overview write-up of the feature. -* Performance impact of adding the feature (slower or faster). -* If you did something to accelerate the feature, what did you do and why? -* How might this feature be optimized beyond your current implementation? - - -## Base Code Tour - -You will be working primarily in two files: `rasterize.cu`, and -`rasterizeTools.h`. Within these files, areas that you need to complete are -marked with a `TODO` comment. Areas that are useful to and serve as hints for -optional features are marked with `TODO (Optional)`. Functions that are useful -for reference are marked with the comment `CHECKITOUT`. **You should look at -all TODOs and CHECKITOUTs before starting!** There are not many. - -* `src/rasterize.cu` contains the core rasterization pipeline. - * A few pre-made structs are included for you to use, but those marked with - TODO will also be needed for a simple rasterizer. As with any part of the - base code, you may modify or replace these as you see fit. - -* `src/rasterizeTools.h` contains various useful tools - * Includes a number of barycentric coordinate related functions that you may - find useful in implementing scanline based rasterization. - -* `util/utilityCore.hpp` serves as a kitchen-sink of useful functions. - - -## Rasterization Pipeline - -Possible pipelines are described below. Pseudo-type-signatures are given. -Not all of the pseudocode arrays will necessarily actually exist in practice. - -### First-Try Pipeline - -This describes a minimal version of *one possible* graphics pipeline, similar -to modern hardware (DX/OpenGL). Yours need not match precisely. To begin, try -to write a minimal amount of code as described here. Verify some output after -implementing each pipeline step. This will reduce the necessary time spent -debugging. - -Start out by testing a single triangle (`tri.obj`). - -* Clear the depth buffer with some default value. -* Vertex shading: - * `VertexIn[n] vs_input -> VertexOut[n] vs_output` - * A minimal vertex shader will apply no transformations at all - it draws - directly in normalized device coordinates (-1 to 1 in each dimension). -* Primitive assembly. - * `VertexOut[n] vs_output -> Triangle[n/3] primitives` - * Start by supporting ONLY triangles. For a triangle defined by indices - `(a, b, c)` into `VertexOut` array `vo`, simply copy the appropriate values - into a `Triangle` object `(vo[a], vo[b], vo[c])`. -* Rasterization. - * `Triangle[n/3] primitives -> FragmentIn[m] fs_input` - * A scanline implementation is simpler to start with. -* Fragment shading. - * `FragmentIn[m] fs_input -> FragmentOut[m] fs_output` - * A super-simple test fragment shader: output same color for every fragment. - * Also try displaying various debug views (normals, etc.) -* Fragments to depth buffer. - * `FragmentOut[m] -> FragmentOut[width][height]` - * Results in race conditions - don't bother to fix these until it works! - * Can really be done inside the fragment shader, if you call the fragment - shader from the rasterization kernel for every fragment (including those - which get occluded). **OR,** this can be done before fragment shading, which - may be faster but means the fragment shader cannot change the depth. -* A depth buffer for storing and depth testing fragments. - * `FragmentOut[width][height] depthbuffer` - * An array of `fragment` objects. - * At the end of a frame, it should contain the fragments drawn to the screen. -* Fragment to framebuffer writing. - * `FragmentOut[width][height] depthbuffer -> vec3[width][height] framebuffer` - * Simply copies the colors out of the depth buffer into the framebuffer - (to be displayed on the screen). - -### A Useful Pipeline - -* Clear the depth buffer with some default value. -* Vertex shading: - * `VertexIn[n] vs_input -> VertexOut[n] vs_output` - * Apply some vertex transformation (e.g. model-view-projection matrix using - `glm::lookAt ` and `glm::perspective `). -* Primitive assembly. - * `VertexOut[n] vs_output -> Triangle[n/3] primitives` - * As above. - * Other primitive types are optional. -* Rasterization. - * `Triangle[n/3] primitives -> FragmentIn[m] fs_input` - * You may choose to do a tiled rasterization method, which should have lower - global memory bandwidth. - * A scanline optimization: when rasterizing a triangle, only scan over the - box around the triangle (`getAABBForTriangle`). -* Fragment shading. - * `FragmentIn[m] fs_input -> FragmentOut[m] fs_output` - * Add a shading method, such as Lambert or Blinn-Phong. Lights can be defined - by kernel parameters (like GLSL uniforms). -* Fragments to depth buffer. - * `FragmentOut[m] -> FragmentOut[width][height]` - * Can really be done inside the fragment shader, if you call the fragment - shader from the rasterization kernel for every fragment (including those - which get occluded). **OR,** this can be done before fragment shading, which - may be faster but means the fragment shader cannot change the depth. - * This result in an optimization: it allows you to do depth tests before - spending execution time in complex fragment shader code! - * Handle race conditions! Since multiple primitives write fragments to the - same fragment in the depth buffer, races must be avoided by using CUDA - atomics. - * *Approach 1:* Lock the location in the depth buffer during the time that - a thread is comparing old and new fragment depths (and possibly writing - a new fragment). This should work in all cases, but be slower. - * *Approach 2:* Convert your depth value to a fixed-point `int`, and use - `atomicMin` to store it into an `int`-typed depth buffer `intdepth`. After - that, the value which is stored at `intdepth[i]` is (usually) that of the - fragment which should be stored into the `fragment` depth buffer. - * This may result in some rare race conditions (e.g. across blocks). - * The `flower.obj` test file is good for testing race conditions. -* A depth buffer for storing and depth testing fragments. - * `FragmentOut[width][height] depthbuffer` - * An array of `fragment` objects. - * At the end of a frame, it should contain the fragments drawn to the screen. -* Fragment to framebuffer writing. - * `FragmentOut[width][height] depthbuffer -> vec3[width][height] framebuffer` - * Simply copies the colors out of the depth buffer into the framebuffer - (to be displayed on the screen). - -This is a suggested sequence of pipeline steps, but you may choose to alter the -order of this sequence or merge entire kernels as you see fit. For example, if -you decide that doing has benefits, you can choose to merge the vertex shader -and primitive assembly kernels, or merge the perspective transform into another -kernel. There is not necessarily a right sequence of kernels, and you may -choose any sequence that works. Please document in your README what sequence -you choose and why. - - -## Resources - -The following resources may be useful for this project: - -* High-Performance Software Rasterization on GPUs: - * [Paper (HPG 2011)](http://www.tml.tkk.fi/~samuli/publications/laine2011hpg_paper.pdf) - * [Code](http://code.google.com/p/cudaraster/) - * Note that looking over this code for reference with regard to the paper is - fine, but we most likely will not grant any requests to actually - incorporate any of this code into your project. - * [Slides](http://bps11.idav.ucdavis.edu/talks/08-gpuSoftwareRasterLaineAndPantaleoni-BPS2011.pdf) -* The Direct3D 10 System (SIGGRAPH 2006) - for those interested in doing - geometry shaders and transform feedback: - * [Paper](http://dl.acm.org/citation.cfm?id=1141947) - * [Paper, through Penn Libraries proxy](http://proxy.library.upenn.edu:2247/citation.cfm?id=1141947) -* Multi-Fragment Effects on the GPU using the k-Buffer - for those who want to do - order-independent transparency using a k-buffer: - * [Paper](http://www.inf.ufrgs.br/~comba/papers/2007/kbuffer_preprint.pdf) -* FreePipe: A Programmable, Parallel Rendering Architecture for Efficient - Multi-Fragment Effects (I3D 2010): - * [Paper](https://sites.google.com/site/hmcen0921/cudarasterizer) -* Writing A Software Rasterizer In Javascript: - * [Part 1](http://simonstechblog.blogspot.com/2012/04/software-rasterizer-part-1.html) - * [Part 2](http://simonstechblog.blogspot.com/2012/04/software-rasterizer-part-2.html) - - -## Third-Party Code Policy - -* Use of any third-party code must be approved by asking on our Google Group. -* If it is approved, all students are welcome to use it. Generally, we approve - use of third-party code that is not a core part of the project. For example, - for the path tracer, we would approve using a third-party library for loading - models, but would not approve copying and pasting a CUDA function for doing - refraction. -* Third-party code **MUST** be credited in README.md. -* Using third-party code without its approval, including using another - student's code, is an academic integrity violation, and will, at minimum, - result in you receiving an F for the semester. - - -## README - -Replace the contents of this README.md in a clear manner with the following: - -* A brief description of the project and the specific features you implemented. -* At least one screenshot of your project running. -* A 30 second or longer video of your project running. -* A performance analysis (described below). - -### Performance Analysis - -The performance analysis is where you will investigate how to make your CUDA -programs more efficient using the skills you've learned in class. You must have -performed at least one experiment on your code to investigate the positive or -negative effects on performance. - -We encourage you to get creative with your tweaks. Consider places in your code -that could be considered bottlenecks and try to improve them. - -Provide summary of your optimizations (no more than one page), along with -tables and or graphs to visually explain any performance differences. - -* Include a breakdown of time spent in each pipeline stage for a few different - models. It is suggested that you use pie charts or 100% stacked bar charts. -* For optimization steps (like backface culling), include a performance - comparison to show the effectiveness. - - -## Submit - -If you have modified any of the `CMakeLists.txt` files at all (aside from the -list of `SOURCE_FILES`), you must test that your project can build in Moore -100B/C. Beware of any build issues discussed on the Google Group. - -1. Open a GitHub pull request so that we can see that you have finished. - The title should be "Submission: YOUR NAME". - * **ADDITIONALLY:** - In the body of the pull request, include a link to your repository. -2. Send an email to the TA (gmail: kainino1+cis565@) with: - * **Subject**: in the form of `[CIS565] Project N: PENNKEY`. - * Direct link to your pull request on GitHub. - * Estimate the amount of time you spent on the project. - * If there were any outstanding problems, or if you did any extra - work, *briefly* explain. - * Feedback on the project itself, if any. +* Bradley Crusco +* Tested on: Windows 10, i7-3770K @ 3.50GHz 16GB, 2 x GTX 980 4096MB (Personal Computer) + +## Description + +An interactive GPU accelerated rasterizer. The program uses the scanline rasterization technique and can render obj models using triangle, line, or point primitives. Additional features include interactive mouse controls, backface culling, scissor test culling, and color and normal interpolation to support smooth models. + +### Stanford Dragon +![](renders/dragon.png "Stanford Dragon") + +### Video Demo +Real Time CUDA Rasterizer + +## Pipeline + +### Vertex Shading + +For vertex shading we take input vertices from a vertex in buffer and transform them by applying the model-view projection matrix so they are orientated correctly within our scene. After being transformed they are sent to a vertex out buffer. + +### Primitive Assembly + +While this project supports the rendering of different primitive types (triangles, points, and lines, see below for details), that is achieved through manipulating the only true primitive supported, triangles. The vertices that were transformed in the previous stage and sent to the vertex out buffer are turned into triangle primitives in groups of three, each making up one vertex of the triangle. The total size of the primitives array we create will be a third the size of the vertex out buffer. + +### Backface Culling (Optional) + +* **Overview**: Backface culling is a relatively simple procedure added to the pipeline as an option after the primitive assembly step. The step determines which primitives are facing the camera, and marks those that are not as not visible. After this has been done for all primitives, stream compaction is run on the resulting array of primitives and those that are not visible are removed from the pipeline. To determine if a primitive is facing the camera, we use the dot product between the vector from the camera to the model's position and the normal of the primitive. +* **Performance Impact**: Refer to the two below charts for visuals of the performance impact. Unfortunately by looking at the frame rate data we see that the feature is actually reducing general performance quite substantially. Without culling we see a frame rate of 109 fps. With scissor test enabled it drops to 87 fps. The next chart illustrates what is going on. We are actually getting the performance increase we'd expect with scissor test enabled. When we look at execution time for the rasterization step, which follows culling, it drops from 7,436 µs to 4,631 µs. So what's going on? The overhead of my stream compaction that takes place after culling to remove culled fragments is slowing down the program and erasing any performance gains we receive from the culling optimization. ![](data_images/Culling FPS.png "Culling Performance - FPS") ![](data_images/Culling Execution.png "Culling Performance - Execution Time") + +### Scissor Test (Optional) +![](renders/dragon_scissor.png "Scissor Test Enabled on Stanford Dragon") + +* **Overview**: The scissor test is another relatively simple stage added as an option to the pipeline. In the scene in the program, a rectangular portion of the screen can be defined as the bounds of this scissor clipping. Anything outside the bounds will be clipped from the scene. Whether a primitive is to be displayed is determined by checking the maximum and minimum points on the primitive's bounding box and comparing those positions to the dimensions of the rectangle defining our culling area. If the max or min of the bounding box lays outside this area, the primitive is marked as not visible. Once we've run the test on all the primitives in the scene, stream compaction is used to remove the invalid primitives from the array. +* **Performance Impact**: Refer to the two below charts for visuals of the performance impact. Unfortunately by looking at the frame rate data we see that the feature is actually reducing general performance quite substantially. Without scissor test we see a frame rate of 109 fps. With scissor test enabled it drops to 79 fps. The next chart illustrates what is going on. We are actually getting the performance increase we'd expect with scissor test enabled. When we look at execution time for the rasterization step, which follows the scissor test, it drops from 7,436 µs to 5,315 µs. So what's going on? The overhead of my stream compaction that takes place after the scissor test to remove culled fragments is slowing down the program and erasing any performance gains we receive from the scissor test optimization. ![](data_images/Scissor FPS.png "Scissor Test Performance - FPS") ![](data_images/Scissor Execute.png "Scissor Test Performance - Execution Time") + +### Rasterization + +#### Triangles +![](renders/dragon_tri.png "Stanford Dragon Rendered Using Triangle Primitives (No Normal Interpolation)") + +To render the basic rasterization primitive, the triangle, each GPU thread is responsible for one triangle. The bounding box for that triangle is retrieved, and, through the scanline implementation, the thread loops over each pixel in the bounding box. + +#### Normal and Color Interpolation +![](renders/cow_interp_comp.png "Cow With and Without Normal Interpolation") + +* **Overview**: Implementing normal and color interpolation gives significantly more visually pleasing results, as can be seen above. Without the interpolation, models look obviously constructed of triangles. With interpolation, the models are smooth and provide the realistic effect we'd expect. To achieve these smooth models, the obj file must provide vertex normals. If not, the result will look like interpolation is disabled. The same for color, if not provided in the object file per vertex, the model will be one solid color. To calculate the interpolated results, we first calculate the barycentric coordinate. Once we do that and have determined that the coordinate is in bounds and the z position of our primitive passes the depth test, we interpolate by adding the sum of the product of the x, y, and z components of the barycentric coordinate and each of the three vertices, respectively. +* **Performance Impact**: Minimal. The majority of the performance overhead for this is in the calculation of the barycentric coordinate and the depth test, components that are required for the rasterization of the triangle primitive with or without interpolated normals and colors. The few multiplications and additions needed to calculate the interpolated values do not include noticible performance impact. + +#### Points +![](renders/dragon_points.png "Stanford Dragon Rendered Using Point Primitives") + +* **Overview**: For this effect, the standard rasterization step of the pipeline is replaced with one to output points. Because we are only rendering a point and not an entire triangle, barycentric coordinates do not need to be calculated, nor do we have to interpolate the normals or colors across each vertex. Instead we just output the values for a single vertex to the depth buffer (I use the middle vertex at index one) and lead the others as zero. This vertex will be the point that is rendered to the screen. +* **Performance Impact**: There is actually a small increase in performance compared to the other two primitive types, as the point requires no additional calculation to be rendered, unlike the triangles and lines. + +#### Lines +![](renders/dragon_lines.png "Stanford Dragon Rendered Using Line Primitives") + +* **Overview**: The implementation for lines is more complicated than the other two primitives. For ease of implementation, I render the line between the first and second vertices of a the triangle primitives. There are two situations that need to be handled when rendering these. The first is the simplest case, when the line to render is a straight vertical line. Here we loop between the min and end values in the y direction and render as we travel along. For all other lines we must calculate a Bresenham line, is a method of approximating the rasterization of a line. Otherwise it works just like rendering the vertical line, with the main difference being a loop along the x direction while using the slope of the line to calculate a y value in the correct position at each iteration of the loop, and z is determined using the depth test. +* **Performance Impact**: Minimal. Having to loop through and generate the line and do the Bresenham calculating requires extra computation, but is similar in execution time to the triangle primitive implementation. + +### Fragment Shading + +The fragment shader updates the color value of a fragment in the depth buffer using a basic Lambert shader. It essentially adds lighting to the scene. As of this writing the implementation uses a single light source at located at the camera position to keep the object lit while the user moves the camera using the mouse controls. + +### Fragments to Framebuffer +These fragments are finally output to the framebuffer, and from there displayed to the screen. + +## Additional Features + +### Mouse Interaction + +* **Overview**: The rasterizer renders in real time and can be interacted with by the user using mouse controls. +* **Rotation**: Holding the left mouse button and moving the cursor rotates the camera around the object. +* **Panning**: Holding the middle mouse button and moving the cursor pans the camera along the x and y directions. +* **Zooming**: Moving the scroll wheel in and out moves the camera along the z axis. + +## Performance Analysis + +### Rasterization Pipeline Stage Durations for Select Models + +Below you can see performance breakdowns for three different models, a smooth cow model, a smooth Stanford dragon model, and a complex "flower" model. The breakdown shows the percentage of team each main kernel stage of the rasterization pipeline takes relative to the others. In all three examples, you can see that by far the largest portion of the pipeline is the rasterization stage. Which stage comes in second varies dramatically. Between the cow and the dragon model, we see both the vertex shader and the primitive assembly stages increase, the primitive assembly especially so. This is because of the additional complexity of the dragon model, it contains many more polygons than the cow model. When we look at the chart for the "flower" model, the duration of the rasterization step is so significant that it essentially erases every other step in the pipeline. This model presents a very complicated depth problem for the depth test component of the rasterization stage, and the results here show how significant that portion of the pipeline is to performance. + +![](renders/cow.png "Smooth Cow") +![](data_images/Cow Pie Chart.png "Cow Rasterization Pipeline Breakdown") + +![](renders/Dragon_norm.png "Smooth Dragon.png") +![](data_images/Dragon Pie Chart.png "Dragon Rasterization Pipeline Breakdown") + +![](renders/flower.png "Flower.png") +![](data_images/Flower Pie Chart.png "Flower Rasterization Pipeline Breakdown") + +### Optimizations +I was able to do a significant amount of optimization work on this project. Below is the original performance data from NSight. + +![](data_images/Original Performance Summary.png "Original Performance Summary") + +You can see that there is occupancy issues with the vertexShading, assemblePrimitives, and rasterization kernels. Below is the updated NSight data after all of my optimizations. + +![](data_images/Final Performance Summary.png "Final Performance Summary") + +Note: Ignore the kernel durations, as this data is for two different models. I accidentally took the wrong original data on too simple of a model. The model for the second data set is the much more complicated Smooth Stanford Dragon model. + +* **vertexShading** + * Increased block size from 128 to 256 and based grid size off of number of vertices, not screen size. Dramatically reduced execution time of the kernel. + * Reduced register count to 30 per thread from 38, achieving 100% occupancy. +* **assemblePrimitives** + * Reduced register count to 18 from 48, achieving 100% occupancy. +* **rasterization** + * Reduced register count to 48 from 104, going from 25% occupancy to 62.50%. diff --git a/data_images/Cow Pie Chart.png b/data_images/Cow Pie Chart.png new file mode 100644 index 0000000..41a32c5 Binary files /dev/null and b/data_images/Cow Pie Chart.png differ diff --git a/data_images/Culling Execution.png b/data_images/Culling Execution.png new file mode 100644 index 0000000..23fb441 Binary files /dev/null and b/data_images/Culling Execution.png differ diff --git a/data_images/Culling FPS.png b/data_images/Culling FPS.png new file mode 100644 index 0000000..d82bd2d Binary files /dev/null and b/data_images/Culling FPS.png differ diff --git a/data_images/Dragon Pie Chart.png b/data_images/Dragon Pie Chart.png new file mode 100644 index 0000000..1dfad7d Binary files /dev/null and b/data_images/Dragon Pie Chart.png differ diff --git a/data_images/Fake Youtube.png b/data_images/Fake Youtube.png new file mode 100644 index 0000000..6b08e7d Binary files /dev/null and b/data_images/Fake Youtube.png differ diff --git a/data_images/Final Performance Summary.png b/data_images/Final Performance Summary.png new file mode 100644 index 0000000..ed5f7b8 Binary files /dev/null and b/data_images/Final Performance Summary.png differ diff --git a/data_images/Flower Pie Chart.png b/data_images/Flower Pie Chart.png new file mode 100644 index 0000000..9dce1e1 Binary files /dev/null and b/data_images/Flower Pie Chart.png differ diff --git a/data_images/Original Performance Summary.png b/data_images/Original Performance Summary.png new file mode 100644 index 0000000..182c5da Binary files /dev/null and b/data_images/Original Performance Summary.png differ diff --git a/data_images/Scissor Execute.png b/data_images/Scissor Execute.png new file mode 100644 index 0000000..6351004 Binary files /dev/null and b/data_images/Scissor Execute.png differ diff --git a/data_images/Scissor FPS.png b/data_images/Scissor FPS.png new file mode 100644 index 0000000..240d913 Binary files /dev/null and b/data_images/Scissor FPS.png differ diff --git a/objs/dragon_smooth.psd b/objs/dragon_smooth.psd new file mode 100644 index 0000000..4cb3ccf Binary files /dev/null and b/objs/dragon_smooth.psd differ diff --git a/renders/Dragon_norm.png b/renders/Dragon_norm.png new file mode 100644 index 0000000..3006b20 Binary files /dev/null and b/renders/Dragon_norm.png differ diff --git a/renders/cow.png b/renders/cow.png new file mode 100644 index 0000000..3aa8531 Binary files /dev/null and b/renders/cow.png differ diff --git a/renders/cow_interp_comp.png b/renders/cow_interp_comp.png new file mode 100644 index 0000000..b154655 Binary files /dev/null and b/renders/cow_interp_comp.png differ diff --git a/renders/cow_zdepth.png b/renders/cow_zdepth.png deleted file mode 100644 index 9a25043..0000000 Binary files a/renders/cow_zdepth.png and /dev/null differ diff --git a/renders/dragon.png b/renders/dragon.png new file mode 100644 index 0000000..da89996 Binary files /dev/null and b/renders/dragon.png differ diff --git a/renders/dragon_interp_comp.png b/renders/dragon_interp_comp.png new file mode 100644 index 0000000..67ff570 Binary files /dev/null and b/renders/dragon_interp_comp.png differ diff --git a/renders/dragon_lines.png b/renders/dragon_lines.png new file mode 100644 index 0000000..e70be67 Binary files /dev/null and b/renders/dragon_lines.png differ diff --git a/renders/dragon_points.png b/renders/dragon_points.png new file mode 100644 index 0000000..21298be Binary files /dev/null and b/renders/dragon_points.png differ diff --git a/renders/dragon_scissor.png b/renders/dragon_scissor.png new file mode 100644 index 0000000..32fbda9 Binary files /dev/null and b/renders/dragon_scissor.png differ diff --git a/renders/dragon_tri.png b/renders/dragon_tri.png new file mode 100644 index 0000000..613ca37 Binary files /dev/null and b/renders/dragon_tri.png differ diff --git a/renders/flower.png b/renders/flower.png new file mode 100644 index 0000000..7af75d3 Binary files /dev/null and b/renders/flower.png differ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a57f69f..e7e90de 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,6 +2,8 @@ set(SOURCE_FILES "rasterize.cu" "rasterize.h" "rasterizeTools.h" + "scene.cpp" + "scene.hpp" ) cuda_add_library(src diff --git a/src/main.cpp b/src/main.cpp index a125d7c..eb0cc2f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -8,6 +8,8 @@ #include "main.hpp" +#include + //------------------------------- //-------------MAIN-------------- //------------------------------- @@ -83,7 +85,6 @@ void runCuda() { frame++; fpstracker++; - } //------------------------------- @@ -106,6 +107,9 @@ bool init(obj *mesh) { } glfwMakeContextCurrent(window); glfwSetKeyCallback(window, keyCallback); + glfwSetCursorPosCallback(window, mouseMoveCallback); + glfwSetMouseButtonCallback(window, mousePressCallback); + glfwSetScrollCallback(window, mouseScrollCallback); // Set up GL context glewExperimental = GL_TRUE; @@ -159,7 +163,10 @@ void initCuda() { // Use device with highest Gflops/s cudaGLSetGLDevice(0); - rasterizeInit(width, height); + scene = new Scene(); + scene->scissorMax = glm::vec2(width * 0.75f, height * 0.75f); + scene->scissorMin = glm::vec2(width * 0.25f, height * 0.25f); + rasterizeInit(width, height, scene); // Clean up on program exit atexit(cleanupCuda); @@ -274,3 +281,75 @@ void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods glfwSetWindowShouldClose(window, GL_TRUE); } } + +void mouseMoveCallback(GLFWwindow *window, double x, double y) { + if (!scene->mouseState.initialPositionsSet) { + // Have to set a baseline position for our offsets to base off of + scene->mouseState.x = x; + scene->mouseState.y = y; + scene->mouseState.initialPositionsSet = true; + } + else { + if (scene->mouseState.leftPressed) { + // Holding the left mouse rotates the camera. + glm::vec3 inverseLookAt = scene->camera.position - scene->camera.lookAt; + float xOffset = 0.005f * (scene->mouseState.x - (float)x); + float yOffset = 0.005f * (scene->mouseState.y - (float)y); + glm::vec3 tempInverseLookAt = glm::rotateY(inverseLookAt, glm::atan((float)(xOffset / + (TWO_PI * glm::length(inverseLookAt))), glm::length(inverseLookAt))); + + scene->camera.right = glm::normalize(glm::rotateY(glm::vec3(inverseLookAt.x, 0.0f, inverseLookAt.z), (float)(PI / 2.0))); + scene->camera.position = scene->camera.lookAt + glm::rotate(tempInverseLookAt, glm::atan(yOffset / + (float)(TWO_PI * glm::length(inverseLookAt)), glm::length(tempInverseLookAt)), scene->camera.right); + scene->updateModelView(); + } + else if (scene->mouseState.middlePressed) { + // Holding middle will translate the camera + float xOffset = 0.0001f * (scene->mouseState.x - (float)x); + float yOffset = 0.0001f * ((float)y - scene->mouseState.y); + glm::vec3 tempPosition = (scene->camera.position + scene->camera.right * xOffset) + (scene->camera.up * yOffset); + glm::vec3 tempLookAt = scene->camera.lookAt - tempPosition; + + if (glm::length(tempLookAt) > 1.0f) { + // Restrict amount user can pan to stop going through model + scene->camera.lookAt = scene->camera.lookAt + tempPosition - scene->camera.position; + scene->camera.position = tempPosition; + scene->updateModelView(); + } + } + } +} + +void mousePressCallback(GLFWwindow *window, int button, int action, int mods) { + if (button == GLFW_MOUSE_BUTTON_LEFT && action == GLFW_PRESS) { + // Mark left mouse button as held + scene->mouseState.leftPressed = true; + scene->mouseState.initialPositionsSet = false; + } + else if (button == GLFW_MOUSE_BUTTON_MIDDLE && action == GLFW_PRESS) { + // Mark right mouse button as held + scene->mouseState.middlePressed = true; + scene->mouseState.initialPositionsSet = false; + } + else if (action == GLFW_RELEASE) { + // Reset the flags for which button is held + scene->mouseState.leftPressed = false; + scene->mouseState.middlePressed = false; + scene->mouseState.initialPositionsSet = true; + } +} + +void mouseScrollCallback(GLFWwindow *window, double x, double y) { + if (y > 0.0) { + glm::vec3 tempLookAt = scene->camera.lookAt - scene->camera.position; + if (glm::length(tempLookAt) > 1.0) { + // Stop user from zooming in too close and through/past the model + scene->camera.position += glm::normalize(tempLookAt) * 0.1f; + scene->updateModelView(); + } + } + else if (y < 0.0) { + scene->camera.position = scene->camera.position + (glm::normalize(scene->camera.position - scene->camera.lookAt) * 0.1f); + scene->updateModelView(); + } +} diff --git a/src/main.hpp b/src/main.hpp index 49d3948..9e030ff 100644 --- a/src/main.hpp +++ b/src/main.hpp @@ -24,6 +24,7 @@ #include #include #include "rasterize.h" +#include "scene.hpp" using namespace std; @@ -43,6 +44,8 @@ uchar4 *dptr; GLFWwindow *window; +Scene *scene; + //------------------------------- //----------CUDA STUFF----------- //------------------------------- @@ -93,3 +96,6 @@ void deleteTexture(GLuint *tex); void mainLoop(); void errorCallback(int error, const char *description); void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods); +void mouseMoveCallback(GLFWwindow *window, double x, double y); +void mousePressCallback(GLFWwindow *window, int button, int action, int mods); +void mouseScrollCallback(GLFWwindow *window, double x, double y); \ No newline at end of file diff --git a/src/rasterize.cu b/src/rasterize.cu index 53103b5..bcccc9f 100644 --- a/src/rasterize.cu +++ b/src/rasterize.cu @@ -11,35 +11,29 @@ #include #include #include +#include #include #include #include "rasterizeTools.h" +#include "../stream_compaction/efficient.h" -struct VertexIn { - glm::vec3 pos; - glm::vec3 nor; - glm::vec3 col; - // TODO (optional) add other vertex attributes (e.g. texture coordinates) -}; -struct VertexOut { - // TODO -}; -struct Triangle { - VertexOut v[3]; -}; -struct Fragment { - glm::vec3 color; -}; +#define VERTBLOCKSIZE 256 +#define FRAGBLOCKSIZE 256 static int width = 0; static int height = 0; +static Scene *scene = NULL; static int *dev_bufIdx = NULL; static VertexIn *dev_bufVertex = NULL; +static VertexOut *dev_bufVertexOut = NULL; static Triangle *dev_primitives = NULL; +static int *dev_depth = NULL; static Fragment *dev_depthbuffer = NULL; static glm::vec3 *dev_framebuffer = NULL; static int bufIdxSize = 0; static int vertCount = 0; +static int primitiveCount = 0; +static Triangle *dev_compactionOutput = NULL; /** * Kernel that writes the image to the OpenGL PBO directly. @@ -75,12 +69,235 @@ void render(int w, int h, Fragment *depthbuffer, glm::vec3 *framebuffer) { } } +/** + * Clears the depth buffers and primitive buffer. + */ +void clearDepthBuffer() { + cudaMemset(dev_depth, scene->farPlane * 10000, width * height * sizeof(int)); + cudaMemset(dev_depthbuffer, 0.0f, width * height * sizeof(Fragment)); +} + +/** + * Apply vertex transformations and transfer to vertex out buffer + */ +__global__ +void vertexShading(int w, int h, int nearPlane, int farPlane, int vertexCount, const VertexIn *vertexBufferIn, + VertexOut *vertexBufferOut, const glm::mat4 modelView) { + int index = ((blockIdx.x * blockDim.x) + threadIdx.x) + (((blockIdx.y * blockDim.y) + threadIdx.y) * w); + + if (index < vertexCount) { + glm::vec4 clipCoordinates = modelView * glm::vec4(vertexBufferIn[index].pos, 1.0f); + glm::vec3 normDeviceCoordinates = glm::vec3(clipCoordinates.x, clipCoordinates.y, clipCoordinates.z) / clipCoordinates.w; + + vertexBufferOut[index].pos = glm::vec3(w * 0.5f * (normDeviceCoordinates.x + 1.0f), + h * 0.5f * (normDeviceCoordinates.y + 1.0f), 0.5f * ((farPlane - nearPlane) + * normDeviceCoordinates.z + (farPlane + nearPlane))); + vertexBufferOut[index].col = vertexBufferIn[index].col; + vertexBufferOut[index].nor = vertexBufferIn[index].nor; + vertexBufferOut[index].model_pos = vertexBufferIn[index].pos; + } +} + +/** + * Assemble primitives from vertex out buffer data. + */ +__global__ +void assemblePrimitives(int primitiveCount, const VertexOut *vertexBufferOut, Triangle *primitives, const int *bufIdx) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index < primitiveCount) { + for (int i = 0; i < 3; i++) { + primitives[index].v[i] = vertexBufferOut[bufIdx[3 * index + i]]; + } + + primitives[index].boundingBox = getAABBForTriangle(primitives[index]); + primitives[index].visible = true; + } +} + + +/** +* Perform scanline rasterization on a triangle +*/ +__global__ +void rasterization(int w, int h, int primitiveCount, Triangle *primitives, Fragment *depthbuffer, int *depth) { + int index = ((blockIdx.x * blockDim.x) + threadIdx.x) + (((blockIdx.y * blockDim.y) + threadIdx.y) * w); + + if (index < primitiveCount) { + // Only doing scanline triangle atm + int minX = fmaxf(round(primitives[index].boundingBox.min.x), 0.0f), minY = fmaxf(round(primitives[index].boundingBox.min.y), 0.0f); + int maxX = fminf(round(primitives[index].boundingBox.max.x), (float)w); + + // Loop through each scanline, then each pixel on the line + for (int y = fminf(round(primitives[index].boundingBox.max.y), (float)h); y >= minY; y--) { + for (int x = minX; x <= maxX; x++) { + glm::vec3 baryCentricCoordinate = calculateBarycentricCoordinate(primitives[index], glm::vec2(x, y)); + if (isBarycentricCoordInBounds(baryCentricCoordinate)) { + int z = getZAtCoordinate(baryCentricCoordinate, primitives[index]) * 10000.0f; + int depthIndex = w - x + (h - y) * w; + + atomicMin(&depth[depthIndex], z); + + if (depth[depthIndex] == z) { + depthbuffer[depthIndex].color = baryCentricCoordinate.x * primitives[index].v[0].col + baryCentricCoordinate.y + * primitives[index].v[1].col + baryCentricCoordinate.z * primitives[index].v[2].col; + depthbuffer[depthIndex].position = baryCentricCoordinate.x * primitives[index].v[0].pos + baryCentricCoordinate.y + * primitives[index].v[1].pos + baryCentricCoordinate.z * primitives[index].v[2].pos; + depthbuffer[depthIndex].normal = baryCentricCoordinate.x * primitives[index].v[0].nor + baryCentricCoordinate.y + * primitives[index].v[1].nor + baryCentricCoordinate.z * primitives[index].v[2].nor; + } + } + } + } + } +} + +/** +* Rasterize point primitives. +*/ +__global__ +void pointRasterization(int w, int h, int primitiveCount, Triangle *primitives, Fragment *depthbuffer, int *depth) { + int index = ((blockIdx.x * blockDim.x) + threadIdx.x) + (((blockIdx.y * blockDim.y) + threadIdx.y) * w); + + if (index < primitiveCount) { + Triangle primitive = primitives[index]; + int x = round(primitive.v[1].pos.x), y = round(primitive.v[1].pos.y); + int z = primitive.v[1].pos.z * 10000.0f; + int depthIndex = w - x + (h - y) * w; + + atomicMin(&depth[depthIndex], z); + + if (depth[depthIndex] == z) { + Fragment fragment; + fragment.color = primitive.v[1].col; + fragment.position = primitive.v[1].pos; + fragment.normal = primitive.v[1].nor; + depthbuffer[depthIndex] = fragment; + } + } +} + + +/** +* Rasterize line primitives. +*/ +__global__ +void lineRasterization(int w, int h, int primitiveCount, Triangle *primitives, Fragment *depthbuffer, int *depth) { + int index = ((blockIdx.x * blockDim.x) + threadIdx.x) + (((blockIdx.y * blockDim.y) + threadIdx.y) * w); + + if (index < primitiveCount) { + Triangle primitive = primitives[index]; + glm::vec3 minPosition = primitive.v[0].pos, maxPosition = primitive.v[1].pos; + + if (round(minPosition.x) == round(maxPosition.x)) { + // Get straight vertical line + int x = round(minPosition.x); + if (minPosition.y > maxPosition.y) { + // Flip + minPosition = primitive.v[1].pos; + maxPosition = primitive.v[0].pos; + } + + for (int y = round(maxPosition.y); y >= round(minPosition.y); y--) { + float minMaxRatio = __fdividef(y - minPosition.y, maxPosition.y - minPosition.y); + int depthIndex = w - x + (h - y) * w; + int z = -(minMaxRatio * round(minPosition.z) + (1.0f - minMaxRatio) * round(maxPosition.z)); + + atomicMin(&depth[depthIndex], z); + + if (depth[depthIndex] == z) { + Fragment fragment; + fragment.color = primitive.v[1].col; + fragment.position = glm::vec3(x, y, -z); + fragment.normal = glm::normalize(primitive.v[0].nor + primitive.v[1].nor); + depthbuffer[depthIndex] = fragment; + } + } + } + else { + //Have to calculate a Bresenham line + if (round(minPosition.x) > round(maxPosition.x)) { + // Swap + minPosition = primitive.v[1].pos; + maxPosition = primitive.v[0].pos; + } + + float slope = (maxPosition.y - minPosition.y) / (maxPosition.x - minPosition.x); + + for (int x = round(minPosition.x); x <= round(maxPosition.x); x++) { + int y = slope * (x - round(minPosition.x)) + minPosition.y; + float minMaxRatio = __fdividef(y - minPosition.y, maxPosition.y - minPosition.y); + int depthIndex = w - x + (h - y) * w; + int z = -(minMaxRatio * minPosition.z + (1.0f - minMaxRatio) * maxPosition.z); + + atomicMin(&depth[depthIndex], z); + + if (depth[depthIndex] == z) { + Fragment fragment; + fragment.color = primitive.v[1].col; + fragment.position = glm::vec3(x, y, -z); + fragment.normal = glm::normalize(primitive.v[0].nor + primitive.v[1].nor); + depthbuffer[depthIndex] = fragment; + } + } + } + } +} + +/** +* Fragment shader. Use light argument to color the fragment in the depth buffer. +*/ +__global__ +void fragmentShading(int w, int h, Fragment *depthBuffer, const Light light) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index < (w * h)) { + Fragment fragment = depthBuffer[index]; + depthBuffer[index].color = (glm::dot(glm::normalize(light.position - fragment.position), fragment.normal) + * fragment.color * light.color); + } +} + +/** +* Perform backface culling optimization, removing unscene fragments. +*/ +__global__ +void backFaceCulling(int w, int primitiveCount, Triangle *primitives, glm::vec3 cameraPosition) { + int index = ((blockIdx.x * blockDim.x) + threadIdx.x) + (((blockIdx.y * blockDim.y) + threadIdx.y) * w); + + if (index < primitiveCount) { + if (glm::dot(primitives[index].v[0].model_pos - cameraPosition, primitives[index].v[0].nor) >= 0.0f) { + primitives[index].visible = false; + } + } +} + +/** +* Perform scissor test culling, removing fragments outside of the scissor area. +*/ +__global__ +void scissorTest(int w, int primitiveCount, Triangle *primitives, const glm::vec2 scissorMax, const glm::vec2 scissorMin) { + int index = ((blockIdx.x * blockDim.x) + threadIdx.x) + (((blockIdx.y * blockDim.y) + threadIdx.y) * w); + + if (index < primitiveCount) { + if (primitives[index].boundingBox.min.y > scissorMax.y || primitives[index].boundingBox.max.y < scissorMin.y || + primitives[index].boundingBox.max.x > scissorMax.x || primitives[index].boundingBox.max.x < scissorMin.x) { + primitives[index].visible = false; + } + } +} + /** * Called once at the beginning of the program to allocate memory. */ -void rasterizeInit(int w, int h) { +void rasterizeInit(int w, int h, Scene *s) { width = w; height = h; + scene = s; + + cudaFree(dev_depth); + cudaMalloc(&dev_depth, width * height * sizeof(int)); + cudaMemset(dev_depth, scene->farPlane * 10000.0f, width * height * sizeof(int)); cudaFree(dev_depthbuffer); cudaMalloc(&dev_depthbuffer, width * height * sizeof(Fragment)); cudaMemset(dev_depthbuffer, 0, width * height * sizeof(Fragment)); @@ -98,6 +315,7 @@ void rasterizeSetBuffers( int _vertCount, float *bufPos, float *bufNor, float *bufCol) { bufIdxSize = _bufIdxSize; vertCount = _vertCount; + primitiveCount = vertCount / 3; cudaFree(dev_bufIdx); cudaMalloc(&dev_bufIdx, bufIdxSize * sizeof(int)); @@ -114,10 +332,18 @@ void rasterizeSetBuffers( cudaMalloc(&dev_bufVertex, vertCount * sizeof(VertexIn)); cudaMemcpy(dev_bufVertex, bufVertex, vertCount * sizeof(VertexIn), cudaMemcpyHostToDevice); + cudaFree(dev_bufVertexOut); + cudaMalloc(&dev_bufVertexOut, vertCount * sizeof(VertexOut)); + cudaMemset(dev_bufVertexOut, 0, vertCount * sizeof(VertexIn)); + cudaFree(dev_primitives); cudaMalloc(&dev_primitives, vertCount / 3 * sizeof(Triangle)); cudaMemset(dev_primitives, 0, vertCount / 3 * sizeof(Triangle)); + cudaFree(dev_compactionOutput); + cudaMalloc(&dev_compactionOutput, vertCount / 3 * sizeof(Triangle)); + cudaMemset(dev_compactionOutput, 0, vertCount / 3 * sizeof(Triangle)); + checkCUDAError("rasterizeSetBuffers"); } @@ -127,14 +353,56 @@ void rasterizeSetBuffers( void rasterize(uchar4 *pbo) { int sideLength2d = 8; dim3 blockSize2d(sideLength2d, sideLength2d); - dim3 blockCount2d((width - 1) / blockSize2d.x + 1, - (height - 1) / blockSize2d.y + 1); + dim3 blockCount2d((width + blockSize2d.x - 1) / blockSize2d.x, + (height + blockSize2d.y - 1) / blockSize2d.y); + int vertexBlockSize = VERTBLOCKSIZE, fragmentBlockSize = FRAGBLOCKSIZE; + int vertexGridSize = (vertCount + VERTBLOCKSIZE - 1) / VERTBLOCKSIZE; + int fragmentGridSize = (width * height + FRAGBLOCKSIZE - 1) / FRAGBLOCKSIZE; - // TODO: Execute your rasterization pipeline here - // (See README for rasterization pipeline outline.) + primitiveCount = vertCount / 3; + + // Clear depth buffer + clearDepthBuffer(); + + // Vertex shading + vertexShading<<>>(width, height, scene->nearPlane, scene->farPlane, vertCount, dev_bufVertex, dev_bufVertexOut, scene->modelView); + + // Primitive Assembly + assemblePrimitives<<>>(primitiveCount, dev_bufVertexOut, dev_primitives, dev_bufIdx); + + // Culling after Primitive assembly + if (scene->culling) { + backFaceCulling<<>>(width, primitiveCount, dev_primitives, scene->camera.position); + primitiveCount = StreamCompaction::Efficient::Compact(primitiveCount, dev_compactionOutput, dev_primitives); + cudaMemcpy(dev_primitives, dev_compactionOutput, primitiveCount * sizeof(Triangle), cudaMemcpyDeviceToDevice); + } + + // Scissor test + if (scene->scissor) { + scissorTest<<>>(width, primitiveCount, dev_primitives, scene->scissorMax, scene->scissorMin); + primitiveCount = StreamCompaction::Efficient::Compact(primitiveCount, dev_compactionOutput, dev_primitives); + cudaMemcpy(dev_primitives, dev_compactionOutput, primitiveCount * sizeof(Triangle), cudaMemcpyDeviceToDevice); + } + + // rasterization + // Choose between primitive types based on scene file + if (scene->pointRasterization) { + pointRasterization<<>>(width, height, primitiveCount, dev_primitives, dev_depthbuffer, dev_depth); + } + else if (scene->lineRasterization) { + lineRasterization<<>>(width, height, primitiveCount, dev_primitives, dev_depthbuffer, dev_depth); + } + else { + // Standard triangle rasterization + rasterization<<>>(width, height, primitiveCount, dev_primitives, dev_depthbuffer, dev_depth); + } + + // Fragment shading + fragmentShading<<>>(width, height, dev_depthbuffer, scene->light); // Copy depthbuffer colors into framebuffer render<<>>(width, height, dev_depthbuffer, dev_framebuffer); + // Copy framebuffer into OpenGL buffer for OpenGL previewing sendImageToPBO<<>>(pbo, width, height, dev_framebuffer); checkCUDAError("rasterize"); @@ -159,5 +427,11 @@ void rasterizeFree() { cudaFree(dev_framebuffer); dev_framebuffer = NULL; + cudaFree(dev_depth); + dev_depth = NULL; + + cudaFree(dev_compactionOutput); + dev_compactionOutput = NULL; + checkCUDAError("rasterizeFree"); } diff --git a/src/rasterize.h b/src/rasterize.h index a06b339..4c1f30c 100644 --- a/src/rasterize.h +++ b/src/rasterize.h @@ -9,8 +9,9 @@ #pragma once #include +#include "scene.hpp" -void rasterizeInit(int width, int height); +void rasterizeInit(int width, int height, Scene *scene); void rasterizeSetBuffers( int bufIdxSize, int *bufIdx, int vertCount, float *bufPos, float *bufNor, float *bufCol); diff --git a/src/rasterizeTools.h b/src/rasterizeTools.h index 46c701e..a572a19 100644 --- a/src/rasterizeTools.h +++ b/src/rasterizeTools.h @@ -17,6 +17,31 @@ struct AABB { glm::vec3 max; }; +struct VertexIn { + glm::vec3 pos; + glm::vec3 nor; + glm::vec3 col; +}; + +struct VertexOut { + glm::vec3 pos; + glm::vec3 model_pos; // Used for culling + glm::vec3 nor; + glm::vec3 col; +}; + +struct Triangle { + VertexOut v[3]; + AABB boundingBox; + bool visible; +}; + +struct Fragment { + glm::vec3 position; + glm::vec3 normal; + glm::vec3 color; +}; + /** * Multiplies a glm::mat4 matrix and a vec4. */ @@ -30,17 +55,17 @@ glm::vec3 multiplyMV(glm::mat4 m, glm::vec4 v) { * Finds the axis aligned bounding box for a given triangle. */ __host__ __device__ static -AABB getAABBForTriangle(const glm::vec3 tri[3]) { - AABB aabb; - aabb.min = glm::vec3( - min(min(tri[0].x, tri[1].x), tri[2].x), - min(min(tri[0].y, tri[1].y), tri[2].y), - min(min(tri[0].z, tri[1].z), tri[2].z)); - aabb.max = glm::vec3( - max(max(tri[0].x, tri[1].x), tri[2].x), - max(max(tri[0].y, tri[1].y), tri[2].y), - max(max(tri[0].z, tri[1].z), tri[2].z)); - return aabb; +AABB getAABBForTriangle(const Triangle tri) { + AABB aabb; + aabb.min = glm::vec3( + glm::min(glm::min(tri.v[0].pos.x, tri.v[1].pos.x), tri.v[2].pos.x), + glm::min(glm::min(tri.v[0].pos.y, tri.v[1].pos.y), tri.v[2].pos.y), + glm::min(glm::min(tri.v[0].pos.z, tri.v[1].pos.z), tri.v[2].pos.z)); + aabb.max = glm::vec3( + glm::max(glm::max(tri.v[0].pos.x, tri.v[1].pos.x), tri.v[2].pos.x), + glm::max(glm::max(tri.v[0].pos.y, tri.v[1].pos.y), tri.v[2].pos.y), + glm::max(glm::max(tri.v[0].pos.z, tri.v[1].pos.z), tri.v[2].pos.z)); + return aabb; } // CHECKITOUT @@ -48,8 +73,8 @@ AABB getAABBForTriangle(const glm::vec3 tri[3]) { * Calculate the signed area of a given triangle. */ __host__ __device__ static -float calculateSignedArea(const glm::vec3 tri[3]) { - return 0.5 * ((tri[2].x - tri[0].x) * (tri[1].y - tri[0].y) - (tri[1].x - tri[0].x) * (tri[2].y - tri[0].y)); +float calculateSignedArea(const Triangle tri) { + return 0.5 * ((tri.v[2].pos.x - tri.v[0].pos.x) * (tri.v[1].pos.y - tri.v[0].pos.y) - (tri.v[1].pos.x - tri.v[0].pos.x) * (tri.v[2].pos.y - tri.v[0].pos.y)); } // CHECKITOUT @@ -57,24 +82,25 @@ float calculateSignedArea(const glm::vec3 tri[3]) { * Helper function for calculating barycentric coordinates. */ __host__ __device__ static -float calculateBarycentricCoordinateValue(glm::vec2 a, glm::vec2 b, glm::vec2 c, const glm::vec3 tri[3]) { - glm::vec3 baryTri[3]; - baryTri[0] = glm::vec3(a, 0); - baryTri[1] = glm::vec3(b, 0); - baryTri[2] = glm::vec3(c, 0); - return calculateSignedArea(baryTri) / calculateSignedArea(tri); +float calculateBarycentricCoordinateValue(glm::vec2 a, glm::vec2 b, glm::vec2 c, const Triangle tri) { + Triangle baryTri; + baryTri.v[0].pos = glm::vec3(a, 0); + baryTri.v[1].pos = glm::vec3(b, 0); + baryTri.v[2].pos = glm::vec3(c, 0); + return calculateSignedArea(baryTri) / calculateSignedArea(tri); } // CHECKITOUT /** * Calculate barycentric coordinates. + * TODO: Update to handle triangles coming in and not the array */ __host__ __device__ static -glm::vec3 calculateBarycentricCoordinate(const glm::vec3 tri[3], glm::vec2 point) { - float beta = calculateBarycentricCoordinateValue(glm::vec2(tri[0].x, tri[0].y), point, glm::vec2(tri[2].x, tri[2].y), tri); - float gamma = calculateBarycentricCoordinateValue(glm::vec2(tri[0].x, tri[0].y), glm::vec2(tri[1].x, tri[1].y), point, tri); - float alpha = 1.0 - beta - gamma; - return glm::vec3(alpha, beta, gamma); +glm::vec3 calculateBarycentricCoordinate(const Triangle tri, glm::vec2 point) { + float beta = calculateBarycentricCoordinateValue(glm::vec2(tri.v[0].pos.x, tri.v[0].pos.y), point, glm::vec2(tri.v[2].pos.x, tri.v[2].pos.y), tri); + float gamma = calculateBarycentricCoordinateValue(glm::vec2(tri.v[0].pos.x, tri.v[0].pos.y), glm::vec2(tri.v[1].pos.x, tri.v[1].pos.y), point, tri); + float alpha = 1.0 - beta - gamma; + return glm::vec3(alpha, beta, gamma); } // CHECKITOUT @@ -94,8 +120,8 @@ bool isBarycentricCoordInBounds(const glm::vec3 barycentricCoord) { * (i.e. depth) on the triangle. */ __host__ __device__ static -float getZAtCoordinate(const glm::vec3 barycentricCoord, const glm::vec3 tri[3]) { - return -(barycentricCoord.x * tri[0].z - + barycentricCoord.y * tri[1].z - + barycentricCoord.z * tri[2].z); +float getZAtCoordinate(const glm::vec3 barycentricCoord, const Triangle tri) { + return -(barycentricCoord.x * tri.v[0].pos.z + + barycentricCoord.y * tri.v[1].pos.z + + barycentricCoord.z * tri.v[2].pos.z); } diff --git a/src/scene.cpp b/src/scene.cpp new file mode 100644 index 0000000..6be6c5e --- /dev/null +++ b/src/scene.cpp @@ -0,0 +1,42 @@ +#include "scene.hpp" +#include + +Scene::Scene() { + // Create with some default values + this->nearPlane = 0.1f; + this->farPlane = 100.0f; + camera.fieldOfView = 45.0f; + camera.position = glm::vec3(0.0f, 0.0f, 3.0f); + camera.lookAt = glm::vec3(0.0f); + camera.right = glm::vec3(1.0f, 0.0f, 0.0f); + + light.position = 1000.0f * camera.position; + light.color = glm::vec3(1.0f); + + culling = false; + scissor = false; + pointRasterization = false; + lineRasterization = false; + + mouseState.initialPositionsSet = false; + + // Then have to calculate the model view matrix + updateModelView(); +} + +Scene::~Scene() { + +} + +// Used for initial calculation and any updates we might want to make if we add mouse/keyboard control +void Scene::updateModelView() { + glm::vec3 cameraDirection; + + cameraDirection = glm::normalize(camera.lookAt - camera.position); + camera.up = glm::cross(camera.right, cameraDirection); + view = glm::lookAt(camera.position, camera.lookAt, camera.up); + projection = glm::perspective(camera.fieldOfView, 1.0f, -nearPlane, -farPlane); + modelView = projection * view * model; + + light.position = 1000.0f * camera.position; +} \ No newline at end of file diff --git a/src/scene.hpp b/src/scene.hpp new file mode 100644 index 0000000..cdd6677 --- /dev/null +++ b/src/scene.hpp @@ -0,0 +1,56 @@ +#pragma once + +#include "glm/glm.hpp" + +using namespace std; + +struct MouseState { + bool initialPositionsSet; + int x, y; + bool leftPressed; + bool middlePressed; +}; + +struct Camera { + float fieldOfView; + glm::vec3 position; + glm::vec3 lookAt; + glm::vec3 right; + glm::vec3 up; +}; + +struct Light { + glm::vec3 position; + glm::vec3 color; +}; + +class Scene { +private: + glm::mat4 view; + glm::mat4 projection; + const glm::mat4 model = glm::mat4(1.0f); + +public: + Camera camera; + + MouseState mouseState; + + float nearPlane; + float farPlane; + glm::mat4 modelView; + + Light light; + + bool culling; + bool scissor; + bool pointRasterization; + bool lineRasterization; + + glm::vec2 scissorMin; + glm::vec2 scissorMax; + + Scene(); + ~Scene(); + + void updateModelView(); +}; \ No newline at end of file diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt new file mode 100644 index 0000000..f7ea27d --- /dev/null +++ b/stream_compaction/CMakeLists.txt @@ -0,0 +1,9 @@ +set(SOURCE_FILES + "efficient.h" + "efficient.cu" + ) + +cuda_add_library(stream_compaction + ${SOURCE_FILES} + OPTIONS -arch=sm_20 + ) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu new file mode 100644 index 0000000..9cee2bc --- /dev/null +++ b/stream_compaction/efficient.cu @@ -0,0 +1,175 @@ +#include +#include +#include "efficient.h" +#include "./util/checkCUDAError.h" + +#define blockSize 128 + +namespace StreamCompaction { +namespace Efficient { + +/** +* Maps an array to an array of 0s and 1s for stream compaction. Elements +* which map to 0 will be removed, and elements which map to 1 will be kept. +*/ +__global__ void KernMapToBoolean(int n, int *bools, const Triangle *idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + bools[index] = !!idata[index].visible; +} + +/** +* Performs scatter on an array. That is, for each element in idata, +* if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. +*/ +__global__ void KernScatter(int n, Triangle *odata, const Triangle *idata, const int *bools, const int *indices) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (bools[index] == 1) { + odata[indices[index]] = idata[index]; + } +} + +/** +* Accumulates the new count of threads for a block with the original. +*/ +__global__ void KernGetBlockCount(int n, int *odata, const int *idata1, const int *idata2) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + odata[index] = idata1[(index + 1) * blockSize - 1] + idata2[(index + 1) * blockSize - 1]; +} + +/** +* Increments the block count. +*/ +__global__ void KernIncrementBlock(int n, int *data, const int *increments) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + data[index] = data[index] + increments[blockIdx.x]; +} + +/* +* Performs work efficient scan on data in a single GPU block using shared memory. +* Based on the GPU Gems 3 Code found here: +* http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html +*/ +__global__ void KernScan(int n, int *odata, const int *idata) { + int index = threadIdx.x; + int offset = 1; + extern __shared__ int temp[]; + + // Copy input data to shared memory + temp[2 * index] = idata[2 * index + (blockIdx.x * blockDim.x * 2)]; + temp[2 * index + 1] = idata[2 * index + 1 + (blockIdx.x * blockDim.x * 2)]; + + // Up sweep + for (int d = n >> 1; d > 0; d >>= 1) { + __syncthreads(); + + if (index < d) { + int ai = offset * (2 * index + 1) - 1; + int bi = offset * (2 * index + 2) - 1; + temp[bi] += temp[ai]; + } + offset *= 2; + } + + // Clear the root + if (index == 0) { + temp[n - 1] = 0; + } + + // Down sweep + for (int d = 1; d < n; d *= 2) { + offset >>= 1; + __syncthreads(); + + if (index < d) { + int ai = offset * (2 * index + 1) - 1; + int bi = offset * (2 * index + 2) - 1; + int t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + } + __syncthreads(); + + // Write to output array + odata[2 * index + (blockIdx.x * blockDim.x * 2)] = temp[2 * index]; + odata[2 * index + 1 + (blockIdx.x * blockDim.x * 2)] = temp[2 * index + 1]; +} + +void Scan(int n, int *odata, int *idata) { + int blocksPerGrid = (n - 1) / blockSize + 1; + int *dev_idata, *dev_odata; // Padded device memory to handle non power of 2 cases + + cudaMalloc((void**)&dev_idata, blocksPerGrid * blockSize * sizeof(int)); + cudaMemset(dev_idata, 0, blocksPerGrid * blockSize * sizeof(int)); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyDeviceToDevice); + + cudaMalloc((void**)&dev_odata, blocksPerGrid * blockSize * sizeof(int)); + + if (blocksPerGrid == 1) { + KernScan<<<1, blockSize / 2, blockSize * sizeof(int)>>>(blockSize, dev_odata, dev_idata); + checkCUDAError("KernScan"); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToDevice); + } + else { + int *dev_increments, *dev_scannedIncrements; + + cudaMalloc((void**)&dev_increments, blocksPerGrid * sizeof(int)); + cudaMalloc((void**)&dev_scannedIncrements, blocksPerGrid * sizeof(int)); + + KernScan<<>>(blockSize, dev_odata, dev_idata); + checkCUDAError("KernScan"); + + int tempBlocksPerGrid = (blocksPerGrid - 1) / blockSize + 1; + KernGetBlockCount<<>>(blocksPerGrid, dev_increments, dev_odata, dev_idata); + checkCUDAError("KernGetBlockCount"); + + // Recursive scan call until we can fit on a single block + Scan(blocksPerGrid, dev_scannedIncrements, dev_increments); + + KernIncrementBlock<<>>(blocksPerGrid * blockSize, dev_odata, dev_scannedIncrements); + checkCUDAError("KernIncrementBlock"); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToDevice); + + cudaFree(dev_increments); + cudaFree(dev_scannedIncrements); + } + + cudaFree(dev_idata); + cudaFree(dev_odata); +} + +int Compact(int n, Triangle *odata, Triangle *idata) { + int blocksPerGrid = (n - 1) / blockSize + 1; + int rayCount = 0; + int *dev_bools, *dev_scanData; + + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + cudaMalloc((void**)&dev_scanData, blocksPerGrid * blockSize * sizeof(int)); + + // Map input to boolean values + KernMapToBoolean<<>>(n, dev_bools, idata); + checkCUDAError("KernMapToBoolean"); + + // Scan + Scan(n, dev_scanData, dev_bools); + + // Scatter + KernScatter<<>>(n, odata, idata, dev_bools, dev_scanData); + checkCUDAError("KernScatter"); + + // Get number of rays remaining + int lastScanDataElem, lastBoolElem; + cudaMemcpy(&lastScanDataElem, dev_scanData + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastBoolElem, dev_bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + rayCount = lastScanDataElem + lastBoolElem; + + cudaFree(dev_bools); + cudaFree(dev_scanData); + + return rayCount; +} +} +} diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h new file mode 100644 index 0000000..f8864a8 --- /dev/null +++ b/stream_compaction/efficient.h @@ -0,0 +1,8 @@ +#pragma once +#include "../src/rasterizeTools.h" + +namespace StreamCompaction { +namespace Efficient { + int Compact(int n, Triangle *odata, Triangle *idata); +} +} diff --git a/util/obj.cpp b/util/obj.cpp index 6ae70f1..2df37bd 100644 --- a/util/obj.cpp +++ b/util/obj.cpp @@ -123,7 +123,7 @@ void obj::buildBufPoss() { for (int i = 0; i < (int) BufIdxvec.size(); i++) { ibo[i] = BufIdxvec[i]; } - setColor(glm::vec3(1, 1, 1)); + setColor(glm::vec3(1.0f)); printf("Mesh built: buffers contain %d faces & %d vertices\n", getBufIdxsize() / 3,