diff --git a/.gitignore b/.gitignore
index 89942d9..96c2a42 100644
--- a/.gitignore
+++ b/.gitignore
@@ -558,3 +558,7 @@ xcuserdata
 *.xccheckout
 *.moved-aside
 *.xcuserstate
+
+
+#my
+test_img
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 414ab4d..db40597 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,7 +68,7 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
 endif()
 
 include_directories(.)
-#add_subdirectory(stream_compaction)  # TODO: uncomment if using your stream compaction
+add_subdirectory(stream_compaction)  # TODO: uncomment if using your stream compaction
 add_subdirectory(src)
 
 cuda_add_executable(${CMAKE_PROJECT_NAME}
@@ -78,7 +78,7 @@ cuda_add_executable(${CMAKE_PROJECT_NAME}
 
 target_link_libraries(${CMAKE_PROJECT_NAME}
     src
-    #stream_compaction  # TODO: uncomment if using your stream compaction
+    stream_compaction  # TODO: uncomment if using your stream compaction
     ${CORELIBS}
     )
 
diff --git a/README.md b/README.md
index c2504ab..0215e98 100644
--- a/README.md
+++ b/README.md
@@ -3,311 +3,96 @@ CUDA Path Tracer
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 3**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-
-### (TODO: Your README)
-
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
-
-Instructions (delete me)
-========================
-
-This is **NOW** due ~~Thursday, September 24~~ **Tuesday, September 29** evening at midnight.
-
-**Summary:**
-In this project, you'll implement a CUDA-based path tracer capable of rendering
-globally-illuminated images very quickly.
-Since in this class we are concerned with working in GPU programming,
-performance, and the generation of actual beautiful images (and not with
-mundane programming tasks like I/O), this project includes base code for
-loading a scene description file, described below, and various other things
-that generally make up a framework for previewing and saving images.
-
-The core renderer is left for you to implement. Finally, note that, while this
-base code is meant to serve as a strong starting point for a CUDA path tracer,
-you are not required to use it if you don't want to. You may also change any
-part of the base code as you please. **This is YOUR project.**
-
-**Recommendation:** Every image you save should automatically get a different
-filename. Don't delete all of them! For the benefit of your README, keep a
-bunch of them around so you can pick a few to document your progress at the
-end.
-
-### Contents
-
-* `src/` C++/CUDA source files.
-* `scenes/` Example scene description files.
-* `img/` Renders of example scene description files.
-  (These probably won't match precisely with yours.)
-* `external/` Includes and static libraries for 3rd party libraries.
-
-
-### Running the code
-
-The main function requires a scene description file. Call the program with
-one as an argument: `cis565_path_tracer scenes/sphere.txt`.
-(In Visual Studio, `../scenes/sphere.txt`.)
-
-If you are using Visual Studio, you can set this in the Debugging > Command
-Arguments section in the Project properties. Make sure you get the path right -
-read the console for errors.
-
-#### Controls
-
-* Esc to save an image and exit.
-* Space to save an image. Watch the console for the output filename.
-* W/A/S/D and R/F move the camera. Arrow keys rotate.
-
-## Requirements
-
-**Ask on the mailing list for clarifications.**
-
-In this project, you are given code for:
-
-* Loading and reading the scene description format
-* Sphere and box intersection functions
-* Support for saving images
-* Working CUDA-GL interop for previewing your render while it's running
-* A function which generates random screen noise (instead of an actual render).
-
-You will need to implement the following features:
-
-* Raycasting from the camera into the scene through an imaginary grid of pixels
-  (the screen).
-  * Implement simple antialiasing (by jittering rays within each pixel).
-* Diffuse surfaces (using provided cosine-weighted scatter function) [PBRT 8.3].
-* Perfectly specular-reflective (mirrored) surfaces.
-  * See notes on diffuse/specular in `scatterRay` and on imperfect specular below.
-* Stream compaction optimization, using:
-* **NEWLY ADDED:** Work-efficient stream compaction using shared memory across
-  multiple blocks. (See
-  [*GPU Gems 3*, Chapter 39](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html).)
-
-You are also required to implement at least 2 of the following features.
-If you find other good references for these features, share them!
-**Extra credit**: implement more features on top of the 2 required ones,
-with point value up to +20/100 at the grader's discretion
-(based on difficulty and coolness).
-
-* **NOW REQUIRED - NOT AN EXTRA:** ~~Work-efficient stream compaction (see above).~~
-* These 2 smaller features:
-  * Refraction (e.g. glass/water) [PBRT 8.2] with Frensel effects using
-    [Schlick's approximation](https://en.wikipedia.org/wiki/Schlick's_approximation)
-    or more accurate methods [PBRT 8.5].
-  * Physically-based depth-of-field (by jittering rays within an aperture)
-    [PBRT 6.2.3].
-  * Recommended but not required: non-perfect specular surfaces. (See below.)
-* Texture mapping [PBRT 10.4].
-* Bump mapping [PBRT 9.3].
-* Direct lighting (by taking a final ray directly to a random point on an
-  emissive object acting as a light source). Or more advanced [PBRT 15.1.1].
-* Some method of defining object motion, and motion blur by averaging samples
-  at different times in the animation.
-* Subsurface scattering [PBRT 5.6.2, 11.6].
-* Arbitrary mesh loading and rendering (e.g. `obj` files). You can find these
-  online or export them from your favorite 3D modeling application.
-  With approval, you may use a third-party OBJ loading code to bring the data
-  into C++.
-  * You can use the triangle intersection function `glm::intersectRayTriangle`.
-
-This 'extra features' list is not comprehensive. If you have a particular idea
-you would like to implement (e.g. acceleration structures, etc.), please
-contact us first.
-
-For each extra feature, you must provide the following analysis:
-
-* Overview write-up of the feature
-* Performance impact of the feature
-* If you did something to accelerate the feature, what did you do and why?
-* Compare your GPU version of the feature to a HYPOTHETICAL CPU version
-  (you don't have to implement it!) Does it benefit or suffer from being
-  implemented on the GPU?
-* How might this feature be optimized beyond your current implementation?
-
-## Base Code Tour
-
-You'll be working in the following files. Look for important parts of the code:
-search for `CHECKITOUT`. You'll have to implement parts labeled with `TODO`.
-(But don't let these constrain you - you have free rein!)
-
-* `src/pathtrace.cu`: path tracing kernels, device functions, and calling code
-  * `pathtraceInit` initializes the path tracer state - it should copy
-    scene data (e.g. geometry, materials) from `Scene`.
-  * `pathtraceFree` frees memory allocated by `pathtraceInit`
-  * `pathtrace` performs one iteration of the rendering - it handles kernel
-    launches, memory copies, transferring some data, etc.
-    * See comments for a low-level path tracing recap.
-* `src/intersections.h`: ray intersection functions
-  * `boxIntersectionTest` and `sphereIntersectionTest`, which take in a ray and
-    a geometry object and return various properties of the intersection.
-* `src/interactions.h`: ray scattering functions
-  * `calculateRandomDirectionInHemisphere`: a cosine-weighted random direction
-    in a hemisphere. Needed for implementing diffuse surfaces.
-  * `scatterRay`: this function should perform all ray scattering, and will
-    call `calculateRandomDirectionInHemisphere`. See comments for details.
-* `src/main.cpp`: you don't need to do anything here, but you can change the
-  program to save `.hdr` image files, if you want (for postprocessing).
-
-### Generating random numbers
-
-```
-thrust::default_random_engine rng(hash(index));
-thrust::uniform_real_distribution<float> u01(0, 1);
-float result = u01(rng);
-```
-
-There is a convenience function for generating a random engine using a
-combination of index, iteration, and depth as the seed:
-
-```
-thrust::default_random_engine rng = random_engine(iter, index, depth);
-```
-
-### Imperfect specular lighting
-
-In path tracing, like diffuse materials, specular materials are
-simulated using a probability distribution instead computing the
-strength of a ray bounce based on angles.
-
-Equations 7, 8, and 9 of
-[*GPU Gems 3*, Chapter 20](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch20.html)
-give the formulas for generating a random specular ray. (Note that
-there is a typographical error: χ in the text = ξ in the formulas.)
-
-Also see the notes in `scatterRay` for probability splits between
-diffuse/specular/other material types.
-
-See also: PBRT 8.2.2.
-
-### Handling Long-Running CUDA Threads
-
-By default, your GPU driver will probably kill a CUDA kernel if it runs for more than 5 seconds. There's a way to disable this timeout. Just beware of infinite loops - they may lock up your computer.
-
-> The easiest way to disable TDR for Cuda programming, assuming you have the NVIDIA Nsight tools installed, is to open the Nsight Monitor, click on "Nsight Monitor options", and under "General" set "WDDM TDR enabled" to false. This will change the registry setting for you. Close and reboot. Any change to the TDR registry setting won't take effect until you reboot. [Stack Overflow](http://stackoverflow.com/questions/497685/cuda-apps-time-out-fail-after-several-seconds-how-to-work-around-this)
-
-### Notes on GLM
-
-This project uses GLM for linear algebra.
-
-On NVIDIA cards pre-Fermi (pre-DX12), you may have issues with mat4-vec4
-multiplication. If you have one of these cards, be careful! If you have issues,
-you might need to grab `cudamat4` and `multiplyMV` from the
-[Fall 2014 project](https://github.com/CIS565-Fall-2014/Project3-Pathtracer).
-Let us know if you need to do this.
-
-### Scene File Format
-
-This project uses a custom scene description format. Scene files are flat text
-files that describe all geometry, materials, lights, cameras, and render
-settings inside of the scene. Items in the format are delimited by new lines,
-and comments can be added using C-style `// comments`.
-
-Materials are defined in the following fashion:
-
-* MATERIAL (material ID) //material header
-* RGB (float r) (float g) (float b) //diffuse color
-* SPECX (float specx) //specular exponent
-* SPECRGB (float r) (float g) (float b) //specular color
-* REFL (bool refl) //reflectivity flag, 0 for no, 1 for yes
-* REFR (bool refr) //refractivity flag, 0 for no, 1 for yes
-* REFRIOR (float ior) //index of refraction for Fresnel effects
-* EMITTANCE (float emittance) //the emittance strength of the material. Material is a light source iff emittance > 0.
-
-Cameras are defined in the following fashion:
-
-* CAMERA //camera header
-* RES (float x) (float y) //resolution
-* FOVY (float fovy) //vertical field of view half-angle. the horizonal angle is calculated from this and the reslution
-* ITERATIONS (float interations) //how many iterations to refine the image,
-  only relevant for supersampled antialiasing, depth of field, area lights, and
-  other distributed raytracing applications
-* DEPTH (int depth) //maximum depth (number of times the path will bounce)
-* FILE (string filename) //file to output render to upon completion
-* EYE (float x) (float y) (float z) //camera's position in worldspace
-* VIEW (float x) (float y) (float z) //camera's view direction
-* UP (float x) (float y) (float z) //camera's up vector
-
-Objects are defined in the following fashion:
-
-* OBJECT (object ID) //object header
-* (cube OR sphere OR mesh) //type of object, can be either "cube", "sphere", or
-  "mesh". Note that cubes and spheres are unit sized and centered at the
-  origin.
-* material (material ID) //material to assign this object
-* TRANS (float transx) (float transy) (float transz) //translation
-* ROTAT (float rotationx) (float rotationy) (float rotationz) //rotation
-* SCALE (float scalex) (float scaley) (float scalez) //scale
-
-Two examples are provided in the `scenes/` directory: a single emissive sphere,
-and a simple cornell box made using cubes for walls and lights and a sphere in
-the middle.
-
-## Third-Party Code Policy
-
-* Use of any third-party code must be approved by asking on our Google Group.
-* If it is approved, all students are welcome to use it. Generally, we approve
-  use of third-party code that is not a core part of the project. For example,
-  for the path tracer, we would approve using a third-party library for loading
-  models, but would not approve copying and pasting a CUDA function for doing
-  refraction.
-* Third-party code **MUST** be credited in README.md.
-* Using third-party code without its approval, including using another
-  student's code, is an academic integrity violation, and will, at minimum,
-  result in you receiving an F for the semester.
-
-## README
-
-Please see: [**TIPS FOR WRITING AN AWESOME README**](https://github.com/pjcozzi/Articles/blob/master/CIS565/GitHubRepo/README.md)
-
-* Sell your project.
-* Assume the reader has a little knowledge of path tracing - don't go into
-  detail explaining what it is. Focus on your project.
-* Don't talk about it like it's an assignment - don't say what is and isn't
-  "extra" or "extra credit." Talk about what you accomplished.
-* Use this to document what you've done.
-* *DO NOT* leave the README to the last minute! It is a crucial part of the
-  project, and we will not be able to grade you without a good README.
-
-In addition:
-
-* This is a renderer, so include images that you've made!
-* Be sure to back your claims for optimization with numbers and comparisons.
-* If you reference any other material, please provide a link to it.
-* You wil not be graded on how fast your path tracer runs, but getting close to
-  real-time is always nice!
-* If you have a fast GPU renderer, it is very good to show case this with a
-  video to show interactivity. If you do so, please include a link!
-
-### Analysis
-
-* Stream compaction helps most after a few bounces. Print and plot the
-  effects of stream compaction within a single iteration (i.e. the number of
-  unterminated rays after each bounce) and evaluate the benefits you get from
-  stream compaction.
-* Compare scenes which are open (like the given cornell box) and closed
-  (i.e. no light can escape the scene). Again, compare the performance effects
-  of stream compaction! Remember, stream compaction only affects rays which
-  terminate, so what might you expect?
-
-
-## Submit
-
-If you have modified any of the `CMakeLists.txt` files at all (aside from the
-list of `SOURCE_FILES`), you must test that your project can build in Moore
-100B/C. Beware of any build issues discussed on the Google Group.
-
-1. Open a GitHub pull request so that we can see that you have finished.
-   The title should be "Submission: YOUR NAME".
-2. Send an email to the TA (gmail: kainino1+cis565@) with:
-   * **Subject**: in the form of `[CIS565] Project N: PENNKEY`.
-   * Direct link to your pull request on GitHub.
-   * Estimate the amount of time you spent on the project.
-   * If there were any outstanding problems, or if you did any extra
-     work, *briefly* explain.
-   * Feedback on the project itself, if any.
-
-## References
-
-* [PBRT] Physically Based Rendering, Second Edition: From Theory To Implementation. Pharr, Matt and Humphreys, Greg. 2010.
+* Shuai Shao (Shrek)
+* Tested on: Windows 10, i7-4710HQ @ 2.50GHz 16GB, GeForce GTX 970M (Personal Computer)
+
+-----------------------------------------
+
+
+## Image Example
+* Simple scene shows diffuse, reflection, refraction, caustic, global illumination, antialiasing, and soft shadow
+![refraction](img/refraction_note.png)
+
+* Lens effect
+![lens](img/lens.png)
+
+* Obj File
+![obj](img/teapot.png)
+
+-----------------------------------
+
+## Features
+
+### General Intro
+
+GPU accelerated Monte Carlo path tracer launches parallel threads to calculate ray intersections and shading. Instead of using recursive ray tracing functions, the GPU approach maintains a ray array, and do stream compaction to exclude terminated rays for each bounce in an iterative way. Paths ray bounces through the scene and terminates by either hitting the light source or hitting nothing. The final image averages the pixel value results of tons of iterations to get a well rendered image. Since the algorithm is embarrassingly  parallel, the GPU approach can be must faster than the CPU counterpart. 
+
+### Anti aliasing
+
+It is easy to implement Anti-aliasing in GPU Path Tracer. Simply jittering the ray within a pixel can lead to an averaged pixel value. 
+
+### Lens effect
+
+Reference: PBRT 6.2.3
+
+Traditional path/ray tracer basically is modeled as a pinhole camera. For real camera, because of the focal length of the lens, only the objects in a range of distance to the image plane can be projected on the reasonable same pixel and form a clear image. Objects out of that the range will project on an area instead of a point on the image plane which means a blurred image.  Larger aperture leads to a shorter range of well focused length, i.e. a bluer image.
+
+To model this feature, we need two more varaibles for the camera. Lens radius and focalLength. Objects on the focalPlane will be projected on the same point on the image plane. Given a ray whose origin is at the middle of the lens(the origin of the camera), by finding the focus point on the focalPlane, we can compute the ray that goes through the focus point and a random point on the lens. So that we get a new ray whose origin is randomly distributed on the lens but goes through the focus point on the focal plane as its original pin hole ray does. The direction of the new ray is slightly different from the original one, thus objects that are far from the focal plane will be blurred. 
+
+### Work-efficient stream compaction with shared memory and multi blocks
+
+Reference: GPU Gems 3 ch 39
+
+Shared memory has a much faster access speed compared to global memory. Since work-efficient stream compaction has a tree structure, it needs lots of memory accessing, using shared memory can greatly increase the performance. Because shared memory is shared within a block and the size is limited, we need to cooperate through multi blocks. What we do is a multi level scan. We do exclusive scan on each block. then we collect the sum of each block and do a exclusive scan, add them back to their block. 
+
+The stream compaction can considerably decrease the number of active path rays, reduce the number of blocks. The influence is more obvious in a open scene
+
+
+![numPathRays](img/numPathRays.png)
+
+
+
+Sloving Bank conflict: use a macro to solve the same bank accessing (currently not using)
+
+
+### Refraction with Frensel effect
+
+Reference: PBRT 
+
+Update the Scatter Ray Function to scatter refraction ray at specific time. 
+
+### Obj file loading and rendering
+
+Read vertex, vertex normal, face, etc info from obj file.
+Code is partly adapted from my previous Ray Tracer Project in UC Berkley CS 184.
+
+### Stackless kd-tree for GPU (still buggy)
+
+reference: KD-Tree Acceleration Structures for a GPU Raytracer, Tim Foley and Jeremy Sugerman, Stanford University, Graphics Hardware (2005)
+
+The biggest difference of kd-tree on GPU from that of CPU is that we can't use recursive function or use full stack on GPU.
+
+I implement kd-backtrack algorithm. Which use backtrack to play the role of popping node from stack. Unfortuntely, this part is still buggy. Now the code can work correct in terms of low level backtrack. I use a depth hit test program to debug. It can be seen in this simple scene, the first depth hit is correct due to low level kd-tree. Yet there are rays losing intersection after the first hit test. So the result is darker. When it comes to complex tree like a obj file, current code will lose a lot of intersections. Although I spend the most time implementing and debugging this part, I fail to debug it completely before due time. Really a pity........ TODO in future.
+
+(left: naive parse, right: current kd tree)
+
+* hit depth = 0
+![master_depth0](img/master_depth0.png)   ![kdtree_depth0](img/kdtree_depth0.png)
+
+* hit depth = 1
+![master_depth1](img/master_depth1.png)   ![kdtree_depth1](img/kdtree_depth1.png)
+
+* kd tree wrong
+ ![kdtree_wrong](img/kdtree_wrong.png)
+
+----------------------
+
+## TIPS 
+
+* When doing CUDA debugging, if some threads trap to infinite loops, the debug can fail. To get rid of this, it is better to set a limitation for number of loops for every loop in the kernel. Together with printing the error info. This way can make the debugging much easier and less confusing about the CUDA error like grid launch error. 
+
+* I met racing conditions with large blockSize work-efficient stream compaction. I haven't got time to handle this. Currently just use blockSize of 32 to get rid of this. Larger blockSize can result in tiny patterns in the image.
+
+* \__constant__ memory can be used for geoms, kd nodes, and material to increase accessing speed since they are constant once constructed by CPU code. This can be done in future. 
+
+
diff --git a/img/kdtree_depth0.png b/img/kdtree_depth0.png
new file mode 100644
index 0000000..ca1d1f4
Binary files /dev/null and b/img/kdtree_depth0.png differ
diff --git a/img/kdtree_depth1.png b/img/kdtree_depth1.png
new file mode 100644
index 0000000..044f794
Binary files /dev/null and b/img/kdtree_depth1.png differ
diff --git a/img/kdtree_depth2.png b/img/kdtree_depth2.png
new file mode 100644
index 0000000..079a867
Binary files /dev/null and b/img/kdtree_depth2.png differ
diff --git a/img/kdtree_wrong.png b/img/kdtree_wrong.png
new file mode 100644
index 0000000..1e83cb1
Binary files /dev/null and b/img/kdtree_wrong.png differ
diff --git a/img/lens.png b/img/lens.png
new file mode 100644
index 0000000..3637d68
Binary files /dev/null and b/img/lens.png differ
diff --git a/img/master_depth0.png b/img/master_depth0.png
new file mode 100644
index 0000000..7131eb3
Binary files /dev/null and b/img/master_depth0.png differ
diff --git a/img/master_depth1.png b/img/master_depth1.png
new file mode 100644
index 0000000..3f08e9b
Binary files /dev/null and b/img/master_depth1.png differ
diff --git a/img/master_depth2.png b/img/master_depth2.png
new file mode 100644
index 0000000..ac1841d
Binary files /dev/null and b/img/master_depth2.png differ
diff --git a/img/numPathRays.png b/img/numPathRays.png
new file mode 100644
index 0000000..4e2df9a
Binary files /dev/null and b/img/numPathRays.png differ
diff --git a/img/refraction.png b/img/refraction.png
new file mode 100644
index 0000000..7702185
Binary files /dev/null and b/img/refraction.png differ
diff --git a/img/refraction_note.png b/img/refraction_note.png
new file mode 100644
index 0000000..c41b260
Binary files /dev/null and b/img/refraction_note.png differ
diff --git a/img/teapot.png b/img/teapot.png
new file mode 100644
index 0000000..7d9314b
Binary files /dev/null and b/img/teapot.png differ
diff --git a/scenes/cornell_open.txt b/scenes/cornell_open.txt
new file mode 100644
index 0000000..b79d6c0
--- /dev/null
+++ b/scenes/cornell_open.txt
@@ -0,0 +1,181 @@
+// Material----------------------------------------------------------------
+
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   5
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse red
+MATERIAL 2
+RGB         .85 .35 .35
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse green
+MATERIAL 3
+RGB         .35 .85 .35
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// specular white
+MATERIAL 4
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        1
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// refraction glass
+MATERIAL 5
+RGB		    0 0 0
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        0
+REFR        1
+REFRIOR     1.2
+EMITTANCE   0
+
+// Emissive material (light)
+MATERIAL 6
+RGB         1 1 0
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   5
+
+// Diffuse blue
+MATERIAL 7
+RGB         .1 .1 .98
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse
+MATERIAL 8
+RGB         .5 .1 .5
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse
+MATERIAL 9
+RGB         .2 .6 .2
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+
+// Scene -----------------------------------------------------------
+
+// Camera
+CAMERA
+RES         800 800
+FOVY        45
+ITERATIONS  5000
+DEPTH       8
+FILE        cornell
+EYE         0.0 5 10.5
+VIEW        0 0 -1
+UP          0 1 0
+// LENS_RADIUS 1.1
+// FOCAL_LENGTH 8.5
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       3 .3 3
+
+// Sphere
+OBJECT 1
+sphere
+material 4
+TRANS       -3 4 -1
+ROTAT       0 0 0
+SCALE       1 1 1
+
+// Sphere glass
+OBJECT 2
+sphere
+material 4
+TRANS       .2 4 2
+ROTAT       0 0 0
+SCALE       3 3 3
+
+// back light
+OBJECT 3
+cube
+material 6
+TRANS       0 5 -5
+ROTAT       0 0 0
+SCALE       1 3 .3
+
+//small blue cube
+OBJECT 4
+cube
+material 7
+TRANS       3 3 -2
+ROTAT       0 30 0
+SCALE       1 1 1
+
+//cube
+OBJECT 5
+cube
+material 1
+TRANS       -3.5 6 4
+ROTAT       10 60 0
+SCALE       2 2 2
+
+// Sphere
+OBJECT 6
+sphere
+material 2
+TRANS       3 8 2
+ROTAT       0 0 0
+SCALE       1 1 1
+
+// Sphere
+OBJECT 7
+sphere
+material 3
+TRANS       2 1.5 4
+ROTAT       0 0 0
+SCALE       1 1 1
diff --git a/scenes/cornell_test.txt b/scenes/cornell_test.txt
new file mode 100644
index 0000000..4a7960f
--- /dev/null
+++ b/scenes/cornell_test.txt
@@ -0,0 +1,221 @@
+// Material----------------------------------------------------------------
+
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   5
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse red
+MATERIAL 2
+RGB         .85 .35 .35
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse green
+MATERIAL 3
+RGB         .35 .85 .35
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// specular white
+MATERIAL 4
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        1
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// refraction glass
+MATERIAL 5
+RGB		    0 0 0
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        0
+REFR        1
+REFRIOR     1.2
+EMITTANCE   0
+
+// Emissive material (light)
+MATERIAL 6
+RGB         1 1 0
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   5
+
+// Diffuse blue
+MATERIAL 7
+RGB         .1 .1 .98
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse
+MATERIAL 8
+RGB         .5 .1 .5
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse
+MATERIAL 9
+RGB         .2 .6 .2
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+
+// Scene -----------------------------------------------------------
+
+// Camera
+CAMERA
+RES         800 800
+FOVY        45
+ITERATIONS  5000
+DEPTH       8
+FILE        cornell
+EYE         0.0 5 10.5
+VIEW        0 0 -1
+UP          0 1 0
+// LENS_RADIUS 1.1
+// FOCAL_LENGTH 8.5
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS       0 0 0
+ROTAT       0 0 0
+SCALE       10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .01 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 8
+TRANS       0 5 -5
+ROTAT       0 90 0
+SCALE       .01 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Sphere
+OBJECT 6
+sphere
+material 4
+TRANS       -3 4 -1
+ROTAT       0 0 0
+SCALE       1 1 1
+
+// Sphere glass
+OBJECT 7
+sphere
+material 4
+TRANS       .2 4 2
+ROTAT       0 0 0
+SCALE       3 3 3
+
+// back light
+OBJECT 8
+cube
+material 6
+TRANS       0 5 -5
+ROTAT       0 0 0
+SCALE       1 3 .3
+
+//small blue cube
+OBJECT 9
+cube
+material 7
+TRANS       3 3 -2
+ROTAT       0 30 0
+SCALE       1 1 1
+
+//cube
+OBJECT 10
+cube
+material 1
+TRANS       -3.5 6 4
+ROTAT       10 60 0
+SCALE       2 2 2
+
+// Sphere
+OBJECT 6
+sphere
+material 2
+TRANS       3 8 2
+ROTAT       0 0 0
+SCALE       1 1 1
+
+// Sphere
+OBJECT 6
+sphere
+material 3
+TRANS       2 1.5 4
+ROTAT       0 0 0
+SCALE       1 1 1
diff --git a/scenes/sphere_test.txt b/scenes/sphere_test.txt
new file mode 100644
index 0000000..61770bb
--- /dev/null
+++ b/scenes/sphere_test.txt
@@ -0,0 +1,28 @@
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   5
+
+// Camera
+CAMERA
+RES         800 800
+FOVY        45
+ITERATIONS  1
+DEPTH       8
+FILE        sphere
+EYE         0.0 5 10.5
+VIEW        0 0 -1
+UP          0 1 0
+
+// Sphere
+OBJECT 0
+sphere
+material 0
+TRANS       0 0 0
+ROTAT       0 0 0
+SCALE       3 3 3
diff --git a/scenes/zobj_test.txt b/scenes/zobj_test.txt
new file mode 100644
index 0000000..84b29a3
--- /dev/null
+++ b/scenes/zobj_test.txt
@@ -0,0 +1,165 @@
+// Material----------------------------------------------------------------
+
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   5
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse red
+MATERIAL 2
+RGB         .85 .35 .35
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse green
+MATERIAL 3
+RGB         .35 .85 .35
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// specular white
+MATERIAL 4
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        1
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// refraction glass
+MATERIAL 5
+RGB		    0 0 0
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        0
+REFR        1
+REFRIOR     1.2
+EMITTANCE   0
+
+// Emissive material (light)
+MATERIAL 6
+RGB         1 1 0
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   5
+
+// Diffuse blue
+MATERIAL 7
+RGB         .1 .1 .98
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+
+
+// Scene -----------------------------------------------------------
+
+// Camera
+CAMERA
+RES         400 400
+FOVY        45
+ITERATIONS  5000
+DEPTH       8
+FILE        cornell
+EYE         0.0 5 10.5
+VIEW        0 0 -1
+UP          0 1 0
+// LENS_RADIUS 2.1
+// FOCAL_LENGTH 8.5
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS       0 0 0
+ROTAT       0 0 0
+SCALE       10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .01 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS       0 5 -5
+ROTAT       0 90 0
+SCALE       .01 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+
+
+// obj
+OBJECT 2
+obj
+material 7
+TRANS       0 5 -1
+ROTAT       20 30 45
+SCALE       .04 .04 .04
+OBJFILE     zteapot.obj
+
+// triangle
+//OBJECT 11
+//triangle
+//material 2
+//A       -3 2 4
+//B       2 2 -1
+//C       3 8 -4
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a1cb3fb..eaf0d60 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -15,6 +15,8 @@ set(SOURCE_FILES
     "preview.cpp"
     "utilities.cpp"
     "utilities.h"
+    "kdtree.cpp"
+    "kdtree.h"
     )
 
 cuda_add_library(src
diff --git a/src/interactions.h b/src/interactions.h
index d8107fb..14163ac 100644
--- a/src/interactions.h
+++ b/src/interactions.h
@@ -2,6 +2,7 @@
 
 #include "intersections.h"
 
+#define RAY_EPSILON 0.001f
 // CHECKITOUT
 /**
  * Computes a cosine-weighted random direction in a hemisphere.
@@ -73,8 +74,120 @@ void scatterRay(
         glm::vec3 intersect,
         glm::vec3 normal,
         const Material &m,
-        thrust::default_random_engine &rng) {
+        thrust::default_random_engine &rng) 
+{
     // TODO: implement this.
     // A basic implementation of pure-diffuse shading will just call the
     // calculateRandomDirectionInHemisphere defined above.
+	
+	
+
+
+	float schlickR = 1;
+
+	
+	int num_ray_type = 0;
+	int scatter_type_array[3];
+
+	//diffuse
+	if (!m.hasReflective && !m.hasRefractive)
+	{
+		scatter_type_array[num_ray_type] = 0;
+		num_ray_type++;
+	}
+
+	if(m.hasReflective)
+	{
+		scatter_type_array[num_ray_type] = 1;
+		num_ray_type++;
+	}
+
+	if(m.hasRefractive)
+	{
+		scatter_type_array[num_ray_type] = 2;
+		num_ray_type++;
+	}
+
+	//TODO: other ray type (sub scatter...)
+
+
+	int scatter_ray_type = scatter_type_array[0];
+	//random choose;
+	if(num_ray_type > 1)
+	{
+		thrust::uniform_int_distribution<int> uirand(0, num_ray_type-1);
+		scatter_ray_type = scatter_type_array[uirand(rng)];
+	}
+	
+	
+
+
+	
+	if(scatter_ray_type == 2)
+	{
+		//refractive
+		
+
+		float cosi = glm::dot(-ray.direction, normal);
+		float sini = sqrtf(1 - cosi*cosi);
+		float ni, nt;
+
+		float normal_sign;
+		
+		if (cosi > 0)
+		{
+			//from outside
+			ni = 1;
+			nt = m.indexOfRefraction;
+			normal_sign = 1;
+		}
+		else
+		{
+			ni = m.indexOfRefraction;
+			nt = 1;
+			normal_sign = -1;
+		}
+
+		float sint = ni * sini / nt;
+		float R0 = (ni - nt) / (ni + nt);
+		R0 *= R0;
+		schlickR = R0 + (1 - R0) * powf(1 - cosi, 5);
+		if (sint > 1)
+		{
+			//total reflect
+			num_ray_type--;
+			scatter_ray_type = 1;
+
+			
+		}
+		else
+		{
+			float cost = sqrtf(1 - sint*sint);
+			color *= m.color*(float)num_ray_type * (1 - schlickR);
+			glm::vec3 horizontal = (ray.direction + normal * normal_sign * cosi) / sini;
+			ray.direction = horizontal * sint - normal * normal_sign * cost;
+			//ray.direction = ray.direction * ni / nt + normal * normal_sign * (ni / nt*cosi - cost);
+			ray.origin = intersect - normal * normal_sign * RAY_EPSILON;
+		}
+	}
+
+
+	if (scatter_ray_type == 0)
+	{
+		//diffuse
+		ray.direction = calculateRandomDirectionInHemisphere(normal, rng);
+		//color *= m.color * glm::dot( ray.direction, normal) * (float)num_ray_type;
+		color *= m.color  * (float)num_ray_type;
+		ray.origin = intersect + normal * RAY_EPSILON;
+
+	}
+	else if (scatter_ray_type == 1)
+	{
+		//reflective
+		color *= m.specular.color * (float)num_ray_type * schlickR;
+		ray.direction = ray.direction + 2 * glm::dot(-ray.direction, normal) * normal;
+		ray.origin = intersect + normal * RAY_EPSILON;
+	}
+	
+	
 }
diff --git a/src/intersections.h b/src/intersections.h
index f34b89d..99b0938 100644
--- a/src/intersections.h
+++ b/src/intersections.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <glm/glm.hpp>
+#include <glm/gtx/intersect.hpp>
 
 #include "sceneStructs.h"
 #include "utilities.h"
@@ -141,3 +142,92 @@ __host__ __device__ float sphereIntersectionTest(Geom sphere, Ray r,
 
     return glm::length(r.origin - intersectionPoint);
 }
+
+
+__host__ __device__ float triangleIntersectionTest(Geom triangle, Ray r,
+	glm::vec3 &intersectionPoint, glm::vec3 &normal, bool &outside)
+{
+	glm::vec3 baryPosition;
+	bool hit = glm::intersectRayTriangle<glm::vec3>(r.origin, r.direction
+		, triangle.translation, triangle.rotation, triangle.scale, baryPosition);
+
+	float t = baryPosition.z;
+
+	//printf("%d  %f\n",(int)hit,t);
+
+	if (t < 0)
+	{
+		return -1;
+	}
+
+	baryPosition.z = 1 - baryPosition.y - baryPosition.x;
+	intersectionPoint = triangle.translation * baryPosition.x
+		+ triangle.rotation * baryPosition.y
+		+ triangle.scale * baryPosition.z;
+
+	//normal vector?
+	//use transform matrix?
+	//interpolate the normal
+	//[col][row]
+	glm::vec3 n0(triangle.transform[0][0], triangle.transform[0][1], triangle.transform[0][2]);
+	glm::vec3 n1(triangle.transform[1][0], triangle.transform[1][1], triangle.transform[1][2]);
+	glm::vec3 n2(triangle.transform[2][0], triangle.transform[2][1], triangle.transform[2][2]);
+
+	normal = glm::normalize(n0 * baryPosition.x + n1 * baryPosition.y + n2 * baryPosition.z);
+
+
+	outside = glm::dot(-r.direction, normal) > 0;
+
+	return t;
+}
+
+
+
+__host__ __device__ bool AABBIntersect(const AABB aabb,const Ray ray, float & tmin, float & tmax)
+{
+	//be sure the aabb is not empty
+	glm::vec3 inv_dir(1.0 / ray.direction.x, 1.0 / ray.direction.y, 1.0 / ray.direction.z);
+	
+	float tx1 = (aabb.min_pos.x - ray.origin.x)*inv_dir.x;
+	float tx2 = (aabb.max_pos.x - ray.origin.x)*inv_dir.x;
+
+	//float tmin = min(tx1, tx2);
+	//float tmax = max(tx1, tx2);
+	tmin = min(tx1, tx2);
+	tmax = max(tx1, tx2);
+
+	float ty1 = (aabb.min_pos.y - ray.origin.y) * inv_dir.y;
+	float ty2 = (aabb.max_pos.y - ray.origin.y) * inv_dir.y;
+
+	tmin = max(tmin, min(ty1, ty2));
+	tmax = min(tmax, max(ty1, ty2));
+
+
+	if (tmax >= tmin)
+	{
+		float tz1 = (aabb.min_pos.z - ray.origin.z) * inv_dir.z;
+		float tz2 = (aabb.max_pos.z - ray.origin.z) * inv_dir.z;
+
+		tmin = max(tmin, min(tz1, tz2));
+		tmax = min(tmax, max(tz1, tz2));
+
+
+		//To confirm?? tmax>0
+		if(tmax >= tmin && tmax > 0)
+		{
+			//the line of ray will intersect
+			//but maybe the t < 0
+
+			
+
+			return true;
+		}
+	}
+
+
+	return false;
+}
+
+
+
+
diff --git a/src/kdtree.cpp b/src/kdtree.cpp
new file mode 100644
index 0000000..dd31b8b
--- /dev/null
+++ b/src/kdtree.cpp
@@ -0,0 +1,373 @@
+#include <iostream>
+#include <algorithm>
+#include "scene.h"
+#include "kdtree.h"
+
+//this code only runs on cpu
+AABB getAABB(const Geom & geom)
+{
+	AABB aabb;
+	switch (geom.type)
+	{
+	case CUBE:
+	{
+		//vs 2012 way of init
+		glm::vec4 tmp_arys[] = {
+			glm::vec4(0.5f, 0.5f, 0.5f, 1.0f)
+			, glm::vec4(0.5f, 0.5f, -0.5f, 1.0f)
+			, glm::vec4(0.5f, -0.5f, 0.5f, 1.0f)
+			, glm::vec4(-0.5f, 0.5f, 0.5f, 1.0f)
+			, glm::vec4(0.5f, -0.5f, -0.5f, 1.0f)
+			, glm::vec4(-0.5f, 0.5f, -0.5f, 1.0f)
+			, glm::vec4(-0.5f, -0.5f, 0.5f, 1.0f)
+			, glm::vec4(-0.5f, -0.5f, -0.5f, 1.0f)
+		};
+		std::vector<glm::vec4> points(&tmp_arys[0], &tmp_arys[0] + 8);
+
+		glm::vec4 & t = points.at(0);
+		aabb.max_pos = glm::vec3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
+		aabb.min_pos = glm::vec3(FLT_MAX, FLT_MAX, FLT_MAX);
+		for (auto p : points)
+		{
+			p = geom.transform * p;
+			p /= p.w;
+
+			aabb.min_pos.x = min(aabb.min_pos.x, p.x);
+			aabb.min_pos.y = min(aabb.min_pos.y, p.y);
+			aabb.min_pos.z = min(aabb.min_pos.z, p.z);
+
+			aabb.max_pos.x = max(aabb.max_pos.x, p.x);
+			aabb.max_pos.y = max(aabb.max_pos.y, p.y);
+			aabb.max_pos.z = max(aabb.max_pos.z, p.z);
+		}
+	}
+	break;
+	case SPHERE:
+	{
+		//simple square like cube
+		//use max radius
+		glm::vec4 tmp = geom.transform * glm::vec4(0.0f, 0.0f, 0.0f, 1.0f);
+		glm::vec3 o(tmp.x / tmp.w, tmp.y / tmp.w, tmp.z / tmp.w);
+		float r = max(geom.scale.x, geom.scale.y);
+		r = 0.5f * max(r, geom.scale.z);
+		glm::vec3 offset(r, r, r);
+		aabb.min_pos = o - offset;
+		aabb.max_pos = o + offset;
+	}
+	break;
+	case TRIANGLE:
+	{
+		//ugly implementation
+		const glm::vec3 & a = geom.translation;
+		const glm::vec3 & b = geom.rotation;
+		const glm::vec3 & c = geom.scale;
+
+		float minx = a.x;
+		float miny = a.y;
+		float minz = a.z;
+
+		float maxx = a.x;
+		float maxy = a.y;
+		float maxz = a.z;
+
+
+		minx = min(minx, b.x);
+		miny = min(miny, b.y);
+		minz = min(minz, b.z);
+		minx = min(minx, c.x);
+		miny = min(miny, c.y);
+		minz = min(minz, c.z);
+
+		maxx = max(maxx, b.x);
+		maxy = max(maxy, b.y);
+		maxz = max(maxz, b.z);
+		maxx = max(maxx, c.x);
+		maxy = max(maxy, c.y);
+		maxz = max(maxz, c.z);
+
+
+		aabb.min_pos = glm::vec3(minx, miny, minz);
+		aabb.max_pos = glm::vec3(maxx, maxy, maxz);
+	}
+	break;
+	default:
+		std::cerr << "GEOM TYPE ERROR\n";
+		break;
+	}
+	return aabb;
+}
+
+
+
+std::pair<AABB, AABB> cutAABB(const AABB & parent, const AAPlane& pl)
+{
+	AABB l = parent;
+	AABB r = parent;
+
+	//suppose pl is always inside the parent aabb
+
+	l.max_pos[pl.axis] = pl.pos;
+	r.min_pos[pl.axis] = pl.pos;
+	return std::make_pair(l, r);
+}
+
+
+
+typedef bool(*KdConstructCompareFun)(const KDNodeConstructWrapper &, const KDNodeConstructWrapper &);
+bool my_kd_construct_compare_x(const KDNodeConstructWrapper & a, const KDNodeConstructWrapper & b)
+{
+	return a.mid.x < b.mid.x;
+}
+bool my_kd_construct_compare_y(const KDNodeConstructWrapper & a, const KDNodeConstructWrapper & b)
+{
+	return a.mid.y < b.mid.y;
+}
+bool my_kd_construct_compare_z(const KDNodeConstructWrapper & a, const KDNodeConstructWrapper & b)
+{
+	return a.mid.z < b.mid.z;
+}
+
+void KDTree::init(Scene & s)
+{
+	//vector<Geom> & geoms_using = s.tmp_geoms;
+	vector<Geom> & geoms_using = s.geoms;
+	//vector<Geom> & geoms_final = s.geoms;
+
+	last_idx = 0;
+	AABB spaceAABB;
+	spaceAABB = getAABB(geoms_using.at(0));
+
+	vector<KDNodeConstructWrapper> vec_geoms(geoms_using.size());
+
+	int i = 0;
+	for (const Geom & g : geoms_using)
+	{
+
+		vec_geoms.at(i).aabb = getAABB(g);
+		vec_geoms.at(i).geom_idx = i;
+
+		AABB & aabb = vec_geoms.at(i).aabb;
+
+		vec_geoms.at(i).mid = (aabb.max_pos + aabb.min_pos) * 0.5f;
+
+		//update spaceAABB
+		spaceAABB.min_pos.x = min(spaceAABB.min_pos.x, aabb.min_pos.x);
+		spaceAABB.min_pos.y = min(spaceAABB.min_pos.y, aabb.min_pos.y);
+		spaceAABB.min_pos.z = min(spaceAABB.min_pos.z, aabb.min_pos.z);
+
+		spaceAABB.max_pos.x = max(spaceAABB.max_pos.x, aabb.max_pos.x);
+		spaceAABB.max_pos.y = max(spaceAABB.max_pos.y, aabb.max_pos.y);
+		spaceAABB.max_pos.z = max(spaceAABB.max_pos.z, aabb.max_pos.z);
+		////////////////////
+
+
+		i++;
+	}
+
+	//hst_node.resize(vec_geoms.size()*2.5);
+
+	//vector<int> vec_sequence;	//geom_idx, used to rebuild a scene->geoms vector whose sequence = tree travse
+
+	root_idx = build(vec_geoms, hst_geom_idx, spaceAABB, -1, 0);
+
+
+	//rebuild scene->geoms according to vec_sequence;
+	
+	////old method copy every geom 
+	//for (auto geom_idx : hst_geom_idx)
+	//{
+	//	geoms_final.push_back(geoms_using.at(geom_idx));
+	//}
+	//geoms_using.clear();
+
+
+	////test output for release mode
+	//std::cout << "i" << '\t' << "p_id" <<'\t' << "r_id" << '\t' << "gid" << '\t' << "num" << '\n';
+	//i=-1;
+	//for (auto a : hst_node)
+	//{
+	//	i++;
+	//	std::cout << i << '\t' << a.parent_idx << '\t' << a.right_idx << '\t' << a.geom_index << '\t' << a.num_geoms << '\n';
+	//	
+	//}
+	//std::cout << "\n" << hst_node.size();
+	//getchar();
+
+	//std::cout << "\n\n\n\n";
+
+	//for (int j = 0; j < hst_geom_idx.size(); j++)
+	//{
+	//	std::cout << j << '\t' << hst_geom_idx.at(j) << '\n';
+	//}
+	//std::cout << "\n" << hst_geom_idx.size();
+	//std::cout << "\n" << s.geoms.size();
+	//getchar();
+}
+
+
+
+void KDTree::buildLeaf(int cur_idx
+	,const vector<KDNodeConstructWrapper>& construct_objs
+	, vector<int> & sequence, const AABB& box, int parent_idx, int depth)
+{
+	
+	//auto t = construct_objs.begin();
+	hst_node.at(cur_idx).aabb = box;//t->aabb;
+
+	//cur.geom_index = t->geom_idx;
+	hst_node.at(cur_idx).geom_index = sequence.size();
+
+	hst_node.at(cur_idx).parent_idx = parent_idx;
+	//cur.left_idx = -1;
+	hst_node.at(cur_idx).num_geoms = construct_objs.size();
+
+	hst_node.at(cur_idx).right_idx = -1;
+
+	for (const KDNodeConstructWrapper & c : construct_objs)
+	{
+		sequence.push_back(c.geom_idx);
+	}
+
+	
+	//return cur_idx;
+}
+
+
+
+
+
+
+
+//return this node idx
+int KDTree::build(vector<KDNodeConstructWrapper> & construct_objs
+	, vector<int> & sequence, const AABB& box, int parent_idx, int depth)
+{
+	if (construct_objs.empty())
+	{
+		std::cerr << "ERROR: empty kdtree node!\n";
+		return -1;
+	}
+
+
+	//if (last_idx >= hst_node.size())
+	//{
+	//	hst_node.push_back(Node());
+	//}
+	hst_node.push_back(Node());
+	//Node & cur = hst_node.at(last_idx); // !!! this is not safe when hst_node assigns new value
+	int cur_idx = hst_node.size() - 1;
+	last_idx++;
+
+
+	if (construct_objs.size() <= MAX_LEAF_GEOM_NUM)
+	{
+		//leaf node
+		//no more split
+		buildLeaf(cur_idx, construct_objs, sequence, box, parent_idx, depth);
+
+		return cur_idx;
+	}
+
+
+
+	//internal node
+
+	KdConstructCompareFun f;
+	switch (depth % 3)
+	{
+	case 0:
+		f = my_kd_construct_compare_x;
+		hst_node.at(cur_idx).split.axis = AXIS_X;
+		break;
+	case 1:
+		f = my_kd_construct_compare_y;
+		hst_node.at(cur_idx).split.axis = AXIS_Y;
+		break;
+	case 2:
+		f = my_kd_construct_compare_z;
+		hst_node.at(cur_idx).split.axis = AXIS_Z;
+		break;
+	}
+
+
+	//std::nth_element(construct_objs.begin(),construct_objs.begin()+(construct_objs.size()/2),construct_objs.end(),*f);
+
+	sort(construct_objs.begin(), construct_objs.end(), *f);
+
+	vector<KDNodeConstructWrapper>::iterator t = construct_objs.begin() + (construct_objs.size() / 2);
+
+	hst_node.at(cur_idx).split.pos = t->mid[hst_node.at(cur_idx).split.axis];
+	hst_node.at(cur_idx).aabb = box;
+	hst_node.at(cur_idx).geom_index = -1;
+	hst_node.at(cur_idx).num_geoms = 0;
+	hst_node.at(cur_idx).right_idx = -1;
+	hst_node.at(cur_idx).parent_idx = parent_idx;
+	pair<AABB, AABB> aabb_pair = cutAABB(box, hst_node.at(cur_idx).split);
+
+	vector<KDNodeConstructWrapper> left_objs;
+	vector<KDNodeConstructWrapper> right_objs;
+
+	left_objs.assign(construct_objs.begin(), t);
+	right_objs.assign(t, construct_objs.end());
+	//for (auto it = construct_objs.begin(); it != construct_objs.end(); ++it)
+	//{
+	//	if (it < t)
+	//	{
+	//		left_objs.push_back(*it);
+	//	}
+	//	else
+	//	{
+	//		right_objs.push_back(*it);
+	//	}
+	//}
+
+
+	int tmp_left_size = left_objs.size();
+
+
+
+	//overlap object should be added to both branch
+
+	int num_overlap = 0;
+
+	//add right to left
+	for (auto o : right_objs)
+	{
+		if (o.aabb.min_pos[hst_node.at(cur_idx).split.axis] < hst_node.at(cur_idx).split.pos)
+		{
+			left_objs.push_back(o);
+			num_overlap++;
+		}
+	}
+
+	//add left to right
+	for (int i = 0; i < tmp_left_size; i++)	//naive parse method....
+	{
+		KDNodeConstructWrapper & o = left_objs.at(i);
+		if (o.aabb.max_pos[hst_node.at(cur_idx).split.axis] > hst_node.at(cur_idx).split.pos)
+		{
+			right_objs.push_back(o);
+			num_overlap++;
+		}
+	}
+
+
+	if ( ((double)num_overlap) / ((double)construct_objs.size()) >= MAX_OVERLAP_RATIO)
+	{
+		//don't split
+		//build leaf
+		buildLeaf(cur_idx, construct_objs, sequence, box, parent_idx, depth);
+
+		return cur_idx;
+	}
+
+
+	//cur.left_idx = build(left_objs, aabb_pair.first, cur_idx, depth + 1);
+	build(left_objs, sequence, aabb_pair.first, cur_idx, depth + 1);
+	
+	int tmp_right_idx = build(right_objs, sequence, aabb_pair.second, cur_idx, depth + 1);
+	
+	hst_node.at(cur_idx).right_idx = tmp_right_idx;
+
+	return cur_idx;
+}
\ No newline at end of file
diff --git a/src/kdtree.h b/src/kdtree.h
new file mode 100644
index 0000000..657f10e
--- /dev/null
+++ b/src/kdtree.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include "sceneStructs.h"
+
+
+AABB getAABB(const Geom & geom);
+std::pair<AABB, AABB> cutAABB(const AABB & parent, const AAPlane& pl);
+
+struct Node
+{
+	//first geom index
+	int geom_index;	// < 0 means not leaf node
+	
+	AABB aabb;
+
+	AAPlane split;
+
+	
+	//left_idx = cur_idx + 1;
+
+	int num_geoms;
+
+
+	int right_idx;
+
+
+	int parent_idx;
+};
+
+class Scene;
+
+struct KDNodeConstructWrapper
+{
+	AABB aabb;
+	int geom_idx;
+
+	glm::vec3 mid;
+};
+
+class KDTree
+{
+public:
+	const int MAX_LEAF_GEOM_NUM = 8;
+	const float MAX_OVERLAP_RATIO = 0.5f;
+
+
+	//attention: this will copy to gpu shared memory
+	int root_idx;
+
+	//construction runs on cpu
+	void init(Scene&);
+
+	vector<Node> hst_node;
+
+	vector<int> hst_geom_idx;
+	//
+	int last_idx;
+private:
+	int build(vector<KDNodeConstructWrapper> & geoms
+		,vector<int> & sequence, const AABB& box, int parent_idx, int depth);
+
+	void buildLeaf(int cur_idx,const vector<KDNodeConstructWrapper>& geoms
+		, vector<int> & sequence, const AABB& box, int parent_idx, int depth);
+};
diff --git a/src/main.cpp b/src/main.cpp
index 77671f4..88f24ca 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,12 +1,17 @@
 #include "main.h"
 #include "preview.h"
+#include <src/scene.h>
 #include <cstring>
 
+#include <stream_compaction/stream_compaction.h>
+
+//#define STREAM_COMPACTION_TEST
+
 static std::string startTimeString;
 static bool camchanged = false;
 static float theta = 0, phi = 0;
 static glm::vec3 cammove;
-
+//Scene scene;
 Scene *scene;
 RenderState *renderState;
 int iteration;
@@ -14,11 +19,116 @@ int iteration;
 int width;
 int height;
 
+
+//MY
+void sceneInitKDTree()
+{
+	std::cout << "init kd tree..." << std::endl;
+	scene->kdtree.init(*scene);
+
+	std::cout << "done!" << std::endl;
+	
+	//std::cout << sizeof(Node) << std::endl;
+	//std::cout << (scene->kdtree.last_idx) << std::endl;
+	//std::cout << scene->kdtree.hst_node.size() << std::endl;
+}
+
+
 //-------------------------------
 //-------------MAIN--------------
 //-------------------------------
 
 int main(int argc, char** argv) {
+
+
+	//std::cout<<sizeof(char)<<std::endl;
+	//std::cout<<sizeof(int)<<std::endl;
+	//std::cout<<sizeof(glm::vec3)<<std::endl;
+	//std::cout<<sizeof(Ray)<<std::endl;
+	//std::cout<<sizeof(Path)<<std::endl;
+	//std::cout<<sizeof(Geom)<<std::endl;
+	//std::cout<<sizeof(Material)<<std::endl;
+	
+	Scene myScene;
+	scene = &myScene;
+
+
+#ifdef STREAM_COMPACTION_TEST
+
+	//test stream compaction
+	int num = 15000;
+	//int num = 30;
+	std::vector<Path> hos_paths(num);
+	std::vector<Path> hos_paths_cmp(num);
+
+
+	//init
+	int ii = 0;
+	for (Path& p : hos_paths)
+	{
+		p.terminated = (ii%11!=0);
+		p.image_index = ii;
+		ii++;
+	}
+	hos_paths.at(num - 1).terminated = false;;
+	//hos_paths.at(0).terminated = false;
+	//hos_paths.at(1).terminated = false;
+	//hos_paths.at(5).terminated = false;
+	//hos_paths.at(10).terminated = false;
+	//hos_paths.at(13).terminated = false;
+	//hos_paths.at(16).terminated = false;
+	//hos_paths.at(21).terminated = false;
+
+	//hos_paths.at(38).terminated = false;
+	//hos_paths.at(99).terminated = false;
+	//hos_paths.at(177).terminated = false;
+	//hos_paths.at(199).terminated = false;
+
+	////////
+
+	int cpu_compact_num = StreamCompaction::Efficient::compactWithoutScan(num, hos_paths_cmp.data(), hos_paths.data());
+	std::cout << "input:" << std::endl;
+	StreamCompaction::Efficient::printArray(num, hos_paths.data());
+	std::cout << "cpu:" << std::endl;
+	StreamCompaction::Efficient::printArray(cpu_compact_num, hos_paths_cmp.data());
+
+
+	Path * dev_paths;
+	cudaMalloc(&dev_paths, num*sizeof(Path));
+	cudaMemcpy(dev_paths, hos_paths.data(), num*sizeof(Path), cudaMemcpyHostToDevice);
+
+	num = StreamCompaction::Efficient::compact(num, dev_paths);
+
+
+	cudaMemcpy(hos_paths.data(), dev_paths, num*sizeof(Path), cudaMemcpyDeviceToHost);
+
+
+	cudaFree(dev_paths);
+
+	//StreamCompaction::Efficient::printArray(cpu_compact_num, hos_paths_cmp.data());
+	
+	std::cout << "gpu:" << std::endl;
+	StreamCompaction::Efficient::printArray(num, hos_paths.data());
+
+
+	
+	if (cpu_compact_num != num)
+	{
+		printf("fail,num not equal... cpu:%d       gpu:%d\n",cpu_compact_num,num);
+	}
+	else if (StreamCompaction::Efficient::cmpArrays(num, hos_paths_cmp.data(), hos_paths.data()))
+	{
+		printf("fail\n");
+	}
+
+	return 0;
+	////////////////////////////////////
+
+#endif
+
+
+
+
     startTimeString = currentTimeString();
 
     if (argc < 2) {
@@ -29,11 +139,16 @@ int main(int argc, char** argv) {
     const char *sceneFile = argv[1];
 
     // Load scene file
-    scene = new Scene(sceneFile);
+    //scene = new Scene(sceneFile);
+	scene->loadScene(sceneFile);
+#ifdef USE_KDTREE_FLAG
+	sceneInitKDTree();
+#endif
 
     // Set up camera stuff from loaded path tracer settings
     iteration = 0;
     renderState = &scene->state;
+	//renderState = &scene.state;
     width = renderState->camera.resolution.x;
     height = renderState->camera.resolution.y;
 
@@ -43,6 +158,8 @@ int main(int argc, char** argv) {
     // GLFW main loop
     mainLoop();
 
+	//delete scene;
+
     return 0;
 }
 
@@ -69,6 +186,9 @@ void saveImage() {
     //img.saveHDR(filename);  // Save a Radiance HDR file
 }
 
+
+
+
 void runCuda() {
     if (camchanged) {
         iteration = 0;
@@ -89,19 +209,22 @@ void runCuda() {
     // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer
 
     if (iteration == 0) {
-        pathtraceFree();
+		pathtraceFree();
+		
+		
         pathtraceInit(scene);
     }
+	
 
     if (iteration < renderState->iterations) {
         uchar4 *pbo_dptr = NULL;
         iteration++;
         cudaGLMapBufferObject((void**)&pbo_dptr, pbo);
-
+		
         // execute the kernel
         int frame = 0;
         pathtrace(pbo_dptr, frame, iteration);
-
+		
         // unmap buffer object
         cudaGLUnmapBufferObject(pbo);
     } else {
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index e7ef1c6..2623e5f 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -5,6 +5,9 @@
 #include <thrust/random.h>
 #include <thrust/remove.h>
 
+#include <thrust/device_vector.h>
+
+
 #include "sceneStructs.h"
 #include "scene.h"
 #include "glm/glm.hpp"
@@ -13,6 +16,8 @@
 #include "pathtrace.h"
 #include "intersections.h"
 #include "interactions.h"
+#include <stream_compaction/stream_compaction.h>
+
 
 #define ERRORCHECK 1
 
@@ -38,10 +43,9 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) {
 #endif
 }
 
-__host__ __device__
-thrust::default_random_engine makeSeededRandomEngine(int iter, int index, int depth) {
+__host__ __device__ thrust::default_random_engine makeSeededRandomEngine(int iter, int index = 0, int depth = 0) {
     int h = utilhash((1 << 31) | (depth << 22) | iter) ^ utilhash(index);
-    return thrust::default_random_engine(h);
+	return thrust::default_random_engine(h);
 }
 
 //Kernel that writes the image to the OpenGL PBO directly.
@@ -72,15 +76,66 @@ static glm::vec3 *dev_image = NULL;
 // TODO: static variables for device memory, scene/camera info, etc
 // ...
 
+static Path * dev_path;
+
+//static Ray * dev_ray0;
+//static Ray * dev_ray1;
+
+//static thrust::device_vector<Ray> * dev_ray0;
+
+//static Ray * dev_ray_cur;
+//static Ray * dev_ray_next;
+
+//static thrust::device_vector<Geom> dev_geom;			//global memory
+//static thrust::device_vector<Material> dev_material;	//global
+static Geom * dev_geom;
+static Material * dev_material;
+
+
+
+//kd tree structure
+static Node * dev_node;		//kd tree node
+
+static int * dev_geom_idx;
+
+
+
 void pathtraceInit(Scene *scene) {
+	
     hst_scene = scene;
+	
     const Camera &cam = hst_scene->state.camera;
     const int pixelcount = cam.resolution.x * cam.resolution.y;
-
+	
     cudaMalloc(&dev_image, pixelcount * sizeof(glm::vec3));
     cudaMemset(dev_image, 0, pixelcount * sizeof(glm::vec3));
     // TODO: initialize the above static variables added above
+	
+	
+
+	cudaMalloc(&dev_path, pixelcount * sizeof(Path));
 
+	
+
+	cudaMalloc(&dev_geom, scene->geoms.size() * sizeof (Geom));
+	cudaMemcpy(dev_geom, scene->geoms.data() , scene->geoms.size() * sizeof (Geom), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&dev_material,scene->materials.size() * sizeof(Material));
+	cudaMemcpy(dev_material,scene->materials.data() , scene->materials.size() * sizeof (Material), cudaMemcpyHostToDevice);
+
+
+	
+	//KD-tree
+#ifdef USE_KDTREE_FLAG	
+
+	cudaMalloc(&dev_node, (scene->kdtree.hst_node.size()) * sizeof(Node) );
+	cudaMemcpy(dev_node, scene->kdtree.hst_node.data(), (scene->kdtree.hst_node.size()) * sizeof(Node), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&dev_geom_idx, (scene->kdtree.hst_geom_idx.size()) * sizeof(int));
+	cudaMemcpy(dev_geom_idx, scene->kdtree.hst_geom_idx.data(), (scene->kdtree.hst_geom_idx.size()) * sizeof(int), cudaMemcpyHostToDevice);
+#endif
+
+	
     checkCUDAError("pathtraceInit");
 }
 
@@ -88,34 +143,500 @@ void pathtraceFree() {
     cudaFree(dev_image);  // no-op if dev_image is null
     // TODO: clean up the above static variables
 
+#ifdef USE_KDTREE_FLAG
+	cudaFree(dev_node);
+	cudaFree(dev_geom_idx);
+#endif
+	
+	cudaFree(dev_path);
+
+	cudaFree(dev_geom);
+	cudaFree(dev_material);
+
     checkCUDAError("pathtraceFree");
 }
 
+
+__host__ __device__ void getCameraRayAtPixel(Path & path,const Camera &c, int x, int y,int iter,int index)
+{
+	thrust::default_random_engine rng = makeSeededRandomEngine(iter, index, 0);
+	thrust::uniform_real_distribution<float> u01(0, 1);
+
+
+	path.ray.origin = c.position;
+	path.ray.direction = glm::normalize(c.view
+		- c.right * c.pixelLength.x * ((float)x - (float)c.resolution.x * 0.5f + u01(rng))  		//u01(rng) is for jiitering for antialiasing
+		- c.up * c.pixelLength.y * ((float)y - (float)c.resolution.y * 0.5f + u01(rng)) 			//u01(rng) is for jiitering for antialiasing
+		);
+
+	if (c.lensRadiaus > 0)
+	{
+		//lens effect
+		float r = c.lensRadiaus * u01(rng);
+		float theta = u01(rng) * 2 * PI;
+
+		
+		float t = c.focalDistance * c.view.z / path.ray.direction.z;
+
+		glm::vec3 pfocus = path.ray.origin + t * path.ray.direction;
+
+		path.ray.origin = c.position + c.right * r * cos(theta) - c.up * r * sin(theta);
+		path.ray.direction = glm::normalize(pfocus - path.ray.origin);
+	}
+	
+	path.image_index = index;
+	path.color = glm::vec3(1.0f);
+	path.terminated = false;
+	
+}
+
+
 /**
- * Example function to generate static and test the CUDA-GL interop.
- * Delete this once you're done looking at it!
+ * Generate Rays from camera through screen to the field
+ * which is the first generation of rays
+ *
+ * Antialiasing - num of rays per pixel
+ * motion blur - jitter scene position
+ * lens effect - jitter camera position
  */
-__global__ void generateNoiseDeleteMe(Camera cam, int iter, glm::vec3 *image) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+__global__ void generateRayFromCamera(Camera cam, int iter, Path* paths)
+{
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
     int y = (blockIdx.y * blockDim.y) + threadIdx.y;
 
     if (x < cam.resolution.x && y < cam.resolution.y) {
         int index = x + (y * cam.resolution.x);
+		Path & path = paths[index];
+		getCameraRayAtPixel(path,cam,x,y,iter,index);
+
 
-        thrust::default_random_engine rng = makeSeededRandomEngine(iter, index, 0);
-        thrust::uniform_real_distribution<float> u01(0, 1);
-
-        // CHECKITOUT: Note that on every iteration, noise gets added onto
-        // the image (not replaced). As a result, the image smooths out over
-        // time, since the output image is the contents of this array divided
-        // by the number of iterations.
-        //
-        // Your renderer will do the same thing, and, over time, it will become
-        // smoother.
-        image[index] += glm::vec3(u01(rng));
     }
 }
 
+
+
+//__device__ void hitTestGeomsNaive(int geoms_size,Path & path,Geom * geoms,glm::vec3 & intersect_point, glm::vec3 & normal,int & hit_geom_index)
+//{
+//	float t;
+//	float t_min = FLT_MAX;
+//	for(int i = 0; i < geoms_size; i++)
+//	{
+//		//Geom & geom = static_cast<Geom>(*it);
+//		glm::vec3 tmp_intersect;
+//		glm::vec3 tmp_normal;
+//		Geom geom = geoms[i];
+//		if( geom.type == CUBE)
+//		{
+//			t = boxIntersectionTest(geom,path.ray,tmp_intersect,tmp_normal);
+//		}
+//		else if( geom.type == SPHERE)
+//		{
+//			t = sphereIntersectionTest(geom,path.ray,tmp_intersect,tmp_normal);
+//		}
+//		else
+//		{
+//			//TODO: triangle
+//			printf("ERROR: geom type error at %d\n",i);
+//		}
+//
+//		if(t > 0 && t_min > t)
+//		{
+//			t_min = t;
+//			hit_geom_index = i;
+//			intersect_point = tmp_intersect;
+//			normal = tmp_normal;
+//		}
+//	}
+//}
+
+
+__device__ int kd_search_leaf(int & cur_idx, Node * nodes, Geom* geoms, int * geomsid
+	                          ,const Ray & ray,float& tmin, float& tmax, float global_tmax
+							  ,glm::vec3 & intersect, glm::vec3 & normal, bool & outside)
+{
+	//search for a hit in this leaf
+	Node & n = nodes[cur_idx];
+	float t;
+	glm::vec3 leaf_intersect_point;
+	glm::vec3 leaf_normal;
+	float t_min = FLT_MAX;
+	int hit_geom_index = -1;
+	bool leaf_outside = true;
+
+	for (int i = 0; i < n.num_geoms; i++)
+	{
+		glm::vec3 tmp_intersect;
+		glm::vec3 tmp_normal;
+		bool tmp_outside = true;
+		int gid = geomsid[n.geom_index + i];
+		Geom & geom = geoms[gid];
+		if (geom.type == CUBE)
+		{
+			t = boxIntersectionTest(geom, ray, tmp_intersect, tmp_normal, tmp_outside);
+		}
+		else if (geom.type == SPHERE)
+		{
+			t = sphereIntersectionTest(geom, ray, tmp_intersect, tmp_normal, tmp_outside);
+		}
+		else
+		{
+			// triangle
+			t = triangleIntersectionTest(geom, ray, tmp_intersect, tmp_normal, tmp_outside);
+		}
+
+		if (t > 0 && t_min > t)
+		{
+			t_min = t;
+			hit_geom_index = gid;
+			leaf_intersect_point = tmp_intersect;
+			leaf_normal = tmp_normal;
+			leaf_outside = tmp_outside;
+		}
+	}
+
+
+
+	//////////////////////////////////////////////////////////////////////
+
+	if(hit_geom_index != -1)
+	{
+		// found hithit
+		intersect = leaf_intersect_point;
+		normal = leaf_normal;
+		outside = leaf_outside;
+		return hit_geom_index;
+	}
+	else
+	{
+		//continue search
+		if (fabs(tmax - global_tmax) < EPSILON)
+		{
+			//fail, no collision
+			//end search
+			//printf("end\n");
+			return -1;
+		}
+		else
+		{
+			//return -1;
+
+			////kd-restart
+			//tmin = tmax;
+			//tmax = global_tmax;
+			//cur_idx = 0;
+			//return -2;
+
+
+			//-------------------------------------
+
+			//kd-backtrack
+
+			float tmp_tmin = tmax,tmp_tmax = global_tmax;
+			float t0, t1;
+
+			//backtrack
+
+			bool tmp_hit;
+			int backtrack_idx = cur_idx;
+			
+			do
+			{
+				backtrack_idx = nodes[backtrack_idx].parent_idx;
+				if (backtrack_idx < 0)
+				{
+					//shouldn't happen
+					//end
+					return -1;
+				}
+
+				tmp_hit = AABBIntersect(nodes[backtrack_idx].aabb, ray, t0, t1);
+
+				//intersection limited to (tmp_tmin,tmp_tmax)
+				if (t1 < tmp_tmin + EPSILON)
+				{
+					tmp_hit = false;
+				}
+				else if (t0 < tmp_tmin)
+				{
+					t0 = tmp_tmin;
+				}
+			} while (!tmp_hit);
+
+
+
+			//bool tmp_hit = AABBIntersect(n.aabb,ray,t0,t1);
+			//int backtrack_idx = cur_idx;
+			//if (!(t0 >= tmp_tmin && t1 <= tmp_tmax))
+			//{
+			//	tmp_hit = false;
+			//}
+
+			//while(!tmp_hit)
+			//{
+			//	//tmp_tmin = tmax;
+			//	//tmp_tmax = global_tmax;
+			//	
+			//	//call backtrack again
+			//	backtrack_idx = nodes[backtrack_idx].parent_idx;
+
+			//	if (backtrack_idx < 0)
+			//	{
+			//		//error...
+			//		//shouldn't happen
+			//		//printf("ERROR: kd tree backtrack to root!\n");
+			//		return -1;
+			//	}
+
+			//	tmp_hit = AABBIntersect(nodes[backtrack_idx].aabb,ray,t0,t1);
+			//	if (! (t0 >= tmp_tmin && t1 <= tmp_tmax ) )
+			//	{
+			//		tmp_hit = false;
+			//	}
+			//}
+			
+
+
+			//has intersection
+			cur_idx = backtrack_idx;
+			//tmin = tmp_tmin+RAY_EPSILON;
+			tmin = t0;
+			tmax = t1;
+			//tmax = global_tmax;
+			
+			return -2;
+		}
+	}
+}
+
+
+__device__ int kd_search_split(int & cur_idx,Node & n,const Ray & ray,float& tmin, float& tmax)
+{
+	float thit = (n.split.pos - ray.origin[n.split.axis]) / ray.direction[n.split.axis];
+	int first,second;
+	//order
+	if(ray.direction[n.split.axis] > 0.0f)
+	{
+		//first = n.left_idx;
+		first = cur_idx + 1;
+		second = n.right_idx;
+	}
+	else
+	{
+		first = n.right_idx;
+		//second = n.left_idx;
+		second = cur_idx + 1;
+	}
+
+
+	if(thit >= tmax || thit < 0)
+	{
+		cur_idx = first;
+		//cur_idx = second;
+	}
+	else if( thit <= tmin + EPSILON)
+	{
+		
+		cur_idx = second;
+	}
+	else
+	{
+		cur_idx = first;
+		tmax = thit;
+	}
+
+
+	return -2;
+}
+
+
+//return:
+//-1 end, no collision
+//-2 continue
+//>=0 hit_geom_id
+__device__ int kd_search_node(int & cur_idx,Node * nodes,Geom* geoms, int * geomsid
+							  ,const Ray & ray,float& tmin,float& tmax, float global_tmax
+							  ,glm::vec3 & intersect, glm::vec3 & normal, bool & outside)
+{
+	if(nodes[cur_idx].geom_index == -1)
+	{
+		//internal node
+		return kd_search_split(cur_idx,nodes[cur_idx],ray, tmin, tmax);
+		
+	}
+	else
+	{
+		//leaf node
+		//return 8;
+		return kd_search_leaf(cur_idx, nodes, geoms, geomsid
+			, ray, tmin, tmax, global_tmax
+			, intersect, normal, outside);
+	}
+}
+
+//__device__ void kd_search_init(int root_idx,Node * nodes,const Ray & ray)
+//{
+//	float tmin,tmax;
+//	AABBIntersect(nodes[root_idx].aabb,ray,tmin,tmax);
+//	kd_serach_node(root_idx,nodes,ray,tmin,tmax);
+//}
+
+
+
+
+__global__ void pathTraceOneBounce(int iter, int depth,int num_paths,glm::vec3 * image
+										,Path * paths
+										,Geom * geoms, int geoms_size
+										,Material * materials, int materials_size
+										,Node * nodes
+										,int num_nodes
+										, int * geomsid
+										//,const thrust::device_vector<Geom> & geoms , const thrust::device_vector<Material> & materials
+										)
+{
+	//int blockId = blockIdx.x + blockIdx.y * gridDim.x;
+	//int path_index = blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
+	int path_index =  blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
+	
+	if(path_index < num_paths)
+	{
+		Path & path = paths[path_index];	//TODO: reconsider the speed for the memory access here
+		//Path & path = paths[path_index];
+		//calculate intersection
+		float t;
+		glm::vec3 intersect_point;
+		glm::vec3 normal;
+		float t_min = FLT_MAX;
+		int hit_geom_index = -1;
+		bool outside = true;
+
+
+#ifndef USE_KDTREE_FLAG
+		//naive parse through global geoms
+
+		for (int i = 0; i < geoms_size; i++)
+		{
+			//Geom & geom = static_cast<Geom>(*it);
+			glm::vec3 tmp_intersect;
+			glm::vec3 tmp_normal;
+			Geom & geom = geoms[i];
+			if (geom.type == CUBE)
+			{
+				t = boxIntersectionTest(geom, path.ray, tmp_intersect, tmp_normal, outside);
+			}
+			else if (geom.type == SPHERE)
+			{
+				t = sphereIntersectionTest(geom, path.ray, tmp_intersect, tmp_normal, outside);
+			}
+			else
+			{
+				// triangle
+				//printf("ERROR: geom type error at %d\n",i);
+				t = triangleIntersectionTest(geom, path.ray, tmp_intersect, tmp_normal, outside);
+			}
+
+			if (t > 0 && t_min > t)
+			{
+				t_min = t;
+				hit_geom_index = i;
+				intersect_point = tmp_intersect;
+				normal = tmp_normal;
+			}
+		}
+		
+
+		///////////////////////////////
+#else
+
+		//TODO:k-d tree traverse approach
+		//int traverse = 0;
+		int max_traverse = 2 * num_nodes;	//to prevent deadlock
+
+		int state = -2;
+		int cur_idx = 0;		//tmp, root node always 0....
+		float global_tmin, global_tmax;
+		AABBIntersect(nodes[cur_idx].aabb, path.ray, global_tmin, global_tmax);
+		float tmin = global_tmin, tmax = global_tmax;
+		while (state == -2 /*&& traverse < max_traverse*/)
+		{
+			state = kd_search_node(cur_idx, nodes, geoms, geomsid
+				,path.ray, tmin, tmax, global_tmax
+				, intersect_point, normal, outside);
+
+			//traverse++;
+
+		}
+		hit_geom_index = state;
+		////////////////////////////
+#endif
+
+
+		if (hit_geom_index == -1)
+		{
+			path.terminated = true;
+			image[path.image_index] += BACKGROUND_COLOR;
+		}
+		else
+		{
+			//hit something
+			Geom & geom = geoms[hit_geom_index];
+			Material & material = materials[geom.materialid];
+
+			////test, first hit
+			//if (1)
+			//{
+			//	path.terminated = true;
+			//	image[path.image_index] += material.color;
+			//	return;
+			//}
+
+			
+
+			//===================================
+
+
+
+			if (material.emittance > EPSILON)
+			{
+				//light source
+				path.terminated = true;
+				image[path.image_index] += path.color * material.color * material.emittance;
+			}
+			else
+			{
+				path.terminated = false;
+				thrust::default_random_engine rng = makeSeededRandomEngine(iter, path.image_index, depth);
+				scatterRay(path.ray, path.color, intersect_point, normal, material, rng);
+			}
+
+
+			////depth hit test
+			//if (depth == iter-1)
+			//{
+			//	//path.terminated = true;
+			//	image[path.image_index] = material.color * (float)(iter);
+			//	//return;
+			//}
+
+
+
+
+		}
+
+
+		
+	}
+}
+
+
+struct is_path_terminated
+{
+  __host__ __device__
+  bool operator()(const Path path)
+  {
+	  return path.terminated;
+  }
+};
+
 /**
  * Wrapper for the __global__ call that sets up the kernel calls and does a ton
  * of memory management
@@ -125,10 +646,14 @@ void pathtrace(uchar4 *pbo, int frame, int iter) {
     const Camera &cam = hst_scene->state.camera;
     const int pixelcount = cam.resolution.x * cam.resolution.y;
 
-    const dim3 blockSize2d(8, 8);
-    const dim3 blocksPerGrid2d(
-            (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
-            (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
+    const int blockSideLength = 8;
+    const dim3 blockSize(blockSideLength, blockSideLength);
+	const int blockSizeTotal = blockSideLength * blockSideLength;
+    
+	const dim3 blockSize2d(8, 8);
+	const dim3 blocksPerGrid2d(
+		(cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
+		(cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
 
     ///////////////////////////////////////////////////////////////////////////
 
@@ -157,9 +682,49 @@ void pathtrace(uchar4 *pbo, int frame, int iter) {
     // * Finally, handle all of the paths that still haven't terminated.
     //   (Easy way is to make them black or background-colored.)
 
-    // TODO: perform one iteration of path tracing
+   
+	////test
+	//getchar();
+	//printf("%d\n", iter-1);
+
+
+    //generateNoiseDeleteMe<<<blocksPerGrid, blockSize>>>(cam, iter, dev_image);
+
+
+
+	int depth = 0;
 
-    generateNoiseDeleteMe<<<blocksPerGrid2d, blockSize2d>>>(cam, iter, dev_image);
+	generateRayFromCamera<<<blocksPerGrid2d,blockSize>>>(cam,iter,dev_path);
+	checkCUDAError("generate camera ray");
+
+	
+	Path* dev_path_end = dev_path + pixelcount;
+	int num_path = dev_path_end - dev_path;
+	//loop
+	while (/*dev_path_end != dev_path*/num_path > 0 && depth < traceDepth)
+	{
+		
+
+		
+		dim3 blocksNeeded = (num_path + blockSizeTotal - 1) / blockSizeTotal ;
+		pathTraceOneBounce<<<blocksNeeded,blockSize>>>(iter,depth, num_path  ,dev_image, dev_path
+			, dev_geom, hst_scene->geoms.size()
+			, dev_material, hst_scene->materials.size()
+			, dev_node, hst_scene->kdtree.hst_node.size(), dev_geom_idx);
+		checkCUDAError("trace one bounce");
+		cudaDeviceSynchronize();
+		depth ++;
+
+		////stream compaction
+		//dev_path_end = thrust::remove_if(thrust::device, dev_path, dev_path_end, is_path_terminated() );
+		//num_path = dev_path_end - dev_path;
+
+		//TODO:self work efficient
+		num_path = StreamCompaction::Efficient::compact(num_path, dev_path);
+		
+		checkCUDAError("stream compaction");
+
+	}
 
     ///////////////////////////////////////////////////////////////////////////
 
@@ -171,4 +736,9 @@ void pathtrace(uchar4 *pbo, int frame, int iter) {
             pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
 
     checkCUDAError("pathtrace");
+
+
+	
+	
+	
 }
diff --git a/src/scene.cpp b/src/scene.cpp
index 5804ce3..78fdf4c 100644
--- a/src/scene.cpp
+++ b/src/scene.cpp
@@ -1,10 +1,15 @@
 #include <iostream>
+#include <algorithm>
+#include <list>
 #include "scene.h"
 #include <cstring>
 #include <glm/gtc/matrix_inverse.hpp>
 #include <glm/gtx/string_cast.hpp>
 
-Scene::Scene(string filename) {
+
+
+
+void Scene::loadScene(string filename) {
     cout << "Reading scene from " << filename << " ..." << endl;
     cout << " " << endl;
     char* fname = (char*)filename.c_str();
@@ -33,66 +38,127 @@ Scene::Scene(string filename) {
 }
 
 int Scene::loadGeom(string objectid) {
-    int id = atoi(objectid.c_str());
-    if (id != geoms.size()) {
-        cout << "ERROR: OBJECT ID does not match expected number of geoms" << endl;
-        return -1;
-    } else {
-        cout << "Loading Geom " << id << "..." << endl;
-        Geom newGeom;
-        string line;
+	//vector<Geom> & geoms_using = USE_KDTREE_FLAG ? tmp_geoms: geoms;
+	vector<Geom> & geoms_using = geoms;
 
-        //load object type
-        utilityCore::safeGetline(fp_in, line);
-        if (!line.empty() && fp_in.good()) {
-            if (strcmp(line.c_str(), "sphere") == 0) {
-                cout << "Creating new sphere..." << endl;
-                newGeom.type = SPHERE;
-            } else if (strcmp(line.c_str(), "cube") == 0) {
-                cout << "Creating new cube..." << endl;
-                newGeom.type = CUBE;
-            }
-        }
 
-        //link material
-        utilityCore::safeGetline(fp_in, line);
-        if (!line.empty() && fp_in.good()) {
-            vector<string> tokens = utilityCore::tokenizeString(line);
-            newGeom.materialid = atoi(tokens[1].c_str());
-            cout << "Connecting Geom " << objectid << " to Material " << newGeom.materialid << "..." << endl;
-        }
+	int id = atoi(objectid.c_str());
+	//if (id != geoms.size()) {
+	//    cout << "ERROR: OBJECT ID does not match expected number of geoms" << endl;
+	//    return -1;
+	//} else {
+	cout << "Loading Geom " << id << "..." << endl;
+	Geom newGeom;
+	string line;
 
-        //load transformations
-        utilityCore::safeGetline(fp_in, line);
-        while (!line.empty() && fp_in.good()) {
-            vector<string> tokens = utilityCore::tokenizeString(line);
+	bool isObj = false;
+	string objfilename;
 
-            //load tranformations
-            if (strcmp(tokens[0].c_str(), "TRANS") == 0) {
-                newGeom.translation = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
-            } else if (strcmp(tokens[0].c_str(), "ROTAT") == 0) {
-                newGeom.rotation = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
-            } else if (strcmp(tokens[0].c_str(), "SCALE") == 0) {
-                newGeom.scale = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
-            }
+	//load object type
+	utilityCore::safeGetline(fp_in, line);
+	if (!line.empty() && fp_in.good()) {
+		if (strcmp(line.c_str(), "sphere") == 0) {
+			cout << "Creating new sphere..." << endl;
+			newGeom.type = SPHERE;
+		}
+		else if (strcmp(line.c_str(), "cube") == 0) {
+			cout << "Creating new cube..." << endl;
+			newGeom.type = CUBE;
+		}
+		else if (strcmp(line.c_str(), "triangle") == 0)
+		{
+			cout << "Creating new triangle..." << endl;
+			newGeom.type = TRIANGLE;
+		}
+		else if (strcmp(line.c_str(), "obj") == 0)
+		{
+			cout << "Creating from obj file ..." << endl;
+			isObj = true;
+		}
+	}
 
-            utilityCore::safeGetline(fp_in, line);
-        }
+	//link material
+	utilityCore::safeGetline(fp_in, line);
+	if (!line.empty() && fp_in.good()) {
+		vector<string> tokens = utilityCore::tokenizeString(line);
+		newGeom.materialid = atoi(tokens[1].c_str());
+		cout << "Connecting Geom " << objectid << " to Material " << newGeom.materialid << "..." << endl;
+	}
 
-        newGeom.transform = utilityCore::buildTransformationMatrix(
-                newGeom.translation, newGeom.rotation, newGeom.scale);
-        newGeom.inverseTransform = glm::inverse(newGeom.transform);
-        newGeom.invTranspose = glm::inverseTranspose(newGeom.transform);
+	//load transformations
+	utilityCore::safeGetline(fp_in, line);
+	while (!line.empty() && fp_in.good()) {
+		vector<string> tokens = utilityCore::tokenizeString(line);
 
-        geoms.push_back(newGeom);
-        return 1;
-    }
+		//load tranformations
+		if (strcmp(tokens[0].c_str(), "TRANS") == 0) {
+			newGeom.translation = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+		}
+		else if (strcmp(tokens[0].c_str(), "ROTAT") == 0) {
+			newGeom.rotation = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+		}
+		else if (strcmp(tokens[0].c_str(), "SCALE") == 0) {
+			newGeom.scale = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+		}
+		//OBJ
+		else if (strcmp(tokens[0].c_str(), "OBJFILE") == 0)
+		{
+			objfilename = tokens[1];
+		}
+		//TRIANGLE
+		else if (strcmp(tokens[0].c_str(), "A") == 0) {
+			newGeom.translation = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+		}
+		else if (strcmp(tokens[0].c_str(), "B") == 0) {
+			newGeom.rotation = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+		}
+		else if (strcmp(tokens[0].c_str(), "C") == 0) {
+			newGeom.scale = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+		}
+
+		utilityCore::safeGetline(fp_in, line);
+	}
+
+
+	newGeom.transform = utilityCore::buildTransformationMatrix(
+		newGeom.translation, newGeom.rotation, newGeom.scale);
+	newGeom.inverseTransform = glm::inverse(newGeom.transform);
+	newGeom.invTranspose = glm::inverseTranspose(newGeom.transform);
+
+	if (!isObj)
+	{
+		if (newGeom.type == TRIANGLE)
+		{
+			glm::vec3 & a = newGeom.translation;
+			glm::vec3 & b = newGeom.rotation;
+			glm::vec3 & c = newGeom.scale;
+
+			glm::vec3 n = glm::normalize(glm::cross(b - a, c - a));
+			for (int j = 0; j < 3; j++)
+			{
+				newGeom.transform[0][j] = n[j];	//at a
+				newGeom.transform[1][j] = n[j];	//at b
+				newGeom.transform[2][j] = n[j];	//at c
+			}
+		}
+		geoms_using.push_back(newGeom);
+	}
+	else
+	{
+		loadObjSimple(objfilename, newGeom.transform, newGeom.invTranspose, newGeom.materialid);
+	}
+
+
+
+	return 1;
+	//}
 }
 
 int Scene::loadCamera() {
     cout << "Loading Camera ..." << endl;
     RenderState &state = this->state;
     Camera &camera = state.camera;
+	camera.lensRadiaus = -1.0f;
     float fovy;
 
     //load static properties
@@ -111,7 +177,7 @@ int Scene::loadCamera() {
             state.traceDepth = atoi(tokens[1].c_str());
         } else if (strcmp(tokens[0].c_str(), "FILE") == 0) {
             state.imageName = tokens[1];
-        }
+		}
     }
 
     string line;
@@ -124,7 +190,13 @@ int Scene::loadCamera() {
             camera.view = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
         } else if (strcmp(tokens[0].c_str(), "UP") == 0) {
             camera.up = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
-        }
+		}
+		else if (strcmp(tokens[0].c_str(), "LENS_RADIUS") == 0) {
+			state.camera.lensRadiaus = atof(tokens[1].c_str());
+		}
+		else if (strcmp(tokens[0].c_str(), "FOCAL_LENGTH") == 0) {
+			state.camera.focalDistance = atof(tokens[1].c_str());
+		}
 
         utilityCore::safeGetline(fp_in, line);
     }
@@ -135,6 +207,19 @@ int Scene::loadCamera() {
     float fovx = (atan(xscaled) * 180) / PI;
     camera.fov = glm::vec2(fovx, fovy);
 
+	//MY: store tan(fovy) and tan(fovx)
+	camera.pixelLength = glm::vec2(2*xscaled/(float)camera.resolution.x,2*yscaled/(float)camera.resolution.y);
+
+	//MY: calculate right vector
+	camera.view = glm::normalize(camera.view);
+
+	camera.right = glm::normalize(glm::cross(camera.view, camera.up));
+	
+	//make sure up is perpendicular to view
+	camera.up = glm::cross(camera.right,camera.view);
+
+
+
     //set up render camera stuff
     int arraylen = camera.resolution.x * camera.resolution.y;
     state.image.resize(arraylen);
@@ -180,3 +265,204 @@ int Scene::loadMaterial(string materialid) {
         return 1;
     }
 }
+
+
+//simple implemention
+//adapted from my uc berkeley cs184 ray tracer project
+void Scene::loadObjSimple(const string & objname, glm::mat4 & t, glm::mat4 & t_normal, int material_id)
+{
+	//vector<Geom> & geoms_using = USE_KDTREE_FLAG ? tmp_geoms : geoms;
+	vector<Geom> & geoms_using = geoms;
+	
+	
+	//fast implement
+	
+	//now doesn't use
+
+	//Affine3f t_normal(t.inverse().matrix().transpose());
+	//glm::mat4 t_inv = glm::inverse(t);
+	//glm::mat4 t_normal = glm::inverseTranspose(t);
+
+	vector<glm::vec3> vec_Vert;
+	vector<glm::vec3> vec_Nor;
+
+
+	ifstream file;
+	file.open(objname);
+
+	if (!file.is_open())
+	{
+		std::cout << "error: Unable to open Obj file" << std::endl;
+	}
+	else
+	{
+		string line;
+		while (file.good()) {
+			std::vector<std::string> split;
+			std::string buf;
+			std::getline(file, line);
+			std::stringstream ss(line);
+			while (ss >> buf) {
+				split.push_back(buf);
+			}
+			if (split.size() == 0) {
+				continue;
+			}
+			if (split[0][0] == '#'){
+				continue;
+			}
+			else if (split[0] == "g")
+			{
+				//group...
+				continue;
+			}
+			else if (split[0] == "s")
+			{
+				//ignore...
+				continue;
+			}
+			else if (split[0] == "mtllib")
+			{
+				//ignore
+				continue;
+			}
+			else if (split[0] == "usemtl")
+			{
+				//ignore
+				continue;
+			}
+			else if (split[0] == "v"){
+				//vertex
+
+				float x = atof(split[1].c_str());
+				float y = atof(split[2].c_str());
+				float z = atof(split[3].c_str());
+
+				vec_Vert.push_back(glm::vec3(x, y, z));
+
+			}
+			else if (split[0] == "vn"){
+				//normal
+
+				float x = atof(split[1].c_str());
+				float y = atof(split[2].c_str());
+				float z = atof(split[3].c_str());
+
+				vec_Nor.push_back(glm::vec3(x, y, z));
+			}
+			else if (split[0] == "vt")
+			{
+				//texture
+				//ignore now
+			}
+			else if (split[0] == "f")
+			{
+				//face
+
+				string s;
+				int split_size = split.size();
+				int num_v = split_size - 1;
+
+				int v[50];
+				int vt[50];
+				int vn[50];
+				vn[1] = -1;
+
+				int i;
+				for (i = 1; i<split_size; i++)
+				{
+					std::istringstream ss(split.at(i));
+					getline(ss, s, '/');
+					v[i] = atoi(s.c_str());
+					s = "-1";
+
+					getline(ss, s, '/');
+					vt[i] = atoi(s.c_str());	//texture
+					s = "-1";
+
+					getline(ss, s, '/');
+					vn[i] = atoi(s.c_str());
+					s = "-1";
+				}
+
+
+
+				for (i = 3; i <= num_v; i++)
+				{
+					//TODO : implement transform later
+
+					//anti-clockwise
+					glm::vec3 pa = vec_Vert.at(v[1] - 1);
+					glm::vec3 pb = vec_Vert.at(v[i - 1] - 1);
+					glm::vec3 pc = vec_Vert.at(v[i] - 1);
+
+					pa = glm::vec3(t * glm::vec4(pa, 1));
+					pb = glm::vec3(t * glm::vec4(pb, 1));
+					pc = glm::vec3(t * glm::vec4(pc, 1));
+
+					glm::vec3 an, bn, cn;	//normal
+
+					if (vec_Nor.size() == 0)
+					{
+						an = glm::normalize (glm::cross(pb - pa, pc - pa));
+						bn = an;
+						cn = an;
+					}
+					else if (vn[1] == -1)
+					{
+						//normal not explicitly assigned.
+						//use vertex id as normal id
+						
+						an = glm::vec3 (t_normal * glm::vec4(vec_Nor.at(v[1] - 1), 0));
+						bn = glm::vec3 (t_normal * glm::vec4(vec_Nor.at(v[i-1] - 1), 0));
+						cn = glm::vec3 (t_normal * glm::vec4(vec_Nor.at(v[i] - 1), 0));
+
+					}
+					else
+					{
+
+						an = glm::vec3(t_normal * glm::vec4(vec_Nor.at(vn[1] - 1), 0));
+						bn = glm::vec3(t_normal * glm::vec4(vec_Nor.at(vn[i - 1] - 1), 0));
+						cn = glm::vec3(t_normal * glm::vec4(vec_Nor.at(vn[i] - 1), 0));
+
+					}
+
+
+					//ugly implementation
+					Geom triangle;
+					triangle.translation = pa;
+					triangle.rotation = pb;
+					triangle.scale = pc;
+					for (int j = 0; j < 3; j++)
+					{
+						triangle.transform[0][j] = an[j];
+						triangle.transform[1][j] = bn[j];
+						triangle.transform[2][j] = cn[j];
+					}
+					
+					triangle.materialid = material_id;
+					triangle.type = TRIANGLE;
+
+
+					geoms_using.push_back(triangle);
+				}
+
+
+
+
+			}
+			else
+			{
+				std::cout << "Warning! undefined label!\n";
+				abort();
+			}
+
+
+
+
+		}
+	}
+
+
+	file.close();
+}
diff --git a/src/scene.h b/src/scene.h
index f29a917..fdec7e2 100644
--- a/src/scene.h
+++ b/src/scene.h
@@ -4,23 +4,42 @@
 #include <sstream>
 #include <fstream>
 #include <iostream>
+
+#include <thrust/host_vector.h>
+
 #include "glm/glm.hpp"
 #include "utilities.h"
 #include "sceneStructs.h"
 
+#include "kdtree.h"
+
 using namespace std;
 
+//#define USE_KDTREE_FLAG
+
+
 class Scene {
 private:
     ifstream fp_in;
     int loadMaterial(string materialid);
     int loadGeom(string objectid);
     int loadCamera();
+
+	void loadObjSimple(const string & objname, glm::mat4 & t, glm::mat4 & t_normal, int material_id);
 public:
-    Scene(string filename);
-    ~Scene();
+    //Scene(string filename);
+	void loadScene(string filename);
+    //~Scene();
 
     std::vector<Geom> geoms;
     std::vector<Material> materials;
+	//thrust::host_vector<Geom> geoms;
+	//thrust::host_vector<Material> materials;
     RenderState state;
+
+
+	//MY
+	KDTree kdtree;
+
+	std::vector<Geom> tmp_geoms;
 };
diff --git a/src/sceneStructs.h b/src/sceneStructs.h
index baa2e30..5bfbf70 100644
--- a/src/sceneStructs.h
+++ b/src/sceneStructs.h
@@ -5,9 +5,14 @@
 #include <cuda_runtime.h>
 #include "glm/glm.hpp"
 
+using namespace std;
+#define BACKGROUND_COLOR (glm::vec3(0.0f))
+//#define MAX_RAY_TRACE_DEPTH (5)
+
 enum GeomType {
     SPHERE,
     CUBE,
+	TRIANGLE
 };
 
 struct Ray {
@@ -44,6 +49,17 @@ struct Camera {
     glm::vec3 view;
     glm::vec3 up;
     glm::vec2 fov;
+
+	glm::vec3 right;	// same in x direction of the screen
+	
+	glm::vec2 pixelLength;
+
+
+	///////
+	// lens camera
+	////////
+	float lensRadiaus;
+	float focalDistance;
 };
 
 struct RenderState {
@@ -53,3 +69,37 @@ struct RenderState {
     std::vector<glm::vec3> image;
     std::string imageName;
 };
+
+
+
+//MY
+
+struct Path
+{
+	Ray ray;
+	glm::vec3 color;
+
+	int image_index;
+	bool terminated;
+};
+
+
+
+enum AXIS { AXIS_X = 0, AXIS_Y, AXIS_Z};
+
+struct AAPlane
+{	
+	AXIS axis;
+	float pos;
+};
+
+struct AABB
+{
+	glm::vec3 min_pos;
+	glm::vec3 max_pos;
+};
+
+
+
+
+
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index ac358c9..05a8550 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -1,4 +1,8 @@
 set(SOURCE_FILES
+    "common.h"
+    "common.cu"
+    "stream_compaction.h"
+    "stream_compaction.cu"
     )
 
 cuda_add_library(stream_compaction
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
new file mode 100644
index 0000000..d17484f
--- /dev/null
+++ b/stream_compaction/common.cu
@@ -0,0 +1,83 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+
+
+
+//void checkCUDAErrorFn(const char *msg, const char *file, int line) {
+//    cudaError_t err = cudaGetLastError();
+//    if (cudaSuccess == err) {
+//        return;
+//    }
+//
+//    fprintf(stderr, "CUDA error");
+//    if (file) {
+//        fprintf(stderr, " (%s:%d)", file, line);
+//    }
+//    fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
+//    exit(EXIT_FAILURE);
+//}
+
+
+namespace StreamCompaction {
+namespace Common {
+
+	__global__ void kernZeroArray(int n, int * data)
+	{
+		int k = threadIdx.x + blockDim.x * blockIdx.x;
+		if(k < n)
+		{
+			data[k] = 0;
+		}
+	}
+
+
+
+	__global__ void kernInclusive2Exclusive(int n, int * exclusive, const int * inclusive)
+	{
+		int k = threadIdx.x + blockDim.x * blockIdx.x;
+		if( k < n)
+		{
+			if(k == 0)
+			{
+				exclusive[k] = IDENTITY;
+			}
+			else
+			{
+				exclusive[k] = inclusive[k-1];
+			}
+		}
+	}
+
+
+
+	/**
+	* Maps an array to an array of 0s and 1s for stream compaction. Elements
+	* which map to 0 will be removed, and elements which map to 1 will be kept.
+	*/
+	__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
+		int k = threadIdx.x + blockDim.x * blockIdx.x;
+		if( k < n )
+		{
+			bools[k] = idata[k] != 0 ? 1 : 0;
+		}
+	}
+
+	/**
+	* Performs scatter on an array. That is, for each element in idata,
+	* if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
+	*/
+	__global__ void kernScatter(int n, int *odata,
+		const int *idata, const int *bools, const int *indices) {
+			int k = threadIdx.x + blockDim.x * blockIdx.x;
+			if( k < n )
+			{
+				if(bools[k] == 1)
+				{
+					odata[ indices[k] ] = idata[k];
+				}
+			}
+	}
+
+}
+}
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
new file mode 100644
index 0000000..bce939b
--- /dev/null
+++ b/stream_compaction/common.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+
+#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+//#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+
+#define IDENTITY (0)
+
+
+
+
+//const int blockSize = 192;
+//const int blockSize = 128;
+const int blockSize = 32;
+//const int blockSize = 4;
+
+/**
+ * Check for CUDA errors; print and exit if there was a problem.
+ */
+//void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1);
+
+inline int ilog2(int x) {
+    int lg = 0;
+    while (x >>= 1) {
+        ++lg;
+    }
+    return lg;
+}
+
+inline int ilog2ceil(int x) {
+    return ilog2(x - 1) + 1;
+}
+
+
+namespace StreamCompaction {
+namespace Common {
+	__global__ void kernZeroArray(int n, int * data);
+
+	__global__ void kernInclusive2Exclusive(int n, int * exclusive, const int * inclusive);
+
+    __global__ void kernMapToBoolean(int n, int *bools, const int *idata);
+
+    __global__ void kernScatter(int n, int *odata,
+            const int *idata, const int *bools, const int *indices);
+}
+}
diff --git a/stream_compaction/stream_compaction.cu b/stream_compaction/stream_compaction.cu
new file mode 100644
index 0000000..06ff6b4
--- /dev/null
+++ b/stream_compaction/stream_compaction.cu
@@ -0,0 +1,488 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+#include "stream_compaction.h"
+
+
+namespace StreamCompaction {
+	namespace Efficient {
+		//const int blockSize = 128;
+
+
+
+		__global__ void kernUpSweep(int size, int step, int * data)
+		{
+			//step = 2^(d+1)
+			int k = threadIdx.x + blockDim.x * blockIdx.x;
+
+			if (k < size)
+			{
+				if (k % step == 0)
+				{
+					data[k + step - 1] += data[k + (step >> 1) - 1];
+				}
+			}
+
+		}
+
+		__global__ void kernDownSweep(int size, int step, int * data)
+		{
+			//step = 2^(d+1)
+			int k = threadIdx.x + blockDim.x * blockIdx.x;
+
+			if (k < size)
+			{
+				if (k % step == 0)
+				{
+					int left_child = data[k + (step >> 1) - 1];
+					data[k + (step >> 1) - 1] = data[k + step - 1];
+					data[k + step - 1] += left_child;
+				}
+			}
+		}
+
+
+		__global__ void kernSetRootZero(int rootId, int * data)
+		{
+			int k = threadIdx.x + blockDim.x * blockIdx.x;
+			if (k == rootId)
+			{
+				data[k] = 0;
+			}
+		}
+
+
+
+
+
+
+		//combine upsweep, downsweep ... -> one kernel function
+		//one block
+		//g_odata can = g_idata
+		__global__ void kernScan(int N , int * g_odata, const int *g_idata)
+		{
+			extern __shared__ int s_idata[];
+
+			int n = blockDim.x * 2;		//data size
+			int blockOffset = n * blockIdx.x;
+			int thid = threadIdx.x;	// +blockDim.x * blockIdx.x;
+
+			//if (thid + blockOffset < N)
+			//{
+				
+			//int n_data = n * 2;
+			int offset = 1;
+
+			int ai = thid;
+			int bi = thid + (n / 2);
+			int bankOffsetA = CONFLICT_FREE_OFFSET(ai);
+			int bankOffsetB = CONFLICT_FREE_OFFSET(bi);
+
+			//copy data from global to shared
+			s_idata[ai + bankOffsetA] = g_idata[ai + blockOffset];
+			s_idata[bi + bankOffsetB] = g_idata[bi + blockOffset];
+
+			//if (ai + blockOffset >= N)
+			//{
+			//	s_idata[ai + bankOffsetA] = 0;
+			//}
+			//else
+			//{
+			//	s_idata[ai + bankOffsetA] = g_idata[ai + blockOffset];
+			//}
+
+			//if (bi + blockOffset >= N)
+			//{
+			//	s_idata[bi + bankOffsetB] = 0;
+			//}
+			//else
+			//{
+			//	s_idata[bi + bankOffsetB] = g_idata[bi + blockOffset];
+			//}
+
+
+
+
+			//UpSweep
+			for (int d = n >> 1; d > 0; d >>= 1)
+			{
+				__syncthreads();
+				if (thid < d)
+				{
+					int ai = offset * (2 * thid + 1) - 1;
+					int bi = offset * (2 * thid + 2) - 1;
+					ai += CONFLICT_FREE_OFFSET(ai);
+					bi += CONFLICT_FREE_OFFSET(bi);
+
+					s_idata[bi] += s_idata[ai];
+				}
+				offset <<= 1; // * 2
+			}
+
+			
+			//assign 0 to the root
+			if (thid == 0)
+			{
+				s_idata[n - 1 + CONFLICT_FREE_OFFSET(n - 1)] = 0;
+			}
+
+
+
+			//DownSweep
+			for (int d = 1; d < n; d <<= 1)
+			{
+				offset >>= 1;
+				__syncthreads();
+				if (thid < d)
+				{
+					int ai = offset * (2 * thid + 1) - 1;
+					int bi = offset * (2 * thid + 2) - 1;
+					ai += CONFLICT_FREE_OFFSET(ai);
+					bi += CONFLICT_FREE_OFFSET(bi);
+
+					int t = s_idata[ai];
+					s_idata[ai] = s_idata[bi];
+					s_idata[bi] += t;
+				}
+			}
+
+			g_odata[ai + blockOffset] = s_idata[ai + bankOffsetA];
+			g_odata[bi + blockOffset] = s_idata[bi + bankOffsetB];
+			//}
+
+			//test
+			//printf("%d : %d\n", ai + blockOffset, s_idata[ai + bankOffsetA]);
+			//printf("%d : %d\n", bi + blockOffset, s_idata[bi + bankOffsetB]);
+		}
+
+
+
+
+		__global__ void kernCopyToBlockSumArray(int N,int blockArraySize, int * g_sum ,const int * g_bools,const int * g_scans)
+		{
+			//turn from exclusive result to inclusive
+			int k = threadIdx.x + blockDim.x * blockIdx.x;
+
+			if (k < N)
+			{
+				int this_block_last_id = (k)*blockArraySize + blockArraySize - 1;
+				g_sum[k] = g_scans[this_block_last_id] + g_bools[this_block_last_id];
+			}
+			else
+			{
+				g_sum[k] = 0;
+			}
+		}
+
+
+		__global__ void kernAddSumArrayBack(int N,int blockArraySize, int * g_odata, const int * g_sum)
+		{
+			int k = threadIdx.x + blockDim.x * blockIdx.x;
+			if (k < N)
+			{
+
+				//all element after the first block array
+				
+				g_odata[k] += g_sum[(k / blockArraySize)];
+				
+			}
+			
+		}
+
+
+
+		__global__ void kernAssignBool(int N, int * istrues, Path* paths)
+		{
+			int k = threadIdx.x + blockDim.x * blockIdx.x;
+			if (k < N)
+			{
+				istrues[k] = (int)(!paths[k].terminated);
+			}
+			else
+			{
+				istrues[k] = 0;
+			}
+		}
+
+
+		__global__ void kernScatter(int n, Path *odata,
+			const Path *idata, const int *bools, const int *indices)
+		{
+			int k = threadIdx.x + blockDim.x * blockIdx.x;
+			if (k < n)
+			{
+				if (bools[k] == 1)
+				{
+					odata[indices[k]] = idata[k];
+				}
+			}
+		}
+
+
+
+
+		/**
+		* Performs prefix-sum (aka scan) on idata, storing the result into odata.
+		* input g_idata is on global memory on device
+		*/
+
+		
+
+
+
+		/**
+		* size - size of the bools, has already been extended to 2^c
+		* scans, bools - device memory, memory has been allocated
+		*/
+		void scan(int size, int * scans, const int * bools)
+		{
+			dim3 fullBlocksPerGrid_2((size / 2 + blockSize - 1) / blockSize);
+			dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize);
+
+
+			if (fullBlocksPerGrid_2.x == 1)
+			{
+				// one block scan
+				kernScan << <fullBlocksPerGrid_2, blockSize, 2 * blockSize*sizeof(int) >> >(size, scans, bools);
+				cudaDeviceSynchronize();
+				return;
+			}
+
+
+			//multiblock scan
+			int * block_sum_array;
+			int * block_sum_array_bools;
+
+			//per block scan
+			kernScan << <fullBlocksPerGrid_2, blockSize, 2 * blockSize*sizeof(int) >> >(size, scans, bools);
+			
+			//cudaDeviceSynchronize();
+
+
+			//int ceil_log2n_sum = ilog2ceil(fullBlocksPerGrid.x);
+			//int size_sum = 1 << ceil_log2n_sum;
+			int size_sum = fullBlocksPerGrid_2.x;
+
+			dim3 numBlocks_2((size_sum / 2 + blockSize - 1) / blockSize);
+			dim3 numBlocks((size_sum + blockSize - 1) / blockSize);
+
+			cudaMalloc(&block_sum_array, size_sum * sizeof(int));
+
+			cudaDeviceSynchronize();
+			kernCopyToBlockSumArray << <numBlocks, blockSize >> >(size_sum, 2 * blockSize, block_sum_array, bools, scans);
+			cudaDeviceSynchronize();
+
+
+			cudaMalloc(&block_sum_array_bools, size_sum * sizeof(int));
+			cudaMemcpy(block_sum_array_bools, block_sum_array, size_sum * sizeof(int), cudaMemcpyDeviceToDevice);
+			cudaDeviceSynchronize();
+			//if (numBlocks_2.x == 1)
+			//{
+			scan(size_sum, block_sum_array, block_sum_array_bools);
+			//}
+			cudaDeviceSynchronize();
+			
+
+			kernAddSumArrayBack << <fullBlocksPerGrid, blockSize >> >(size, 2 * blockSize, scans, block_sum_array);
+			cudaDeviceSynchronize();
+
+
+			cudaFree(block_sum_array);
+			cudaFree(block_sum_array_bools);
+
+			cudaDeviceSynchronize();
+		}
+
+
+
+
+		//multiple blocks, shared memory, resolve bank conflict
+		int compact(int n, Path* path)
+		{
+			//int * block_sum_array;	//global
+			int * exist;
+			int * scans;
+			Path * tmp_path;
+
+			cudaMalloc(&tmp_path, n * sizeof(Path));
+			cudaMemcpy(tmp_path, path, n*sizeof(Path), cudaMemcpyDeviceToDevice);
+
+			int ceil_log2n = ilog2ceil(n);
+			int size = 1 << ceil_log2n;
+
+			dim3 fullBlocksPerGrid_2((size / 2 + blockSize - 1) / blockSize);
+			dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize);
+
+			cudaMalloc(&exist, size*sizeof(int));
+			kernAssignBool << <fullBlocksPerGrid, blockSize >> >(n, exist, path);
+			cudaMalloc(&scans, size*sizeof(int));
+
+			
+
+			
+
+			scan(size, scans, exist);
+
+
+			//compact
+
+			
+
+			kernScatter << <fullBlocksPerGrid, blockSize >> >(size, path,
+				tmp_path, exist, scans);
+			cudaDeviceSynchronize();
+
+			int hos_sum;
+			cudaMemcpy(&hos_sum, scans + size - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+			int hos_last;
+			cudaMemcpy(&hos_last, exist + size - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+
+			cudaFree(tmp_path);
+			//cudaFree(block_sum_array);
+			cudaFree(exist);
+			cudaFree(scans);
+
+			return hos_sum + hos_last;
+		}
+
+
+
+
+
+
+		/**
+		* Performs stream compaction on idata, storing the result into odata.
+		* All zeroes are discarded.
+		*
+		* @param n      The number of elements in idata.
+		* @param odata  The array into which to store elements.
+		* @param idata  The array of elements to compact.
+		* @returns      The number of elements remaining after compaction.
+		*/
+
+		//int compact(int n, int *odata, const int *idata) {
+		//	int hos_scans;
+		//	int hos_bools;
+		//	int * dev_bools;
+		//	int * dev_scans;
+		//	int * dev_idata;
+		//	int * dev_odata;
+		//	dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+		//	cudaMalloc((void**)&dev_bools, n * sizeof(int));
+		//	//checkCUDAError("cudaMalloc dev_bools failed");
+		//	cudaMalloc((void**)&dev_scans, n * sizeof(int));
+		//	//checkCUDAError("cudaMalloc dev_scans failed");
+		//	cudaMalloc((void**)&dev_idata, n * sizeof(int));
+		//	//checkCUDAError("cudaMalloc dev_idata failed");
+		//	cudaMalloc((void**)&dev_odata, n * sizeof(int));
+		//	//checkCUDAError("cudaMalloc dev_odata failed");
+
+		//	cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+		//	//checkCUDAError("cudaMemcpy from data to dev_data failed");
+		//	cudaDeviceSynchronize();
+
+		//	Common::kernMapToBoolean << < fullBlocksPerGrid, blockSize >> > (n, dev_bools, dev_idata);
+		//	cudaDeviceSynchronize();
+
+		//	//cudaMemcpy(hos_bools,dev_bools, n * sizeof(int),cudaMemcpyDeviceToHost);
+		//	//checkCUDAError("cudaMemcpy from data to dev_data failed");
+		//	//cudaDeviceSynchronize();
+
+		//	scan(n, dev_scans, dev_bools, true);
+
+		//	//cudaMemcpy(dev_scans,hos_scans, n * sizeof(int),cudaMemcpyHostToDevice);
+		//	//checkCUDAError("cudaMemcpy from hos_scans to dev_scans failed");
+		//	//cudaDeviceSynchronize();
+
+		//	Common::kernScatter << < fullBlocksPerGrid, blockSize >> >(n, dev_odata,
+		//		dev_idata, dev_bools, dev_scans);
+		//	cudaDeviceSynchronize();
+
+		//	cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+		//	//checkCUDAError("cudaMemcpy from dev_odata to odata failed");
+		//	//cudaDeviceSynchronize();
+
+		//	cudaMemcpy(&hos_scans, dev_scans + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+		//	//checkCUDAError("cudaMemcpy scans[n-1] failed");
+
+		//	cudaMemcpy(&hos_bools, dev_bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+		//	//checkCUDAError("cudaMemcpy bools[n-1] failed");
+
+		//	cudaDeviceSynchronize();
+
+
+
+		//	cudaFree(dev_idata);
+		//	cudaFree(dev_odata);
+		//	cudaFree(dev_bools);
+		//	cudaFree(dev_scans);
+
+		//	//int num = hos_scans[n-1] + hos_bools[n-1];
+		//	int num = hos_scans + hos_bools;
+		//	//delete[] hos_scans;
+		//	//delete[] hos_bools;
+
+		//	return num;
+		//}
+
+
+
+		
+
+		/**
+		* CPU stream compaction without using the scan function.
+		*
+		* @returns the number of elements remaining after compaction.
+		* odata can = idata, no race condition
+		*/
+		int compactWithoutScan(int n, Path *odata, const Path *idata) {
+			int r = 0;
+			for (int i = 0; i < n; i++)
+			{
+				if (!idata[i].terminated)
+				{
+					odata[r] = idata[i];
+					r++;
+				}
+			}
+			return r;
+		}
+
+
+		int cmpArrays(int n, Path *a, Path *b) {
+			int r = 0;
+			for (int i = 0; i < n; i++) {
+				if (a[i].image_index != b[i].image_index) {
+					printf("    a[%d] = %d, b[%d] = %d\n", i, (int)a[i].image_index, i, (int)b[i].image_index);
+					//return 1;
+					r = 1;
+				}
+			}
+			return r;
+		}
+		
+
+
+		void printArray(int n, Path *a, bool abridged) {
+			printf("    [ ");
+			for (int i = 0; i < n; i++) {
+				if (abridged && i + 2 == 15 && n > 16) {
+					i = n - 2;
+					printf("... ");
+				}
+				//printf("%d ", (int)a[i].terminated);
+				printf("%d ", a[i].image_index);
+			}
+			printf("]\n");
+		}
+
+
+
+
+
+
+	}
+}
diff --git a/stream_compaction/stream_compaction.h b/stream_compaction/stream_compaction.h
new file mode 100644
index 0000000..7977a5d
--- /dev/null
+++ b/stream_compaction/stream_compaction.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <src/sceneStructs.h>
+
+#define NUM_BANKS 16
+#define LOG_NUM_BANKS 4 
+//#define CONFLICT_FREE_OFFSET(n) \
+//	((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS))
+
+#define CONFLICT_FREE_OFFSET(n) \
+	(0)
+
+namespace StreamCompaction {
+namespace Efficient {
+	void scan(int size, int * scans, const int * bools);
+
+	int compact(int n, Path* path);
+
+
+	//cpu, used for test
+	int compactWithoutScan(int n, Path *odata, const Path *idata);
+	int cmpArrays(int n, Path *a, Path *b);
+
+	//int printArray(int n, Path *a);
+
+	void printArray(int n, Path *a, bool abridged = true);
+
+    //int compact(int n, int *odata, const int *idata);
+
+	//void radixSortLauncher(int n, int *odata, const int *idata, int msb,int lsb);
+}
+}