diff --git a/CMakeLists.txt b/CMakeLists.txt
index c473e2c..77e7c4f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,9 +71,11 @@ set(headers
src/pathtrace.h
src/scene.h
src/sceneStructs.h
+ src/tiny_obj_loader.h
src/preview.h
src/utilities.h
src/ImGui/imconfig.h
+ src/lbvh.h
src/ImGui/imgui.h
src/ImGui/imconfig.h
@@ -95,6 +97,7 @@ set(sources
src/scene.cpp
src/preview.cpp
src/utilities.cpp
+ src/lbvh.cu
src/ImGui/imgui.cpp
src/ImGui/imgui_demo.cpp
@@ -111,6 +114,7 @@ list(SORT sources)
source_group(Headers FILES ${headers})
source_group(Sources FILES ${sources})
+#add_subdirectory(src/libmorton)
#add_subdirectory(src/ImGui)
#add_subdirectory(stream_compaction) # TODO: uncomment if using your stream compaction
diff --git a/README.md b/README.md
index 110697c..10fdb2b 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,396 @@ CUDA Path Tracer
**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 3**
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Megan Reddy
+ * [LinkedIn](https://www.linkedin.com/in/meganr25a949125/), [personal website](https://meganr28.github.io/)
+* Tested on: Windows 10, AMD Ryzen 9 5900HS with Radeon Graphics @ 3301 MHz 16GB, NVIDIA GeForce RTX 3060 Laptop GPU 6GB (Personal Computer)
+* Compute Capability: 8.6
-### (TODO: Your README)
+### Overview
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+**Path tracing** is an algorithm for synthesizing images by simulating the physical properties of light. Some effects
+that you can obtain for free in a path tracer include global illumination, caustics, soft shadows, motion blur, and depth-of-field.
+Generating a converged, noise-free image involves tracing millions of rays for each scene. Tracing ray paths is an "embarrassingly parallel"
+problem, meaning that it can be easily converted into a parallel task on the GPU.
+
+This path tracer is implemented for the GPU using NVIDIA's CUDA API. It parallelizes several steps that would run serially on the CPU,
+greatly reducing rendering time from hours to minutes. This includes generating camera rays, computing ray intersections for each pixel,
+shading, and writing to the output image.
+
+
+
+
+A custom-made model of a violin mesh with reflective and refractive spheres (183,024 triangles)
+
+#### Features Implemented
+
+- Core Features
+ * Shading kernel for ideal diffuse, perfectly specular, and imperfectly specular surfaces
+ * Path continuation/termination with stream compaction
+ * Material sorting
+ * Caching first bounce intersections
+- Extra Features
+ * Performance
+ * Linear Bounding Volume Hierarchy (LBVH)
+ * Bounding Volume Hierarchy (with Midpoint and SAH split methods)
+ * Russian roulette ray termination
+ * Visual
+ * Refraction (with Fresnel)
+ * Depth-of-field
+ * Stochastic sampled anti-aliasing
+ * Arbitrary mesh loading (with bounding box culling)
+ * Reinhard operator and gamma correction (conversion to sRGB)
+
+### Usage
+
+#### Loading and Running a Scene
+
+In order to run a `.txt` file from the `scenes` folder, you must provide it as a command line argument. You can do this two ways:
+* Call the program with the argument: `cis565_path_tracer scenes/sphere.txt`
+* In Visual Studio, navigate to your project `Properties` and select `Configuration Properties -> Debugging -> Command Arguments` and provide the path to the scene file:
+ `../scenes/sphere.txt`. Note that you may need to provide the full path instead of the relative path.
+
+#### Macros
+There are several macros that enable the user to test out different performance features and visual improvements.
+They are located in the following files:
+
+##### pathtrace.cu
+
+* `ANTIALIASING` - jitter the camera ray direction to remove jagged edges from shapes.
+* `MATERIAL_SORT_` - sort materials so that the same materials are contiguous in memory.
+* `STREAM_COMPACTION_` - remove rays that have terminated early.
+* `CACHE_FIRST_BOUNCE` - cache first bounce intersections to use in subsequent iterations.
+
+##### utilities.h
+
+I have added these two macros to easily enable/disable features in this file.
+
+* `ENABLE` 1
+* `DISABLE` 0
+
+An example usage would be `RUSSIAN_ROULETTE ENABLE`.
+
+* `RUSSIAN_ROULETTE` - turn on to use russian roulette ray termination.
+* `CONVERT_TO_SRGB` - convert final image from HDR to sRGB.
+* `BB_CULLING` - first check if a ray intersects the bounding volume encompassing a mesh. If so, check all the triangles within the mesh.
+* `USE_LBVH` - use a Linear Bounding Volume Hierarchy to accelerate intersection testing.
+* `USE_BVH` - use a standard Bounding Volume Hierarchy to accelerate intersection testing. One of the split methods below must also be enabled.
+* `USE_BVH_MIDPOINT` - split BVH based on midpoint of primitives' centroids.
+* `USE_BVH_SAH` - split BVH based on Surface Area Heuristic.
+* `DISPLAY_HEATMAP` - enable heatmap visualization of BVH to diagnose areas with most intersections.
+
+### Visual Features
+
+#### Materials
+
+In path tracing, we sample a Bidirectional Scattering Distribution Function (BSDF) to choose a direction for the ray to bounce.
+This function describes the probability that light incoming along a ray `wi` will leave along a direction `wo`. In this implementation,
+there are four material types available: diffuse, perfectly specular reflective, imperfectly specular reflective, and specular transmissive.
+
+##### Diffuse
+
+Perfectly diffuse surfaces scatter illumination equally in all directions. In order to choose a new ray direction, we choose a random
+direction within the hemisphere centered about the normal (see `calculateRandomDirectionInHemisphere`).
+
+
+
+##### Perfect Specular Reflective
+
+Perfectly specular reflective surfaces only have one outgoing direction that contributes light energy. This single direction
+is computed using `glm::reflect` and is the reflection of `wo` about the surface normal.
+
+
+
+##### Imperfect Specular Reflective
+
+Imperfectly specular surfaces commonly include microfacet materials, however I opted to implement a more plastic-like material. To do
+this, I generated a random variable `xi` between 0 and 1. This represents the probability of choosing the Diffuse BRDF or Specular BRDF.
+We use this variable to choose one and then scatter the ray in that direction, multiplying the ray's throughput by the Fresnel term and diffuse or specular
+color, depending on which branch was chosen.
+
+
+
+##### Refraction (with Fresnel)
+
+For specular transmission, we use Snell's Law to compute the direction of the transmitted ray. If the angle of incidence is greater than a critical angle,
+we must reflect the ray instead. In the code, we check the return value of `glm::refract` to determine whether we should reflect or not. Additionally,
+we compute the Fresnel term and factor this into the color. The Fresnel effect describes the relationship between reflectance and viewing angle, where surfaces
+tend to be more reflective at grazing angles.
+
+
+
+#### Anti-Aliasing
+
+Anti-aliasing is a technique to remove jagged edges from images. The idea is to jitter the samples within a pixel, obtain the color
+from casting rays in those directions, and then average the samples to get the final color. Since we are already taking many samples
+per pixel (each sample is an "iteration"), we can get this for free by simply jittering the x and y pixel position by some random value between 0 and 1
+every time we generate a ray from the camera. This produces the "smoothed" appearance seen in the images below.
+
+| No Anti-Aliasing | Anti-Aliasing |
+:-------------------------:|:-------------------------:
+ | 
+
+#### Depth-of-field
+
+Depth-of-field is when objects that are within some focal distance appear in-focus whereas other objects appear out-of-focus.
+To achieve this effect, we sample a point on a concentric disk with radius `r`. This disk represents the camera aperture. Once we have
+this sample point, we add it to the ray origin and compute the focal distance as the distance between the camera `lookAt` point and
+`eye`. To use depth-of-field, be sure to set `LENS_RADIUS` to a value greater than zero in the scene file.
+
+| No Depth-of-Field | Depth-of-Field (Lens Radius: 0.5) |
+:-------------------------:|:-------------------------:
+ | 
+
+#### Reinhard Tone Mapping and Gamma Correction
+
+Before writing to the output image, the Reinhard operator is applied to map the output color from HDR to sRGB. Additionally,
+gamma correction is applied to control the brightness of the image. These can be toggled using the `CONVERT_TO_SRGB` macro in `utilities.h`.
+
+| No Tone Mapping or Gamma | Tone Mapping and Gamma |
+:-------------------------:|:-------------------------:
+ | 
+
+#### Mesh Loading
+
+This project relies on the `tinyobj` header file for loading meshes. In a scene file, the user must specify a mesh in the
+following format:
+
+```
+OBJECT 0
+mesh dragon.obj
+material 1
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 1 1 1
+```
+
+The `mesh` keyword must be followed by an `.obj` file to load. In this implementation, the transformations in the file are ignored
+since the transformations are already baked into the mesh being imported. Mesh loading involves iterating through all the faces in the mesh,
+constructing a `Triangle` struct, and pushing this back to a vector of triangles that lives in the `Scene` class. Each mesh is a `Geom` that stores
+an offset into this array of triangles.
+
+
+
+
+Two Dragons
+
+### Performance Improvements
+
+The following features are intended to improve total render time by reducing the total number of ray intersections
+and re-ordering materials so that they are contiguous in memory before shading.
+
+#### Testing Methodology
+
+To measure performance, I used a `PerformanceTimer` class with methods `startGpuTimer()`, `endGpuTimer()`,
+and `getGpuElapsedTimeForPreviousOperation()`. These methods are wrappers around calls to the CUDA runtime library.
+For measuring individual kernels, I placed a call to `startGpuTimer` and `endGpuTimer` around the kernel and called `getGpuElapsedTime` after
+the call to `endGpuTimer`. To measure the performance of one iteration, I summed the times for each kernel/thrust call within a single call to the
+`pathtrace` function. I could have also placed a timer around the `pathtrace` call, but I wanted to obtain timing without memory I/O and setup.
+
+During performance testing, I noticed that the timing for the first iteration is usually an outlier within the data
+(the value is either much higher or much lower than the rest of the iterations),
+which might be caused by the application startup/warmup being factored into the elapsed time. This number is factored into the averages below, meaning
+that the averages might be slightly higher/lower than normal, but still capture the performance of each feature well.
+
+#### Acceleration Structures
+
+For this project, I wanted to learn more about path tracer performance and acceleration structures, so I chose to implement a Linear Bounding Volume Hierarchy (LBVH), which is described further
+in [this paper](https://research.nvidia.com/sites/default/files/pubs/2012-06_Maximizing-Parallelism-in/karras2012hpg_paper.pdf).
+After implementing this feature, I was interested to see the comparison between an LBVH and a BVH that uses the Midpoint split method
+and the SAH split method, so I added those too. The BVH with both Midpoint and SAH splits is currently functional, but also suffers from a performance bug that may be caused by
+a bug in the splitting code. Since the LBVH was my main focus, I didn't quite polish this aspect of the BVH (therefore it runs slowly for larger triangle counts),
+but have still included it here in the discussion.
+
+In the future, I would also like to explore spatial-partitioning schemes such as an octree or kd-tree.
+
+##### Linear Bounding Volume Hierarchy
+
+The Linear Bounding Volume Hierarchy (LBVH) was first described in [this paper](https://luebke.us/publications/eg09.pdf) by Lauterbach, et al. (2009). The basic idea
+is to order the primitives (in this case triangles) in a way that will place primitives that are close to each other in 3D space
+close to each other in the tree structure. To do this, we sort the triangles by the centroid of their bounding boxes expressed relative to the "scene" bounding box. In other words,
+we compute a normalized centroid:
+
+```glm::vec3 norm_centroid = (centroid - sceneAABB.min) / (sceneAABB.max - sceneAABB.min);```
+
+Each normalized centroid is then converted into a Morton code. Once we have the Morton codes, we sort them using `std::sort`. The paper mentions using parallel radix sort, which is a good option if we're doing construction on the GPU. Since the construction
+is being done on the CPU, it suffices to use the C++ STL function. After sorting, we start building the tree. The method presented in Karras (2012) uses a numbering scheme to assign a range of Morton codes
+to each tree node. This numbering scheme was developed so that construction could be parallelized, but I followed the same method in case I wish to do GPU construction later on. The most important part is the split
+criteria - the nodes are split according to their highest differing bit in their Morton Codes. This diagram demonstrates this idea:
+
+
+
+
+Linear Bounding Volume Hierarchy Split Based on Highest Differing Bit (credit: NVIDIA)
+
+After splitting the nodes, we construct the bounding boxes in a bottom-to-top fashion by first assigning boxes to the leaf nodes and then constructing
+the parents' bounding boxes based off of those. This is to ensure less overlap between bounding boxes, but it is not perfect.
+
+To traverse the LBVH, we use an iterative traversal with a per-thread stack. For each node, we check whether the current ray intersects the children, and depending
+on the results, we choose whether to keep traversing the left subtree, right subtree, or both. If we choose to traverse both, we move to the left child, and push the right
+child onto the stack. This essentially allows us to search the entire left subtree first before moving to the right-hand side. One downside of the LBVH is that it is optimized for
+fast construction, which isn't really necessary for this path tracer since the construction is only done once at startup. If this were a dynamic scene with moving objects, then it would have been more important
+to have optimized construction. As a result, this can lead to less-optimal or unbalanced tree structures.
+
+##### Bounding Volume Hierarchy (Midpoint and SAH)
+
+Initially, I was only planning to implement the LBVH, but after getting extremely slow runtimes for it,
+I decided to try out a BVH with Midpoint split. Turns out, the LBVH was working fine, I just had a minor bug in my AABB intersection
+test! (see the bloopers below). After I fixed that, the LBVH outperformed the BVH.
+
+The first method I implemented was splitting by the midpoint of the triangles' centroids. This gave some performance improvement, but led to very
+unbalanced trees. For the teapot, it placed close to 2000 triangles in one leaf node. I believe there is another bug that's causing this imbalance, but in
+general it is possible for Midpoint split to result in very unbalanced trees. Since the Midpoint split was inefficient, I gave the Surface Area Heuristic (SAH)
+a try. In this method, we compute the cost of splitting along a particular axis at a triangle's centroid. This means we perform `3 * num_triangles` cost checks
+to determine a split. The cost function is dependent on the number of triangles that would be placed in each child of the current node as well as
+the surface area of those boxes. We give a higher probability of being hit by a ray to larger boxes. After implementing this heuristic, the trees were more balanced,
+but the construction time had increased immensely because of the cost checks. The construction time was too long for the Stanford Bunny and Stanford Dragon, so I have omitted these from the
+graphs.
+
+##### Bounding Box Culling
+
+A basic optimization I implemented was bounding box culling for loaded meshes. This can be toggled with `BB_CULLING` in `utilities.h`.
+I calculated a bounding box for the entire mesh once loaded, and if the ray hits this box during intersection testing, it will check all its triangles.
+If it misses, it will skip the entire mesh. Bounding box culling works best if the mesh takes up a smaller portion of the screen. If the mesh is very large,
+the probability of hitting the bounding box and checking all the triangles is higher.
+
+The following meshes were used to analyze the performance of each acceleration structure:
+
+| Teapot | Bunny | Dragon
+:-------------------------:|:-------------------------:|:-------------------------:
+ |  | 
+
+Performance comparisons for each acceleration structure can be found in the table and chart below:
+
+
+
+
+Table 1. Performance Comparison of Different Acceleration Structures
+
+
+
+
+Figure 1. Performance Comparison of Different Acceleration Structures
+
+The LBVH performed the best out of all optimizations. Even for meshes with high triangle counts, it still ran much faster and did not
+suffer from a large performance drop like the rest of the implementations. One cause of this is its balanced tree structure and ordering of
+the nodes (see heatmap below). Sibling nodes are always next to each other in memory. The performance bug in the BVH implementations is apparent from the table and chart, since the
+runtimes are unusually long.
+
+
+
+
+Stanford Dragon Heatmap
+
+The following performance tests were done using the scenes below. A closed version of the scene is provided for use during stream compaction analysis.
+
+| Pastel Room Open | Pastel Room Closed |
+:-------------------------:|:-------------------------:
+ | 
+
+#### Path Termination with Stream Compaction
+
+Stream compaction helps remove terminated rays after each bounce. This ensures that we spend less time performing computations
+for rays that will contribute no light energy to the final image. This graph shows the effect of performing stream compaction
+within a single iteration for an open scene and a closed scene. Without stream compaction, the total number of ray paths would be
+800,000 for this scene (resolution 1000x800).
+
+
+
+
+Figure 2. Stream Compaction Performance, Unterminated Rays
+
+
+
+
+Figure 3. Stream Compaction Performance, Time in Intersections and Shading
+
+From the charts, it is evident that stream compaction lowers the amount of time spent in the intersection and shading kernels
+and reduces time wasted on terminated rays. The contrast between open and closed scenes is also apparent, where stream compaction removed
+significantly more rays from the open scene than the closed scene. This is expected because the rays have an easier way to "escape" out of the scene,
+whereas a closed room does not allow them to terminate as quickly since they can still bounce around for multiple depths.
+
+#### Russian Roulette Ray Termination
+
+Russian roulette is an optimization technique intended to terminate ray paths that contribute very little to the final result. This
+involves a simple check of whether some random value `xi` is less than a value based on the `maxColorChannel` and terminating the path if so.
+If it is not, we boost this path's contribution by dividing by `maxColorChannel` and continue iterating. This is to account for the contribution
+of paths that would have contributed to the pixel color, but have terminated due to Russian roulette.
+
+
+
+
+Figure 4. Russian Roulette Performance Impact
+
+Looking at this graph, it seems as if turning Russian Roulette off is faster. However, the numbers are very close and do not have
+an extremely noticeable performance impact. One possible reason it could take slightly longer when enabled is that Russian Roulette adds
+a branch condition in the shading kernel, which could lead to thread divergence.
+
+#### Caching First Bounce Intersections
+
+Since the result of the first intersection is deterministic, we can cache the first bounce intersections and save
+them for future iterations. This will save us from performing one extra `computeIntersections` call per run. Note
+that this feature cannot be used with anti-aliasing or depth-of-field since we jitter the camera rays, which may result
+in different outcomes for the first bounce intersection.
+
+
+
+
+Figure 5. Caching vs. No Caching Across Iterations (Total Elapsed Time)
+
+
+
+
+Figure 6. Caching vs. No Caching For 10 Iterations With Varying Trace Depth
+
+Caching first bounce intersections had a very subtle effect, but still improved performance. Overall, it
+decreased the amount of time taken during each iteration since we are reducing the number of `computeIntersections`
+calls by 1.
+
+#### Material Sorting
+
+Shading computations can take different amounts of time depending on the type of material. To further improve performance,
+we can sort materials by type after intersections are computed so that the same materials are contiguous in memory. This ensures
+that materials with similar computational complexity will be executed together.
+
+
+
+
+Figure 7. Shading Kernel Execution Time, Sorting vs. No Sorting
+
+Based on this graph, material sorting did improve performance for this scene. I think this is slightly surprising because
+when I tested simpler scenes, it led to a major performance decrease. I suspected that it was because there are only a
+handful of materials in the scene, making the benefits of sorting unnoticeable. My assumption was that sorting will only improve performance
+when there is a large number of different materials in the scene, but it was nice to see that re-ordering the materials had a positive impact
+for this particular scene.
+
+#### Bloopers
+
+**Broken AABB** - Buggy AABB Intersection Test
+
+
+
+**Chrome Bunny** - Flat Normals
+
+
+**Crazy Reflections**
+
+
+#### References
+
+* [Tero Karras' Blog - Linear Bounding Volume Hierarchy](https://developer.nvidia.com/blog/thinking-parallel-part-ii-tree-traversal-gpu/)
+* [Paper - Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d Trees](https://research.nvidia.com/sites/default/files/pubs/2012-06_Maximizing-Parallelism-in/karras2012hpg_paper.pdf)
+* [Surface Area Heuristic - Jacco Bikker](https://jacco.ompf2.com/2022/04/18/how-to-build-a-bvh-part-2-faster-rays/)
+* [Raycasting AABBs](https://gdbooks.gitbooks.io/3dcollisions/content/Chapter3/raycast_aabb.html)
+* [Fast, Branchless Ray/Bounding Box Intersections](https://tavianator.com/2011/ray_box.html)
+* Physically Based Rendering: From Theory to Implementation - Pharr, Jakob, Humphreys
+
+##### Third Party Libraries
+
+* tinyobj
+* libmorton (included but not used)
+
+##### Models
+
+* Stanford Dragon - [The Stanford 3D Scanning Repository](http://graphics.stanford.edu/data/3Dscanrep/)
+* Stanford Bunny and Teapot - [Alec Jacobson's Common 3D Test Models](https://github.com/alecjacobson/common-3d-test-models)
diff --git a/img/bloopers/cornell.2022-09-22_diffuse_1_bounce.png b/img/bloopers/cornell.2022-09-22_diffuse_1_bounce.png
new file mode 100644
index 0000000..42fff27
Binary files /dev/null and b/img/bloopers/cornell.2022-09-22_diffuse_1_bounce.png differ
diff --git a/img/bloopers/cornell.2022-09-22_diffuse_2_bounces.png b/img/bloopers/cornell.2022-09-22_diffuse_2_bounces.png
new file mode 100644
index 0000000..429e325
Binary files /dev/null and b/img/bloopers/cornell.2022-09-22_diffuse_2_bounces.png differ
diff --git a/img/bloopers/cornell.2022-09-22_diffuse_3_bounces.png b/img/bloopers/cornell.2022-09-22_diffuse_3_bounces.png
new file mode 100644
index 0000000..1889df0
Binary files /dev/null and b/img/bloopers/cornell.2022-09-22_diffuse_3_bounces.png differ
diff --git a/img/bloopers/cornell.2022-09-22_init_program.png b/img/bloopers/cornell.2022-09-22_init_program.png
new file mode 100644
index 0000000..6b5453d
Binary files /dev/null and b/img/bloopers/cornell.2022-09-22_init_program.png differ
diff --git a/img/bloopers/cornell.2022-09-23_diffuse_rngfix_2_bounces.png b/img/bloopers/cornell.2022-09-23_diffuse_rngfix_2_bounces.png
new file mode 100644
index 0000000..629f93a
Binary files /dev/null and b/img/bloopers/cornell.2022-09-23_diffuse_rngfix_2_bounces.png differ
diff --git a/img/bloopers/cornell.2022-09-24_diffuse-5000samp-depth8.png b/img/bloopers/cornell.2022-09-24_diffuse-5000samp-depth8.png
new file mode 100644
index 0000000..2a11cef
Binary files /dev/null and b/img/bloopers/cornell.2022-09-24_diffuse-5000samp-depth8.png differ
diff --git a/img/bloopers/cornell.2022-09-24_diffuse_rngfix_3_bounces.png b/img/bloopers/cornell.2022-09-24_diffuse_rngfix_3_bounces.png
new file mode 100644
index 0000000..16aa68c
Binary files /dev/null and b/img/bloopers/cornell.2022-09-24_diffuse_rngfix_3_bounces.png differ
diff --git a/img/bloopers/cornell.2022-09-24_switch_diffuse_spec_probability.png b/img/bloopers/cornell.2022-09-24_switch_diffuse_spec_probability.png
new file mode 100644
index 0000000..4c0a781
Binary files /dev/null and b/img/bloopers/cornell.2022-09-24_switch_diffuse_spec_probability.png differ
diff --git a/img/bloopers/cornell.2022-10-06_04-26-16z.5000samp.png b/img/bloopers/cornell.2022-10-06_04-26-16z.5000samp.png
new file mode 100644
index 0000000..4a961e2
Binary files /dev/null and b/img/bloopers/cornell.2022-10-06_04-26-16z.5000samp.png differ
diff --git a/img/bloopers/cornell.2022-10-06_05-12-34z.5000samp.png b/img/bloopers/cornell.2022-10-06_05-12-34z.5000samp.png
new file mode 100644
index 0000000..913fedf
Binary files /dev/null and b/img/bloopers/cornell.2022-10-06_05-12-34z.5000samp.png differ
diff --git a/img/cornell.2022-09-24_depth8_5000samp_nocompact.png b/img/cornell.2022-09-24_depth8_5000samp_nocompact.png
new file mode 100644
index 0000000..3ee2e6b
Binary files /dev/null and b/img/cornell.2022-09-24_depth8_5000samp_nocompact.png differ
diff --git a/img/cornell.2022-09-24_depth8_5000samp_spec.png b/img/cornell.2022-09-24_depth8_5000samp_spec.png
new file mode 100644
index 0000000..bc02c5a
Binary files /dev/null and b/img/cornell.2022-09-24_depth8_5000samp_spec.png differ
diff --git a/img/cornell.2022-09-24_depth8_5000samp_withcompact.png b/img/cornell.2022-09-24_depth8_5000samp_withcompact.png
new file mode 100644
index 0000000..7f69499
Binary files /dev/null and b/img/cornell.2022-09-24_depth8_5000samp_withcompact.png differ
diff --git a/img/cornell.2022-09-25_depth8_5000samp_mat_nocache.png b/img/cornell.2022-09-25_depth8_5000samp_mat_nocache.png
new file mode 100644
index 0000000..ffd52b7
Binary files /dev/null and b/img/cornell.2022-09-25_depth8_5000samp_mat_nocache.png differ
diff --git a/img/cornell.2022-09-25_depth8_5000samples_nomat_nocache.png b/img/cornell.2022-09-25_depth8_5000samples_nomat_nocache.png
new file mode 100644
index 0000000..bc02c5a
Binary files /dev/null and b/img/cornell.2022-09-25_depth8_5000samples_nomat_nocache.png differ
diff --git a/img/cornell.2022-09-27_depth8_5000samp_antialiasing.png b/img/cornell.2022-09-27_depth8_5000samp_antialiasing.png
new file mode 100644
index 0000000..c62c93f
Binary files /dev/null and b/img/cornell.2022-09-27_depth8_5000samp_antialiasing.png differ
diff --git a/img/cornell.2022-09-27_depth8_5000samp_dof.png b/img/cornell.2022-09-27_depth8_5000samp_dof.png
new file mode 100644
index 0000000..c20c505
Binary files /dev/null and b/img/cornell.2022-09-27_depth8_5000samp_dof.png differ
diff --git a/img/cornell.2022-09-27_depth8_5000samp_noantialiasing.png b/img/cornell.2022-09-27_depth8_5000samp_noantialiasing.png
new file mode 100644
index 0000000..01050b7
Binary files /dev/null and b/img/cornell.2022-09-27_depth8_5000samp_noantialiasing.png differ
diff --git a/img/cornell.2022-09-28_21-01-03z.5000samp.png b/img/cornell.2022-09-28_21-01-03z.5000samp.png
new file mode 100644
index 0000000..43b8c67
Binary files /dev/null and b/img/cornell.2022-09-28_21-01-03z.5000samp.png differ
diff --git a/img/cornell.2022-10-09_13-47-52z.5000samp.png b/img/cornell.2022-10-09_13-47-52z.5000samp.png
new file mode 100644
index 0000000..f05d851
Binary files /dev/null and b/img/cornell.2022-10-09_13-47-52z.5000samp.png differ
diff --git a/img/cornell.2022-10-10_03-25-47z.5000samp.png b/img/cornell.2022-10-10_03-25-47z.5000samp.png
new file mode 100644
index 0000000..5fbbfe3
Binary files /dev/null and b/img/cornell.2022-10-10_03-25-47z.5000samp.png differ
diff --git a/img/dragon_5000_lbvh.png b/img/dragon_5000_lbvh.png
new file mode 100644
index 0000000..3e4eacd
Binary files /dev/null and b/img/dragon_5000_lbvh.png differ
diff --git a/img/final/acceleration_bunny.PNG b/img/final/acceleration_bunny.PNG
new file mode 100644
index 0000000..73a2549
Binary files /dev/null and b/img/final/acceleration_bunny.PNG differ
diff --git a/img/final/acceleration_dragon.PNG b/img/final/acceleration_dragon.PNG
new file mode 100644
index 0000000..032cd98
Binary files /dev/null and b/img/final/acceleration_dragon.PNG differ
diff --git a/img/final/acceleration_teapot.PNG b/img/final/acceleration_teapot.PNG
new file mode 100644
index 0000000..63c5870
Binary files /dev/null and b/img/final/acceleration_teapot.PNG differ
diff --git a/img/final/anti-aliasing-zoomed.PNG b/img/final/anti-aliasing-zoomed.PNG
new file mode 100644
index 0000000..15106fd
Binary files /dev/null and b/img/final/anti-aliasing-zoomed.PNG differ
diff --git a/img/final/anti-aliasing.PNG b/img/final/anti-aliasing.PNG
new file mode 100644
index 0000000..a40aa38
Binary files /dev/null and b/img/final/anti-aliasing.PNG differ
diff --git a/img/final/antialiased-zoomed.PNG b/img/final/antialiased-zoomed.PNG
new file mode 100644
index 0000000..b7c9218
Binary files /dev/null and b/img/final/antialiased-zoomed.PNG differ
diff --git a/img/final/diffuse.PNG b/img/final/diffuse.PNG
new file mode 100644
index 0000000..a8a78a0
Binary files /dev/null and b/img/final/diffuse.PNG differ
diff --git a/img/final/dof.PNG b/img/final/dof.PNG
new file mode 100644
index 0000000..e0515c9
Binary files /dev/null and b/img/final/dof.PNG differ
diff --git a/img/final/gamma.PNG b/img/final/gamma.PNG
new file mode 100644
index 0000000..4be40cf
Binary files /dev/null and b/img/final/gamma.PNG differ
diff --git a/img/final/heatmap.png b/img/final/heatmap.png
new file mode 100644
index 0000000..1b7373c
Binary files /dev/null and b/img/final/heatmap.png differ
diff --git a/img/final/imperfect_specular.PNG b/img/final/imperfect_specular.PNG
new file mode 100644
index 0000000..1e85e27
Binary files /dev/null and b/img/final/imperfect_specular.PNG differ
diff --git a/img/final/mesh_loading.PNG b/img/final/mesh_loading.PNG
new file mode 100644
index 0000000..5894799
Binary files /dev/null and b/img/final/mesh_loading.PNG differ
diff --git a/img/final/no-anti-aliasing-zoomed.png b/img/final/no-anti-aliasing-zoomed.png
new file mode 100644
index 0000000..ffa8f3c
Binary files /dev/null and b/img/final/no-anti-aliasing-zoomed.png differ
diff --git a/img/final/no-anti-aliasing.PNG b/img/final/no-anti-aliasing.PNG
new file mode 100644
index 0000000..4de1ed6
Binary files /dev/null and b/img/final/no-anti-aliasing.PNG differ
diff --git a/img/final/noantialiased-zoomed.PNG b/img/final/noantialiased-zoomed.PNG
new file mode 100644
index 0000000..beff402
Binary files /dev/null and b/img/final/noantialiased-zoomed.PNG differ
diff --git a/img/final/nogamma.PNG b/img/final/nogamma.PNG
new file mode 100644
index 0000000..3b21e2c
Binary files /dev/null and b/img/final/nogamma.PNG differ
diff --git a/img/final/perfect_specular.PNG b/img/final/perfect_specular.PNG
new file mode 100644
index 0000000..e4b487a
Binary files /dev/null and b/img/final/perfect_specular.PNG differ
diff --git a/img/final/refractive.PNG b/img/final/refractive.PNG
new file mode 100644
index 0000000..c0e81e1
Binary files /dev/null and b/img/final/refractive.PNG differ
diff --git a/img/final/stream_compact_closed.PNG b/img/final/stream_compact_closed.PNG
new file mode 100644
index 0000000..a0413be
Binary files /dev/null and b/img/final/stream_compact_closed.PNG differ
diff --git a/img/final/stream_compact_open - Copy.PNG b/img/final/stream_compact_open - Copy.PNG
new file mode 100644
index 0000000..9324756
Binary files /dev/null and b/img/final/stream_compact_open - Copy.PNG differ
diff --git a/img/final/stream_compact_open.PNG b/img/final/stream_compact_open.PNG
new file mode 100644
index 0000000..9324756
Binary files /dev/null and b/img/final/stream_compact_open.PNG differ
diff --git a/img/final/stream_compact_open_labels.PNG b/img/final/stream_compact_open_labels.PNG
new file mode 100644
index 0000000..c2866a8
Binary files /dev/null and b/img/final/stream_compact_open_labels.PNG differ
diff --git a/img/final/violin.PNG b/img/final/violin.PNG
new file mode 100644
index 0000000..1ab1f6c
Binary files /dev/null and b/img/final/violin.PNG differ
diff --git a/img/glass_ball.2022-10-09_17-19-17z.5000samp.png b/img/glass_ball.2022-10-09_17-19-17z.5000samp.png
new file mode 100644
index 0000000..4d8fb62
Binary files /dev/null and b/img/glass_ball.2022-10-09_17-19-17z.5000samp.png differ
diff --git a/img/graphs/acceleration_structures.png b/img/graphs/acceleration_structures.png
new file mode 100644
index 0000000..8bd1a7b
Binary files /dev/null and b/img/graphs/acceleration_structures.png differ
diff --git a/img/graphs/caching_bounces.png b/img/graphs/caching_bounces.png
new file mode 100644
index 0000000..e3c8b57
Binary files /dev/null and b/img/graphs/caching_bounces.png differ
diff --git a/img/graphs/caching_elapsed_time.png b/img/graphs/caching_elapsed_time.png
new file mode 100644
index 0000000..2871e1e
Binary files /dev/null and b/img/graphs/caching_elapsed_time.png differ
diff --git a/img/graphs/lbvh.png b/img/graphs/lbvh.png
new file mode 100644
index 0000000..58501aa
Binary files /dev/null and b/img/graphs/lbvh.png differ
diff --git a/img/graphs/material_sorting.png b/img/graphs/material_sorting.png
new file mode 100644
index 0000000..617adbb
Binary files /dev/null and b/img/graphs/material_sorting.png differ
diff --git a/img/graphs/russian_roulette.png b/img/graphs/russian_roulette.png
new file mode 100644
index 0000000..69327c7
Binary files /dev/null and b/img/graphs/russian_roulette.png differ
diff --git a/img/graphs/stream_compact_kernels.png b/img/graphs/stream_compact_kernels.png
new file mode 100644
index 0000000..89497ad
Binary files /dev/null and b/img/graphs/stream_compact_kernels.png differ
diff --git a/img/graphs/stream_compact_rays.png b/img/graphs/stream_compact_rays.png
new file mode 100644
index 0000000..df3cf38
Binary files /dev/null and b/img/graphs/stream_compact_rays.png differ
diff --git a/img/graphs/table.png b/img/graphs/table.png
new file mode 100644
index 0000000..3d1871f
Binary files /dev/null and b/img/graphs/table.png differ
diff --git a/img/mesh.2022-10-10_00-39-44z.5000samp.png b/img/mesh.2022-10-10_00-39-44z.5000samp.png
new file mode 100644
index 0000000..6781e4f
Binary files /dev/null and b/img/mesh.2022-10-10_00-39-44z.5000samp.png differ
diff --git a/img/performance_testing.2022-10-11_06-11-40z.10samp.png b/img/performance_testing.2022-10-11_06-11-40z.10samp.png
new file mode 100644
index 0000000..cd60668
Binary files /dev/null and b/img/performance_testing.2022-10-11_06-11-40z.10samp.png differ
diff --git a/scenes/bunny-tree.txt b/scenes/bunny-tree.txt
new file mode 100644
index 0000000..fc80004
--- /dev/null
+++ b/scenes/bunny-tree.txt
@@ -0,0 +1,159 @@
+// Emissive material (light)
+MATERIAL 0
+RGB 1 1 1
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 40
+
+// Diffuse white
+MATERIAL 1
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse red
+MATERIAL 2
+RGB .85 .35 .35
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse aqua
+MATERIAL 3
+RGB 0.4 0.9 0.6
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse grey
+MATERIAL 4
+RGB 0.6 0.6 0.6
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse blue
+MATERIAL 5
+RGB 0.5 0.5 0.9
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Specular pink
+MATERIAL 6
+RGB 0 0 0
+SPECEX 0
+SPECRGB 0.9 0.6 0.9
+REFL 1
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Transmissive orange
+MATERIAL 7
+RGB 0 0 0
+SPECEX 0
+SPECRGB 0.98 0.98 0.98
+REFL 1
+REFR 1
+REFRIOR 1.55
+EMITTANCE 0
+
+// Imperfect specular green
+MATERIAL 8
+RGB 0.6 1.0 0.5
+SPECEX 0
+SPECRGB 0.98 0.98 0.98
+REFL 1
+REFR 0
+REFRIOR 1.55
+EMITTANCE 0
+
+// Camera
+CAMERA
+RES 800 800
+FOVY 35
+ITERATIONS 3
+DEPTH 8
+FILE performance_testing
+LENS_RADIUS 0.0
+FOCAL_DIST 29.5
+EYE 0.0 5 10.5
+LOOKAT 0 5 0
+UP 0 1 0
+
+
+// Ceiling light middle
+OBJECT 0
+cube
+material 0
+TRANS 0 10 0
+ROTAT 0 0 0
+SCALE 3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 15 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 2
+TRANS 0 10 0
+ROTAT 0 0 90
+SCALE .01 20 10
+
+// Back wall
+OBJECT 3
+cube
+material 4
+TRANS 0 5 -5
+ROTAT 0 90 0
+SCALE .01 10 15
+
+// Left wall
+OBJECT 4
+cube
+material 3
+TRANS -7 5 0
+ROTAT 0 0 0
+SCALE .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS 7 5 0
+ROTAT 0 0 0
+SCALE .01 10 10
+
+// Mesh
+OBJECT 6
+mesh dragon.obj
+material 8
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 1 1 1
diff --git a/scenes/cornell.txt b/scenes/cornell.txt
index 83ff820..8f0954f 100644
--- a/scenes/cornell.txt
+++ b/scenes/cornell.txt
@@ -6,7 +6,7 @@ SPECRGB 0 0 0
REFL 0
REFR 0
REFRIOR 0
-EMITTANCE 5
+EMITTANCE 10
// Diffuse white
MATERIAL 1
@@ -40,9 +40,9 @@ EMITTANCE 0
// Specular white
MATERIAL 4
-RGB .98 .98 .98
+RGB 0 0 0
SPECEX 0
-SPECRGB .98 .98 .98
+SPECRGB 0 .98 .98
REFL 1
REFR 0
REFRIOR 0
@@ -51,11 +51,13 @@ EMITTANCE 0
// Camera
CAMERA
RES 800 800
-FOVY 45
+FOVY 19.5
ITERATIONS 5000
DEPTH 8
FILE cornell
-EYE 0.0 5 10.5
+LENS_RADIUS 0.5
+FOCAL_DIST 29.5
+EYE 0.0 5 15.5
LOOKAT 0 5 0
UP 0 1 0
diff --git a/scenes/dragon.txt b/scenes/dragon.txt
new file mode 100644
index 0000000..27d7911
--- /dev/null
+++ b/scenes/dragon.txt
@@ -0,0 +1,119 @@
+// Emissive material (light)
+MATERIAL 0
+RGB 1 1 1
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 40
+
+// Diffuse white
+MATERIAL 1
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse red
+MATERIAL 2
+RGB .85 .35 .35
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse green
+MATERIAL 3
+RGB .35 .85 .35
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Specular white
+MATERIAL 4
+RGB 0.13 0.13 0.13
+SPECEX 0
+SPECRGB 0.98 0.98 0.98
+REFL 1
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Camera
+CAMERA
+RES 800 800
+FOVY 35
+ITERATIONS 5000
+DEPTH 5
+FILE mesh_cornell
+LENS_RADIUS 0.0
+FOCAL_DIST 29.5
+EYE 0.0 5 10.5
+LOOKAT 0 5 0
+UP 0 1 0
+
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS 0 10 0
+ROTAT 0 0 0
+SCALE 3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 20 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 2
+TRANS 0 10 0
+ROTAT 0 0 90
+SCALE .01 20 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS 0 5 -5
+ROTAT 0 90 0
+SCALE .01 10 20
+
+// Left wall
+OBJECT 4
+cube
+material 3
+TRANS -10 5 0
+ROTAT 0 0 0
+SCALE .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS 10 5 0
+ROTAT 0 0 0
+SCALE .01 10 10
+
+// Mesh
+OBJECT 6
+mesh dragon.obj
+material 4
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 1 1 1
diff --git a/scenes/glass_ball.txt b/scenes/glass_ball.txt
new file mode 100644
index 0000000..4ae9e96
--- /dev/null
+++ b/scenes/glass_ball.txt
@@ -0,0 +1,119 @@
+// Emissive material (light)
+MATERIAL 0
+RGB 1 1 1
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 40
+
+// Diffuse white
+MATERIAL 1
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse red
+MATERIAL 2
+RGB .85 .35 .35
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse green
+MATERIAL 3
+RGB .35 .85 .35
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Glass (reflective + refractive)
+MATERIAL 4
+RGB 0 0 1
+SPECEX 0
+SPECRGB 0.98 0.98 0.98
+REFL 1
+REFR 1
+REFRIOR 1.55
+EMITTANCE 0
+
+// Camera
+CAMERA
+RES 800 800
+FOVY 19.5
+ITERATIONS 5000
+DEPTH 5
+FILE glass_ball
+LENS_RADIUS 0.0
+FOCAL_DIST 29.5
+EYE 0.0 5 15.5
+LOOKAT 0 5 0
+UP 0 1 0
+
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS 0 10 0
+ROTAT 0 0 0
+SCALE 3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS 0 10 0
+ROTAT 0 0 90
+SCALE .01 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS 0 5 -5
+ROTAT 0 90 0
+SCALE .01 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS -5 5 0
+ROTAT 0 0 0
+SCALE .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS 5 5 0
+ROTAT 0 0 0
+SCALE .01 10 10
+
+// Sphere
+OBJECT 6
+sphere
+material 4
+TRANS 0 4 0
+ROTAT 0 0 0
+SCALE 6 6 6
diff --git a/scenes/materials.txt b/scenes/materials.txt
new file mode 100644
index 0000000..fd58acc
--- /dev/null
+++ b/scenes/materials.txt
@@ -0,0 +1,167 @@
+// Emissive material (light)
+MATERIAL 0
+RGB 1 1 1
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 20
+
+// Diffuse white
+MATERIAL 1
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse red
+MATERIAL 2
+RGB .85 .35 .35
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse aqua
+MATERIAL 3
+RGB 0.4 0.9 0.6
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse grey
+MATERIAL 4
+RGB 0.4 0.4 0.4
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Transmissive purple (reflective + refractive)
+MATERIAL 5
+RGB 0 0 0
+SPECEX 0
+SPECRGB 0.7 0.3 0.9
+REFL 1
+REFR 1
+REFRIOR 1.55
+EMITTANCE 0
+
+// Specular pink
+MATERIAL 6
+RGB 0 0 0
+SPECEX 0
+SPECRGB 0.9 0.6 0.9
+REFL 1
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Transmissive orange
+MATERIAL 7
+RGB 0 0 0
+SPECEX 0
+SPECRGB 0.98 0.98 0.98
+REFL 1
+REFR 1
+REFRIOR 1.55
+EMITTANCE 0
+
+// Imperfect specular green
+MATERIAL 8
+RGB 0.6 1.0 0.5
+SPECEX 0
+SPECRGB 0.98 0.98 0.98
+REFL 1
+REFR 0
+REFRIOR 1.55
+EMITTANCE 0
+
+// Camera
+CAMERA
+RES 800 800
+FOVY 35
+ITERATIONS 5000
+DEPTH 5
+FILE performance_testing
+LENS_RADIUS 0.0
+FOCAL_DIST 29.5
+EYE 0.0 5 10.5
+LOOKAT 0 5 0
+UP 0 1 0
+
+
+// Ceiling light middle
+OBJECT 0
+cube
+material 0
+TRANS 0 10 0
+ROTAT 0 0 0
+SCALE 3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 40 .01 40
+
+// Ceiling
+OBJECT 2
+cube
+material 4
+TRANS 0 10 0
+ROTAT 0 0 90
+SCALE .01 40 40
+
+// Back wall
+OBJECT 3
+cube
+material 4
+TRANS 0 5 -5
+ROTAT 0 90 0
+SCALE .01 20 40
+
+// Left wall
+OBJECT 4
+cube
+material 3
+TRANS -20 5 0
+ROTAT 0 0 0
+SCALE .01 10 40
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS 20 5 0
+ROTAT 0 0 0
+SCALE .01 10 40
+
+// Mesh
+OBJECT 6
+mesh violin.obj
+material 5
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 1 1 1
+
+// Sphere
+OBJECT 7
+sphere
+material 5
+TRANS 1 1.25 1.5
+ROTAT 0 0 0
+SCALE 2.5 2.5 2.5
diff --git a/scenes/mesh.txt b/scenes/mesh.txt
new file mode 100644
index 0000000..9bdea1c
--- /dev/null
+++ b/scenes/mesh.txt
@@ -0,0 +1,30 @@
+// Emissive material (light)
+MATERIAL 0
+RGB 1 1 1
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 5
+
+// Camera
+CAMERA
+RES 800 800
+FOVY 45
+ITERATIONS 5000
+DEPTH 8
+FILE mesh
+LENS_RADIUS 0.0
+FOCAL_DIST 29.5
+EYE 0.0 5 10.5
+LOOKAT 0 5 0
+UP 0 1 0
+
+// Mesh
+OBJECT 0
+mesh dragons.obj
+material 0
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 1 1 1
diff --git a/scenes/mesh_cornell.txt b/scenes/mesh_cornell.txt
new file mode 100644
index 0000000..c06a0c7
--- /dev/null
+++ b/scenes/mesh_cornell.txt
@@ -0,0 +1,119 @@
+// Emissive material (light)
+MATERIAL 0
+RGB 1 1 1
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 20
+
+// Diffuse white
+MATERIAL 1
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse red
+MATERIAL 2
+RGB .85 .35 .35
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse green
+MATERIAL 3
+RGB .35 .85 .35
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse blue
+MATERIAL 4
+RGB 0 0.13 0.84
+SPECEX 0
+SPECRGB 0.98 0.98 0.98
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Camera
+CAMERA
+RES 800 800
+FOVY 45
+ITERATIONS 5000
+DEPTH 5
+FILE mesh_cornell
+LENS_RADIUS 0.0
+FOCAL_DIST 29.5
+EYE 0.0 5 10.5
+LOOKAT 0 5 0
+UP 0 1 0
+
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS 0 10 0
+ROTAT 0 0 0
+SCALE 3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS 0 10 0
+ROTAT 0 0 90
+SCALE .01 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS 0 5 -5
+ROTAT 0 90 0
+SCALE .01 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS -5 5 0
+ROTAT 0 0 0
+SCALE .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS 5 5 0
+ROTAT 0 0 0
+SCALE .01 10 10
+
+// Mesh
+OBJECT 6
+mesh dragons.obj
+material 4
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 1 1 1
diff --git a/scenes/performance-testing.txt b/scenes/performance-testing.txt
new file mode 100644
index 0000000..37adc1d
--- /dev/null
+++ b/scenes/performance-testing.txt
@@ -0,0 +1,199 @@
+// Emissive material (light)
+MATERIAL 0
+RGB 1 1 1
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 40
+
+// Diffuse white
+MATERIAL 1
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse red
+MATERIAL 2
+RGB .85 .35 .35
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse aqua
+MATERIAL 3
+RGB 0.4 0.9 0.6
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse grey
+MATERIAL 4
+RGB 0.6 0.6 0.6
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse blue
+MATERIAL 5
+RGB 0.5 0.5 0.9
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Specular pink
+MATERIAL 6
+RGB 0 0 0
+SPECEX 0
+SPECRGB 0.9 0.6 0.9
+REFL 1
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Transmissive orange
+MATERIAL 7
+RGB 0 0 0
+SPECEX 0
+SPECRGB 0.98 0.98 0.98
+REFL 1
+REFR 1
+REFRIOR 1.55
+EMITTANCE 0
+
+// Imperfect specular green
+MATERIAL 8
+RGB 0.6 1.0 0.5
+SPECEX 0
+SPECRGB 0.98 0.98 0.98
+REFL 1
+REFR 0
+REFRIOR 1.55
+EMITTANCE 0
+
+// Camera
+CAMERA
+RES 1000 800
+FOVY 35
+ITERATIONS 10
+DEPTH 8
+FILE performance_testing
+LENS_RADIUS 0.0
+FOCAL_DIST 29.5
+EYE 0.0 5 10.5
+LOOKAT 0 5 0
+UP 0 1 0
+
+
+// Ceiling light middle
+OBJECT 0
+cube
+material 0
+TRANS 0 10 0
+ROTAT 0 0 0
+SCALE 3 .3 3
+
+// Ceiling light left
+OBJECT 1
+cube
+material 0
+TRANS -5 10 0
+ROTAT 0 0 0
+SCALE 3 .3 3
+
+// Ceiling light right
+OBJECT 2
+cube
+material 0
+TRANS 5 10 0
+ROTAT 0 0 0
+SCALE 3 .3 3
+
+// Floor
+OBJECT 3
+cube
+material 1
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 20 .01 40
+
+// Ceiling
+OBJECT 4
+cube
+material 2
+TRANS 0 10 0
+ROTAT 0 0 90
+SCALE .01 20 40
+
+// Back wall
+OBJECT 5
+cube
+material 4
+TRANS 0 5 -5
+ROTAT 0 90 0
+SCALE .01 10 20
+
+// Left wall
+OBJECT 6
+cube
+material 3
+TRANS -10 5 0
+ROTAT 0 0 0
+SCALE .01 10 40
+
+// Right wall
+OBJECT 7
+cube
+material 3
+TRANS 10 5 0
+ROTAT 0 0 0
+SCALE .01 10 40
+
+// Mesh
+OBJECT 8
+mesh bunny-performance.obj
+material 8
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 1 1 1
+
+// Transmissive sphere
+OBJECT 9
+sphere
+material 7
+TRANS 1 1.25 1
+ROTAT 0 0 0
+SCALE 2.5 2.5 2.5
+
+// Reflective sphere
+OBJECT 10
+sphere
+material 6
+TRANS 4 3 -1
+ROTAT 0 0 0
+SCALE 4 4 4
+
+// Pedestal
+OBJECT 11
+cube
+material 4
+TRANS 4 0 -1
+ROTAT 0 0 0
+SCALE 4 2 4
diff --git a/scenes/sphere.txt b/scenes/sphere.txt
index a74b545..89a74b6 100644
--- a/scenes/sphere.txt
+++ b/scenes/sphere.txt
@@ -15,6 +15,8 @@ FOVY 45
ITERATIONS 5000
DEPTH 8
FILE sphere
+LENS_RADIUS 0.0
+FOCAL_DIST 29.5
EYE 0.0 5 10.5
LOOKAT 0 5 0
UP 0 1 0
diff --git a/scenes/violin.txt b/scenes/violin.txt
new file mode 100644
index 0000000..6dba462
--- /dev/null
+++ b/scenes/violin.txt
@@ -0,0 +1,173 @@
+// Emissive material (light)
+MATERIAL 0
+RGB 0.8 0.6 0.3
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 100
+
+// Diffuse white
+MATERIAL 1
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse red
+MATERIAL 2
+RGB .85 .35 .35
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse rust
+MATERIAL 3
+RGB 0.01 0 0
+SPECEX 0
+SPECRGB .98 .98 .98
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse grey
+MATERIAL 4
+RGB 0.8 0.8 0.8
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Transmissive white
+MATERIAL 5
+RGB 0 0 0
+SPECEX 0
+SPECRGB .98 .98 .98
+REFL 1
+REFR 1
+REFRIOR 1.55
+EMITTANCE 0
+
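+// Reflective white (NOTE: reuses id 5 from the material above; presumably meant to be a new id - verify against the scene loader and the objects' material references)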
+MATERIAL 5
+RGB 0 0 0
+SPECEX 0
+SPECRGB .98 .98 .98
+REFL 1
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse purple
+MATERIAL 6
+RGB 0 0 0.05
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Camera
+CAMERA
+RES 1000 800
+FOVY 35
+ITERATIONS 5000
+DEPTH 5
+FILE performance_testing
+LENS_RADIUS 0.0
+FOCAL_DIST 29.5
+EYE 0.0 5 10.5
+LOOKAT 0 5 0
+UP 0 1 0
+
+
+// Ceiling light middle
+OBJECT 0
+cube
+material 0
+TRANS 0 10 0
+ROTAT 0 0 0
+SCALE 3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 2
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 40 .01 40
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS 0 10 0
+ROTAT 0 0 90
+SCALE .01 40 40
+
+// Mesh
+OBJECT 3
+mesh violin.obj
+material 3
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 1 1 1
+
+// Paper 1
+OBJECT 4
+cube
+material 4
+TRANS 0 0 4
+ROTAT 0 65 0
+SCALE 3 .07 5
+
+// Paper 2
+OBJECT 5
+cube
+material 1
+TRANS 2.3 0.01 2.1
+ROTAT 0 110 0
+SCALE 3 .07 5
+
+// Pedestal Left
+OBJECT 6
+cube
+material 6
+TRANS -3 0.4 4.44
+ROTAT 0 45 0
+SCALE 3 .7 2.8
+
+// Transmissive sphere
+OBJECT 7
+sphere
+material 5
+TRANS -3 2.1 4.5
+ROTAT 0 0 0
+SCALE 2 2 2
+
+// Pedestal Right
+OBJECT 8
+cube
+material 6
+TRANS -3 0.4 -6
+ROTAT 0 118 0
+SCALE 4.6 1.2 4.7
+
+// Reflective sphere
+OBJECT 9
+sphere
+material 5
+TRANS -2.7 3 -5.3
+ROTAT 0 0 0
+SCALE 4 4 4
\ No newline at end of file
diff --git a/scenes/violin2.txt b/scenes/violin2.txt
new file mode 100644
index 0000000..fd58acc
--- /dev/null
+++ b/scenes/violin2.txt
@@ -0,0 +1,167 @@
+// Emissive material (light)
+MATERIAL 0
+RGB 1 1 1
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 20
+
+// Diffuse white
+MATERIAL 1
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse red
+MATERIAL 2
+RGB .85 .35 .35
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse aqua
+MATERIAL 3
+RGB 0.4 0.9 0.6
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse grey
+MATERIAL 4
+RGB 0.4 0.4 0.4
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Transmissive purple (reflective + refractive)
+MATERIAL 5
+RGB 0 0 0
+SPECEX 0
+SPECRGB 0.7 0.3 0.9
+REFL 1
+REFR 1
+REFRIOR 1.55
+EMITTANCE 0
+
+// Specular pink
+MATERIAL 6
+RGB 0 0 0
+SPECEX 0
+SPECRGB 0.9 0.6 0.9
+REFL 1
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Transmissive orange
+MATERIAL 7
+RGB 0 0 0
+SPECEX 0
+SPECRGB 0.98 0.98 0.98
+REFL 1
+REFR 1
+REFRIOR 1.55
+EMITTANCE 0
+
+// Imperfect specular green
+MATERIAL 8
+RGB 0.6 1.0 0.5
+SPECEX 0
+SPECRGB 0.98 0.98 0.98
+REFL 1
+REFR 0
+REFRIOR 1.55
+EMITTANCE 0
+
+// Camera
+CAMERA
+RES 800 800
+FOVY 35
+ITERATIONS 5000
+DEPTH 5
+FILE performance_testing
+LENS_RADIUS 0.0
+FOCAL_DIST 29.5
+EYE 0.0 5 10.5
+LOOKAT 0 5 0
+UP 0 1 0
+
+
+// Ceiling light middle
+OBJECT 0
+cube
+material 0
+TRANS 0 10 0
+ROTAT 0 0 0
+SCALE 3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 40 .01 40
+
+// Ceiling
+OBJECT 2
+cube
+material 4
+TRANS 0 10 0
+ROTAT 0 0 90
+SCALE .01 40 40
+
+// Back wall
+OBJECT 3
+cube
+material 4
+TRANS 0 5 -5
+ROTAT 0 90 0
+SCALE .01 20 40
+
+// Left wall
+OBJECT 4
+cube
+material 3
+TRANS -20 5 0
+ROTAT 0 0 0
+SCALE .01 10 40
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS 20 5 0
+ROTAT 0 0 0
+SCALE .01 10 40
+
+// Mesh
+OBJECT 6
+mesh violin.obj
+material 5
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 1 1 1
+
+// Sphere
+OBJECT 7
+sphere
+material 5
+TRANS 1 1.25 1.5
+ROTAT 0 0 0
+SCALE 2.5 2.5 2.5
diff --git a/src/interactions.h b/src/interactions.h
index f969e45..556a77f 100644
--- a/src/interactions.h
+++ b/src/interactions.h
@@ -41,6 +41,115 @@ glm::vec3 calculateRandomDirectionInHemisphere(
+ sin(around) * over * perpendicularDirection2;
}
+/**
+ * Unpolarized Fresnel reflectance of a dielectric interface
+ * (see equation in PBRT 8.2.1).
+ *
+ * @param m        Material; only m.indexOfRefraction is read. The medium on
+ *                 the other side of the interface is given an index of 1.
+ * @param cosTheta Cosine of the incident angle measured against the surface
+ *                 normal. A negative value is treated as "leaving the
+ *                 medium" and swaps the two indices of refraction.
+ * @return The reflectance replicated into all three channels; (1, 1, 1) when
+ *         total internal reflection occurs.
+ */
+__host__ __device__
+glm::vec3 calculateFresnel(const Material &m, float cosTheta) {
+    float etaI = 1.f;
+    float etaT = m.indexOfRefraction;
+    // Clamp the parameter. The previous code initialized cosThetaI from
+    // itself, which read an indeterminate value and ignored cosTheta entirely.
+    float cosThetaI = glm::clamp(cosTheta, -1.f, 1.f);
+
+    // Check if entering or leaving medium, and swap indices of refraction if necessary
+    const bool leaving = (cosThetaI < 0.f);
+    if (leaving) {
+        const float tmp = etaI;
+        etaI = etaT;
+        etaT = tmp;
+        cosThetaI = glm::abs(cosThetaI);
+    }
+    const float eta = etaI / etaT;
+
+    // Snell's Law (float literals keep the math in single precision)
+    const float sinThetaI = glm::sqrt(glm::max(0.f, 1.f - cosThetaI * cosThetaI));
+    const float sinThetaT = eta * sinThetaI;
+
+    // Total internal reflection
+    if (sinThetaT >= 1.f) return glm::vec3(1.f, 1.f, 1.f);
+
+    // Compute Fresnel reflectance for both polarizations and average them
+    const float cosThetaT = glm::sqrt(glm::max(0.f, 1.f - sinThetaT * sinThetaT));
+    const float rParallel = ((etaT * cosThetaI) - (etaI * cosThetaT)) /
+                            ((etaT * cosThetaI) + (etaI * cosThetaT));
+    const float rPerp = ((etaI * cosThetaI) - (etaT * cosThetaT)) /
+                        ((etaI * cosThetaI) + (etaT * cosThetaT));
+
+    return glm::vec3((rParallel * rParallel + rPerp * rPerp) * 0.5f);
+}
+
+/**
+ * Diffuse scattering.
+ *
+ * @param normal Surface normal handed to calculateRandomDirectionInHemisphere.
+ * @param m      Material; only m.color is read.
+ * @param rng    Random engine used to pick the direction.
+ * @param wo     Unused here; kept so all sample_* functions share a signature.
+ * @param wi     Output: the sampled direction.
+ * @return m.color.
+ */
+__host__ __device__
+glm::vec3 sample_diffuse(
+    glm::vec3 &normal, const Material& m, thrust::default_random_engine& rng, glm::vec3 wo, glm::vec3& wi)
+{
+    const glm::vec3 sampledDir = calculateRandomDirectionInHemisphere(normal, rng);
+    wi = sampledDir;
+    return m.color;
+}
+
+/**
+ * Perfect mirror reflection.
+ *
+ * @param normal Surface normal to reflect about.
+ * @param m      Material; only m.specular.color is read.
+ * @param rng    Unused here; kept so all sample_* functions share a signature.
+ * @param wo     Direction that is reflected.
+ * @param wi     Output: glm::reflect(wo, normal).
+ * @return m.specular.color.
+ */
+__host__ __device__
+glm::vec3 sample_specular_refl(
+    glm::vec3 &normal, const Material& m, thrust::default_random_engine& rng, glm::vec3 wo, glm::vec3& wi)
+{
+    const glm::vec3 mirrored = glm::reflect(wo, normal);
+    wi = mirrored;
+    return m.specular.color;
+}
+
+/**
+ * Perfect specular transmission (refraction).
+ *
+ * @param normal Surface normal. NOTE: it is negated in place when it points
+ *               along wo, so the caller's normal may be modified.
+ * @param m      Material; m.indexOfRefraction and m.specular.color are read.
+ * @param rng    Unused here; kept so all sample_* functions share a signature.
+ * @param wo     Incident direction (dot(wo, normal) < 0 is treated as
+ *               entering the medium).
+ * @param wi     Output: the refracted direction, or the mirror direction when
+ *               total internal reflection occurs.
+ * @return m.specular.color, or black on total internal reflection.
+ */
+__host__ __device__
+glm::vec3 sample_specular_trans(
+    glm::vec3 &normal, const Material& m, thrust::default_random_engine& rng, glm::vec3 wo, glm::vec3& wi)
+{
+    const float cosIncident = glm::dot(wo, normal);
+    const bool entering = (cosIncident < 0.f);
+    const float eta = entering ? 1.f / m.indexOfRefraction : m.indexOfRefraction;
+
+    // glm::refract expects the normal to oppose the incident direction,
+    // so flip it when it points the same way as wo
+    if (cosIncident > 0.f) {
+        normal = -normal;
+    }
+    wi = glm::refract(wo, normal, eta);
+
+    // Total internal reflection: glm::refract signals it by returning the
+    // zero vector. A length can never be negative, so the old `length < 0`
+    // test never fired and a zero direction escaped. The negated comparison
+    // also catches a NaN result.
+    if (!(glm::dot(wi, wi) > 0.f)) {
+        wi = glm::reflect(wo, normal);
+        return glm::vec3(0.f, 0.f, 0.f);
+    }
+    return m.specular.color;
+}
+
+/**
+ * Glass: picks specular reflection or specular transmission with equal
+ * probability and weights the result by the Fresnel term.
+ *
+ * @param normal Surface normal; may be flipped in place by
+ *               sample_specular_trans.
+ * @param m      Material forwarded to the Fresnel and sampling helpers.
+ * @param rng    Random engine used to choose the branch.
+ * @param wo     Incident direction (scatterRay passes the ray's direction).
+ * @param wi     Output: the sampled direction.
+ * @return 2 * Fr * f for reflection, 2 * (1 - Fr) * f for transmission; the
+ *         factor 2 compensates for each branch being chosen half the time.
+ */
+__host__ __device__
+glm::vec3 sample_glass(
+    glm::vec3& normal, const Material& m, thrust::default_random_engine& rng, glm::vec3 wo, glm::vec3& wi)
+{
+    // Explicit <float>: without it the distribution type is not a real-valued one
+    thrust::uniform_real_distribution<float> u01(0, 1);
+    const bool reflect = u01(rng) < 0.5f;
+
+    // calculateFresnel treats a negative cosine as "leaving the medium",
+    // while sample_specular_trans treats dot(wo, normal) < 0 as entering.
+    // Negating wo makes both agree on which side the ray is on.
+    const float cosTheta = glm::dot(-wo, normal);
+    const glm::vec3 Fr = calculateFresnel(m, cosTheta);
+
+    if (reflect) {
+        const glm::vec3 f = sample_specular_refl(normal, m, rng, wo, wi);
+        return 2.f * Fr * f;
+    }
+    const glm::vec3 f = sample_specular_trans(normal, m, rng, wo, wi);
+    return 2.f * (glm::vec3(1.f, 1.f, 1.f) - Fr) * f;
+}
+
+/**
+ * Plastic: picks specular reflection or diffuse scattering with equal
+ * probability and weights the result by the Fresnel term.
+ *
+ * @param normal Surface normal forwarded to the sampling helpers.
+ * @param m      Material forwarded to the Fresnel and sampling helpers.
+ * @param rng    Random engine used to choose the branch (and by the diffuse
+ *               helper).
+ * @param wo     Incident direction (scatterRay passes the ray's direction).
+ * @param wi     Output: the sampled direction.
+ * @return 2 * Fr * f for reflection, 2 * (1 - Fr) * f for the diffuse branch;
+ *         the factor 2 compensates for each branch being chosen half the time.
+ */
+__host__ __device__
+glm::vec3 sample_plastic(
+    glm::vec3& normal, const Material& m, thrust::default_random_engine& rng, glm::vec3 wo, glm::vec3& wi)
+{
+    // Explicit <float>: without it the distribution type is not a real-valued one
+    thrust::uniform_real_distribution<float> u01(0, 1);
+    const bool reflect = u01(rng) < 0.5f;
+
+    // calculateFresnel treats a negative cosine as "leaving the medium";
+    // wo points toward the surface, so negate it before taking the cosine.
+    const float cosTheta = glm::dot(-wo, normal);
+    const glm::vec3 Fr = calculateFresnel(m, cosTheta);
+
+    if (reflect) {
+        const glm::vec3 f = sample_specular_refl(normal, m, rng, wo, wi);
+        return 2.f * Fr * f;
+    }
+    const glm::vec3 f = sample_diffuse(normal, m, rng, wo, wi);
+    return 2.f * (glm::vec3(1.f, 1.f, 1.f) - Fr) * f;
+}
+
/**
* Scatter a ray with some probabilities according to the material properties.
* For example, a diffuse surface scatters in a cosine-weighted hemisphere.
@@ -76,4 +185,32 @@ void scatterRay(
// TODO: implement this.
// A basic implementation of pure-diffuse shading will just call the
// calculateRandomDirectionInHemisphere defined above.
+ if (pathSegment.remainingBounces <= 0) {
+ return;
+ }
+ thrust::uniform_real_distribution u01(0, 1);
+ float xi = u01(rng);
+
+ glm::vec3 wi = glm::vec3(0.0, 0.0, 0.0);
+ glm::vec3 f = glm::vec3(0.0, 0.0, 0.0);
+ if (m.hasReflective && m.hasRefractive) {
+ f = sample_glass(normal, m, rng, pathSegment.ray.direction, wi);
+ }
+ else if (m.hasReflective && glm::length(m.color) > 0) {
+ f = sample_plastic(normal, m, rng, pathSegment.ray.direction, wi);
+ }
+ else if (m.hasReflective) {
+ f = sample_specular_refl(normal, m, rng, pathSegment.ray.direction, wi);
+ }
+ else if (m.hasRefractive) {
+ f = sample_specular_trans(normal, m, rng, pathSegment.ray.direction, wi);
+ }
+ else {
+ f = sample_diffuse(normal, m, rng, pathSegment.ray.direction, wi);
+ }
+ pathSegment.throughput *= f;
+ pathSegment.ray.direction = wi;
+ pathSegment.ray.invDirection = glm::vec3(1.0, 1.0, 1.0) / pathSegment.ray.direction;
+ pathSegment.ray.origin = intersect + 0.01f * pathSegment.ray.direction;
+ pathSegment.remainingBounces--;
}
diff --git a/src/intersections.h b/src/intersections.h
index b150407..20d10c0 100644
--- a/src/intersections.h
+++ b/src/intersections.h
@@ -89,6 +89,60 @@ __host__ __device__ float boxIntersectionTest(Geom box, Ray r,
return -1;
}
+//__host__ __device__ bool aabbIntersectionTest(AABB aabb, Ray r) {
+// glm::vec3 invR = glm::vec3(1.0, 1.0, 1.0) / r.direction;
+//
+// float x1 = (aabb.min.x - r.origin.x) * invR.x;
+// float x2 = (aabb.max.x - r.origin.x) * invR.x;
+//
+// float tmin = glm::min(x1, x2);
+// float tmax = glm::max(x1, x2);
+//
+// float y1 = (aabb.min.y - r.origin.y) * invR.y;
+// float y2 = (aabb.max.y - r.origin.y) * invR.y;
+//
+// tmin = glm::min(tmin, glm::min(y1, y2));
+// tmax = glm::max(tmax, glm::max(y1, y2));
+//
+// float z1 = (aabb.min.z - r.origin.z) * invR.z;
+// float z2 = (aabb.max.z - r.origin.z) * invR.z;
+//
+// tmin = glm::min(tmin, glm::min(z1, z2));
+// tmax = glm::max(tmax, glm::max(z1, z2));
+//
+// return tmin <= tmax && tmax >= 0.0;
+//}
+
+/**
+ * Slab test between a ray and an axis-aligned bounding box.
+ * Based off of "Fast, Branchless Ray/Bounding Box Intersections" by Tavian Barnes.
+ *
+ * Increments r.intersectionCount once per call, hit or miss.
+ *
+ * @param aabb Box described by aabb.min and aabb.max.
+ * @param r    Ray; r.origin and the precomputed r.invDirection are read.
+ * @param t    Output: on a hit, the entry distance tmin, or the exit distance
+ *             tmax when tmin is negative; -1 on a miss.
+ * @return true when the ray overlaps the box at a non-negative distance.
+ */
+__host__ __device__ bool aabbIntersectionTest(AABB aabb, Ray &r, float& t) {
+    const glm::vec3 invR = r.invDirection;
+
+    // X slab
+    const float x1 = (aabb.min.x - r.origin.x) * invR.x;
+    const float x2 = (aabb.max.x - r.origin.x) * invR.x;
+    float tmin = glm::min(x1, x2);
+    float tmax = glm::max(x1, x2);
+
+    // Y slab: shrink the interval
+    const float y1 = (aabb.min.y - r.origin.y) * invR.y;
+    const float y2 = (aabb.max.y - r.origin.y) * invR.y;
+    tmin = glm::max(tmin, glm::min(y1, y2));
+    tmax = glm::min(tmax, glm::max(y1, y2));
+
+    // Z slab: shrink the interval
+    const float z1 = (aabb.min.z - r.origin.z) * invR.z;
+    const float z2 = (aabb.max.z - r.origin.z) * invR.z;
+    tmin = glm::max(tmin, glm::min(z1, z2));
+    tmax = glm::min(tmax, glm::max(z1, z2));
+
+    const bool intersect = tmin <= tmax && tmax >= 0;
+    if (!intersect) {
+        // Previously the -1 sentinel was immediately overwritten with tmax,
+        // so a miss reported a meaningless distance.
+        t = -1.f;
+    }
+    else {
+        // Negative tmin means the entry point is behind the origin: report the exit
+        t = (tmin < 0.f) ? tmax : tmin;
+    }
+
+    r.intersectionCount++;
+    return intersect;
+}
+
// CHECKITOUT
/**
* Test intersection between a ray and a transformed sphere. Untransformed,
@@ -137,8 +191,242 @@ __host__ __device__ float sphereIntersectionTest(Geom sphere, Ray r,
intersectionPoint = multiplyMV(sphere.transform, glm::vec4(objspaceIntersection, 1.f));
normal = glm::normalize(multiplyMV(sphere.invTranspose, glm::vec4(objspaceIntersection, 0.f)));
if (!outside) {
- normal = -normal;
+ //normal = -normal; --> commented out because this impacts refraction
}
-
return glm::length(r.origin - intersectionPoint);
}
+
+/**
+ * Ray/triangle test built on glm::intersectRayTriangle.
+ *
+ * Increments r.intersectionCount once per call, hit or miss.
+ *
+ * @param tri        Triangle; only tri.verts[0..2] are read.
+ * @param r          Ray supplying the origin and direction.
+ * @param barycenter Output filled in by glm::intersectRayTriangle; its z
+ *                   component is the value returned on a hit.
+ * @return barycenter.z on a hit, -1 if no intersection.
+ */
+__host__ __device__ float triangleIntersectionTest(Triangle tri, Ray &r,
+    glm::vec3& barycenter) {
+
+    r.intersectionCount++;
+    const bool hit = glm::intersectRayTriangle(
+        r.origin, r.direction,
+        tri.verts[0], tri.verts[1], tri.verts[2],
+        barycenter);
+    return hit ? barycenter.z : -1.f;
+}
+
+/**
+ * Test intersection between a ray and a triangle mesh by brute force over
+ * tris[mesh.startIdx .. mesh.startIdx + mesh.triangleCount).
+ *
+ * @param mesh              Geometry record; aabb, startIdx and triangleCount are read.
+ * @param r                 Ray; its intersectionCount is incremented by the helpers.
+ * @param tris              Triangle array indexed by the mesh's range.
+ * @param intersectionPoint Output parameter for point of intersection (only written on a hit).
+ * @param normal            Output parameter for the interpolated vertex normal
+ *                          (only written on a hit; not re-normalized here).
+ * @param outside           Not written by this function.
+ * @return Ray parameter `t` value of the closest hit. -1 if no intersection.
+ */
+__host__ __device__ float meshIntersectionTest(Geom mesh, Ray &r,
+    const Triangle* tris, glm::vec3& intersectionPoint, glm::vec3& normal, bool& outside) {
+
+#if BB_CULLING
+    // Test ray against mesh AABB; skip every triangle when the box is missed
+    float tBox = -1.f;
+    if (!aabbIntersectionTest(mesh.aabb, r, tBox)) return -1.f;
+#endif
+
+    // Check for intersection with all triangles, remembering the closest hit
+    int hitIdx = -1;
+    glm::vec3 barycenter, min_barycenter;
+    float min_t = INFINITY;
+    for (int i = mesh.startIdx; i < mesh.startIdx + mesh.triangleCount; i++)
+    {
+        const float t = triangleIntersectionTest(tris[i], r, barycenter);
+        if (t < min_t && t > 0.f)
+        {
+            min_t = t;
+            min_barycenter = barycenter;
+            hitIdx = i;
+        }
+    }
+
+    // No triangle was hit: return the documented sentinel instead of INFINITY
+    // and avoid reading an uninitialized triangle/barycenter.
+    if (hitIdx < 0) return -1.f;
+
+    // glm::intersectRayTriangle reports (u, v) as the weights of verts[1] and
+    // verts[2]; verts[0] receives the remainder 1 - u - v.
+    const Triangle& hit = tris[hitIdx];
+    const float u = min_barycenter.x;
+    const float v = min_barycenter.y;
+    const float w = 1.f - u - v;
+    intersectionPoint = w * hit.verts[0] + u * hit.verts[1] + v * hit.verts[2];
+    normal = w * hit.norms[0] + u * hit.norms[1] + v * hit.norms[2];
+
+    return min_t;
+}
+
+// An LBVH node is a leaf when both child links hold the 0xFFFFFFFF sentinel.
+__host__ __device__ bool devIsLeaf(const LBVHNode* node) {
+    const bool noLeftChild = (node->left == 0xFFFFFFFF);
+    const bool noRightChild = (node->right == 0xFFFFFFFF);
+    return noLeftChild && noRightChild;
+}
+
+/**
+ * Tests the ray against tris[objectId] and, when that hit is in front of the
+ * ray and closer than min_t, records it as the new closest hit.
+ *
+ * @param min_tri        In/out: copy of the closest triangle so far.
+ * @param min_barycenter In/out: barycenter output of the closest hit so far.
+ * @param min_t          In/out: distance of the closest hit so far.
+ */
+__host__ __device__ void lbvhIntersectTriangle(const Triangle* tris, Ray &r, int objectId,
+    Triangle& min_tri, glm::vec3& min_barycenter, float& min_t) {
+
+    glm::vec3 bary;
+    const float dist = triangleIntersectionTest(tris[objectId], r, bary);
+
+    const bool isCloserHit = (dist < min_t) && (dist > 0.f);
+    if (!isCloserHit) return;
+
+    min_t = dist;
+    min_barycenter = bary;
+    min_tri = tris[objectId];
+}
+
+/**
+ * Test intersection between a ray and an LBVH using an explicit stack.
+ *
+ * Traversal starts at node index `triangleCount`, and a leaf's node index is
+ * used directly as its index into `tris`.
+ * NOTE(review): this presumes the builder stores leaves first and the root
+ * right after them - confirm against the LBVH construction code.
+ *
+ * @param nodes             LBVH node array.
+ * @param tris              Triangle array.
+ * @param r                 Ray; its intersectionCount is incremented by the helpers.
+ * @param triangleCount     Index of the root node (see above).
+ * @param intersectionPoint Output parameter for point of intersection (only written on a hit).
+ * @param normal            Output parameter for the interpolated vertex normal
+ *                          (only written on a hit; not re-normalized here).
+ * @param outside           Not written by this function.
+ * @return Ray parameter `t` value of the closest hit. -1 if no intersection.
+ */
+__host__ __device__ float lbvhIntersectionTest(const LBVHNode* nodes, const Triangle* tris, Ray &r, int triangleCount,
+    glm::vec3& intersectionPoint, glm::vec3& normal, bool& outside) {
+
+    // Node indices are integers; keeping them in a float stack would corrupt
+    // any index that a float cannot represent exactly.
+    const int STACK_CAPACITY = 16;
+    int stack[STACK_CAPACITY];
+    int stackPtr = -1;
+
+    Triangle min_tri;
+    glm::vec3 min_barycenter;
+    float min_t = INFINITY;
+
+    // Push root node
+    stack[++stackPtr] = triangleCount;
+    int currNodeIdx = stack[stackPtr];
+    while (stackPtr >= 0)
+    {
+        // Check intersection with left and right children
+        const int leftChild = nodes[currNodeIdx].left;
+        const int rightChild = nodes[currNodeIdx].right;
+        const LBVHNode* left = &nodes[leftChild];
+        const LBVHNode* right = &nodes[rightChild];
+
+        float t;
+        const bool intersectLeft = aabbIntersectionTest(left->aabb, r, t);
+        const bool intersectRight = aabbIntersectionTest(right->aabb, r, t);
+
+        // If intersection found, and they are leaf nodes, check for triangle intersections
+        if (intersectLeft && devIsLeaf(left)) {
+            lbvhIntersectTriangle(tris, r, leftChild, min_tri, min_barycenter, min_t);
+        }
+        if (intersectRight && devIsLeaf(right)) {
+            lbvhIntersectTriangle(tris, r, rightChild, min_tri, min_barycenter, min_t);
+        }
+
+        // If internal nodes, keep traversing
+        const bool traverseLeftSubtree = (intersectLeft && !devIsLeaf(left));
+        const bool traverseRightSubtree = (intersectRight && !devIsLeaf(right));
+
+        if (!traverseLeftSubtree && !traverseRightSubtree) {
+            // Pop node from stack
+            currNodeIdx = stack[stackPtr--];
+        }
+        else {
+            currNodeIdx = (traverseLeftSubtree) ? leftChild : rightChild;
+            if (traverseLeftSubtree && traverseRightSubtree) {
+                // Push right child onto stack; never write past the array
+                if (stackPtr + 1 < STACK_CAPACITY) {
+                    stack[++stackPtr] = rightChild;
+                }
+            }
+        }
+    }
+
+    // min_t is only lowered by a real hit. Without one, return the documented
+    // sentinel instead of INFINITY and skip the uninitialized min_tri.
+    if (!(min_t < INFINITY)) return -1.f;
+
+    // glm::intersectRayTriangle reports (u, v) as the weights of verts[1] and
+    // verts[2]; verts[0] receives the remainder 1 - u - v.
+    const float u = min_barycenter.x;
+    const float v = min_barycenter.y;
+    const float w = 1.f - u - v;
+    intersectionPoint = w * min_tri.verts[0] + u * min_tri.verts[1] + v * min_tri.verts[2];
+    normal = w * min_tri.norms[0] + u * min_tri.norms[1] + v * min_tri.norms[2];
+
+    return min_t;
+}
+
+// A BVH node is a leaf when it holds at least one triangle.
+__host__ __device__ bool devBvhIsLeaf(const BVHNode* node) {
+    const bool holdsTriangles = (node->numTris > 0);
+    return holdsTriangles;
+}
+
+/**
+ * Tests the ray against tris[start .. start + numTris) and records the hit
+ * that is in front of the ray and closer than min_t, if any.
+ *
+ * @param min_tri        In/out: copy of the closest triangle so far.
+ * @param min_barycenter In/out: barycenter output of the closest hit so far.
+ * @param min_t          In/out: distance of the closest hit so far.
+ */
+__host__ __device__ void bvhIntersectTriangles(const Triangle* tris, Ray &r, int start, int numTris,
+    Triangle& min_tri, glm::vec3& min_barycenter, float& min_t) {
+
+    const int end = start + numTris;
+    for (int triIdx = start; triIdx < end; ++triIdx) {
+        glm::vec3 bary;
+        const float dist = triangleIntersectionTest(tris[triIdx], r, bary);
+
+        const bool isCloserHit = (dist < min_t) && (dist > 0.f);
+        if (!isCloserHit) continue;
+
+        min_t = dist;
+        min_barycenter = bary;
+        min_tri = tris[triIdx];
+    }
+}
+
+/**
+ * Test intersection between a ray and a BVH using an explicit stack.
+ * Traversal starts at node index 0.
+ *
+ * @param nodes             BVH node array.
+ * @param tris              Triangle array addressed through each leaf's firstTri/numTris.
+ * @param r                 Ray; its intersectionCount is incremented by the helpers.
+ * @param triangleCount     Not read by this function.
+ * @param intersectionPoint Output parameter for point of intersection (only written on a hit).
+ * @param normal            Output parameter for the interpolated vertex normal
+ *                          (only written on a hit; not re-normalized here).
+ * @param outside           Not written by this function.
+ * @return Ray parameter `t` value of the closest hit. -1 if no intersection.
+ */
+__host__ __device__ float bvhIntersectionTest(const BVHNode* nodes, const Triangle* tris, Ray &r, int triangleCount,
+    glm::vec3& intersectionPoint, glm::vec3& normal, bool& outside) {
+
+    // Node indices are integers; keeping them in a float stack would corrupt
+    // any index that a float cannot represent exactly.
+    const int STACK_CAPACITY = 20;
+    int stack[STACK_CAPACITY];
+    int stackPtr = -1;
+
+    Triangle min_tri;
+    glm::vec3 min_barycenter;
+    float min_t = INFINITY;
+
+    // Push root node
+    stack[++stackPtr] = 0;
+    int currNodeIdx = stack[stackPtr];
+    while (stackPtr >= 0)
+    {
+        // Check intersection with left and right children
+        const int leftChild = nodes[currNodeIdx].left;
+        const int rightChild = nodes[currNodeIdx].right;
+        const BVHNode* left = &nodes[leftChild];
+        const BVHNode* right = &nodes[rightChild];
+
+        float t;
+        const bool intersectLeft = aabbIntersectionTest(left->aabb, r, t);
+        const bool intersectRight = aabbIntersectionTest(right->aabb, r, t);
+
+        // If intersection found, and they are leaf nodes, check for triangle intersections
+        if (intersectLeft && devBvhIsLeaf(left)) {
+            bvhIntersectTriangles(tris, r, left->firstTri, left->numTris, min_tri, min_barycenter, min_t);
+        }
+        if (intersectRight && devBvhIsLeaf(right)) {
+            bvhIntersectTriangles(tris, r, right->firstTri, right->numTris, min_tri, min_barycenter, min_t);
+        }
+
+        // If internal nodes, keep traversing
+        const bool traverseLeftSubtree = (intersectLeft && !devBvhIsLeaf(left));
+        const bool traverseRightSubtree = (intersectRight && !devBvhIsLeaf(right));
+
+        if (!traverseLeftSubtree && !traverseRightSubtree) {
+            // Pop node from stack
+            currNodeIdx = stack[stackPtr--];
+        }
+        else {
+            currNodeIdx = (traverseLeftSubtree) ? leftChild : rightChild;
+            if (traverseLeftSubtree && traverseRightSubtree) {
+                // Push right child onto stack; never write past the array
+                if (stackPtr + 1 < STACK_CAPACITY) {
+                    stack[++stackPtr] = rightChild;
+                }
+            }
+        }
+    }
+
+    // min_t is only lowered by a real hit. Without one, return the documented
+    // sentinel instead of INFINITY and skip the uninitialized min_tri.
+    if (!(min_t < INFINITY)) return -1.f;
+
+    // glm::intersectRayTriangle reports (u, v) as the weights of verts[1] and
+    // verts[2]; verts[0] receives the remainder 1 - u - v.
+    const float u = min_barycenter.x;
+    const float v = min_barycenter.y;
+    const float w = 1.f - u - v;
+    intersectionPoint = w * min_tri.verts[0] + u * min_tri.verts[1] + v * min_tri.verts[2];
+    normal = w * min_tri.norms[0] + u * min_tri.norms[1] + v * min_tri.norms[2];
+
+    return min_t;
+}
diff --git a/src/lbvh.cu b/src/lbvh.cu
new file mode 100644
index 0000000..59f2029
--- /dev/null
+++ b/src/lbvh.cu
@@ -0,0 +1,374 @@
+#include "lbvh.h"
+
+/// LBVH FUNCTIONS ///
+
+// This optimized LBVH is based on the paper "Maximizing Parallelism in the Construction of BVHs,
+// Octrees, and k-d Trees" by Tero Karras of NVIDIA Research
+
+// Ordering predicate: true when a's Morton code is strictly smaller than b's.
+bool morton_sort(const MortonCode& a, const MortonCode& b) {
+    const bool aComesFirst = a.code < b.code;
+    return aComesFirst;
+}
+
+// A node counts as a leaf when both child links hold the 0xFFFFFFFF marker.
+bool isLeaf(const LBVHNode* node) {
+    if (node->left != 0xFFFFFFFF) {
+        return false;
+    }
+    return node->right == 0xFFFFFFFF;
+}
+
+// Smallest box enclosing both inputs: component-wise min of the mins, max of the maxes.
+AABB Union(AABB left, AABB right) {
+    return AABB{ glm::min(left.min, right.min), glm::max(left.max, right.max) };
+}
+
+// Spreads the low 10 bits of n so that bit k ends up at bit 3*k (two zero bits between
+// consecutive input bits), producing a 30-bit pattern.
+unsigned int expand(unsigned int n)
+{
+    // Each step ORs in a shifted copy and masks away the bits that landed in the wrong place.
+    const unsigned int shifts[4] = { 16u, 8u, 4u, 2u };
+    const unsigned int masks[4] = { 0x030000FFu, 0x0300F00Fu, 0x030C30C3u, 0x09249249u };
+    for (int step = 0; step < 4; ++step) {
+        n = (n | (n << shifts[step])) & masks[step];
+    }
+    return n;
+}
+
+// Based on PBRT 4.3.3. and Tero Karras version at https://developer.nvidia.com/blog/thinking-parallel-part-iii-tree-construction-gpu/
+// Builds a 30-bit Morton code from a centroid: each component is scaled by 1024, clamped to
+// [0, 1023], truncated to an integer and bit-spread; x, y and z are then interleaved.
+unsigned int mortonCode3D(const glm::vec3& centroid) {
+    unsigned int spread[3];
+    for (int axis = 0; axis < 3; ++axis) {
+        // Scale, then clamp into the 10-bit range
+        float scaled = min(max(centroid[axis] * 1024.0f, 0.0f), 1023.0f);
+        // Leave two empty bits after every payload bit
+        spread[axis] = expand((unsigned int)scaled);
+    }
+
+    // Interleave: x takes the highest bit of every triple, z the lowest
+    return (spread[0] << 2) | (spread[1] << 1) | spread[2];
+}
+
+// Appends one MortonCode entry per triangle in scene->triangles to scene->mcodes. The code is
+// computed from the centre of the triangle's bounding box, normalized against sceneAABB.
+void computeMortonCodes(Scene* scene, const AABB& sceneAABB) {
+    // Extent of the reference box, used as the normalization divisor
+    const glm::vec3 extent = sceneAABB.max - sceneAABB.min;
+
+    for (int triIdx = 0; triIdx < scene->triangles.size(); triIdx++) {
+        // Centre of the triangle's bounding box
+        const auto& box = scene->triangles[triIdx].aabb;
+        glm::vec3 boxCentre = 0.5f * box.min + 0.5f * box.max;
+
+        // Record the code together with the index of the triangle it came from
+        MortonCode entry;
+        entry.objectId = triIdx;
+        entry.code = mortonCode3D((boxCentre - sceneAABB.min) / extent);
+        scene->mcodes.push_back(entry);
+    }
+}
+
+// Sorts scene->mcodes by ascending Morton code (ordering given by morton_sort).
+// The sort runs in place: copying the vector out and back bought nothing and doubled the
+// memory traffic. std::sort is not stable, so entries with equal codes may land in any order.
+void sortMortonCodes(Scene* scene) {
+    std::sort(scene->mcodes.begin(), scene->mcodes.end(), morton_sort);
+}
+
+// Portable replacement for the MSVC-only __lzcnt intrinsic: number of leading zero bits in a
+// 32-bit value, 32 when the value is 0.
+static int countLeadingZeros32(unsigned int value) {
+    int zeros = 32;
+    while (value != 0) {
+        value >>= 1;
+        --zeros;
+    }
+    return zeros;
+}
+
+// Determines the number of common leading bits between the codes at positions i and j.
+//   sortedMCodes - Morton code entries
+//   N            - number of entries; j outside [0, N) yields -1
+// When both codes are identical the indices themselves act as a tie-breaker, so the result is
+// 32 plus the number of leading bits shared by i and j.
+int delta(MortonCode* sortedMCodes, int N, int i, int j) {
+    // Range check
+    if (j < 0 || j >= N) {
+        return -1;
+    }
+
+    if (sortedMCodes[i].code == sortedMCodes[j].code)
+    {
+        return 32 + countLeadingZeros32((unsigned int)(i ^ j));
+    }
+
+    return countLeadingZeros32(sortedMCodes[i].code ^ sortedMCodes[j].code);
+}
+
+// Picks the direction in which node i's range grows: +1 when the right neighbour shares at
+// least as many leading bits as the left one, -1 otherwise.
+int sign(MortonCode* sortedMCodes, int N, int i) {
+    const int towardRight = delta(sortedMCodes, N, i, i + 1);
+    const int towardLeft = delta(sortedMCodes, N, i, i - 1);
+    if (towardRight - towardLeft >= 0) {
+        return 1;
+    }
+    return -1;
+}
+
+// Finds the range of entries covered by internal node i.
+// Returns NodeRange{ i, j, l, d }: start index, other end, length and direction (+1 / -1).
+NodeRange determineRange(MortonCode* sortedMCodes, int triangleCount, int i) {
+    // Direction of growth (+1 or -1)
+    const int dir = sign(sortedMCodes, triangleCount, i);
+
+    // Prefix length shared with the neighbour on the opposite side; entries inside the range
+    // must share strictly more than this
+    const int minPrefix = delta(sortedMCodes, triangleCount, i, i - dir);
+
+    // Double the candidate length until it steps outside the range
+    int upperBound = 2;
+    for (; delta(sortedMCodes, triangleCount, i, i + upperBound * dir) > minPrefix; upperBound *= 2) {
+    }
+
+    // Binary search for the exact length below that bound
+    int length = 0;
+    int step = upperBound / 2;
+    while (step >= 1) {
+        if (delta(sortedMCodes, triangleCount, i, i + (length + step) * dir) > minPrefix) {
+            length += step;
+        }
+        step /= 2;
+    }
+
+    return NodeRange{ i, i + length * dir, length, dir };
+}
+
+// Locates the split position inside a node's range with a binary search over the length:
+// the furthest offset whose entry still shares more leading bits with range.i than the two
+// ends of the range share with each other.
+int findSplit(MortonCode* sortedMCodes, int triangleCount, NodeRange range) {
+    const int first = range.i;
+    const int dir = range.d;
+
+    // Prefix length common to the whole range
+    const int nodePrefix = delta(sortedMCodes, triangleCount, range.i, range.j);
+
+    int offset = 0;
+    int step = range.l;
+    for (;;) {
+        // Halve the step, rounding up
+        step = ceil(step / 2.f);
+        if (delta(sortedMCodes, triangleCount, first, first + (offset + step) * dir) > nodePrefix) {
+            offset += step;
+        }
+        if (step <= 1) {
+            break;
+        }
+    }
+
+    // min(dir, 0) moves the result one slot down when the range grows towards lower indices
+    return first + offset * dir + min(dir, 0);
+}
+
+// Recursively computes bounding boxes bottom-up: a leaf reports the box it already holds, an
+// internal node stores and reports the union of its two children's boxes.
+AABB assignBoundingBoxes(Scene* scene, LBVHNode* node) {
+    if (isLeaf(node)) {
+        return node->aabb;
+    }
+
+    AABB fromLeft = assignBoundingBoxes(scene, &scene->lbvh[node->left]);
+    AABB fromRight = assignBoundingBoxes(scene, &scene->lbvh[node->right]);
+    node->aabb = Union(fromLeft, fromRight);
+    return node->aabb;
+}
+
+// Tree-building functions
+
+// Builds one LBVH from the sorted entries in scene->mcodes and stores it in scene->lbvh.
+//   leafStart     - index in scene->lbvh where this tree's leaves begin
+//   triangleCount - number of entries in scene->mcodes (one leaf each)
+//   meshNum       - not used
+// Layout inside scene->lbvh: leaves occupy [leafStart, leafStart + triangleCount), internal
+// nodes follow directly, and the first internal node is the root. Every stored child index is
+// an absolute index into scene->lbvh. scene->triangles is replaced by the triangles reordered
+// to match the leaf order.
+// The offsets are applied consistently so a non-zero leafStart no longer writes out of range;
+// for leafStart == 0 and triangleCount >= 2 the resulting tree is unchanged.
+void buildLBVH(Scene* scene, int leafStart, int triangleCount, int meshNum) {
+    (void)meshNum;
+
+    // Nothing to build; also keeps the resize below from receiving a negative size
+    if (triangleCount <= 0) {
+        return;
+    }
+
+    // Resize LBVH
+    const int numLeaf = triangleCount;
+    const int numInternal = triangleCount - 1;
+    const int internalStart = leafStart + numLeaf;
+    scene->lbvh.resize(leafStart + numLeaf + numInternal);
+    scene->sorted_triangles.resize(numLeaf);
+
+    // Initialize leaf nodes (k is the position within this tree's sorted order)
+    for (int k = 0; k < numLeaf; ++k) {
+        LBVHNode leafNode;
+        leafNode.objectId = scene->mcodes[k].objectId;
+        leafNode.aabb = scene->triangles[leafNode.objectId].aabb;
+        leafNode.left = 0xFFFFFFFF;
+        leafNode.right = 0xFFFFFFFF;
+        scene->lbvh[leafStart + k] = leafNode;
+
+        scene->sorted_triangles[k] = scene->triangles[leafNode.objectId];
+    }
+    scene->triangles = scene->sorted_triangles;
+
+    // Initialize internal nodes
+    for (int k = 0; k < numInternal; ++k) {
+        LBVHNode internalNode;
+
+        // Determine range
+        NodeRange range = determineRange(scene->mcodes.data(), triangleCount, k);
+
+        // Find split position
+        int split = findSplit(scene->mcodes.data(), triangleCount, range);
+
+        // A child is a leaf when the split coincides with that end of the range,
+        // otherwise it is the internal node with the same local index
+        int leftChild = -1;
+        int rightChild = -1;
+        if (min(range.i, range.j) == split) {
+            leftChild = leafStart + split;
+        }
+        else {
+            leftChild = internalStart + split;
+        }
+
+        if (max(range.i, range.j) == split + 1) {
+            rightChild = leafStart + split + 1;
+        }
+        else {
+            rightChild = internalStart + split + 1;
+        }
+
+        internalNode.objectId = -1;
+        internalNode.left = leftChild;
+        internalNode.right = rightChild;
+        scene->lbvh[internalStart + k] = internalNode;
+    }
+
+    // Assign bounding boxes from the root down; a single-triangle tree has no internal node
+    if (numInternal > 0) {
+        assignBoundingBoxes(scene, &scene->lbvh[internalStart]);
+    }
+}
+
+// Builds one LBVH per mesh: compute Morton codes against the mesh's bounding box, sort them,
+// build the tree starting at the current end of scene->lbvh, then clear the code list.
+void generateLBVH(Scene* scene)
+{
+    int meshIdx = 0;
+    while (meshIdx < scene->meshCount) {
+        // Morton codes relative to this mesh's bounding box
+        computeMortonCodes(scene, scene->mesh_aabbs[meshIdx]);
+
+        // Order the codes
+        sortMortonCodes(scene);
+
+        // Build the tree from the sorted codes
+        buildLBVH(scene, scene->lbvh.size(), scene->mcodes.size(), meshIdx);
+
+        // Start the next mesh with an empty list
+        scene->mcodes.clear();
+        ++meshIdx;
+    }
+}
+
+/// BASIC BVH FUNCTIONS ///
+
+// Counter to keep track of the current available node in the tree.
+// Index of the next unused slot in scene->bvh: addChildren() post-increments it twice per
+// split (left child first, then right child). It starts at 1 because generateBVH() places the
+// root in slot 0.
+// NOTE(review): updateBounds() has a parameter that is also named idx and shadows this global.
+int idx = 1;
+
+// Grows the bounding box of node `idx` so that it encloses the box of every triangle in the
+// node's [firstTri, firstTri + numTris) range. The node's existing box is the starting point.
+void updateBounds(Scene* scene, const int idx)
+{
+    BVHNode& target = scene->bvh[idx];
+    const int onePastLast = target.firstTri + target.numTris;
+    int tri = target.firstTri;
+    while (tri < onePastLast)
+    {
+        target.aabb = Union(target.aabb, scene->triangles[tri].aabb);
+        ++tri;
+    }
+}
+
+// Returns the axis (0 = x, 1 = y, 2 = z) with the largest extent. x wins only when strictly
+// larger than both others; otherwise y wins when strictly larger than z, else z.
+int maxExtent(glm::vec3 extent) {
+    const bool xIsLargest = extent.x > extent.y && extent.x > extent.z;
+    if (xIsLargest) {
+        return 0;
+    }
+    return (extent.y > extent.z) ? 1 : 2;
+}
+
+// Smallest box enclosing both the given box and the point p.
+AABB Union(AABB aabb, glm::vec3 p) {
+    return AABB{ glm::min(aabb.min, p), glm::max(aabb.max, p) };
+}
+
+// SAH cost = num_triangles_left * left_box_area + num_triangles_right * right_box_area
+// Sorts the node's triangles into two groups by comparing their centroid on `axis` against
+// queryPos, grows one bounding box per group, and returns the cost of that candidate split.
+float evalSAH(Scene* scene, BVHNode* node, float queryPos, int axis)
+{
+    // Both boxes start inverted (min = +inf, max = -inf) so the first Union snaps onto a triangle
+    const glm::vec3 startMin{ INFINITY, INFINITY, INFINITY };
+    const glm::vec3 startMax{ -INFINITY, -INFINITY, -INFINITY };
+    AABB belowBox = { startMin, startMax };
+    AABB aboveBox = { startMin, startMax };
+    int belowCount = 0;
+    int aboveCount = 0;
+
+    const int onePastLast = node->firstTri + node->numTris;
+    for (int tri = node->firstTri; tri < onePastLast; ++tri) {
+        const bool isBelow = scene->triangles[tri].centroid[axis] < queryPos;
+        if (isBelow) {
+            ++belowCount;
+            belowBox = Union(belowBox, scene->triangles[tri].aabb);
+        }
+        else {
+            ++aboveCount;
+            aboveBox = Union(aboveBox, scene->triangles[tri].aabb);
+        }
+    }
+
+    // Triangle count times box surface area, summed over both sides
+    return belowCount * belowBox.surfaceArea() + aboveCount * aboveBox.surfaceArea();
+}
+
+// Tries every triangle centroid of the node as a split position on each of the three axes and
+// writes the cheapest candidate (by evalSAH) to `split` / `axis`. Both outputs keep their
+// incoming values when no candidate has a cost below INFINITY.
+void calculateSAHSplit(Scene* scene, BVHNode* node, float& split, int& axis)
+{
+    float lowestCost = INFINITY;
+    const int onePastLast = node->firstTri + node->numTris;
+
+    for (int candidateAxis = 0; candidateAxis < 3; ++candidateAxis) {
+        for (int tri = node->firstTri; tri < onePastLast; ++tri) {
+            const float candidatePos = scene->triangles[tri].centroid[candidateAxis];
+            const float candidateCost = evalSAH(scene, node, candidatePos, candidateAxis);
+            if (!(candidateCost < lowestCost)) {
+                continue;
+            }
+            lowestCost = candidateCost;
+            split = candidatePos;
+            axis = candidateAxis;
+        }
+    }
+}
+
+// Selects the split axis and position for a node. The strategy is fixed at compile time:
+// USE_BVH_MIDPOINT splits the longest axis of the centroid bounds at its middle, USE_BVH_SAH
+// defers to calculateSAHSplit. With neither enabled, split and axis are left untouched.
+void chooseSplit(Scene* scene, BVHNode* node, float& split, int& axis)
+{
+
+#if USE_BVH_MIDPOINT
+    // Bounding box around the centroids of the node's triangles
+    AABB centroidBounds = { glm::vec3{INFINITY, INFINITY, INFINITY}, glm::vec3{-INFINITY, -INFINITY, -INFINITY} };
+    const int onePastLast = node->firstTri + node->numTris;
+    for (int tri = node->firstTri; tri < onePastLast; ++tri) {
+        centroidBounds = Union(centroidBounds, scene->triangles[tri].centroid);
+    }
+    // Split the longest axis halfway between its bounds
+    axis = maxExtent(centroidBounds.max - centroidBounds.min);
+    split = (centroidBounds.min[axis] + centroidBounds.max[axis]) * 0.5f;
+
+#elif USE_BVH_SAH
+    calculateSAHSplit(scene, node, split, axis);
+#endif
+
+}
+
+// Recursively subdivides a BVH node. Nodes holding at most two triangles stay as they are.
+// Otherwise the node's triangle range is partitioned in place around the chosen split, two
+// child slots are taken from the global idx counter, and both children are subdivided in turn.
+// If the partition leaves one side empty the node is left unsplit.
+void addChildren(Scene* scene, BVHNode* node)
+{
+    if (node->numTris <= 2)
+    {
+        return;
+    }
+
+    // Choose split axis and position
+    float splitPos = 0.f;
+    int splitAxis = 0;
+    chooseSplit(scene, node, splitPos, splitAxis);
+
+    // In-place partition: centroids below the split stay in front, the rest are swapped to the back
+    int front = node->firstTri;
+    int back = node->firstTri + node->numTris - 1;
+    while (front <= back) {
+        if (!(scene->triangles[front].centroid[splitAxis] < splitPos)) {
+            std::swap(scene->triangles[front], scene->triangles[back]);
+            --back;
+            continue;
+        }
+        ++front;
+    }
+
+    // Give up when everything landed on a single side
+    const int frontCount = front - node->firstTri;
+    if (frontCount == 0 || frontCount == node->numTris) return;
+
+    // Take the next two free slots: left first, then right
+    const int leftIdx = idx++;
+    const int rightIdx = idx++;
+    node->left = leftIdx;
+    node->right = rightIdx;
+
+    BVHNode& leftNode = scene->bvh[leftIdx];
+    BVHNode& rightNode = scene->bvh[rightIdx];
+    leftNode.firstTri = node->firstTri;
+    leftNode.numTris = frontCount;
+    rightNode.firstTri = front;
+    rightNode.numTris = node->numTris - frontCount;
+    // The parent no longer owns triangles directly
+    node->numTris = 0;
+
+    updateBounds(scene, leftIdx);
+    updateBounds(scene, rightIdx);
+
+    addChildren(scene, &leftNode);
+    addChildren(scene, &rightNode);
+}
+
+// Builds the basic BVH over scene->triangles into scene->bvh.
+// The root lives in slot 0, takes its box from scene->mesh_aabbs[0] and initially owns every
+// triangle; addChildren() then splits it recursively.
+// Safe to call repeatedly: the node counter and the node array are reset on every build.
+// With no triangles the BVH is left empty (2 * size() - 1 would otherwise wrap around).
+// NOTE(review): child boxes are accumulated by updateBounds() from whatever a fresh BVHNode
+// holds in aabb -- confirm that the default is an empty (inverted) box.
+void generateBVH(Scene* scene)
+{
+    // Drop nodes left over from a previous build
+    scene->bvh.clear();
+    if (scene->triangles.empty()) {
+        return;
+    }
+
+    // Slot 0 is the root, so the first free slot for addChildren() is 1
+    idx = 1;
+
+    // A binary tree over N triangles needs at most 2N - 1 nodes
+    scene->bvh.resize(2 * scene->triangles.size() - 1);
+
+    // Initialize root node
+    BVHNode* root = &scene->bvh[0];
+    root->aabb = scene->mesh_aabbs[0];
+    root->firstTri = 0;
+    root->numTris = scene->triangles.size();
+
+    // Construct hierarchy
+    addChildren(scene, root);
+}
\ No newline at end of file
diff --git a/src/lbvh.h b/src/lbvh.h
new file mode 100644
index 0000000..8642e16
--- /dev/null
+++ b/src/lbvh.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "glm/glm.hpp"
+#include "utilities.h"
+#include "scene.h"
+#include "sceneStructs.h"
+
+class Scene;
+
+/// LBVH FUNCTIONS ///
+
+// Morton code generation and sorting
+unsigned int expand(unsigned int n);
+unsigned int mortonCode3D(const glm::vec3& centroid);
+void computeMortonCodes(Scene* scene, const AABB& sceneAABB);
+void sortMortonCodes(Scene* scene);
+
+// Tree building
+// These prototypes mirror the definitions in lbvh.cu (MortonCode arrays, not raw code arrays).
+bool isLeaf(const LBVHNode* node);
+int delta(MortonCode* sortedMCodes, int N, int i, int j);
+int sign(MortonCode* sortedMCodes, int N, int i);
+
+NodeRange determineRange(MortonCode* sortedMCodes, int triangleCount, int i);
+int findSplit(MortonCode* sortedMCodes, int triangleCount, NodeRange range);
+AABB assignBoundingBoxes(Scene* scene, LBVHNode* node);
+void buildLBVH(Scene* scene, int leafStart, int triangleCount, int meshNum);
+
+// Construct LBVH
+void generateLBVH(Scene* scene);
+
+/// BVH FUNCTIONS ///
+
+void updateBounds(Scene* scene, const int idx);
+int maxExtent(glm::vec3 extent);
+float evalSAH(Scene* scene, BVHNode* node, float queryPos, int axis);
+void calculateSAHSplit(Scene* scene, BVHNode* node, float& split, int& axis);
+void chooseSplit(Scene* scene, BVHNode* node, float& split, int& axis);
+void addChildren(Scene* scene, BVHNode* node);
+
+void generateBVH(Scene* scene);
diff --git a/src/libmorton/morton.h b/src/libmorton/morton.h
new file mode 100644
index 0000000..6084b52
--- /dev/null
+++ b/src/libmorton/morton.h
@@ -0,0 +1,106 @@
+#pragma once
+
+// This file will always contain inline functions which point to the fastest Morton encoding/decoding implementation
+// IF you just want to use the fastest method to encode/decode morton codes, include this header.
+
+// If you want to experiment with alternative methods (which might be slower, all depending on hardware / your data set)
+// check the individual headers below.
+
+#include "morton2D.h"
+#include "morton3D.h"
+
+#if defined(__BMI2__) || defined(__AVX2__)
+#include "morton_BMI.h"
+#elif defined(__AVX512BITALG__)
+#include "morton_AVX512BITALG.h"
+#endif
+
+// Front-end of libmorton: four encode and four decode entry points (2D/3D, 32/64-bit codes).
+// Each one forwards to a single implementation chosen at compile time:
+//   __BMI2__ or __AVX2__ defined -> the *_BMI functions
+//   __AVX512BITALG__ defined     -> the *_BITALG functions
+//   otherwise                    -> magicbits_combined (2D, 32-bit) or sLUT (everything else)
+// The forwarded-to functions come from the headers included above this namespace.
+namespace libmorton {
+ // Functions under this are stubs which will always point to fastest implementation at the moment
+ //-----------------------------------------------------------------------------------------------
+
+ // ENCODING
+ // Coordinates go in, the interleaved Morton code comes back as the return value.
+#if defined(__BMI2__) || defined(__AVX2__)
+ inline uint_fast32_t morton2D_32_encode(const uint_fast16_t x, const uint_fast16_t y) {
+ return m2D_e_BMI(x, y);
+ }
+ inline uint_fast64_t morton2D_64_encode(const uint_fast32_t x, const uint_fast32_t y) {
+ return m2D_e_BMI(x, y);
+ }
+ inline uint_fast32_t morton3D_32_encode(const uint_fast16_t x, const uint_fast16_t y, const uint_fast16_t z) {
+ return m3D_e_BMI(x, y, z);
+ }
+ inline uint_fast64_t morton3D_64_encode(const uint_fast32_t x, const uint_fast32_t y, const uint_fast32_t z) {
+ return m3D_e_BMI(x, y, z);
+ }
+#elif defined(__AVX512BITALG__)
+ inline uint_fast32_t morton2D_32_encode(const uint_fast16_t x, const uint_fast16_t y) {
+ return m2D_e_BITALG(x, y);
+ }
+ inline uint_fast64_t morton2D_64_encode(const uint_fast32_t x, const uint_fast32_t y) {
+ return m2D_e_BITALG(x, y);
+ }
+ inline uint_fast32_t morton3D_32_encode(const uint_fast16_t x, const uint_fast16_t y, const uint_fast16_t z) {
+ return m3D_e_BITALG(x, y, z);
+ }
+ inline uint_fast64_t morton3D_64_encode(const uint_fast32_t x, const uint_fast32_t y, const uint_fast32_t z) {
+ return m3D_e_BITALG(x, y, z);
+ }
+#else
+ // Fallback without special instruction sets
+ inline uint_fast32_t morton2D_32_encode(const uint_fast16_t x, const uint_fast16_t y) {
+ return m2D_e_magicbits_combined(x, y);
+ }
+ inline uint_fast64_t morton2D_64_encode(const uint_fast32_t x, const uint_fast32_t y) {
+ return m2D_e_sLUT(x, y);
+ }
+ inline uint_fast32_t morton3D_32_encode(const uint_fast16_t x, const uint_fast16_t y, const uint_fast16_t z) {
+ return m3D_e_sLUT(x, y, z);
+ }
+ inline uint_fast64_t morton3D_64_encode(const uint_fast32_t x, const uint_fast32_t y, const uint_fast32_t z) {
+ return m3D_e_sLUT(x, y, z);
+ }
+#endif
+
+ // DECODING
+ // The Morton code goes in, the coordinates are written to the reference parameters.
+
+#if defined(__BMI2__) || defined(__AVX2__)
+ inline void morton2D_32_decode(const uint_fast32_t morton, uint_fast16_t& x, uint_fast16_t& y) {
+ m2D_d_BMI(morton, x, y);
+ }
+ inline void morton2D_64_decode(const uint_fast64_t morton, uint_fast32_t& x, uint_fast32_t& y) {
+ m2D_d_BMI(morton, x, y);
+ }
+ inline void morton3D_32_decode(const uint_fast32_t morton, uint_fast16_t& x, uint_fast16_t& y, uint_fast16_t& z) {
+ m3D_d_BMI(morton, x, y, z);
+ }
+ inline void morton3D_64_decode(const uint_fast64_t morton, uint_fast32_t& x, uint_fast32_t& y, uint_fast32_t& z) {
+ m3D_d_BMI(morton, x, y, z);
+ }
+#elif defined(__AVX512BITALG__)
+ inline void morton2D_32_decode(const uint_fast32_t morton, uint_fast16_t& x, uint_fast16_t& y) {
+ m2D_d_BITALG(morton, x, y);
+ }
+ inline void morton2D_64_decode(const uint_fast64_t morton, uint_fast32_t& x, uint_fast32_t& y) {
+ m2D_d_BITALG(morton, x, y);
+ }
+ inline void morton3D_32_decode(const uint_fast32_t morton, uint_fast16_t& x, uint_fast16_t& y, uint_fast16_t& z) {
+ m3D_d_BITALG(morton, x, y, z);
+ }
+ inline void morton3D_64_decode(const uint_fast64_t morton, uint_fast32_t& x, uint_fast32_t& y, uint_fast32_t& z) {
+ m3D_d_BITALG(morton, x, y, z);
+}
+#else
+ // Fallback without special instruction sets
+ inline void morton2D_32_decode(const uint_fast32_t morton, uint_fast16_t& x, uint_fast16_t& y) {
+ m2D_d_magicbits_combined(morton, x, y);
+ }
+ inline void morton2D_64_decode(const uint_fast64_t morton, uint_fast32_t& x, uint_fast32_t& y) {
+ m2D_d_sLUT(morton, x, y);
+ }
+ inline void morton3D_32_decode(const uint_fast32_t morton, uint_fast16_t& x, uint_fast16_t& y, uint_fast16_t& z) {
+ m3D_d_sLUT(morton, x, y, z);
+ }
+ inline void morton3D_64_decode(const uint_fast64_t morton, uint_fast32_t& x, uint_fast32_t& y, uint_fast32_t& z) {
+ m3D_d_sLUT(morton, x, y, z);
+ }
+#endif
+}
\ No newline at end of file
diff --git a/src/libmorton/morton2D.h b/src/libmorton/morton2D.h
new file mode 100644
index 0000000..6631c1c
--- /dev/null
+++ b/src/libmorton/morton2D.h
@@ -0,0 +1,283 @@
+#pragma once
+
+// Libmorton - Methods to encode/decode 64-bit morton codes from/to 32-bit (x,y) coordinates
+// Warning: morton.h will always point to the functions that use the fastest available method.
+
+#include
+#include
+#include
+#include "morton2D_LUTs.h"
+#include "morton_common.h"
+
+#define EIGHTBITMASK (morton) 0x000000FF
+
+// 2D Morton (Z-order) encoding and decoding, implemented several ways: lookup tables (LUT and
+// pre-shifted sLUT), "magic bits" mask-and-shift, and plain bit-by-bit loops, some of them
+// with an early-termination (ET) variant that stops at the highest set bit.
+// NOTE(review): the `template` keywords and several casts in this copy appear without their
+// angle-bracket argument lists; restore them from the upstream header before compiling.
+namespace libmorton {
+
+ // Encode methods
+ template inline morton m2D_e_sLUT(const coord x, const coord y);
+ template inline morton m2D_e_sLUT_ET(const coord x, const coord y);
+ template inline morton m2D_e_LUT(const coord x, const coord y);
+ template inline morton m2D_e_LUT_ET(const coord x, const coord y);
+ template inline morton m2D_e_magicbits(const coord x, const coord y);
+ template inline morton m2D_e_for(const coord x, const coord y);
+ template inline morton m2D_e_for_ET(const coord x, const coord y);
+
+ // Decode methods
+ template inline void m2D_d_sLUT(const morton m, coord& x, coord& y);
+ template inline void m2D_d_sLUT_ET(const morton m, coord& x, coord& y);
+ template inline void m2D_d_LUT(const morton m, coord& x, coord& y);
+ template inline void m2D_d_LUT_ET(const morton m, coord& x, coord& y);
+ template inline void m2D_d_magicbits(const morton m, coord& x, coord& y);
+ template inline void m2D_d_for(const morton m, coord& x, coord& y);
+
+ // ENCODE 2D Morton code : Pre-shifted LookUpTable (sLUT)
+ // Processes the coordinates one byte at a time, most significant byte first; every byte
+ // pair contributes 16 bits of output, taken from the separate x and y tables.
+ template
+ inline morton m2D_e_sLUT(const coord x, const coord y) {
+ morton answer = 0;
+ for (unsigned int i = sizeof(coord); i > 0; --i) {
+ unsigned int shift = (i - 1) * 8;
+ answer =
+ answer << 16 |
+ Morton2D_encode_y_256[(y >> shift) & EIGHTBITMASK] |
+ Morton2D_encode_x_256[(x >> shift) & EIGHTBITMASK];
+ }
+ return answer;
+ }
+
+ // ENCODE 2D Morton code : LookUpTable (LUT)
+ // Same walk as the sLUT version, but only the x table is used; the y contribution is
+ // obtained by shifting the x-table entry left by one bit.
+ template
+ inline morton m2D_e_LUT(const coord x, const coord y) {
+ morton answer = 0;
+ for (unsigned int i = sizeof(coord); i > 0; --i) {
+ unsigned int shift = (i - 1) * 8;
+ answer =
+ answer << 16 |
+ (Morton2D_encode_x_256[(y >> shift) & EIGHTBITMASK] << morton(1)) |
+ (Morton2D_encode_x_256[(x >> shift) & EIGHTBITMASK]);
+ }
+ return answer;
+ }
+
+ // HELPER METHOD for Early Termination LUT Encode
+ // Encodes a single coordinate byte by byte, lowest byte first, and stops once the
+ // highest set bit (as reported by findFirstSetBit) has been covered. Returns 0 for c == 0.
+ template
+ inline morton compute2D_ET_LUT_encode(const coord c, const coord *LUT) {
+ unsigned long maxbit = 0;
+ if (findFirstSetBit(c, &maxbit) == 0) { return 0; }
+ morton answer = 0;
+ unsigned int i = 0;
+ while (maxbit >= i) {
+ answer |= ((morton)LUT[(c >> i) & EIGHTBITMASK]) << i * 2;
+ i += 8;
+ }
+ return answer;
+ }
+
+ // ENCODE 2D Morton code : Pre-shifted LUT (Early termination version)
+ // This version tries to terminate early when there are no more bits to process
+ // Figuring this out is probably too costly in most cases.
+ template
+ inline morton m2D_e_sLUT_ET(const coord x, const coord y) {
+ morton answer_x = compute2D_ET_LUT_encode(x, Morton2D_encode_x_256);
+ morton answer_y = compute2D_ET_LUT_encode(y, Morton2D_encode_y_256);
+ return answer_y | answer_x;
+ }
+
+ // ENCODE 2D Morton code : LUT (Early termination version)
+ template
+ inline morton m2D_e_LUT_ET(const coord x, const coord y) {
+ morton answer_x = compute2D_ET_LUT_encode(x, Morton2D_encode_x_256);
+ morton answer_y = compute2D_ET_LUT_encode(y, Morton2D_encode_x_256);
+ return (answer_y << 1) | answer_x;
+ }
+
+ // Magicbits masks (2D encode)
+ static uint_fast32_t magicbit2D_masks32[6] = { 0xFFFFFFFF, 0x0000FFFF, 0x00FF00FF, 0x0F0F0F0F, 0x33333333, 0x55555555 };
+ static uint_fast64_t magicbit2D_masks64[6] = { 0x00000000FFFFFFFF, 0x0000FFFF0000FFFF, 0x00FF00FF00FF00FF, 0x0F0F0F0F0F0F0F0F, 0x3333333333333333, 0x5555555555555555 };
+
+ // HELPER METHOD for Magic bits encoding - split by 2
+ // Spreads the bits of `a` apart with a shift-or-mask cascade; the 32-bit or 64-bit mask
+ // table is picked from sizeof(morton).
+ template
+ inline morton morton2D_SplitBy2Bits(const coord a) {
+ const morton* masks = (sizeof(morton) <= 4) ? reinterpret_cast(magicbit2D_masks32) : reinterpret_cast(magicbit2D_masks64);
+ morton x = a;
+ if (sizeof(morton) > 4) { x = (x | (uint_fast64_t)x << 32) & masks[0]; }
+ x = (x | x << 16) & masks[1];
+ x = (x | x << 8) & masks[2];
+ x = (x | x << 4) & masks[3];
+ x = (x | x << 2) & masks[4];
+ x = (x | x << 1) & masks[5];
+ return x;
+ }
+
+ // ENCODE 2D Morton code : Magic bits
+ template
+ inline morton m2D_e_magicbits(const coord x, const coord y) {
+ return morton2D_SplitBy2Bits(x) | (morton2D_SplitBy2Bits(y) << 1);
+ }
+
+ // ENCODE 2D 32-bit morton code - alternative version by JarkkoPFC - https://gist.github.com/JarkkoPFC/0e4e599320b0cc7ea92df45fb416d79a
+ // This uses the same technique as the magicbits method, but uses the upper part of a 64-bit type to split the y coordinate,
+ // the lower part to split the x coordinate, then merges them back together.
+ inline uint_fast32_t m2D_e_magicbits_combined(uint_fast16_t x, uint_fast16_t y) {
+ uint_fast64_t m = x | (uint_fast64_t(y) << 32); // put Y in upper 32 bits, X in lower 32 bits
+ m = (m | (m << 8)) & magicbit2D_masks64[2];
+ m = (m | (m << 4)) & magicbit2D_masks64[3];
+ m = (m | (m << 2)) & magicbit2D_masks64[4];
+ m = (m | (m << 1)) & magicbit2D_masks64[5];
+ m = m | (m >> 31); // merge X and Y back together
+ // hard cut off to 32 bits, because on some systems uint_fast32_t will be a 64-bit type, and we don't want to retain split Y-version in the upper 32 bits.
+ m = m & 0x00000000FFFFFFFF;
+ return uint_fast32_t(m);
+ }
+
+ // ENCODE 2D Morton code : For Loop
+ // Moves bit i of x to output bit 2*i and bit i of y to output bit 2*i + 1, one bit per pass.
+ template
+ inline morton m2D_e_for(const coord x, const coord y) {
+ morton answer = 0;
+ unsigned int checkbits = (unsigned int)floor(sizeof(morton) * 4.0f);
+ for (unsigned int i = 0; i < checkbits; ++i) {
+ morton mshifted = static_cast(0x1) << i; // Here we need to cast 0x1 to 64bits, otherwise there is a bug when morton code is larger than 32 bits
+ unsigned int shift = i; // because you have to shift back i and forth 2*i
+ answer |=
+ ((x & mshifted) << shift)
+ | ((y & mshifted) << (shift + 1));
+ }
+ return answer;
+ }
+
+ // ENCODE 2D Morton code : For Loop (Early termination version)
+ // Same as m2D_e_for, but the loop stops after the highest set bit of either coordinate.
+ template
+ inline morton m2D_e_for_ET(const coord x, const coord y) {
+ morton answer = 0;
+ unsigned long x_max = 0, y_max = 0;
+ unsigned int checkbits = sizeof(morton) * 4;
+ findFirstSetBit(x, &x_max);
+ findFirstSetBit(y, &y_max);
+ checkbits = std::min(static_cast(checkbits), std::max(x_max, y_max) + 1ul);
+ for (unsigned int i = 0; i < checkbits; ++i) {
+ morton m_shifted = static_cast(0x1) << i; // Here we need to cast 0x1 to 64bits, otherwise there is a bug when morton code is larger than 32 bits
+ unsigned int shift = i;
+ answer |= ((x & m_shifted) << shift)
+ | ((y & m_shifted) << (shift + 1));
+ }
+ return answer;
+ }
+
+ // HELPER METHOD for LUT decoding
+ // Rebuilds one coordinate: each byte of the code (read from bit offset startshift) yields
+ // four coordinate bits via the table, placed at bit 4*i.
+ template
+ inline coord morton2D_DecodeCoord_LUT256(const morton m, const uint_fast8_t *LUT, const unsigned int startshift) {
+ morton a = 0;
+ unsigned int loops = sizeof(morton);
+ for (unsigned int i = 0; i < loops; ++i) {
+ a |= ((morton)LUT[(m >> ((i * 8) + startshift)) & EIGHTBITMASK] << (4 * i));
+ }
+ return static_cast(a);
+ }
+
+ // DECODE 2D Morton code : Shifted LUT
+ template
+ inline void m2D_d_sLUT(const morton m, coord& x, coord& y) {
+ x = morton2D_DecodeCoord_LUT256(m, Morton2D_decode_x_256, 0);
+ y = morton2D_DecodeCoord_LUT256(m, Morton2D_decode_y_256, 0);
+ }
+
+ // DECODE 2D 64-bit morton code : LUT
+ // Uses the x table for both coordinates; y is read with the code shifted right by one bit.
+ template
+ inline void m2D_d_LUT(const morton m, coord& x, coord& y) {
+ x = morton2D_DecodeCoord_LUT256(m, Morton2D_decode_x_256, 0);
+ y = morton2D_DecodeCoord_LUT256(m, Morton2D_decode_x_256, 1);
+ }
+
+ // DECODE 2D Morton code : Shifted LUT (early termination)
+ // Leaves x = y = 0 when findFirstSetBit reports no set bit in m.
+ template
+ inline void m2D_d_sLUT_ET(const morton m, coord& x, coord& y) {
+ x = 0; y = 0;
+ unsigned long firstbit_location = 0;
+ if (!findFirstSetBit(m, &firstbit_location)) { return; }
+ unsigned int i = 0;
+ unsigned int shiftback = 0;
+ while (firstbit_location > i) {
+ morton m_shifted = (m >> i) & EIGHTBITMASK;
+ x |= (coord)Morton2D_decode_x_256[m_shifted] << shiftback;
+ y |= (coord)Morton2D_decode_y_256[m_shifted] << shiftback;
+ shiftback += 4;
+ i += 8;
+ }
+ }
+
+ // DECODE 2D Morton code : LUT (early termination)
+ template
+ inline void m2D_d_LUT_ET(const morton m, coord& x, coord& y) {
+ x = 0; y = 0;
+ unsigned long firstbit_location = 0;
+ if (!findFirstSetBit(m, &firstbit_location)) { return; }
+ unsigned int i = 0;
+ unsigned int shiftback = 0;
+ while (firstbit_location > i) {
+ x |= (coord)Morton2D_decode_x_256[(m >> i) & EIGHTBITMASK] << shiftback;
+ y |= (coord)Morton2D_decode_x_256[(m >> (i + 1)) & EIGHTBITMASK] << shiftback;
+ shiftback += 4;
+ i += 8;
+ }
+ }
+
+ // HELPER method for Magicbits decoding
+ // Inverse of the split: keeps every second bit of m and compacts them with the same
+ // masks applied in reverse order.
+ template
+ static inline coord morton2D_GetSecondBits(const morton m) {
+ morton* masks = (sizeof(morton) <= 4) ? reinterpret_cast(magicbit2D_masks32) : reinterpret_cast(magicbit2D_masks64);
+ morton x = m & masks[5];
+ x = (x ^ (x >> 1)) & masks[4];
+ x = (x ^ (x >> 2)) & masks[3];
+ x = (x ^ (x >> 4)) & masks[2];
+ x = (x ^ (x >> 8)) & masks[1];
+ if (sizeof(morton) > 4) x = (x ^ (x >> 16)) & masks[0];
+ return static_cast(x);
+ }
+
+ // DECODE 2D Morton code : Magic bits
+ // This method splits the morton codes bits by using certain patterns (magic bits)
+ template
+ inline void m2D_d_magicbits(const morton m, coord& x, coord& y) {
+ x = morton2D_GetSecondBits(m);
+ y = morton2D_GetSecondBits(m >> 1);
+ }
+
+ // DECODE 2D 32-bit morton code - alternative version by JarkkoPFC - https://gist.github.com/JarkkoPFC/0e4e599320b0cc7ea92df45fb416d79a
+ // Copies the code into the upper half of a 64-bit value (shifted by 31 so the y bits line
+ // up with the x bits), compacts both halves at once, then reads x from the low half and y
+ // from the high half.
+ inline void m2D_d_magicbits_combined(const uint_fast32_t morton, uint_fast16_t& x, uint_fast16_t& y) {
+ uint_fast64_t res = (morton | (uint_fast64_t(morton) << 31)) & magicbit2D_masks64[5];
+ res = (res | (res >> 1)) & magicbit2D_masks64[4];
+ res = (res | (res >> 2)) & magicbit2D_masks64[3];
+ res = (res | (res >> 4)) & magicbit2D_masks64[2];
+ res = res | (res >> 8);
+ x = uint_fast16_t(res) & 0xFFFF;
+ y = (uint_fast16_t(res >> 32)) & 0xFFFF;
+ }
+
+ // DECODE 2D morton code : For loop
+ // NOTE(review): the loop bound is inclusive, so on the last pass (i == checkbits)
+ // shift_selector equals 8 * sizeof(morton), the full bit width of morton. Shifting by the
+ // full width is undefined behaviour in C++; `i < checkbits` looks like the intended bound.
+ template
+ inline void m2D_d_for(const morton m, coord& x, coord& y) {
+ x = 0; y = 0;
+ unsigned int checkbits = sizeof(morton) * 4;
+ for (unsigned int i = 0; i <= checkbits; ++i) {
+ morton selector = 1;
+ unsigned int shift_selector = 2 * i;
+ x |= (m & (selector << shift_selector)) >> i;
+ y |= (m & (selector << (shift_selector + 1))) >> (i + 1);
+ }
+ }
+
+ // DECODE 2D Morton code : For loop (Early termination version)
+ // Only walks up to half the position of the highest set bit in m; leaves x = y = 0 when
+ // findFirstSetBit reports no set bit.
+ template
+ inline void m2D_d_for_ET(const morton m, coord& x, coord& y) {
+ x = 0; y = 0;
+ unsigned long firstbit_location = 0;
+ if (!findFirstSetBit(m, &firstbit_location)) return;
+ float defaultbits = sizeof(morton) * 4;
+ unsigned int checkbits = static_cast(std::min(defaultbits, firstbit_location / 2.0f));
+ for (unsigned int i = 0; i <= checkbits; ++i) {
+ morton selector = 1;
+ unsigned int shift_selector = 2 * i;
+ x |= (m & (selector << shift_selector)) >> i;
+ y |= (m & (selector << (shift_selector + 1))) >> (i + 1);
+ }
+ }
+}
diff --git a/src/libmorton/morton2D_LUTs.h b/src/libmorton/morton2D_LUTs.h
new file mode 100644
index 0000000..730eb02
--- /dev/null
+++ b/src/libmorton/morton2D_LUTs.h
@@ -0,0 +1,120 @@
+#pragma once
+
+#include
+
+namespace libmorton {
+
+ // LUT for Morton2D encode X
+ static const uint_fast16_t Morton2D_encode_x_256[256] =
+ {
+ 0, 1, 4, 5, 16, 17, 20, 21,
+ 64, 65, 68, 69, 80, 81, 84, 85,
+ 256, 257, 260, 261, 272, 273, 276, 277,
+ 320, 321, 324, 325, 336, 337, 340, 341,
+ 1024, 1025, 1028, 1029, 1040, 1041, 1044, 1045,
+ 1088, 1089, 1092, 1093, 1104, 1105, 1108, 1109,
+ 1280, 1281, 1284, 1285, 1296, 1297, 1300, 1301,
+ 1344, 1345, 1348, 1349, 1360, 1361, 1364, 1365,
+ 4096, 4097, 4100, 4101, 4112, 4113, 4116, 4117,
+ 4160, 4161, 4164, 4165, 4176, 4177, 4180, 4181,
+ 4352, 4353, 4356, 4357, 4368, 4369, 4372, 4373,
+ 4416, 4417, 4420, 4421, 4432, 4433, 4436, 4437,
+ 5120, 5121, 5124, 5125, 5136, 5137, 5140, 5141,
+ 5184, 5185, 5188, 5189, 5200, 5201, 5204, 5205,
+ 5376, 5377, 5380, 5381, 5392, 5393, 5396, 5397,
+ 5440, 5441, 5444, 5445, 5456, 5457, 5460, 5461,
+ 16384, 16385, 16388, 16389, 16400, 16401, 16404, 16405,
+ 16448, 16449, 16452, 16453, 16464, 16465, 16468, 16469,
+ 16640, 16641, 16644, 16645, 16656, 16657, 16660, 16661,
+ 16704, 16705, 16708, 16709, 16720, 16721, 16724, 16725,
+ 17408, 17409, 17412, 17413, 17424, 17425, 17428, 17429,
+ 17472, 17473, 17476, 17477, 17488, 17489, 17492, 17493,
+ 17664, 17665, 17668, 17669, 17680, 17681, 17684, 17685,
+ 17728, 17729, 17732, 17733, 17744, 17745, 17748, 17749,
+ 20480, 20481, 20484, 20485, 20496, 20497, 20500, 20501,
+ 20544, 20545, 20548, 20549, 20560, 20561, 20564, 20565,
+ 20736, 20737, 20740, 20741, 20752, 20753, 20756, 20757,
+ 20800, 20801, 20804, 20805, 20816, 20817, 20820, 20821,
+ 21504, 21505, 21508, 21509, 21520, 21521, 21524, 21525,
+ 21568, 21569, 21572, 21573, 21584, 21585, 21588, 21589,
+ 21760, 21761, 21764, 21765, 21776, 21777, 21780, 21781,
+ 21824, 21825, 21828, 21829, 21840, 21841, 21844, 21845
+ };
+
+ // LUT for Morton2D encode Y
+ static const uint_fast16_t Morton2D_encode_y_256[256] =
+ {
+ 0, 2, 8, 10, 32, 34, 40, 42,
+ 128, 130, 136, 138, 160, 162, 168, 170,
+ 512, 514, 520, 522, 544, 546, 552, 554,
+ 640, 642, 648, 650, 672, 674, 680, 682,
+ 2048, 2050, 2056, 2058, 2080, 2082, 2088, 2090,
+ 2176, 2178, 2184, 2186, 2208, 2210, 2216, 2218,
+ 2560, 2562, 2568, 2570, 2592, 2594, 2600, 2602,
+ 2688, 2690, 2696, 2698, 2720, 2722, 2728, 2730,
+ 8192, 8194, 8200, 8202, 8224, 8226, 8232, 8234,
+ 8320, 8322, 8328, 8330, 8352, 8354, 8360, 8362,
+ 8704, 8706, 8712, 8714, 8736, 8738, 8744, 8746,
+ 8832, 8834, 8840, 8842, 8864, 8866, 8872, 8874,
+ 10240, 10242, 10248, 10250, 10272, 10274, 10280, 10282,
+ 10368, 10370, 10376, 10378, 10400, 10402, 10408, 10410,
+ 10752, 10754, 10760, 10762, 10784, 10786, 10792, 10794,
+ 10880, 10882, 10888, 10890, 10912, 10914, 10920, 10922,
+ 32768, 32770, 32776, 32778, 32800, 32802, 32808, 32810,
+ 32896, 32898, 32904, 32906, 32928, 32930, 32936, 32938,
+ 33280, 33282, 33288, 33290, 33312, 33314, 33320, 33322,
+ 33408, 33410, 33416, 33418, 33440, 33442, 33448, 33450,
+ 34816, 34818, 34824, 34826, 34848, 34850, 34856, 34858,
+ 34944, 34946, 34952, 34954, 34976, 34978, 34984, 34986,
+ 35328, 35330, 35336, 35338, 35360, 35362, 35368, 35370,
+ 35456, 35458, 35464, 35466, 35488, 35490, 35496, 35498,
+ 40960, 40962, 40968, 40970, 40992, 40994, 41000, 41002,
+ 41088, 41090, 41096, 41098, 41120, 41122, 41128, 41130,
+ 41472, 41474, 41480, 41482, 41504, 41506, 41512, 41514,
+ 41600, 41602, 41608, 41610, 41632, 41634, 41640, 41642,
+ 43008, 43010, 43016, 43018, 43040, 43042, 43048, 43050,
+ 43136, 43138, 43144, 43146, 43168, 43170, 43176, 43178,
+ 43520, 43522, 43528, 43530, 43552, 43554, 43560, 43562,
+ 43648, 43650, 43656, 43658, 43680, 43682, 43688, 43690
+ };
+
+ // LUT for Morton2D decode X
+ static const uint_fast8_t Morton2D_decode_x_256[256] = {
+ 0,1,0,1,2,3,2,3,0,1,0,1,2,3,2,3,
+ 4,5,4,5,6,7,6,7,4,5,4,5,6,7,6,7,
+ 0,1,0,1,2,3,2,3,0,1,0,1,2,3,2,3,
+ 4,5,4,5,6,7,6,7,4,5,4,5,6,7,6,7,
+ 8,9,8,9,10,11,10,11,8,9,8,9,10,11,10,11,
+ 12,13,12,13,14,15,14,15,12,13,12,13,14,15,14,15,
+ 8,9,8,9,10,11,10,11,8,9,8,9,10,11,10,11,
+ 12,13,12,13,14,15,14,15,12,13,12,13,14,15,14,15,
+ 0,1,0,1,2,3,2,3,0,1,0,1,2,3,2,3,
+ 4,5,4,5,6,7,6,7,4,5,4,5,6,7,6,7,
+ 0,1,0,1,2,3,2,3,0,1,0,1,2,3,2,3,
+ 4,5,4,5,6,7,6,7,4,5,4,5,6,7,6,7,
+ 8,9,8,9,10,11,10,11,8,9,8,9,10,11,10,11,
+ 12,13,12,13,14,15,14,15,12,13,12,13,14,15,14,15,
+ 8,9,8,9,10,11,10,11,8,9,8,9,10,11,10,11,
+ 12,13,12,13,14,15,14,15,12,13,12,13,14,15,14,15
+ };
+
+ // LUT for Morton2D decode Y
+ static const uint_fast8_t Morton2D_decode_y_256[256] = {
+ 0,0,1,1,0,0,1,1,2,2,3,3,2,2,3,3,
+ 0,0,1,1,0,0,1,1,2,2,3,3,2,2,3,3,
+ 4,4,5,5,4,4,5,5,6,6,7,7,6,6,7,7,
+ 4,4,5,5,4,4,5,5,6,6,7,7,6,6,7,7,
+ 0,0,1,1,0,0,1,1,2,2,3,3,2,2,3,3,
+ 0,0,1,1,0,0,1,1,2,2,3,3,2,2,3,3,
+ 4,4,5,5,4,4,5,5,6,6,7,7,6,6,7,7,
+ 4,4,5,5,4,4,5,5,6,6,7,7,6,6,7,7,
+ 8,8,9,9,8,8,9,9,10,10,11,11,10,10,11,11,
+ 8,8,9,9,8,8,9,9,10,10,11,11,10,10,11,11,
+ 12,12,13,13,12,12,13,13,14,14,15,15,14,14,15,15,
+ 12,12,13,13,12,12,13,13,14,14,15,15,14,14,15,15,
+ 8,8,9,9,8,8,9,9,10,10,11,11,10,10,11,11,
+ 8,8,9,9,8,8,9,9,10,10,11,11,10,10,11,11,
+ 12,12,13,13,12,12,13,13,14,14,15,15,14,14,15,15,
+ 12,12,13,13,12,12,13,13,14,14,15,15,14,14,15,15
+ };
+}
diff --git a/src/libmorton/morton3D.h b/src/libmorton/morton3D.h
new file mode 100644
index 0000000..81e88f8
--- /dev/null
+++ b/src/libmorton/morton3D.h
@@ -0,0 +1,283 @@
+#pragma once
+
+// Libmorton - Methods to encode/decode 64-bit morton codes from/to 32-bit (x,y,z) coordinates
+// Warning: morton.h will always point to the functions that use the fastest available method.
+
+#include
+#include
+#include
+#include "morton3D_LUTs.h"
+#include "morton_common.h"
+
+#define EIGHTBITMASK (morton) 0x000000FF // low 8 bits; the expansion casts through a type named morton, so it only works where that name is in scope
+#define NINEBITMASK (morton) 0x000001FF // low 9 bits; same dependency on a type named morton as EIGHTBITMASK
+
+namespace libmorton {
+ // AVAILABLE METHODS FOR ENCODING (x, y, z) -> morton. NOTE(review): every declaration reads as a bare "template" with no parameter list - presumably lost in extraction; confirm against the upstream header.
+ template inline morton m3D_e_sLUT(const coord x, const coord y, const coord z); // per-axis pre-shifted lookup tables
+ template inline morton m3D_e_sLUT_ET(const coord x, const coord y, const coord z); // pre-shifted tables, early termination
+ template inline morton m3D_e_LUT(const coord x, const coord y, const coord z); // X table only, y/z results shifted by 1/2
+ template inline morton m3D_e_LUT_ET(const coord x, const coord y, const coord z); // X table only, early termination
+ template inline morton m3D_e_magicbits(const coord x, const coord y, const coord z); // shift-and-mask bit spreading
+ template inline morton m3D_e_for(const coord x, const coord y, const coord z); // bit-by-bit loop
+ template inline morton m3D_e_for_ET(const coord x, const coord y, const coord z); // bit-by-bit loop, early termination
+
+ // AVAILABLE METHODS FOR DECODING morton -> (x, y, z); results are written through the reference parameters
+ template inline void m3D_d_sLUT(const morton m, coord& x, coord& y, coord& z); // per-axis 512-entry decode tables
+ template inline void m3D_d_sLUT_ET(const morton m, coord& x, coord& y, coord& z); // per-axis tables, early termination
+ template inline void m3D_d_LUT(const morton m, coord& x, coord& y, coord& z); // X decode table only, input pre-shifted by 0/1/2
+ template inline void m3D_d_LUT_ET(const morton m, coord& x, coord& y, coord& z); // X decode table only, early termination
+ template inline void m3D_d_magicbits(const morton m, coord& x, coord& y, coord& z); // shift-and-mask bit compaction
+ template inline void m3D_d_for(const morton m, coord& x, coord& y, coord& z); // bit-by-bit loop
+ template inline void m3D_d_for_ET(const morton m, coord& x, coord& y, coord& z); // bit-by-bit loop, early termination
+
+ // ENCODE 3D Morton code : Pre-Shifted LookUpTable (sLUT) - walks the coordinates one byte at a time, most significant byte first, OR-ing the three per-axis tables whose entries already sit at their x/y/z bit positions
+ template
+ inline morton m3D_e_sLUT(const coord x, const coord y, const coord z) {
+ morton answer = 0;
+ for (unsigned int i = sizeof(coord); i > 0; --i) { // i counts bytes of coord from the top down to 1
+ unsigned int shift = (i - 1) * 8; // bit offset of the byte handled in this iteration
+ answer =
+ answer << 24 | // make room for the next 8 bits per axis (3 * 8 = 24 output bits)
+ (Morton3D_encode_z_256[(z >> shift) & EIGHTBITMASK] |
+ Morton3D_encode_y_256[(y >> shift) & EIGHTBITMASK] |
+ Morton3D_encode_x_256[(x >> shift) & EIGHTBITMASK]);
+ }
+ return answer;
+ }
+
+ // ENCODE 3D Morton code : LookUpTable (LUT) - same byte-wise walk as the sLUT variant, but only the X table is read; the y and z lookups are shifted left by 1 and 2 to reach their bit positions
+ template
+ inline morton m3D_e_LUT(const coord x, const coord y, const coord z) {
+ morton answer = 0;
+ for (unsigned int i = sizeof(coord); i > 0; --i) { // i counts bytes of coord from the top down to 1
+ unsigned int shift = (i - 1) * 8; // bit offset of the byte handled in this iteration
+ answer =
+ answer << 24 | // make room for the next 8 bits per axis (3 * 8 = 24 output bits)
+ (Morton3D_encode_x_256[(z >> shift) & EIGHTBITMASK] << morton(2)) | // z bits land two positions above the x pattern
+ (Morton3D_encode_x_256[(y >> shift) & EIGHTBITMASK] << morton(1)) | // y bits land one position above the x pattern
+ Morton3D_encode_x_256[(x >> shift) & EIGHTBITMASK];
+ }
+ return answer;
+ }
+
+ // HELPER METHOD for ET LUT encode: expands one coordinate c through the supplied 256-entry LUT, visiting only bytes up to the position reported by findFirstSetBit
+ template
+ inline morton compute3D_ET_LUT_encode(const coord c, const coord *LUT) {
+ unsigned long maxbit = 0;
+ if (findFirstSetBit(c, &maxbit) == 0) { return 0; } // helper reported nothing set (presumably c == 0; helper is defined outside this file), so the code is 0
+ morton answer = 0;
+ for (int i = (int)ceil((maxbit + 1) / 8.0f); i >= 0; --i) { // NOTE(review): the start index is one above the byte holding bit maxbit; if it reaches sizeof(coord) the shift below equals the full width of coord - confirm this is safe
+ unsigned int shift = i * 8; // bit offset of byte i within c
+ answer = answer << 24 | (LUT[(c >> shift) & EIGHTBITMASK]); // 24 = 8 input bits * 3 output bits each
+ }
+ return answer;
+ }
+
+ // ENCODE 3D Morton code : Pre-shifted LookUpTable (LUT) (Early Termination version)
+ // Each axis is expanded separately by compute3D_ET_LUT_encode with its own pre-shifted table, so each call can stop at that axis' highest set bit.
+ // Figuring this out is probably too costly in most cases.
+ template
+ inline morton m3D_e_sLUT_ET(const coord x, const coord y, const coord z) {
+ morton answer_x = compute3D_ET_LUT_encode(x, Morton3D_encode_x_256);
+ morton answer_y = compute3D_ET_LUT_encode(y, Morton3D_encode_y_256);
+ morton answer_z = compute3D_ET_LUT_encode(z, Morton3D_encode_z_256);
+ return answer_z | answer_y | answer_x; // tables are pre-shifted, so the three partial codes can be OR-ed directly
+ }
+
+ // ENCODE 3D Morton code : LookUpTable (LUT) (Early termination version)
+ // Each axis is expanded separately by compute3D_ET_LUT_encode using the X table only; y and z are moved into place afterwards.
+ // Figuring this out is probably too costly in most cases.
+ template
+ inline morton m3D_e_LUT_ET(const coord x, const coord y, const coord z) {
+ morton answer_x = compute3D_ET_LUT_encode(x, Morton3D_encode_x_256);
+ morton answer_y = compute3D_ET_LUT_encode(y, Morton3D_encode_x_256);
+ morton answer_z = compute3D_ET_LUT_encode(z, Morton3D_encode_x_256);
+ return (answer_z << 2) | (answer_y << 1) | answer_x; // shift the x-pattern results to the z (+2) and y (+1) bit positions
+ }
+
+ // Magicbits masks (3D encode), indexed by morton3D_SplitBy3bits: [0] selects the input bits, [1] follows the 32-bit shift (64-bit case only), [2]..[5] follow the 16/8/4/2-bit shifts
+ static uint_fast32_t magicbit3D_masks32_encode[6] = { 0x000003ff, 0, 0x30000ff, 0x0300f00f, 0x30c30c3, 0x9249249 }; // we add a 0 on position 1 in this array to use same code for 32-bit and 64-bit cases. NOTE(review): morton3D_SplitBy3bits reinterprets this array through a morton pointer; that only lines up element-for-element if sizeof(uint_fast32_t) == sizeof(morton) - confirm on each target platform
+ static uint_fast64_t magicbit3D_masks64_encode[6] = { 0x1fffff, 0x1f00000000ffff, 0x1f0000ff0000ff, 0x100f00f00f00f00f, 0x10c30c30c30c30c3, 0x1249249249249249 }; // 64-bit variant: [0] keeps the low 21 input bits
+
+ // HELPER METHOD: Magic bits encoding - spreads the low bits of a so that consecutive input bits end up three positions apart (at the positions set in masks[5])
+ template
+ static inline morton morton3D_SplitBy3bits(const coord a) {
+ const morton* masks = (sizeof(morton) <= 4) ? reinterpret_cast(magicbit3D_masks32_encode) : reinterpret_cast(magicbit3D_masks64_encode); // pick the mask table by the byte size of morton
+ morton x = ((morton)a) & masks[0]; // keep only the input bits covered by masks[0]
+ if (sizeof(morton) == 8) { x = (x | (uint_fast64_t)x << 32) & masks[1]; } // 64-bit case only: extra spreading step using masks[1]
+ x = (x | x << 16) & masks[2]; // each step ORs in a shifted copy and masks, moving bit groups further apart
+ x = (x | x << 8) & masks[3];
+ x = (x | x << 4) & masks[4];
+ x = (x | x << 2) & masks[5]; // final mask leaves bits only at positions 0, 3, 6, ...
+ return x;
+ }
+
+ // ENCODE 3D Morton code : Magic bits method
+ // Spreads each coordinate with morton3D_SplitBy3bits, then offsets y by 1 bit and z by 2 bits before OR-ing the three results together
+ template
+ inline morton m3D_e_magicbits(const coord x, const coord y, const coord z) {
+ return morton3D_SplitBy3bits(x) | (morton3D_SplitBy3bits(y) << 1) | (morton3D_SplitBy3bits(z) << 2);
+ }
+
+ // ENCODE 3D Morton code : For loop
+ // Naive reference encoder: moves bit i of x, y, z to output bits 3*i, 3*i + 1, 3*i + 2, one bit per iteration
+ template
+ inline morton m3D_e_for(const coord x, const coord y, const coord z) {
+ morton answer = 0;
+ unsigned int checkbits = (sizeof(morton) * 8) / 3; // number of bits per coordinate examined (bit width of morton divided by 3)
+ for (unsigned int i = 0; i < checkbits; ++i) {
+ morton mshifted = static_cast(1) << i; // Here we need to cast 0x1 to 64bits, otherwise there is a bug when morton code is larger than 32 bits
+ unsigned int shift = 2 * i; // the selected bit already sits at position i, so 2*i more brings it to 3*i
+ answer |= ((x & mshifted) << shift)
+ | ((y & mshifted) << (shift + 1))
+ | ((z & mshifted) << (shift + 2));
+ }
+ return answer;
+ }
+
+ // ENCODE 3D Morton code : For loop (Early termination version)
+ // Same bit-by-bit loop as m3D_e_for, but the iteration count is capped using the positions findFirstSetBit reports for x, y and z.
+ template
+ inline morton m3D_e_for_ET(const coord x, const coord y, const coord z) {
+ morton answer = 0;
+ unsigned long x_max = 0, y_max = 0, z_max = 0; // stay 0 if the helper does not write them; its return value is not checked here
+ unsigned int checkbits = (sizeof(morton) * 8) / 3; // upper bound: bit width of morton divided by 3
+ findFirstSetBit(x, &x_max);
+ findFirstSetBit(y, &y_max);
+ findFirstSetBit(z, &z_max);
+ checkbits = std::min((unsigned long)checkbits, std::max(z_max, std::max(x_max, y_max)) + (unsigned long)1); // loop only up to the largest reported position + 1, never beyond the default bound
+ for (unsigned int i = 0; i < checkbits; ++i) {
+ morton m_shifted = static_cast(1) << i; // Here we need to cast 0x1 to 64bits, otherwise there is a bug when morton code is larger than 32 bits
+ unsigned int shift = 2 * i; // the selected bit already sits at position i, so 2*i more brings it to 3*i
+ answer |= ((x & m_shifted) << shift)
+ | ((y & m_shifted) << (shift + 1))
+ | ((z & m_shifted) << (shift + 2));
+ }
+ return answer;
+ }
+
+ // HELPER METHOD for LUT decoding: extracts one coordinate by reading m in 9-bit chunks (starting at bit startshift) and mapping each chunk to 3 coordinate bits through LUT. The index is masked with NINEBITMASK, so LUT needs 512 entries despite the 256 in the name.
+ // todo: wouldn't this be better with 8-bit aligned decode LUT?
+ template
+ inline coord morton3D_DecodeCoord_LUT256(const morton m, const uint_fast8_t *LUT, const unsigned int startshift) {
+ morton a = 0;
+ unsigned int loops = (sizeof(morton) <= 4) ? 4 : 7; // number of 9-bit chunks read. NOTE(review): 7 * 9 = 63, so with startshift 0 bit 63 of a 64-bit code is never read - confirm this is intended
+ for (unsigned int i = 0; i < loops; ++i) {
+ a |= (morton)(LUT[(m >> ((i * 9) + startshift)) & NINEBITMASK] << morton(3 * i)); // chunk i contributes coordinate bits 3*i .. 3*i + 2
+ }
+ return static_cast(a);
+ }
+
+ // DECODE 3D Morton code : Shifted LUT - one morton3D_DecodeCoord_LUT256 pass per axis, each with its own decode table and no input shift
+ template
+ inline void m3D_d_sLUT(const morton m, coord& x, coord& y, coord& z) {
+ x = morton3D_DecodeCoord_LUT256(m, Morton3D_decode_x_512, 0);
+ y = morton3D_DecodeCoord_LUT256(m, Morton3D_decode_y_512, 0);
+ z = morton3D_DecodeCoord_LUT256(m, Morton3D_decode_z_512, 0);
+ }
+
+ // DECODE 3D Morton code : LUT - reuses the X decode table for every axis by starting the read 0, 1 or 2 bits into m
+ template
+ inline void m3D_d_LUT(const morton m, coord& x, coord& y, coord& z) {
+ x = morton3D_DecodeCoord_LUT256(m, Morton3D_decode_x_512, 0);
+ y = morton3D_DecodeCoord_LUT256(m, Morton3D_decode_x_512, 1); // startshift 1 lines the y bits up with the x pattern
+ z = morton3D_DecodeCoord_LUT256(m, Morton3D_decode_x_512, 2); // startshift 2 lines the z bits up with the x pattern
+ }
+
+ // DECODE 3D Morton code : Shifted LUT (Early termination version) - reads m in 9-bit chunks from the low end and stops once the chunk offset reaches the position reported by findFirstSetBit
+ template
+ inline void m3D_d_sLUT_ET(const morton m, coord& x, coord& y, coord& z) {
+ x = 0; y = 0; z = 0;
+ unsigned long firstbit_location = 0;
+ if (!findFirstSetBit(m, &firstbit_location)) { return; } // helper reported nothing set: outputs stay 0
+ unsigned int i = 0; // bit offset of the current 9-bit chunk within m
+ unsigned int shiftback = 0; // bit offset of the matching 3-bit group within each coordinate
+ while (firstbit_location > i) { // NOTE(review): if the helper reports a 0-based index, a highest set bit sitting exactly on a chunk boundary (0, 9, 18, ...) is never decoded - confirm its convention
+ morton m_shifted = (m >> i) & NINEBITMASK; // current 9-bit chunk, used as index into all three tables
+ x |= (coord)Morton3D_decode_x_512[m_shifted] << shiftback;
+ y |= (coord)Morton3D_decode_y_512[m_shifted] << shiftback;
+ z |= (coord)Morton3D_decode_z_512[m_shifted] << shiftback;
+ shiftback += 3; // each chunk yields 3 bits per coordinate
+ i += 9;
+ }
+ return;
+ }
+
+ // DECODE 3D Morton code : LUT (Early termination version) - like the shifted-LUT ET decoder, but reads only the X table and offsets the chunk by 1 (y) or 2 (z) bits
+ template
+ inline void m3D_d_LUT_ET(const morton m, coord& x, coord& y, coord& z) {
+ x = 0; y = 0; z = 0;
+ unsigned long firstbit_location = 0;
+ if (!findFirstSetBit(m, &firstbit_location)) { return; } // helper reported nothing set: outputs stay 0
+ unsigned int i = 0; // bit offset of the current 9-bit chunk within m
+ unsigned int shiftback = 0; // bit offset of the matching 3-bit group within each coordinate
+ while (i < firstbit_location) { // NOTE(review): if the helper reports a 0-based index, a highest set bit sitting exactly on a chunk boundary (0, 9, 18, ...) is never decoded - confirm its convention
+ x = x | (coord)Morton3D_decode_x_512[(m >> i) & NINEBITMASK] << shiftback;
+ y = y | (coord)Morton3D_decode_x_512[(m >> (i + 1)) & NINEBITMASK] << shiftback; // +1 aligns the y bits with the x pattern
+ z = z | (coord)Morton3D_decode_x_512[(m >> (i + 2)) & NINEBITMASK] << shiftback; // +2 aligns the z bits with the x pattern
+ i += 9;
+ shiftback += 3; // each chunk yields 3 bits per coordinate
+ }
+ return;
+ }
+
+ // Magicbits masks (3D decode), indexed by morton3D_GetThirdBits from [5] down: [5] picks every third bit, [4]..[1] follow the 2/4/8/16-bit shifts, [0] follows the 32-bit shift (64-bit case only)
+ static uint_fast32_t magicbit3D_masks32_decode[6] = { 0, 0x000003ff, 0x30000ff, 0x0300f00f, 0x30c30c3, 0x9249249 }; // we add a 0 on position 0 in this array to use same code for 32-bit and 64-bit cases. NOTE(review): morton3D_GetThirdBits reinterprets this array through a morton pointer; that only lines up element-for-element if sizeof(uint_fast32_t) == sizeof(morton) - confirm on each target platform
+ static uint_fast64_t magicbit3D_masks64_decode[6] = { 0x1fffff, 0x1f00000000ffff, 0x1f0000ff0000ff, 0x100f00f00f00f00f, 0x10c30c30c30c30c3, 0x1249249249249249 }; // 64-bit variant: [0] keeps the low 21 result bits
+
+ // HELPER METHOD for Magic bits decoding - keeps every third bit of m (starting at bit 0) and compacts those bits into the low end of the result
+ template
+ static inline coord morton3D_GetThirdBits(const morton m) {
+ morton* masks = (sizeof(morton) <= 4) ? reinterpret_cast(magicbit3D_masks32_decode) : reinterpret_cast(magicbit3D_masks64_decode); // pick the mask table by the byte size of morton
+ morton x = m & masks[5]; // isolate bits 0, 3, 6, ...
+ x = (x ^ (x >> 2)) & masks[4]; // each step folds a shifted copy back in and masks, pulling bit groups closer together
+ x = (x ^ (x >> 4)) & masks[3];
+ x = (x ^ (x >> 8)) & masks[2];
+ x = (x ^ (x >> 16)) & masks[1];
+ if (sizeof(morton) > 4) { x = (x ^ ((uint_fast64_t)x >> 32)) & masks[0]; } // extra compaction step, only when morton is wider than 4 bytes
+ return static_cast(x);
+ }
+
+ // DECODE 3D Morton code : Magic bits
+ // Runs morton3D_GetThirdBits on m shifted right by 0, 1 and 2 bits so that the x, y and z bits in turn sit on the every-third-bit positions
+ template
+ inline void m3D_d_magicbits(const morton m, coord& x, coord& y, coord& z) {
+ x = morton3D_GetThirdBits(m);
+ y = morton3D_GetThirdBits(m >> 1);
+ z = morton3D_GetThirdBits(m >> 2);
+ }
+
+ // DECODE 3D Morton code : For loop - naive reference decoder that moves code bits 3*i, 3*i + 1, 3*i + 2 back to bit i of x, y, z
+ template
+ inline void m3D_d_for(const morton m, coord& x, coord& y, coord& z) {
+ x = 0; y = 0; z = 0;
+ unsigned int checkbits = (sizeof(morton) * 8) / 3; // bit width of morton divided by 3
+ for (unsigned int i = 0; i <= checkbits; ++i) { // NOTE(review): inclusive bound - at i == checkbits the shifts below reach 3*checkbits + 1 / + 2, which is >= the bit width of morton (e.g. 32 for a 4-byte morton); confirm or guard
+ morton selector = 1;
+ unsigned int shift_selector = 3 * i; // position of the x bit of triple i inside m
+ unsigned int shiftback = 2 * i; // distance from position 3*i back down to position i
+ x |= (m & (selector << shift_selector)) >> (shiftback);
+ y |= (m & (selector << (shift_selector + 1))) >> (shiftback + 1);
+ z |= (m & (selector << (shift_selector + 2))) >> (shiftback + 2);
+ }
+ }
+
+ // DECODE 3D Morton code : For loop (Early termination version) - same bit-by-bit decoder as m3D_d_for, with the loop bound reduced using the position reported by findFirstSetBit
+ template
+ inline void m3D_d_for_ET(const morton m, coord& x, coord& y, coord& z) {
+ x = 0; y = 0; z = 0;
+ unsigned long firstbit_location = 0;
+ if (!findFirstSetBit(m, &firstbit_location)) return; // helper reported nothing set: outputs stay 0
+ unsigned int defaultbits = (sizeof(morton) * 8) / 3; // bit width of morton divided by 3
+ unsigned int checkbits = static_cast(std::min((float) defaultbits, firstbit_location / 3.0f)); // index of the last bit triple to visit, capped at defaultbits
+ for (unsigned int i = 0; i <= checkbits; ++i) { // NOTE(review): when checkbits == defaultbits the shifts below can reach or exceed the bit width of morton - confirm or guard
+ morton selector = 1;
+ unsigned int shift_selector = 3 * i; // position of the x bit of triple i inside m
+ unsigned int shiftback = 2 * i; // distance from position 3*i back down to position i
+ x |= (m & (selector << shift_selector)) >> (shiftback);
+ y |= (m & (selector << (shift_selector + 1))) >> (shiftback + 1);
+ z |= (m & (selector << (shift_selector + 2))) >> (shiftback + 2);
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/libmorton/morton3D_LUTs.h b/src/libmorton/morton3D_LUTs.h
new file mode 100644
index 0000000..fedf063
--- /dev/null
+++ b/src/libmorton/morton3D_LUTs.h
@@ -0,0 +1,225 @@
+#pragma once
+
+#include
+
+namespace libmorton {
+ // LUT for Morton3D encode X: entry i holds the 8 bits of i spread out to output bits 0, 3, 6, ..., 21
+ static const uint_fast32_t Morton3D_encode_x_256[256] = // one entry per possible byte value
+ {
+ 0x00000000,
+ 0x00000001, 0x00000008, 0x00000009, 0x00000040, 0x00000041, 0x00000048, 0x00000049, 0x00000200,
+ 0x00000201, 0x00000208, 0x00000209, 0x00000240, 0x00000241, 0x00000248, 0x00000249, 0x00001000,
+ 0x00001001, 0x00001008, 0x00001009, 0x00001040, 0x00001041, 0x00001048, 0x00001049, 0x00001200,
+ 0x00001201, 0x00001208, 0x00001209, 0x00001240, 0x00001241, 0x00001248, 0x00001249, 0x00008000,
+ 0x00008001, 0x00008008, 0x00008009, 0x00008040, 0x00008041, 0x00008048, 0x00008049, 0x00008200,
+ 0x00008201, 0x00008208, 0x00008209, 0x00008240, 0x00008241, 0x00008248, 0x00008249, 0x00009000,
+ 0x00009001, 0x00009008, 0x00009009, 0x00009040, 0x00009041, 0x00009048, 0x00009049, 0x00009200,
+ 0x00009201, 0x00009208, 0x00009209, 0x00009240, 0x00009241, 0x00009248, 0x00009249, 0x00040000,
+ 0x00040001, 0x00040008, 0x00040009, 0x00040040, 0x00040041, 0x00040048, 0x00040049, 0x00040200,
+ 0x00040201, 0x00040208, 0x00040209, 0x00040240, 0x00040241, 0x00040248, 0x00040249, 0x00041000,
+ 0x00041001, 0x00041008, 0x00041009, 0x00041040, 0x00041041, 0x00041048, 0x00041049, 0x00041200,
+ 0x00041201, 0x00041208, 0x00041209, 0x00041240, 0x00041241, 0x00041248, 0x00041249, 0x00048000,
+ 0x00048001, 0x00048008, 0x00048009, 0x00048040, 0x00048041, 0x00048048, 0x00048049, 0x00048200,
+ 0x00048201, 0x00048208, 0x00048209, 0x00048240, 0x00048241, 0x00048248, 0x00048249, 0x00049000,
+ 0x00049001, 0x00049008, 0x00049009, 0x00049040, 0x00049041, 0x00049048, 0x00049049, 0x00049200,
+ 0x00049201, 0x00049208, 0x00049209, 0x00049240, 0x00049241, 0x00049248, 0x00049249, 0x00200000,
+ 0x00200001, 0x00200008, 0x00200009, 0x00200040, 0x00200041, 0x00200048, 0x00200049, 0x00200200,
+ 0x00200201, 0x00200208, 0x00200209, 0x00200240, 0x00200241, 0x00200248, 0x00200249, 0x00201000,
+ 0x00201001, 0x00201008, 0x00201009, 0x00201040, 0x00201041, 0x00201048, 0x00201049, 0x00201200,
+ 0x00201201, 0x00201208, 0x00201209, 0x00201240, 0x00201241, 0x00201248, 0x00201249, 0x00208000,
+ 0x00208001, 0x00208008, 0x00208009, 0x00208040, 0x00208041, 0x00208048, 0x00208049, 0x00208200,
+ 0x00208201, 0x00208208, 0x00208209, 0x00208240, 0x00208241, 0x00208248, 0x00208249, 0x00209000,
+ 0x00209001, 0x00209008, 0x00209009, 0x00209040, 0x00209041, 0x00209048, 0x00209049, 0x00209200,
+ 0x00209201, 0x00209208, 0x00209209, 0x00209240, 0x00209241, 0x00209248, 0x00209249, 0x00240000,
+ 0x00240001, 0x00240008, 0x00240009, 0x00240040, 0x00240041, 0x00240048, 0x00240049, 0x00240200,
+ 0x00240201, 0x00240208, 0x00240209, 0x00240240, 0x00240241, 0x00240248, 0x00240249, 0x00241000,
+ 0x00241001, 0x00241008, 0x00241009, 0x00241040, 0x00241041, 0x00241048, 0x00241049, 0x00241200,
+ 0x00241201, 0x00241208, 0x00241209, 0x00241240, 0x00241241, 0x00241248, 0x00241249, 0x00248000,
+ 0x00248001, 0x00248008, 0x00248009, 0x00248040, 0x00248041, 0x00248048, 0x00248049, 0x00248200,
+ 0x00248201, 0x00248208, 0x00248209, 0x00248240, 0x00248241, 0x00248248, 0x00248249, 0x00249000,
+ 0x00249001, 0x00249008, 0x00249009, 0x00249040, 0x00249041, 0x00249048, 0x00249049, 0x00249200,
+ 0x00249201, 0x00249208, 0x00249209, 0x00249240, 0x00249241, 0x00249248, 0x00249249
+ };
+
+ // LUT for Morton3D encode Y: entry i holds the 8 bits of i spread out to output bits 1, 4, 7, ..., 22 (the X pattern moved up by one bit)
+ static const uint_fast32_t Morton3D_encode_y_256[256] = { // one entry per possible byte value
+ 0x00000000,
+ 0x00000002, 0x00000010, 0x00000012, 0x00000080, 0x00000082, 0x00000090, 0x00000092, 0x00000400,
+ 0x00000402, 0x00000410, 0x00000412, 0x00000480, 0x00000482, 0x00000490, 0x00000492, 0x00002000,
+ 0x00002002, 0x00002010, 0x00002012, 0x00002080, 0x00002082, 0x00002090, 0x00002092, 0x00002400,
+ 0x00002402, 0x00002410, 0x00002412, 0x00002480, 0x00002482, 0x00002490, 0x00002492, 0x00010000,
+ 0x00010002, 0x00010010, 0x00010012, 0x00010080, 0x00010082, 0x00010090, 0x00010092, 0x00010400,
+ 0x00010402, 0x00010410, 0x00010412, 0x00010480, 0x00010482, 0x00010490, 0x00010492, 0x00012000,
+ 0x00012002, 0x00012010, 0x00012012, 0x00012080, 0x00012082, 0x00012090, 0x00012092, 0x00012400,
+ 0x00012402, 0x00012410, 0x00012412, 0x00012480, 0x00012482, 0x00012490, 0x00012492, 0x00080000,
+ 0x00080002, 0x00080010, 0x00080012, 0x00080080, 0x00080082, 0x00080090, 0x00080092, 0x00080400,
+ 0x00080402, 0x00080410, 0x00080412, 0x00080480, 0x00080482, 0x00080490, 0x00080492, 0x00082000,
+ 0x00082002, 0x00082010, 0x00082012, 0x00082080, 0x00082082, 0x00082090, 0x00082092, 0x00082400,
+ 0x00082402, 0x00082410, 0x00082412, 0x00082480, 0x00082482, 0x00082490, 0x00082492, 0x00090000,
+ 0x00090002, 0x00090010, 0x00090012, 0x00090080, 0x00090082, 0x00090090, 0x00090092, 0x00090400,
+ 0x00090402, 0x00090410, 0x00090412, 0x00090480, 0x00090482, 0x00090490, 0x00090492, 0x00092000,
+ 0x00092002, 0x00092010, 0x00092012, 0x00092080, 0x00092082, 0x00092090, 0x00092092, 0x00092400,
+ 0x00092402, 0x00092410, 0x00092412, 0x00092480, 0x00092482, 0x00092490, 0x00092492, 0x00400000,
+ 0x00400002, 0x00400010, 0x00400012, 0x00400080, 0x00400082, 0x00400090, 0x00400092, 0x00400400,
+ 0x00400402, 0x00400410, 0x00400412, 0x00400480, 0x00400482, 0x00400490, 0x00400492, 0x00402000,
+ 0x00402002, 0x00402010, 0x00402012, 0x00402080, 0x00402082, 0x00402090, 0x00402092, 0x00402400,
+ 0x00402402, 0x00402410, 0x00402412, 0x00402480, 0x00402482, 0x00402490, 0x00402492, 0x00410000,
+ 0x00410002, 0x00410010, 0x00410012, 0x00410080, 0x00410082, 0x00410090, 0x00410092, 0x00410400,
+ 0x00410402, 0x00410410, 0x00410412, 0x00410480, 0x00410482, 0x00410490, 0x00410492, 0x00412000,
+ 0x00412002, 0x00412010, 0x00412012, 0x00412080, 0x00412082, 0x00412090, 0x00412092, 0x00412400,
+ 0x00412402, 0x00412410, 0x00412412, 0x00412480, 0x00412482, 0x00412490, 0x00412492, 0x00480000,
+ 0x00480002, 0x00480010, 0x00480012, 0x00480080, 0x00480082, 0x00480090, 0x00480092, 0x00480400,
+ 0x00480402, 0x00480410, 0x00480412, 0x00480480, 0x00480482, 0x00480490, 0x00480492, 0x00482000,
+ 0x00482002, 0x00482010, 0x00482012, 0x00482080, 0x00482082, 0x00482090, 0x00482092, 0x00482400,
+ 0x00482402, 0x00482410, 0x00482412, 0x00482480, 0x00482482, 0x00482490, 0x00482492, 0x00490000,
+ 0x00490002, 0x00490010, 0x00490012, 0x00490080, 0x00490082, 0x00490090, 0x00490092, 0x00490400,
+ 0x00490402, 0x00490410, 0x00490412, 0x00490480, 0x00490482, 0x00490490, 0x00490492, 0x00492000,
+ 0x00492002, 0x00492010, 0x00492012, 0x00492080, 0x00492082, 0x00492090, 0x00492092, 0x00492400,
+ 0x00492402, 0x00492410, 0x00492412, 0x00492480, 0x00492482, 0x00492490, 0x00492492
+ };
+
+ // LUT for Morton3D encode Z: entry i holds the 8 bits of i spread out to output bits 2, 5, 8, ..., 23 (the X pattern moved up by two bits)
+ static const uint_fast32_t Morton3D_encode_z_256[256] = { // one entry per possible byte value
+ 0x00000000,
+ 0x00000004, 0x00000020, 0x00000024, 0x00000100, 0x00000104, 0x00000120, 0x00000124, 0x00000800,
+ 0x00000804, 0x00000820, 0x00000824, 0x00000900, 0x00000904, 0x00000920, 0x00000924, 0x00004000,
+ 0x00004004, 0x00004020, 0x00004024, 0x00004100, 0x00004104, 0x00004120, 0x00004124, 0x00004800,
+ 0x00004804, 0x00004820, 0x00004824, 0x00004900, 0x00004904, 0x00004920, 0x00004924, 0x00020000,
+ 0x00020004, 0x00020020, 0x00020024, 0x00020100, 0x00020104, 0x00020120, 0x00020124, 0x00020800,
+ 0x00020804, 0x00020820, 0x00020824, 0x00020900, 0x00020904, 0x00020920, 0x00020924, 0x00024000,
+ 0x00024004, 0x00024020, 0x00024024, 0x00024100, 0x00024104, 0x00024120, 0x00024124, 0x00024800,
+ 0x00024804, 0x00024820, 0x00024824, 0x00024900, 0x00024904, 0x00024920, 0x00024924, 0x00100000,
+ 0x00100004, 0x00100020, 0x00100024, 0x00100100, 0x00100104, 0x00100120, 0x00100124, 0x00100800,
+ 0x00100804, 0x00100820, 0x00100824, 0x00100900, 0x00100904, 0x00100920, 0x00100924, 0x00104000,
+ 0x00104004, 0x00104020, 0x00104024, 0x00104100, 0x00104104, 0x00104120, 0x00104124, 0x00104800,
+ 0x00104804, 0x00104820, 0x00104824, 0x00104900, 0x00104904, 0x00104920, 0x00104924, 0x00120000,
+ 0x00120004, 0x00120020, 0x00120024, 0x00120100, 0x00120104, 0x00120120, 0x00120124, 0x00120800,
+ 0x00120804, 0x00120820, 0x00120824, 0x00120900, 0x00120904, 0x00120920, 0x00120924, 0x00124000,
+ 0x00124004, 0x00124020, 0x00124024, 0x00124100, 0x00124104, 0x00124120, 0x00124124, 0x00124800,
+ 0x00124804, 0x00124820, 0x00124824, 0x00124900, 0x00124904, 0x00124920, 0x00124924, 0x00800000,
+ 0x00800004, 0x00800020, 0x00800024, 0x00800100, 0x00800104, 0x00800120, 0x00800124, 0x00800800,
+ 0x00800804, 0x00800820, 0x00800824, 0x00800900, 0x00800904, 0x00800920, 0x00800924, 0x00804000,
+ 0x00804004, 0x00804020, 0x00804024, 0x00804100, 0x00804104, 0x00804120, 0x00804124, 0x00804800,
+ 0x00804804, 0x00804820, 0x00804824, 0x00804900, 0x00804904, 0x00804920, 0x00804924, 0x00820000,
+ 0x00820004, 0x00820020, 0x00820024, 0x00820100, 0x00820104, 0x00820120, 0x00820124, 0x00820800,
+ 0x00820804, 0x00820820, 0x00820824, 0x00820900, 0x00820904, 0x00820920, 0x00820924, 0x00824000,
+ 0x00824004, 0x00824020, 0x00824024, 0x00824100, 0x00824104, 0x00824120, 0x00824124, 0x00824800,
+ 0x00824804, 0x00824820, 0x00824824, 0x00824900, 0x00824904, 0x00824920, 0x00824924, 0x00900000,
+ 0x00900004, 0x00900020, 0x00900024, 0x00900100, 0x00900104, 0x00900120, 0x00900124, 0x00900800,
+ 0x00900804, 0x00900820, 0x00900824, 0x00900900, 0x00900904, 0x00900920, 0x00900924, 0x00904000,
+ 0x00904004, 0x00904020, 0x00904024, 0x00904100, 0x00904104, 0x00904120, 0x00904124, 0x00904800,
+ 0x00904804, 0x00904820, 0x00904824, 0x00904900, 0x00904904, 0x00904920, 0x00904924, 0x00920000,
+ 0x00920004, 0x00920020, 0x00920024, 0x00920100, 0x00920104, 0x00920120, 0x00920124, 0x00920800,
+ 0x00920804, 0x00920820, 0x00920824, 0x00920900, 0x00920904, 0x00920920, 0x00920924, 0x00924000,
+ 0x00924004, 0x00924020, 0x00924024, 0x00924100, 0x00924104, 0x00924120, 0x00924124, 0x00924800,
+ 0x00924804, 0x00924820, 0x00924824, 0x00924900, 0x00924904, 0x00924920, 0x00924924
+ };
+
+ // LUT for Morton3D decode X: entry i is the 3-bit value packed from bits 0, 3 and 6 of the 9-bit index i
+ static const uint_fast8_t Morton3D_decode_x_512[512] = { // one entry per possible 9-bit chunk
+ 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3,
+ 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3,
+ 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3,
+ 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3,
+ 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7,
+ 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7,
+ 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7,
+ 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7,
+ 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3,
+ 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3,
+ 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3,
+ 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3,
+ 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7,
+ 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7,
+ 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7,
+ 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7,
+ 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3,
+ 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3,
+ 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3,
+ 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3,
+ 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7,
+ 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7,
+ 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7,
+ 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7,
+ 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3,
+ 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3,
+ 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3,
+ 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3,
+ 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7,
+ 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7,
+ 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7,
+ 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7
+ };
+
+ // LUT for Morton3D decode Y: entry i is the 3-bit value packed from bits 1, 4 and 7 of the 9-bit index i
+ static const uint_fast8_t Morton3D_decode_y_512[512] = { // one entry per possible 9-bit chunk
+ 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
+ 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3,
+ 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
+ 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3,
+ 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
+ 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3,
+ 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
+ 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3,
+ 4, 4, 5, 5, 4, 4, 5, 5, 4, 4, 5, 5, 4, 4, 5, 5,
+ 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7,
+ 4, 4, 5, 5, 4, 4, 5, 5, 4, 4, 5, 5, 4, 4, 5, 5,
+ 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7,
+ 4, 4, 5, 5, 4, 4, 5, 5, 4, 4, 5, 5, 4, 4, 5, 5,
+ 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7,
+ 4, 4, 5, 5, 4, 4, 5, 5, 4, 4, 5, 5, 4, 4, 5, 5,
+ 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7,
+ 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
+ 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3,
+ 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
+ 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3,
+ 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
+ 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3,
+ 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
+ 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3,
+ 4, 4, 5, 5, 4, 4, 5, 5, 4, 4, 5, 5, 4, 4, 5, 5,
+ 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7,
+ 4, 4, 5, 5, 4, 4, 5, 5, 4, 4, 5, 5, 4, 4, 5, 5,
+ 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7,
+ 4, 4, 5, 5, 4, 4, 5, 5, 4, 4, 5, 5, 4, 4, 5, 5,
+ 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7,
+ 4, 4, 5, 5, 4, 4, 5, 5, 4, 4, 5, 5, 4, 4, 5, 5,
+ 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7
+ };
+
+ // LUT for Morton3D decode Z: entry i is the 3-bit value packed from bits 2, 5 and 8 of the 9-bit index i
+ static const uint_fast8_t Morton3D_decode_z_512[512] = { // one entry per possible 9-bit chunk
+ 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
+ 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
+ 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3,
+ 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3,
+ 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
+ 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
+ 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3,
+ 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3,
+ 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
+ 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
+ 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3,
+ 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3,
+ 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
+ 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
+ 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3,
+ 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3,
+ 4, 4, 4, 4, 5, 5, 5, 5, 4, 4, 4, 4, 5, 5, 5, 5,
+ 4, 4, 4, 4, 5, 5, 5, 5, 4, 4, 4, 4, 5, 5, 5, 5,
+ 6, 6, 6, 6, 7, 7, 7, 7, 6, 6, 6, 6, 7, 7, 7, 7,
+ 6, 6, 6, 6, 7, 7, 7, 7, 6, 6, 6, 6, 7, 7, 7, 7,
+ 4, 4, 4, 4, 5, 5, 5, 5, 4, 4, 4, 4, 5, 5, 5, 5,
+ 4, 4, 4, 4, 5, 5, 5, 5, 4, 4, 4, 4, 5, 5, 5, 5,
+ 6, 6, 6, 6, 7, 7, 7, 7, 6, 6, 6, 6, 7, 7, 7, 7,
+ 6, 6, 6, 6, 7, 7, 7, 7, 6, 6, 6, 6, 7, 7, 7, 7,
+ 4, 4, 4, 4, 5, 5, 5, 5, 4, 4, 4, 4, 5, 5, 5, 5,
+ 4, 4, 4, 4, 5, 5, 5, 5, 4, 4, 4, 4, 5, 5, 5, 5,
+ 6, 6, 6, 6, 7, 7, 7, 7, 6, 6, 6, 6, 7, 7, 7, 7,
+ 6, 6, 6, 6, 7, 7, 7, 7, 6, 6, 6, 6, 7, 7, 7, 7,
+ 4, 4, 4, 4, 5, 5, 5, 5, 4, 4, 4, 4, 5, 5, 5, 5,
+ 4, 4, 4, 4, 5, 5, 5, 5, 4, 4, 4, 4, 5, 5, 5, 5,
+ 6, 6, 6, 6, 7, 7, 7, 7, 6, 6, 6, 6, 7, 7, 7, 7,
+ 6, 6, 6, 6, 7, 7, 7, 7, 6, 6, 6, 6, 7, 7, 7, 7
+ };
+}
\ No newline at end of file
diff --git a/src/libmorton/morton_AVX512BITALG.h b/src/libmorton/morton_AVX512BITALG.h
new file mode 100644
index 0000000..d33b7df
--- /dev/null
+++ b/src/libmorton/morton_AVX512BITALG.h
@@ -0,0 +1,223 @@
+#pragma once
+#if defined(__AVX512BITALG__)
+#include
+#include
+
+namespace libmorton {
+
+ namespace bitalg_detail {
+ // "Zip" and interleave an m-vector of n-bit integers into a
+ // new n*m-bit integer
+ // 2D MORTONS
+ inline void bitunzip2D(const uint32_t morton, uint32_t& x, uint32_t& y) noexcept { // 32-bit overload: de-interleaves morton into x (even bit positions) and y (odd bit positions)
+ // Unpack bits into upper and lower half of 32-bit integer in parallel
+ // into 16-bit components
+ const uint32_t Unzipped = _cvtmask32_u32(
+ _mm256_bitshuffle_epi64_mask(
+ _mm256_set1_epi32(morton), // same code replicated into every 32-bit element
+ _mm256_set_epi8( // control bytes, listed highest first: each names the source bit that feeds one result bit
+ // Every odd bit
+ 31, 29, 27, 25, 23, 21, 19, 17,
+ 15, 13, 11, 9, 7, 5, 3, 1,
+ // Every even bit
+ 30, 28, 26, 24, 22, 20, 18, 16,
+ 14, 12, 10, 8, 6, 4, 2, 0
+ )
+ )
+ );
+ x = static_cast(Unzipped >> 0); // even bits were gathered into the low half
+ y = static_cast(Unzipped >> 16); // odd bits were gathered into the upper 16 bits
+ }
+ inline void bitunzip2D(const uint64_t morton, uint64_t& x, uint64_t& y) noexcept { // 64-bit overload: de-interleaves morton into x (even bit positions) and y (odd bit positions)
+ // Unpack bits into upper and lower half of 64-bit integer in parallel
+ // into 32-bit components
+ const uint64_t Unzipped = _cvtmask64_u64(
+ _mm512_bitshuffle_epi64_mask(
+ _mm512_set1_epi64(morton), // same code replicated into every 64-bit lane
+ _mm512_set_epi8( // control bytes, listed highest first: each names the source bit that feeds one result bit
+ // Every odd bit
+ 63, 61, 59, 57, 55, 53, 51, 49,
+ 47, 45, 43, 41, 39, 37, 35, 33,
+ 31, 29, 27, 25, 23, 21, 19, 17,
+ 15, 13, 11, 9, 7, 5, 3, 1,
+ // Every even bit
+ 62, 60, 58, 56, 54, 52, 50, 48,
+ 46, 44, 42, 40, 38, 36, 34, 32,
+ 30, 28, 26, 24, 22, 20, 18, 16,
+ 14, 12, 10, 8, 6, 4, 2, 0
+ )
+ )
+ );
+ x = static_cast(Unzipped >> 0); // even bits were gathered into the low half
+ y = static_cast(Unzipped >> 32); // odd bits were gathered into the upper 32 bits
+ }
+ inline uint32_t bitzip2D(uint32_t x, uint32_t y) noexcept { // 32-bit overload: interleaves bits of x and y into one 32-bit code via a bit-shuffle mask
+ // Put both 32-bit integer into each 64-bit lane
+ const __m256i CoordVec = _mm256_set1_epi64x(
+ (static_cast(y) << 32u) | x // y occupies bits 32..63 of the lane, x bits 0..31
+ );
+ // Interleave bits from 32-bit X and Y coordinate
+ const __mmask32 Interleave = _mm256_bitshuffle_epi64_mask(
+ CoordVec,
+ _mm256_set_epi16( // word k = 0x1000 + 0x0101 * k: low control byte k, high control byte 0x10 + k. NOTE(review): 0x10 + k addresses source bit 16 + k, yet y was placed at bit 32 above (the 64-bit overload uses 0x20 + k) - verify which is intended
+ 0x1000 + 0x0101 * 15, 0x1000 + 0x0101 * 14,
+ 0x1000 + 0x0101 * 13, 0x1000 + 0x0101 * 12,
+ 0x1000 + 0x0101 * 11, 0x1000 + 0x0101 * 10,
+ 0x1000 + 0x0101 * 9, 0x1000 + 0x0101 * 8,
+ 0x1000 + 0x0101 * 7, 0x1000 + 0x0101 * 6,
+ 0x1000 + 0x0101 * 5, 0x1000 + 0x0101 * 4,
+ 0x1000 + 0x0101 * 3, 0x1000 + 0x0101 * 2,
+ 0x1000 + 0x0101 * 1, 0x1000 + 0x0101 * 0
+ )
+ );
+ return _cvtmask32_u32(Interleave); // the 32-bit mask register value is the resulting code
+ }
+
+ inline uint64_t bitzip2D(uint64_t x, uint64_t y) noexcept { // 64-bit overload: interleaves the low 32 bits of x and y into one 64-bit code via a bit-shuffle mask
+ const __m512i CoordVec = _mm512_set1_epi64(
+ (static_cast(y) << 32u) | x // y occupies bits 32..63 of the lane; x is OR-ed in unmasked, so it is assumed to fit in 32 bits
+ );
+ // Interleave bits from 32-bit X and Y coordinate
+ const __mmask64 Interleave = _mm512_bitshuffle_epi64_mask(
+ CoordVec,
+ _mm512_set_epi16( // word k = 0x2000 + 0x0101 * k: low control byte k (bit k of x), high control byte 0x20 + k (bit 32 + k, i.e. bit k of y)
+ 0x2000 + 0x0101 * 31, 0x2000 + 0x0101 * 30,
+ 0x2000 + 0x0101 * 29, 0x2000 + 0x0101 * 28,
+ 0x2000 + 0x0101 * 27, 0x2000 + 0x0101 * 26,
+ 0x2000 + 0x0101 * 25, 0x2000 + 0x0101 * 24,
+ 0x2000 + 0x0101 * 23, 0x2000 + 0x0101 * 22,
+ 0x2000 + 0x0101 * 21, 0x2000 + 0x0101 * 20,
+ 0x2000 + 0x0101 * 19, 0x2000 + 0x0101 * 18,
+ 0x2000 + 0x0101 * 17, 0x2000 + 0x0101 * 16,
+ 0x2000 + 0x0101 * 15, 0x2000 + 0x0101 * 14,
+ 0x2000 + 0x0101 * 13, 0x2000 + 0x0101 * 12,
+ 0x2000 + 0x0101 * 11, 0x2000 + 0x0101 * 10,
+ 0x2000 + 0x0101 * 9, 0x2000 + 0x0101 * 8,
+ 0x2000 + 0x0101 * 7, 0x2000 + 0x0101 * 6,
+ 0x2000 + 0x0101 * 5, 0x2000 + 0x0101 * 4,
+ 0x2000 + 0x0101 * 3, 0x2000 + 0x0101 * 2,
+ 0x2000 + 0x0101 * 1, 0x2000 + 0x0101 * 0
+ )
+ );
+ return _cvtmask64_u64(Interleave); // the 64-bit mask register value is the resulting code
+ }
+ // 3D MORTONS
+ inline void bitunzip3D(const uint32_t morton, uint32_t& x, uint32_t& y, uint32_t& z) noexcept { // 32-bit overload: de-interleaves morton into x (bits 0, 3, ...), y (bits 1, 4, ...) and z (bits 2, 5, ...)
+ // Unpack 32-bit integer in parallel into 10-bit components, within 16-bit lanes
+ const uint64_t Unzipped = _cvtmask64_u64(
+ _mm512_bitshuffle_epi64_mask(
+ _mm512_set1_epi64(morton), // code replicated into every 64-bit lane
+ _mm512_set_epi8( // four rows of 16 control bytes, highest first: unused, z, y, x; ~0 entries are padding
+ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,
+ ~0, ~0, ~0, ~0, ~0, ~0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2,
+ ~0, ~0, ~0, ~0, ~0, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1,
+ ~0, ~0, ~0, ~0, ~0, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0
+ )
+ )
+ );
+ x = static_cast(Unzipped >> 0); // x bits were gathered into result bits 0..15
+ y = static_cast(Unzipped >> 16); // y bits were gathered into result bits 16..31
+ z = static_cast(Unzipped >> 32); // z bits were gathered into result bits 32..47
+ }
+ inline void bitunzip3D(const uint64_t morton, uint64_t& x, uint64_t& y, uint64_t& z) noexcept { // 64-bit overload: x and y come from one bit-shuffle, z from a separate pext
+ // Unpack 64-bit integer in parallel into 21-bit components
+ const uint64_t Unzipped = _cvtmask64_u64(
+ _mm512_bitshuffle_epi64_mask(
+ _mm512_set1_epi64(morton), // code replicated into every 64-bit lane
+ _mm512_set_epi8( // two rows of 32 control bytes, highest first: y then x. NOTE(review): the ~0 padding bytes select whatever source bit the intrinsic maps 0xFF to (bit 63 per Intel's pseudocode) - confirm they read as zero when bit 63 of morton is set
+ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1,
+ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0
+ )
+ )
+ );
+ x = static_cast(Unzipped >> 0); // x bits were gathered into result bits 0..31
+ y = static_cast(Unzipped >> 32); // y bits were gathered into result bits 32..63
+ z = bmi2_detail::pext(morton, 0x4924924924924924); // the 64-bit mask has no room for a third component, so z (bits 2, 5, 8, ...) is extracted separately; requires bmi2_detail to be declared before this point
+ //z = static_cast(Unzipped >> 64);
+ }
+ inline uint32_t bitzip3D(uint32_t x, uint32_t y, uint32_t z) noexcept { // 32-bit overload: byte-permutes the coordinates into 64-bit lanes, then bit-shuffles them into one 32-bit code
+ const __m256i CoordVec = _mm256_broadcastsi128_si256(
+ _mm_set_epi32(0, z, y, x) // 4-byte elements: x lowest, then y, then z, top element zero; copied to both 128-bit halves
+ );
+ const __m256i ShuffleVec = _mm256_permutexvar_epi8( // NOTE(review): the byte indices below are k, 8 + k and 16 + k, which match the 8-byte-per-coordinate layout of the 64-bit overload; here coordinates are packed 4 bytes apart - verify the indices
+ _mm256_set_epi64x(
+ 0xFFFFFFFFFF100800ul + 0x010101 * 1, // Lane 3 | ...000 | z[1] | y[1] | x[1]
+ 0xFFFFFFFFFF100800ul + 0x010101 * 0, // Lane 2 | ...000 | z[0] | y[0] | x[0]
+ 0xFFFFFFFFFF100800ul + 0x010101 * 0, // Lane 1 | ...000 | z[0] | y[0] | x[0]
+ 0xFFFFFFFFFF100800ul + 0x010101 * 0 // Lane 0 | ...000 | z[0] | y[0] | x[0]
+ ),
+ CoordVec
+ );
+ const __mmask32 Interleave = _mm256_bitshuffle_epi64_mask(
+ ShuffleVec,
+ _mm256_set_epi64x( // per lane, 8 control bytes; each constant is written as (bit-within-byte pattern) + (byte-select pattern) * 8
+ 0x0202010101000000 + 0x0100020100020100 * 8,
+ 0x0707070606060505 + 0x0201000201000201 * 8,
+ 0x0504040403030302 + 0x0002010002010002 * 8,
+ 0x0202010101000000 + 0x0100020100020100 * 8
+ )
+ );
+ return _cvtmask32_u32(Interleave); // the 32-bit mask register value is the resulting code
+ }
+ inline uint64_t bitzip3D(uint64_t x, uint64_t y, uint64_t z) noexcept { // 64-bit overload: byte-permutes the coordinates into 64-bit lanes, then bit-shuffles them into one 64-bit code
+ // Put the three coordinates into the lowest three 64-bit elements (x lowest, then y, then z); the rest are zero
+ // Todo: _mm512_shuffle_epi8 version, 128-bit lane woes
+ const __m512i CoordVec = _mm512_set_epi64(
+ 0, 0, 0, 0, 0, z, y, x
+ );
+ const __m512i ShuffleVec = _mm512_permutexvar_epi8( // each lane receives byte k of x, y and z (source bytes k, 8 + k, 16 + k) in its three low bytes
+ _mm512_set_epi64(
+ 0xFFFFFFFFFF100800ul + 0x010101 * 2, // Lane 7 | ...000 | z[2] | y[2] | x[2]
+ 0xFFFFFFFFFF100800ul + 0x010101 * 2, // Lane 6 | ...000 | z[2] | y[2] | x[2]
+ 0xFFFFFFFFFF100800ul + 0x010101 * 1, // Lane 5 | ...000 | z[1] | y[1] | x[1]
+ 0xFFFFFFFFFF100800ul + 0x010101 * 1, // Lane 4 | ...000 | z[1] | y[1] | x[1]
+ 0xFFFFFFFFFF100800ul + 0x010101 * 1, // Lane 3 | ...000 | z[1] | y[1] | x[1]
+ 0xFFFFFFFFFF100800ul + 0x010101 * 0, // Lane 2 | ...000 | z[0] | y[0] | x[0]
+ 0xFFFFFFFFFF100800ul + 0x010101 * 0, // Lane 1 | ...000 | z[0] | y[0] | x[0]
+ 0xFFFFFFFFFF100800ul + 0x010101 * 0 // Lane 0 | ...000 | z[0] | y[0] | x[0]
+ ),
+ CoordVec
+ );
+ // Interleave bits from 32-bit X and Y and Z coordinate
+ const __mmask64 Interleave = _mm512_bitshuffle_epi64_mask(
+ ShuffleVec,
+ _mm512_set_epi64( // per lane, 8 control bytes; each constant is written as (bit-within-byte pattern) + (byte-select pattern) * 8
+ 0x0504040403030302 + 0x0002010002010002 * 8,
+ 0x0202010101000000 + 0x0100020100020100 * 8,
+ 0x0707070606060505 + 0x0201000201000201 * 8,
+ 0x0504040403030302 + 0x0002010002010002 * 8,
+ 0x0202010101000000 + 0x0100020100020100 * 8,
+ 0x0707070606060505 + 0x0201000201000201 * 8,
+ 0x0504040403030302 + 0x0002010002010002 * 8,
+ 0x0202010101000000 + 0x0100020100020100 * 8
+ )
+ );
+ return _cvtmask64_u64(Interleave); // the 64-bit mask register value is the resulting code
+ }
+ } // namespace bitalg_detail
+
+ template
+ inline morton m2D_e_BITALG(const coord x, const coord y) { // 2D encode entry point: casts both coordinates and forwards to bitalg_detail::bitzip2D
+ return bitalg_detail::bitzip2D(
+ static_cast(x), static_cast(y) // the cast target presumably selects the 32- or 64-bit overload - its type is not visible here
+ );
+ }
+
+ template
+ inline void m2D_d_BITALG(const morton m, coord& x, coord& y) { // 2D decode entry point: forwards to bitalg_detail::bitunzip2D, which writes x and y
+ bitalg_detail::bitunzip2D(m, x, y); // overload is chosen by the argument types; x and y bind to the helper's reference parameters
+ }
+
+ template
+ inline morton m3D_e_BITALG(const coord x, const coord y, const coord z) { // 3D encode entry point: casts the coordinates and forwards to bitalg_detail::bitzip3D
+ return bitalg_detail::bitzip3D(
+ static_cast(x), static_cast(y), static_cast(z) // the cast target presumably selects the 32- or 64-bit overload - its type is not visible here
+ );
+ }
+
+ template
+ inline void m3D_d_BITALG(const morton m, coord& x, coord& y, coord& z) { // 3D decode entry point: forwards to bitalg_detail::bitunzip3D, which writes x, y and z
+ bitalg_detail::bitunzip3D(m, x, y, z); // overload is chosen by the argument types; outputs bind to the helper's reference parameters
+ }
+}
+#endif
\ No newline at end of file
diff --git a/src/libmorton/morton_BMI.h b/src/libmorton/morton_BMI.h
new file mode 100644
index 0000000..6b0222f
--- /dev/null
+++ b/src/libmorton/morton_BMI.h
@@ -0,0 +1,60 @@
+#pragma once
+#if defined(__BMI2__) || defined(__AVX2__)
+#include
+#include
+
+namespace libmorton {
+
+ namespace bmi2_detail { // pdep/pext overloads: overload resolution on the operand width picks the matching intrinsic
+ inline uint32_t pdep(uint32_t source, uint32_t mask) noexcept { // 32-bit operands: forwards to _pdep_u32
+ return _pdep_u32(source, mask);
+ }
+ inline uint64_t pdep(uint64_t source, uint64_t mask) noexcept { // 64-bit operands: forwards to _pdep_u64
+ return _pdep_u64(source, mask);
+ }
+ inline uint32_t pext(uint32_t source, uint32_t mask) noexcept { // 32-bit operands: forwards to _pext_u32
+ return _pext_u32(source, mask);
+ }
+ inline uint64_t pext(uint64_t source, uint64_t mask) noexcept { // 64-bit operands: forwards to _pext_u64
+ return _pext_u64(source, mask);
+ }
+ } // namespace bmi2_detail
+
+#define BMI_2D_X_MASK 0x5555555555555555
+#define BMI_2D_Y_MASK 0xAAAAAAAAAAAAAAAA
+
+ template // NOTE(review): the template parameter list (presumably declaring `morton` and `coord`) is missing in this text - confirm against upstream libmorton
+ inline morton m2D_e_BMI(const coord x, const coord y) { // 2D encode: pdep spreads x over the BMI_2D_X_MASK bits (even positions) and y over the BMI_2D_Y_MASK bits (odd positions)
+ morton m = 0;
+ m |= bmi2_detail::pdep(static_cast(x), static_cast(BMI_2D_X_MASK)) // NOTE(review): the cast target types are missing in this text
+ | bmi2_detail::pdep(static_cast(y), static_cast(BMI_2D_Y_MASK));
+ return m;
+ }
+
+ template // NOTE(review): the template parameter list (presumably declaring `morton` and `coord`) is missing in this text - confirm against upstream libmorton
+ inline void m2D_d_BMI(const morton m, coord& x, coord& y) { // 2D decode: pext gathers the bits of m selected by each mask into the output coordinate
+ x = static_cast(bmi2_detail::pext(m, static_cast(BMI_2D_X_MASK))); // even bit positions -> x (NOTE(review): cast target types missing in this text)
+ y = static_cast(bmi2_detail::pext(m, static_cast(BMI_2D_Y_MASK))); // odd bit positions -> y
+ }
+
+#define BMI_3D_X_MASK 0x9249249249249249
+#define BMI_3D_Y_MASK 0x2492492492492492
+#define BMI_3D_Z_MASK 0x4924924924924924
+
+ template // NOTE(review): the template parameter list (presumably declaring `morton` and `coord`) is missing in this text - confirm against upstream libmorton
+ inline morton m3D_e_BMI(const coord x, const coord y, const coord z) { // 3D encode: pdep spreads x, y, z over every third bit, starting at bit 0, 1 and 2 respectively
+ morton m = 0;
+ m |= bmi2_detail::pdep(static_cast(x), static_cast(BMI_3D_X_MASK)) // NOTE(review): the cast target types are missing in this text
+ | bmi2_detail::pdep(static_cast(y), static_cast(BMI_3D_Y_MASK))
+ | bmi2_detail::pdep(static_cast(z), static_cast(BMI_3D_Z_MASK));
+ return m;
+ }
+
+ template // NOTE(review): the template parameter list (presumably declaring `morton` and `coord`) is missing in this text - confirm against upstream libmorton
+ inline void m3D_d_BMI(const morton m, coord& x, coord& y, coord& z) { // 3D decode: pext gathers the bits of m selected by each mask into the output coordinate
+ x = static_cast(bmi2_detail::pext(m, static_cast(BMI_3D_X_MASK))); // bits 0, 3, 6, ... -> x (NOTE(review): cast target types missing in this text)
+ y = static_cast(bmi2_detail::pext(m, static_cast(BMI_3D_Y_MASK))); // bits 1, 4, 7, ... -> y
+ z = static_cast(bmi2_detail::pext(m, static_cast(BMI_3D_Z_MASK))); // bits 2, 5, 8, ... -> z
+ }
+}
+#endif
diff --git a/src/libmorton/morton_common.h b/src/libmorton/morton_common.h
new file mode 100644
index 0000000..41078d5
--- /dev/null
+++ b/src/libmorton/morton_common.h
@@ -0,0 +1,49 @@
+#pragma once
+
+// Libmorton - Common helper methods needed in Morton encoding/decoding
+
+#include
+#if defined(_MSC_VER)
+#include
+#endif
+
+namespace libmorton {
+ template<typename morton> // Writes the 0-based index of the highest set bit of x to *firstbit_location; returns false when no bit is set
+ inline bool findFirstSetBitZeroIdx(const morton x, unsigned long* firstbit_location) {
+#if defined(_MSC_VER) && !defined(_WIN64)
+     // 32-bit MSVC, value of at most 32 bits: one reverse scan covers it
+     if (sizeof(morton) <= 4) {
+         return _BitScanReverse(firstbit_location, x) != 0;
+     }
+     // 32-bit MSVC, wider value: scan the upper 32 bits first, then fall back to the lower 32 bits
+     else {
+         *firstbit_location = 0;
+         if (_BitScanReverse(firstbit_location, (x >> 32))) { // found in the upper half: rebase the index by 32
+             *firstbit_location += 32;
+             return true;
+         }
+         return _BitScanReverse(firstbit_location, (x & 0xFFFFFFFF)) != 0;
+     }
+#elif defined(_MSC_VER) && defined(_WIN64)
+     // 64-bit MSVC: the 64-bit reverse scan is used for every morton width
+     return _BitScanReverse64(firstbit_location, x) != 0;
+#elif defined(__GNUC__)
+     if (x == 0) { // __builtin_clzll is undefined for 0
+         return false;
+     }
+     // __builtin_clzll always counts leading zeros of an unsigned long long, so the index must be taken
+     // relative to that width; using sizeof(morton) gave a wrong (wrapped) index for 32-bit codes.
+     *firstbit_location = static_cast<unsigned long>((sizeof(unsigned long long) * 8) - __builtin_clzll(x) - 1);
+     return true;
+#endif
+ }
+
+ template<typename morton> // 1-based variant of findFirstSetBitZeroIdx: the position written is the 0-based index plus one
+ inline bool findFirstSetBit(const morton x, unsigned long* firstbit_location) {
+     if (!findFirstSetBitZeroIdx(x, firstbit_location)) {
+         return false; // no set bit was found
+     }
+     *firstbit_location += 1; // convert the 0-based index to a 1-based position
+     return true;
+ }
+}
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index 96127b6..4627f57 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -44,6 +44,16 @@ int main(int argc, char** argv) {
// Load scene file
scene = new Scene(sceneFile);
+ // Generate BVH
+ if (scene->triangles.size() > 0)
+ {
+#if USE_LBVH
+ generateLBVH(scene);
+#elif USE_BVH
+ generateBVH(scene);
+#endif
+ }
+
//Create Instance for ImGUIData
guiData = new GuiDataContainer();
@@ -196,7 +206,7 @@ void mousePositionCallback(GLFWwindow* window, double xpos, double ypos) {
}
else if (rightMousePressed) {
zoom += (ypos - lastY) / height;
- zoom = std::fmax(0.1f, zoom);
+ zoom = std::fmax(0.8f, zoom);
camchanged = true;
}
else if (middleMousePressed) {
diff --git a/src/main.h b/src/main.h
index fdb7d5d..3e13623 100644
--- a/src/main.h
+++ b/src/main.h
@@ -16,6 +16,7 @@
#include "sceneStructs.h"
#include "image.h"
+#include "lbvh.h"
#include "pathtrace.h"
#include "utilities.h"
#include "scene.h"
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index fd2a464..8eca4dc 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -1,7 +1,9 @@
#include
#include
#include
+#include
#include
+#include
#include
#include
@@ -14,6 +16,20 @@
#include "intersections.h"
#include "interactions.h"
+// Turn on anti-aliasing to remove jagged edges on shapes
+#define ANTIALIASING
+
+// Turn on to sort by material (keeps same materials contiguous in memory)
+//#define MATERIAL_SORT
+
+// Turn on to stream compact
+#define STREAM_COMPACTION
+
+// First-bounce caching is only enabled when anti-aliasing is off (jittered camera rays differ between iterations)
+#ifndef ANTIALIASING
+ #define CACHE_FIRST_BOUNCE
+#endif
+
#define ERRORCHECK 1
#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
@@ -38,12 +54,53 @@ void checkCUDAErrorFn(const char* msg, const char* file, int line) {
#endif
}
+// Predicate used to partition paths: true while a path still has bounces left (remainingBounces != 0)
+struct not_zero
+{
+    __host__ __device__
+    bool operator()(const PathSegment &path) const // const: the predicate carries no state
+    {
+        return path.remainingBounces != 0;
+    }
+};
+
+// Comparator for sorting intersections by ascending material id; const-correct so it also accepts const/temporary operands
+struct mat_id
+{
+    __host__ __device__
+    bool operator()(const ShadeableIntersection &i1, const ShadeableIntersection &i2) const
+    {
+        return i1.materialId < i2.materialId;
+    }
+};
+
__host__ __device__
thrust::default_random_engine makeSeededRandomEngine(int iter, int index, int depth) {
int h = utilhash((1 << 31) | (depth << 22) | iter) ^ utilhash(index);
return thrust::default_random_engine(h);
}
+// Display helpers (applied when CONVERT_TO_SRGB is enabled): Reinhard operator, per channel c / (1 + c)
+__host__ __device__ glm::vec3 reinhardOp(glm::vec3 c) {
+    return c / (c + glm::vec3(1.f));
+}
+
+__host__ __device__ glm::vec3 gammaCorrect(glm::vec3 c) {
+    const float invGamma = static_cast<float>(1.0 / 2.2); // every channel is raised to the power 1 / 2.2
+    return pow(c, glm::vec3(invGamma));
+}
+
+// Cosine colour palette a + b * cos(2*pi * (c * t + d)), from "Color Palettes" by Inigo Quilez; maps an intersection count to a colour
+__host__ __device__ glm::vec3 palette(glm::vec3 a, glm::vec3 b, glm::vec3 c, glm::vec3 d, float t) {
+    return b * cos((t * c + d) * 6.28318f) + a;
+}
+
+__host__ __device__ glm::vec3 intToColor(float count) {
+    // Scale the count so that 250 corresponds to a palette parameter of 1
+    const float t = count * (1.f / 250.f);
+    return palette(glm::vec3(0.5f), glm::vec3(0.5f), glm::vec3(1.f, 0.7f, 0.4f), glm::vec3(0.f, 0.15f, 0.2f), t);
+}
+
//Kernel that writes the image to the OpenGL PBO directly.
__global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
int iter, glm::vec3* image) {
@@ -54,10 +111,22 @@ __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
int index = x + (y * resolution.x);
glm::vec3 pix = image[index];
+ // Average samples
+ glm::vec3 mod_color = pix / glm::vec3(iter, iter, iter);
+
+#if CONVERT_TO_SRGB
+ // Apply Reinhard operator
+ mod_color = reinhardOp(mod_color);
+
+ // Apply gamma correction
+ mod_color = gammaCorrect(mod_color);
+#endif
+
+ // Convert to 0-255 scale
glm::ivec3 color;
- color.x = glm::clamp((int)(pix.x / iter * 255.0), 0, 255);
- color.y = glm::clamp((int)(pix.y / iter * 255.0), 0, 255);
- color.z = glm::clamp((int)(pix.z / iter * 255.0), 0, 255);
+ color.x = glm::clamp((int)(mod_color.x * 255.0), 0, 255);
+ color.y = glm::clamp((int)(mod_color.y * 255.0), 0, 255);
+ color.z = glm::clamp((int)(mod_color.z * 255.0), 0, 255);
// Each thread writes one pixel location in the texture (textel)
pbo[index].w = 0;
@@ -71,11 +140,15 @@ static Scene* hst_scene = NULL;
static GuiDataContainer* guiData = NULL;
static glm::vec3* dev_image = NULL;
static Geom* dev_geoms = NULL;
+static LBVHNode* dev_lbvh = NULL;
+static BVHNode* dev_bvh = NULL;
+static Triangle* dev_tris = NULL;
static Material* dev_materials = NULL;
static PathSegment* dev_paths = NULL;
static ShadeableIntersection* dev_intersections = NULL;
-// TODO: static variables for device memory, any extra info you need, etc
-// ...
+
+// For saving first-bounce intersections
+static ShadeableIntersection* dev_first_bounce_intersections = NULL;
void InitDataContainer(GuiDataContainer* imGuiData)
{
@@ -96,13 +169,23 @@ void pathtraceInit(Scene* scene) {
cudaMalloc(&dev_geoms, scene->geoms.size() * sizeof(Geom));
cudaMemcpy(dev_geoms, scene->geoms.data(), scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice);
+ cudaMalloc(&dev_lbvh, scene->lbvh.size() * sizeof(LBVHNode));
+ cudaMemcpy(dev_lbvh, scene->lbvh.data(), scene->lbvh.size() * sizeof(LBVHNode), cudaMemcpyHostToDevice);
+
+ cudaMalloc(&dev_bvh, scene->bvh.size() * sizeof(BVHNode));
+ cudaMemcpy(dev_bvh, scene->bvh.data(), scene->bvh.size() * sizeof(BVHNode), cudaMemcpyHostToDevice);
+
+ cudaMalloc(&dev_tris, scene->triangles.size() * sizeof(Triangle));
+ cudaMemcpy(dev_tris, scene->triangles.data(), scene->triangles.size() * sizeof(Triangle), cudaMemcpyHostToDevice);
+
cudaMalloc(&dev_materials, scene->materials.size() * sizeof(Material));
cudaMemcpy(dev_materials, scene->materials.data(), scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice);
cudaMalloc(&dev_intersections, pixelcount * sizeof(ShadeableIntersection));
cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
- // TODO: initialize any extra device memeory you need
+ cudaMalloc(&dev_first_bounce_intersections, pixelcount * sizeof(ShadeableIntersection));
+ cudaMemset(dev_first_bounce_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
checkCUDAError("pathtraceInit");
}
@@ -111,13 +194,45 @@ void pathtraceFree() {
cudaFree(dev_image); // no-op if dev_image is null
cudaFree(dev_paths);
cudaFree(dev_geoms);
+ cudaFree(dev_lbvh);
+ cudaFree(dev_bvh);
+ cudaFree(dev_tris);
cudaFree(dev_materials);
cudaFree(dev_intersections);
- // TODO: clean up any extra device memory you created
+ cudaFree(dev_first_bounce_intersections);
checkCUDAError("pathtraceFree");
}
+/**
+* Concentric disk sampling (PBRT Chapter 13.6.2): maps a pair of uniform random numbers to a point on a disk (z = 0).
+* The sample is taken by const reference so that callers may pass a temporary glm::vec2.
+*/
+__host__ __device__ glm::vec3 concentricSampleDisk(const glm::vec2 &sample)
+{
+    // Map sample point (uniform random numbers) to range [-1, 1]
+    const glm::vec2 mappedSample = 2.f * sample - glm::vec2(1.f, 1.f);
+
+    // The centre maps to the centre; returning here also avoids the 0 / 0 in the ratios below
+    if (mappedSample.x == 0.f && mappedSample.y == 0.f) {
+        return glm::vec3(0.f);
+    }
+
+    // Concentric mapping: the larger-magnitude coordinate is the (signed) radius, the coordinate ratio sets the angle
+    float r = 0.f;
+    float theta = 0.f;
+    if (std::abs(mappedSample.x) > std::abs(mappedSample.y)) {
+        r = mappedSample.x;
+        theta = PI_OVER_FOUR * (mappedSample.y / mappedSample.x);
+    }
+    else {
+        r = mappedSample.y;
+        theta = PI_OVER_TWO - PI_OVER_FOUR * (mappedSample.x / mappedSample.y);
+    }
+
+    return glm::vec3(r * cos(theta), r * sin(theta), 0);
+}
+
/**
* Generate PathSegments with rays from the camera through the screen into the
* scene, which is the first bounce of rays.
@@ -131,19 +246,45 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+ // Add jitter to x and y
+ thrust::default_random_engine rng = makeSeededRandomEngine(iter, x + y * cam.resolution.x, 0);
+ thrust::uniform_real_distribution u01(0, 1);
+ float jitterX = 0.0;
+ float jitterY = 0.0;
+#ifdef ANTIALIASING
+ jitterX = u01(rng);
+ jitterY = u01(rng);
+#endif
+
if (x < cam.resolution.x && y < cam.resolution.y) {
int index = x + (y * cam.resolution.x);
PathSegment& segment = pathSegments[index];
segment.ray.origin = cam.position;
- segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
+ segment.color = glm::vec3(0.0f, 0.0f, 0.0f);
+ segment.throughput = glm::vec3(1.0f, 1.0f, 1.0f);
- // TODO: implement antialiasing by jittering the ray
+ // Jitter the ray for anti-aliasing
segment.ray.direction = glm::normalize(cam.view
- - cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f)
- - cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f)
+ - cam.right * cam.pixelLength.x * ((float)(x + jitterX) - (float)cam.resolution.x * 0.5f)
+ - cam.up * cam.pixelLength.y * ((float)(y + jitterY) - (float)cam.resolution.y * 0.5f)
);
+ // Depth-of-field (if specified in scene file)
+ if (cam.lens_radius > 0.0f) {
+ // Get sample on lens
+ glm::vec3 samplePoint = cam.lens_radius * concentricSampleDisk(glm::vec2(u01(rng), u01(rng)));
+
+ // Focal point
+ float ft = glm::length(cam.lookAt - cam.position);
+ glm::vec3 focalPoint = getPointOnRay(segment.ray, ft);
+
+ // Update ray
+ segment.ray.origin += samplePoint;
+ segment.ray.direction = glm::normalize(focalPoint - segment.ray.origin);
+ }
+ segment.ray.invDirection = glm::vec3(1.0, 1.0, 1.0) / segment.ray.direction;
+ segment.ray.intersectionCount = 0.f;
segment.pixelIndex = index;
segment.remainingBounces = traceDepth;
}
@@ -158,6 +299,9 @@ __global__ void computeIntersections(
, int num_paths
, PathSegment* pathSegments
, Geom* geoms
+ , LBVHNode* dev_lbvh
+ , BVHNode* dev_bvh
+ , Triangle* dev_tris
, int geoms_size
, ShadeableIntersection* intersections
)
@@ -166,7 +310,7 @@ __global__ void computeIntersections(
if (path_index < num_paths)
{
- PathSegment pathSegment = pathSegments[path_index];
+ PathSegment &pathSegment = pathSegments[path_index];
float t;
glm::vec3 intersect_point;
@@ -179,7 +323,6 @@ __global__ void computeIntersections(
glm::vec3 tmp_normal;
// naive parse through global geoms
-
for (int i = 0; i < geoms_size; i++)
{
Geom& geom = geoms[i];
@@ -193,7 +336,16 @@ __global__ void computeIntersections(
t = sphereIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
}
// TODO: add more intersection tests here... triangle? metaball? CSG?
-
+ else if (geom.type == MESH)
+ {
+#if USE_LBVH
+ t = lbvhIntersectionTest(dev_lbvh, dev_tris, pathSegment.ray, geom.triangleCount, tmp_intersect, tmp_normal, outside);
+#elif USE_BVH
+ t = bvhIntersectionTest(dev_bvh, dev_tris, pathSegment.ray, geom.triangleCount, tmp_intersect, tmp_normal, outside);
+#else
+ t = meshIntersectionTest(geom, pathSegment.ray, dev_tris, tmp_intersect, tmp_normal, outside);
+#endif
+ }
// Compute the minimum t from the intersection tests to determine what
// scene geometry object was hit first.
if (t > 0.0f && t_min > t)
@@ -273,6 +425,68 @@ __global__ void shadeFakeMaterial(
}
}
+// Shades every live path for one bounce: a hit on an emissive material adds its light (scaled by the
+// path throughput) and ends the path, any other hit is handed to scatterRay(), and a path with no
+// intersection is terminated by zeroing remainingBounces.
+__global__ void shadeAllMaterials(
+    int iter
+    , int num_paths
+    , ShadeableIntersection* shadeableIntersections
+    , PathSegment* pathSegments
+    , Material* materials
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= num_paths)
+    {
+        return;
+    }
+
+    PathSegment& path = pathSegments[idx];
+    ShadeableIntersection intersection = shadeableIntersections[idx];
+
+    // t > 0 marks a valid hit; anything else is treated as "no intersection"
+    if (!(intersection.t > 0.0f)) {
+        path.remainingBounces = 0;
+        return;
+    }
+
+    // Seed the RNG from the iteration, the path index and the remaining bounce count
+    thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, path.remainingBounces);
+    thrust::uniform_real_distribution<float> u01(0, 1);
+
+    Material material = materials[intersection.materialId];
+    glm::vec3 materialColor = material.color;
+
+    if (material.emittance > 0.0f) {
+        // Light source: accumulate the emitted light weighted by the throughput, then stop bouncing
+        path.color += (materialColor * material.emittance) * path.throughput;
+        path.remainingBounces = 0;
+    }
+    else {
+        // Non-emissive surface: scatterRay (defined elsewhere) decides how the path continues
+        scatterRay(path, getPointOnRay(path.ray, intersection.t),
+            intersection.surfaceNormal, material, rng);
+    }
+
+#if RUSSIAN_ROULETTE
+    // Russian roulette: continue with probability p and divide the throughput by p.
+    // p is the largest throughput channel clamped to 1, so a throughput above 1 is never scaled down
+    // while having no chance of termination.
+    // NOTE(review): the gate tests `iter` (the sample index), not the bounce depth - confirm intent.
+    if (iter > 3) {
+        const float maxColorChannel = glm::max(path.throughput.r, glm::max(path.throughput.g, path.throughput.b));
+        const float survival = glm::min(maxColorChannel, 1.f);
+        if (u01(rng) < (1.f - survival)) {
+            path.remainingBounces = 0;
+        }
+        else {
+            path.throughput /= survival;
+        }
+    }
+#endif
+}
+
// Add the current iteration's output to the overall image
__global__ void finalGather(int nPaths, glm::vec3* image, PathSegment* iterationPaths)
{
@@ -281,7 +495,11 @@ __global__ void finalGather(int nPaths, glm::vec3* image, PathSegment* iteration
if (index < nPaths)
{
PathSegment iterationPath = iterationPaths[index];
+#if DISPLAY_HEATMAP
+ image[iterationPath.pixelIndex] += intToColor(iterationPath.ray.intersectionCount);
+#else
image[iterationPath.pixelIndex] += iterationPath.color;
+#endif
}
}
@@ -340,6 +558,8 @@ void pathtrace(uchar4* pbo, int frame, int iter) {
int depth = 0;
PathSegment* dev_path_end = dev_paths + pixelcount;
int num_paths = dev_path_end - dev_paths;
+ int compact_num_paths = num_paths;
+ thrust::device_ptr dev_thrust_paths = thrust::device_pointer_cast(dev_paths);
// --- PathSegment Tracing Stage ---
// Shoot ray into scene, bounce between objects, push shading chunks
@@ -348,39 +568,101 @@ void pathtrace(uchar4* pbo, int frame, int iter) {
while (!iterationComplete) {
// clean shading chunks
- cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+ cudaMemset(dev_intersections, 0, compact_num_paths * sizeof(ShadeableIntersection));
// tracing
- dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
+ dim3 numblocksPathSegmentTracing = (compact_num_paths + blockSize1d - 1) / blockSize1d;
+
+#ifdef CACHE_FIRST_BOUNCE
+ // If first iteration, compute first bounce intersections
+ if (iter == 1) {
+ computeIntersections << > > (
+ depth
+ , compact_num_paths
+ , dev_paths
+ , dev_geoms
+ , dev_lbvh
+ , dev_bvh
+ , dev_tris
+ , hst_scene->geoms.size()
+ , dev_intersections
+ );
+ checkCUDAError("trace one bounce");
+ cudaDeviceSynchronize();
+ if (depth == 0) {
+ cudaMemcpy(dev_first_bounce_intersections, dev_intersections, pixelcount * sizeof(ShadeableIntersection), cudaMemcpyDeviceToDevice);
+ }
+ }
+ // For all subsequent iterations, read from cached first bounce intersections
+ else {
+ if (depth == 0) {
+ cudaMemcpy(dev_intersections, dev_first_bounce_intersections, pixelcount * sizeof(ShadeableIntersection), cudaMemcpyDeviceToDevice);
+ }
+ else {
+ computeIntersections << > > (
+ depth
+ , compact_num_paths
+ , dev_paths
+ , dev_geoms
+ , dev_lbvh
+ , dev_bvh
+ , dev_tris
+ , hst_scene->geoms.size()
+ , dev_intersections
+ );
+ checkCUDAError("trace one bounce");
+ cudaDeviceSynchronize();
+ }
+ }
+#else
computeIntersections << > > (
depth
- , num_paths
+ , compact_num_paths
, dev_paths
, dev_geoms
+ , dev_lbvh
+ , dev_bvh
+ , dev_tris
, hst_scene->geoms.size()
, dev_intersections
);
checkCUDAError("trace one bounce");
cudaDeviceSynchronize();
+#endif
depth++;
// TODO:
// --- Shading Stage ---
// Shade path segments based on intersections and generate new rays by
- // evaluating the BSDF.
- // Start off with just a big kernel that handles all the different
- // materials you have in the scenefile.
- // TODO: compare between directly shading the path segments and shading
- // path segments that have been reshuffled to be contiguous in memory.
+ // evaluating the BSDF.
+ // Start off with just a big kernel that handles all the different
+ // materials you have in the scenefile.
+ // TODO: compare between directly shading the path segments and shading
+ // path segments that have been reshuffled to be contiguous in memory.
+
+#ifdef MATERIAL_SORT
+ // Shuffle paths to be contiguous in memory
+ thrust::device_ptr dev_thrust_intersections = thrust::device_pointer_cast(dev_intersections);
+ thrust::sort_by_key(dev_thrust_intersections, dev_thrust_intersections + compact_num_paths, dev_thrust_paths, mat_id());
+#endif
- shadeFakeMaterial << > > (
+ shadeAllMaterials << > > (
iter,
- num_paths,
+ compact_num_paths,
dev_intersections,
dev_paths,
dev_materials
);
- iterationComplete = true; // TODO: should be based off stream compaction results.
+
+ // Stream compact
+#ifdef STREAM_COMPACTION
+ thrust::device_ptr dev_thrust_path_end = thrust::stable_partition(dev_thrust_paths, dev_thrust_paths + compact_num_paths, not_zero());
+ dev_path_end = dev_thrust_path_end.get();
+ compact_num_paths = dev_path_end - dev_paths;
+#endif
+
+ // TODO: should be based off stream compaction results
+ if (depth == traceDepth || dev_paths == dev_path_end) { iterationComplete = true; }
if (guiData != NULL)
{
diff --git a/src/scene.cpp b/src/scene.cpp
index 3fb6239..6d7b036 100644
--- a/src/scene.cpp
+++ b/src/scene.cpp
@@ -4,6 +4,9 @@
#include
#include
+#define TINYOBJLOADER_IMPLEMENTATION
+#include "tiny_obj_loader.h"
+
Scene::Scene(string filename) {
cout << "Reading scene from " << filename << " ..." << endl;
cout << " " << endl;
@@ -45,12 +48,20 @@ int Scene::loadGeom(string objectid) {
//load object type
utilityCore::safeGetline(fp_in, line);
if (!line.empty() && fp_in.good()) {
- if (strcmp(line.c_str(), "sphere") == 0) {
+ vector tokens = utilityCore::tokenizeString(line);
+ if (strcmp(tokens[0].c_str(), "sphere") == 0) {
cout << "Creating new sphere..." << endl;
newGeom.type = SPHERE;
- } else if (strcmp(line.c_str(), "cube") == 0) {
+ //newGeom.tri = glm::vec3(-1.0, -1.0, -1.0);
+ } else if (strcmp(tokens[0].c_str(), "cube") == 0) {
cout << "Creating new cube..." << endl;
newGeom.type = CUBE;
+ //newGeom.tri = glm::vec3(-1.0, -1.0, -1.0);
+ }
+ else if (strcmp(tokens[0].c_str(), "mesh") == 0) {
+ cout << "Creating new mesh..." << endl;
+ loadOBJ(tokens[1], id);
+ return 1;
}
}
@@ -96,7 +107,7 @@ int Scene::loadCamera() {
float fovy;
//load static properties
- for (int i = 0; i < 5; i++) {
+ for (int i = 0; i < 7; i++) {
string line;
utilityCore::safeGetline(fp_in, line);
vector tokens = utilityCore::tokenizeString(line);
@@ -111,6 +122,10 @@ int Scene::loadCamera() {
state.traceDepth = atoi(tokens[1].c_str());
} else if (strcmp(tokens[0].c_str(), "FILE") == 0) {
state.imageName = tokens[1];
+ } else if (strcmp(tokens[0].c_str(), "LENS_RADIUS") == 0) {
+ camera.lens_radius = atof(tokens[1].c_str());
+ } else if (strcmp(tokens[0].c_str(), "FOCAL_DIST") == 0) {
+ camera.focal_dist = atof(tokens[1].c_str());
}
}
@@ -186,3 +201,118 @@ int Scene::loadMaterial(string materialid) {
return 1;
}
}
+
+// Load a mesh from an OBJ file using tinyobjloader (based on the example given by the tinyobj library).
+// Appends one MESH Geom per OBJ shape to `geoms` and that shape's triangles to `triangles`; returns 1.
+int Scene::loadOBJ(string filename, int objectid)
+{
+    int materialid = -1;
+    glm::vec3 translation = glm::vec3(0.0, 0.0, 0.0);
+    glm::vec3 rotation = glm::vec3(0.0, 0.0, 0.0);
+    glm::vec3 scale = glm::vec3(1.0, 1.0, 1.0);
+    string line;
+
+    // Get material id (same for entire mesh); it stays -1 if the line has no second token
+    utilityCore::safeGetline(fp_in, line);
+    if (!line.empty() && fp_in.good()) {
+        vector<string> tokens = utilityCore::tokenizeString(line);
+        if (tokens.size() > 1) {
+            materialid = atoi(tokens[1].c_str());
+        }
+        cout << "Connecting Geom " << objectid << " to Material " << materialid << "..." << endl;
+    }
+
+    // Get transformations (default for all triangles)
+    glm::mat4 transform = utilityCore::buildTransformationMatrix(translation, rotation, scale);
+    glm::mat4 inverseTransform = glm::inverse(transform);
+    glm::mat4 invTranspose = glm::inverseTranspose(transform);
+
+    // Load obj using tinyobjloader
+    std::string inputfile = "../obj/" + filename;
+    tinyobj::ObjReaderConfig reader_config;
+    tinyobj::ObjReader reader;
+
+    if (!reader.ParseFromFile(inputfile, reader_config)) {
+        if (!reader.Error().empty()) {
+            std::cerr << "TinyObjReader: " << reader.Error();
+        }
+        exit(1);
+    }
+
+    if (!reader.Warning().empty()) {
+        std::cout << "TinyObjReader: " << reader.Warning();
+    }
+
+    auto& attrib = reader.GetAttrib();
+    auto& shapes = reader.GetShapes();
+
+    meshCount = 0;
+    mesh_aabbs.resize(shapes.size()); // one bounding box per shape, sized once before the loop
+    for (size_t s = 0; s < shapes.size(); s++) {
+        const auto& mesh = shapes[s].mesh;
+        std::vector<Triangle> mesh_triangles;
+        mesh_triangles.reserve(mesh.num_face_vertices.size());
+
+        // Track aabb
+        glm::vec3 min = glm::vec3(INFINITY, INFINITY, INFINITY);
+        glm::vec3 max = glm::vec3(-INFINITY, -INFINITY, -INFINITY);
+
+        // Loop over faces(polygon)
+        size_t index_offset = 0;
+        for (size_t f = 0; f < mesh.num_face_vertices.size(); f++) {
+            size_t fv = size_t(mesh.num_face_vertices[f]);
+
+            // Loop over vertices in the face.
+            // NOTE(review): verts/norms are indexed by v, so this relies on fv == 3 - confirm the reader keeps its triangulating default.
+            Triangle tri;
+            for (size_t v = 0; v < fv; v++) {
+                tinyobj::index_t idx = mesh.indices[index_offset + v];
+
+                tinyobj::real_t vx = attrib.vertices[3 * size_t(idx.vertex_index) + 0];
+                tinyobj::real_t vy = attrib.vertices[3 * size_t(idx.vertex_index) + 1];
+                tinyobj::real_t vz = attrib.vertices[3 * size_t(idx.vertex_index) + 2];
+                tri.verts[v] = glm::vec3((float)vx, (float)vy, (float)vz);
+
+                if (idx.normal_index >= 0) { // normals are optional; tri.norms[v] is not written when absent
+                    tinyobj::real_t nx = attrib.normals[3 * size_t(idx.normal_index) + 0];
+                    tinyobj::real_t ny = attrib.normals[3 * size_t(idx.normal_index) + 1];
+                    tinyobj::real_t nz = attrib.normals[3 * size_t(idx.normal_index) + 2];
+                    tri.norms[v] = glm::vec3((float)nx, (float)ny, (float)nz);
+                }
+
+                // Determine AABB min and max
+                min = glm::min(min, tri.verts[v]);
+                max = glm::max(max, tri.verts[v]);
+            }
+            tri.computeAABB();
+            tri.computeCentroid();
+            tri.objectId = f;
+            mesh_triangles.push_back(tri);
+
+            index_offset += fv;
+        }
+
+        // Set AABB
+        mesh_aabbs[s].min = min;
+        mesh_aabbs[s].max = max;
+
+        // Initialize new mesh
+        Geom newGeom;
+        newGeom.type = MESH;
+        newGeom.aabb = mesh_aabbs[s];
+        newGeom.startIdx = triangles.size();
+        newGeom.triangleCount = mesh_triangles.size();
+        newGeom.materialid = materialid;
+        newGeom.translation = translation;
+        newGeom.rotation = rotation;
+        newGeom.scale = scale;
+        newGeom.transform = transform;
+        newGeom.inverseTransform = inverseTransform;
+        newGeom.invTranspose = invTranspose;
+        triangles.insert(triangles.end(), mesh_triangles.begin(), mesh_triangles.end());
+        geoms.push_back(newGeom);
+        meshCount++;
+    }
+
+    return 1;
+}
diff --git a/src/scene.h b/src/scene.h
index f29a917..7034431 100644
--- a/src/scene.h
+++ b/src/scene.h
@@ -5,6 +5,7 @@
#include
#include
#include "glm/glm.hpp"
+#include "lbvh.h"
#include "utilities.h"
#include "sceneStructs.h"
@@ -16,11 +17,19 @@ class Scene {
int loadMaterial(string materialid);
int loadGeom(string objectid);
int loadCamera();
+ int loadOBJ(string filename, int objectid);
public:
Scene(string filename);
~Scene();
std::vector geoms;
std::vector materials;
+ std::vector triangles;
+ std::vector sorted_triangles;
+ std::vector mcodes;
+ std::vector bvh;
+ std::vector lbvh;
+ std::vector