diff --git a/CMakeLists.txt b/CMakeLists.txt
index 162568b..ca9eb84 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,6 +73,7 @@ set(headers
     src/sceneStructs.h
     src/preview.h
     src/utilities.h
+src/common.h
     )
 
 set(sources
@@ -84,6 +85,7 @@ set(sources
     src/scene.cpp
     src/preview.cpp
     src/utilities.cpp
+src/common.cu
     )
 
 set(imgui
diff --git a/src/common.cu b/src/common.cu
new file mode 100644
index 0000000..7a83529
--- /dev/null
+++ b/src/common.cu
@@ -0,0 +1,40 @@
+#include "common.h"
+
+
+namespace StreamCompaction {
+    namespace Common {
+
+        /**
+         * Writes a keep-flag for each of the n inputs: 1 where idata[i] is non-zero,
+         * 0 otherwise. The same flag is stored in both bools1[i] and bools2[i].
+         */
+        __global__ void kernMapToBoolean(int n, int* bools1, int* bools2, const int* idata) {
+            // One thread per element, flat 1D indexing.
+            const int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (tid < n) {
+                // Non-zero input maps to 1, zero maps to 0.
+                const int flag = (idata[tid] == 0) ? 0 : 1;
+
+                bools2[tid] = flag;
+                bools1[tid] = flag;
+            }
+        }
+
+        /**
+         * Scatter step of stream compaction over n elements: every element whose
+         * bools[i] is non-zero is copied from idata[i] to odata[indices[i]].
+         * Elements whose flag is 0 are left out.
+         */
+        __global__ void kernScatter(int n, int* odata,
+            const int* idata, const int* bools, const int* indices) {
+            // One thread per input element, flat 1D indexing.
+            const int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+            // Threads past the end, and elements flagged 0, write nothing.
+            if (tid >= n || bools[tid] == 0) {
+                return;
+            }
+            odata[indices[tid]] = idata[tid];
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/common.h b/src/common.h
new file mode 100644
index 0000000..d2c1fed
--- /dev/null
+++ b/src/common.h
@@ -0,0 +1,132 @@
+#pragma once
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include <algorithm>
+#include <chrono>
+#include <stdexcept>
+
+#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+
+/**
+ * Check for CUDA errors; print and exit if there was a problem.
+ */
+void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1);
+
+// floor(log2(x)) for x >= 1, and 0 for any x <= 1 (a negative x used to loop forever).
+inline int ilog2(int x) {
+    unsigned int v = x > 0 ? static_cast<unsigned int>(x) : 0u;  // `>>` on a negative int never reaches 0
+    int lg = 0;
+    while (v >>= 1) { ++lg; }
+    return lg;
+}
+
+inline int ilog2ceil(int x) {  // ceil(log2(x)) for x >= 1; 0 for every x <= 1
+    return x <= 1 ? 0 : ilog2(x - 1) + 1;  // x <= 0 used to hand ilog2 a negative value, which never returned
+}
+
+namespace StreamCompaction {
+    namespace Common {
+        __global__ void kernMapToBoolean(int n, int *bools1, int *bools2, const int *idata);  // same parameter list as the definition in common.cu
+
+        __global__ void kernScatter(int n, int *odata,
+                const int *idata, const int *bools, const int *indices);
+
+        /**
+        * Stopwatch with a CPU side (std::chrono) and a GPU side (CUDA events).
+        * Every start/end pair stores its elapsed milliseconds for the getters;
+        * unbalanced calls throw std::runtime_error. Uncopyable and unmovable.
+        * Adapted from WindyDarian(https://github.com/WindyDarian)
+        */
+        class PerformanceTimer
+        {
+        public:
+            PerformanceTimer()
+            {
+                cudaEventCreate(&event_start);
+                cudaEventCreate(&event_end);
+            }
+
+            ~PerformanceTimer()
+            {
+                cudaEventDestroy(event_start);
+                cudaEventDestroy(event_end);
+            }
+            // Opens a CPU interval; throws if one is already open.
+            void startCpuTimer()
+            {
+                if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
+                cpu_timer_started = true;
+
+                time_start_cpu = std::chrono::high_resolution_clock::now();
+            }
+            // Closes the CPU interval and stores its length; throws if none is open.
+            void endCpuTimer()
+            {
+                // Sample the clock first so the state check is not part of the interval.
+                time_end_cpu = std::chrono::high_resolution_clock::now();
+                if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
+
+                std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
+                prev_elapsed_time_cpu_milliseconds =
+                    static_cast<decltype(prev_elapsed_time_cpu_milliseconds)>(duro.count());
+
+                cpu_timer_started = false;
+            }
+            // Opens a GPU interval by recording event_start; throws if one is already open.
+            void startGpuTimer()
+            {
+                if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); }
+                gpu_timer_started = true;
+
+                cudaEventRecord(event_start);
+            }
+            // Closes the GPU interval: records event_end, waits for it, stores the elapsed time.
+            void endGpuTimer()
+            {
+                // Validate before issuing any CUDA call: an unmatched end must not
+                // record an event or block the host in cudaEventSynchronize.
+                if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
+                cudaEventRecord(event_end);
+                cudaEventSynchronize(event_end);
+                cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
+                gpu_timer_started = false;
+            }
+            // Milliseconds stored by the most recent endCpuTimer call (0 before the first).
+            float getCpuElapsedTimeForPreviousOperation() const
+            {
+                return prev_elapsed_time_cpu_milliseconds;
+            }
+            // Milliseconds stored by the most recent endGpuTimer call (0 before the first).
+            float getGpuElapsedTimeForPreviousOperation() const
+            {
+                return prev_elapsed_time_gpu_milliseconds;
+            }
+
+            // remove copy and move functions
+            PerformanceTimer(const PerformanceTimer&) = delete;
+            PerformanceTimer(PerformanceTimer&&) = delete;
+            PerformanceTimer& operator=(const PerformanceTimer&) = delete;
+            PerformanceTimer& operator=(PerformanceTimer&&) = delete;
+
+        private:
+            cudaEvent_t event_start = nullptr;
+            cudaEvent_t event_end = nullptr;
+
+            using time_point_t = std::chrono::high_resolution_clock::time_point;
+            time_point_t time_start_cpu;
+            time_point_t time_end_cpu;
+
+            bool cpu_timer_started = false;
+            bool gpu_timer_started = false;
+
+            float prev_elapsed_time_cpu_milliseconds = 0.f;
+            float prev_elapsed_time_gpu_milliseconds = 0.f;
+        };
+    }
+}
diff --git a/src/main.cpp b/src/main.cpp
index 4092ae4..ce6124c 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -5,7 +5,7 @@
 #include "../imgui/imgui.h"
 #include "../imgui/imgui_impl_glfw.h"
 #include "../imgui/imgui_impl_opengl3.h"
-
+#include "common.h"
 static std::string startTimeString;
 
 // For camera controls
@@ -29,6 +29,7 @@ float ui_colorWeight = 0.45f;
 float ui_normalWeight = 0.35f;
 float ui_positionWeight = 0.2f;
 bool ui_saveAndExit = false;
+bool imageDenoised = false;
 
 static bool camchanged = true;
 static float dtheta = 0, dphi = 0;
@@ -45,6 +46,44 @@ int iteration;
 int width;
 int height;
 
+static float timePT;
+static float timeAT;
+static bool  hasPrinted;
+using StreamCompaction::Common::PerformanceTimer;
+#define TIMER 1
+
+PerformanceTimer& timer()  // returns the single function-local static timer, constructed on first call
+{
+    static PerformanceTimer sharedTimer;
+    return sharedTimer;
+}
+
+// Fills kernel (caller-allocated, filter_size * filter_size floats, row-major)
+// with a 2D Gaussian of sigma 1 sampled about the window centre and scaled to
+// sum to 1. Returns without writing when filter_size <= 0 or kernel is null.
+void FilterCreation(int filter_size, float *kernel)
+{
+    if (filter_size <= 0 || kernel == nullptr) { return; }
+    const float sigma = 1.0f;
+    const float s = 2.0f * sigma * sigma;
+    // Measure offsets from (filter_size - 1) / 2 so exactly filter_size^2 taps are
+    // written; looping -size/2..size/2 wrote (size + 1)^2 taps for even sizes.
+    const float centre = 0.5f * (filter_size - 1);
+    const int taps = filter_size * filter_size;
+    float sum = 0.0f;  // running total used for normalisation
+    for (int i = 0; i < taps; ++i) {
+        const float x = (i / filter_size) - centre;
+        const float y = (i % filter_size) - centre;
+        kernel[i] = glm::exp(-(x * x + y * y) / s) / (PI * s);
+        sum += kernel[i];
+    }
+    // normalising the Kernel
+    for (int i = 0; i < taps; ++i) {
+        kernel[i] /= sum;
+    }
+}
+
+
 //-------------------------------
 //-------------MAIN--------------
 //-------------------------------
@@ -56,7 +95,6 @@ int main(int argc, char** argv) {
         printf("Usage: %s SCENEFILE.txt\n", argv[0]);
         return 1;
     }
-
     const char *sceneFile = argv[1];
 
     // Load scene file
@@ -150,8 +188,11 @@ void runCuda() {
     // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer
 
     if (iteration == 0) {
+        float *gKernel = new float[5 * 5];
+        FilterCreation(5, gKernel);
         pathtraceFree();
-        pathtraceInit(scene);
+        pathtraceInit(scene, gKernel);
+        imageDenoised = false;
     }
 
     uchar4 *pbo_dptr = NULL;
@@ -160,14 +201,54 @@ void runCuda() {
     if (iteration < ui_iterations) {
         iteration++;
 
+#if TIMER
+        // Start Timer
+        if (iteration == 1)
+        {
+            timePT = 0.f;
+        }
+        timer().startCpuTimer();
+#endif // TIMER
+
         // execute the kernel
         int frame = 0;
-        pathtrace(frame, iteration);
+        pathtrace(frame, iteration); 
+
+#if TIMER
+        timer().endCpuTimer();
+        timePT += timer().getCpuElapsedTimeForPreviousOperation();
+        if (iteration == ui_iterations) {
+            std::cout << "Path-trace time for " << iteration << " iterations: " << timePT << "ms" << std::endl;
+        }
+#endif // TIMER
     }
 
     if (ui_showGbuffer) {
       showGBuffer(pbo_dptr);
-    } else {
+    }
+    else if (ui_denoise && iteration == ui_iterations)
+    {
+        if (!imageDenoised)
+        {
+#if TIMER
+            // Start Timer
+            timeAT = 0.f;
+            if (!hasPrinted) {
+                timer().startCpuTimer();
+            }
+#endif // TIMER
+            imageDenoised = DenoiseImage(renderState->camera.resolution.x, renderState->camera.resolution.y, iteration, ui_filterSize,
+                ui_colorWeight, ui_normalWeight, ui_positionWeight);
+
+#if TIMER
+                timer().endCpuTimer();
+                timeAT += timer().getCpuElapsedTimeForPreviousOperation();
+                std::cout << "Denoise time for " << iteration << " iterations: " << timeAT << "ms\n\n" << std::endl;
+#endif // TIMER
+        }
+        showDenoise(pbo_dptr, iteration);
+    }
+    else {
       showImage(pbo_dptr, iteration);
     }
 
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index 23e5f90..c295723 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -16,118 +16,214 @@
 
 #define ERRORCHECK 1
 
+
+//Flags for Denoising
+#define SimpleGaussianDenoise 0
+#define ATrous1 1
+#define ATrous2 0
+
+
+
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
-void checkCUDAErrorFn(const char *msg, const char *file, int line) {
+void checkCUDAErrorFn(const char* msg, const char* file, int line) {  // prints the pending CUDA error, if any, and exits
 #if ERRORCHECK
-    cudaDeviceSynchronize();
-    cudaError_t err = cudaGetLastError();
-    if (cudaSuccess == err) {
-        return;
-    }
-
-    fprintf(stderr, "CUDA error");
-    if (file) {
-        fprintf(stderr, " (%s:%d)", file, line);
-    }
-    fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
+	cudaDeviceSynchronize();  // wait for outstanding device work before reading the error state
+	cudaError_t err = cudaGetLastError();
+	if (cudaSuccess == err) {
+		return;
+	}
+	// Report as: CUDA error (file:line): msg: error string
+	fprintf(stderr, "CUDA error");
+	if (file) {  // the location is printed only when a file name was supplied
+		fprintf(stderr, " (%s:%d)", file, line);
+	}
+	fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
 #  ifdef _WIN32
-    getchar();
+	getchar();  // on Windows, wait for a key press before exiting
 #  endif
-    exit(EXIT_FAILURE);
+	exit(EXIT_FAILURE);
 #endif
 }
 
+
+float gaussianKernel[25] = { 0.003765, 0.015019, 0.023792, 0.015019, 0.003765,
+0.015019, 0.059912, 0.094907, 0.059912, 0.015019,
+0.023792, 0.094907, 0.150342, 0.094907, 0.023792,
+0.015019, 0.059912, 0.094907, 0.059912, 0.015019,
+0.003765, 0.015019, 0.023792, 0.015019, 0.003765, };
+
+
+//glm::vec2 offsetKernel[25];
+
 __host__ __device__
 thrust::default_random_engine makeSeededRandomEngine(int iter, int index, int depth) {
-    int h = utilhash((1 << 31) | (depth << 22) | iter) ^ utilhash(index);
-    return thrust::default_random_engine(h);
+	int h = utilhash((1 << 31) | (depth << 22) | iter) ^ utilhash(index);  // seed mixes iter, depth and index via utilhash
+	return thrust::default_random_engine(h);  // engine seeded with that hash
 }
 
 //Kernel that writes the image to the OpenGL PBO directly.
 __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
-        int iter, glm::vec3* image) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-
-    if (x < resolution.x && y < resolution.y) {
-        int index = x + (y * resolution.x);
-        glm::vec3 pix = image[index];
-
-        glm::ivec3 color;
-        color.x = glm::clamp((int) (pix.x / iter * 255.0), 0, 255);
-        color.y = glm::clamp((int) (pix.y / iter * 255.0), 0, 255);
-        color.z = glm::clamp((int) (pix.z / iter * 255.0), 0, 255);
-
-        // Each thread writes one pixel location in the texture (textel)
-        pbo[index].w = 0;
-        pbo[index].x = color.x;
-        pbo[index].y = color.y;
-        pbo[index].z = color.z;
-    }
+	int iter, glm::vec3* image) {
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;  // pixel column handled by this thread
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;  // pixel row handled by this thread
+	// Threads outside the image do nothing.
+	if (x < resolution.x && y < resolution.y) {
+		int index = x + (y * resolution.x);  // row-major pixel index
+		glm::vec3 pix = image[index];
+		// Divide the stored value by iter, scale to 0..255 and clamp each channel.
+		glm::ivec3 color;
+		color.x = glm::clamp((int)((pix.x / iter) * 255.0), 0, 255);
+		color.y = glm::clamp((int)((pix.y / iter) * 255.0), 0, 255);
+		color.z = glm::clamp((int)((pix.z / iter) * 255.0), 0, 255);
+
+		// Each thread writes one pixel location in the texture (texel)
+		pbo[index].w = 0;
+		pbo[index].x = color.x;
+		pbo[index].y = color.y;
+		pbo[index].z = color.z;
+	}
 }
 
 __global__ void gbufferToPBO(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-
-    if (x < resolution.x && y < resolution.y) {
-        int index = x + (y * resolution.x);
-        float timeToIntersect = gBuffer[index].t * 256.0;
-
-        pbo[index].w = 0;
-        pbo[index].x = timeToIntersect;
-        pbo[index].y = timeToIntersect;
-        pbo[index].z = timeToIntersect;
-    }
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;  // pixel column handled by this thread
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;  // pixel row handled by this thread
+	// Threads outside the image do nothing.
+	if (x < resolution.x && y < resolution.y) {
+		int index = x + (y * resolution.x);  // row-major pixel index
+		float timeToIntersect = gBuffer[index].t * 256.0f;  // stored t scaled by 256
+		// The same value goes to all three colour channels (greyscale).
+		pbo[index].w = 0;
+		pbo[index].x = timeToIntersect;  // NOTE(review): float stored into an unsigned char with no clamp - confirm the range of t
+		pbo[index].y = timeToIntersect;
+		pbo[index].z = timeToIntersect;
+	}
+}
+
+__global__ void gbufferToPBO_Normals(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer) {
+	// Visualises the G-buffer normal: |normal| per component scaled to 0..255, written as RGB.
+	const int px = threadIdx.x + (blockIdx.x * blockDim.x);
+	const int py = threadIdx.y + (blockIdx.y * blockDim.y);
+	if (px >= resolution.x || py >= resolution.y) {
+		return;  // thread lies outside the image
+	}
+	const int pixel = (py * resolution.x) + px;
+	const glm::vec3 absNormal = glm::abs(gBuffer[pixel].normal);
+
+	// Scale in double precision, truncate to int, then clamp to a byte.
+	const int r = glm::clamp((int)(absNormal.x * 255.0), 0, 255);
+	const int g = glm::clamp((int)(absNormal.y * 255.0), 0, 255);
+	const int b = glm::clamp((int)(absNormal.z * 255.0), 0, 255);
+	pbo[pixel].x = r;
+	pbo[pixel].y = g;
+	pbo[pixel].z = b;
+	pbo[pixel].w = 0;
 }
 
-static Scene * hst_scene = NULL;
-static glm::vec3 * dev_image = NULL;
-static Geom * dev_geoms = NULL;
-static Material * dev_materials = NULL;
-static PathSegment * dev_paths = NULL;
-static ShadeableIntersection * dev_intersections = NULL;
+__global__ void gbufferToPBO_Position(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer) {
+	// Visualises the G-buffer position: |position| per component times 20, clamped to 0..255, as RGB.
+	const int px = threadIdx.x + (blockIdx.x * blockDim.x);
+	const int py = threadIdx.y + (blockIdx.y * blockDim.y);
+	if (px >= resolution.x || py >= resolution.y) {
+		return;  // thread lies outside the image
+	}
+	const int pixel = (py * resolution.x) + px;
+	const glm::vec3 absPosition = glm::abs(gBuffer[pixel].position);
+
+	// Scale in double precision, truncate to int, then clamp to a byte.
+	const int r = glm::clamp((int)(absPosition.x * 20.0), 0, 255);
+	const int g = glm::clamp((int)(absPosition.y * 20.0), 0, 255);
+	const int b = glm::clamp((int)(absPosition.z * 20.0), 0, 255);
+	pbo[pixel].x = r;
+	pbo[pixel].y = g;
+	pbo[pixel].z = b;
+	pbo[pixel].w = 0;
+}
+
+
+static Scene* hst_scene = NULL;
+static glm::vec3* dev_image = NULL;
+static Geom* dev_geoms = NULL;
+static Material* dev_materials = NULL;
+static PathSegment* dev_paths = NULL;
+static ShadeableIntersection* dev_intersections = NULL;
 static GBufferPixel* dev_gBuffer = NULL;
 // TODO: static variables for device memory, any extra info you need, etc
 // ...
+static float* dev_gausKernel = NULL;
+static glm::vec2* dev_offsetKernel = NULL;
+static glm::vec3* dev_TrousImage = NULL;
+static glm::vec3* dev_pingPongImage = NULL;
+
+//static glm::vec3* dev_IntermediaryImage = NULL;
+
+// Appends the (x, y) tap offsets of a square window to offsetKernel, row by row
+// (y outer, x inner); an even filterSize is reduced by one to make it odd.
+void generateOffsetKern(int filterSize, vector<glm::vec2> &offsetKernel)
+{
+	const int radius = ((filterSize % 2 == 0) ? filterSize - 1 : filterSize) / 2;
+	for (int y = -radius; y <= radius; y++)
+	{
+		for (int x = -radius; x <= radius; x++)
+		{
+			offsetKernel.push_back(glm::vec2(x, y));
+		}
+	}
+}
+
+void pathtraceInit(Scene* scene,float *gausKernel) {
+	// Allocates the device buffers for scene and uploads its geometry, materials and the 5x5 denoise tables.
+	hst_scene = scene;
+	const Camera& cam = hst_scene->state.camera;
+	const int pixelcount = cam.resolution.x * cam.resolution.y;
+	cudaMalloc(&dev_image, pixelcount * sizeof(glm::vec3));
+	cudaMemset(dev_image, 0, pixelcount * sizeof(glm::vec3));
 
-void pathtraceInit(Scene *scene) {
-    hst_scene = scene;
-    const Camera &cam = hst_scene->state.camera;
-    const int pixelcount = cam.resolution.x * cam.resolution.y;
+	cudaMalloc(&dev_paths, pixelcount * sizeof(PathSegment));
+
+	cudaMalloc(&dev_geoms, scene->geoms.size() * sizeof(Geom));
+	cudaMemcpy(dev_geoms, scene->geoms.data(), scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&dev_materials, scene->materials.size() * sizeof(Material));
+	cudaMemcpy(dev_materials, scene->materials.data(), scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&dev_intersections, pixelcount * sizeof(ShadeableIntersection));
+	cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
 
-    cudaMalloc(&dev_image, pixelcount * sizeof(glm::vec3));
-    cudaMemset(dev_image, 0, pixelcount * sizeof(glm::vec3));
+	cudaMalloc(&dev_gBuffer, pixelcount * sizeof(GBufferPixel));
 
-  	cudaMalloc(&dev_paths, pixelcount * sizeof(PathSegment));
+	// Denoise filter weights: the caller's 25-float host array when non-null, else the built-in gaussianKernel table.
 
-  	cudaMalloc(&dev_geoms, scene->geoms.size() * sizeof(Geom));
-  	cudaMemcpy(dev_geoms, scene->geoms.data(), scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice);
+	cudaMalloc(&dev_gausKernel, 25 * sizeof(float));
+	cudaMemcpy(dev_gausKernel, gausKernel ? gausKernel : gaussianKernel, 25 * sizeof(float), cudaMemcpyHostToDevice);
 
-  	cudaMalloc(&dev_materials, scene->materials.size() * sizeof(Material));
-  	cudaMemcpy(dev_materials, scene->materials.data(), scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice);
+	vector< glm::vec2> offKern;  // 25 (x, y) tap offsets for the 5x5 window
 
-  	cudaMalloc(&dev_intersections, pixelcount * sizeof(ShadeableIntersection));
-  	cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+	generateOffsetKern(5, offKern);
+	cudaMalloc(&dev_offsetKernel, 25 * sizeof(glm::vec2));
+	cudaMemcpy(dev_offsetKernel, offKern.data(), 25 * sizeof(glm::vec2), cudaMemcpyHostToDevice);
 
-    cudaMalloc(&dev_gBuffer, pixelcount * sizeof(GBufferPixel));
+	cudaMalloc(&dev_TrousImage, pixelcount * sizeof(glm::vec3));
+	cudaMalloc(&dev_pingPongImage, pixelcount * sizeof(glm::vec3));
 
-    // TODO: initialize any extra device memeory you need
 
-    checkCUDAError("pathtraceInit");
+	checkCUDAError("pathtraceInit");
 }
 
 void pathtraceFree() {
-    cudaFree(dev_image);  // no-op if dev_image is null
-  	cudaFree(dev_paths);
-  	cudaFree(dev_geoms);
-  	cudaFree(dev_materials);
-  	cudaFree(dev_intersections);
-    cudaFree(dev_gBuffer);
-    // TODO: clean up any extra device memory you created
-
-    checkCUDAError("pathtraceFree");
+	cudaFree(dev_image);  // no-op if dev_image is null
+	cudaFree(dev_paths);
+	cudaFree(dev_geoms);
+	cudaFree(dev_materials);
+	cudaFree(dev_intersections);
+	cudaFree(dev_gBuffer);
+	// Denoiser buffers: filter weights, tap offsets and the two working images.
+	// NOTE(review): the pointers are not reset to NULL after being freed - confirm no path calls this twice in a row.
+	cudaFree(dev_gausKernel);
+	cudaFree(dev_offsetKernel);
+	cudaFree(dev_TrousImage);
+	cudaFree(dev_pingPongImage);
+	checkCUDAError("pathtraceFree");
 }
 
 /**
@@ -145,289 +241,585 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 
 	if (x < cam.resolution.x && y < cam.resolution.y) {
 		int index = x + (y * cam.resolution.x);
-		PathSegment & segment = pathSegments[index];
+		PathSegment& segment = pathSegments[index];
 
 		segment.ray.origin = cam.position;
-    segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
+		segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
 
 		segment.ray.direction = glm::normalize(cam.view
 			- cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f)
 			- cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f)
-			);
-
+		);
 		segment.pixelIndex = index;
 		segment.remainingBounces = traceDepth;
 	}
 }
 
-__global__ void computeIntersections(
-	int depth
-	, int num_paths
-	, PathSegment * pathSegments
-	, Geom * geoms
-	, int geoms_size
-	, ShadeableIntersection * intersections
+
+	__global__ void GenerateGaussianBlur(
+		int pixelCount, int stepWidth,
+		float* dev_gausKernel, glm::vec2 *dev_offsetKernel,
+		glm::vec3* dev_colorImage, glm::vec3 *dev_TrousImage,
+		int resolutionX, int resolutionY
 	)
-{
-	int path_index = blockIdx.x * blockDim.x + threadIdx.x;
+	{
 
-	if (path_index < num_paths)
+		// Weighted 25-tap blur with taps spaced stepWidth pixels apart; one thread per pixel.
+		const int pixel = threadIdx.x + blockIdx.x * blockDim.x;
+		if (pixel >= pixelCount)
+		{
+			return;
+		}
+		// Recover the 2D coordinate of this pixel from its flat index.
+		const int px = (int)(pixel % resolutionX);
+		const int py = pixel / resolutionX;
+
+		glm::vec3 blurred = glm::vec3(0.0f);
+		for (int tap = 0; tap < 25; ++tap)
+		{
+			const int dx = dev_offsetKernel[tap].x;
+			const int dy = dev_offsetKernel[tap].y;
+			const int sx = px + dx * stepWidth;
+			const int sy = py + dy * stepWidth;
+
+			// Taps that land outside the image contribute nothing.
+			if (sx < 0 || sx > (resolutionX - 1) || sy < 0 || sy > (resolutionY - 1))
+			{
+				continue;
+			}
+			const int src = sy * resolutionX + sx;
+			if (src >= 0 && src < pixelCount)
+			{
+				blurred += dev_colorImage[src] * dev_gausKernel[tap];
+			}
+		}
+		dev_TrousImage[pixel] = blurred;
+
+	}
+
+	/// <summary>
+	/// Edge-avoiding A-trous pass for a flat 1D launch: each thread recovers its
+	/// pixel's 2D coordinate from the 1D index and blends the 25 taps.
+	/// </summary>
+	/// <param name="pixelCount">Number of pixels; threads at or beyond it do nothing.</param>
+	/// <param name="stepWidth">Pixel spacing between neighbouring taps.</param>
+	/// <param name="dev_gausKernel">25 filter weights, one per tap.</param>
+	/// <param name="dev_offsetKernel">25 (x, y) tap offsets, multiplied by stepWidth.</param>
+	/// <param name="dev_colorImage">Input colour per pixel.</param>
+	/// <param name="dev_TrousImage">Output colour per pixel.</param>
+	/// <param name="gbuf">Per-pixel normal and position used for the edge-stopping weights.</param>
+	/// <param name="resolutionX">Image width in pixels.</param>
+	/// <param name="resolutionY">Image height in pixels.</param>
+	/// <param name="ui_colorWeight">Colour sigma; squared before use.</param>
+	/// <param name="ui_normalWeight">Normal sigma; squared before use.</param>
+	/// <param name="ui_positionWeight">Position sigma; squared before use.</param>
+	__global__ void GenerateAtrousImage(
+		int pixelCount, int stepWidth,
+		float* dev_gausKernel, glm::vec2* dev_offsetKernel,
+		glm::vec3* dev_colorImage, glm::vec3* dev_TrousImage,
+		GBufferPixel* gbuf, int resolutionX, int resolutionY, float ui_colorWeight,
+		float ui_normalWeight, float ui_positionWeight
+	)
 	{
-		PathSegment pathSegment = pathSegments[path_index];
 
-		float t;
-		glm::vec3 intersect_point;
-		glm::vec3 normal;
-		float t_min = FLT_MAX;
-		int hit_geom_index = -1;
-		bool outside = true;
+		int index = blockIdx.x * blockDim.x + threadIdx.x;
 
-		glm::vec3 tmp_intersect;
-		glm::vec3 tmp_normal;
+		if (index < pixelCount)
+		{
+			glm::vec3 sum = glm::vec3(0.0f);
+			glm::vec3 cval = dev_colorImage[index];
+			glm::vec3 nval = gbuf[index].normal;
+			glm::vec3 pval = gbuf[index].position;
+			// Squared sigmas, floored so a zero UI weight cannot produce 0/0 (NaN) in the exponents.
+			float cphi = glm::max(ui_colorWeight * ui_colorWeight, 1e-6f);
+			float nphi = glm::max(ui_normalWeight * ui_normalWeight, 1e-6f);
+			float pphi = glm::max(ui_positionWeight * ui_positionWeight, 1e-6f);
+			// 2D coordinate of this pixel; the same for every tap, so computed once.
+			int index2D_y = index / resolutionX;
+			int index2D_x = (int)(index % resolutionX);
+
+			float cum_w = 0.0f;
+			for (int i = 0; i < 25; i++)
+			{
+				int offsetX = dev_offsetKernel[i].x;
+				int offsetY = dev_offsetKernel[i].y;
+
+				int finalValue_X = index2D_x + offsetX * stepWidth; // Final Offset Values
+				int finalValue_Y = index2D_y + offsetY * stepWidth; // Final Offset Values
+				// Skip taps that fall outside the image.
+				if (finalValue_X >= 0 && finalValue_X <= (resolutionX - 1) && finalValue_Y >= 0 && finalValue_Y <= (resolutionY - 1))
+				{
+					int offsetColorIdx = finalValue_Y * resolutionX + finalValue_X;
+					if (offsetColorIdx >= 0 && offsetColorIdx < pixelCount)
+					{
+						// Colour term.
+						glm::vec3 ctmp = dev_colorImage[offsetColorIdx];
+						glm::vec3 t = cval - ctmp;
+						float dist2 = glm::dot(t, t);
+						float c_w = glm::min(glm::exp(-(dist2) / cphi), 1.0f);
+						// Normal term: divides by the normal sigma (nphi); distance scaled by stepWidth^2.
+						glm::vec3 ntmp = gbuf[offsetColorIdx].normal;
+						t = nval - ntmp;
+						dist2 = glm::max(glm::dot(t, t)/ (stepWidth * stepWidth), 0.0f);
+						float n_w = glm::min(glm::exp(-(dist2) / nphi), 1.0f);
+						// Position term: divides by the position sigma (pphi).
+						glm::vec3 ptmp = gbuf[offsetColorIdx].position;
+						t = pval - ptmp;
+						dist2 = glm::dot(t, t);
+						float p_w = glm::min(glm::exp(-(dist2) / pphi), 1.0f);
+						float weight = c_w * n_w * p_w;
+
+						sum += ctmp * weight * dev_gausKernel[i];
+						cum_w += weight * dev_gausKernel[i];
+					}
+				}
+			}
+			// No tap carried weight: pass the input colour through instead of dividing by zero.
+			if (cum_w == 0.f)
+			{
+				dev_TrousImage[index] = cval;
+				return;
+			}
+
+			dev_TrousImage[index] = sum / cum_w;
+		}
 
-		// naive parse through global geoms
+	}
+	  
+	/// <summary>
+	/// Edge-avoiding A-trous pass for a 2D launch: each thread takes its pixel's
+	/// (x, y) directly from the block and thread indices and blends the 25 taps.
+	/// </summary>
+	/// <param name="pixelCount">Number of pixels; tap indices at or beyond it are skipped.</param>
+	/// <param name="stepWidth">Pixel spacing between neighbouring taps.</param>
+	/// <param name="dev_gausKernel">25 filter weights, one per tap.</param>
+	/// <param name="dev_offsetKernel">25 (x, y) tap offsets, multiplied by stepWidth.</param>
+	/// <param name="dev_colorImage">Input colour per pixel.</param>
+	/// <param name="dev_TrousImage">Output colour per pixel.</param>
+	/// <param name="gbuf">Per-pixel normal and position used for the edge-stopping weights.</param>
+	/// <param name="resolutionX">Image width in pixels.</param>
+	/// <param name="resolutionY">Image height in pixels.</param>
+	/// <param name="ui_colorWeight">Colour sigma; used as given, not squared.</param>
+	/// <param name="ui_normalWeight">Normal sigma; used as given, not squared.</param>
+	/// <param name="ui_positionWeight">Position sigma; used as given, not squared.</param>
+	__global__ void GenerateAtrousImageGeneral(
+		int pixelCount, int stepWidth,
+		float* dev_gausKernel, glm::vec2* dev_offsetKernel,
+		glm::vec3* dev_colorImage, glm::vec3* dev_TrousImage,
+		GBufferPixel* gbuf, int resolutionX, int resolutionY, float ui_colorWeight,
+		float ui_normalWeight, float ui_positionWeight
+	)
+	{
+		int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+		int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+		if (x < resolutionX && y < resolutionY)
+		{
+			int index = x + (y * resolutionX);
+			glm::vec3 sum = glm::vec3(0.0f);
+			glm::vec3 cval = dev_colorImage[index];
+			glm::vec3 nval = gbuf[index].normal;
+			glm::vec3 pval = gbuf[index].position;
+
+			// Sigmas are floored so a zero UI weight cannot produce 0/0 (NaN)
+			// in the exponents below.
+			float cphi = glm::max(ui_colorWeight, 1e-6f);
+			float nphi = glm::max(ui_normalWeight, 1e-6f);
+			float pphi = glm::max(ui_positionWeight, 1e-6f);
+
+			float cum_w = 0.0f;
+			for (int i = 0; i < 5 * 5; i++)
+			{
+				// Calculate Offseted Index
+				int offsetX = dev_offsetKernel[i].x;
+				int offsetY = dev_offsetKernel[i].y;
+
+				int finalValue_X = x + offsetX * stepWidth;
+				int finalValue_Y = y + offsetY * stepWidth;
+				// Skip taps that fall outside the image.
+				if (finalValue_X >= 0 && finalValue_X <= (resolutionX - 1) && finalValue_Y >= 0 && finalValue_Y <= (resolutionY - 1))
+				{
+					int offsetColorIdx = finalValue_Y * resolutionX + finalValue_X;
+					if (offsetColorIdx >= 0 && offsetColorIdx < pixelCount)
+					{
+						// Colour term.
+						glm::vec3 ctmp = dev_colorImage[offsetColorIdx];
+						glm::vec3 t = cval - ctmp;
+						float dist2 = glm::dot(t, t);
+						float c_w = glm::min(glm::exp(-(dist2) / cphi), 1.0f);
+						// Normal term, distance scaled by stepWidth^2.
+						glm::vec3 ntmp = gbuf[offsetColorIdx].normal;
+						t = nval - ntmp;
+						dist2 = glm::max(glm::dot(t, t) / (stepWidth * stepWidth), 0.f);
+						float n_w = glm::min(glm::exp(-1 * (dist2) / nphi), 1.0f);
+						// Position term.
+						glm::vec3 ptmp = gbuf[offsetColorIdx].position;
+						t = pval - ptmp;
+						dist2 = glm::dot(t, t);
+						float p_w = glm::min(glm::exp(-1 * (dist2) / pphi), 1.0f);
+
+						float weight = c_w * n_w * p_w;
+						sum += ctmp * weight * dev_gausKernel[i];
+						cum_w += weight * dev_gausKernel[i];
+					}
+				}
+			}
+			// No tap carried weight: keep the input colour rather than divide by zero.
+			if (cum_w == 0.f)
+			{
+				dev_TrousImage[index] = cval;
+				return;
+			}
+			dev_TrousImage[index] = sum / cum_w;
+		}
+	}
+
+	__global__ void computeIntersections(
+		int depth
+		, int num_paths
+		, PathSegment * pathSegments
+		, Geom * geoms
+		, int geoms_size
+		, ShadeableIntersection * intersections
+	)
+	{
+		int path_index = blockIdx.x * blockDim.x + threadIdx.x;
 
-		for (int i = 0; i < geoms_size; i++)
+		if (path_index < num_paths)
 		{
-			Geom & geom = geoms[i];
+			PathSegment pathSegment = pathSegments[path_index];
+			// Nearest-hit search for this path's ray; the result goes to intersections[path_index].
+			float t;
+			glm::vec3 intersect_point;
+			glm::vec3 normal;
+			float t_min = FLT_MAX;
+			int hit_geom_index = -1;
+			bool outside = true;
+			// depth is not read by this kernel.
+			glm::vec3 tmp_intersect;
+			glm::vec3 tmp_normal;
+
+			// naive parse through global geoms
 
-			if (geom.type == CUBE)
+			for (int i = 0; i < geoms_size; i++)
 			{
-				t = boxIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
+				Geom& geom = geoms[i];
+				t = -1.0f;  // a type that is neither CUBE nor SPHERE counts as a miss instead of reading an unset t
+				if (geom.type == CUBE)
+				{
+					t = boxIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
+				}
+				else if (geom.type == SPHERE)
+				{
+					t = sphereIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
+				}
+
+				// Compute the minimum t from the intersection tests to determine what
+				// scene geometry object was hit first.
+				if (t > 0.0f && t_min > t)
+				{
+					t_min = t;
+					hit_geom_index = i;
+					intersect_point = tmp_intersect;
+					normal = tmp_normal;
+				}
 			}
-			else if (geom.type == SPHERE)
+			// Nothing hit: t = -1 marks the miss.
+			if (hit_geom_index == -1)
 			{
-				t = sphereIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
+				intersections[path_index].t = -1.0f;
 			}
-
-			// Compute the minimum t from the intersection tests to determine what
-			// scene geometry object was hit first.
-			if (t > 0.0f && t_min > t)
+			else
 			{
-				t_min = t;
-				hit_geom_index = i;
-				intersect_point = tmp_intersect;
-				normal = tmp_normal;
+				//The ray hits something
+				intersections[path_index].t = t_min;
+				intersections[path_index].materialId = geoms[hit_geom_index].materialid;
+				intersections[path_index].surfaceNormal = normal;
 			}
 		}
+	}
 
-		if (hit_geom_index == -1)
+	__global__ void shadeSimpleMaterials(
+		int iter
+		, int num_paths
+		, ShadeableIntersection * shadeableIntersections
+		, PathSegment * pathSegments
+		, Material * materials
+	)
+	{
+		int idx = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per path
+		if (idx < num_paths)
 		{
-			intersections[path_index].t = -1.0f;
+			ShadeableIntersection intersection = shadeableIntersections[idx];
+			PathSegment segment = pathSegments[idx];
+			if (segment.remainingBounces == 0) {  // path already finished: leave it untouched
+				return;
+			}
+
+			if (intersection.t > 0.0f) { // if the intersection exists...
+				segment.remainingBounces--;
+				// Set up the RNG
+				thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, segment.remainingBounces);
+
+				Material material = materials[intersection.materialId];
+				glm::vec3 materialColor = material.color;
+
+				// If the material indicates that the object was a light, "light" the ray
+				if (material.emittance > 0.0f) {
+					segment.color *= (materialColor * material.emittance);
+					segment.remainingBounces = 0;
+				}
+				else {
+					segment.color *= materialColor;
+					glm::vec3 intersectPos = intersection.t * segment.ray.direction + segment.ray.origin;
+					scatterRay(segment, intersectPos, intersection.surfaceNormal, material, rng);
+				}
+				// The else branch below handles the no-intersection case: the ray is colored black.
+				// Lots of renderers use 4 channel color, RGBA, where A = alpha, often
+				// used for opacity, in which case they can indicate "no opacity".
+				// This can be useful for post-processing and image compositing.
+			}
+			else {
+				segment.color = glm::vec3(0.0f);
+				segment.remainingBounces = 0;
+			}
+			// Write the updated local copy back to global memory.
+			pathSegments[idx] = segment;
 		}
-		else
+	}
+
+	__global__ void generateGBuffer( // Fills gBuffer[idx] (t, normal, position) from path idx's intersection record; launched 1D, one thread per path.
+		int num_paths, // number of entries to process in each of the three arrays
+		ShadeableIntersection * shadeableIntersections, // read only
+		PathSegment * pathSegments, // read only; supplies the ray used to compute the position
+		GBufferPixel * gBuffer) { // written; NOTE(review): indexed by path index, which presumably equals the pixel index at depth 0 — confirm
+		int idx = blockIdx.x * blockDim.x + threadIdx.x;
+		if (idx < num_paths)
 		{
-			//The ray hits something
-			intersections[path_index].t = t_min;
-			intersections[path_index].materialId = geoms[hit_geom_index].materialid;
-			intersections[path_index].surfaceNormal = normal;
+			const ShadeableIntersection hit = shadeableIntersections[idx]; // read the record once (replaces an unused pixelIndex lookup and three separate global reads)
+			gBuffer[idx].t = hit.t; // computeIntersections stores -1.0f here for a miss, so consumers should test t before trusting the other fields
+			gBuffer[idx].normal = hit.surfaceNormal;
+			gBuffer[idx].position = getPointOnRay(pathSegments[idx].ray, hit.t);
 		}
 	}
-}
 
-__global__ void shadeSimpleMaterials (
-  int iter
-  , int num_paths
-	, ShadeableIntersection * shadeableIntersections
-	, PathSegment * pathSegments
-	, Material * materials
-	)
-{
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < num_paths)
-  {
-    ShadeableIntersection intersection = shadeableIntersections[idx];
-    PathSegment segment = pathSegments[idx];
-    if (segment.remainingBounces == 0) {
-      return;
-    }
-
-    if (intersection.t > 0.0f) { // if the intersection exists...
-      segment.remainingBounces--;
-      // Set up the RNG
-      thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, segment.remainingBounces);
-
-      Material material = materials[intersection.materialId];
-      glm::vec3 materialColor = material.color;
-
-      // If the material indicates that the object was a light, "light" the ray
-      if (material.emittance > 0.0f) {
-        segment.color *= (materialColor * material.emittance);
-        segment.remainingBounces = 0;
-      }
-      else {
-        segment.color *= materialColor;
-        glm::vec3 intersectPos = intersection.t * segment.ray.direction + segment.ray.origin;
-        scatterRay(segment, intersectPos, intersection.surfaceNormal, material, rng);
-      }
-    // If there was no intersection, color the ray black.
-    // Lots of renderers use 4 channel color, RGBA, where A = alpha, often
-    // used for opacity, in which case they can indicate "no opacity".
-    // This can be useful for post-processing and image compositing.
-    } else {
-      segment.color = glm::vec3(0.0f);
-      segment.remainingBounces = 0;
-    }
-
-    pathSegments[idx] = segment;
-  }
-}
+	// Add the current iteration's output to the overall image: each of the first nPaths segments adds its color to image[pixelIndex].
+	__global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterationPaths)
+	{
+		int index = (blockIdx.x * blockDim.x) + threadIdx.x; // flat 1D thread index, one thread per path segment
 
-__global__ void generateGBuffer (
-  int num_paths,
-  ShadeableIntersection* shadeableIntersections,
-	PathSegment* pathSegments,
-  GBufferPixel* gBuffer) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < num_paths)
-  {
-    gBuffer[idx].t = shadeableIntersections[idx].t;
-  }
-}
+		if (index < nPaths) // threads past the end of the segment array do nothing
+		{
+			PathSegment iterationPath = iterationPaths[index];
+			image[iterationPath.pixelIndex] += iterationPath.color; // plain (non-atomic) add; NOTE(review): assumes no two segments share a pixelIndex — confirm
+		}
+	}
 
-// Add the current iteration's output to the overall image
-__global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterationPaths)
-{
-	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	/**
+	 * Runs one path-tracing iteration: generates camera rays, bounces them traceDepth times, adds the
+	 * per-path colors into dev_image and copies dev_image to hst_scene->state.image. `frame` is unused.
+	 */
+	void pathtrace(int frame, int iter) {
+		const int traceDepth = hst_scene->state.traceDepth;
+		const Camera& cam = hst_scene->state.camera;
+		const int pixelcount = cam.resolution.x * cam.resolution.y;
+
+		// 2D block for generating ray from camera
+		const dim3 blockSize2d(8, 8);
+		const dim3 blocksPerGrid2d(
+			(cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
+			(cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
+
+		// 1D block for path tracing
+		const int blockSize1d = 128;
+
+		///////////////////////////////////////////////////////////////////////////
+
+		// Pathtracing Recap:
+		// * Initialize array of path rays (using rays that come out of the camera)
+		//   * You can pass the Camera object to that kernel.
+		//   * Each path ray must carry at minimum a (ray, color) pair,
+		//   * where color starts as the multiplicative identity, white = (1, 1, 1).
+		//   * This has already been done for you.
+		// * NEW: For the first depth, generate geometry buffers (gbuffers)
+		// * For each depth:
+		//   * Compute an intersection in the scene for each path ray.
+		//     A very naive version of this has been implemented for you, but feel
+		//     free to add more primitives and/or a better algorithm.
+		//     Currently, intersection distance is recorded as a parametric distance,
+		//     t, or a "distance along the ray." t = -1.0 indicates no intersection.
+		//     * Color is attenuated (multiplied) by reflections off of any object
+		//   * Stream compact away all of the terminated paths.
+		//     You may use either your implementation or `thrust::remove_if` or its
+		//     cousins.
+		//     * Note that you can't really use a 2D kernel launch any more - switch
+		//       to 1D.
+		//   * Shade the rays that intersected something or didn't bottom out.
+		//     That is, color the ray by performing a color computation according
+		//     to the shader, then generate a new ray to continue the ray path.
+		//     We recommend just updating the ray's PathSegment in place.
+		//     Note that this step may come before or after stream compaction,
+		//     since some shaders you write may also cause a path to terminate.
+		// * Finally:
+		//     * if not denoising, add this iteration's results to the image
+		//     * TODO: if denoising, run kernels that take both the raw pathtraced result and the gbuffer, and put the result in the "pbo" from opengl
+
+		generateRayFromCamera << <blocksPerGrid2d, blockSize2d >> > (cam, iter, traceDepth, dev_paths);
+		checkCUDAError("generate camera ray");
+
+		int depth = 0;
+		PathSegment* dev_path_end = dev_paths + pixelcount;
+		int num_paths = dev_path_end - dev_paths; // one path per pixel; never shrinks because no compaction is done below
+
+		// --- PathSegment Tracing Stage ---
+		// Shoot ray into scene, bounce between objects, push shading chunks
+
+		// Empty gbuffer
+		cudaMemset(dev_gBuffer, 0, pixelcount * sizeof(GBufferPixel));
+		// clean shading chunks
+		cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+		checkCUDAError("clear gbuffer and intersections"); // the memsets' return codes were previously dropped
+
+		bool iterationComplete = false;
+		while (!iterationComplete) {
+
+			// tracing
+			dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
+			computeIntersections << <numblocksPathSegmentTracing, blockSize1d >> > (
+				depth
+				, num_paths
+				, dev_paths
+				, dev_geoms
+				, hst_scene->geoms.size()
+				, dev_intersections
+				);
+			checkCUDAError("trace one bounce");
+			cudaDeviceSynchronize();
+			if (depth == 0) { // the gbuffer describes the first (camera-ray) hit only
+				generateGBuffer << <numblocksPathSegmentTracing, blockSize1d >> > (num_paths, dev_intersections, dev_paths, dev_gBuffer);
+				checkCUDAError("generate gbuffer"); // launch errors were previously ignored here
+			}
+
+			depth++;
+			shadeSimpleMaterials << <numblocksPathSegmentTracing, blockSize1d >> > (
+				iter,
+				num_paths,
+				dev_intersections,
+				dev_paths,
+				dev_materials);
+			checkCUDAError("shade materials"); // report a failed launch at the bounce where it happened
+			iterationComplete = depth == traceDepth;
+		}
+		// Assemble this iteration and apply it to the image
+		dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
+		finalGather << <numBlocksPixels, blockSize1d >> > (num_paths, dev_image, dev_paths);
+		checkCUDAError("final gather");
+		///////////////////////////////////////////////////////////////////////////
+		// CHECKITOUT: use dev_image as reference if you want to implement saving denoised images.
+		// Otherwise, screenshots are also acceptable.
+		// Retrieve image from GPU
+		cudaMemcpy(hst_scene->state.image.data(), dev_image,
+			pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
+
+		checkCUDAError("pathtrace");
+	}
+
+	// CHECKITOUT: debug view that "post-processes" the gbuffer into something you can look at, via gbufferToPBO_Position.
+	// The launch uses 8x8 thread tiles, with the grid rounded up so the whole camera resolution is covered.
+	void showGBuffer(uchar4 * pbo) {
+		const Camera& camera = hst_scene->state.camera;
+		const dim3 tile(8, 8);
+		const unsigned int tilesX = (camera.resolution.x + tile.x - 1) / tile.x;
+		const unsigned int tilesY = (camera.resolution.y + tile.y - 1) / tile.y;
+		const dim3 grid(tilesX, tilesY);
+
+		// CHECKITOUT: process the gbuffer results and send them to OpenGL buffer for visualization (alternatives kept for quick switching)
+		//gbufferToPBO<<<grid, tile>>>(pbo, camera.resolution, dev_gBuffer);
+		//gbufferToPBO_Normals<<<grid, tile>>>(pbo, camera.resolution, dev_gBuffer);
+		gbufferToPBO_Position <<<grid, tile>>>(pbo, camera.resolution, dev_gBuffer);
+	}
 
-	if (index < nPaths)
+	__global__ void GeneratePingPongImage(int pixelCount, glm::vec3* devImage, glm::vec3* pingPongImage, int iter) // copies the first pixelCount entries of devImage into pingPongImage; `iter` is not used
 	{
-		PathSegment iterationPath = iterationPaths[index];
-		image[iterationPath.pixelIndex] += iterationPath.color;
+		int index = (blockIdx.x * blockDim.x) + threadIdx.x; // flat 1D thread index, one thread per pixel
+		if (index < pixelCount) // threads past the last pixel do nothing
+		{
+			pingPongImage[index] = devImage[index]; // straight element-wise copy, no scaling applied
+		}
 	}
-}
 
-/**
- * Wrapper for the __global__ call that sets up the kernel calls and does a ton
- * of memory management
- */
-void pathtrace(int frame, int iter) {
-    const int traceDepth = hst_scene->state.traceDepth;
-    const Camera &cam = hst_scene->state.camera;
-    const int pixelcount = cam.resolution.x * cam.resolution.y;
-
-	// 2D block for generating ray from camera
-    const dim3 blockSize2d(8, 8);
-    const dim3 blocksPerGrid2d(
-            (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
-            (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
-
-	// 1D block for path tracing
-	const int blockSize1d = 128;
-
-    ///////////////////////////////////////////////////////////////////////////
-
-    // Pathtracing Recap:
-    // * Initialize array of path rays (using rays that come out of the camera)
-    //   * You can pass the Camera object to that kernel.
-    //   * Each path ray must carry at minimum a (ray, color) pair,
-    //   * where color starts as the multiplicative identity, white = (1, 1, 1).
-    //   * This has already been done for you.
-    // * NEW: For the first depth, generate geometry buffers (gbuffers)
-    // * For each depth:
-    //   * Compute an intersection in the scene for each path ray.
-    //     A very naive version of this has been implemented for you, but feel
-    //     free to add more primitives and/or a better algorithm.
-    //     Currently, intersection distance is recorded as a parametric distance,
-    //     t, or a "distance along the ray." t = -1.0 indicates no intersection.
-    //     * Color is attenuated (multiplied) by reflections off of any object
-    //   * Stream compact away all of the terminated paths.
-    //     You may use either your implementation or `thrust::remove_if` or its
-    //     cousins.
-    //     * Note that you can't really use a 2D kernel launch any more - switch
-    //       to 1D.
-    //   * Shade the rays that intersected something or didn't bottom out.
-    //     That is, color the ray by performing a color computation according
-    //     to the shader, then generate a new ray to continue the ray path.
-    //     We recommend just updating the ray's PathSegment in place.
-    //     Note that this step may come before or after stream compaction,
-    //     since some shaders you write may also cause a path to terminate.
-    // * Finally:
-    //     * if not denoising, add this iteration's results to the image
-    //     * TODO: if denoising, run kernels that take both the raw pathtraced result and the gbuffer, and put the result in the "pbo" from opengl
-
-	generateRayFromCamera <<<blocksPerGrid2d, blockSize2d >>>(cam, iter, traceDepth, dev_paths);
-	checkCUDAError("generate camera ray");
-
-	int depth = 0;
-	PathSegment* dev_path_end = dev_paths + pixelcount;
-	int num_paths = dev_path_end - dev_paths;
-
-	// --- PathSegment Tracing Stage ---
-	// Shoot ray into scene, bounce between objects, push shading chunks
-
-  // Empty gbuffer
-  cudaMemset(dev_gBuffer, 0, pixelcount * sizeof(GBufferPixel));
-
-	// clean shading chunks
-	cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
 
-  bool iterationComplete = false;
-	while (!iterationComplete) {
-
-	// tracing
-	dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
-	computeIntersections <<<numblocksPathSegmentTracing, blockSize1d>>> (
-		depth
-		, num_paths
-		, dev_paths
-		, dev_geoms
-		, hst_scene->geoms.size()
-		, dev_intersections
-		);
-	checkCUDAError("trace one bounce");
-	cudaDeviceSynchronize();
+	// Denoises dev_image with the filter selected at compile time (SimpleGaussianDenoise / ATrous1 / ATrous2); the result ends up in dev_TrousImage.
+	// filterSize sets the pass count, colWeight/norWeight/posWeight are forwarded to the A-Trous kernels, `iteration` goes to GeneratePingPongImage. Always returns true.
+	bool DenoiseImage(int resolutionX, int resolutionY, int iteration, int filterSize, float colWeight, float norWeight, float posWeight)
+	{
+		int pixelCount = resolutionX * resolutionY;
+		const int blockSize1d = 128;
+		dim3 numblocksPathSegmentTracing = (pixelCount + blockSize1d - 1) / blockSize1d; // 1D grid with one thread per pixel (name kept from pathtrace)
 
-  if (depth == 0) {
-    generateGBuffer<<<numblocksPathSegmentTracing, blockSize1d>>>(num_paths, dev_intersections, dev_paths, dev_gBuffer);
-  }
 
-	depth++;
+		const dim3 blockSize2d(8, 8); // 2D launch configuration, used only by the ATrous2 variant
+		const dim3 blocksPerGrid2d(
+			(resolutionX + blockSize2d.x - 1) / blockSize2d.x,
+			(resolutionY + blockSize2d.y - 1) / blockSize2d.y);
 
-  shadeSimpleMaterials<<<numblocksPathSegmentTracing, blockSize1d>>> (
-    iter,
-    num_paths,
-    dev_intersections,
-    dev_paths,
-    dev_materials
-  );
-  iterationComplete = depth == traceDepth;
-	}
+		int stepWidth = 1;
+		//int blur_iterations = ceil(glm::log2( (filterSize/25.f) * (filterSize / 25.f)) ); // This wavelet scaling is the correct option though
+		int blur_iterations = filterSize > 0 ? (int)ceil(glm::log2((filterSize * filterSize) / 25.f)) : 1; // This wavelet scaling worked best for me; filterSize 0 is guarded so log2(0) = -inf is never converted to int
+		blur_iterations = blur_iterations <= 0 ? 1 : blur_iterations;
+		float colorWeight = colWeight; // must stay float: an int here truncated fractional weights (0.5 -> 0) and made the per-pass halving an integer division
 
-  // Assemble this iteration and apply it to the image
-  dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
-	finalGather<<<numBlocksPixels, blockSize1d>>>(num_paths, dev_image, dev_paths);
+		GeneratePingPongImage << < numblocksPathSegmentTracing, blockSize1d >> > (pixelCount, dev_image, dev_pingPongImage, iteration);
+		for (int i = 1; i < blur_iterations; i ++) // NOTE(review): starts at 1, so blur_iterations - 1 passes run (none when it is 1) — confirm this is intended
+		{
 
-    ///////////////////////////////////////////////////////////////////////////
+#if SimpleGaussianDenoise
 
-    // CHECKITOUT: use dev_image as reference if you want to implement saving denoised images.
-    // Otherwise, screenshots are also acceptable.
-    // Retrieve image from GPU
-    cudaMemcpy(hst_scene->state.image.data(), dev_image,
-            pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
 
-    checkCUDAError("pathtrace");
-}
+			GenerateGaussianBlur <<< numblocksPathSegmentTracing, blockSize1d >> > (pixelCount, stepWidth, dev_gausKernel, dev_offsetKernel,
+				dev_pingPongImage, dev_TrousImage, resolutionX, resolutionY);
 
-// CHECKITOUT: this kernel "post-processes" the gbuffer/gbuffers into something that you can visualize for debugging.
-void showGBuffer(uchar4* pbo) {
-    const Camera &cam = hst_scene->state.camera;
-    const dim3 blockSize2d(8, 8);
-    const dim3 blocksPerGrid2d(
-            (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
-            (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
+#endif
 
-    // CHECKITOUT: process the gbuffer results and send them to OpenGL buffer for visualization
-    gbufferToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
-}
+#if ATrous1
+			// 1D launch; this variant receives the current stepWidth and the per-pass (halved) colorWeight
+			GenerateAtrousImage << <numblocksPathSegmentTracing, blockSize1d >> > (pixelCount, stepWidth, dev_gausKernel, dev_offsetKernel,
+				dev_pingPongImage, dev_TrousImage, dev_gBuffer, resolutionX, resolutionY, colorWeight, norWeight, posWeight);
+#endif
 
-void showImage(uchar4* pbo, int iter) {
-const Camera &cam = hst_scene->state.camera;
-    const dim3 blockSize2d(8, 8);
-    const dim3 blocksPerGrid2d(
-            (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
-            (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
 
-    // Send results to OpenGL buffer for rendering
-    sendImageToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, iter, dev_image);
-}
+#if ATrous2
+			GenerateAtrousImage2 << <blocksPerGrid2d, blockSize2d >> > (pixelCount, i,dev_gausKernel, dev_offsetKernel,
+				dev_pingPongImage, dev_TrousImage, dev_gBuffer, resolutionX, resolutionY, colWeight, norWeight, posWeight);
+#endif
+			stepWidth *= 2; // next pass samples twice as far apart
+			colorWeight *= 0.5f; // halve the color weight for the next pass
+			swap(dev_pingPongImage, dev_TrousImage); // this pass's output becomes the next pass's input
+		}
+		swap(dev_pingPongImage, dev_TrousImage); // Getting the data back to dev_TrousImage
+		cudaDeviceSynchronize();
+		return true;
+	}
+
+	// Send the accumulated path-traced image (dev_image) to the OpenGL buffer `pbo` for rendering.
+	// `iter` is forwarded to sendImageToPBO unchanged; the grid is rounded up to whole 8x8 tiles.
+	void showImage(uchar4 * pbo, int iter) {
+		const Camera& camera = hst_scene->state.camera;
+		const dim3 tile(8, 8);
+		const unsigned int tilesX = (camera.resolution.x + tile.x - 1) / tile.x;
+		const unsigned int tilesY = (camera.resolution.y + tile.y - 1) / tile.y;
+		const dim3 grid(tilesX, tilesY);
+		sendImageToPBO << <grid, tile >> > (pbo, camera.resolution, iter, dev_image);
+	}
+
+
+	// Send the denoised image (dev_TrousImage) to the OpenGL buffer `pbo` for rendering.
+	// `iter` is forwarded to sendImageToPBO unchanged; the grid is rounded up to whole 8x8 tiles.
+	void showDenoise(uchar4* pbo, int iter) {
+		const Camera& camera = hst_scene->state.camera;
+		const dim3 tile(8, 8);
+		const unsigned int tilesX = (camera.resolution.x + tile.x - 1) / tile.x;
+		const unsigned int tilesY = (camera.resolution.y + tile.y - 1) / tile.y;
+		const dim3 grid(tilesX, tilesY);
+		sendImageToPBO << <grid, tile >> > (pbo, camera.resolution, iter, dev_TrousImage);
+	}
diff --git a/src/pathtrace.h b/src/pathtrace.h
index 9e12f44..a2791df 100644
--- a/src/pathtrace.h
+++ b/src/pathtrace.h
@@ -3,8 +3,11 @@
 #include <vector>
 #include "scene.h"
 
-void pathtraceInit(Scene *scene);
+void pathtraceInit(Scene *scene, float* gausKernel); // gausKernel: presumably host-side filter weights for the denoiser — confirm in pathtrace.cu
 void pathtraceFree();
 void pathtrace(int frame, int iteration);
 void showGBuffer(uchar4 *pbo);
 void showImage(uchar4 *pbo, int iter);
+void showDenoise(uchar4 *pbo, int iter); // like showImage, but displays the denoised buffer instead of the raw accumulated image
+// Runs the denoise passes over the accumulated image using the given filter size and color/normal/position weights; always returns true.
+bool DenoiseImage(int resolutionX, int resolutionY, int iteration, int filterSize, float colWeight, float norWeight, float posWeight);
\ No newline at end of file
diff --git a/src/preview.cpp b/src/preview.cpp
index 3ca2718..994f657 100644
--- a/src/preview.cpp
+++ b/src/preview.cpp
@@ -214,7 +214,7 @@ void drawGui(int windowWidth, int windowHeight) {
 
     ImGui::Checkbox("Denoise", &ui_denoise);
 
-    ImGui::SliderInt("Filter Size", &ui_filterSize, 0, 100);
+    ImGui::SliderInt("Filter Size", &ui_filterSize, 0, 300);
     ImGui::SliderFloat("Color Weight", &ui_colorWeight, 0.0f, 10.0f);
     ImGui::SliderFloat("Normal Weight", &ui_normalWeight, 0.0f, 10.0f);
     ImGui::SliderFloat("Position Weight", &ui_positionWeight, 0.0f, 10.0f);
diff --git a/src/sceneStructs.h b/src/sceneStructs.h
index da7e558..131f211 100644
--- a/src/sceneStructs.h
+++ b/src/sceneStructs.h
@@ -79,4 +79,6 @@ struct ShadeableIntersection {
 // What information might be helpful for guiding a denoising filter?
 struct GBufferPixel {
   float t;
+  glm::vec3 normal;   // surface normal copied from the path's ShadeableIntersection by generateGBuffer
+  glm::vec3 position; // getPointOnRay(ray, t) for the path's ray, written by generateGBuffer
 };