From e2a8185aa7c4880ce639c5eee9e1c95565609c80 Mon Sep 17 00:00:00 2001
From: Anthony Mansur
Date: Sun, 19 Sep 2021 00:30:31 -0400
Subject: [PATCH 01/10] part 1

---
 stream_compaction/cpu.cu | 45 ++++++++++++++++++++++++++++++++++------
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..e9c912e 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -19,7 +19,9 @@ namespace StreamCompaction {
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            odata[0] = 0; // identity
+            for (int i = 1; i < n; i++)
+                odata[i] = odata[i - 1] + idata[i - 1];
             timer().endCpuTimer();
         }
 
         /**
@@ -30,9 +32,14 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int num = 0;
+            for (int i = 0; i < n; i++)
+            {
+                if (idata[i] > 0)
+                    odata[num++] = idata[i];
+            }
             timer().endCpuTimer();
-            return -1;
+            return num;
         }
 
         /**
@@ -42,9 +49,35 @@ namespace StreamCompaction {
          */
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
-            timer().endCpuTimer();
-            return -1;
+            // Temporary array holding 0 or 1 depending on whether the entry is nonzero
+            int* bitData = (int*) malloc(sizeof(int) * n);
+            for (int i = 0; i < n; i++)
+            {
+                bitData[i] = (idata[i] > 0) ? 1 : 0;
+            }
+
+            // run exclusive scan on temporary array
+            int* scannedBitData = (int*) malloc(sizeof(int) * n);
+            scannedBitData[0] = 0; // identity
+            for (int i = 1; i < n; i++)
+                scannedBitData[i] = scannedBitData[i - 1] + bitData[i - 1];
+
+            // scatter to compute the stream compaction
+            for (int i = 0; i < n; i++)
+            {
+                if (bitData[i] == 1)
+                {
+                    odata[scannedBitData[i]] = idata[i];
+                }
+            }
+
+            // size of final array
+            int num = ((bitData[n - 1] == 1) ? scannedBitData[n - 1] : scannedBitData[n - 2]) + 1;
+
+            // free allocated memory
+            free(bitData);
+            free(scannedBitData);
+            return num;
         }
     }
 }

From 018b63fee770ea511747f2f140a93dd6aa62eb73 Mon Sep 17 00:00:00 2001
From: Anthony Mansur
Date: Mon, 20 Sep 2021 21:32:00 -0400
Subject: [PATCH 02/10] naive implementation

---
 src/testing_helpers.hpp    |  2 +-
 stream_compaction/naive.cu | 47 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
index 025e94a..8cbff45 100644
--- a/src/testing_helpers.hpp
+++ b/src/testing_helpers.hpp
@@ -10,7 +10,7 @@ template<typename T> int cmpArrays(int n, T *a, T *b) {
     for (int i = 0; i < n; i++) {
         if (a[i] != b[i]) {
-            printf("    a[%d] = %d, b[%d] = %d\n", i, a[i], i, b[i]);
+            printf("    expected[%d] = %d, actual[%d] = %d\n", i, a[i], i, b[i]);
             return 1;
         }
     }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..a2a8753 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,6 +2,12 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "naive.h"
+#include
+
+// nvcc does not seem to like variadic macros, so we have to define
+// one for each kernel parameter list:
+
+#define blockSize 128
 
 namespace StreamCompaction {
     namespace Naive {
@@ -11,14 +17,51 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
-        // TODO: __global__
+        __global__ void kernel_naive_parallel_scan(const int* read, int* write, int d, int n)
+        {
+            int k = blockIdx.x * blockDim.x + threadIdx.x;
+
+            if (k > n - 1)
+                return;
+
+            int step = (int)(powf(2, d - 1) + 0.5);
+            if (k >= step)
+                write[k] = read[k - step] + read[k];
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startGpuTimer();
-            // TODO
+            // allocate memory for the read and write buffers
+            int *devRead, *devWrite;
+            cudaMalloc((void**)&devRead, n * sizeof(int));
+            cudaMalloc((void**)&devWrite, n * sizeof(int));
+
+            // Copy idata to read and write buffer
+            cudaMemcpy(devRead, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            cudaMemcpy(devWrite, devRead, n * sizeof(int), cudaMemcpyDeviceToDevice);
+
+            // define kernel dimension
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            // run naive scan
+            for (int d = 1; d <= ilog2ceil(n); d++)
+            {
+                kernel_naive_parallel_scan<<<fullBlocksPerGrid, blockSize>>>(devRead, devWrite, d, n);
+
+                // swap read and write
+                cudaMemcpy(devRead, devWrite, n * sizeof(int), cudaMemcpyDeviceToDevice); // TODO: is there another way?
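+                // note (added in review, not in the original commit): the copy could
+                // be avoided by making the kernel also write the unchanged entries
+                // (write[k] = read[k] for k < step) and then swapping the pointers on
+                // the host instead, e.g. std::swap(devRead, devWrite); from <utility>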
+            }
+
+            // Copy write buffer to odata
+            odata[0] = 0;
+            cudaMemcpy(odata + 1, devRead, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);
+
+            // free memory
+            cudaFree(devRead);
+            cudaFree(devWrite);
             timer().endGpuTimer();
         }
     }
 }

From ea5406e294ee6ea0075c349c64b95052cc0d56ef Mon Sep 17 00:00:00 2001
From: Anthony Mansur
Date: Tue, 21 Sep 2021 01:09:17 -0400
Subject: [PATCH 03/10] power of 2 efficient scanning

---
 src/main.cpp                   |  1 +
 stream_compaction/common.cu    |  1 -
 stream_compaction/common.h     |  5 +++
 stream_compaction/efficient.cu | 70 +++++++++++++++++++++++++++++++++-
 stream_compaction/naive.cu     |  7 +++-
 5 files changed, 81 insertions(+), 3 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..e6e9faa 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -27,6 +27,7 @@ int main(int argc, char* argv[]) {
     printf("** SCAN TESTS **\n");
     printf("****************\n");
 
+    // TODO: uncomment
     genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
     a[SIZE - 1] = 0;
     printArray(SIZE, a, true);
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..ab43b1e 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -14,7 +14,6 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) {
     exit(EXIT_FAILURE);
 }
 
-
 namespace StreamCompaction {
     namespace Common {
 
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index d2c1fed..cb9beda 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -30,6 +30,11 @@ inline int ilog2ceil(int x) {
     return x == 1 ? 0 : ilog2(x - 1) + 1;
 }
 
+inline int powi(int a, int b)
+{
+    return (int)(powf(a, b) + 0.5);
+}
+
 namespace StreamCompaction {
     namespace Common {
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata);
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..6abf307 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,6 +3,8 @@
 #include "common.h"
 #include "efficient.h"
 
+#define blockSize 128
+
 namespace StreamCompaction {
     namespace Efficient {
         using StreamCompaction::Common::PerformanceTimer;
@@ -12,12 +14,78 @@ namespace StreamCompaction {
             return timer;
         }
 
+        __device__ inline int d_powi(int a, int b)
+        {
+            return (int)(powf(a, b) + 0.5f);
+        }
+
+        // parallel reduction
+        __global__ void kernel_up_sweep(int* data, int d, int n)
+        {
+            int k = blockIdx.x * blockDim.x + threadIdx.x;
+
+            int leftInx = k + d_powi(2, d) - 1;
+            int rightInx = k + d_powi(2, d + 1) - 1;
+
+            if (rightInx > n - 1 || k % d_powi(2, d + 1) != 0) // note: this is an expensive check
+                return;
+            data[rightInx] += data[leftInx];
+        }
+
+        __global__ void kernel_down_sweep(int* data, int d, int n)
+        {
+            int k = blockIdx.x * blockDim.x + threadIdx.x;
+
+            int leftInx = k + d_powi(2, d) - 1;
+            int rightInx = k + d_powi(2, d + 1) - 1;
+
+            if (rightInx > n - 1 || k % d_powi(2, d + 1) != 0) // note: this is an expensive check
+                return;
+
+            int leftChild = data[leftInx]; // save left child
+            data[leftInx] = data[rightInx]; // set left child to this node's value
+            data[rightInx] += leftChild; // set right child to old left value
+
+            // this node's value
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
+        // TODO: padding doesn't seem to work...
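+        // note (added in review, not in the original commit): for the padding to
+        // take effect, the up-/down-sweep launches in the scan below must also be
+        // bounded by the padded length N rather than n, which is a likely cause of
+        // the TODO above; the later stream-compaction refactor runs the scan on N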
        void scan(int n, int *odata, const int *idata) {
             timer().startGpuTimer();
-            // TODO
+            // allocate memory for the data buffer
+            int* devData;
+            int N = powi(2, ilog2ceil(n)); // get the minimum power of 2 >= n
+
+            cudaMalloc((void**)&devData, N * sizeof(int));
+
+            // Copy idata to read and write buffer
+            cudaMemcpy(devData, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            if (N > n)
+                cudaMemset(devData + n, 0, (N - n) * sizeof(int)); // padding
+
+            // define kernel dimension
+            dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+
+            // run up-sweep
+            for (int d = 0; d <= ilog2ceil(N) - 1; d++)
+            {
+                kernel_up_sweep<<<fullBlocksPerGrid, blockSize>>>(devData, d, n);
+            }
+
+            // run down-sweep
+            cudaMemset(devData + N - 1, 0, sizeof(int));
+            for (int d = ilog2ceil(N) - 1; d >= 0; d--)
+            {
+                kernel_down_sweep<<<fullBlocksPerGrid, blockSize>>>(devData, d, n);
+            }
+
+            // Copy write buffer to odata
+            cudaMemcpy(odata, devData, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            // free memory
+            cudaFree(devData);
             timer().endGpuTimer();
         }
 
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index a2a8753..d51b41f 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -17,6 +17,11 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
+
+        __device__ inline int powi(int a, int b)
+        {
+            return (int)(powf(a, b) + 0.5);
+        }
         __global__ void kernel_naive_parallel_scan(const int* read, int* write, int d, int n)
         {
             int k = blockIdx.x * blockDim.x + threadIdx.x;
@@ -24,7 +29,7 @@ namespace StreamCompaction {
             if (k > n - 1)
                 return;
 
-            int step = (int)(powf(2, d - 1) + 0.5);
+            int step = powi(2, d - 1);
             if (k >= step)
                 write[k] = read[k - step] + read[k];
         }

From fcc28c8c6ea9abe7bb730db90b95a64f7ae18fac Mon Sep 17 00:00:00 2001
From: Anthony Mansur
Date: Tue, 21 Sep 2021 22:04:44 -0400
Subject: [PATCH 04/10] implementing stream compaction

---
 stream_compaction/cpu.cu       |   2 +-
 stream_compaction/efficient.cu | 118 ++++++++++++++++++++++++++-------
 2 files changed, 95 insertions(+), 25 deletions(-)

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index e9c912e..d300baf 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -72,7 +72,7 @@ namespace StreamCompaction {
             }
 
             // size of final array
-            int num = ((bitData[n - 1] == 1) ? scannedBitData[n - 1] : scannedBitData[n - 2]) + 1;
+            int num = scannedBitData[n - 1];
 
             // free allocated memory
             free(bitData);
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 6abf307..047767f 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -48,44 +48,74 @@ namespace StreamCompaction {
             // this node's value
         }
 
-        /**
-         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
-         */
-        // TODO: padding doesn't seem to work...
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // allocate memory for the data buffer
-            int* devData;
-            int N = powi(2, ilog2ceil(n)); // get the minimum power of 2 >= n
-
-            cudaMalloc((void**)&devData, N * sizeof(int));
-
-            // Copy idata to read and write buffer
-            cudaMemcpy(devData, idata, n * sizeof(int), cudaMemcpyHostToDevice);
-            if (N > n)
-                cudaMemset(devData + n, 0, (N - n) * sizeof(int)); // padding
+        __global__ void kernel_bitmap(int* input, int* output, int n)
+        {
+            int k = blockIdx.x * blockDim.x + threadIdx.x;
+
+            if (k > n - 1)
+                return;
+
+            output[k] = input[k] > 0;
+        }
+
+        __global__ void kernel_scatter(int* input, int* bitmap, int* scanResult, int* dest, int n)
+        {
+            int k = blockIdx.x * blockDim.x + threadIdx.x;
+
+            if (k > n - 1)
+                return;
+
+            if (bitmap[k] == 1)
+            {
+                dest[scanResult[k]] = input[k];
+            }
+        }
+
+        void efficientScan(int* cudaBuffer, int n)
+        {
             // define kernel dimension
-            dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
 
             // run up-sweep
-            for (int d = 0; d <= ilog2ceil(N) - 1; d++)
+            for (int d = 0; d <= ilog2ceil(n) - 1; d++)
             {
-                kernel_up_sweep<<<fullBlocksPerGrid, blockSize>>>(devData, d, n);
+                kernel_up_sweep<<<fullBlocksPerGrid, blockSize>>>(cudaBuffer, d, n);
             }
 
             // run down-sweep
-            cudaMemset(devData + N - 1, 0, sizeof(int));
-            for (int d = ilog2ceil(N) - 1; d >= 0; d--)
+            cudaMemset(cudaBuffer + n - 1, 0, sizeof(int));
+            for (int d = ilog2ceil(n) - 1; d >= 0; d--)
             {
-                kernel_down_sweep<<<fullBlocksPerGrid, blockSize>>>(devData, d, n);
+                kernel_down_sweep<<<fullBlocksPerGrid, blockSize>>>(cudaBuffer, d, n);
             }
+        }
+
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int *odata, const int *idata) {
+            timer().startGpuTimer();
+
+            // allocate memory for the data buffer
+            int* devBuffer;
+            int N = powi(2, ilog2ceil(n)); // get the minimum power of 2 >= n
+
+            cudaMalloc((void**)&devBuffer, N * sizeof(int));
+
+            // Copy idata to read and write buffer
+            cudaMemcpy(devBuffer, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            if (N > n)
+                cudaMemset(devBuffer + n, 0, (N - n) * sizeof(int)); // padding
+
+            // run efficient scan algorithm
+            efficientScan(devBuffer, N);
 
             // Copy write buffer to odata
-            cudaMemcpy(odata, devData, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(odata, devBuffer, n * sizeof(int), cudaMemcpyDeviceToHost);
 
             // free memory
-            cudaFree(devData);
+            cudaFree(devBuffer);
+
             timer().endGpuTimer();
         }
 
@@ -100,9 +130,49 @@ namespace StreamCompaction {
          */
         int compact(int n, int *odata, const int *idata) {
             timer().startGpuTimer();
-            // TODO
+            int *devInput, *devBitmap, *devScan, *devOutput;
+            int N = powi(2, ilog2ceil(n)); // get the minimum power of 2 >= n
+
+            // allocate memory to device buffers
+            cudaMalloc((void**)&devInput, N * sizeof(int));
+            cudaMalloc((void**)&devBitmap, N * sizeof(int));
+            cudaMalloc((void**)&devScan, N * sizeof(int));
+            cudaMalloc((void**)&devOutput, N * sizeof(int));
+
+            // copy idata to device buffer
+            cudaMemcpy(devInput, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            if (N > n)
+                cudaMemset(devInput + n, 0, (N - n) * sizeof(int)); // padding
+
+            // define kernel dimension
+            dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+
+            // step 1: compute temporary bitmap array
+            kernel_bitmap<<<fullBlocksPerGrid, blockSize>>>(devInput, devBitmap, N);
+
+            // step 2: run exclusive scan on temporary bitmap array
+            cudaMemcpy(devScan, devBitmap, N * sizeof(int), cudaMemcpyDeviceToDevice);
+            efficientScan(devScan, N);
+
+            // step 3: scatter
+            kernel_scatter<<<fullBlocksPerGrid, blockSize>>>(devInput, devBitmap, devScan, devOutput, N);
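+
+            // note (added in review, not in the original commit): devScan[N - 1] is an
+            // exclusive-scan value, so it equals the full nonzero count only because
+            // the zero padding guarantees devBitmap[N - 1] == 0; in general the count
+            // is devScan[N - 1] + devBitmap[N - 1]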
+            int numOfElements = 0;
+            cudaMemcpy(&numOfElements, devScan + N - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaDeviceSynchronize();
+            //if (odata[n-1] == 0)
+            //    numOfElements--; // exclusive scan, may need to disregard last element
+
+            // copy device's output buffer to odata
+            cudaMemcpy(odata, devOutput, numOfElements * sizeof(int), cudaMemcpyDeviceToHost);
+
+            // free memory
+            cudaFree(devInput);
+            cudaFree(devBitmap);
+            cudaFree(devScan);
+            cudaFree(devOutput);
+
             timer().endGpuTimer();
-            return -1;
+            return numOfElements;
         }
     }
 }

From 3939fecec295aecb4782cffed037c0df1f3bf0af Mon Sep 17 00:00:00 2001
From: Anthony Mansur
Date: Tue, 21 Sep 2021 22:33:59 -0400
Subject: [PATCH 05/10] thrust implementation

---
 stream_compaction/thrust.cu | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..8590a25 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -19,9 +19,23 @@ namespace StreamCompaction {
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startGpuTimer();
-            // TODO use `thrust::exclusive_scan`
-            // example: for device_vectors dv_in and dv_out:
-            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            // Create host vector
+            thrust::host_vector<int> input(n);
+
+            // Assign values to host vector
+            for (int i = 0; i < n; i++)
+            {
+                input[i] = idata[i];
+            }
+
+            // Create device vectors
+            thrust::device_vector<int> d_input = input; // cast from host
+            thrust::device_vector<int> d_output(n);
+
+            thrust::exclusive_scan(d_input.begin(), d_input.end(), d_output.begin());
+
+            // copy output to pointer
+            thrust::copy(d_output.begin(), d_output.end(), odata);
             timer().endGpuTimer();
         }
     }

From 6629c8d205209fb21eac02381ba507c028955af1 Mon Sep 17 00:00:00 2001
From: Anthony Mansur
Date: Tue, 21 Sep 2021 23:11:15 -0400
Subject: [PATCH 06/10] refactored code and updated timers

---
 src/main.cpp                   |  2 +-
 stream_compaction/common.cu    | 18 ++++++++++--
 stream_compaction/efficient.cu | 52 +++++++++++-----------------------
 stream_compaction/naive.cu     | 10 +++++--
 4 files changed, 40 insertions(+), 42 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index e6e9faa..dc2caac 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8;  // feel free to change the size of array
+const int SIZE = 1 << 20; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index ab43b1e..211c25f 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -22,7 +22,12 @@ namespace StreamCompaction {
          * which map to 0 will be removed, and elements which map to 1 will be kept.
         */
        __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-            // TODO
+            int k = blockIdx.x * blockDim.x + threadIdx.x;
+
+            if (k > n - 1)
+                return;
+
+            bools[k] = idata[k] > 0;
        }
 
        /**
@@ -31,8 +36,15 @@ namespace StreamCompaction {
         */
        __global__ void kernScatter(int n, int *odata,
                const int *idata, const int *bools, const int *indices) {
-            // TODO
-        }
+            int k = blockIdx.x * blockDim.x + threadIdx.x;
 
+            if (k > n - 1)
+                return;
+
+            if (bools[k] == 1)
+            {
+                odata[indices[k]] = idata[k];
+            }
+        }
    }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 047767f..324a776 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,7 +3,7 @@
 #include "common.h"
 #include "efficient.h"
 
-#define blockSize 128
+#define blockSize 256
 
 namespace StreamCompaction {
     namespace Efficient {
@@ -48,29 +48,6 @@ namespace StreamCompaction {
             // this node's value
         }
 
-        __global__ void kernel_bitmap(int* input, int* output, int n)
-        {
-            int k = blockIdx.x * blockDim.x + threadIdx.x;
-
-            if (k > n - 1)
-                return;
-
-            output[k] = input[k] > 0;
-        }
-
-        __global__ void kernel_scatter(int* input, int* bitmap, int* scanResult, int* dest, int n)
-        {
-            int k = blockIdx.x * blockDim.x + threadIdx.x;
-
-            if (k > n - 1)
-                return;
-
-            if (bitmap[k] == 1)
-            {
-                dest[scanResult[k]] = input[k];
-            }
-        }
-
         void efficientScan(int* cudaBuffer, int n)
         {
             // define kernel dimension
@@ -94,8 +71,6 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-
             // allocate memory for the data buffer
             int* devBuffer;
             int N = powi(2, ilog2ceil(n)); // get the minimum power of 2 >= n
@@ -107,16 +82,20 @@ namespace StreamCompaction {
             if (N > n)
                 cudaMemset(devBuffer + n, 0, (N - n) * sizeof(int)); // padding
 
+            // START TIMER
+            timer().startGpuTimer();
+
             // run efficient scan algorithm
             efficientScan(devBuffer, N);
 
+            timer().endGpuTimer();
+            // TIMER END
+
             // Copy write buffer to odata
             cudaMemcpy(odata, devBuffer, n * sizeof(int), cudaMemcpyDeviceToHost);
 
             // free memory
             cudaFree(devBuffer);
-
-            timer().endGpuTimer();
         }
 
         /**
          * @returns The number of elements remaining after compaction.
         */
        int compact(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
             int *devInput, *devBitmap, *devScan, *devOutput;
             int N = powi(2, ilog2ceil(n)); // get the minimum power of 2 >= n
 
             // allocate memory to device buffers
             cudaMalloc((void**)&devInput, N * sizeof(int));
             cudaMalloc((void**)&devBitmap, N * sizeof(int));
             cudaMalloc((void**)&devScan, N * sizeof(int));
             cudaMalloc((void**)&devOutput, N * sizeof(int));
 
             // copy idata to device buffer
             cudaMemcpy(devInput, idata, n * sizeof(int), cudaMemcpyHostToDevice);
             if (N > n)
                 cudaMemset(devInput + n, 0, (N - n) * sizeof(int)); // padding
 
             // define kernel dimension
             dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
 
+            // START TIMER
+            timer().startGpuTimer();
+
             // step 1: compute temporary bitmap array
-            kernel_bitmap<<<fullBlocksPerGrid, blockSize>>>(devInput, devBitmap, N);
+            Common::kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(N, devBitmap, devInput);
 
             // step 2: run exclusive scan on temporary bitmap array
             cudaMemcpy(devScan, devBitmap, N * sizeof(int), cudaMemcpyDeviceToDevice);
             efficientScan(devScan, N);
 
             // step 3: scatter
-            kernel_scatter<<<fullBlocksPerGrid, blockSize>>>(devInput, devBitmap, devScan, devOutput, N);
+            Common::kernScatter<<<fullBlocksPerGrid, blockSize>>>(N, devOutput, devInput, devBitmap, devScan);
+
+            timer().endGpuTimer();
+            // TIMER END
+
+            // find number of elements
             int numOfElements = 0;
             cudaMemcpy(&numOfElements, devScan + N - 1, sizeof(int), cudaMemcpyDeviceToHost);
             cudaDeviceSynchronize();
-            //if (odata[n-1] == 0)
-            //    numOfElements--; // exclusive scan, may need to disregard last element
 
             // copy device's output buffer to odata
             cudaMemcpy(odata, devOutput, numOfElements * sizeof(int), cudaMemcpyDeviceToHost);
 
             // free memory
             cudaFree(devInput);
             cudaFree(devBitmap);
             cudaFree(devScan);
             cudaFree(devOutput);
-
-            timer().endGpuTimer();
             return numOfElements;
         }
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index d51b41f..ccc33f6 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -7,7 +7,7 @@
 // nvcc does not seem to like variadic macros, so we have to define
 // one for each kernel parameter list:
 
-#define blockSize 128
+#define blockSize 256
 
 namespace StreamCompaction {
     namespace Naive {
@@ -38,7 +38,6 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
             // allocate memory for the read and write buffers
             int *devRead, *devWrite;
             cudaMalloc((void**)&devRead, n * sizeof(int));
             cudaMalloc((void**)&devWrite, n * sizeof(int));
 
             // Copy idata to read and write buffer
             cudaMemcpy(devRead, idata, n * sizeof(int), cudaMemcpyHostToDevice);
             cudaMemcpy(devWrite, devRead, n * sizeof(int), cudaMemcpyDeviceToDevice);
 
             // define kernel dimension
             dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
 
+            // START TIMER
+            timer().startGpuTimer();
+
             // run naive scan
             for (int d = 1; d <= ilog2ceil(n); d++)
             {
                 kernel_naive_parallel_scan<<<fullBlocksPerGrid, blockSize>>>(devRead, devWrite, d, n);
 
                 // swap read and write
                 cudaMemcpy(devRead, devWrite, n * sizeof(int), cudaMemcpyDeviceToDevice); // TODO: is there another way?
             }
 
+            timer().endGpuTimer();
+            // TIMER END
+
             // Copy write buffer to odata
             odata[0] = 0;
             cudaMemcpy(odata + 1, devRead, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);
 
             // free memory
             cudaFree(devRead);
             cudaFree(devWrite);
-            timer().endGpuTimer();
         }
     }
 }

From ce50d99cf9d9fdca2c945cf7121dbea2aa3fd3f1 Mon Sep 17 00:00:00 2001
From: Anthony Mansur
Date: Tue, 21 Sep 2021 23:16:37 -0400
Subject: [PATCH 07/10] updating thrust timers

---
 src/main.cpp                   | 2 +-
 stream_compaction/efficient.cu | 2 +-
 stream_compaction/thrust.cu    | 6 ++++--
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index dc2caac..b187909 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 20; // feel free to change the size of array
+const int SIZE = 1 << 22; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 324a776..8187d14 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,7 +3,7 @@
 #include "common.h"
 #include "efficient.h"
 
-#define blockSize 256
+#define blockSize 384
 
 namespace StreamCompaction {
     namespace Efficient {
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 8590a25..12fc91e 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,7 +18,7 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
+
             // Create host vector
             thrust::host_vector<int> input(n);
 
@@ -32,11 +32,13 @@ namespace StreamCompaction {
             thrust::device_vector<int> d_input = input; // cast from host
             thrust::device_vector<int> d_output(n);
 
+            timer().startGpuTimer();
             thrust::exclusive_scan(d_input.begin(), d_input.end(), d_output.begin());
+            timer().endGpuTimer();
 
             // copy output to pointer
             thrust::copy(d_output.begin(), d_output.end(), odata);
-            timer().endGpuTimer();
+
         }
     }
 }

From 2244b4aad38ce9cdd90ab74771209b6475c4eb25 Mon Sep 17 00:00:00 2001
From: Anthony Mansur
Date: Tue, 21 Sep 2021 23:57:38 -0400
Subject: [PATCH 08/10] Updating readme

---
 README.md                      | 97 ++++++++++++++++++++++++++++++++--
 stream_compaction/efficient.cu |  2 +-
 2 files changed, 93 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 0e38ddb..386f2b1 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,101 @@
 CUDA Stream Compaction
 ======================
 
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
+Implementing GPU stream compaction in CUDA, from scratch. GPU stream compaction is a widely used algorithm, especially for accelerating path tracers.
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-### (TODO: Your README)
+
+
+**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**
+
+- Anthony Mansur
+  - https://www.linkedin.com/in/anthony-mansur-ab3719125/
+- Tested on: Windows 10, AMD Ryzen 5 3600, Geforce RTX 2060 Super (personal)
 
 Include analysis, etc. (Remember, this is public, so don't put
 anything here that you don't want to share with the world.)
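+
+For reference (an illustrative example, not program output): an exclusive scan of `[3 1 7 0 4 1 6 3]` is `[0 3 4 11 11 15 16 22]`, and compacting `[2 0 3 0 1]` by removing the zeros gives `[2 3 1]`.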
+
+
+### Features
+
+- CPU implementation of the "prefix sum" algorithm
+- CPU implementation of stream compaction, with and without use of the scan function
+- Naive GPU implementation of the prefix-sum algorithm
+- Work-efficient GPU implementation of the prefix-sum algorithm
+- GPU implementation of the stream compaction algorithm
+- Wrapper around Thrust's scan implementation
+
+
+
+### Performance Analysis
+
+Please note: this is an incomplete analysis.
+
+To roughly optimize the block size, we compared the GPU stream compaction algorithm at block sizes from 128 to 1024. The measured times, in ms, were 5.48, 5.53, 6.86, 5.98, 9.16, 9.20, 8.12, and 7.62, so we settled on a block size of 128.
+
+Below are the results from running the different algorithms for comparison:
+
+````
+****************
+** SCAN TESTS **
+****************
+    [   4  24   5  29  21   6  24  19  39  29  47  46  20 ...   4   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 8.3087ms    (std::chrono Measured)
+    [   0   4  28  33  62  83  89 113 132 171 200 247 293 ... 102687260 102687264 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 8.2725ms    (std::chrono Measured)
+    [   0   4  28  33  62  83  89 113 132 171 200 247 293 ... 102687181 102687208 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 6.04371ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 6.21773ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 5.66464ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 5.58922ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.254624ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.25872ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   2   0   3   3   3   0   0   3   1   1   1   0   0 ...   0   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 8.4867ms    (std::chrono Measured)
+    [   2   3   3   3   3   1   1   1   1   3   1   1   2 ...   1   3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 8.4656ms    (std::chrono Measured)
+    [   2   3   3   3   3   1   1   1   1   3   1   1   2 ...   1   1 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 8.4656ms    (std::chrono Measured)
+    [   2   3   3   3   3   1   1   1   1   3   1   1   2 ...   1   3 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 5.92832ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 5.76102ms    (CUDA Measured)
+    passed
+````
+
+
+
+### Questions
+
+1. Block size optimization: see performance analysis
+2. Comparison of implementations: see performance analysis. Ran with n = 2^22
+3. Although the work-efficient scan improved on the naive scan, the CPU implementation was still faster. This is most likely due to inefficiencies in branching and in using global rather than shared memory (i.e., the kernels need to be optimized; see the sketch after the last patch below). For compaction, the GPU implementations ran faster due to the large size of n.
+4. See performance analysis for test program output
+
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 8187d14..963af38 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,7 +3,7 @@
 #include "common.h"
 #include "efficient.h"
 
-#define blockSize 384
+#define blockSize 128
 
 namespace StreamCompaction {
     namespace Efficient {

From fa344e5011b1996ae291a810b644195486402862 Mon Sep 17 00:00:00 2001
From: Anthony Mansur
Date: Tue, 21 Sep 2021 23:58:23 -0400
Subject: [PATCH 09/10] updating readme

---
 README.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/README.md b/README.md
index 386f2b1..b83e5a2 100644
--- a/README.md
+++ b/README.md
@@ -11,9 +11,6 @@ Implementing GPU stream compaction in CUDA, from scratch. GPU stream compaction
   - https://www.linkedin.com/in/anthony-mansur-ab3719125/
 - Tested on: Windows 10, AMD Ryzen 5 3600, Geforce RTX 2060 Super (personal)
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
-
 
 ### Features
 

From 6f1b8a511d97183aaf8ebe49122d5bd8a307e9ba Mon Sep 17 00:00:00 2001
From: Anthony Mansur
Date: Tue, 28 Sep 2021 20:34:58 -0400
Subject: [PATCH 10/10] Small readme edit

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b83e5a2..aa42c38 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ Implementing GPU stream compaction in CUDA, from scratch. GPU stream compaction
 
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**
+**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2 - Stream Compaction**
 
 - Anthony Mansur
   - https://www.linkedin.com/in/anthony-mansur-ab3719125/
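
As a possible follow-up to the branch divergence called out in `kernel_up_sweep`/`kernel_down_sweep` and in Question 3, the expensive `k % d_powi(2, d + 1) != 0` test can be removed by launching one thread per active tree node and deriving the indices directly. The sketch below is editorial and not part of the commits above; `kernel_up_sweep_compact` is a hypothetical name, and the loop assumes the padded power-of-two length used by `efficientScan`.

````
// hypothetical variant (not in the commits above): one thread per active node,
// so there is no modulo test and far less divergence at deep levels
__global__ void kernel_up_sweep_compact(int* data, int stride2, int activeNodes)
{
    int t = blockIdx.x * blockDim.x + threadIdx.x;
    if (t >= activeNodes)
        return;

    int rightInx = (t + 1) * stride2 - 1;     // last element of this node's segment
    int leftInx = rightInx - (stride2 >> 1);  // partial sum at the segment midpoint
    data[rightInx] += data[leftInx];
}

// host-side loop that would replace the up-sweep in efficientScan
// (n is the padded power-of-two length):
// for (int d = 0; d <= ilog2ceil(n) - 1; d++)
// {
//     int stride2 = powi(2, d + 1);
//     int activeNodes = n / stride2;
//     dim3 blocks((activeNodes + blockSize - 1) / blockSize);
//     kernel_up_sweep_compact<<<blocks, blockSize>>>(cudaBuffer, stride2, activeNodes);
// }
````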