diff --git a/README.md b/README.md index 0e38ddb..d7351a0 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,111 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Paul (San) Jewell + * [LinkedIn](https://www.linkedin.com/in/paul-jewell-2aba7379), [work website]( + https://www.biociphers.org/paul-jewell-lab-member), [personal website](https://gitlab.com/inklabapp), [twitter](https://twitter.com/inklabapp), etc. +* Tested on: Linux pop-os 5.11.0-7614-generic, i7-9750H CPU @ 2.60GHz 32GB, GeForce GTX 1650 Mobile / Max-Q 4GB -### (TODO: Your README) +### GPU scan and stream compaction demonstration -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +This repository contains comparisons of basic algorithms between a CPU serial implementation, my GPU CUDA implementation, +and similar functions provided by the 'thrust' library. +I demonstrate usage of the naive, buffered, work-efficient, and chunked work-efficient scan algorithms as described in +[this link](https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda). +The implementation is also broken down into a number of sub-functions for easier comprehension by the curious reader. + +### Performance comparison overview + +After manually testing a number of block sizes, I've found that a size of 64 works best for the efficient scan +implementation (same as for boids), and 512 (the largest I can do) works best for the naive implementation. (This +pretty much makes sense, as it is a largely processor-count-limited algorithm.) Thrust does not allow specifying a +manual block size that I could find, so I didn't do any testing there.
+ +![boidstats](img/chart1.png) + +For a first iteration, I was unable to run larger sizes without causing OOM on my local machine + (desktop completely crashed), so I tested up to 1mil only. Clearly something seems to have goofed in either + a single blocking line in my efficient implementation, or the measurement code location. I don't believe a simple + inefficiency would lead to the huge discrepancy between naive and efficient implementations here. The no-chunking + implementation should do the same amount of operations for the up and down sweep as the chunking implementation (ideal), + however, it will get the incorrect result. There is something very inefficient with the chunking that I've not yet been + able to work out. + +``` + **************** + ** SCAN TESTS ** + **************** + [ 29 18 29 17 11 3 19 31 3 49 25 31 31 ... 14 0 ] + ==== cpu scan, power-of-two ==== + elapsed time: 0.000686ms (std::chrono Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 23987 23987 ] + ==== cpu scan, non-power-of-two ==== + elapsed time: 0.000377ms (std::chrono Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 23950 23960 ] + passed + ==== naive scan, power-of-two ==== + elapsed time: 0.060032ms (CUDA Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 23987 23987 ] + ==== naive scan, non-power-of-two ==== + elapsed time: 0.055296ms (CUDA Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 23950 23960 ] + passed + ==== work-efficient scan no chunk, power-of-two ==== + elapsed time: 0.087648ms (CUDA Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 23987 23987 ] + passed + ==== work-efficient scan no chunk, non-power-of-two ==== + elapsed time: 0.041152ms (CUDA Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 23950 23960 ] + passed + ==== work-efficient scan, power-of-two ==== + elapsed time: 0.04512ms (CUDA Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 
23987 23987 ] + passed + ==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.058752ms (CUDA Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 23950 23960 ] + passed + ==== thrust scan, power-of-two ==== + elapsed time: 0.022528ms (CUDA Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 23987 23987 ] + passed + ==== thrust scan, non-power-of-two ==== + elapsed time: 0.021024ms (CUDA Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 23950 23960 ] + passed + + ***************************** + ** STREAM COMPACTION TESTS ** + ***************************** + [ 1 2 1 3 1 1 3 1 3 3 3 3 3 ... 0 0 ] + [ 1 2 1 3 1 1 3 1 3 3 3 3 3 ... 2 2 ] + ==== cpu compact without scan, power-of-two ==== + elapsed time: 0.002584ms (std::chrono Measured) + [ 1 2 1 3 1 1 3 1 3 3 3 3 3 ... 2 3 ] + passed + ==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.001953ms (std::chrono Measured) + [ 1 2 1 3 1 1 3 1 3 3 3 3 3 ... 2 2 ] + passed + ==== cpu compact with scan ==== + elapsed time: 0.004194ms (std::chrono Measured) + [ 1 2 1 3 1 1 3 1 3 3 3 3 3 ... 2 3 ] + passed + ==== cpu compact with scan, non-power-of-two ==== + elapsed time: 0.003411ms (std::chrono Measured) + [ 1 2 1 3 1 1 3 1 3 3 3 3 3 ... 2 2 ] + passed + ==== work-efficient compact, power-of-two ==== + elapsed time: 0.065216ms (CUDA Measured) + [ 1 2 1 3 1 1 3 1 3 3 3 3 3 ... 2 3 ] + passed + ==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.063552ms (CUDA Measured) + [ 1 2 1 3 1 1 3 1 3 3 3 3 3 ... 
2 2 ] + passed + sh: 1: pause: not found + + Process finished with exit code 0 +``` \ No newline at end of file diff --git a/img/chart1.png b/img/chart1.png new file mode 100644 index 0000000..cb51c9c Binary files /dev/null and b/img/chart1.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..a3de94a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,9 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +//const int SIZE = 1 << 8; // feel free to change the size of array +//const int SIZE = 10000; // feel free to change the size of array +const int SIZE = 1000; const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -29,6 +31,7 @@ int main(int argc, char* argv[]) { genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; + printArray(SIZE, a, true); // initialize b using StreamCompaction::CPU::scan you implement @@ -51,48 +54,102 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ + printArray(SIZE, c, true); - zeroArray(SIZE, c); + zeroArray(NPOT, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + + // For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + +// onesArray(SIZE, a); +// 
zeroArray(SIZE, c); +// printDesc("1s array for finding bugs"); +// StreamCompaction::Naive::scan(NPOT, c, a); +// printArray(NPOT, c); + +// onesArray(NPOT, a); +// zeroArray(NPOT, c); +// printDesc("1s array for finding bugs"); +// StreamCompaction::Naive::scan(NPOT, c, a); +// printArray(NPOT, c); +// +// onesArray(SIZE, a); +// zeroArray(SIZE, c); +// printDesc("1s array for finding bugs"); +// StreamCompaction::Efficient::scan(SIZE, c, a); +// printArray(SIZE, c); + + +// onesArray(3, a); +// a[0] = 4; +// a[1] = 4; +// a[2] = 1; +// zeroArray(3, c); +// printDesc("1s array for finding bugs"); +// StreamCompaction::Efficient::scanplain(3, c, a); +// printArray(3, c); +// +// onesArray(SIZE, a); +// zeroArray(SIZE, c); +// printDesc("1s array for finding bugs"); +// StreamCompaction::Efficient::scan(SIZE, c, a); +// printArray(SIZE, c); +// +// onesArray(SIZE, a); +// zeroArray(SIZE, c); +// printDesc("1s array for finding bugs"); +// StreamCompaction::Efficient::scan(SIZE, c, a); +// printArray(SIZE, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan no chunk, power-of-two"); + StreamCompaction::Efficient::scanplain(SIZE, c, a); + exclusive2inclusive(SIZE, c, a[SIZE-1]); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan no chunk, non-power-of-two"); + StreamCompaction::Efficient::scanplain(NPOT, c, a); + exclusive2inclusive(NPOT, c, a[NPOT-1]); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); + exclusive2inclusive(SIZE, c, a[SIZE-1]); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA 
Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); + exclusive2inclusive(NPOT, c, a[NPOT-1]); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -104,7 +161,16 @@ int main(int argc, char* argv[]) { genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; + //a[NPOT - 1] = 0; +// +// a[0] = 0; +// a[1] = 2; +// a[2] = 1; +// a[3] = 2; +// a[4] = 1; + printArray(SIZE, a, true); + printArray(NPOT, a, true); int count, expectedCount, expectedNPOT; @@ -133,18 +199,25 @@ int main(int argc, char* argv[]) { printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); + zeroArray(SIZE, c); + printDesc("cpu compact with scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithScan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = 
StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); - zeroArray(SIZE, c); + zeroArray(NPOT, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..c2ad78d 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -30,6 +30,13 @@ inline int ilog2ceil(int x) { return x == 1 ? 0 : ilog2(x - 1) + 1; } +inline void exclusive2inclusive(int n, int* scanned, int lastvalue){ + for(int i=0; i #include "cpu.h" - +#include "testing_helpers.hpp" #include "common.h" namespace StreamCompaction { @@ -20,6 +20,25 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + /* + * out[0] = in[0]; // assuming n > 0 + for (int k = 1; k < n; ++k) + out[k] = out[k – 1] + in[k]; + */ + if(n < 1) return; + + // inclusive + odata[0] = idata[0]; + for(int i=1; i0; --i){ +// odata[i] = odata[i-1]; +// } +// odata[0] = 0; + timer().endCpuTimer(); } @@ -30,9 +49,17 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int numNonZero = 0; + for(int i=0; i #include "common.h" #include "efficient.h" +#include + +#define checkCUDAErrorWithLine(msg) checkCUDAErrorFn(msg, __FILE__, __LINE__) +#define blockSize 64 namespace StreamCompaction { namespace Efficient { @@ -12,15 +16,267 @@ namespace 
StreamCompaction { return timer; } + __global__ void kernEfficientScanUp(int N, int d, int* odata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + int power_plus = (int) powf(2.0f, (float) d+1); + if (index % power_plus != 0) return; + + int power_ident = power_plus / 2; + //int power_ident = (int) powf(2.0f, (float) d); + + odata[index + power_plus - 1] += odata[index + power_ident - 1]; + + + } + + __global__ void kernEfficientScanReset(int N, int* odata) { + // should not be needed in the end + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index != 0) return; + odata[N-1] = 0; + } + + + + + __global__ void kernEfficientMoveLastValue(int N, int block_idx, const int* odata, int* auxArray) { + // should not be needed in the end + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index != 0) return; + auxArray[block_idx] = odata[N-1]; + + + } + + __global__ void kernEfficientScanDown(int N, int d, int* odata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + int power_plus = (int) powf(2.0f, (float) d+1); + if (index % power_plus != 0) return; + + int power_ident = power_plus / 2; + //int power_ident = (int) powf(2.0f, (float) d); + + int t = odata[index + power_ident - 1]; + + odata[index + power_ident - 1] = odata[index + power_plus - 1]; + odata[index + power_plus - 1] += t; + + + } + + __global__ void kernAddToAll(int N, int block_idx, const int* auxArr, int* odata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + odata[index] += auxArr[block_idx]; + } + + void _scan(int N, int* dev_odata) { + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + + + for(int d=0;d<=ilog2ceil(N)-1;d++){ + kernEfficientScanUp<<>>(N, d, dev_odata); + + } + + kernEfficientScanReset<<>>(N, dev_odata); + + for(int d=ilog2ceil(N)-1;d>=0;d--){ + kernEfficientScanDown<<>>(N, d, dev_odata); + + + } + + + } + + void _scanBlock(int N, int* dev_odata, 
int block_idx, int* aux_arr) { + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + int offset = N*block_idx; + + for(int d=0;d<=ilog2ceil(N)-1;d++){ + kernEfficientScanUp<<>>(N, d, dev_odata + offset); + + } + + cudaMemcpy(aux_arr + block_idx, dev_odata + offset + N - 1, sizeof(int), cudaMemcpyDeviceToDevice); + //cudaMemset(dev_odata + offset + N -1 , 0, sizeof(int)); + kernEfficientScanReset<<>>(N, dev_odata + offset); + + for(int d=ilog2ceil(N)-1;d>=0;d--){ + kernEfficientScanDown<<>>(N, d, dev_odata + offset); + + + } + + + } + + + int getNextPower(int _N){ + int N = 1; + while(N < _N){ + N *= 2; + } + return N; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + + + void _chunkedScanAutomatic(int N, int *odata){ + + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + // divide array into blocks + const int num_per_block = 1024; + int num_blocks = ceil(N / (double) num_per_block); + int num_scan_blocks = getNextPower(num_blocks); + + int* dev_auxarr; + cudaMalloc((void**)&dev_auxarr, num_scan_blocks * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_auxarr failed!"); + + + for(int block_idx=0; block_idx>>(num_per_block, block_idx, dev_auxarr, odata + offset); + } + + cudaFree(dev_auxarr); + } + void _chunkedScan(int N, int *odata, int *auxarr, int num_per_block, int num_blocks, int num_scan_blocks){ + + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + + + + + + for(int block_idx=0; block_idx>>(num_per_block, block_idx, auxarr, odata + offset); + } + + + + } + + + void scan(int _N, int *odata, const int *idata) { + + + int N = getNextPower(_N); + + + //dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + // divide array into blocks + const int num_per_block = 1024; + int num_blocks = ceil(N / (double) num_per_block); + int num_scan_blocks = getNextPower(num_blocks); 
+ + int* dev_auxarr; + cudaMalloc((void**)&dev_auxarr, num_scan_blocks * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_auxarr failed!"); + + // copy data to gpu buffer + int* dev_odata; + cudaMalloc((void**)&dev_odata, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_odata failed!"); + + cudaMemset(dev_odata, 0, sizeof(int) * N); + cudaMemcpy(dev_odata, idata, sizeof(int) * _N, cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + + _chunkedScan(N, dev_odata, dev_auxarr, num_per_block, num_blocks, num_scan_blocks); + + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, sizeof(int) * _N, cudaMemcpyDeviceToHost); + + cudaDeviceSynchronize(); + cudaFree(dev_odata); + cudaFree(dev_auxarr); + + } + + void scanplain(int _N, int *odata, const int *idata) { + + + int N = getNextPower(_N); + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + + // copy data to gpu buffer + int* dev_odata; + //int* dev_idata; + cudaMalloc((void**)&dev_odata, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_odata failed!"); + + cudaMemset(dev_odata, 0, sizeof(int) * N); + cudaMemcpy(dev_odata, idata, sizeof(int) * _N, cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + + _scan(N, dev_odata); + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, sizeof(int) * _N, cudaMemcpyDeviceToHost); + + cudaDeviceSynchronize(); + cudaFree(dev_odata); + } + + + __global__ void kernBooleanAssigner(int N, int* odata, const int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + odata[index] = idata[index] == 0 ? 0 : 1; + + } + + __global__ void kernScatter(int N, int* odata, const int* idata, const int* boolarr, const int* scanres ) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + if(!boolarr[index]) return; + + odata[scanres[index]] = idata[index]; + + + } + + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. 
@@ -30,11 +286,66 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ - int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; + int compact(int _N, int *odata, const int *idata) { + + int N = getNextPower(_N); + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + // copy data to gpu buffer + int* dev_odata; + int* dev_idata; + int* dev_1bool; + int* dev_2scanres; + int numElements = -1; + + + cudaMalloc((void**)&dev_odata, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_odata failed!"); + cudaMalloc((void**)&dev_idata, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_1bool, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_1bool failed!"); + cudaMalloc((void**)&dev_2scanres, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_2scanres failed!"); + + cudaMemset(dev_odata, 0, sizeof(int) * N); + cudaMemset(dev_idata, 0, sizeof(int) * N); + cudaMemset(dev_1bool, 0, sizeof(int) * N); + cudaMemset(dev_2scanres, 0, sizeof(int) * N); + + cudaMemcpy(dev_idata, idata, sizeof(int) * _N, cudaMemcpyHostToDevice); + //cudaMemcpy(dev_odata, odata, sizeof(int) * N, cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + + kernBooleanAssigner<<>>(N, dev_1bool, dev_idata); + cudaMemcpy(dev_2scanres, dev_1bool, sizeof(int) * _N, cudaMemcpyDeviceToDevice); + + + _chunkedScanAutomatic(_N, dev_2scanres); + + + kernScatter<<>>(N, dev_odata, dev_idata, dev_1bool, dev_2scanres); + cudaMemcpy(&numElements, dev_2scanres + (_N-1), sizeof(int), cudaMemcpyDeviceToHost); + + if(_N % 2 != 0){ + numElements++; + } + + cudaDeviceSynchronize(); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, sizeof(int) * _N, cudaMemcpyDeviceToHost); + + cudaDeviceSynchronize(); + cudaFree(dev_odata); + cudaFree(dev_idata); + cudaFree(dev_1bool); + 
cudaFree(dev_2scanres); + + + return numElements; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..2499d5e 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -7,6 +7,7 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); + void scanplain(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..22f690d 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -2,6 +2,11 @@ #include #include "common.h" #include "naive.h" +#include + +#define checkCUDAErrorWithLine(msg) checkCUDAErrorFn(msg, __FILE__, __LINE__) +#define blockSize 512 + namespace StreamCompaction { namespace Naive { @@ -12,14 +17,81 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + __global__ void kernNaiveScan(int N, int d, int* odata, const int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + +// int power = 1; +// if(d>1){ +// for(int i=0; i= power){ + + odata[index] = idata[index-power] + idata[index]; + + + }else{ + odata[index] = idata[index]; + } + + + } + + int getNextPower(int _N){ + int N = 1; + while(N < _N){ + N *= 2; + } + return N; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + void scan(int _N, int *odata, const int *idata) { + + int N = getNextPower(_N); + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + // copy data to gpu buffer + int* dev_odata; + int* dev_idata; + cudaMalloc((void**)&dev_odata, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_odata failed!"); + cudaMalloc((void**)&dev_idata, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_idata failed!"); + + cudaMemcpy(dev_idata, idata, sizeof(int) * _N, cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata, idata, sizeof(int) * _N, cudaMemcpyHostToDevice); + + + timer().startGpuTimer(); + + for(int d=1;d<=ilog2ceil(N);d++){ + int* tmp = dev_idata; + dev_idata = dev_odata; + dev_odata = tmp; + + kernNaiveScan<<>>(N, d, dev_odata, dev_idata); + + cudaDeviceSynchronize(); + + } + + cudaDeviceSynchronize(); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, sizeof(int) * _N, cudaMemcpyDeviceToHost); + + cudaDeviceSynchronize(); + cudaFree(dev_odata); + cudaFree(dev_idata); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..6e2685f 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -6,6 +6,9 @@ #include "common.h" #include "thrust.h" +#define checkCUDAErrorWithLine(msg) checkCUDAErrorFn(msg, __FILE__, __LINE__) +#define blockSize 128 + namespace StreamCompaction { namespace Thrust { using StreamCompaction::Common::PerformanceTimer; @@ -17,12 +20,41 @@ namespace StreamCompaction { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); + void scan(int N, int *odata, const int *idata) { + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + // copy data to gpu buffer + int* dev_odata; + int* dev_idata; + cudaMalloc((void**)&dev_odata, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_odata failed!"); + cudaMalloc((void**)&dev_idata, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_idata failed!"); + + cudaMemcpy(dev_odata, odata, sizeof(int) * N, cudaMemcpyHostToDevice); + cudaMemcpy(dev_idata, idata, sizeof(int) * N, cudaMemcpyHostToDevice); + + + timer().startGpuTimer(); + + thrust::device_ptr dev_thrust_odata(dev_odata); + thrust::device_ptr dev_thrust_idata(dev_idata); + + // TODO use `thrust::exclusive_scan` + // example: for device_vectors dv_in and dv_out: + // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::inclusive_scan(dev_thrust_idata, dev_thrust_idata+N, dev_thrust_odata); + + //cudaDeviceSynchronize(); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, sizeof(int) * N, cudaMemcpyDeviceToHost); + + cudaDeviceSynchronize(); + cudaFree(dev_odata); + cudaFree(dev_idata); + } } }