CIS565-Fall-2018 · Zichuanyun · Sep 17, 2018 · Sep 17, 2018 · Sep 19, 2018 · Sep 19, 2018
diff --git a/README.md b/README.md
@@ -3,12 +3,112 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Zichuan Yu
+  * [LinkedIn](https://www.linkedin.com/in/zichuan-yu/), [Behance](https://www.behance.net/zainyu717ebcc)
+* Tested on: Windows 10.0.17134 Build 17134, i7-4710 @ 2.50GHz 16GB, GTX 980m 4096MB GDDR5
+
+## Features
+
+- CPU Scan
+- CPU Stream Compaction
+- Naive GPU Scan
+- Work-Efficient GPU Scan
+- Work-Efficient GPU Stream Compaction
+- Thrust Implementation
+
+## Performance Analysis
+
+### Block size analysis
+
+We fix the array size at 2^21 and vary the block size.
+
+![block_size](img/block_size.png)
+
+As we can see, as long as the block size is not 32, increasing the block size makes little difference.
+
+### Array Size Analysis on Scan
+
+We fix the block size at 1024 and vary the array size.
+
+![scan](img/scan.png)
+
+As we can see, the CPU implementation is, as expected, the slowest. We can also see that my own GPU implementations are still much slower than
+the Thrust implementation. I think this is because my code is not yet efficient enough and does not fully exploit the hardware.
+
+### Array Size Analysis on Compaction
+
+We fix the block size at 1024 and vary the array size.
+
+![compaction](img/compaction.png)
+
+As we can see, CPU compaction with scan is the slowest. I think the scan step adds overhead on the CPU; thus, when compacting on the CPU, it is better not to use scan at all.
+
+## Output
+
+Array size 2^28, block size 1024
+
+```shell
+
+****************
+** SCAN TESTS **
+****************
+    [   1   1   1   1   1   1   1   1   1   1   1   1   1 ...   1   1 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 1535.85ms    (std::chrono Measured)
+    [   0   1   2   3   4   5   6   7   8   9  10  11  12 ... 268435454 268435455 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 594.798ms    (std::chrono Measured)
+    [   0   1   2   3   4   5   6   7   8   9  10  11  12 ... 268435451 268435452 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 510.046ms    (CUDA Measured)
+    [   0   1   2   3   4   5   6   7   8   9  10  11  12 ... 268435454 268435455 ]
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 510.037ms    (CUDA Measured)
+    [   0   1   2   3   4   5   6   7   8   9  10  11  12 ...   0   0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 175.304ms    (CUDA Measured)
+    [   0   1   2   3   4   5   6   7   8   9  10  11  12 ... 268435454 268435455 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 175.151ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 28.8416ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 28.8394ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   0   1   0   0   1   0   3   1   3   3   0   3   1 ...   3   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 708.621ms    (std::chrono Measured)
+    [   1   1   3   1   3   3   3   1   1   1   1   1   1 ...   1   3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 680.761ms    (std::chrono Measured)
+    [   1   1   3   1   3   3   3   1   1   1   1   1   1 ...   1   1 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 1471.92ms    (std::chrono Measured)
+    [   1   1   3   1   3   3   3   1   1   1   1   1   1 ...   1   3 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 213.044ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 212.931ms    (CUDA Measured)
+    passed
+Press any key to continue . . .
+```
+
+
+
 
-### (TODO: Your README)
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
 
diff --git a/img/block_size.png b/img/block_size.png
diff --git a/img/compaction.png b/img/compaction.png
diff --git a/img/scan.png b/img/scan.png
diff --git a/src/main.cpp b/src/main.cpp
@@ -6,14 +6,17 @@
  * @copyright University of Pennsylvania
  */
 
+
+
+
 #include <cstdio>
 #include <stream_compaction/cpu.h>
 #include <stream_compaction/naive.h>
 #include <stream_compaction/efficient.h>
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 28; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -29,6 +32,7 @@ int main(int argc, char* argv[]) {
 
     genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
     a[SIZE - 1] = 0;
+    onesArray(SIZE, a);
     printArray(SIZE, a, true);
 
     // initialize b using StreamCompaction::CPU::scan you implement
@@ -51,7 +55,7 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
 	/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -64,14 +68,14 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);

diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
@@ -1,8 +1,8 @@
 #pragma once
 
-#include <cstdlib>
-#include <cstdio>
-#include <iostream>
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
 #include <string>
 #include <ctime>
 
@@ -69,8 +69,8 @@ void printArray(int n, int *a, bool abridged = false) {
     printf("]\n");
 }
 
-template<typename T>
-void printElapsedTime(T time, std::string note = "")
-{
-	std::cout << "   elapsed time: " << time << "ms    " << note << std::endl;
+template<typename T>
+void printElapsedTime(T time, std::string note = "")
+{
+	std::cout << "   elapsed time: " << time << "ms    " << note << std::endl;
 }
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_52
     )
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
@@ -13,6 +13,11 @@
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
 
+
+#define BLOCK_SIZE 1024
+
+#define PLUS_OP_IDENTITY 0
+
 /**
  * Check for CUDA errors; print and exit if there was a problem.
  */

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
@@ -1,15 +1,15 @@
 #include <cstdio>
 #include "cpu.h"
 
-#include "common.h"
+#include "common.h"
 
 namespace StreamCompaction {
     namespace CPU {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-	        static PerformanceTimer timer;
-	        return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+	        static PerformanceTimer timer;
+	        return timer;
         }
 
         /**
@@ -19,7 +19,10 @@ namespace StreamCompaction {
          */
         void scan(int n, int *odata, const int *idata) {
 	        timer().startCpuTimer();
-            // TODO
+          odata[0] = 0;
+          for (int i = 1; i < n; ++i) {
+            odata[i] = odata[i - 1] + idata[i - 1];
+          }
 	        timer().endCpuTimer();
         }
 
@@ -30,9 +33,15 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
 	        timer().startCpuTimer();
-            // TODO
+          int non_zero_idx = 0;
+          for (int i = 0; i < n; ++i) {
+            if (idata[i] != 0) {
+              odata[non_zero_idx] = idata[i];
+              ++non_zero_idx;
+            }
+          }
 	        timer().endCpuTimer();
-            return -1;
+          return non_zero_idx;
         }
 
         /**
@@ -41,10 +50,31 @@ namespace StreamCompaction {
          * @returns the number of elements remaining after compaction.
          */
         int compactWithScan(int n, int *odata, const int *idata) {
-	        timer().startCpuTimer();
-	        // TODO
+
+	        // allocate a temporary 0/1 accumulating array
+          int* temp = new int[n];
+
+          //for (int i = 0; i < n; ++i) {
+          //  temp[i] = 0;
+          //}
+          temp[0] = 0;
+          timer().startCpuTimer();
+          // scan to 0/1 accumulating array
+          for (int i = 1; i < n; ++i) {
+            temp[i] = temp[i - 1] + (idata[i - 1] != 0);
+          }
+
+          // use temp to map to output
+          int count = 0;
+          for (int i = 0; i < n; ++i) {
+            if (idata[i] != 0) {
+              ++count;
+              odata[temp[i]] = idata[i];
+            }
+          }
 	        timer().endCpuTimer();
-            return -1;
+          delete[] temp;
+          return count;
         }
     }
 }