From 84d0edaad5d9777a6c52d43e584bae60d1b50be0 Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Sun, 19 Sep 2021 20:59:27 -0400 Subject: [PATCH 01/27] Add some comments --- stream_compaction/common.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..509365b 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -11,6 +11,7 @@ #include #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) +// usage: checkCUDAError("a descriptive name of this error") #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) /** @@ -26,6 +27,7 @@ inline int ilog2(int x) { return lg; } +// computes the ceiling of log2(x), as an integer. inline int ilog2ceil(int x) { return x == 1 ? 0 : ilog2(x - 1) + 1; } From b6b21a1afcae465bf0283424fee02dae0cc2c736 Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Sun, 19 Sep 2021 21:29:09 -0400 Subject: [PATCH 02/27] Add useful comments --- src/main.cpp | 2 ++ src/testing_helpers.hpp | 2 ++ stream_compaction/cpu.cu | 1 + 3 files changed, 5 insertions(+) diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..3b845ad 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,6 +13,7 @@ #include #include "testing_helpers.hpp" +// The tests default to an array of size 1 << 8 = 256 const int SIZE = 1 << 8; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; @@ -35,6 +36,7 @@ int main(int argc, char* argv[]) { // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. // At first all cases passed because b && c are all zeroes. 
zeroArray(SIZE, b); + // Here, power-of-two refers to the length of the input array printDesc("cpu scan, power-of-two"); StreamCompaction::CPU::scan(SIZE, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 025e94a..39161c7 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -49,6 +49,8 @@ void onesArray(int n, int *a) { } } +// This function populates n elements of array a with values +// between 0 and maxval - 1 void genArray(int n, int *a, int maxval) { srand(time(nullptr)); diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..9af91b9 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -14,6 +14,7 @@ namespace StreamCompaction { /** * CPU scan (prefix sum). + * CPU scan (exclusive prefix sum). * For performance analysis, this is supposed to be a simple for loop. * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. 
*/ From c5666139caf076f90b6c4ab2176da012db0ee58a Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Sun, 19 Sep 2021 21:30:05 -0400 Subject: [PATCH 03/27] tested: cpu scan, power of two --- src/main.cpp | 16 ++++++++++++++++ stream_compaction/cpu.cu | 6 +++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index 3b845ad..d433f45 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -20,6 +20,10 @@ int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; +int* bookArraya = new int[8]{ 3, 1, 7, 0 ,4 ,1 ,6, 3 }; +int* bookArrayb = new int[8]{}; +const int BOOK_SIZE = 8; + int main(int argc, char* argv[]) { // Scan tests @@ -42,6 +46,15 @@ int main(int argc, char* argv[]) { printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); printArray(SIZE, b, true); + printf("\n"); + + printArray(BOOK_SIZE, bookArraya, false); + zeroArray(BOOK_SIZE, bookArrayb); + printDesc("cpu scan, power-of-two"); + StreamCompaction::CPU::scan(BOOK_SIZE, bookArrayb, bookArraya); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(BOOK_SIZE, bookArrayb, false); + zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); @@ -153,4 +166,7 @@ int main(int argc, char* argv[]) { delete[] a; delete[] b; delete[] c; + + delete[] bookArraya; + delete[] bookArrayb; } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 9af91b9..01a5456 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -13,7 +13,6 @@ namespace StreamCompaction { } /** - * CPU scan (prefix sum). * CPU scan (exclusive prefix sum). * For performance analysis, this is supposed to be a simple for loop. * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. 
@@ -21,6 +20,11 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + odata[0] = 0; + for (int j = 1; j < n; j++) + { + odata[j] = odata[j - 1] + idata[j - 1]; + } timer().endCpuTimer(); } From 0ca8573035c5cf17a4c2d059bba993299bbab6ba Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Sun, 19 Sep 2021 22:39:57 -0400 Subject: [PATCH 04/27] CPU compactWithoutScan --- src/main.cpp | 9 +++++++++ stream_compaction/cpu.cu | 18 +++++++++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index d433f45..3c0b527 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -55,6 +55,8 @@ int main(int argc, char* argv[]) { printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); printArray(BOOK_SIZE, bookArrayb, false); + printf("\n"); + zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); @@ -62,6 +64,9 @@ int main(int argc, char* argv[]) { printArray(NPOT, b, true); printCmpResult(NPOT, b, c); + printf("\n"); + +#if 0 zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); @@ -110,6 +115,8 @@ int main(int argc, char* argv[]) { //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); +#endif + printf("\n"); printf("*****************************\n"); printf("** STREAM COMPACTION TESTS **\n"); @@ -148,6 +155,7 @@ int main(int argc, char* argv[]) { printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); +#if 0 zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); @@ -161,6 +169,7 @@ int main(int argc, char* argv[]) { printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); +#endif system("pause"); // stop 
Win32 console from closing on exit delete[] a; diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 01a5456..fe38aa8 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -30,19 +30,31 @@ namespace StreamCompaction { /** * CPU stream compaction without using the scan function. - * + * This stream compaction method will remove 0s from an array of ints. * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + // Given an array of elements, create a new array with all the 0s + // removed while preserving order + int index = 0; + for (int i = 0; i < n; i++) + { + int thisElement = idata[i]; + if (thisElement != 0) + { + odata[index] = thisElement; + index++; + } + } timer().endCpuTimer(); - return -1; + return n - index; } /** * CPU stream compaction using scan and scatter, like the parallel version. - * + * This stream compaction method will remove 0s from an array of ints. * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { From a4d281728100b56c78cf60752e44f81694e4339a Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Sun, 19 Sep 2021 23:45:18 -0400 Subject: [PATCH 05/27] Upate REAME --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 0e38ddb..e6e2358 100644 --- a/README.md +++ b/README.md @@ -12,3 +12,10 @@ CUDA Stream Compaction Include analysis, etc. (Remember, this is public, so don't put anything here that you don't want to share with the world.) +# Question +``` +genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); +``` +Why leave 0? 
\ No newline at end of file From dbeb2414c897ef1cc8bbd6d6eb197a7fc6846d39 Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Sun, 19 Sep 2021 23:45:37 -0400 Subject: [PATCH 06/27] Part 1: CPU Scan & Stream Compaction --- stream_compaction/cpu.cu | 63 ++++++++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 9 deletions(-) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index fe38aa8..89d81f5 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -3,6 +3,8 @@ #include "common.h" +#include // for smart pointers + namespace StreamCompaction { namespace CPU { using StreamCompaction::Common::PerformanceTimer; @@ -12,6 +14,14 @@ namespace StreamCompaction { return timer; } + void scanWithoutTimer(int n, int* odata, const int* idata) { + odata[0] = 0; + for (int j = 1; j < n; j++) + { + odata[j] = odata[j - 1] + idata[j - 1]; + } + } + /** * CPU scan (exclusive prefix sum). * For performance analysis, this is supposed to be a simple for loop. @@ -19,12 +29,7 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO - odata[0] = 0; - for (int j = 1; j < n; j++) - { - odata[j] = odata[j - 1] + idata[j - 1]; - } + scanWithoutTimer(n, odata, idata); timer().endCpuTimer(); } @@ -35,7 +40,6 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO // Given an array of elements, create a new array with all the 0s // removed while preserving order int index = 0; @@ -59,9 +63,50 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int numElement = 0; + std::unique_ptrtempArray{ new int[n] }; + std::unique_ptrscanResult{ new int[n] }; + for (int i = 0; i < n; i++) + { + scanResult[i] = -1; + } + + // STEP 1: Compute temp Array with 0s and 1s + // intialize array such that all elements meet criteria + for (int i = 0; i < n; 
i++) + { + tempArray[i] = 1; + } + // next, figure out which one doesn't meet criteria + for (int i = 0; i < n; i++) + { + // since we want to remove 0s, elements with value = 0 doesn't + // meet criteria + if (idata[i] == 0) + { + tempArray[i] = 0; + } + } + + // STEP 2: Run exclusive scan on tempArray + scanWithoutTimer(n, scanResult.get(), tempArray.get()); + + // STEP 3: scatter + for (int i = 0; i < n; i++) + { + // result of scan is index into final array + int index = scanResult[i]; + // only write an element if temp array has a 1 + if (tempArray[i] == 1) + { + odata[index] = idata[i]; + numElement++; + } + } + timer().endCpuTimer(); - return -1; + return n - numElement; } } } From 152f9d29f3959e28f72f40436b9825a5635ab790 Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Mon, 20 Sep 2021 11:56:02 -0400 Subject: [PATCH 07/27] Bloopers 1 --- src/main.cpp | 49 +++++++++++++++++++++++-- stream_compaction/naive.cu | 73 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 118 insertions(+), 4 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 3c0b527..b327f86 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -7,6 +7,8 @@ */ #include +#include +#include #include #include #include @@ -24,7 +26,49 @@ int* bookArraya = new int[8]{ 3, 1, 7, 0 ,4 ,1 ,6, 3 }; int* bookArrayb = new int[8]{}; const int BOOK_SIZE = 8; +std::string deviceName; +int deviceMaxThreadsPerBlock; +int deviceSharedMemPerBlock; +int deviceMaxThreadsPerSM; +int deviceMaxBlocksPerSM; + int main(int argc, char* argv[]) { + cudaDeviceProp deviceProp; + int gpuDevice = 0; + int device_count = 0; + cudaGetDeviceCount(&device_count); + if (gpuDevice > device_count) { + std::cout + << "Error: GPU device number is greater than the number of devices!" + << " Perhaps a CUDA-capable GPU is not installed?" 
+ << std::endl; + return false; + } + cudaGetDeviceProperties(&deviceProp, gpuDevice); + int major = deviceProp.major; + int minor = deviceProp.minor; + deviceMaxThreadsPerBlock = deviceProp.maxThreadsPerBlock; + deviceSharedMemPerBlock = deviceProp.sharedMemPerBlock; + deviceMaxThreadsPerSM = deviceProp.maxThreadsPerMultiProcessor; + deviceMaxBlocksPerSM = deviceProp.maxBlocksPerMultiProcessor; + + + + std::ostringstream ss; + ss << " [SM " << major << "." << minor << " " << deviceProp.name << "]" + << "\n Max threads per block: " << deviceMaxThreadsPerBlock + << "\n Shared memory per block: " << deviceSharedMemPerBlock << " bytes" + // << "\n Shared memory in each block can fit " << deviceSharedMemPerBlock / sizeof(int) << " number of integers" + << "\n Max threads per SM: " << deviceMaxThreadsPerSM + << "\n Max blocks per SM: " << deviceMaxBlocksPerSM + << "\n Max grid size: " << deviceProp.maxGridSize[0] << ", " + << deviceProp.maxGridSize[1] << ", " << deviceProp.maxGridSize[2]; + + + deviceName = ss.str(); + + std::cout << deviceName << '\n'; + // Scan tests printf("\n"); @@ -66,14 +110,15 @@ int main(int argc, char* argv[]) { printf("\n"); -#if 0 + zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); +#if 0 /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan onesArray(SIZE, c); printDesc("1s array for finding bugs"); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..5e74bf2 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,10 @@ #include "common.h" #include "naive.h" +/*! Block size used for CUDA kernel launch. 
*/ +#define blockSize 128 +#define sectionSize 128 + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -11,15 +15,80 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + __global__ void kernNaiveGPUScan(int* inputArray, int* outputArray, + int inputSize) + { + // Each thread loads one value from the input array into shared + // memory array XY + __shared__ int XY[sectionSize]; + int i = blockIdx.x * blockDim.x + threadIdx.x; + // convert inclusive scan into exclusive scan by shifting + // all elements to the right by one position and fill the frist + // element and out-of-bound elements with 0. + if (i < inputSize && threadIdx.x != 0) + { + XY[threadIdx.x] = inputArray[i - 1]; + } + else { + XY[threadIdx.x] = 0; + } + // perform naive scan + for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) + { + // make sure that input is in place + __syncthreads(); + int index = threadIdx.x; + int previousIndex = index - stride; +#if 0 + if (previousIndex < 0) + { + previousIndex = 0; + } +#endif + int temp = XY[index] + XY[previousIndex]; + // make sure previous output has been consumed + __syncthreads(); + XY[index] = temp; + } + + // each thread writes its result into the output array + outputArray[i] = XY[threadIdx.x]; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + int size = n * sizeof(int); + int* d_InputData; + int* d_OutputData; + + cudaMalloc((void**)&d_InputData, size); + checkCUDAError("cudaMalloc d_InputData failed!"); + + cudaMalloc((void**)&d_OutputData, size); + checkCUDAError("cudaMalloc d_OutputData failed!"); + + cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice); + cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice); + + dim3 dimGrid((n + blockSize - 1) / blockSize, 1, 1); + dim3 dimBlock(blockSize, 1, 1); + timer().startGpuTimer(); - // TODO + kernNaiveGPUScan <<>> (d_InputData, + d_OutputData, n); + checkCUDAError("kernNaiveGPUScan failed!"); timer().endGpuTimer(); + + cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); + + // cleanup + cudaFree(d_InputData); + cudaFree(d_OutputData); + checkCUDAError("cudaFree failed!"); } } } From 37835f7607c2cc1d89e225d8751e982c1fe0137f Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Mon, 20 Sep 2021 11:56:30 -0400 Subject: [PATCH 08/27] Bloopers 1 Fixed --- stream_compaction/naive.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 5e74bf2..b21a365 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -40,12 +40,10 @@ namespace StreamCompaction { __syncthreads(); int index = threadIdx.x; int previousIndex = index - stride; -#if 0 if (previousIndex < 0) { previousIndex = 0; } -#endif int temp = XY[index] + XY[previousIndex]; // make sure previous output has been consumed __syncthreads(); From a5cae919402c4cb14fd8ba0bfd16e01fe6b52f9e Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Mon, 20 Sep 2021 14:39:14 -0400 Subject: [PATCH 09/27] Tested kernNaiveGPUScanFirstStep power/non power of two --- src/main.cpp | 6 +- stream_compaction/naive.cu | 114 +++++++++++++++++++++++++++++++++++-- 2 files changed, 113 insertions(+), 7 deletions(-) diff --git a/src/main.cpp 
b/src/main.cpp index b327f86..401f5fa 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -80,6 +80,7 @@ int main(int argc, char* argv[]) { a[SIZE - 1] = 0; printArray(SIZE, a, true); +#if 0 // initialize b using StreamCompaction::CPU::scan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. // At first all cases passed because b && c are all zeroes. @@ -110,7 +111,7 @@ int main(int argc, char* argv[]) { printf("\n"); - +#endif zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); @@ -162,6 +163,7 @@ int main(int argc, char* argv[]) { #endif +#if 0 printf("\n"); printf("*****************************\n"); printf("** STREAM COMPACTION TESTS **\n"); @@ -200,7 +202,7 @@ int main(int argc, char* argv[]) { printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); -#if 0 + zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index b21a365..718eb27 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,8 @@ #include "common.h" #include "naive.h" +#include // testing + /*! Block size used for CUDA kernel launch. 
*/ #define blockSize 128 #define sectionSize 128 @@ -16,8 +18,8 @@ namespace StreamCompaction { return timer; } - __global__ void kernNaiveGPUScan(int* inputArray, int* outputArray, - int inputSize) + __global__ void kernNaiveGPUScanFirstStep(int* inputArray, int* outputArray, + int* SumArray, int inputSize) { // Each thread loads one value from the input array into shared // memory array XY @@ -52,6 +54,65 @@ namespace StreamCompaction { // each thread writes its result into the output array outputArray[i] = XY[threadIdx.x]; + + // the last thread in the block should write the output value of + // the last XY element in the block to the blockIdx.x position of + // SumArray + + // make sure XY[sectionSize - 1] has the correct partial sum + __syncthreads(); + if (threadIdx.x == blockDim.x - 1) + { + SumArray[blockIdx.x] = XY[sectionSize - 1]; + } + } + + __global__ void kernNaiveGPUScanSecondStep(int* inputArray, int* outputArray, + int inputSize) + { + // Each thread loads one value from the input array into shared + // memory array XY + __shared__ int XY[sectionSize]; + int i = blockIdx.x * blockDim.x + threadIdx.x; + // convert inclusive scan into exclusive scan by shifting + // all elements to the right by one position and fill the frist + // element and out-of-bound elements with 0. 
+ if (i < inputSize && threadIdx.x != 0) + { + XY[threadIdx.x] = inputArray[i - 1]; + } + else { + XY[threadIdx.x] = 0; + } + // perform naive scan + for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) + { + // make sure that input is in place + __syncthreads(); + int index = threadIdx.x; + int previousIndex = index - stride; + if (previousIndex < 0) + { + previousIndex = 0; + } + int temp = XY[index] + XY[previousIndex]; + // make sure previous output has been consumed + __syncthreads(); + XY[index] = temp; + } + + // each thread writes its result into the output array + outputArray[i] = XY[threadIdx.x]; + } + + __global__ void kernNaiveGPUScanThirdStep(int* inputArray, int* outputArray, + int inputSize) + { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < inputSize && blockIdx.x > 0) + { + outputArray[i] += inputArray[blockIdx.x - 1]; + } } /** @@ -61,6 +122,12 @@ namespace StreamCompaction { int size = n * sizeof(int); int* d_InputData; int* d_OutputData; + int sumArrayNumEle = (n + blockSize - 1) / blockSize; + int sumArraySize = sumArrayNumEle * sizeof(int); + int* d_SumArray; + + // for testing + int* sumArray = new int[sumArrayNumEle]; cudaMalloc((void**)&d_InputData, size); checkCUDAError("cudaMalloc d_InputData failed!"); @@ -68,6 +135,9 @@ namespace StreamCompaction { cudaMalloc((void**)&d_OutputData, size); checkCUDAError("cudaMalloc d_OutputData failed!"); + cudaMalloc((void**)&d_SumArray, sumArraySize); + checkCUDAError("cudaMalloc d_SumArray failed!"); + cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice); cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice); @@ -75,18 +145,52 @@ namespace StreamCompaction { dim3 dimBlock(blockSize, 1, 1); timer().startGpuTimer(); - kernNaiveGPUScan <<>> (d_InputData, - d_OutputData, n); - checkCUDAError("kernNaiveGPUScan failed!"); + // First step: compute the scan result for individual sections + // then, store their block sum to sumArray + kernNaiveGPUScanFirstStep <<>> 
(d_InputData, + d_OutputData, d_SumArray, n); + checkCUDAError("kernNaiveGPUScanFirstStep failed!"); +#if 0 + // cudaDeviceSynchronize(); + + kernNaiveGPUScanFirstStep << > > (d_InputData, + d_OutputData, d_SumArray, n); + checkCUDAError("kernNaiveGPUScanFirstStep failed!"); + + // cudaDeviceSynchronize(); + + kernNaiveGPUScanFirstStep << > > (d_InputData, + d_OutputData, d_SumArray, n); + checkCUDAError("kernNaiveGPUScanFirstStep failed!"); + + // cudaDeviceSynchronize(); +#endif timer().endGpuTimer(); cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost); checkCUDAError("memCpy back failed!"); + // testing: + cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); + for (int i = 0; i < sumArrayNumEle; i++) + { + std::cout << sumArray[i] << '\n'; + } + printf("\n"); + for (int i = 0; i < n; i++) + { + std::cout << odata[i] << '\n'; + } + + // cleanup cudaFree(d_InputData); cudaFree(d_OutputData); checkCUDAError("cudaFree failed!"); + + // testing clean up + delete[] sumArray; } } } From 5f90d77a86235eb08c980884bf2359104a6e2953 Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Mon, 20 Sep 2021 15:08:21 -0400 Subject: [PATCH 10/27] refactor and const correctness --- src/main.cpp | 9 +++-- stream_compaction/naive.cu | 76 ++++++++++++++++---------------------- 2 files changed, 37 insertions(+), 48 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 401f5fa..fc37f31 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,7 @@ #include "testing_helpers.hpp" // The tests default to an array of size 1 << 8 = 256 -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 9; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -112,14 +112,14 @@ int main(int argc, char* argv[]) { printf("\n"); #endif + zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); 
StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - -#if 0 + /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan onesArray(SIZE, c); printDesc("1s array for finding bugs"); @@ -133,6 +133,9 @@ int main(int argc, char* argv[]) { //printArray(SIZE, c, true); printCmpResult(NPOT, b, c); +#if 0 + + zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 718eb27..731df8a 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -17,13 +17,10 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - - __global__ void kernNaiveGPUScanFirstStep(int* inputArray, int* outputArray, - int* SumArray, int inputSize) + + __device__ void computeScanToOutputArray(const int* inputArray, int* outputArray, + int* XY, int inputSize) { - // Each thread loads one value from the input array into shared - // memory array XY - __shared__ int XY[sectionSize]; int i = blockIdx.x * blockDim.x + threadIdx.x; // convert inclusive scan into exclusive scan by shifting // all elements to the right by one position and fill the frist @@ -39,7 +36,7 @@ namespace StreamCompaction { for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) { // make sure that input is in place - __syncthreads(); + __syncthreads(); int index = threadIdx.x; int previousIndex = index - stride; if (previousIndex < 0) @@ -54,6 +51,15 @@ namespace StreamCompaction { // each thread writes its result into the output array outputArray[i] = XY[threadIdx.x]; + } + + __global__ void kernNaiveGPUScanFirstStep(const int* inputArray, + int* outputArray, int* SumArray, int inputSize) + { + // Each thread loads one value from the input array into shared + // memory array 
XY + __shared__ int XY[sectionSize]; + computeScanToOutputArray(inputArray, outputArray, XY, inputSize); // the last thread in the block should write the output value of // the last XY element in the block to the blockIdx.x position of @@ -67,46 +73,17 @@ namespace StreamCompaction { } } - __global__ void kernNaiveGPUScanSecondStep(int* inputArray, int* outputArray, - int inputSize) + __global__ void kernNaiveGPUScanSecondStep(const int* inputArray, + int* outputArray, int inputSize) { // Each thread loads one value from the input array into shared // memory array XY __shared__ int XY[sectionSize]; - int i = blockIdx.x * blockDim.x + threadIdx.x; - // convert inclusive scan into exclusive scan by shifting - // all elements to the right by one position and fill the frist - // element and out-of-bound elements with 0. - if (i < inputSize && threadIdx.x != 0) - { - XY[threadIdx.x] = inputArray[i - 1]; - } - else { - XY[threadIdx.x] = 0; - } - // perform naive scan - for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) - { - // make sure that input is in place - __syncthreads(); - int index = threadIdx.x; - int previousIndex = index - stride; - if (previousIndex < 0) - { - previousIndex = 0; - } - int temp = XY[index] + XY[previousIndex]; - // make sure previous output has been consumed - __syncthreads(); - XY[index] = temp; - } - - // each thread writes its result into the output array - outputArray[i] = XY[threadIdx.x]; + computeScanToOutputArray(inputArray, outputArray, XY, inputSize); } - __global__ void kernNaiveGPUScanThirdStep(int* inputArray, int* outputArray, - int inputSize) + __global__ void kernNaiveGPUScanThirdStep(const int* inputArray, + int* outputArray, int inputSize) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < inputSize && blockIdx.x > 0) @@ -141,16 +118,24 @@ namespace StreamCompaction { cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice); cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice); - dim3 
dimGrid((n + blockSize - 1) / blockSize, 1, 1); - dim3 dimBlock(blockSize, 1, 1); + dim3 dimGridArray((n + blockSize - 1) / blockSize, 1, 1); + dim3 dimBlockArray(blockSize, 1, 1); + + dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1); + dim3 dimBlockSumArray(blockSize, 1, 1); + timer().startGpuTimer(); // First step: compute the scan result for individual sections // then, store their block sum to sumArray - kernNaiveGPUScanFirstStep <<>> (d_InputData, + kernNaiveGPUScanFirstStep <<>> (d_InputData, d_OutputData, d_SumArray, n); checkCUDAError("kernNaiveGPUScanFirstStep failed!"); + + #if 0 + kernNaiveGPUScanSecondStep << > > ( + sumArray, sumArray, sumArrayNumEle); // cudaDeviceSynchronize(); kernNaiveGPUScanFirstStep << > > (d_InputData, @@ -170,6 +155,7 @@ namespace StreamCompaction { cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost); checkCUDAError("memCpy back failed!"); +#if 1 // testing: cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost); checkCUDAError("memCpy back failed!"); @@ -182,7 +168,7 @@ namespace StreamCompaction { { std::cout << odata[i] << '\n'; } - +#endif // cleanup cudaFree(d_InputData); From f90c4dda86eda1795443ac7332203d7425408a02 Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Mon, 20 Sep 2021 15:31:16 -0400 Subject: [PATCH 11/27] tested kernNaiveGPUScanSecondStep --- src/main.cpp | 2 +- stream_compaction/naive.cu | 36 +++++++++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index fc37f31..b3bd4b9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,7 @@ #include "testing_helpers.hpp" // The tests default to an array of size 1 << 8 = 256 -const int SIZE = 1 << 9; // feel free to change the size of array +const int SIZE = 1 << 10; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/naive.cu 
b/stream_compaction/naive.cu index 731df8a..eefb2cd 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -97,14 +97,17 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { int size = n * sizeof(int); - int* d_InputData; - int* d_OutputData; int sumArrayNumEle = (n + blockSize - 1) / blockSize; int sumArraySize = sumArrayNumEle * sizeof(int); + + int* d_InputData; + int* d_OutputData; int* d_SumArray; + int* d_SumArrayOutput; // for testing int* sumArray = new int[sumArrayNumEle]; + int* sumArrayOutput = new int[sumArrayNumEle]; cudaMalloc((void**)&d_InputData, size); checkCUDAError("cudaMalloc d_InputData failed!"); @@ -115,6 +118,9 @@ namespace StreamCompaction { cudaMalloc((void**)&d_SumArray, sumArraySize); checkCUDAError("cudaMalloc d_SumArray failed!"); + cudaMalloc((void**)&d_SumArrayOutput, sumArraySize); + checkCUDAError("cudaMalloc d_SumArrayOutput failed!"); + cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice); cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice); @@ -124,7 +130,6 @@ namespace StreamCompaction { dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1); dim3 dimBlockSumArray(blockSize, 1, 1); - timer().startGpuTimer(); // First step: compute the scan result for individual sections // then, store their block sum to sumArray @@ -132,12 +137,14 @@ namespace StreamCompaction { d_OutputData, d_SumArray, n); checkCUDAError("kernNaiveGPUScanFirstStep failed!"); - -#if 0 - kernNaiveGPUScanSecondStep << > > ( - sumArray, sumArray, sumArrayNumEle); // cudaDeviceSynchronize(); + kernNaiveGPUScanSecondStep <<>> ( + d_SumArray, d_SumArrayOutput, sumArrayNumEle); + checkCUDAError("kernNaiveGPUScanSecondStep failed!"); +#if 0 + + kernNaiveGPUScanFirstStep << > > (d_InputData, d_OutputData, d_SumArray, n); checkCUDAError("kernNaiveGPUScanFirstStep failed!"); @@ -159,11 +166,23 @@ namespace StreamCompaction { // testing: cudaMemcpy(sumArray, d_SumArray, sumArraySize, 
cudaMemcpyDeviceToHost); checkCUDAError("memCpy back failed!"); + cudaMemcpy(sumArrayOutput, d_SumArrayOutput, sumArraySize, + cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); + for (int i = 0; i < sumArrayNumEle; i++) { std::cout << sumArray[i] << '\n'; } + printf("\n"); + + for (int i = 0; i < sumArrayNumEle; i++) + { + std::cout << sumArrayOutput[i] << '\n'; + } + printf("\n"); + for (int i = 0; i < n; i++) { std::cout << odata[i] << '\n'; @@ -173,10 +192,13 @@ namespace StreamCompaction { // cleanup cudaFree(d_InputData); cudaFree(d_OutputData); + cudaFree(d_SumArray); + cudaFree(d_SumArrayOutput); checkCUDAError("cudaFree failed!"); // testing clean up delete[] sumArray; + delete[] sumArrayOutput; } } } From eee8f54f4c7a39db004b71a5716cb972f5bdea3c Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Mon, 20 Sep 2021 21:14:03 -0400 Subject: [PATCH 12/27] Do inclusive first #1 --- src/main.cpp | 34 ++++--- stream_compaction/naive.cu | 199 ++++++++++++++++++++++++++++--------- 2 files changed, 169 insertions(+), 64 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index b3bd4b9..29860a5 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,7 @@ #include "testing_helpers.hpp" // The tests default to an array of size 1 << 8 = 256 -const int SIZE = 1 << 10; // feel free to change the size of array +const int SIZE = 1 << 4; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -78,9 +78,9 @@ int main(int argc, char* argv[]) { genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; - printArray(SIZE, a, true); + printArray(SIZE, a, false); + -#if 0 // initialize b using StreamCompaction::CPU::scan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. // At first all cases passed because b && c are all zeroes. 
@@ -89,35 +89,37 @@ int main(int argc, char* argv[]) { printDesc("cpu scan, power-of-two"); StreamCompaction::CPU::scan(SIZE, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); + printArray(SIZE, b, false); printf("\n"); - - printArray(BOOK_SIZE, bookArraya, false); - zeroArray(BOOK_SIZE, bookArrayb); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(BOOK_SIZE, bookArrayb, bookArraya); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(BOOK_SIZE, bookArrayb, false); - printf("\n"); + +#if 0 zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); + printArray(NPOT, b, false); printCmpResult(NPOT, b, c); - printf("\n"); + printArray(BOOK_SIZE, bookArraya, false); + zeroArray(BOOK_SIZE, bookArrayb); + printDesc("cpu scan, power-of-two"); + StreamCompaction::CPU::scan(BOOK_SIZE, bookArrayb, bookArraya); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(BOOK_SIZE, bookArrayb, false); + printf("\n"); #endif - + printf("\n"); + + zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - printArray(SIZE, c, true); + printArray(SIZE, c, false); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index eefb2cd..77f9084 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -6,8 +6,8 @@ #include // testing /*! 
Block size used for CUDA kernel launch. */ -#define blockSize 128 -#define sectionSize 128 +#define blockSize 8 +#define sectionSize 8 namespace StreamCompaction { namespace Naive { @@ -18,16 +18,72 @@ namespace StreamCompaction { return timer; } - __device__ void computeScanToOutputArray(const int* inputArray, int* outputArray, - int* XY, int inputSize) + // write a kernel to convert from inclusive scan to exclusive scan + + __global__ void convertFromInclusiveToExclusive(const int* inputArray, + int* outputArray, int inputSize) { int i = blockIdx.x * blockDim.x + threadIdx.x; // convert inclusive scan into exclusive scan by shifting // all elements to the right by one position and fill the frist // element and out-of-bound elements with 0. - if (i < inputSize && threadIdx.x != 0) + if (i < inputSize && i != 0) { - XY[threadIdx.x] = inputArray[i - 1]; + + outputArray[i] = inputArray[i - 1]; + } + else { + outputArray[i] = 0; + } + } + + void unitTestConversion() + { + // for testing + int numObject = 8; + int size = numObject * sizeof(int); + int* toyExclusiveArray = new int[numObject]; + int* toyInclusiveArray = new int[numObject] {3, 4, 11, 11, 15, 16, 22, 25}; + + int* dev_toyExclusiveArray; + int* dev_toyInclusiveArray; + + cudaMalloc((void**)&dev_toyExclusiveArray, size); + checkCUDAError("cudaMalloc dev_toyExclusiveArray failed!"); + + cudaMalloc((void**)&dev_toyInclusiveArray, size); + checkCUDAError("cudaMalloc dev_toyInclusiveArray failed!"); + + cudaMemcpy(dev_toyInclusiveArray, toyInclusiveArray, size, + cudaMemcpyHostToDevice); + + dim3 dimGridArray((numObject + blockSize - 1) / blockSize, 1, 1); + dim3 dimBlockArray(blockSize, 1, 1); + convertFromInclusiveToExclusive <<>> ( + dev_toyInclusiveArray, dev_toyExclusiveArray, numObject); + + cudaMemcpy(toyExclusiveArray, dev_toyExclusiveArray, size, + cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); + + printf("\n"); + + for (int i = 0; i < numObject; i++) + { + std::cout << 
toyExclusiveArray[i] << '\n'; + } + + printf("\n"); + + } + + __device__ void computeScanToOutputArray(const int* inputArray, int* outputArray, + int* XY, int inputSize) + { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < inputSize) + { + XY[threadIdx.x] = inputArray[i]; } else { XY[threadIdx.x] = 0; @@ -37,16 +93,16 @@ namespace StreamCompaction { { // make sure that input is in place __syncthreads(); - int index = threadIdx.x; - int previousIndex = index - stride; - if (previousIndex < 0) + int previousValue = 0; + int previousIndex = threadIdx.x - stride; + if (previousIndex >= 0) { - previousIndex = 0; + previousValue = XY[previousIndex]; } - int temp = XY[index] + XY[previousIndex]; + int temp = XY[threadIdx.x] + previousValue; // make sure previous output has been consumed __syncthreads(); - XY[index] = temp; + XY[threadIdx.x] = temp; } // each thread writes its result into the output array @@ -86,9 +142,9 @@ namespace StreamCompaction { int* outputArray, int inputSize) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < inputSize && blockIdx.x > 0) + if (i < inputSize) { - outputArray[i] += inputArray[blockIdx.x - 1]; + outputArray[i] += inputArray[blockIdx.x]; } } @@ -96,6 +152,7 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + // unitTestConversion(); int size = n * sizeof(int); int sumArrayNumEle = (n + blockSize - 1) / blockSize; int sumArraySize = sumArrayNumEle * sizeof(int); @@ -103,11 +160,10 @@ namespace StreamCompaction { int* d_InputData; int* d_OutputData; int* d_SumArray; - int* d_SumArrayOutput; + // int* d_SumArrayOutput; // for testing int* sumArray = new int[sumArrayNumEle]; - int* sumArrayOutput = new int[sumArrayNumEle]; cudaMalloc((void**)&d_InputData, size); checkCUDAError("cudaMalloc d_InputData failed!"); @@ -118,15 +174,64 @@ namespace StreamCompaction { cudaMalloc((void**)&d_SumArray, sumArraySize); checkCUDAError("cudaMalloc d_SumArray failed!"); - cudaMalloc((void**)&d_SumArrayOutput, sumArraySize); - checkCUDAError("cudaMalloc d_SumArrayOutput failed!"); + // cudaMalloc((void**)&d_SumArrayOutput, sumArraySize); + // checkCUDAError("cudaMalloc d_SumArrayOutput failed!"); cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice); - cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice); dim3 dimGridArray((n + blockSize - 1) / blockSize, 1, 1); dim3 dimBlockArray(blockSize, 1, 1); + timer().startGpuTimer(); + // First step: compute the scan result for individual sections + // then, store their block sum to sumArray + kernNaiveGPUScanFirstStep << > > (d_InputData, + d_OutputData, d_SumArray, n); + checkCUDAError("kernNaiveGPUScanFirstStep failed!"); + timer().endGpuTimer(); + + cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); + + // testing: + cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); + + printf("\n"); + for (int i = 0; i < sumArrayNumEle; i++) + { + std::cout << sumArray[i] << '\n'; + } + + std::cout << '\n'; + for (int i = 0; i < n; i++) + { + std::cout << odata[i] << '\n'; + } + + // cleanup + cudaFree(d_InputData); + cudaFree(d_OutputData); + cudaFree(d_SumArray); + // 
cudaFree(d_SumArrayOutput); + checkCUDAError("cudaFree failed!"); + + // testing clean up + delete[] sumArray; + // delete[] sumArrayOutput; + +#if 0 + + int* sumArrayOutput = new int[sumArrayNumEle]; + + + dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1); + dim3 dimBlockSumArray(blockSize, 1, 1); + + + cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice); + + dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1); dim3 dimBlockSumArray(blockSize, 1, 1); @@ -137,42 +242,43 @@ namespace StreamCompaction { d_OutputData, d_SumArray, n); checkCUDAError("kernNaiveGPUScanFirstStep failed!"); - // cudaDeviceSynchronize(); + cudaDeviceSynchronize(); + + cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); + + + - kernNaiveGPUScanSecondStep <<>> ( + kernNaiveGPUScanSecondStep << > > ( d_SumArray, d_SumArrayOutput, sumArrayNumEle); checkCUDAError("kernNaiveGPUScanSecondStep failed!"); -#if 0 - - kernNaiveGPUScanFirstStep << > > (d_InputData, - d_OutputData, d_SumArray, n); - checkCUDAError("kernNaiveGPUScanFirstStep failed!"); + cudaDeviceSynchronize(); - // cudaDeviceSynchronize(); + kernNaiveGPUScanThirdStep <<>> ( + d_SumArrayOutput, d_OutputData, n); + checkCUDAError("kernNaiveGPUScanThirdStep failed!"); - kernNaiveGPUScanFirstStep << > > (d_InputData, - d_OutputData, d_SumArray, n); - checkCUDAError("kernNaiveGPUScanFirstStep failed!"); + cudaDeviceSynchronize(); - // cudaDeviceSynchronize(); -#endif timer().endGpuTimer(); cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost); checkCUDAError("memCpy back failed!"); -#if 1 - // testing: - cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost); - checkCUDAError("memCpy back failed!"); + + + cudaMemcpy(sumArrayOutput, d_SumArrayOutput, sumArraySize, cudaMemcpyDeviceToHost); checkCUDAError("memCpy back failed!"); + printf("\n"); + for (int i = 0; i < sumArrayNumEle; i++) { - std::cout << sumArray[i] 
<< '\n'; + std::cout << sumArrayOutput[i] << '\n'; } printf("\n"); @@ -182,23 +288,20 @@ namespace StreamCompaction { std::cout << sumArrayOutput[i] << '\n'; } printf("\n"); - + for (int i = 0; i < n; i++) + { + std::cout << idata[i] << '\n'; + } + std::cout << '\n'; for (int i = 0; i < n; i++) { std::cout << odata[i] << '\n'; } -#endif - // cleanup - cudaFree(d_InputData); - cudaFree(d_OutputData); - cudaFree(d_SumArray); - cudaFree(d_SumArrayOutput); - checkCUDAError("cudaFree failed!"); - // testing clean up - delete[] sumArray; - delete[] sumArrayOutput; + + +#endif } } } From 4a82c650fb7192af37d6f76822f82270ff0c614e Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Mon, 20 Sep 2021 21:19:08 -0400 Subject: [PATCH 13/27] Do inclusive first #2 --- stream_compaction/naive.cu | 52 +++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 77f9084..3cc81ce 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -160,10 +160,11 @@ namespace StreamCompaction { int* d_InputData; int* d_OutputData; int* d_SumArray; - // int* d_SumArrayOutput; + int* d_SumArrayOutput; // for testing int* sumArray = new int[sumArrayNumEle]; + int* sumArrayOutput = new int[sumArrayNumEle]; cudaMalloc((void**)&d_InputData, size); checkCUDAError("cudaMalloc d_InputData failed!"); @@ -174,20 +175,28 @@ namespace StreamCompaction { cudaMalloc((void**)&d_SumArray, sumArraySize); checkCUDAError("cudaMalloc d_SumArray failed!"); - // cudaMalloc((void**)&d_SumArrayOutput, sumArraySize); - // checkCUDAError("cudaMalloc d_SumArrayOutput failed!"); + cudaMalloc((void**)&d_SumArrayOutput, sumArraySize); + checkCUDAError("cudaMalloc d_SumArrayOutput failed!"); cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice); dim3 dimGridArray((n + blockSize - 1) / blockSize, 1, 1); dim3 dimBlockArray(blockSize, 1, 1); + dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / 
blockSize, 1, 1); + dim3 dimBlockSumArray(blockSize, 1, 1); + timer().startGpuTimer(); // First step: compute the scan result for individual sections // then, store their block sum to sumArray kernNaiveGPUScanFirstStep << > > (d_InputData, d_OutputData, d_SumArray, n); checkCUDAError("kernNaiveGPUScanFirstStep failed!"); + + kernNaiveGPUScanSecondStep << > > ( + d_SumArray, d_SumArrayOutput, sumArrayNumEle); + checkCUDAError("kernNaiveGPUScanSecondStep failed!"); + timer().endGpuTimer(); cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost); @@ -196,6 +205,9 @@ namespace StreamCompaction { // testing: cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost); checkCUDAError("memCpy back failed!"); + cudaMemcpy(sumArrayOutput, d_SumArrayOutput, sumArraySize, + cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); printf("\n"); for (int i = 0; i < sumArrayNumEle; i++) @@ -203,30 +215,40 @@ namespace StreamCompaction { std::cout << sumArray[i] << '\n'; } + + printf("\n"); + + for (int i = 0; i < sumArrayNumEle; i++) + { + std::cout << sumArrayOutput[i] << '\n'; + } + std::cout << '\n'; for (int i = 0; i < n; i++) { std::cout << odata[i] << '\n'; } + + + + + // cleanup cudaFree(d_InputData); cudaFree(d_OutputData); cudaFree(d_SumArray); - // cudaFree(d_SumArrayOutput); + cudaFree(d_SumArrayOutput); checkCUDAError("cudaFree failed!"); // testing clean up delete[] sumArray; - // delete[] sumArrayOutput; + delete[] sumArrayOutput; #if 0 - int* sumArrayOutput = new int[sumArrayNumEle]; - + - dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1); - dim3 dimBlockSumArray(blockSize, 1, 1); cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice); @@ -269,18 +291,6 @@ namespace StreamCompaction { - - cudaMemcpy(sumArrayOutput, d_SumArrayOutput, sumArraySize, - cudaMemcpyDeviceToHost); - checkCUDAError("memCpy back failed!"); - - printf("\n"); - - for (int i = 0; i < sumArrayNumEle; i++) - { - std::cout << 
sumArrayOutput[i] << '\n'; - } - printf("\n"); for (int i = 0; i < sumArrayNumEle; i++) From 38cc4a431247f5a50bb5186e37229d0c5c009d7a Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Mon, 20 Sep 2021 21:29:44 -0400 Subject: [PATCH 14/27] Before removing testing code --- src/main.cpp | 15 +++--- stream_compaction/naive.cu | 96 +++++++++----------------------------- 2 files changed, 30 insertions(+), 81 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 29860a5..8c30a68 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,7 @@ #include "testing_helpers.hpp" // The tests default to an array of size 1 << 8 = 256 -const int SIZE = 1 << 4; // feel free to change the size of array +const int SIZE = 1 << 8; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -78,7 +78,7 @@ int main(int argc, char* argv[]) { genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; - printArray(SIZE, a, false); + printArray(SIZE, a, true); // initialize b using StreamCompaction::CPU::scan you implement @@ -89,20 +89,19 @@ int main(int argc, char* argv[]) { printDesc("cpu scan, power-of-two"); StreamCompaction::CPU::scan(SIZE, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, false); + printArray(SIZE, b, true); printf("\n"); - -#if 0 - zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, false); + printArray(NPOT, b, true); printCmpResult(NPOT, b, c); +#if 0 + printArray(BOOK_SIZE, bookArraya, false); zeroArray(BOOK_SIZE, bookArrayb); printDesc("cpu scan, power-of-two"); @@ -119,7 +118,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); 
StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - printArray(SIZE, c, false); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3cc81ce..c14c74e 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -6,8 +6,8 @@ #include // testing /*! Block size used for CUDA kernel launch. */ -#define blockSize 8 -#define sectionSize 8 +#define blockSize 128 +#define sectionSize 128 namespace StreamCompaction { namespace Naive { @@ -142,9 +142,9 @@ namespace StreamCompaction { int* outputArray, int inputSize) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < inputSize) + if (i < inputSize && blockIdx.x > 0) { - outputArray[i] += inputArray[blockIdx.x]; + outputArray[i] += inputArray[blockIdx.x - 1]; } } @@ -159,8 +159,10 @@ namespace StreamCompaction { int* d_InputData; int* d_OutputData; + int* d_OutputExclusiveData; int* d_SumArray; int* d_SumArrayOutput; + // for testing int* sumArray = new int[sumArrayNumEle]; @@ -172,6 +174,9 @@ namespace StreamCompaction { cudaMalloc((void**)&d_OutputData, size); checkCUDAError("cudaMalloc d_OutputData failed!"); + cudaMalloc((void**)&d_OutputExclusiveData, size); + checkCUDAError("cudaMalloc d_OutputExclusiveData failed!"); + cudaMalloc((void**)&d_SumArray, sumArraySize); checkCUDAError("cudaMalloc d_SumArray failed!"); @@ -193,13 +198,25 @@ namespace StreamCompaction { d_OutputData, d_SumArray, n); checkCUDAError("kernNaiveGPUScanFirstStep failed!"); + cudaDeviceSynchronize(); + kernNaiveGPUScanSecondStep << > > ( d_SumArray, d_SumArrayOutput, sumArrayNumEle); checkCUDAError("kernNaiveGPUScanSecondStep failed!"); + cudaDeviceSynchronize(); + + kernNaiveGPUScanThirdStep << > > ( + d_SumArrayOutput, d_OutputData, n); + 
checkCUDAError("kernNaiveGPUScanThirdStep failed!"); + + convertFromInclusiveToExclusive << > > ( + d_OutputData, d_OutputExclusiveData, n); + checkCUDAError("convertFromInclusiveToExclusive failed!"); + timer().endGpuTimer(); - cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost); + cudaMemcpy(odata, d_OutputExclusiveData, size, cudaMemcpyDeviceToHost); checkCUDAError("memCpy back failed!"); // testing: @@ -237,6 +254,7 @@ namespace StreamCompaction { // cleanup cudaFree(d_InputData); cudaFree(d_OutputData); + cudaFree(d_OutputExclusiveData); cudaFree(d_SumArray); cudaFree(d_SumArrayOutput); checkCUDAError("cudaFree failed!"); @@ -244,74 +262,6 @@ namespace StreamCompaction { // testing clean up delete[] sumArray; delete[] sumArrayOutput; - -#if 0 - - - - - - cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice); - - - dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1); - dim3 dimBlockSumArray(blockSize, 1, 1); - - timer().startGpuTimer(); - // First step: compute the scan result for individual sections - // then, store their block sum to sumArray - kernNaiveGPUScanFirstStep <<>> (d_InputData, - d_OutputData, d_SumArray, n); - checkCUDAError("kernNaiveGPUScanFirstStep failed!"); - - cudaDeviceSynchronize(); - - cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost); - checkCUDAError("memCpy back failed!"); - - - - - kernNaiveGPUScanSecondStep << > > ( - d_SumArray, d_SumArrayOutput, sumArrayNumEle); - checkCUDAError("kernNaiveGPUScanSecondStep failed!"); - - cudaDeviceSynchronize(); - - kernNaiveGPUScanThirdStep <<>> ( - d_SumArrayOutput, d_OutputData, n); - checkCUDAError("kernNaiveGPUScanThirdStep failed!"); - - cudaDeviceSynchronize(); - - timer().endGpuTimer(); - - cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost); - checkCUDAError("memCpy back failed!"); - - - - printf("\n"); - - for (int i = 0; i < sumArrayNumEle; i++) - { - std::cout << sumArrayOutput[i] << '\n'; - } - printf("\n"); - for (int i = 0; 
i < n; i++) - { - std::cout << idata[i] << '\n'; - } - std::cout << '\n'; - for (int i = 0; i < n; i++) - { - std::cout << odata[i] << '\n'; - } - - - - -#endif } } } From ff897f19743d8c095ffefb53d0111519ccd94b1f Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Mon, 20 Sep 2021 21:35:01 -0400 Subject: [PATCH 15/27] Part 2: Naive GPU Scan Algorithm --- src/main.cpp | 2 +- stream_compaction/naive.cu | 42 -------------------------------------- 2 files changed, 1 insertion(+), 43 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 8c30a68..31c6861 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,7 @@ #include "testing_helpers.hpp" // The tests default to an array of size 1 << 8 = 256 -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 12; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index c14c74e..1b43a1c 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -152,7 +152,6 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { - // unitTestConversion(); int size = n * sizeof(int); int sumArrayNumEle = (n + blockSize - 1) / blockSize; int sumArraySize = sumArrayNumEle * sizeof(int); @@ -162,11 +161,6 @@ namespace StreamCompaction { int* d_OutputExclusiveData; int* d_SumArray; int* d_SumArrayOutput; - - - // for testing - int* sumArray = new int[sumArrayNumEle]; - int* sumArrayOutput = new int[sumArrayNumEle]; cudaMalloc((void**)&d_InputData, size); checkCUDAError("cudaMalloc d_InputData failed!"); @@ -219,38 +213,6 @@ namespace StreamCompaction { cudaMemcpy(odata, d_OutputExclusiveData, size, cudaMemcpyDeviceToHost); checkCUDAError("memCpy back failed!"); - // testing: - cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost); - checkCUDAError("memCpy back failed!"); - cudaMemcpy(sumArrayOutput, d_SumArrayOutput, sumArraySize, - cudaMemcpyDeviceToHost); - checkCUDAError("memCpy back failed!"); - - printf("\n"); - for (int i = 0; i < sumArrayNumEle; i++) - { - std::cout << sumArray[i] << '\n'; - } - - - printf("\n"); - - for (int i = 0; i < sumArrayNumEle; i++) - { - std::cout << sumArrayOutput[i] << '\n'; - } - - std::cout << '\n'; - for (int i = 0; i < n; i++) - { - std::cout << odata[i] << '\n'; - } - - - - - - // cleanup cudaFree(d_InputData); cudaFree(d_OutputData); @@ -258,10 +220,6 @@ namespace StreamCompaction { cudaFree(d_SumArray); cudaFree(d_SumArrayOutput); checkCUDAError("cudaFree failed!"); - - // testing clean up - delete[] sumArray; - delete[] sumArrayOutput; } } } From 39d39dd2ddefe893c2f9775a17a89f8a1a84c33b Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Mon, 20 Sep 2021 23:49:54 -0400 Subject: [PATCH 16/27] Part 3: Work-Efficient GPU Scan & Stream Compaction --- src/main.cpp | 55 +++--- stream_compaction/common.h | 4 + stream_compaction/efficient.cu | 319 ++++++++++++++++++++++++++++++++- stream_compaction/naive.cu | 14 +- 4 files changed, 346 insertions(+), 46 deletions(-) diff --git a/src/main.cpp 
b/src/main.cpp index 31c6861..3ba8919 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,7 @@ #include "testing_helpers.hpp" // The tests default to an array of size 1 << 8 = 256 -const int SIZE = 1 << 12; // feel free to change the size of array +const int SIZE = 1 << 8; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -100,27 +100,29 @@ int main(int argc, char* argv[]) { printArray(NPOT, b, true); printCmpResult(NPOT, b, c); -#if 0 - - printArray(BOOK_SIZE, bookArraya, false); - zeroArray(BOOK_SIZE, bookArrayb); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(BOOK_SIZE, bookArrayb, bookArraya); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(BOOK_SIZE, bookArrayb, false); - - printf("\n"); -#endif printf("\n"); + zeroArray(SIZE, c); + printDesc("work-efficient scan, power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, non-power-of-two"); + StreamCompaction::Efficient::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - printArray(SIZE, c, true); + // printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - + /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan onesArray(SIZE, c); printDesc("1s array for finding bugs"); @@ -134,23 +136,8 @@ 
int main(int argc, char* argv[]) { //printArray(SIZE, c, true); printCmpResult(NPOT, b, c); -#if 0 - - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); +#if 0 zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); @@ -167,7 +154,7 @@ int main(int argc, char* argv[]) { #endif -#if 0 + printf("\n"); printf("*****************************\n"); printf("** STREAM COMPACTION TESTS **\n"); @@ -206,21 +193,19 @@ int main(int argc, char* argv[]) { printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); - zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); -#endif system("pause"); // stop Win32 console from closing on exit delete[] a; diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 
509365b..d4732e8 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -10,6 +10,10 @@ #include #include +/*! Block size used for CUDA kernel launch. */ +#define blockSize 128 +#define sectionSize 128 + #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) // usage: checkCUDAError("a descriptive name of this error") #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..ea72d31 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,7 @@ #include #include "common.h" #include "efficient.h" +#include namespace StreamCompaction { namespace Efficient { @@ -12,13 +13,280 @@ namespace StreamCompaction { return timer; } + __global__ void convertFromInclusiveToExclusive(const int* inputArray, + int* outputArray, int inputSize) + { + int i = blockIdx.x * blockDim.x + threadIdx.x; + // convert inclusive scan into exclusive scan by shifting + // all elements to the right by one position and fill the frist + // element and out-of-bound elements with 0. 
+ if (i < inputSize && i != 0) + { + + outputArray[i] = inputArray[i - 1]; + } + else { + outputArray[i] = 0; + } + } + + __device__ void reductionStep(int *XY) + { + for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) + { + // make sure that input is in place + __syncthreads(); + int index = (threadIdx.x + 1) * stride * 2 - 1; + if (index < sectionSize) + { + XY[index] += XY[index - stride]; + } + } + } + + __device__ void postScanStep(int* XY) + { + for (unsigned int stride = sectionSize / 4; stride > 0; stride /= 2) + { + // make sure that input is in place + __syncthreads(); + int index = (threadIdx.x + 1) * stride * 2 - 1; + if ((index + stride) < sectionSize) + { + XY[index + stride] += XY[index]; + } + } + } + + __device__ void computeScanToOutputArray(const int* inputArray, int* outputArray, + int* XY, int inputSize) + { + int i = 2 * blockIdx.x * blockDim.x + threadIdx.x; + // each thread loads two input elements into the shared memory + if (i < inputSize) + { + XY[threadIdx.x] = inputArray[i]; + } + if (i + blockDim.x < inputSize) + { + XY[threadIdx.x + blockDim.x] = inputArray[i + blockDim.x]; + } + reductionStep(XY); + postScanStep(XY); + // each thread write two elements into the output array + __syncthreads(); + if (i < inputSize) + { + outputArray[i] = XY[threadIdx.x]; + } + if (i + blockDim.x < inputSize) + { + outputArray[i + blockDim.x] = XY[threadIdx.x + blockDim.x]; + } + } + + __global__ void kernWorkEfficientGPUScanFirstStep(const int* inputArray, + int* outputArray, int* SumArray, int inputSize) + { + __shared__ int XY[sectionSize]; + computeScanToOutputArray(inputArray, outputArray, XY, inputSize); + + // the last thread in the block should write the output value of + // the last XY element in the block to the blockIdx.x position of + // SumArray + + // make sure XY[sectionSize - 1] has the correct partial sum + __syncthreads(); + if (threadIdx.x == blockDim.x - 1) + { + SumArray[blockIdx.x] = XY[sectionSize - 1]; + } + } + + 
__global__ void kernWorkEfficientGPUScanSecondStep(const int* inputArray, + int* outputArray, int inputSize) + { + __shared__ int XY[sectionSize]; + computeScanToOutputArray(inputArray, outputArray, XY, inputSize); + } + + + __global__ void kernWorkEfficientGPUScanThirdStep(const int* inputArray, + int* outputArray, int inputSize) + { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < inputSize && blockIdx.x > 0) + { + outputArray[i] += inputArray[blockIdx.x - 1]; + } + } + + void scanWithoutTimer(int n, int* odata, const int* idata) { + int size = n * sizeof(int); + int sumArrayNumEle = (n + blockSize - 1) / blockSize; + int sumArraySize = sumArrayNumEle * sizeof(int); + + int* d_InputData; + int* d_OutputData; + int* d_OutputExclusiveData; + int* d_SumArray; + int* d_SumArrayOutput; + + cudaMalloc((void**)&d_InputData, size); + checkCUDAError("cudaMalloc d_InputData failed!"); + + cudaMalloc((void**)&d_OutputData, size); + checkCUDAError("cudaMalloc d_OutputData failed!"); + + cudaMalloc((void**)&d_OutputExclusiveData, size); + checkCUDAError("cudaMalloc d_OutputExclusiveData failed!"); + + cudaMalloc((void**)&d_SumArray, sumArraySize); + checkCUDAError("cudaMalloc d_SumArray failed!"); + + cudaMalloc((void**)&d_SumArrayOutput, sumArraySize); + checkCUDAError("cudaMalloc d_SumArrayOutput failed!"); + + cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice); + + // Only need to launch a kernel with (blockSize / 2) in a block + // b/c each thread loads/stores two elements + dim3 dimGridArrayEfficient((n + (blockSize / 2) - 1) / (blockSize / 2), 1, 1); + dim3 dimBlockArrayEfficient((blockSize / 2), 1, 1); + + dim3 dimGridSumArray((sumArrayNumEle + (blockSize / 2) - 1) / (blockSize / 2), 1, 1); + dim3 dimBlockSumArray((blockSize / 2), 1, 1); + + dim3 dimGridArray((n + blockSize - 1) / blockSize, 1, 1); + dim3 dimBlockArray(blockSize, 1, 1); + + // timer().startGpuTimer(); + + // First step: compute the scan result for individual sections + // then, 
store their block sum to sumArray + kernWorkEfficientGPUScanFirstStep << > > (d_InputData, d_OutputData, + d_SumArray, n); + checkCUDAError("kernNaiveGPUScanFirstStep failed!"); + + // cudaDeviceSynchronize(); + + // Second step: scan block sums + kernWorkEfficientGPUScanSecondStep << > > ( + d_SumArray, d_SumArrayOutput, sumArrayNumEle); + checkCUDAError("kernNaiveGPUScanSecondStep failed!"); + + // cudaDeviceSynchronize(); + + // Third step: add scanned block sum i to all values of scanned block + // i + 1 + kernWorkEfficientGPUScanThirdStep << > > ( + d_SumArrayOutput, d_OutputData, n); + checkCUDAError("kernNaiveGPUScanThirdStep failed!"); + + // cudaDeviceSynchronize(); + + // Last step: + convertFromInclusiveToExclusive << > > ( + d_OutputData, d_OutputExclusiveData, n); + checkCUDAError("convertFromInclusiveToExclusive failed!"); + // timer().endGpuTimer(); + + cudaMemcpy(odata, d_OutputExclusiveData, size, cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); + + // cleanup + cudaFree(d_InputData); + cudaFree(d_OutputData); + cudaFree(d_OutputExclusiveData); + cudaFree(d_SumArray); + cudaFree(d_SumArrayOutput); + checkCUDAError("cudaFree failed!"); + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + int size = n * sizeof(int); + int sumArrayNumEle = (n + blockSize - 1) / blockSize; + int sumArraySize = sumArrayNumEle * sizeof(int); + + int* d_InputData; + int* d_OutputData; + int* d_OutputExclusiveData; + int* d_SumArray; + int* d_SumArrayOutput; + + cudaMalloc((void**)&d_InputData, size); + checkCUDAError("cudaMalloc d_InputData failed!"); + + cudaMalloc((void**)&d_OutputData, size); + checkCUDAError("cudaMalloc d_OutputData failed!"); + + cudaMalloc((void**)&d_OutputExclusiveData, size); + checkCUDAError("cudaMalloc d_OutputExclusiveData failed!"); + + cudaMalloc((void**)&d_SumArray, sumArraySize); + checkCUDAError("cudaMalloc d_SumArray failed!"); + + cudaMalloc((void**)&d_SumArrayOutput, sumArraySize); + checkCUDAError("cudaMalloc d_SumArrayOutput failed!"); + + cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice); + + // Only need to launch a kernel with (blockSize / 2) in a block + // b/c each thread loads/stores two elements + dim3 dimGridArrayEfficient((n + (blockSize / 2) - 1) / (blockSize / 2), 1, 1); + dim3 dimBlockArrayEfficient((blockSize / 2), 1, 1); + + dim3 dimGridSumArray((sumArrayNumEle + (blockSize / 2) - 1) / (blockSize / 2), 1, 1); + dim3 dimBlockSumArray((blockSize / 2), 1, 1); + + dim3 dimGridArray((n + blockSize - 1) / blockSize, 1, 1); + dim3 dimBlockArray(blockSize, 1, 1); + timer().startGpuTimer(); - // TODO + + // First step: compute the scan result for individual sections + // then, store their block sum to sumArray + kernWorkEfficientGPUScanFirstStep <<> > (d_InputData, d_OutputData, + d_SumArray, n); + checkCUDAError("kernNaiveGPUScanFirstStep failed!"); + + // cudaDeviceSynchronize(); + + // Second step: scan block sums + kernWorkEfficientGPUScanSecondStep << >> ( + d_SumArray, d_SumArrayOutput, sumArrayNumEle); + checkCUDAError("kernNaiveGPUScanSecondStep failed!"); + + // cudaDeviceSynchronize(); + + // Third step: add scanned block sum i to all values of scanned 
block + // i + 1 + kernWorkEfficientGPUScanThirdStep << >> ( + d_SumArrayOutput, d_OutputData, n); + checkCUDAError("kernNaiveGPUScanThirdStep failed!"); + + // cudaDeviceSynchronize(); + + // Last step: + convertFromInclusiveToExclusive <<>> ( + d_OutputData, d_OutputExclusiveData, n); + checkCUDAError("convertFromInclusiveToExclusive failed!"); timer().endGpuTimer(); + + cudaMemcpy(odata, d_OutputExclusiveData, size, cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); + + // cleanup + cudaFree(d_InputData); + cudaFree(d_OutputData); + cudaFree(d_OutputExclusiveData); + cudaFree(d_SumArray); + cudaFree(d_SumArrayOutput); + checkCUDAError("cudaFree failed!"); } /** @@ -31,10 +299,51 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; + timer().startCpuTimer(); + + int numElement = 0; + std::unique_ptrtempArray{ new int[n] }; + std::unique_ptrscanResult{ new int[n] }; + for (int i = 0; i < n; i++) + { + scanResult[i] = -1; + } + + // STEP 1: Compute temp Array with 0s and 1s + // intialize array such that all elements meet criteria + for (int i = 0; i < n; i++) + { + tempArray[i] = 1; + } + // next, figure out which one doesn't meet criteria + for (int i = 0; i < n; i++) + { + // since we want to remove 0s, elements with value = 0 doesn't + // meet criteria + if (idata[i] == 0) + { + tempArray[i] = 0; + } + } + + // STEP 2: Run exclusive scan on tempArray + scanWithoutTimer(n, scanResult.get(), tempArray.get()); + + // STEP 3: scatter + for (int i = 0; i < n; i++) + { + // result of scan is index into final array + int index = scanResult[i]; + // only write an element if temp array has a 1 + if (tempArray[i] == 1) + { + odata[index] = idata[i]; + numElement++; + } + } + + timer().endCpuTimer(); + return n - numElement; } } } diff --git a/stream_compaction/naive.cu 
b/stream_compaction/naive.cu index 1b43a1c..8c038c4 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -5,10 +5,6 @@ #include // testing -/*! Block size used for CUDA kernel launch. */ -#define blockSize 128 -#define sectionSize 128 - namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -192,18 +188,24 @@ namespace StreamCompaction { d_OutputData, d_SumArray, n); checkCUDAError("kernNaiveGPUScanFirstStep failed!"); - cudaDeviceSynchronize(); + //(); + // Second step: scan block sums kernNaiveGPUScanSecondStep << > > ( d_SumArray, d_SumArrayOutput, sumArrayNumEle); checkCUDAError("kernNaiveGPUScanSecondStep failed!"); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); + // Third step: add scanned block sum i to all values of scanned block + // i + 1 kernNaiveGPUScanThirdStep << > > ( d_SumArrayOutput, d_OutputData, n); checkCUDAError("kernNaiveGPUScanThirdStep failed!"); + // cudaDeviceSynchronize(); + + // Last step: convertFromInclusiveToExclusive << > > ( d_OutputData, d_OutputExclusiveData, n); checkCUDAError("convertFromInclusiveToExclusive failed!"); From 20e9597f00e1b9edc45446ed1b73b917cf6721fc Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Tue, 21 Sep 2021 00:39:56 -0400 Subject: [PATCH 17/27] Part 4: Using Thrust's Implementation --- src/main.cpp | 7 ++----- stream_compaction/thrust.cu | 6 ++++++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 3ba8919..7602d1f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -137,23 +137,20 @@ int main(int argc, char* argv[]) { printCmpResult(NPOT, b, c); -#if 0 zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust 
scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); -#endif - printf("\n"); printf("*****************************\n"); diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..4320c2c 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,17 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + thrust::device_vector d_idata(idata, idata + n); + thrust::device_vector d_result(n); + timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(d_idata.begin(), d_idata.end(), d_result.begin()); timer().endGpuTimer(); + thrust::copy(d_result.begin(), d_result.end(), odata); } } } From 574f2a4603c384e345d498bf39075c42160c22be Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Tue, 21 Sep 2021 11:07:02 -0400 Subject: [PATCH 18/27] README first draft --- README.md | 111 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 101 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index e6e2358..b77bb72 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,112 @@ -CUDA Stream Compaction -====================== +

+

Prefix Sum and Stream Compaction

+

Author: (Charles) Zixin Zhang

+

+ CPU and GPU Implementations of the Exclusive Prefix Sum (Scan) Algorithm and Stream Compaction in CUDA C +

+

-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +--- +## Highlights -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +XXXXX -### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) + +Tested on: + +``` + [SM 8.6 NVIDIA GeForce RTX 3080] + Max threads per block: 1024 + Shared memory per block: 49152 bytes + Max threads per SM: 1536 + Max blocks per SM: 16 + Max grid size: 2147483647, 65535, 65535 +``` + +--- + +## Features + +- CPU Scan & Stream Compaction +- Naive GPU Scan Algorithm Using Shared Memory +- Work-Efficient GPU Scan Using Shared Memory & Stream Compaction +- Thrust's Scan Algorithm + +For all GPU Scan algorithms, I choose to implement inclusive Scan first, and then convert the result of inclusive Scan to exclusive Scan. This can be done in parallel with minimal code. + +## Performance Analysis + +### Block Size + +RTX 3080 Stats: + +``` + [SM 8.6 NVIDIA GeForce RTX 3080] + Max threads per block: 1024 + Shared memory per block: 49152 bytes + Max threads per SM: 1536 + Max blocks per SM: 16 + Max grid size: 2147483647, 65535, 65535 +``` + +I want to choose a block configuration that would result in the largest number of threads in the SM. + +:heavy_check_mark: 512 threads per block + +- You need 1536/512 = 3 blocks to fully occupy the SM. Fortunately, SM allows up to 16 blocks. Thus, the actual number of threads that can run on this SM is 3 * 512 = 1536. We have occupied 1536/1536 = 100% of the SM. + +## Naive Scan Analysis + +- Implemented ```NaiveGPUScan``` using shared memory. +- Each thread is assigned to evolve the contents of one element in the input array. +- This is largely a four step process: + - compute the scan result for individual sections. 
Then, store their block sum to ```sumArray``` + - scan block sums + - add scanned block sum ```i``` to all values of scanned block ```i + 1``` + - convert from inclusive to exclusive scan + +In my implementation, the naive kernel can process up to 128 elements in each section by using 128 threads in each block. If the input data consists of 1,000,000 elements, we can use ceil(1,000,000 / 128) = 7813 thread blocks. With up to 2147483647 thread blocks in the x-dimension of the grid, the naive kernel can process up to 2147483647 * 128 = around 274 billion elements. + +## Work Efficient Scan + +Understand thread to data mapping: + +```int index = (threadIdx.x + 1) * stride * 2 - 1;``` + +- (threadIdx.x + 1) shifts thread indices from 0, 1, 2, 3, ... to 1, 2, 3, 4, ... All indices become non-zero integers. +- (threadIdx.x + 1) * stride * 2 - 1 + - For example, when stride = 1, we want thread 0 to map to data index [1], thread 1 to map to data index [3], etc. + - (threadIdx.x + 1) * stride * 2 - 1 = (0 + 1) * 1 * 2 - 1 = 1 + - (threadIdx.x + 1) * stride * 2 - 1 = (1 + 1) * 1 * 2 - 1 = 3 + - For example, when stride = 2, we want thread 0 to map to data index [3], thread 1 to map to data index [7], etc. + - (threadIdx.x + 1) * stride * 2 - 1 = (0 + 1) * 2 * 2 - 1 = 3 + - (threadIdx.x + 1) * stride * 2 - 1 = (1 + 1) * 2 * 2 - 1 = 7 # Question + ``` genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; printArray(SIZE, a, true); ``` -Why leave 0? \ No newline at end of file +Why leave 0? + + + +## Bloopers + +### #1 + +``` +CUDA error (d:\dev\565\project2-stream-compaction\stream_compaction\naive.cu:84): memCpy back failed!: an illegal memory access was encountered + +83 cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost); +84 checkCUDAError("memCpy back failed!"); +``` + +- I encountered this error when implementing the naive version (without considering arbitrary-length inputs) of the scan algorithm. 
At first, I suspected the culprit was on line 83 (because line 84 reports the error). However, the culprit actually resides in my ```kernNaiveGPUScan``` function where I accessed ```XY[-1]``` inside the for loop. +- Fix: Need an if-statement to make sure we never access ```XY[-1]```. Also need to make sure the ```__syncthreads()``` calls are **not** inside the if-statement. + +> When a ```__syncthreads()``` statement is placed in an if-statement, either all or none of the threads in a block execute the path that includes the __syncthreads(). PMPP p.59 + From 3de29d57d1b2144b9fe6f0c45a8f4f8290ecb607 Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Tue, 21 Sep 2021 11:15:17 -0400 Subject: [PATCH 19/27] first draft --- README.md | 81 +++++++- .../CUDA Flocking-checkpoint.ipynb | 189 ++++++++++++++++++ images/plotting/CUDA Flocking.ipynb | 91 +++++++++ images/scan.png | Bin 0 -> 24790 bytes src/main.cpp | 2 +- stream_compaction/common.h | 4 +- 6 files changed, 354 insertions(+), 13 deletions(-) create mode 100644 images/plotting/.ipynb_checkpoints/CUDA Flocking-checkpoint.ipynb create mode 100644 images/plotting/CUDA Flocking.ipynb create mode 100644 images/scan.png diff --git a/README.md b/README.md index b77bb72..dec8797 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,76 @@ For all GPU Scan algorithms, I choose to implement inclusive Scan first, and the ## Performance Analysis +![scan](images/scan.png) + +When the array size is under 20,000, CPU Scan performs better than other algorithms. As the array size increases, GPU Naive Scan performs better than the rest of the algorithms. The Thrust implementation has more stable performance than the rest of the algorithms. + +Output when array size is 65536: + +``` +**************** +** SCAN TESTS ** +**************** + [ 27 40 6 30 21 41 41 26 20 5 6 29 41 ... 32 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0972ms (std::chrono Measured) + [ 0 27 67 73 103 124 165 206 232 252 257 263 292 ... 
1599954 1599986 ] + +==== cpu scan, non-power-of-two ==== + elapsed time: 0.085ms (std::chrono Measured) + [ 0 27 67 73 103 124 165 206 232 252 257 263 292 ... 1599856 1599858 ] + passed + +==== work-efficient scan, power-of-two ==== + elapsed time: 0.178144ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.096544ms (CUDA Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 0.091232ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.182464ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.10432ms (CUDA Measured) + [ 0 27 67 73 103 124 165 206 232 252 257 263 292 ... 1599954 1599986 ] + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.075776ms (CUDA Measured) + [ 0 27 67 73 103 124 165 206 232 252 257 263 292 ... 1599856 1599858 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 1 0 1 3 3 2 1 0 1 2 1 2 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.1293ms (std::chrono Measured) + [ 1 1 3 3 2 1 1 2 1 2 2 1 3 ... 3 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.1319ms (std::chrono Measured) + [ 1 1 3 3 2 1 1 2 1 2 2 1 3 ... 3 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.6768ms (std::chrono Measured) + [ 1 1 3 3 2 1 1 2 1 2 2 1 3 ... 3 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.096544ms (CUDA Measured) + [ 1 1 3 3 2 1 1 2 1 2 2 1 3 ... 3 2 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.096544ms (CUDA Measured) + [ 1 1 3 3 2 1 1 2 1 2 2 1 3 ... 3 3 ] + passed +Press any key to continue . . . +``` + + + ### Block Size RTX 3080 Stats: @@ -56,7 +126,7 @@ I want to choose a block configuration that would result in the largest number o - You need 1536/512 = 3 blocks to fully occupy the SM. 
Fortunately, SM allows up to 16 blocks. Thus, the actual number of threads that can run on this SM is 3 * 512 = 1536. We have occupied 1536/1536 = 100% of the SM. -## Naive Scan Analysis +## Naive Scan - Implemented ```NaiveGPUScan``` using shared memory. - Each thread is assigned to evolve the contents of one element in the input array. @@ -83,15 +153,6 @@ Understand thread to data mapping: - (threadIdx.x + 1) * stride * 2 - 1 = (0 + 1) * 2 * 2 - 1 = 3 - (threadIdx.x + 1) * stride * 2 - 1 = (1 + 1) * 2 * 2 - 1 = 7 -# Question - -``` -genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); -``` -Why leave 0? - ## Bloopers diff --git a/images/plotting/.ipynb_checkpoints/CUDA Flocking-checkpoint.ipynb b/images/plotting/.ipynb_checkpoints/CUDA Flocking-checkpoint.ipynb new file mode 100644 index 0000000..fca51ef --- /dev/null +++ b/images/plotting/.ipynb_checkpoints/CUDA Flocking-checkpoint.ipynb @@ -0,0 +1,189 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 39, + "id": "1f1923e1", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAEWCAYAAAB42tAoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAABJmElEQVR4nO3dd3hUVfrA8e+bTiChJHQICb0kEDoIIiC6gEizgSjgoqyLuGvZXXH5rYvuooh1sXcsLIKsAiKi0kFQAQ2E3kIvIYRAIIWU8/vj3oRJT2AmySTv53nyzMy5d845dwjz5txz73nFGINSSilVEI+y7oBSSqnyTQOFUkqpQmmgUEopVSgNFEoppQqlgUIppVShNFAopZQqlAYK5fZE5G0R+YfD6z+KyGkRuSgiQWXZN3clIodEZEBZ90OVDxoolNszxjxojPkXgIh4Ay8DNxtjqhljzpZFn0Skr4gYEflbWbSvlDNpoFAVTV3AD9hR0jeKxVn/J8YB8fajS4iIl6vqVsqRBgpVLth/fTd3eD1bRP5tP+8rIsdE5HERiRWRkyJyX+59RaQlsMcuThCRlfb260Rkk4ictx+vc3jvahGZLiI/AklAU7svk0Rkn4gkisi/RKSZiGwUkQsiMl9EfAo5Fn/gduAhoIWIdHHYFmrXP1FETtjH8rjD9mkiskBE5tlt/yoiHRy2HxKRJ0RkG3BJRLxEZKiI7BCRBPt42jjsP0VEDth17RSREbn6+oCI7HLY3slhc6SIbLM/t3ki4lfEP6OqoDRQKHdRD6gONAQmAG+ISE3HHYwxe4F29ssaxpj+IlIL+AaYBQRhnZb6Jtfcxb3ARCAAOGyXDQQ6Az2AvwHvAmOAxkA4MLqQvt4GXAS+AL4DxuazTz+gBXAzMCXXfMAw+721gP8CC+1TallGA7cANYCmwFzgEaA2sBT42iGQHQCux/rsngY+E5H6ACJyBzDN7l8gMBRwPFV3p/05hAHtgfGFHLOqwDRQKHeRBjxjjEkzxizF+iJuVYz33QLsM8Z8aoxJN8bMBXYDtzrsM9sYs8PenmaXPW+MuWCM2QFsB743xhw0xpwHvgU6FtLmOGCeMSYD64t+dK4veoCnjTGXjDHRwEfkDDxbjDEL7L68jHUqrYfD9lnGmKPGmGTgLuAbY8wP9v4vAlWA6wCMMV8YY04YYzKNMfOAfUA3u577gZnGmE3Gst8YczhXOyeMMfHA10BkIcesKjANFMpdnDXGpDu8TgKqFeN9DbgySshyGGtkkuVoPu877fA8OZ/X+bYtIo2xRgtz7KJFWF/0t+Ta1bHNw3Y/82wzxmQCxwraTq7js/c/in18IjJWRKLs01IJWKOhYHv3xlgjjoKccnhe3M9bVUAaKFR5kQT4O7yu56R6TwBNcpWFAMcdXjtzCeV7sf5ffS0ip4CDWIEi9+mnxrn6cyK/bfbkeqNc2x37m+P4RETs9x8XkSbAe8BkIMgYUwNrdCT27keBZiU7PFUZaaBQ5UUUcLeIeIrIQOAGJ9W7FGgpInfbE793AW2BJU6qP7exWHMBkQ4/twG35JoX+YeI+ItIO+A+YJ7Dts4iMtK+qukRIBX4qYD25tt132if3nrc3n8DUBUrqJwBsC8ACHd47/vAX0Sks33FV3M7uCiVgwYKVV78GWveIAFr0nihMyq176MYgvUFehZrYnqIMSbOGfU7EpEeQCjwhjHmlMPPYmA/Oech1thlK4AXjTHfO2xbhDX3cA5rhDLSYe4kB2PMHuAe4DUgDuszvNUYc9kYsxN4CdiIdeosAvjR4b1fANOx5lESsT7zWtfyGaiKSTRxkVKlR0RCgRjAO9ecS9b2aUBzY8w9pdw1pQqkIwqllFKF0
kChlFKqUHrqSSmlVKF0RKGUUqpQbr2oWHBwsAkNDS3rbiillFvZsmVLnDGmdnH3d+tAERoayubNm8u6G0op5VZEJPdqBYXSU09KKaUKpYFCKaVUoTRQKKWUKpRbz1EoVdmkpaVx7NgxUlJSyroryg34+fnRqFEjvL1zr3JfMhoolHIjx44dIyAggNDQUKyFYpXKnzGGs2fPcuzYMcLCwq6pLpeeehKRGnZax912usWeIlJLRH6w00z+kJWlzF69cpaI7LfTL3Yqqv6rMTd6DuFvhuL5jAfhb4YyN3pO0W9SqpxISUkhKChIg4QqkogQFBTklNGnq+co/gMsM8a0BjoAu4ApwApjTAuslTOn2PsOwkoN2QIrLeVbzu7M3Og5TF05kdcGHSZlquG1QYeZunKiBgvlVjRIqOJy1u+KywKFiAQCfYAPAOxljxOw8gF/bO/2MTDcfj4M+MROyfgTUCMrt6+zTF83lQ+GJtEvDLw9oV8YfDA0ienrpjqzGaWUqlBcOaJoipUw5SMR+U1E3heRqkBdY8xJAPuxjr1/Q3KmeDxGznSVAIjIRBHZLCKbz5w5U6IO7Yo7Qu+QnGW9Q6xypVTRHn30UV599dXs17/73e+4//77s18//vjjvPzyyyxevJgZM2YAsHDhQnbu3Jm9T9++fYu8UTYsLIw9e/bkKHvkkUeYOXMmb7/9Np988okTjuaK0NBQ4uKsFCXXXXfdVdXx7LPP5nh9tfWUR64MFF5AJ+AtY0xH4BJXTjPlJ78xUp4VC40x7xpjuhhjutSuXew70AFoExzC+lwxYf0Rq1ypisjZc3LXXXcdGzZsACAzM5O4uDh27NiRvX3Dhg306tWLoUOHMmWK9d89d6AojlGjRvH5559nv87MzGTBggXcddddPPjgg4wdmzuzrPNkHV9J5Q4UV1tPeeTKQHEMOGaM+dl+vQArcJzOOqVkP8Y67O+YRzh3nuBrNvX66UxY7M+qGEjLgFUxMGGhD1Ovn+7MZpQqF1wxJ9erV6/sL8AdO3YQHh5OQEAA586dIzU1lV27dtGxY0dmz57N5MmT2bBhA4sXL+avf/0rkZGRHDhwAIAvvviCbt260bJlS9atW5enndGjR+cIFGvXriU0NJQmTZowbdo0XnzxRQBmzZpF27Ztad++PaNGjQLIsR0gPDycQ4cOATB8+HA6d+5Mu3btePfdd/M9xmrVqgHw1FNPERkZSWRkJA0bNuS+++4rsI4pU6aQnJxMZGQkY8aMyVGPMYa//vWvhIeHExERwbx5Vtbb1atX07dvX26//XZat27NmDFjKK+rebvs8lhjzCkROSoirex0jTcCO+2fccAM+3GR/ZbFwGQR+RzoDpzPOkXlLKMjrH/Ah7+dyq64I7Tx82G61GF02zud2YxSpeLpr3ew88SFArevPv84n99hzcnBlTm5UV88zsINTfN9T9sGgfzz1nYF1tmgQQO8vLw4cuQIGzZsoGfPnhw/fpyNGzdSvXp12rdvj4+PT/b+1113HUOHDmXIkCHcfvvt2eXp6en88ssvLF26lKeffprly5fnaKd9+/Z4eHiwdetWOnTowOeff87o0aPJbcaMGcTExODr60tCQkKB/c7y4YcfUqtWLZKTk+natSu33XYbQUFB+e77zDPP8Mwzz3D+/Hmuv/56Jk+eXGAdM2bM4PXXXycqKipPPV9++SVRUVFs3bqVuLg4unbtSp8+fQD47bff2LFjBw0aNKBXr178+OOP9O7du8jjKG2uvurpYWCOiGzDSjL/LFaAuElE9gE32a8BlgIHsfIIvwdMckWHRkeMYfukQ2Q8lcn2YQsZfek8bJvviqaUKlNnkmPznZM7kxyb/xuKKWtUkRUoevbsmf26uOflR44cCUDnzp2z/9rPLWtUkZ6ezqJFi7jjjjvy7NO+fXvGjBnDZ599hpdX0X/3zpo1iw4dOtCjRw+OHj3Kvn37Ct3fGMOYMWN49NFH6dy581XVsX79ekaPH
o2npyd169blhhtuYNOmTQB069aNRo0a4eHhQWRkZIGfRVlz6Q13xpgooEs+m27MZ18DPOTK/uTR8ndQPxLWvgDt7wJPvf9QuY/C/vIH2PFmCOuPHM4eUYA1J9e2dgjz/tDzqtvNmqeIjo4mPDycxo0b89JLLxEYGMjvf//7YtXh6+sLgKenJ+npeVKHA1aguPnmm7nhhhto3749derUybPPN998w9q1a1m8eDH/+te/2LFjB15eXmRmZmbvk3UfwerVq1m+fDkbN27E39+fvn37FnmPwbRp02jUqFH2aaerqaOw00lZnwMU/lmUtcq91pMI9J0C52Jg27yy7o1STpXvnNxi/2uek+vVqxdLliyhVq1aeHp6UqtWLRISEti4cSM9e+YNQAEBASQmJpa4nWbNmhEUFMSUKVPyPe2UmZnJ0aNH6devHzNnziQhIYGLFy8SGhrKr7/+CsCvv/5KTEwMAOfPn6dmzZr4+/uze/dufvrpp0LbX7JkCT/88AOzZs3KLiusDm9vb9LS0vLU06dPH+bNm0dGRgZnzpxh7dq1dOvWrcSfR1mq3IECoOVAqN/BGlVklM9ortTVGB0xhun93+Xhb5vgN114+NsmTO//bvZc3dWKiIggLi6OHj165CirXr06wcHBefYfNWoUL7zwAh07dsyezC72MYweze7duxkxYkSebRkZGdxzzz1ERETQsWNHHn30UWrUqMFtt91GfHw8kZGRvPXWW7Rs2RKAgQMHkp6eTvv27fnHP/6Ro//5eemllzhx4gTdunUjMjKSp556qtA6Jk6cmH0qzNGIESNo3749HTp0oH///sycOZN69eqV6HMoa26dM7tLly7GKYmLdi+Fz0fD8Lcg8u5rr08pF9m1axdt2rQp624oN5Lf74yIbDHG5DctkC8dUQC0GgT12sOamTqqUEqpXDRQgD1X8aQ1VxH9RVn3RimlyhUNFFmyRhVrdVShlFKONFBkyboCKv6gjiqUUsqBBgpHrQZDvQi9AkoppRy4OnHRIRGJFpEoEdlsl00TkeN2WZSIDHbY/0k7cdEeEfmdK/tWQIfhhikQfwC2Lyj15pVSqjwqjRFFP2NMZK5LsV6xyyKNMUsBRKQtMApoBwwE3hQRz1LoX06tb7FGFXoFlFJ5lNYy48WVe8XWLOPHj+edd97JUbZw4UIGDx7M5s2b+dOf/uSU9h3bW7DA+uPy/vvvL/FquQCzZ8/mxIkr66BebT2uUJ5OPQ0DPjfGpBpjYrDWfCr92xd1VKEqEHddZry4CgoUuVefBbIXFuzSpUuOu62d7f3336dt27Ylfl/uQHG19biCqwOFAb4XkS0iMtGhfLKdF/vDrJzZlELiomJrfQvU1bkK5d7cbZnxlJQU7rvvvuw7rVetWgWQXVeWIUOGsHr16nyX9s4yYMAAdu/ezcmT1gLUSUlJLF++nOHDh7N69WqGDBkCwJo1a7KXEu/YsSOJiYk5tgNMnjyZ2bNnA9aKsl27diU8PJyJEyfmu45T1ohp8eLF2XW3atWKsLCwAutYsGABmzdvZsyYMURGRpKcnJxj5DV37lwiIiIIDw/niSeeyG6rWrVqTJ06NXuRwtOnT1/lv2zhXB0oehljOmHlw35IRPpg5cJuhrWa7EngJXtflycuKjYR6PsEnN0P2//nmjaUulbfToGPbinwZ/q39+ef+vfb+wt+37eF5RbLf5nx7t27s3HjRjZv3lzgMuMvvPACUVFRNGvWDLiyzPirr77K008/DcAbb7wBQHR0NHPnzmXcuHGFLrg3Y8YMqlSpQlRUFHPm5Ax+np6ejBw5kvnzrZWhFy9eTL9+/QgICMix34svvsgbb7xBVFQU69ato0qVKoUe/+TJk9m0aRPbt28nOTmZJUuWFLjv0KFDiYqKIioqig4dOvCXv/ylwDpuv/12unTpwpw5c4iKisrRjxMnTvDEE0+wcuVKoqKi2LRpEwsXLgTg0qVL9OjRg61bt9KnTx/ee++9Q
vt/tVwaKIwxJ+zHWOAroJsx5rQxJsMYk4m1nHjW6SWXJy4qkVZZo4qZkJlRZt1Q6mrtSk7JP/VvcuGrnRbFVcuMr1+/nnvvvReA1q1b06RJE/bu3XvV/XQ8/VRQPotevXrx2GOPMWvWLBISEopcqnzVqlV0796diIgIVq5cmeO0W0FmzpxJlSpVeOihh66qjk2bNtG3b19q166Nl5cXY8aMYe3atQD4+Phkj34KW7L9WrlsXW07P7aHMSbRfn4z8IyI1HdISDQC2G4/Xwz8V0ReBhoALYBfXNW/Inl4wA1/g/n3WqOK9prcSJUzg2YUurnNm6H5LjPepnYTuO+bq27WVcuMF7TuXEHLhhelV69enDx5kq1bt7Jhw4Y8cxZgZaa75ZZbWLp0KT169GD58uUFtpeSksKkSZPYvHkzjRs3Ztq0aUX2ZcWKFXzxxRfZX+xXU0dh6/F5e3sjYp2MceUy5a4cUdQF1ovIVqwv/G+MMcuAmfYls9uAfsCjAMaYHcB8rAx4y4CHjDFl+6d86yFQNxzWPK+jCuV23G2Z8T59+mSfQtq7dy9HjhyhVatWhIaGEhUVlb2s+C+/XPn7saClvQFEhDvvvJNx48YxePBg/Pz88uxz4MABIiIieOKJJ+jSpQu7d++mSZMm7Ny5k9TUVM6fP8+KFSuAKwEjODiYixcvZl/lVJDDhw8zadIk5s+fn30qqbA6Cvqcunfvzpo1a4iLiyMjI4O5c+dyww03FNq2s7kyFepBoEM+5fcW8p7pQPlJYO3hATc8oaMK5ZbypP4NDmF6/+lOW2b87rvvzlF28eLFApcZf+CBB5g1a1ahX66TJk3iwQcfJCIiAi8vL2bPno2vry+9evUiLCwsezK3U6dO2e/JWtq7U6dOeeYpwDr99MILL2Rfqpvbq6++yqpVq/D09KRt27YMGjQIX19f7rzzTtq3b0+LFi3o2LEjADVq1OCBBx4gIiKC0NBQunbtWujnNHv2bM6ePZu9RHqDBg1YunRpgXWMHz+eBx98kCpVqrBx48bs8vr16/Pcc8/Rr18/jDEMHjyYYcOGFdq2s+ky40XJzIS3e0PGZXjoZ/Ao/Vs7lMqiy4yrktJlxkuDh4d9BdQ+2P5lWfdGKaVKnQaK4mh9K9Rpp3MVSqlKSQNFcWRdAXV2H+z4qqx7oyo5dz5drEqXs35XNFAUV5uhUKetjipUmfLz8+Ps2bMaLFSRjDGcPXs236u9SsplVz1VOFlXQH0xzhpVRNxe1j1SlVCjRo04duwYLlu+RlUofn5+NGrU6Jrr0UBREtmjipnQboReAaVKnbe3d/aaQUqVFj31VBJZcxVxe3SuQilVaZRF4qJaIvKDiOyzH2va5SIis+zERdtEpFPhtZeRNsOgdhtrVKFzFUqpSqAsEhdNAVYYY1oAK+zXYK0w28L+mYi1ymz54ziq2LmwrHujlFIuVxannoYBH9vPPwaGO5R/Yiw/ATVEpH4Z9K9obYdD7dY6qlBKVQplkbiobtbqsfZjHbu8/CQuKkrWFVBnduuoQilV4ZVF4qKClJ/ERcWRY1SRWeTuSinlrko9cRFwOuuUkv0Ya+9evhIXFSVrrkJHFUqpCs5lgUJEqopIQNZzrMRF27ESFI2zdxsHLLKfLwbG2lc/9QDOOyQ4Kp/aDofgVjqqUEpVaGWRuGgGcJOI7ANusl8DLAUOAvuxUqROcmHfnMPD0x5V7IJdi4reXyml3JDmo7hWmRnwZk8QD/jjBuuUlFJKlWOaj6K06ahCKVXBaaBwhnYjILilzlUopSokDRTO4OFp3VcRuxN2LS7r3iillFNpoHCW7FHF8zqqUEpVKBoonMXDE/r8zRpV7P66rHujlFJOo4HCmcJHQlALWK2jCqVUxaGBwpmyroCK3aGjCqVUhaGBwtnCb4Og5noFlFKqwnB5oBARTxH5TUSW2K9ni0iMncwoSkQi7XL3SFxUlKwroE5vh91Lyro3Sil1zUpjRPFnYFeus
r/ayYwijTFRdpl7JC4qjuxRhc5VKKXcn6tToTYCbgHeL8bu7pO4qChZV0Cd3g57vinr3iil1DVx9YjiVeBvQO4/q6fbp5deERFfu6xUEhfNjZ5D+JuheD7jQfibocyNnlPiOoola1ShV0AppdycK5cZHwLEGmO25Nr0JNAa6ArUAp7Ieks+1Tg1cdHc6DlMXTmR1wYdJmWq4bVBh5m6cqJrgoWnF/T5K5yO1lGFUsqtuXJE0QsYKiKHgM+B/iLymTHmpH16KRX4CCuZEZRC4qLp66bywdAk+oWBtyf0C4MPhiYxfd1UZzZzRfjtUKuZNVfhxqv0KqUqN5cFCmPMk8aYRsaYUGAUsNIYc49DdjsBhmMlM4JSSFy0K+4IvUNylvUOscpdwtPLuq/iVDTs1lGFUso9lcV9FHNEJBqIBoKBf9vlLk9c1CY4hPW5YsL6I1a5y4TfDrWawpoZOqpQSrmlUgkUxpjVxpgh9vP+xpgIY0y4MeYeY8xFu9wYYx4yxjSztzs9I9HU66czYbE/q2IgLQNWxcDYr/yYev10Zzd1haeXdQXUqWjYs9R17SillIt4lXUHStPoiDEAPPztVHbFHaG+fz3Szo8hotYQ1zYccQesnQmrn4NWg0Hym7dXSqnyqdIt4TE6YgzbJx0i46lMdkw6TGPf3zH1q2gyMl14WijrCigdVSil3FClCxSOqvt7848hbdh67Dxzfj7s2sYi7oSaYbBa5yqUUu6lUgcKgKEdGnB9i2BmLtvD6Qsprmso+wqobbDnW9e1o5RSTlbpA4WI8O/h4aRlZPL01ztc21j2qOI5HVUopdxGpQ8UAE2CqvKnG1uwNPoUK3efdl1D2XMV22DvMte1o5RSTqSBwvbA9U1pUaca/1i4g6TL6a5rqP1dOqpQSrkVDRQ2Hy8Pnh0ZwfGEZP6zYp/rGvL0gj5/gZNbdVShlHILZZG4KExEfhaRfSIyT0R87HJf+/V+e3uoq/uWW9fQWozq2pj318Ww6+QF1zXU/i6oGapXQCml3EJZJC56HnjFGNMCOAdMsMsnAOeMMc2BV+z9St2UQa2pUcWbv38VTaar7q3w9LbmKk5Gwd7vXNOGUko5SakmLrIXAuwPLLB3+RhrYUCwEhd9bD9fANxo71+qavj78H9D2vDbkQT++4uLFgsEa1RRo4nOVSilyr3STlwUBCQYY7Jmix2TE2UnLrK3n7f3z+FaExcVx/DIhvRqHsTzy3YTm+iieyscRxX7vndNG0op5QSlnbiosORELk9cVFzWvRURpKZn8q8ludN9O1GHUTqqUEqVe6WauAhrhFFDRLIWI3RMTpSduMjeXh2Id2H/ChUWXJXJ/Zrz9dYTrN4T65pGPL2tK6BO/KajCqVUuVXaiYvGAKuA2+3dxgGL7OeL7dfY21caU7Z/Zv/hhqY0rV2VfyzaTvLlDNc00mE01AjRK6CUUuVWWdxH8QTwmIjsx5qD+MAu/wAIsssfA6aUQd9y8PXy5NkRERyNT+a1lS66tyJrruLEr7DvB9e0oZRS10DK+I/2a9KlSxezebPT8xvl8ZcvtrLwt+N886fraVUvwPkNZKTBa53APxgeWKn5KpRSLiUiW4wxXYq7v96ZXQx/H9yGAD8vprrq3gpPb7j+LzqqUEqVSxooiqFWVR+m3tKWzYfPMW/zUdc00mE0VA/R3NpKqXJHA0Ux3dapIT2a1uK5pbs4k5jq/Aa8fKDP43B8C+xf7vz6lVLqKmmgKKaseytS0jKZ/s1O1zTS4W5rVKH3VSilyhENFCXQvE41HuzbjIVRJ1i3zwV3hXv5wPWP2aOKFc6vXymlroIGihKa1LcZYcFV+cfC7aSkueDeisgxUL2xjiqUUuWGBooS8vP2ZPrwcA6dTeKNVfud34CXD1z/OBzfrKMKpVS5oIHiKlzXPJiRHRvy9poD7I9NdH4DWaMKvQJKKVUOuHJRQD8R+UVEtorIDhF52i6fLSIxIhJl/0Ta5SIis+zER
dtEpJOr+uYMU29pQ1VfL/7+5Xbn31uRNVdxbBMc0FGFUqpsuXJEkQr0N8Z0ACKBgSLSw972V2NMpP0TZZcNAlrYPxOBt1zYt2sWVM2XJwe15pdD8SzYcsz5DUTeA4GNdA0opVSZc+WigMYYc9F+6W3/FPaNNwz4xH7fT1irzNZ3Vf+c4Y7OjekWWotnv93F2YtOvrcix6hipXPrVkqpEnB1hjtPEYkCYoEfjDE/25um26eXXhERX7ssO3GRzTGpkWOdLk9cVFweHsL0EeFcSk1n+lIX5K3oqKMKpVTZc2mgMMZkGGMisfJOdBORcOBJoDXQFaiFtZoslKPERSXRom4Af+jTjC9/Pc6GA3HOrdzL1x5V/KKjCqVUmSmVq56MMQnAamCgMeakfXopFfgI6Gbvlp24yOaY1Khcm9y/OU2C/Pm/r1xwb0XHeyCwIax5XkcVSqky4cqrnmqLSA37eRVgALA7a95BRAQYDmy337IYGGtf/dQDOG+MOemq/jmTn7cn/x4ezsG4S7y1+oBzK88aVRz9GQ6ucm7dSilVDK4cUdQHVonINmAT1hzFEmCOiEQD0UAw8G97/6XAQWA/8B4wyYV9c7rrW9RmWGQD3lp9gANnLhb9hpLoeK81qtC5CqVUGdDERU50JjGVG19aTdsGgcx9oAfizAREv7wHS/8C9y6EZv2cV69SqtLRxEVlqHaAL1MGteGng/F8+etx51beaSwENNBRhVKq1GmgcLJRXRvTuUlN/v3NTuIvXXZexdlzFT/BwdXOq1cppYpQaKAQEX8R8XZ43UpEHhWRka7vmnvKurciMSWd55x9b0XWqEKvgFJKlaKiRhTLgFAAEWkObASaAg+JyHOu7Zr7al0vkAf6NOWLLcf46eBZ51WcNao4shFi1jivXqWUKkRRgaKmMWaf/XwcMNcY8zDWukxDXNozN/en/i1oXKsKU7+KJjXdifdWdLxX5yqUUqWqqEDh+E3UH/gBwBhzGch0Vacqgio+njwzLJwDZy7x7pqDzqvY2w96P2qPKtY6r16llCpAUYFim4i8KCKPAc2B7wGybqRThevXqg63tK/Pa6v2ExN3yXkVdxoLAfV1VKGUKhVFBYoHgDggBLjZGJNkl7cFXnRlxyqKfw5pi6+nB/+3MBqn3bPi7Qe9H4MjG3RUoZRyuUIDhTEmGfgOWA9cdijfYIz5tLD3FpK4KExEfhaRfSIyT0R87HJf+/V+e3votR5ceVAn0I+/DWzFj/vPsijKiUtXdRrLXD9/whcMxPMZD8LfDGVu9Bzn1a+UUraiLo99CpgH3AZ8IyIPlKDughIXPQ+8YoxpAZwDJtj7TwDOGWOaA6/Y+1UId3dvQmTjGvxryU4Skpxzb8Xc3f9jqm88r92RQspUw2uDDjN15UQNFkoppyvq1NNdQKQxZjTWsuATi1txIYmL+gML7PKPsRYGBCtx0cf28wXAjeLUNTDKjqeH8OyICBKS03h+2W6n1Dl93VQ+GH6ZfmHg7Qn9wuCDoUlMXzfVKfUrpVSWogJFSta8hDHmbDH2zyF34iLgAJBgjEm3d3FMTpSduMjefh4IyqfOcpO4qCTaNghkQu8w5v5ylE2H4q+5vl1xR+gdkrOsd4hVrpRSzlTUF38zEVls/3yd6/XioirPnbgIaJPfbvajWyYuKolHBrSgYY0q/P3LaC6nX9vVxW2CQ1ifKyasPwJtghvn/wallLpKRQWKYcBL9s+LuV6/VNxGHBIX9cDKhe1lb3JMTpSduMjeXh249j+9yxF/Hy+eGdaOfbEXeW/dtd1bMfX66UxY7M+qGEjLgFUxMOF/MNWvGWTqLS5KKefxKmJ7jDHmqs5liEhtIM0Yk+CQuOh5YBVwO/A51t3ei+y3LLZfb7S3rzTuvAZ6AW5sU5dB4fWYtWIfQ9rXp0lQ1auqZ3TEGAAe/nYqu+KO0CY4hOkhPRi981tY8gjc+h+oGFM8SqkyVmg+ChH51RjTyX7+P2PMbcWuW
KQ91uS0J9bIZb4x5hkRaYoVJGoBvwH3GGNSRcQP+BToiDWSGGWMKfTP7vKWj6K4Tp1PYcDLa+jUpCYf39fVeXkrjIEVz8D6l6HbH2DQ8xoslFJ5lDQfRVEjCsdvmaYl6YgxZhvWl37u8oNcyZPtWJ4C3FGSNtxVvep+/OXmlkz7eidfbzvJ0A4NnFOxCNz4FKSnwE9vWjfmDXhag4VS6pqUZK2nCncaqCzd2zOU9o2q88zXOzmfnOa8ikXgd89Cl9/Dj/+xlvlQSqlrUFSg6CAiF0QkEWhvP78gIokicqE0OlhRZd1bEX8plZlOurcimwgMfgkix8CaGbD+FefWr5SqVAo99WSM8SytjlRG4Q2rc1+vMD5YH8PITo3o3KSm8yr38IChr0F6KiyfBl5+0OOPzqtfKVVpaCrUMvbYTS2pX92PqV9Fk5bh5MtaPTxhxNvQeggsmwKbP3Ju/UqpSkEDRRmr6uvF00PbsftUIh+sj3F+A57ecPtH0OJmWPIoRM11fhtKqQpNA0U5cHO7etzUti6vLt/L0fikot9QUl4+cOenENYHFk2C7f9zfhtKqQpLA0U58fTQdniI8NSi7c7LW+HI2w9Gz4XGPeB/D8CuJc5vQylVIWmgKCca1KjC4ze3YtWeMyyNPuWaRnyqwt3zoEFH+GI87PvBNe0opSoUlwUKEWksIqtEZJeduOjPdvk0ETkuIlH2z2CH9zxpJy7aIyK/c1XfyqtxPZvQrkEgT3+9gwspTry3wpFfINyzAOq0hnn3wME1rmlHKVVhuHJEkQ48boxpg7UY4EMi0tbe9ooxJtL+WQpgbxsFtAMGAm+KSKW6PNfL04PnRkYQdzGVF7/b47qGqtSEexdBzTCYOwoOb3RdW0opt+eyQGGMOWmM+dV+ngjs4kruifwMAz43xqQaY2KA/eSz1EdF175RDcb2DOXTnw4TdTTBdQ1VDYKxiyCwAcy5A45tcV1bSim3VipzFHb+647Az3bRZBHZJiIfikjWXWbZiYtsjkmNHOtyy8RFJfH4zS2pE+DLk19Gk+7seyscBdSFsYvBvxZ8NgJObnNdW0opt+XyQCEi1YD/AY8YYy4AbwHNsPJon+RKXosKn7iouAL8vJl2azt2nbzARz8ecm1j1RvCuK/BJwA+HQ6xu1zbnlLK7bg0UIiIN1aQmGOM+RLAGHPaznyXCbzHldNL2YmLbI5JjSqdgeH1uLF1HV7+YS/Hzrng3gpHNZvAuMXg4Q2fDIOzB1zbnlLKrbjyqicBPgB2GWNediiv77DbCGC7/XwxMEpEfEUkDGgB/OKq/pV3IsLTw9oBMG3xDtfcW+EoqJk1Z5GZDh/fCucOubY9pZTbcOWIohdwL9A/16WwM0UkWkS2Af2ARwGMMTuA+cBOYBnwkDEmw4X9K/ca1fTn0ZtasHxXLN/tOO36Buu0toLF5Uvw8VA4f9z1bSqlyr1CM9yVd+6a4a4k0jIyGfr6j5y7dJkfHutDgJ+36xs9vgU+GQ5Va8N9SyGgnuvbVEqVmpJmuNM7s8s5b08Pnh0RzunEFF76fm/pNNqwM4z5AhJPWXMWl+JKp12lVLmkgcINdAypyT3dm/DJxkNsO5ZQOo2G9IC7P7fmKj4dDsnnSqddpVS5o4HCTfx1YCuCqvny969cfG+Fo7A+cNccOLMHPrsNUjSpoVKVkQYKNxHo580/b23L9uMX+GTj4dJruMUAuGM2nNwK/73TmuhWSlUqGijcyC0R9enbqjYvfb+HEwnJpddw61tg5Htw9Gdrbai0UmxbKVXmNFC4ERHhX8PCyTCGaYt3lG7j4SNh+FsQsw7m3Wvl4lZKVQoaKNxM41r+/PnGlny/8zTf73BR3oqCdBgFt74K+3+ABb+HDBctha6UKlc0ULih+68Po1XdAKYt3sGl1PTSbbzzeBg0E3Yvga/+AJmV+p5IpSqFskhcVEtEfhCRffZjTbtcRGSWnbhom4h0clXf3J23pwfPjgxn38Vva
f16CJ7PeBD+Zihzo+eUTge6/wEGPG3l3l40GTJL6SospVSZ8HJh3VmJi34VkQBgi4j8AIwHVhhjZojIFGAK8AQwCGt9pxZAd6xVZru7sH9ube+Fb/ELeoNPRqTQOwTWHznMhMUTARgdMcb1Hej9CKSnwOrnwMsXhrwCkt8CwEopd1cWiYuGAR/bu30MDLefDwM+MZafgBq5FhBUDqavm8onI1LoFwbentAvDD4YmsT0dVNLrxM3PAG9HoEtH8GyJ8GNl4NRShXMlSOKbLkSF9U1xpwEK5iISB17t4ISF53MVddEYCJASEiIazteju2KO0LvXIffO8QqPxqfRONa/q7vhAgMmGaNLH5+C7z94MZ/6shCqQqmLBIXFbhrPmWVMnFRcbQJDmH9kZxl649AoFcwfV5Yxf0fb2bdvjNkZrr4r3wRGDjDmuRe/wqsmena9pRSpa7UExcBp7NOKdmPsXa5Ji4qganXT2fCYn9WxUBaBqyKgQmL/Zl+4/NM7tecqKPnuPeDXxjwyho++jGGCykuvJRVBG55BTqMhtXPwo//cV1bSqlS57JTTwUlLsJKUDQOmGE/LnIonywin2NNYp/POkWl8sqasH7426nsijtCm+AQpvefnl0+uX9zvo0+xccbD/H01zt54bs9jOzUkLE9Q2lZN8D5HfLwgKGvW6ehfngKvPysq6OUUm7PZfkoRKQ3sA6IBrKun/w71jzFfCAEOALcYYyJtwPL68BAIAm4zxhTaLKJypCPwhm2HUvgk42HWbz1BJfTM+nZNIixPZtwU9u6eHk6eVCZkQbzx8Geb+DW/1inpJRS5UpJ81Fo4qJKJP7SZeZvPsqnGw9zPCGZ+tX9GNM9hFHdQgiu5uu8htJT4fO7Yf8KGPEOdLjLeXUrpa6ZBgpVpIxMw8rdsXyy8RDr9sXh4+nB4Ih6jL0ulI6NayDOuGopLdlabfbQerj9Q2g34trrVEo5hQYKVSIHzlzk042HWbDlGBdT04loWJ2xPZtwa4cG+Hl7Xlvlly/BpyPh+Ga481NoPdg5nVZKXRMNFOqqXExN56vfjvPJhkPsi71ITX9v7uzamHu6N7m2ezJSLljpVE9vh9FzofkA53VaKXVVNFCoa2KM4aeD8Xyy8RDf7zxNpjHc2Lou465rQq9mwXh4XMVpqaR4+HgonN1n5eIO6+P8jiulik0DhXKaEwnJ/PfnI8z95QhnL12mae2q3NujCbd1bkSgn3fJKrsUB7NvgYSjcO+XVk5upVSZ0EChnC41PSP7nozfjiTg7+N5dfdkJJ6CjwZZQWPsImioCwQrVRY0UCiXyu+ejHHXNWFAm2Lek3H+mBUsUi7A+CVQL8L1nVZK5aCBQpWK+EuXmbfpKJ/9dBX3ZJw7BB8OgozLMP4bqNO6VPqslLKUm0AhIh8CQ4BYY0y4XTYNeAA4Y+/2d2PMUnvbk8AEIAP4kzHmu6La0EBR9vK7J+OW9vUZ27MJkYXdkxG33xpZiAfctxSCmpVux5WqxMpToOgDXMTKMeEYKC4aY17MtW9bYC7QDWgALAdaGmMKzbOpgaJ82R97kc9+KsE9GbG7rAlurypWsKjZpPQ7rVQlVNJA4crERWuB+GLuPgz43BiTaoyJAfZjBQ3lRprXqca0oe346e838q/h4aSkZfDXBdvo+dwKZny7m6PxSTnfUKcN3LsQLicy9/3rCX+jUemndVVKFcnl+SjyMdnOif1hVr5sCk5alIeITBSRzSKy+cyZM/ntospYNV8v7u3RhO8f7cN/H+hO97Ag3lt3kBsc8mRkj2Trt2duz4lMlWO8Nvg4KVMNrw06zNSVEzVYKFVOuHQy285st8Th1FNdIA4rIdG/gPrGmN+LyBvARmPMZ/Z+HwBLjTH/K6x+PfXkPvK7J2OsfU9Gzw9b8Nqgw/QLu7L/qhh4+NsmbJ90qMz6rFRFVW5OPeXHGHPaGJNhjMkE3uPK6SVNWlTBNahRhb/8rhUbnuzPK3d1INDPm
2lf76THsyvYeeZw/mldzxyGPcsgMzP/SpVSpaJUA0VWZjvbCGC7/XwxMEpEfEUkDGgB/FKafVOlw9fLkxEdG7HwoV4seqgXA8PrE+DjkW9a15Y+AnPvgte7wM/vQGpi2XRaqUrOZYFCROYCG4FWInJMRCYAM0UkWkS2Af2ARwGMMTuwkhntBJYBDxV1xZNyfx0a1+ClOzuQeDmTCYvJldYV9l82cNsH4F8Lvv0bvNwWlj0J8TFl3XWlKhW94U6VufA3Qxne+jALd8OuOGgTDMNbw/ub6/L1nTvpGloLjm2Gn96CnQshMwNaDYYeD0Lo9VbObqVUsZWb+yhKgwaKimFu9BymrpzIB0OT6B1inXYat7AKXol/JjOpN31a1ubxm1rSoXENuHACNn0AWz6CpLNQN9zKzR1xB3hXKetDUcotaKBQbmlu9Bymr5vKrrgjtAkOYer10xneahSf/nSIt1Yf4FxSGgPa1OWxm1rStkGglUEv+gv46W2I3QH+QdD5Puh6PwTWL7pBpSoxDRSqwrmYms5H62N4d91BElPSuaV9fR4d0ILmdQLAGDi0zgoYe5aChye0HQ49JkGjzmXddaXKJQ0UqsI6n5TG++sP8uH6GJLTMhjesSF/vrEFTYKqWjvEH4Rf3oPfPoPUC9CoK3R/ENoOA88S5s9QqgLTQKEqvLMXU3ln7UE+3nCIjEzDHV0aMbl/CxrWsOcoUhMh6r/w89tW8AhoAF0nWKemqgaVbeeVKgc0UKhKI/ZCCm+uPsB/f7Zuwri7ewiT+jajTqCftUNmJuz/AX56Ew6uBi8/aH8ndP8j1G1bdh1XqoxpoFCVzvGEZF5fuY/5m4/h7SmM7RnKH/o0JcgxL0bsLmuEsfVzSE+x8nb3mAQtfgceZbHkmVJlRwOFqrQOn73Ef1bsY+Fvx6ni7cnve4dx//VNqV7FYX4iKR62zIZN78OF41AzzLq8NnIM+AWWWd+VKk3lJlAUkLioFjAPCAUOAXcaY86Jld3mP8BgIAkYb4z5tag2NFCo/OyPTeSV5fv4ZttJAvy8mHh9U+7rHUY1X68rO2Wkwa6vrVHG0Z/BJwA6joFuEzWJkqrwylOgyC9x0Uwg3hgzQ0SmADWNMU+IyGDgYaxA0R34jzGme1FtaKBQhdl54gKvLN/LDztPU9Pfmz/2bca9PUKp4pMridLxLdbltTu+gsx0aDnQuus77Aa961tVSOUmUNidCSXnMuN7gL7GmJP2AoGrjTGtROQd+/nc3PsVVr8GClUcUUcTePmHvazde4baAb481LcZo7uH4OuVK2AknrLu+t78ISTFQZ221mmp9nfpXd+qQinvgSLBGFPDYfs5Y0xNEVkCzDDGrLfLVwBPGGPyRAERmQhMBAgJCel8+PBhl/VfVSybDsXz4nd7+DkmngbV/Xj4xhbc3rkR3p65JrPTUmD7AmuUcToaqtS8ctd39XzzaSnlVsp1PopC5De+zzeCGWPeNcZ0McZ0qV27tou7pSqSrqG1+HxiD+bc35261f148stobnxpDf/bcoyMTIdfN28/6HgPPLgOxn8DTXrBj6/CqxHwxX1wdFOZHYNSZaG0A8XprJwU9mOsXa6Ji1SpEBF6NQ/myz9ex4fjuxDg58XjX2zl5lfW8PXWE2Q6BgwRCO0No+bAn36DHn+E/SvggwHwXn/Y9gWkXy67g1GqlJR2oFgMjLOfjwMWOZSPFUsP4HxR8xNKXQsRoX/ruix5uDdv39MJTw/h4bm/MXjWOr7fcYo8p2RrhsLvpsNjO2Hwi5ByHr683xplrHkBLsWVyXEoVRpcedXTXKAvEAycBv4JLMRKUBQCHAHuMMbE25fHvg4MxLo89r785idy08ls5SwZmYYl207w6vJ9xMRdon2j6jx+cyv6tAhG8rvyKTMT9i+Hn9+CAyvB0xfa3wHd/8jcM1vzrIQ7OmJM6R+UUgUoV5PZrqaBQjlbekYmX/52nP8s38fxhGS6NKnJ4ze3omezQ
taIit0Nv7wDWz9nbnoCU6ul8cHIjOzcGhMW+zO9/7saLFS5oYFCKSe4nJ7JvM1HeX3lPk5fSKVX8yAeu6kVnZvULPhNSfGEv9Oc14afo1/YleJVMfDwoiC2370OgltaS6ErVYY0UCjlRClpGcz5+Qhvrd5P3MXL9GtVm8dvbkV4w+r57u/5jAcpUw3eDrEgLQP8/g0ZJtC6A7xBJDTqAg27WI8B9UrnYJSylTRQeBW9i1KVl5+3JxN6hzGqa2M+3niId9YcZMhr6xnYrh6P3tSSVvUCcuzfJjiE9UcO5xhRrD8CbYIaQO8XrNzfxzfDhtesu8ABAhtCw85XgkeDSPCpWnoHqVQRdEShVAlcSEnjw/UxfLAuhouX07m1fQMeGdCCprWrAfnn/853jiItGU5us4LGsc3WMiIJ9s2j4mHdFZ4dPDpD7dZ6yko5jZ56UqoUnLt0mXfXHWT2j4dITc/gtk6N+NONLWhcy5+Hl05iTvS7JKRkUMPPkzERE3lt8JtFV3rxjBUwjtuB4/gW6zJcAJ9q0KCjFTSyAkhgA9cepKqwNFAoVYrOJKby9poDfPrTYYwxhDffRtSFf/FhUSOK4sjMhPgDV0YcxzfDqe2QmWZtD2hg5QVv2Nk+ZdURfKs5/yBVhVOpAkVY2zDzz//+M0dZu9rt6NqwK2kZacyJnpPnPZH1IomsF0lSWhLzd8zPs71Lgy6E1wnnfMp5vtr9VZ7tPRv1pFVwK+KS4liyd0me7X2a9KFpzaacuniKZfuX5dl+Y9iNNK7emKPnj7IiZkWe7QObD6RetXocPHeQtYfX5tk+pOUQgv2D2RO3h43HNubZPqL1CKr7VWd77HY2n8gbRO9sdyf+3v5EnYoi6lRUnu1jIsbg7enNpuOb2HFmR57t4yPHA7Dh6Ab2nt2bY5uXhxf3tL8HgDWH1hCTEJNjexWvKtwVfhcAyw8u59iFYzm2B/oGMrLNSACW7V/GqYuncmwPqhLEra1uBeDrPV9zNvlsju31qtVjYPOBAHy560supF7Isb1RYCMGNB0AwLzt80hOT86xPaxGGDeE3gDAZ9s+Iz1rDsHWMqgl1zW+DoDZUbNzbDuflMa2QwF8eehPzL/rDMcTr2zbdQY+i67FN3evuPbfvepNiDu0liXbPoP4Q3AuxlrAEOgj3jStHc6pOq1Y5uVp3SQY2MA6lYX+7lXU3z0o+ffefR3v08lspUpbdX9vHryhGbP3x9GrMczfeWVbyyA4kRjPXe9spEngeYKqwTlzhIAq3lT388p+bFQ1mRa1MgpvyNsPGnaCSycgK21GaiKcOwyZwNmDsO8HSLW/yDx9oUaIFTTS06HFIBccvaroymREISKHgEQgA0g3xnQpKKlRYfXoqSdV3oS/Gcprgw7nuY9i/FcNGNfsO05fSOH0hVTOJKYSm5hCWkbe/381/L2pG+BHnUBf6gT4UTfQlzoBvtQN9KNOoB91AnypE+ibd5n0LMZA/MErV1gd2wynoh1OWdXPOdfRoCP4BuSpZm70HL3DvIJyp8tj+xljHBfImQKscEhqNAV4omy6ptTVmXr9dCYsznvV04ybZjI6IjzHvpmZhnNJl4lNTOX0hRRiL9iP9uvTiakciI0jNjGV9My8AaWmvzd17IBS1w4gdQOtwFI7oBZ1Q26ldrvbrYCSnmoFi6zgcXwL7M46dSrWVVWNOmff2zH3dBRTVt7P7OGX7eM4zPiFvwfQYFEJleWIootjoCgoqVFh9eiIQpVHzv5LPDPTEJ902QokiSnEZgWVRGt0EpuYapUlpuZcLt1W09+buoF+1HYIJHUC/Gjkm0STlN3UubCdanFb8TixBZKtQXy470VeG5WZZ2T0+8VBxPxZF0B0d24xmS0iMcA5rJwT7xhj3i0oqVE+79XERUrlIyugZI1OYu1AkjVKic067XUx/4BSy9+bDlXj6eYTw+T4v5Pyf+R/h3m9PtacR81QqBV25XlgI/DUaU934C6nnnoZY
06ISB3gBxHZXdw3GmPeBd4Fa0Thqg4q5W48PITgar4EV/OlXSG3WGRkGuIvXSY28crprtPZgaUW3yY2wt/HOm2W+w7zRj6wKVZofGYTwRlL8DJXrs4x4gk1GiM1HYKHYzDxy3/ZE1X+lUmgMMacsB9jReQroBt2UiOHU0+xhVailLoqnh5C7QBfagcUHFAavxzE+EVnmT2M7LmW8YvgolTn/dAXOZ6QzMn4i/ilnCbEI5bGEkuIxBIaF0vThKM0itlCoMl5iWiGbw2kVhgetfIJJIEN9c7zcqzUA4WIVAU8jDGJ9vObgWe4ktRoBjmTGimlStnMm/7DI8vu44Gv04hJgLAakJLmzeu3vMHoiCtnLJIup3MiIZlj55I5npDMznPJLE+wnifEx+Fz8SiNiSVEThOSHktIUixhJzdQn0V4ceVS4EzxJi2gIR61wvAKboo4BpKaoeAXWLofgMqhLEYUdYGv7GQwXsB/jTHLRGQTMF9EJmAnNSqDvimluHJl0/R1U4Ej+HmF8K9+eSfl/X28aF4ngOZ18l5eC5CWkcmp8ymcsIPH1nPJLE1I5uS5i1yOP4pP4hHqZZ6miZwm5FwsjRMO0yRmEzXkYo56Un1qkh4YgkdQGH61myO1Qh3mRhoUORrRS32vjVvfma1XPSnl3owxnL10meP2iCTrMf7sGUx8DD6JRwhOO0mI2KMSiaWhxOElmdl1pIs3Sf4NyagegmdQGP51m+EV1MwOJE2Yu3cxU5b/3uFSXxi/0IcZAz6stMHCXSaznePsWZg9O2dZu3bQtSukpcGcvLeyExlp/SQlwfy8yyjQpQuEh8P58/BV3mUU6NkTWrWCuDhYkncJD/r0gaZN4dQpWJZ3CQ9uvBEaN4ajR2FF3mUUGDgQ6tWDgwdhbd5lFBgyBIKDYc8e2Jh3GQVGjIDq1WH7dsgviN55J/j7Q1SU9ZPbmDHg7Q2bNsGOvMsoMH689bhhA+zNuYwCXl5wj7WMAmvWQEzOZRSoUgXuspZRYPlyOJZzGQUCA2GktYwCy5ZZn6GjoCC41VpGga+/tv79HdWrZ31+AF9+CRdyniOnUSMYYC2jwLx5kJxzGQXCwuAGaxkFPvvMupPZUcuWcJ21jEKe3zvQ372r+N0TrFzJwUAHx9+9+EMQJBDUhJS0RiQkpbHrdyNZm5CMrF9Htd1bICkOr5Sz+Kedo6bnRbzaH6HJ8c14Hb4ACVcCSbRXKktvMLSzJ+b7HYBl6Zf5aPr9jO6y01rSPage3HUPePlUzt+9Irh3oFBKVXh+3p7Uq+5JvdZ1rILM4xB85Ys0PSOThMuGvTcP4/uEFLxXLKXK/ii4FIdPyllOpa+kVXDOOlsEQVxKCqx/2SrwFdj3KMke/qQf8CQz2Yd0L3+Md1Xw8ccE1SbNex8+AcFUO3wQn8sGD5+qVpDxqlI6H0QZ0lNPSqkKLeA5YfEo8tw8ePvn8HKjDyE5Hkk5h2dqAr6XE6iSfp6qGReozkVqkEgNuUR1LuEh+X9XZuDBJY8AkjwDSfGuQbpPdTL8amKq1ET8a+FZNQifgCD8AmtTpUZtqlavjUfVIPC+ugBz1cvYO6hcp56UUqoINXzzv9TX3zeIcffel+97jDEkp2VwPjmN08lp7L2YQtKFsyRfiCP9YhzpF+MxSfF4pJzDM+UcPmkJ+KWdp0ryBQIuHqW67KImF/GX1AL7lYIPFz0CSPKsTrJXdS77VCfdtybGrwbiH4Rn1Vp4BwTjExiMf2Aw1WrV4e8bn2bB7nf4351Zx5LB3V++BVDiYFESOqJQSlVoc6Pn8Miy+wjwvXKpb2KqN68O/Mglk9nGGJIuW0HmQmIil87HkXLhDJcvxJFx6SyZl+KR5Hg8UhLwvpyAb9p5/NPPUy3zAgEmkRpczDFZ7yjcN5HXRpk8o6Pb5nsS/0R6vu/Jj44olFLKQXEv9XUWEaGqrxdVfb1oUKMKNK5T7PcaY0hKTefM+XguJcSSlHCGy4lnSb94l
sxLZ9m1///oHZLzPb1DICGliOXpr5EGCqVUhTc6YoxbXAorIlT186aqX12oWzfP9urP/5P1RzLyLK1Sw8+1d7V7uLR2pZRSTjMmYiJ3f2mdbkrLsB7v/tIqd6VyN6IQkYHAfwBP4H1jzIwy7pJSSpULWRPWt82/tqueSqpcTWaLiCewF7gJOAZsAkYbY3bmt79OZiulVMmVdDK7vJ166gbsN8YcNMZcBj4HhpVxn5RSqlIrb4GiIXDU4fUxuyybiEwUkc0isvnMmTOl2jmllKqMylugkHzKcpwbM8a8a4zpYozpUrt27VLqllJKVV7lLVAcAxo7vG4EnCijviillKL8BYpNQAsRCRMRH2AUVkIjpZRSZaRcXfUEICKDgVexLo/90BgzvZB9zwCHr7KpYCDuKt9b3uixlE8V5VgqynGAHkuWJsaYYp+7L3eBorSIyOaSXB5WnumxlE8V5VgqynGAHsvVKm+nnpRSSpUzGiiUUkoVqjIHinfLugNOpMdSPlWUY6koxwF6LFel0s5RKKWUKp7KPKJQSilVDBoolFJKFc4Y4zY/wIdALLDdoawW8AOwz36saZcLMAvYD2wDOhVQ50Bgj73fFIfyMOBnu955gI9d7mu/3m9vD73KY2kMrAJ2ATuAP7vj8QB+wC/AVvs4nnZGe8A4+737gHEO5Z2BaPv9s7hy+jTfz+0q/208gd+AJe58LMAhu/4oYLM7/n7ZddQAFgC7sf6/9HTT42hl/1tk/VwAHnGHY7mq/0hl9QP0ATqRM1DMzPpwgCnA8/bzwcC39ofdA/g5n/o8gQNAU8AH68uurb1tPjDKfv428Ef7+STgbfv5KGDeVR5L/ax/eCAAa3n1tu52PHZ/qtnPve1fvB7X0p79H+eg/VjTfp71n+cXrC8KsT+PQYX9Hlzlv81jwH+5Eijc8liwAkVwrjK3+v2y3/cxcL/93AcrcLjdceTTh1NAE3c4llL5gnfmDxBKzkCxB6hvP68P7LGfv4OVyyLPfg5lPYHvHF4/af8I1h2PXrn3A74DetrPvez9xAnHtQgrD4fbHg/gD/wKdL+W9oDRwDsOr9+xy+oDu/Pbr6DP7SqOoRGwAugPLLnWz66Mj+UQeQOFW/1+AYFATD6fq1sdRz7HdTPwo7scS0WYo6hrjDkJYD9mZTIvcsnyQvYJAhKMMen5vDf7Pfb28/b+V01EQoGOWH+Nu93xiIiniERhnRb8AesvnGtpr6DjaGg/z10OBX9uJfUq8Dcg0359rZ9dWR6LAb4XkS0ikpUr091+v5oCZ4CPROQ3EXlfRKq64XHkNgqYaz8v98dSEQJFQYpcsryQfQp7b3HqLTYRqQb8D3jEGHOhsF2L0W6ZHI8xJsMYE4n113g3oM01tnc1x3HNRGQIEGuM2VKMvhS1raj3u/RYbL2MMZ2AQcBDItKnkH3L6++XF9bp5reMMR2BS1inZwpSXo/jSuPWgqdDgS+K2rUYbZbKsVSEQHFaROoD2I+xdnlxliwvaJ84oIaIeOXz3uz32NurA/FX03ER8cYKEnOMMV+6+/EYYxKA1VjnU6+lvYKO45j9PHc5FPy5lUQvYKiIHMLKrtgfa4ThjseCMeaE/RgLfIUVxN3t9+sYcMwY87P9egFW4HC343A0CPjVGHPafl3uj6UiBIrFWFeVYD8ucigfK5YewPms4Z2I7Lb3yXdZc2OdvFsF3F5AvVnt3Q6stPcvERER4ANglzHmZXc9HhGpLSI17OdVgAFYV6aUqD0RaSgiK+zy74CbRaSmiNTEOp/7nX28iSLSw/78xhZQr2N7xWaMedIY08gYE4r12a00xoxxx2MRkaoiEpD13G53eyF1l8vfL2PMKeCoiLSyi24EdrrbceQymiunnXLXXT6P5WonY8rix/5wTwJpWFFxAta5tRVYl4CtAGrZ+wrwBtb58migi10ejMPkINaVBXvt/aY6lDfFuiplP9YQ0dcu97Nf77e3N73KY+mNNdzbxpXL5
Qa72/EA7bEuJd2G9UX01NW0B3Qh56Tc7+199gP3OZR3sds5ALzOlUtK8/3cruF3rS9Xrnpyu2Ox+7yVK5ctTy2s7vL6+2XXEQlstn/HFmJdPeZ2x2HX4w+cBao7lJX7Y6l0S3jY56GbGmNmlXVfnKGiHI+ITAaOGGPcPlFVBTuWivL7VSGOA8rmWCpdoFBKKVUyFWGOQimllAtpoFBKKVUoDRRKKaUKpYFCKaVUoTRQKLckIkZEXnJ4/RcRmeakumeLyO1F73nN7dwhIrtEZFWu8lARSRaRKBHZKiIbHO4jKKiuLiKS71UwInJIRIKd2XdVuWigUO4qFRhZ3r4ARcSzBLtPACYZY/rls+2AMSbSGNMBa/XUvxdWkTFmszHmTyVoW6li00Ch3FU6Vs7gR3NvyD0iEJGL9mNfEVkjIvNFZK+IzBCRMSLyi4hEi0gzh2oGiMg6e78h9vs9ReQFEdkkIttE5A8O9a4Skf9i3RiVuz+j7fq3i8jzdtlTWDddvi0iLxRxrIHAOft9fiLykV3fbyLSz6EPS+znQSLyvb39Hey1fey7tb+xRynbReSuYnzOSuFV9C5KlVtvANtEZGYJ3tMBa9HCeKzcEO8bY7qJyJ+Bh7ESyYC1nP0NQDNglYg0x1pm47wxpquI+AI/isj39v7dgHBjTIxjYyLSAHgeK0nROazVXIcbY54Rkf7AX4wxm/PpZzOxVuQNwLqbt7td/hCAMSZCRFrb9bXM9d5/AuvtNm4BslaOHQicMMbcYvetevE+MlXZ6YhCuS1jrbb7CVCSUy6bjDEnjTGpWEseZH3RR2MFhyzzjTGZxph9WAGlNdZ6SWPtL/CfsZZeaGHv/0vuIGHrCqw2xpwx1pLOc7AScBUl69RTM6zg9a5d3hv4FMAYsxs4DOQOFH2Az+x9vsEejdjHOEBEnheR640x54vRD6U0UCi39yrWuf6qDmXp2L/b9oJ7Pg7bUh2eZzq8ziTnCDv3kgVZSzc/bH+BRxpjwowxWYHmUgH9y29J55JazJXgUtz68iy5YIzZy5X0q8/Zp7+UKpIGCuXWjDHxWCkfJzgUH8L6QgQYhpWitaTuEBEPe96iKVZ2se+AP4q1PDwi0tJembUwPwM3iEiwPdE9GlhTwr70xhr9AKwFxmS1D4TYfXPkuM8grEX0sk6DJRljPgNexFquW6ki6RyFqgheAiY7vH4PWCQiv2CtxlnQX/uF2YP1hV4XeNAYkyIi72OdnvrVHqmcAYYXVokx5qSIPIm15LMAS40xxVk2PGuOQoDLwP12+ZtYE+DRWCOn8caYVKs72Z4G5orIr/YxHLHLI4AXRCQTawXmPxajH0rpooBKKaUKp6eelFJKFUoDhVJKqUJpoFBKKVUoDRRKKaUKpYFCKaVUoTRQKKWUKpQGCqWUUoX6f8peSWnMolyoAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "import numpy as np\n", + "# with visualization\n", + "naiveBoids = np.linspace(10000, 100000, 8)\n", + "naiveFPS = [180, 92, 64, 46, 27, 20, 15, 12]\n", + "# without visualization\n", + "naiveBoidsV = np.linspace(10000, 100000, 8)\n", + "naiveFPSV = [207, 102, 68, 49, 29, 20, 15, 12]\n", + "\n", + "naiveFig, naiveAxes = plt.subplots()\n", + "\n", + "naiveAxes.plot(naiveBoids, naiveFPS, label=\"With Visualization\", marker='o', markerfacecolor=\"yellow\", markeredgecolor=\"green\")\n", + "naiveAxes.plot(naiveBoidsV, naiveFPSV, label=\"Without Visualization\", marker='o', markerfacecolor=\"yellow\", markeredgecolor=\"green\")\n", + "naiveAxes.get_xaxis().set_major_formatter(\n", + " matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))\n", + "naiveAxes.yaxis.set_ticks(np.arange(0, 220, 10))\n", + "naiveAxes.xaxis.set_ticks(np.arange(10000, 110000, 15000))\n", + "naiveAxes.set_xlabel('Number of Boids') # Notice the use of set_ to begin methods\n", + "naiveAxes.set_ylabel('FPS')\n", + "naiveAxes.set_title('Naive Approach')\n", + "naiveAxes.axhline(y=30, color='r', linestyle='--',alpha=0.5)\n", + "naiveAxes.axhline(y=60, color='g', linestyle='--',alpha=0.5)\n", + "naiveAxes.legend()\n", + "\n", + "# with visualization\n", + "uniformBoids = np.linspace(100000, 700000, 7)\n", + "uniformFPS = [440, 200, 137, 60, 45, 27, 17]\n", + "# without visualization\n", + "uniformBoidsV = np.linspace(100000, 700000, 7)\n", + "uniformFPSV = [600, 300, 145, 80, 48, 28, 17]\n", + "\n", + "uniformFig, uniformAxes = plt.subplots()\n", + "\n", + "uniformAxes.plot(uniformBoids, uniformFPS, label=\"With Visualization\", marker='o', markerfacecolor=\"yellow\", markeredgecolor=\"green\")\n", + "uniformAxes.plot(uniformBoidsV, uniformFPSV, label=\"Without 
Visualization\", marker='o', markerfacecolor=\"yellow\", markeredgecolor=\"green\")\n", + "uniformAxes.get_xaxis().set_major_formatter(\n", + " matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))\n", + "uniformAxes.yaxis.set_ticks(np.arange(0, 650, 50))\n", + "#uniformAxes.xaxis.set_ticks(np.arange(10000, 110000, 15000))\n", + "uniformAxes.set_xlabel('Number of Boids') # Notice the use of set_ to begin methods\n", + "uniformAxes.set_ylabel('FPS')\n", + "uniformAxes.set_title('uniform Approach')\n", + "uniformAxes.axhline(y=30, color='r', linestyle='--',alpha=0.5)\n", + "uniformAxes.axhline(y=60, color='g', linestyle='--',alpha=0.5)\n", + "uniformAxes.legend()\n", + "\n", + "\n", + "# with visualization\n", + "coherentBoids = np.linspace(1000000, 2500000, 4)\n", + "coherentFPS = [95, 50, 30, 19]\n", + "# without visualization\n", + "coherentBoidsV = np.linspace(1000000, 2500000, 4)\n", + "coherentFPSV = [108, 53, 31, 20]\n", + "\n", + "coherentFig, coherentAxes = plt.subplots()\n", + "\n", + "coherentAxes.plot(coherentBoids, coherentFPS, label=\"With Visualization\", marker='o', markerfacecolor=\"yellow\", markeredgecolor=\"green\")\n", + "coherentAxes.plot(coherentBoidsV, coherentFPSV, label=\"Without Visualization\", marker='o', markerfacecolor=\"yellow\", markeredgecolor=\"green\")\n", + "coherentAxes.yaxis.set_ticks(np.arange(0, 110, 10))\n", + "coherentAxes.set_xlabel('Number of Boids') # Notice the use of set_ to begin methods\n", + "coherentAxes.set_ylabel('FPS')\n", + "coherentAxes.set_title('coherent Approach')\n", + "coherentAxes.axhline(y=30, color='r', linestyle='--',alpha=0.5)\n", + "coherentAxes.axhline(y=60, color='g', linestyle='--',alpha=0.5)\n", + "coherentAxes.legend()\n", + "\n", + "# with visualization\n", + "blockSize = [128, 256, 512, 1024]\n", + "blockFPS = [850, 859, 860, 900]\n", + "\n", + "blockFig, blockAxes = plt.subplots()\n", + "\n", + "blockAxes.plot(blockSize, blockFPS, marker='o', markerfacecolor=\"yellow\", 
markeredgecolor=\"green\")\n", + "blockAxes.xaxis.set_ticks(np.arange(0, 1088, 128))\n", + "blockAxes.set_xlabel('Block Size') \n", + "blockAxes.set_ylabel('FPS')\n", + "blockAxes.set_title('Block Size vs FPS')\n", + "\n", + "\n", + "\n", + "naiveFig.savefig(\"naive.png\")\n", + "uniformFig.savefig(\"uniform.png\")\n", + "coherentFig.savefig(\"coherent.png\")\n", + "blockFig.savefig(\"blocksize.png\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "8015ef9a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1000000., 1500000., 2000000., 2500000.])" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/images/plotting/CUDA Flocking.ipynb b/images/plotting/CUDA Flocking.ipynb new file mode 100644 index 0000000..d18ac08 --- /dev/null +++ b/images/plotting/CUDA Flocking.ipynb @@ -0,0 +1,91 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 18, + "id": "1f1923e1", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAABfR0lEQVR4nO3dd1hUR9vA4d+wdFBAxIINu1gAFcHEEnvvMUaN0ZjElqbJl/pGjZoekzflNbGma2KMPfaeGI0iNgSxoKIiIEpV+u7O98dZCCIoIguszH1dXOyePeVZXM+zZ2bOM0JKiaIoiqLkZ1XWASiKoijlk0oQiqIoSoFUglAURVEKpBKEoiiKUiCVIBRFUZQCqQShKIqiFEglCEUpJiFEmBCiS1nHcS+EEAuEEDPKOg7FMgh1H4RS3ggh9gC+QA0pZWYZxmELfAg8DrgC14C1UsqXyyqmuxFCbAY6mZ7aARLIMj1fKqWcXCaBKRbJuqwDUJS8hBBeaCe4ZGAQ8Psd1tVJKQ15nltLKfUlGM5bgD8QAMQA9YDOJbj/Eiel7JvzWAjxAxAlpZxedhEplkw1MSnlzVjgAPADMC7vC0KIH4QQ84UQm4QQqUBXIUSkEOINIUQIkCqEsBZCvCmEOCeEuCGEOCmEGGra3k4IkSCEaJVnn9WEEOlCCI8CYmkHrJFSRktNpJTypzzbRgohepgeJwkhbpp+UoUQ0pTsEEIMEEIcM62zXwjhU9AbNzX/fJpv2TohxCumx28IIa6Y3tdpIUT3e/zb5vwN3zM97iKEiBJCvC6EiBNCxAghhggh+gkhzpj+Vv/Js61Vnr9tvBBihRCiyr3GoFgOlSCU8mYssMz001sIUT3f66OB94FKwN+mZaOA/oCr6QriHNpViAswG1gqhKhpaq5aDozJs79RwA4p5bUCYjkAvCKEeE4I0UoIIQoLWkrpKqV0llI6A18Ce4ErQog2wHfAJMAdWAisF0LYFbCbX4DHc44jhHADegHLhRBNgReAdlLKSkBvILKweO5BDcAeqAXMBBaj/X3aov0NZwohGpjWfQkYAjwCeAKJwNclEINSTqkEoZQbQoiOaM04K6SUh9FO9KPzrbZOSrlPSmmUUmaYln0lpbwspUwHkFL+bvrWb5RS/gacRWsmAvgRGC2EyPnsPwn8XEhIHwIfA08AwWgn/HGFrJvzHh43xfyolDIbmAAslFIelFIapJQ/AplA+wI234vWZ5DThzAc+EdKGQ0Y0PoUmgshbExXM+fuFEsRZQPvm2JdDlQFvpRS3pBShgFhQM4VzyTgbSlllCnZzgKGCyFUU/UDSiUIpTwZB2yTUl43Pf+FfM1MwOUCtrtlmRBibJ4mnSSgJdqJDynlQSAVeEQI0QxoBKwvKBjTCf1rKWUHtE7q94HvhBDeBa0vhGgNzAOG5rkiqQf8X04spnjqoH0Dz388iXaSHmVaNBrtSgopZQQwDe2kHCeEWC6EuG0fxRCfpx8n3fT7ap7X0wHnPO9lTZ73EY6WuPJf5SkPCJUglHJBCOEAjEA7cccKIWKBlwFfIYRvnlULGnaXu0wIUQ+tmeQFwF1K6QqEAnmbh35Ea0Z5EliZ50qkUFLKdCnl12jNKs0LiN8DWAO8IKU8muely2jf0F3z/DhKKX8t5FC/on0rrwcEAqvyxPCLlDLnKkuiXd2UpstA33zvxV5KeaWU41BKiUoQSnkxBO3baHPAz/TjjdbsMvYe9uOEdvK8BiCEGI92BZHXz8BQtCTxE4UQQkwzdeQ6mDq/x6H1fRzNt5412ol8malJK6/FwGQhRKDQOAkh+gshKhV0TFNyuQYsAbZKKZNMx2gqhOhm6rvIQPtmbyhoH2a0AHjflLwQQngIIQaXcgxKKVIJQikvxgHfSykvSSljc37QmmyeKGo7t5TyJPAZ8A9aU0krYF++daKAI2iJZO8ddpdu2lcscB14Hq1v4Xy+9Wqj9RtMyzOS6aYQoq6UMhitH2Ie2tVHBPD
UXd7Gr0APtCa2HHbAR6Y4YoFqwH9u39SsvkRrjtsmhLiB1okfWMoxKKVI3SinVEhCiO+AaHWPgKIUTo0+UCoc0/0Jw4DWZRyKopRrqolJqVCEEO+idVrPlVJeKOt4FKU8U01MiqIoSoHUFYSiKIpSoAemD6Jq1arSy8urrMNQFEWxKIcPH74upSyoFtmDkyC8vLwIDg4u6zAURVEsihDiYmGvqSYmRVEUpUAqQSiKoigFUglCURRFKdAD0wdRkOzsbKKiosjIuGstNqWU2NvbU7t2bWxsbMo6FEVR7uKBThBRUVFUqlQJLy8v7jDXi1JKpJTEx8cTFRVF/fr1yzocRVHuwqxNTEKIPqapESOEEG8W8HpnIcQRIYReCDE832vjhBBnTT93nKSlMBkZGbi7u6vkUE4IIXB3d1dXdIpSUkJWwOctYZar9jtkRYnu3mxXEEIIHdp0hD2BKOCQEGK9qdpmjktolS1fzbdtFeAdtAnjJXDYtG1iMeIo3htQzEL9eyhKCQlZAX+8BNmmeZ6SL2vPAXxGlMghzHkFEQBESCnPSymz0GbKuqV2vGnaxBDAmG/b3sB2KWWCKSlsB/qYMVZFURTLsnPOv8khR3a6tryEmDNB1OLWqSCjTMtKbFshxEQhRLAQIvjatYLmnC97sbGxjBw5koYNG9K8eXP69evHmTNniIyMxMHBAT8/P5o3b87kyZMxGo3s2bOHAQMG3LKPp556ipUrV9627wMHDhAYGIifnx/e3t7MmjWrlN6VoihlLjnq3pYXgzk7qQtqSyhqZcAibSulXAQsAvD397/vqoNrj15h7tbTRCel4+nqwGu9mzKkdVFz2u2klAwdOpRx48axfPlyAI4dO8bVq1epU6cODRs25NixY+j1erp168batWupUqVKkfc/btw4VqxYga+vLwaDgdOnTxc7VkVRLMjZHSAEFFRs1aV2iR3GnFcQUWiTs+eoDUSXwrbFsvboFd5afYIrSelI4EpSOm+tPsHao8Wfbnf37t3Y2NgwefLk3GV+fn506tTplvWsra15+OGHiYiIuKf9x8XFUbNmTQB0Oh3Nm2tTJd+8eZPx48fTqlUrfHx8WLVKm9Z4ypQp+Pv706JFC955553c/Xh5efHOO+/Qpk0bWrVqxalTp4r1fhVFMbPsdNj0Gix7FJxrgLXdra/bOED3mSV2OHNeQRwCGgsh6gNXgJHA6CJuuxX4QAjhZnreC3jrfoKZ/UcYJ6NTCn396KUksgy3doWkZxt4fWUIvwZdKnCb5p6VeWdgi0L3GRoaStu2be8aW1paGjt37mTOnHtrO3z55Zdp2rQpXbp0oU+fPowbNw57e3veffddXFxcOHHiBACJiVrf/vvvv0+VKlUwGAx0796dkJAQfHx8AKhatSpHjhzhm2++4dNPP2XJkiX3FIuiKGYWEwKrnoXrpyFwCvSYBeHrtT6H5CjtyqH7zBLroAYzXkFIKfXAC2gn+3BghZQyTAgxRwgxCEAI0U4IEQU8BiwUQoSZtk0A3kVLMoeAOaZlZpM/OdxteUk4d+4cfn5+dOjQgf79+9O3b99CR/kUtHzmzJkEBwfTq1cvfvnlF/r00frxd+zYwfPPP5+7npublmdXrFhBmzZtaN26NWFhYZw8+e+AsmHDhgHQtm1bIiMjS+otKopyv4wG2PclLO4GGckwZjX0/Qhs7LVk8HIozErSfpdgcgAz3ygnpdwEbMq3bGaex4fQmo8K2vY74LuSiuVO3/QBOny0iytJ6bctr+XqwG+THirWMVu0aFFg53KOnD6IvNzd3XO/8edISEigatWqhe5jypQpTJgwAQ8PD+Lj45FS3pZQLly4wKeffsqhQ4dwc3PjqaeeuuV+BDs77VJVp9Oh1+vv5W0qimIuSZdh7RSI3AvNBsDAr8DJvdQOr2oxmbzWuykONrpbljnY6Hitd9Ni77Nbt25kZmayePHi3GWHDh3izz//LHSbxo0bEx0dTXh4OAAXL17
k+PHj+Pn53bbuxo0byZkR8OzZs+h0OlxdXenVqxfz5s3LXS8xMZGUlBScnJxwcXHh6tWrbN68udjvS1GUUnBiJczvANFHYfDX8PjSUk0O8ICX2rgXOaOVSnIUkxCCNWvWMG3aND766CPs7e3x8vLiiy++KHQbOzs7li5dyvjx48nIyMDGxoYlS5bg4uJy27o///wzL7/8Mo6OjlhbW7Ns2TJ0Oh3Tp0/n+eefp2XLluh0Ot555x2GDRtG69atadGiBQ0aNKBDhw7Ffl+KophRRjJsfBVOrIDa7WDYIqjSoExCeWDmpPb395f5JwwKDw/H29u7jCJSCqP+XRSlEJH7YM0kSImGR16HTq+Czrzf44UQh6WU/gW9pq4gFEVRypo+C/Z8AH9/AW5e8PRWqNOurKNSCUJRFKVMXTsDq5+FmOPQ+kno8yHYVSrrqACVIBRFUcqGlBD8LWydrt3g9vhS8B5Y1lHdQiUIRVGU0nYzDta9AGe3QsNuMPgbqFyzrKO6jUoQiqIopen0Zi05ZN6APh9DwESwKt4dByVdPy4/lSAURVFKQ1YqbH0bDn8P1VvBUxugWvFH8+XUj0vPNgD/1o8DSixJqBvlzOzq1auMHj2aBg0a0LZtWx566CHWrFkDwJ49e3BxcaF169Z4e3sze/ZsAH744QdeeOGFW/bTpUsX8g/jzVnetGlT/Pz88PPzY/hwbWK+a9euERgYSOvWrdm7dy+///473t7edO3aleDgYF566aU7xt2vXz+SkpKK9Z7Xrl17SxkPRanwrhyBhZ3h8A/w8IswYed9JQfQ7tnKSQ450rMNzN1aclWd1RVEXiErSrTwlZSSIUOGMG7cOH755RdAuzN6/fr1uet06tSJDRs2kJqaip+f321zQRTFsmXL8Pe/dRjzzp07adasGT/++CMAffr04ZtvvqFr164At62f36ZNm+74+p2sXbuWAQMG5FaXVZQKy2iAvz+HPR+Cc3UYuw4aPFIiu44uoDTQnZYXh7qCyJEzfV/yZUD+O33ffczxumvXLmxtbW8p912vXj1efPHF29Z1cnKibdu2nDt3rtjHy3Hs2DFef/11Nm3ahJ+fH7Nnz+bvv/9m8uTJvPbaa7dMSlRYaXAvLy+uX78OwNKlSwkICMDPz49JkyZhMGjfWpydnXn77bfx9fWlffv2XL16lf3797N+/Xpee+01/Pz8SuT9KIpFSoyE7/vBrne10UlT9pVIcsg2GFn0V+H/rzxdHe77GDkqzhXE5jch9kThr0cdAkPmrcuy07XOpMM/FrxNjVZaVcVChIWF0aZNmyKFFx8fz4EDB5gxYwaHDh0q0jY5nnjiCRwctA9Fz549mTt3LnPmzCE4ODi3JtPu3bv59NNP8ff3Z8+ePbnbFlYaPEd4eDi//fYb+/btw8bGhueee45ly5YxduxYUlNTad++Pe+//z6vv/46ixcvZvr06QwaNIgBAwbkNncpSoUiJYT8ppXLEAKGLtJaIkpgPvZDkQlMXxPK6as3aF6zEueupZKp/7fi9P3Wj8uv4iSIu8mfHO62vBief/55/v77b2xtbXOTwN69e2ndujVWVla8+eabtGjRosC+Bii45DcU3MRUVDt27Mid7Q7+LQ2eY+fOnRw+fJh27bS7OtPT06lWrRoAtra2uVcibdu2Zfv27cWKQVEeGOmJsOFlCFsDdR+CoQvBrd597zb+ZiYfbj7FysNR1HJ1YNGTbenZvDrrjkWrUUwl4g7f9AH4vKWpeSkflzowfmOxDtmiRYvcJhuAr7/+muvXr99yMs/pg8jrXkt+34+CSoPnf33cuHF8+OGHt71mY2OTu60qE65UeOf/1Epz37wK3WZAx5fBSnf37e7AaJQsP3SZj7ecIjVTz5QuDXmxWyMcbbVT95DWtUo0IeSn+iBydJ+p3c2Y131O39etWzcyMjKYP39+7rK0tLS7bteuXTv27dtHbGwsAMHBwWRmZlKnTp27bHnvCioNnlf37t1
ZuXIlcXFxgJaoLl68eMd9VqpUiRs3bpR4rIpSLukzteGrPw3SzhnPbIfOr953cgi9kszQ+fv5z5oTeNesxOapnXijT7Pc5FAaVILI4TNCm4zDpQ4gtN8Dv7qvUUxCCNauXcuff/5J/fr1CQgIYNy4cXz88cd33K569ep8+eWX9OvXDz8/P6ZNm8avv/6KVSE30zzxxBO5w1x79OhxTzFOnz6dxMREWrZsia+vL7t3777l9ebNm/Pee+/Rq1cvfHx86NmzJzExMXfc58iRI5k7dy6tW7dWndTKgy0uXJvp7Z954P80TPoLahWt37EwKRnZzFofxqB5f3MlMY3PH/fl1wntaVy99OszqXLfSqlT/y6KxTMaIWgRbJ+pFdYb/DU07XNfu5RSsv54NO9tDOf6zUzGBNbj1d5NcXGwKaGgC6bKfSuKopSUlBhY9zyc2wmNe8PgeeBc7b52ee7aTWauC2VfRDw+tV34dpw/PrVdSybe+6AShKIoSlGF/wHrX9KGwPf/r9asdB/DV9OzDHy9O4KFf53D3kbHu0NaMjqgLjqrou0z+Y8/iPv8C/QxMVjXrEm1l6fhMrDkKsKqBKEoinI3mTdhyxtwdCnU9IVhS8CjyX3tcmf4Vd5ZH0ZUYjrDWtfirX7eeFSyK/L2yX/8QcyMmciMDAD00dHEzNAG1ZRUklAJQlEU5U4uH4LVE7Q7ozu+Al3eAmvbYu/uSlI6s9eHse3kVRpVc+bXCe15qKH7Pe0jOzaW2Pc/yE0OOWRGBnGff6EShKIoilkZ9LD3U/jzE6hcC8ZvgnoPF3t3WXoj3/59ga92ngXgjT7NeKZjfWyt7z6YVB8fT1pQEKkHDpJ24ABZdxhqrr/LKMN7oRKEoihKfvHnYPVEuBIMPo9Dv7lg71Ls3R04H8+MtaGcjbtJr+bVmTmwObXdHAtd35CSQlpwMKkHDpB24CCZZ84AYOXkhGO7driOGkn8t99huHbttm2ta5bcxEPqPggzK41y33nvzA4ODqZLly53jCk6OlrVSVKUgkgJR36GBZ0g/iw8+i0MW1Ts5HDtRiav/HaMkYsOkJ5t4Ntx/iwa639bcjCmpXFz79/EffopF4Y/xpn2DxH13PMkrfgd66rueLz8Ml6/LafJwQPUWTAf96eeovrrryHs7W/Zj7C3p9rL04r77m+jriDy2Hh+I18e+ZLY1FhqONVgapup9G/Qv9j7K61y33FxcWzevJm+ffsWaX1PT09Wrlx5z8dRlAdaWoJWwTn8D/DqBEMXaGX/i8FglPwSdIm5W06Rnm3gha6NeL5rIxxstburjVlZpB87RtqBg6QePEh6SAhkZ4ONDQ6+PlSdMgWn9oHY+/piZVtwf0dOP4MaxVQKNp7fyKz9s8gwaJ0+MakxzNo/C6DYSaK0yn2/9tprvPfee7cliMjISJ588klSU1MBmDdvHg8//DCRkZEMGDCA0NBQAgMD+e6772jRogWgXZF89tlnNGvWjBdffJETJ06g1+uZNWsWgwcPvufYFMUiROyEtc9BWjz0nAMPvVjsaUBDopKYvjaUkKhkHm7ozpzBLWlYxZ6M0BNcPxhE2sEDpB0+gszMBCsr7Fu0wP2pcTgGtsexTWusHAtvesrPZeDAEk0I+VWYBPFx0MecSjhV6Osh10LIMmbdsizDkMHMfTNZeabgb9vNqjTjjYA3Ct1naZX7zmm22r17N5Uq/Xs7frVq1di+fTv29vacPXuWUaNG3dZMNXLkSFasWMHs2bOJiYkhOjqatm3b8p///Idu3brx3XffkZSUREBAAD169MDJyemeYlOUci07HXbMgoMLwKMZPPE71PQp1q6S07P5dOtplh68iIeTDQvaO9MuMZy0GT9wJjgYo+mLml2TJrg+PgKn9u1x9PdHV7lyCb6hklVhEsTd5E8Od1teHOYq9w1aTaX33nvvljpP2dnZvPDCCxw7dgydTscZU0dXXiNGjKBnz57Mnj2bFStW8NhjjwGwbds
21q9fz6effgpARkYGly5dUiUylAdH7AlYNQGuhUPAJOg5+/aCnUUgpWTt0Si++/VPvC6dZKExhrqXwpFLk4kDbL28qDxwgJYQAgKwrlKl5N+LmVSYBHGnb/oAvVb2Iib19uFhNZ1q8n2f74t1zNIs992tWzdmzJjBgQMHcpd9/vnnVK9enePHj2M0GrHP16EFUKtWLdzd3QkJCeG3335j4cKFgPahX7VqFU2bltzkI4pSLhiNcOBrbXphBzd4YhU0vrcilwBZUVFc2LaHY+t3UivyJJ9kpADaKCKnbt1wah+IY2AgNjVqlPQ7KDVqFJPJ1DZTsdfdegK119kztc3UYu+ztMt9v/3223zyySe5z5OTk6lZsyZWVlb8/PPPuVOF5jdy5Eg++eQTkpOTadWqFQC9e/fmf//7HznFHI8ePXrXuBWl3Eu+Aj8Phm3ToVFPmLK/yMkh+2ocyX/8QfTbb3OmWw/O9eiJ8ZP3qRsZhpVvG6rPnk3DbVtptGsnnh9+gMvgwRadHMDMVxBCiD7Al4AOWCKl/Cjf63bAT0BbIB54XEoZKYSwAZYAbUwx/iSlvH3GmhKU0xFdkqOYcsp9v/zyy3zyySd4eHjg5OR0T+W+jUYjzs7Odyz3naNfv354eHjkPn/uued49NFH+f333+natWuh/QfDhw9n6tSpzJgxI3fZjBkzmDZtGj4+Pkgp8fLyuu1KR1EsSuhq2DBNuwFu0P+g9ZN3rKOkT0wkLegQaQcPkHrgIFnnzwNgdHLmmHtDDvi0o/ojHZk0tjselW6/On8QmK3ctxBCB5wBegJRwCFglJTyZJ51ngN8pJSThRAjgaFSyseFEKOBQVLKkUIIR+Ak0EVKGVnY8VS5b8uh/l2UUpWRApteg5DlUKstDFsM7g1vW81w8yZphw7lDj3NPKUNahGOjjj6tyXbpw3fprqzPMGexjVceHdISwLqW05/QmHKqtx3ABAhpTxvCmI5MBjtZJ9jMDDL9HglME9oPbEScBJCWAMOQBaQYsZYFUV5EF38B9ZMhOQoeOQN6Pwa6LT5FYzp6aQfPUrqgYOkHjxARmgYGAwIW1sc2rTBY9pUHAMDsWrmzbcHovjfrrNYCcGb/RszvkN9bHQPfgu9ORNELSDvJM9RQGBh60gp9UKIZMAdLVkMBmIAR+BlKWVC/gMIISYCEwHq1q1b0vErimKpDNmw5yP4+7/gWhee3oqs7kf60eOkHjxI2oGDpB87hszOBmtrHFq1wn3iBJwC2+PQ2g8rO62q6v6I60z/5gDnr6XSt2UNZgxojqfrvY90slTmTBAFNe7lb88qbJ0AwAB4Am7AXiHEjpyrkdwVpVwELAKtiem+I1YUxfJdPwurJyCjjpLhMYhU60DSZi0i7cgRZHo6CIG9tzduTz6JU/tAHNq0Red8a/9cXEoG728KZ92xaOpWceT78e3o2vT+JgWyROZMEFFA3mE3tYHoQtaJMjUnuQAJwGhgi5QyG4gTQuwD/IHzKIqiFEAaDGSu/5S0Vd+QetWOtPgGGNOCgWDsGjfC9dFHtaGn/v7oXF0L3IfBKPn5n0g+23aGTL2Rl7o35rkuDbG30ZXqeykvzJkgDgGNhRD1gSvASLQTf17rgXHAP8BwYJeUUgohLgHdhBBL0ZqY2gNfmDFWRVEsjJSSrMhI0g4eJHXfX6Tt24shTQ84YlPbk8r9O+DYPhCngACs84zuK8zRS4lMXxtKWHQKnRpXZc7gltSvWrErB5gtQZj6FF4AtqINc/1OShkmhJgDBEsp1wPfAj8LISLQrhxGmjb/GvgeCEVrhvpeShlirlgVRbEM2dHR2pwIpqGn+qtXAbB2lDhXz8KxW3+cRr6BTZ2iF9lLSsvik62n+TXoEh7Odswb3Zr+rWresXJBRWHW+yCklJuATfmWzczzOAN4rIDtbha03NLEx8fTvXt3AGJjY9HpdHh4eBAZGYmnpycnT568yx7u3QcffMB//vOfEt+vopQF/fX
ruZ3KqQcPkn3pEgA6NzccA/xxco7GKWMXNg2aIoYvgeotirxvKSWrjlzhw03hJKVn83SH+kzr0ZhK9jbmejslrqQrUOdXYUptFEVJTwDu7u7OsWPHAJg1axbOzs68+uqrudVU70av12NtfW//RCpBKJbMkJREau69CAfIitCqG1s5O+MYEECVMU/gGNgeO6ebiLWT4PoZ6PwCdJsBNkW/We107A1mrA0lKDKBNnVd+XlIK5p7lt+ieQUxRwXq/FSCMCmNCcDzMhgMTJgwgf3791OrVi3WrVuHg4MDXbp04eGHH2bfvn0MGjSIEydOMGDAgNwJfpydnbl58yYxMTE8/vjjpKSkoNfrmT9/Phs3biQ9PR0/Pz9atGjBsmXLSjxuRSlJhpuppB85nDuVZkZ4OEiJcHDAsW1bXAYPxql9e+y9vRHW1mA0wL4vYff74FQNxq6DBl2KfLzUTD1f7jzLt39foJK9NR8/2orH2tbBysrympO+PPJlbnLIkWHI4MsjX6oEca9iP/iAzPDCy32nHz+OzLq1cqvMyCDm7ekkrfi9wG3svJtRo5jf1s+ePcuvv/7K4sWLGTFiBKtWrWLMmDEAJCUl8eeffwLw1FNPFbj9L7/8Qu/evXn77bcxGAykpaXRqVMn5s2bl3vVoijljTEjg/Rjx7SpNA8GkX7iBOj1CBsbHPz8qPrC8zi1b49Dq1aI/BPlJF2CNZPh4j5oPhgGfAGORbuTWUrJ1rBYZv9xkpjkDB73r8MbfZtRxangyXjKOyllgcVFAWJTY0vsOBUmQdxN/uRwt+X3q379+vj5+QHQtm1bIiMjc197/PHH77p9u3btePrpp8nOzmbIkCG5+1KU8kRmZ5N+IjS3Uzn96FHt/5SVFfatWuL+9NPavQitW2PlcIcb0EJWwMb/06YEHbIAfEfesY5SXhfjU3lnfRh7Tl+jWY1KzBvdmrb1LLNEhpSS/dH7mX98fqHr1HAquQKBFSZB3O2b/tlu3dFH579NA6w9Pan3808lHo+d6U5NAJ1OR3p6eu7zvEX1rK2tMRqNgGlYnylhde7cmb/++ouNGzfy5JNP8tprrzF27NgSj1NR7oU0GMg4dSq3DyEt+DDSVMHYztsbt9GjcQwM0O5FyDO5VaHSk7TEELoS6gRq80O7eRUplky9gYV/nufr3RFYWwlmDGjOuIfqYW2BJTKklOyL3sf8Y/MJuR5CTaeaDGk4hC2RW25pZrrfCtT5VZgEcTfVXp52Sx8ElPwE4MXh5eXF4cOHGTFiBOvWrSM7OxvQ5rauVasWEyZMIDU1lSNHjjB27FhsbGzIzs7GxsZyRmIolktKSVZERG49o7SgQxhTtLJptg0a4DpksDaVZkA7rN3c7m3nF/ZqTUo3YqDrdOj4MuiKdsrae/YaM9eFceF6Kv19ajKjf3NquFhexVUpJXuv7GXB8QWcuH4CTydPZj40kyENh2Cjs6G9Z3s1iqk0lMYE4MUxYcIEBg8eTEBAAN27d8+9utizZw9z587FxsYGZ2dnfvpJu8qZOHEiPj4+tGnTRnVSKyVOSkn25ctaH8KBg6QGBWG4fh0Am1q1qNSzh2nmtEBsqhezNIU+U+uE3vcVVGkAz2yH2m2LtOnVlAzmbDjJxpAYvNwd+enpADo3uftNcuVNTmKYf2w+ofGh1HKuxayHZjGo4SBsdP9++evfoH+JJoT8zFbuu7Spct+WQ/27WJbs2FjtbmXTVYI+WusctfbwwLF9+9yZ02xrF/3mtELFnYLVz2rTgbZ9Cnq9D3bOd91MbzDy4z8X+Xz7GbIMRp7v0ohJjzSwuBIZUkr+jPqTBccXEBYfRi3nWkz0mcjAhgOxsTJPq0BZlftWFMUC6ePjSQsKyh16mnXxIgA6FxccAwNxfPZZnNq3x7Z+/ZK721hKCFoM22eArROM/BWa9SvSpocvJvD2mlBOxd7gkSYezBncgnrullUiQ0rJnst7mH98PuEJ4dR2rs2ch+cwoOE
AsyWGolAJQlEqOENKCmnBwbnNRplnzgBg5eSEo78/riNH4tQ+ELumTRF3mdWwWG5chXXPQcQObRrQwV9Dpep33SwxNYuPt5xi+aHL1HSxZ8GYNvRuUcOiSmRIKdl9eTcLji8gPCGcOpXq8G6Hd+nfoH+ZJoYcD3yCkFJa1AfmQfegNGlaMmNaGmmHj2hDTw8GkREWBkYjws4Ox7ZtqNz/ZZzaB2LfooV2c5o5ndoI61+ErFTo9ym0e/auw1eNRsnvhy/z0eZTpGTomdi5AVO7N8bJznJOZ0ZpZPel3SwIWcCphFPUrVSX9zq8R/8G/bG2Kj/vo/xEYgb29vbEx8fj7u6ukkQ5IKUkPj4ee3vLG01iyYxZWaQfO5Zbzyg9JARyJsrx9aXq5Mk4tg/Ewc8Pq/w3p5lL5k3Y+hYc+Qlq+MCjS8Cj6V03C49JYfraUA5fTKSdlxvvDmlJsxqWUyLDKI3surSL+cfncybxDPUq1+ODjh/Qt37fcpUYcpS/iEpQ7dq1iYqK4tq1a2UdimJib29P7ZLozFQKJfV6MsLCcqueph0+gszM1G5Oa9EC96fG4RgQiGPbNlg5OpZ+gFGHtY7ohAvQYRp0fRus75yYbmbq+Xz7GX7YH4mLgw1zh/vwaJvaFlMiwyiN7Li4gwUhCzibeBavyl7lOjHkKL+RlQAbGxvq169f1mEoillJo5HM06dzO5XTgoMxpqYCYNekCa6Pj9CGnvr7o6tcht+2DXptCtA9H0GlmvDUBvDqeMdNpJRsOhHLnA1hxN3IZFRAXV7v3RRXR8sokWGURrZf3M6C4wuISIrAq7IXH3X6iD5efdBZlf8RVg90gjCnkq78qihFJaUk68KF3E7ltIMHMSQnA2Bbrx6VBwzQhp4GBGDt7l7G0ZokXIA1k+DyQWg5HPp/Bg6ud9zkwvVUZq4LZe/Z67TwrMz8MW1pU/ceb7YrIwajge0Xt7MwZCERSRE0cGnAx50+prdXb4tIDDlUgiiG0q78qihZUVdy6xmlHTiA3tRsal2zJs7duuXei2BTo+Tq8JQIKeHYL7D5dRA6GLYEfO481UtGtoFv9pxjwZ5z2FlbMWtgc8a0t4wSGQajgW0Xt7Hg+ALOJ5+noUtD5naeS896PS0qMeRQCaIY4j7/4paSHKBVfo37/AuVIJQSkX01jrSgg7lXCdlXrgCgc3fHKTBQm0qzfXts6tQpvwMw0hJgwzQ4uQ7qdYChC8C17h032XM6jnfWh3ExPo1Bvp5M7+9Ntcrlf1CDwWhgS+QWFoYs5ELyBRq5NmLuI3PpVa8XVqL8J7bCqARRDPqYgsvs6qOjybp4Edt69Uo5IsXS6RMTSQs6lHuVkHX+PABWlSvjGNCOKk89hVP7QGwbNSq/CSGvc7th7RRIvQ49ZsHDL8EdvkHHJKcz54+TbA6NpYGHE8ueDaRDo6qlF28xGYwGNkduZuHxhUSmRNLItRGfPfIZPer1sOjEkEMliGLQubvn1p/J71yfvjh17kSVMWNw6tDBPDcWKRbPcPMmaTkzpwUFkRkeDoBwdMTRvy2ujz6KY/tA7Js1Q+gsqGkiOwN2zoEDX0PVJjBqOXj6Fb66wcgP+yL5fMcZDEbJq72aMKFzA+ysy/d71hv1bL6wmUUhi4hMiaSxW2P+2+W/dK/b/YFIDDlUgrhH+sREjFlZ2s08eW76Evb2eLz2KsaERBJ/+43LEyZi6+WF2+jRuAwbis757vVklAeXMT2d9KNHc+sZZYSGgcGAsLXFoXVrPKa+hGNgexxatURYaiXeq2GwagLEhUG7CdBzDtgWPoz2UGQC09eEcvrqDbo1q8bsQS2oU6UMht3eA71Rz6YLm1gUsoiLKRdp6taUz7t8Tre63R6oxJDjgS7WV9Kk0cjlyZNJ++cA7s9NIen3lQWOYpJZWaRs3UrC0qVkHA/BytERlyFDcBvzBHYNGpg1RqV8kFlZpIeEkHrwIGkHDpJ+7BgyOxt0Ohx8fHAMDNB
mTvPzw8rSbxw0GuHgfNgxC+xdtVIZTXoVunr8zUw+2nyK3w9HUcvVgXcGNqdn8+rluulMb9Sz8fxGFoUs4tKNSzSr0ozJvpPpWqerxSeGOxXrUwniHlxfuIhrn39OjXdm4jZqVJG2ST9xgsSlS0nZtBmZnY1Thw64jXkC586dLavpQLkjaTCQcfLkv0NPjxxBpqeDENh7e+dWPXVo0xads2UVkrujlGitr+H8HmjSFwb9D5wLLq9tNEqWH7rMx1tOkZqp59lODXipeyMcbctvQ0a2MZsN5zaw+MRiLt+4jHcV79zEUJ4T2r1QCaIEpAYFcemp8VTu0wfPzz695w+HPj6epBUrSPx1Ofq4OGzq1MFt9GhcHx1WtjcvKcUijUYyz0b8O/T00CGMN24AYNuoIU6B7bWRRu3aoXN1LdtgzSVsLfwxFQxZ0PsDrTx3If8vQq8kM31tKMcuJxFYvwrvDWlJ4+pFmFGujOQkhkUhi4i6GYV3FW+e83uOR2o/8sAkhhwqQdwn/fXrXBg6DCsnJ7xWrryvb4AyO5sbO3aQsHQZ6YcPIxwccBk0iCpjnsCuceMSjFopSVJKsiIjtXkRDh4k7WAQhoQEAGzq1tWGngYG4hQYgLWH5U1Qc08yUmDLm3BsGXi2gWGLoWqjAldNycjmv9vO8NM/kVRxsuU//bwZ2rpWuT3JZhuzWR+xnsUnFnPl5hVauLdgiu8UOtfuXG5jvl8qQdwHaTBwecIE0g4fwWvFb9g3vXtBsaLKOHmShKXLSNmwAZmVhWNgIG5jnqBSt26q+akcyI6Ozq1nlHrgIPqrVwGwrlYNp4fa4xjYHqfAAGxq1SrjSEvRpYOwegIkX4ZO/wePvAG62zvVpZSsPx7NexvDuX4zkzGB9Xi1V1NcHMtnB3y2IZt159axOGQx0anRtHRvyRS/KXSq1emBTQw5VIK4D9fmfc31efOo+d67uA4fXuL7B21kVNLvK0n89Vf0MTHYeHriNnoULo8+eu/z+CrFpr9+PbdTOfXgQbIvXQJA5+amXR3kzJzm5fXAnzRuY8iGPz+BvZ+CS23tqqFu+wJXPXftJjPXhbIvIp5WtVx4b0hLfOu4lm68RZRtyGZNxBqWnFhCTGoMPlV9mOw7mY61OlaYf2OVIIopdf9+Lj3zLC6DBlHzow/N/oGRej03du0iceky0oKCEHZ2VB44gCpjxmDfrJlZj10RGZKSSDXdi5AWdJDMsxEAWDk74xgQYEoI7bFr3Khi388Sf067arhyGHxHQ9+Pwf72frP0LANf745g4V/nsLfR8XrvpowOrIeuHFZczTJksTZiLYtPLCY2NRYfDx+m+E6hg2eHCpMYcqgEUQzZV+O4MGwYOjdX6q9YUeplkTNOnyFx2TKS169HZmTg4N+WKmPGUKl7d8sdJ1/GDDdTST9yOLeeUUZ4OEiJsLfHsW3b3PIV9t7e5p8oxxJICUd+hC1vgc4WBn4BLYYWuOquU1eZuS6MqMR0hrWuxVv9vPGoZFe68RZBliGLNWfXsPjEYq6mXcXXw5fnfJ/jIc+HKlxiyKESxD2Sej2XnhpPelgY9X9fgV2jgjvgSoMhOZmkVatJ/OUXsqOisK5eHbdRI3EdMQLrKlXKLC5LYMzMJP3oMVIPakNP00+cAL0eYWODg69v7tBTex+f0psox1KkXof1L8HpjVD/ERgyH1xu72u5kpTO7PVhbDt5lUbVnHl3cEsealhOKsjmkWnIZPXZ1Xx74luupl2ldbXWTPadzEM1K25iyKESxD2K+/wL4hcuxPPjj3AZPLhE9nm/pMHAzT//JHHpUlL3/4OwsaFy//64jRmDQ8sWZR1euSCzs0k/EZrbqZx+9CgyK0ubKKdVS5wCTfcitG6NlYNDWYdbfp3doc0RnZ4I3d+B9s9Bvia2LL2Rb/++wFc7zyKRTO3ehGc61sfWunw1xWUaMll1ZhXfhn5LXFocbaq1YYrfFAJrBFb4xJDjvhOEEMIK8AU
8gXQgTEp5tUSjvE8llSBu/vUXlydOwvWx4dR8990SiKzkZZ47R+KyZSStXYdMS8PBzw+3MWOo3KsnogJ9E5YGAxmnTpk6lQ+QFnwYmZYGgF2zZrlVTx39/dFVKr9j7suN7HTYPhOCFkG15lpHdI2Wt6124Hw8M9aGcjbuJj2bV+edgc2p7Va+SmRk6DNYdXYV3534jrj0ONpWb8sU3ykE1AhQiSGfYicIIURD4A2gB3AWuAbYA02ANGAh8KOU0ljI9n2ALwEdsERK+VG+1+2An4C2QDzwuJQy0vSaj2n/lQEj0E5KeWuN7TxKIkFkx8RwYegwrKtXx+u35eW+BILhxg2S16whYdkysi9eQudRFbfHR+L2+IgHciy+lJKsiAitDyHoIKlBhzDmTJTToEFup7JjQDs1+utexRzX6ihdP61dMXR/B2xu/fxfu5HJh5vCWX30CrXdHJg1sAU9mlcvo4ALlqHPYOWZlXwX+h3X0q/hX92f5/yeo12NdmUdWrl1PwniV2A+sFfmW1EIUQ0YDSRKKX8sYFsdcAboCUQBh4BRUsqTedZ5DvCRUk4WQowEhkopHxdCWANHgCellMeFEO5AkpTSUFis95sgZHY2F58cS+aZM3itWomdBU1VKo1GUv/+m4Sfl5K6dy/Y2FC5d2+qPDkGB1/fsg6v2KSUZF++nFu+IjUoKLeKro2nJ44Ptdem0gwIxKZ6tTKO1kIZDbD/f7DrPXB0h6HzoWG3W1YxGCW/BF1i7pZTpGcbmNi5AS90bYyDbfm5Vyddn87vp3/n+7DvuZ5+nXY12jHFd4pKDEVwpwRxx6EaUspCCw5JKeOAL+6weQAQIaU8bwpiOTAYOJlnncHALNPjlcA8oV3/9QJCpJTHTceKv1Oc9yN36tDoaABcnxhtUckBQFhZ4dy5M86dO5N54QKJv/xK8urVpGzYgH2rVlQZ8wSV+va1iI7Y7NhY7W5lU7ORPlqbe0PnURUnU6eyY/v22NauXcaRPgCSLmt1lCL3gvdAGPgVON468CEkKonpa0MJiUrm4YbuzBnckkbVyk9l4nR9OitOr+D70O+Jz4gnsEYgczvPxb9Ggec75R4VtQ/iMWCLlPKGEGI60AZ4T0p55A7bDAf6SCmfNT1/EgiUUr6QZ51Q0zpRpufngEBgDFqzUzXAA1gupfykgGNMBCYC1K1bt+3FixeL9q5N8k8dClrZ7prvzrH4meEMN1NJXreWxGW/kHX+PDp3d1xHPIbbyJHYVC8/zQL6+HjSgoJMdywfJCsyEgCdiwuOeWZOs61fX7Udl6QTK2HDKyAN2n0Nfk/cUkcpOT2bT7eeZunBi7g72TFjgDeDfD3Lzb9BWnaalhjCvichI4HAmoFM8Z1C2+ptyzo0i1MSndQhUkofIURH4EPgU+A/UsrAO2zzGNA7X4IIkFK+mGedMNM6eRNEADAeeB5oh9bXsROYLqXcWdjxitPEdLZb99wrh7ysPT1pvKvQQ1kUKSWp+/eTuHQZN/fsAZ2OSj17UGXMGBzatCn1//CGlBTSgoNzm40yz5wBwMrREcd27XKHnto1bVqxb04zl/Qk2PQanFgBtQNg2EKo8m8Jeikla49d4f2N4SSkZjH2IS9e6dWEyvbl496btOw0fjv9Gz+E/UBCRgIP1XyIKX5TaF2tdVmHZrGK3cSUR07bf39gvpRynRBi1l22iQLq5HleG8h/Ns5ZJ8rU7+ACJJiW/ymlvG56A5vQrlpK9Kxd6NShhSy3REIInDt0wLlDB7IuXyZx2S8krV7Njc1bsPP2psqYJ6jcv7/ZOuSNaWmkHT5iml/5IBlhYWA0IuzscGjTGo9p07R7EVq0UDcAmlvkPlgzSSvR3eU/Wi0l3b+ngLNXbzB9bSgHLyTgW8eVH8YH0LKWSxkG/K+07DSWn17OD6E/kJiZyMOeDzPFdwp+1fzKOrQHWlGvIDYAV9BGM7VFG+oaJKUstAfUdMI/A3Q3bXsIGC2lDMuzzvN
Aqzyd1MOklCOEEG5oyaAjkAVsAT6XUm4s7HjqCqLojGlpJK//g8RlS8k8G4HO1RXXx4bjNmoUNp6e97fvrCzSjx3LrWeUHhIC2dlgbY2Dr2/u0FMHX1+s7MrfnbYPJH0W7PkA/v4C3Ly04at1/u28TcvS879dESz+6zyOtjre6NuMke3qlosSGanZqfx66ld+CvuJxMxEOtTqwGSfySoxlKCSaGJyBPoAJ6SUZ4UQNdFO7Nvusl0/tI5sHfCdlPJ9IcQcIFhKuV4IYQ/8DLRGu3IYmadTewzwFiCBTVLK1+90rOIkiAe5D6IopJSkHQwicdlSbuzcBUCl7t1xGzMGx4B2CCH+7cQvYOY80O46zwgLy616mnb4CDIzU7s5rXnzf4eetmmNldMDNFGOpbh2BlY/qw1jbTMWen8Idv92Mm8Li2X2Hye5kpTO8La1ebNvM6o6l33izkkMP4b9SFJmEh1rdWSK7xR8PHzKOrQHToncSW36Vl+HPM1Sd+qkLm3FHeZ6txNgRZF95QqJy5eTtOJ3DMnJ2DVujF2rVtzYtOm2BOo+eTJWdnakHThAWnAwxtRUAOyaNPm36qm/PzqX8tE8USFJCYeWwLYZYOOgzfTmPSD35csJacz+I4wd4XE0qe7Me0NaEVC/7Eu33My6yS+nfuGnkz+RnJlM59qdmewzmVYerco6tAdWSVxBvAs8BZxD+0YPIKWU3QrdqJSVxpSjFYExI4OUjRtJWLqMzPDwO65rW69ebqeyY0AA1u7lrwZPhXQzDtY9D2e3QcPuMOQbqFQD0EpkLN57nv/tOouVEEzr0ZjxHepjoyvbAQE3sm7wS7iWGFKyUnik9iNM9p1My6q338mtlKyS6KQeATSUUmaVXFhKeWRlb4/ro4/iMmwYp7ybF7peo927sKlZsxQjU4rk9GZY9wJk3oC+n0C7Cbl1lPZHXGfGulDOXUulT4sazBzYHE/Xsq1JdSPrBkvDl/LzyZ+5kXWDLrW7MNl3Mi2qqvpi5UFRE0Qo4ArEmS8UpTwRQmDt6VloJ75KDuVMVipsfRsOfw/VW8FTG6CaNwBxNzJ4f2M4645FU7eKI98/1Y6uzcr2zvOUrBSWnVzGz+FaYuhapyuTfSfT3L3wLyVK6StqgvgQOGq6sS0zZ6GUcpBZolLKhWovTyuwE7/ay9PKLijldleOaBP6xJ+Dh1+CbtPB2g6DUbL0wEU+3XqaTL2Rl7o35rkuDbG3KbsSGcmZySwNX8qyk8u4kX2DbnW6Mdl3Mt7u3mUWk1K4oiaIH4GPgRNohfOUCiCns1514pdTRgP8/V/Y8xE4V4dx66F+ZwCOXU7i7TUnCItOoWOjqswZ3IIGHmVXIiM5M5mfT/7MsvBl3My+SY+6PZjkO4lmVdRMieVZURPEdSnlV2aNRCmXXAYOVAmhPEqMhNWT4PIBaDEMBvwXHNxITsvm462n+DXoEh7Odswb3Zr+rWqWWYmM5Mxkfjr5E8vCl5GanUrPej2Z5DOJplWalkk8yr0paoI4LIT4EFjPrU1M5WaYq6JUCFJCyG+w8VWtdtLQReAzAgmsOhzFh5vCSUzLYvzD9Xm5Z2MqlVGJjKSMJH46+RO/nPqF1OxUetXrxSTfSTRxa1Im8SjFU9QEkVPopH2eZRIoN8NcFeWBl5YAG1+BsDVQ92EYugDc6nE69gYz1oYSFJlAm7qu/PRMAC08y+YelMSMRC0xhP9Cuj6dXl69mOQzicZujcskHuX+FClBSCm7mjsQRVHu4PyfsGYypMZB95nQYRqp2ZKvNoXz7d8XcLa35uNHW/FY2zpYlUGJjISMBH4M+5FfT/1Khj6D3l69meQziUZuZTefu3L/7pggTOUufrnDjHENgZpSyr/NEZyiVHj6TNg5B/6ZB+6NYOR2pGdrtppKZMQkZ/C4fx3e6NuMKk6lP99HQkYCP4T9wPJTy8nQZ9Cnfh8m+UyioWvDUo9FKXl3u4JwRxveehg4zL9TjjYCHgG
uA2+aNUJFqaiuntSGr14NBf9noNe7XLoheOeHQ+w+fY1mNSrxv1Gt8fcq/RIZ8enx/BD2A7+d/o1MQyZ9vLTE0MC1wd03VizG3WaU+1IIMQ+tr6ED4INWyTUcbTrQS+YPUVEqGKMRghbC9nfArhKM+o3Mhj1Z+Od5vt4dgbWVYHp/b5562AvrUi6RcT39Oj+E/sCKMyvINGTSr34/JvpMpL6LZc3CqBTNXfsgTPNAbzf9KIpiTikxsO45OLcLGveGwfPYGyOY+cVeLlxPpX+rmswY0JwaLuaZv6Mw19Ov813od/x++neyjFn0r9+fiT4T8XLxKtU4lNJV1FFMiqKYW/gfsP4lyE6H/v/lapPRvLs+nA0hMXi5O/Lj0wE80sSjVEO6lnZNSwxnfkdv1NO/gZYY6lWuV6pxKGVDJQhFKWuZN2DLm3B0KdT0Qz9kET+eteXz//5FlsHIyz2aMOmRBqVaIiMuLY7vQr9j5ZmV6I16BjQYwESfidStXLfUYlDKnkoQilKWLh/SOqITI6HT/3Gk/iTeXn6G8JgUOjfxYM6gFnhVLb2Jlq6mXs1NDAZpYFDDQUxoNYE6levcfWPlgVOkBCGEqA58AHhKKfsKIZoDD0kpvzVrdIryoDLo4a+52k/lWtwYtY73Q91YviiYGpXtmf9EG/q0rFFqJTKupl7l29BvWXVmFUZpZFCjQTzb6lnqVFKJoSIr6hXED8D3wNum52eA3wCVIBTlXsWfg9UT4Uow0udx1taYxpzfokjJiGJCp/pM7dEEZ7vSubiPTY1lyYklrD67GiklgxsN5tlWz1K7Uu1SOb5SvhX1U1hVSrlCCPEWgJRSL4QwmDEuRXnwSKn1M2x+A3TWRHX/mqmhDTgcdAH/em68N7QlzWpULpVQ8ieGIY2H8GyrZ6nlXKtUjq9YhqImiFQhhDum6UaFEO2BZLNFpSgPmtR4+OMlOLUBfb2OfO3yKl9tTsfFIZW5w314tE3tUimREXMzRksMEasBGNpoKM+2ehZPZ0+zH1uxPEVNEK+gVXJtKITYB3gAw80WlaI8SCJ2wtrnkGnxnGr5GuNPBxB7Oo1RAXV5vXdT3EqhREb0zWgWn1jM2oi1ADza+FGeafkMNZ3VzIBK4YparO+IEOIRoCkggNNSymyzRqYoli47HXbMgoMLyKrShNlO77As2IXmNR345sl2tKnrZvYQrty8wuKQxayLWIcQgkcbP8qzrZ6lhlMNsx9bsXxFHcWkA/oBXqZtegkhkFL+14yxKYrlij0BqybAtXCO1nycsZf7I3X2vDOwCU+2r2f2EhmXb1xmyYklrI9YjxCC4U2G80yrZ1RiUO5JUZuY/gAyUFOOKsqdGY1a5dVd75Jp48Lbdu+w8kJTBvp6MqO/N9Uqm7dExuUbl1kcspj159ajEzpGNB3B0y2fprpTdbMeV3kwFTVB1JZS+pg1EkWxdMlR2pwNkXs55tSB8fFP4la1JkufaUnHxlXNeuhLKZdYFLKIDec3oBM6RjYbydMtn6aaYzWzHld5sBU1QWwWQvSSUm4zazSKYqlCVyM3TEOfncVs4yR+T+7Ci70aM6FzA+yszVci41LKJRaGLGTj+Y1YW1kzqtkoxrccrxKDUiKKmiAOAGuEEFZANlpHtZRSls6gbUUprzKSYdPrELKcU7qmTE6bRMOmPuwY1II6VRzNdtiLKRdzrxhsrWwZ7T2a8S3G4+FYusX8lAdbURPEZ8BDwAkppTRjPIpiOS7+g2HVBETKFb7SD2OV7Uimj/GlV/PqZiuRcSH5AotCFrHpwiZsrWwZ4z2G8S3HU9XBvE1YSsVU1ARxFghVyUFRAEM2cveHyL8/JxoPXsl+hzYde7O1e2Mcbc1TIuN88nkWhSxi84XN2OnsGNt8LONajFOJQTGron6aY4A9QojNQGbOQjXMValwrp8lbfnTOF4PYYW+CxtrvcT7wwJoUr2SWQ53Puk8C0IWsOXCFuyt7RnXfBzjWozD3cH
dLMdTlLyKmiAumH5sTT+KUrFIScaBJVhtn06mwZqZuld5eNh4fmxdyyzNSeeSzrHw+EK2RGqJYXzL8YxrMY4q9qU//7RScRX1TurZxdm5EKIP8CWgA5ZIKT/K97od8BPQFogHHpdSRuZ5vS5wEpglpfy0ODEoyv2SN+O4unQCNWL3sNfYiv2t3mVG/464ONqU+LHOJp5lYchCtkVuw8HagadbPs24FuNwszf/XdeKkt8dE4QQYp6U8gUhxB+YCvXlJaUcdIdtdcDXQE8gCjgkhFgvpTyZZ7VngEQpZSMhxEjgY+DxPK9/Dmwu8rtRlBIWc2gtjpun4mZIZZHTRAJHvsUbdUv+W/yZxDMsPL6QbRe34WjtyDOtnmFs87EqMShl6m5XEGOBF4DifHsPACKklOcBhBDLgcFoVwQ5BgOzTI9XAvOEEEJKKYUQQ4DzQGoxjq0o9yUj7QanfpyK39VVnKYepzss4ZkePdCVcMXV0wmnWRiykO0Xt+Nk48SEVhMY23wsrvauJXocRSmOuyWIcwBSyj+Lse9awOU8z6OAwMLWMc0xkQy4CyHSgTfQrj5eLewAQoiJwESAunXVXLlKyTi0fyfVtr+In7zC7iqP0+LJuQxycynRY5xOOM2C4wvYcWkHzjbOTPSZyNjmY3GxK9njKMr9uFuC8BBCvFLYi3cZxVTQV638zVSFrTMb+FxKefNOHYBSykXAIgB/f381BFe5L1cSbnLw55kMTPiBJCtXwnr8TNeOhbaiFsuphFMsOL6AnZd24mzjzGTfyYzxHqMSg1Iu3S1B6ABnCj6R300UkHdC29pAdCHrRAkhrAEXIAHtSmO4EOITwBUwCiEypJTzihGHotxRlt7Iip37aLb/VYaJU0RU60HdsYvwqFRyQ0nD48OZf3w+uy/vppJNJab4TuEJ7ydUYlDKtbsliBgp5Zxi7vsQ0FgIUR+4AowERudbZz0wDvgHbQKiXaab8TrlrCCEmAXcVMlBMYcD5+PZ/fs8nk+bj42VIL7HlzR6eByU0NDVsPgwFhxfwJ7Le6hkW4nn/J7jCe8nqGyrqtQo5d/dEkSx/5eY+hReALaiXYl8J6UME0LMAYKllOuBb4GfhRARaFcOI4t7PEW5F9dvZvL5H0EEnPyAt3T7SfJog8MT3+Pg5lUi+w+7Hsb84/P5M+pPKttW5nm/53nC+wkq2ZrnhjpFMQdxp+oZQogqUsqEUoyn2Pz9/WVwcHBZh6GUcwaj5JegS+zesor35DyqWyVh7PwmNp1fAd39l8kIvR7K/OPz+SvqLyrbVmZci3GMbjYaZ1vnEoheUUqeEOKwlNK/oNfu+D/CUpKDohTFiahkZq05Qs+rS1hivRG9qxe6Eb+jq9X2vvcdci2E+cfn8/eVv3Gxc+Gl1i8xqtkolRgUi2aeymKKUo4kp2fz2bbT/HNwH/Ns59PU+gKy7Xhse78Ptk73te/j144z//h89l3Zh6udK1PbTGVUs1E42dzffhWlPFAJQnlgSSlZe+wK7284Sf+MDWyy+xWdfSUY/CuiWb/72vexuGPMPz6f/dH7cbNzY1qbaYxsNlIlBuWBohKE8kCKiLvB9LWhnDt/joWVvqOtzWFo1AsGzYNKxZ+f+WjcUeYfm88/Mf/gZufGy21fZmTTkTjamG9yIEUpKypBKA+UtCw9/9sVwZK95+lrc4QfKi3BTqZDv0+h3bPFHr565OoR5h+fz4GYA1Sxr8L/tf0/RjQdoRKD8kBTCUJ5YGw/eZVZ68NITErk5xqraZ+0Aar6wrDF4NG0WPsMjg1mwfEFHIw9SBX7Krzq/yqPNXlMJQalQlAJQrF4lxPSmP1HGDvC4xjkfoVPPOZhn3QJOr4MXf4D1vc+hcmh2EMsOL6AoNgg3O3dec3/NR5r+hgO1g5meAeKUj6pBKFYrCy9kcV7z/O/XWexxsjvzf7C/+JiRGVPeGoDeHW8530eij3EN8e+IfhqMFUdqvJ6u9cZ3mS4SgxKhaQ
ShGKR9kdcZ8a6UM5dS2VMEyMzs7/ANjIYWj2m9Tc4uBZ5X1JKgmKDmH98PoevHsbDwYM32r3B8CbDsbe2N9+bUJRyTiUIxaLE3cjgg43hrD0WTR03ezZ3voT3sfdA6GDYEvB5rMj7klJyMPYg84/N50jcEao5VOPNgDd5tPGjKjEoCipBKBbCYJQsPXCRT7eeJlNv5PVOVZl0Yx66oPVQryMMXQCude6+I7TE8E/MPyw4voCjcUep5liNtwLe4tEmj2KnszPzO1EUy6EShFLuHbucxPS1Jwi9kkLHRlWZ2yaemrvHQep16DEbHn4RrHR33Y+Ukn+i/2H+8fkcu3aM6o7VeTvwbYY2HqoSg6IUQCUIpdxKTsvm462n+DXoEh7Odnw9ojn9ri5ErJ8PVZvC6N+gpu9d9yOlZH/0fr45/g0h10Ko4VSD6YHTGdp4KLa6ex/hpCgVhUoQSrkjpWTVkSt8uCmcxLQsxj9cn//zzcZpwxMQFwYBE7UrB9s734sgpeTvK3+z4PgCQq6HUNOpJjPaz2BIoyEqMShKEagEoZQrp2NvMGNtKEGRCbSu68pPT/vT4tIv8OMssHeFJ1ZC45533IeUkr1X9rLg+AJOXD+Bp5MnMx+ayZCGQ7DR2ZTK+1CUB4FKEEq5kJqp56udZ/n27ws421vz0bBWjGiiw2rdOLjwJzTtB4P+B05VC92HlJK/ov5i/vH5hMWHUcu5Fu889A6DGw5WiUFRikElCKVMSSnZGhbL7D9OEpOcwQj/2rzZ15sqkZtgwVQwZMHAL6FN4dOASin5M+pP5h+fz8n4k9RyrsXsh2czsOFAbKxUYlCU4lIJQikzl+LTeGd9KLtPX6NZjUr8b1Rr/GtYw5ZpcGwZeLaBR5eAe8MCt5dSsvvybhYcX0B4Qji1nWsz5+E5DGg4QCUGRSkBKkEopS5Tb2Dhn+f5encE1laC6f29GfewFzZXgmDBREi+DJ1fh0dehwKahqSU7Lq8iwXHF3Aq4RR1KtXh3Q7v0r9Bf5UYFKUEqQShlKq9Z68xc10YF66n0r9VTaYP8KamszX8+QHs/Qxc6sD4zVC3/W3bGqWRXZe0xHA68TR1K9XlvQ7v0b9Bf6yt1EdZUUqa+l+llIqrKRm8u+EkG0Ji8HJ35MenA3ikiQfEn4PvJsCVw+D3BPT5COwr37KtURrZeWknC44v4EziGepVrscHHT+gb/2+KjEoihmp/12KWekNRn765yL/3X6GLIORaT0aM/mRhthbW8HhH2DLW6Czhcd+hBZDbtnWKI3suLiDBSELOJt4Fq/KXioxKEopUv/LFLM5fDGR6WtDCY9JoXMTD+YMaoFXVSetRMbKl+D0Rqj/iFZHqbJn7nZGaWTbxW0sPL6QiKQIvCp78VGnj+jj1QddEUpqKIpSMlSCUEpcYmoWH285xfJDl6lR2Z5vnmhD35Y1EELA2e2w9jnISILeH0DgFLCyAsBgNLD94nYWHF/AueRz1Hepz8edPqa3V2+VGBSlDKgEoZQYo1Gy8nAUH24OJyVDz4RO9ZnaownOdtaQnQ7bZ0LQIqjWHJ5cAzVaAlpi2Bq5lYUhCzmffJ6GLg35pPMn9KrXSyUGRSlDKkEoJSI8JoXpa0M5fDER/3puvDe0Jc1qmDqbY47Dqglw/TS0fx66zwQbewxGA1sit7AwZCEXki/QyLURcx+ZS696vbASVmX7hhRFUQlCuT83M/V8sf0M3++PpLK9NZ8M92F4m9pYWQkwGmD//2DXe1qJjCfXQMNu6I16Np/7g0Uhi4hMiaSRayM+feRTetbrqRKDopQjKkEoxSKlZNOJWOZsCONqSiajAurweu9muDmZqqQmXYY1k+Hi3+A9CAZ+id6+MpvP/cHCkIVcTLlIY7fG/LfLf+let7tKDIpSDqkEodyzyOupzFwfxl9nrtG8ZmXmj2lLm7pu/65wYiVseAWkAQZ/g95nBBsvbGJRyCIu3bhEU7emfN7lc7rV7aYSg6K
UYypBKHe09ugV5m49TXRSOjVd7GlVy4XdZ65hq7PinYHNebJ9Pax1ppN8ehJseg1OrIDaAeiHzGdDUiiL1g3m8o3LNKvSjC+6fkHXOl1VYlAUC6AShFKotUev8NbqE6RnGwCITs4gOjmD1nVcWfBkW6pXtv935ci/tSallGiyu7zFBs/GLPrzRaJuRuFdxZsvu35J1zpdtaGuiqJYBLN+jRNC9BFCnBZCRAgh3izgdTshxG+m1w8KIbxMy3sKIQ4LIU6YfnczZ5xKweZuPZ2bHPKKu5H5b3LQZ8GOWfDDALJ11qzuO52B8XuY+c8sKtlW4quuX/HbgN/oVrebSg6KYmHMdgUhhNABXwM9gSjgkBBivZTyZJ7VngESpZSNhBAjgY+Bx4HrwEApZbQQoiWwFahlrliV28XfzORKUnqBr0XnLL92BlY/S3bMcda36MlikcKV8O9p7t6ctwLeonPtziopKIoFM2cTUwAQIaU8DyCEWA4MBvImiMHALNPjlcA8IYSQUh7Ns04YYC+EsJNSZpoxXgXIyDbw7d8XWLDnXKHreLrYQ9BisrfNYG3lSixp6kN02mlaurfkP+2n06lWJ5UYFOUBYM4EUQu4nOd5FBBY2DpSSr0QIhlwR7uCyPEocFQlB/MyGiVrjl7hs22niU7OoId3Ndp5ufHFjohbmplq29zgV9f5rNh7lCV1ahKDnlaV6zDd9wM61uqoEoOiPEDMmSAKOlPIe1lHCNECrdmpV4EHEGIiMBGgbt26xYtS4e+z1/lgUzgnY1JoVcuFz0b48VBDdwAy475hXfJWrlkLPPRG2mdm87StDbGOVfCp2pyZflPo4NlBJQZFeQCZM0FEAXXyPK8NRBeyTpQQwhpwARIAhBC1gTXAWCllge0dUspFwCIAf3///MlHuYtTsSl8uOkUf565Ri1XB74c6cdAH0/tLmhg454Z/HxzGxk22liGOBsd62101NFVYmHXT3nI8yGVGBTlAWbOBHEIaCyEqA9cAUYCo/Otsx4YB/wDDAd2SSmlEMIV2Ai8JaXcZ8YYIWQF7JwDyVHgUlurE+Qzwnzb3Q8pQZ+hFb675XcG6NO157e9pv2WWWmkZaeSlJVCXMYNwq/HE5t2gxbWenzrgd7GyKFDenYcMpKCkWQhuWCtw2h1ewLQZ6XwcK2HzfteFUUpc2ZLEKY+hRfQRiDpgO+klGFCiDlAsJRyPfAt8LMQIgLtymGkafMXgEbADCHEDNOyXlLKuBINMmQF/PGSdhIFbS7kP17Sagi1Gl74didWwsaXb91u/Ytw4yo07JLnhF3Y78JP5HdcV5+BBG4KQbLOimQrK5KtdP8+zl1mRbJOl+exFSlWVujzftuvbPoxccAKF2Gj/ehsaWBlz7mM/Bd8mlh1j5uiVAhCygejZcbf318GBwff20aft2SjPp4v3VyJtdZRQ29gamIS/VPT7rrpRifHYm2Xw6iz46atPcm2jiTb2JFsY0uyzkY7seec6IUkGUkyRpKlnmSZTYoxC8NtXTn/ctTZ42LrjKutC5XtXHCxc6OSnSvRiVYEncvkZpotbevUYmyAN009quNq70pl28rY6mxv21ev71oSo7v9CqKmQbLt6dAiv1dFUcovIcRhKaV/Qa9V6DupN+oTmFW1ChmmCWtibKyZVbUKAP0DXyl8u4P/vW27d6pW4ZK1Na3av0SyNJBkzCLFmEWyIZNkQzrJ+jSSs1NJzr5BctYNUrJSMEqjaY9GIMP0A0hwtnLGxc6FyraVcbFzoYadCy62LrjY5fkxPXe1c6WyXWVcbF2w0dnkximlZGd4HB9tOUVE3E3aebnxn6HetM5bN+kOpjYYyqwLa8jI08xkb5RMbTC0aH9gRVEsWoW+gijsGzJSYm/tUOh2Gfp0KGLnbCWbStrJ23RCzz2Z5zvBu9i55J7kK9tVxsbK5u47v4OQqCTe3xjOwQsJ1K/qxJt9m9GrefV77lTeuGcGX55fQ6wV1DBqSaN/l3f
vKzZFUcoPdQVRiNiCkgOAEIxqNqrQ7b4P+77Q137u+3PuSb+SbSWsrUr3T3w5IY25W0+z/ng07k62zBncglEBdbHRFa/joH+Xd1VCUJQKqkIniBpONYlJjblteU2nmrziX3gT05bILYVu51fNryRDLLLktGy+3hPBD/siEQKe79qQyY80pJL9/V2JKIpScVXo8ShT20zFXmd/yzJ7nT1T20w1y3bmkKk3sGTveTrP3c3ivecZ5OfJnte68FrvZio5KIpyXyr0FUT/Bv0B+PLIl8SmxlLDqQZT20zNXV7S25UkKSUbQmL4ZOspLiek06lxVd7q601zz8p331hRFKUIKnQntaUKupDA+5vCOX45iWY1KvFWP28eaeJR1mEpimKBVCf1A+LctZt8vPkU205epXplOz4Z7sOjbWqjK+BuZ0VRlPulEoQFuH4zky93nOWXoEvYW1vxaq8mPNOxAQ62urIOTVGUB5hKEOVYepaB7/ZdYP6ec6RnGxgVUIep3ZvgUcmurENTFKUCUAmiHDIYJauPRPHZtjPEpmTQs3l13ujTjEbVnMs6NEVRKhCVIMqZv85c44NN4ZyKvYFvbRe+HOlHYAP3sg5LUZQKSCWIciI8JoUPNoWz9+x1ars58NWo1gxoVTN3bgZFUZTSphJEGYtNzuCzbadZeSSKyvY2TO/vzZMP1cPOWnVAK4pStlSCKCM3M/Us2HOOJX+fx2iEZzvW5/mujXB1vL3stqIoSllQCaKUZRuMLA+6xBc7zhKfmsVAX09e792UOlUcyzo0RVGUW6gEUUqklGw/eZWPtpzi/LVUAupX4bt+3vjWcS3r0BRFUQqkEkQpOHY5iQ82hhMUmUADDycWj/Wnh3e1e56bQVEUpTSpBGFGlxPS+GTraf44Hk1VZ1veHdKSke3qFHtuBkVRlNKkEoQZJKVlMW9XBD/9cxErK3ixWyMmPdIQZzv151YUxXKoM1YJytQb+Gn/RebtjiAlI5vH2tbmlZ5NqeFif/eNFUVRyhmVIIpp7dErzN16muikdGq62tO9WTV2n75GVGI6nZt48FbfZnjXVHMzKIpiuVSCKIa1R6/w1uoTpGcbAIhOyuDnA5fwdLHn52cC6NRYzc2gKIrlU72lxTB36+nc5HALgUoOiqI8MFSCuEfxNzO5kpRe4GsxSRmlHI2iKIr5qCamIsrI1uZm+Gb3uULX8XR1KMWIFEVRzEsliLswGiXrjl9h7pbTRCdn0MO7Ou283Phix9lbmpkcbHS81rtpGUaqKIpSslSCuIN/zsXz/qaThF5JoVUtFz4b4cdDDbW5GapXts8dxeTp6sBrvZsypHWtMo5YURSl5KgEUYCIuBt8tPkUO8Lj8HSx54vH/Rjk63nL3AxDWtdSCUFRlAdahU8Qee9nqO5iT8Oqjhy4kIiDjY7X+zTl6Q71sbdRczMoilLxVOgEkf9+htjkDGKTM+jYyJ0vR7bG3dmujCNUFEUpOxV6mGth9zNcuJ6mkoOiKBWeWROEEKKPEOK0ECJCCPFmAa/bCSF+M71+UAjhlee1t0zLTwshepsjvuhC7mcobLmiKEpFYrYEIYTQAV8DfYHmwCghRPN8qz0DJEopGwGfAx+btm0OjARaAH2Ab0z7K1GF3beg7mdQFEUx7xVEABAhpTwvpcwClgOD860zGPjR9Hgl0F1os+gMBpZLKTOllBeACNP+StRrvZvikK8DWt3PoCiKojFngqgFXM7zPMq0rMB1pJR6IBlwL+K2CCEmCiGChRDB165du+cAh7SuxYfDWlHL1QEB1HJ14MNhrdTwVUVRFMw7iqmg+TRlEdcpyrZIKRcBiwD8/f1ve70o1P0MiqIoBTPnFUQUUCfP89pAdGHrCCGsARcgoYjbKoqiKGZkzgRxCGgshKgvhLBF63Ren2+d9cA40+PhwC4ppTQtH2ka5VQfaAwEmTFWRVEUJR+zNTFJKfVCiBeArYAO+E5KGSaEmAMESynXA98CPwshItC
uHEaatg0TQqwATgJ64HkpZQETMCiKoijmIrQv7JbP399fBgcHl3UYiqIoFkUIcVhK6V/QaxX6TmpFURSlcA/MFYQQ4hpwsYirVwWumzEcc1Kxlw0Ve9mx5PgtIfZ6UsoC50p+YBLEvRBCBBd2SVXeqdjLhoq97Fhy/JYcO6gmJkVRFKUQKkEoiqIoBaqoCWJRWQdwH1TsZUPFXnYsOX5Ljr1i9kEoiqIod1dRryAURVGUu1AJQlEURSlQhUoQd5vhrhTj+E4IESeECM2zrIoQYrsQ4qzpt5tpuRBCfGWKOUQI0SbPNuNM658VQozLs7ytEOKEaZuvTHNslFTsdYQQu4UQ4UKIMCHEVEuJXwhhL4QIEkIcN8U+27S8vmlGw7OmGQ5tTcvvecZDc3/GhBA6IcRRIcQGC4w90vTvekwIEWxaVu4/N6Z9uwohVgohTpk++w9ZSuz3RUpZIX7Q6kGdAxoAtsBxoHkZxdIZaAOE5ln2CfCm6fGbwMemx/2AzWgl0NsDB03LqwDnTb/dTI/dTK8FAQ+ZttkM9C3B2GsCbUyPKwFn0GYMLPfxm/bnbHpsAxw0xbQCGGlavgCYYnr8HLDA9Hgk8JvpcXPT58cOqG/6XOlK4zMGvAL8AmwwPbek2COBqvmWlfvPjWnfPwLPmh7bAq6WEvt9ve+yDqDU3qj2x9+a5/lbwFtlGI8XtyaI00BN0+OawGnT44XAqPzrAaOAhXmWLzQtqwmcyrP8lvXM8D7WAT0tLX7AETgCBKLd6Wqd/3OCVmjyIdNja9N6Iv9nJ2c9c3/G0Mre7wS6ARtMsVhE7KZ9RnJ7gij3nxugMnAB06AeS4r9fn8qUhNTkWapK0PVpZQxAKbf1UzLC4v7TsujClhe4kzNFq3RvolbRPymJppjQBywHe1bc5LUZjTMf7x7nfHQ3J+xL4DXAaPpubsFxQ7apF/bhBCHhRATTcss4XPTALgGfG9q3lsihHCykNjvS0VKEEWapa4cutdZ90rlfQohnIFVwDQpZcqdVi0knjKJX0ppkFL6oX0bDwC873C8chO7EGIAECelPJx38R2OV25iz6ODlLIN0Bd4XgjR+Q7rlqf4rdGahOdLKVsDqWhNSoUpT7Hfl4qUIMr7LHVXhRA1AUy/40zLC4v7TstrF7C8xAghbNCSwzIp5WpLix9ASpkE7EFrI3YV2oyG+Y93rzMemvMz1gEYJISIBJajNTN9YSGxAyCljDb9jgPWoCVoS/jcRAFRUsqDpucr0RKGJcR+f8q6jau0ftC+BZxH65jL6YRrUYbxeHFrH8Rcbu3w+sT0uD+3dngFmZZXQWsXdTP9XACqmF47ZFo3p8OrXwnGLYCfgC/yLS/38QMegKvpsQOwFxgA/M6tHb3PmR4/z60dvStMj1twa0fvebRO3lL5jAFd+LeT2iJiB5yASnke7wf6WMLnxrTvvUBT0+NZprgtIvb7et9lHUCpvlltdMEZtHbnt8swjl+BGCAb7dvDM2jtwzuBs6bfOR8cAXxtivkE4J9nP08DEaaf8XmW+wOhpm3mka9z7T5j74h2+RsCHDP99LOE+AEf4Kgp9lBgpml5A7RRJBFoJ1w703J70/MI0+sN8uzrbVN8p8kz4qQ0PmPcmiAsInZTnMdNP2E5+7eEz41p335AsOmzsxbtBG8Rsd/Pjyq1oSiKohSoIvVBKIqiKPdAJQhFURSlQCpBKIqiKAVSCUJRFEUpkEoQiqIoSoFUglAUQAgxVAghhRDNSvGYT5sqeIYIIUKFEINNy+cIIXqUVhyKUhg1zFVRACHECrSiaTullLMKeF0npTQU9rwYx6sN/IlWGTfZVLrEQ0p5obj7VJSSpq4glArPdHLugHbD4sg8y7sIbe6LX4AT+Z+b1llrKj4XllOATgjxjBDi8zz7mSCE+G++w1YDbgA3AaSUN3OSgxDiByHEcCGEv2nuhGOmKw1per2hEGKL6bh7S/OqR6lYrO++iqI88IYAW6SUZ4Q
QCUKINlLKI6bXAoCWUsoLQogueZ+bXn9aSpkghHAADgkhVqHVSgoRQrwupcwGxgOT8h3zOHAVuCCE2AmsllL+kXcFKWUw2h28CCHmAltMLy0CJkspzwohAoFv0GozKUqJUglCUbT6+1+YHi83Pc9JEEH5mn3yP39JCDHU9LgO0FhKeUAIsQsYIIQIB2yklCfyHlBKaRBC9AHaAd2Bz4UQbQtp3hqBVhyul+lq52Hg9zyTjtkV500ryt2oBKFUaEIId7Rv3y1NTTg6QAohXjetkppvk9Q823YBeqBNzJMmhNiDVgMJYAnwH+AU8H1Bx5ZaB2AQECSE2G5ab1a++FoAs4HOpqRihTYHhF8x3q6i3BPVB6FUdMOBn6SU9aSUXlLKOmhVNjsWYVsXINGUHJqhVeMEQGqloesAo9GKM95CCOGZd65itKaki/nWcUG7ohkrpbxm2m8KWrPUY6Z1hBDCt8jvVlHugUoQSkU3Cm1ugrxWoZ3Y72YLYC2ECAHeBQ7ke30FsE9KmVjAtjbAp0KIU6YZ7h4HpuZbZwhQD1ic01ltWv4E8IwQIqcy6uAixKoo90wNc1UUMxFCbAA+l1LuLOtYFKU41BWEopQwIYSrEOIMkK6Sg2LJ1BWEoiiKUiB1BaEoiqIUSCUIRVEUpUAqQSiKoigFUglCURRFKZBKEIqiKEqB/h/uliimdNUlSAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "import numpy as np\n", + "\n", + "arraySize = [1024, 4096, 16384, 65536]\n", + "CPUScan = [0.0014, 0.0054, 0.0213, 0.0917]\n", + "GPUEfficient = [0.017408, 0.017408, 0.018432, 0.099712]\n", + "GPUNaive = [0.016384, 0.016384, 0.018432, 0.075776]\n", + "Thrust = [0.059072, 0.068608, 0.05632, 0.08601]\n", + "\n", + "scanFig, scanAxes = plt.subplots()\n", + "\n", + "scanAxes.plot(arraySize, CPUScan, label=\"CPU Scan\", marker='o')\n", + "scanAxes.plot(arraySize, GPUEfficient, label=\"GPU Efficient\", marker='o')\n", + "scanAxes.plot(arraySize, GPUNaive, label=\"GPU Naive\", marker='o')\n", + "scanAxes.plot(arraySize, Thrust, label=\"Thrust\", marker='o')\n", + "scanAxes.set_xlabel('Array Size') # Notice the use of set_ to begin methods\n", + "scanAxes.set_ylabel('Time (ms)')\n", + "scanAxes.set_title('Array Size vs Time')\n", + "scanAxes.legend()\n", + "\n", + "\n", + "scanFig.savefig(\"../scan.png\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "8015ef9a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1000000., 1500000., 2000000., 2500000.])" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/images/scan.png b/images/scan.png new file mode 100644 index 
0000000000000000000000000000000000000000..7ae527e1dd472ca86b04d7cdf9a003f3a0374ead GIT binary patch literal 24790 zcmcF~bySqm*X|I~B}kWufJ29LN{F-!Jv2xTE!`-cB14yefW*+uz0|J5Y72e50Kp>O~;Po5}1Nda> z6|)WSChRJw>#E^s>FQzXVgXVyb#=0HbhWcFd+u)G^3lf8fsd1q^ELZ(YgbpNk0M-L z_W%0}oQ^J5Ttu2fO~4@7PVe+Sf(g3SgpcDhJ`VYR;=D=|NhSGy0USyy_Il^`n*)3LHmRw6{2cPaE-O+Ut#Yz(F!e~I zf53vgH2RYdw3lM^gmf7nvweIC=)68IUbq=2|AsOK6G`h*r{{7?VKD^%#Kq=j2qw~2 zKvf3*)A|Y`k;XDjeA)5%9j5<(m%@E>;W*lgZk#(JE;3+#V=zDB$1c$eE;zF32aISf z77m#YB9pQ}xVbz%gE|^EL$X~IjJIT$REkDibtdFDcLSmrCE(WqAnbcnn&gax$ zn#Ru8tT(u~bG*k}t`z%M3dd1bY%PZ4=qMVhE8;^G&(sw&Yi>)IepVn zVgm;l(Tnnz9kR}XRJyiTJ4Ntf`1V8cJ5XhjO0>_tpkaDTDK?*r2feqFVO}XfsMSifG`VViB^#%tUPE z2|EbMnomWOqXT?1QC4uv)K%;2Dy8n|>Tk&;2`y*}9f&BwV8MCeL@~Wou zl2TPiajLHL<^?yFC(#d7Srp?98GPwK;pi!^>fjJs9@PnsX()XMR&GBpqV6rMRS9N~ zgtF)*`IXhSeXmtTAA>%a+3=^}N$Nij`WCQkGdGHb+zEkXsldl2~Hn;%*FK}Un7+X)qtdxak1_R6HIpZ9ffcz?zNfoXhWW|BXz|d|1oflWg9K}l3`vq0mY{Ap#KlJX84fnL zdPWS201Q#CpY`yrU?RLDpS`ZxA{Aom8mi$wE~OD|JOs>)dere*u_#o^(8ON5R~L1Y z4RLlyktwwFgp4hp%A&RnQcQZt#4RKfRmOZ6APB3R)S%6Kl5pio#VAWx%kGVr#}tXY ze$7qE__`|*G6;v%?=up<;5Cp}uJLV8TRoHQ;OO?zOI-D%>Bd41AwDqR1Ow}t1c!`t zI7ORaYbb`GaG`L(AsoD=XS(q&~$Sf(MFGr`#9u$SY=rFZESpTw6 zf@S`wy~kOv(OYNVDF>CJW&t}gaQK@knk?2Tl#3j3rZ!8n!xF8nqS8Tl$uEO1X^U{I zgYduIt>899^|tcnQibjgh8Tps%764ieysKKt3D~EK^?~E3ERAnmGnB%#{Z^4aEO{& zO$YBa3Pto&v*iTfBz;hl$Fx$~{K7*qy+mLE9VKA2PJ*iHS)TH9zX5L);*}{DwJCxx zN@H!X+hH9qOcIbHb1u;sibZ8fFe?QL4n_^_n^#gNK^`2dk&xm~5L*+5>F0_(z#bXI zptMavjVGaDi?>}uOoF_H(|R>X`Je~X2a^mO7{=Z}jVhKC5#yHof~iMDl3xhQbkHVu znSJJPjS+wJu66?7scf`{;?x-Q#@qn&hB)ucoyv2K({Y_;g&|twhoCTOHHTX;3Tcp_ z!*Y$3U#N4FTQy_{<1Gt$yL8zbxS0!MHBlhMmWPP-;vKfB`1krRug=&ntng8wq|RXJ zaDy)z;klX`imfr@Mdq={YXkLZ#^9gA!XdAMiMm)7aQ|m;RD3E(bH$=a>`&a zlnw-;VV1!V)l@@GGvG>;zw7It+uH-wlON-{9M~=A2ka;KzefthW(clOBsP%-eh0}h zLeo$@>!OexCe!_1o0YkJh8d{F^Ou?`r`9NbwU(!2*wlZiawreZR#a3J931=w?mFGs zC&%FA;*txkwVj!tCwBu&W3g#kqZGiadQ%X!qC}{BI*N!|ygKg*x#%DjLF39Fzbz+KX0`frMtbowVthFilO2+>x-rgIV}q&lH+84=`pAj*tOU56jgKdu!V{! 
zEy`ZhYR=u~Z(n2?n8cDl?^uhpA>V<2(x=)NELD&R^sSbTw}<^Juq zQp`TqT~*pqD&d+6W7G`@3GXUBLbyR9{*eRlm;86j{2E+3dfapKLw$4JA|5qX5{2Xi7c|78aHz{~LQTF)=JW zJoM+R16wUu|C$dVWd_2^%p{9VZtFke8G*Z|2?X}!G zN1W)C)iFrQ^3$%k?k{rOuIfbuT(5dLDw-Q-=eZ?)7+CXlYi&sr zSkx@f_hyBJg;xR3F3k2%+ZO-X<3_xAVoYOJ5q(b3f-Zn_C24}U9h zt*)(o%FLuyO6KfEB9n?V3Xc~)j*i#ZXxQ1?djaQ!g7*U!0$Q7^kia}yW|)3+&@MT8 zpoFC!Nc>{8{+E+0{$vY1#wg|}6|g>)-(0m9Ih8=h#>Sam5*6nCvEx6~^IjCc`yh9( z65Q~aZ{FlgHoTp{uuwXTyrb{@-~1w4{v-;b)-j0gW%j$r^c0ys`7%#%HLLuaub-zY zrKXMEq!y{@W1>;dFu)Hr?w9uDuA#+Uv7r)vmM*ZRA%Qv-cXp~o$7`>$|A%3-fc;#}YTL~} z^QmGjklk_{Eg>Nxkj>=s-o~T|e`E$d+$=wM-d}Ax&Z+?p_sciEV|W9f>h(8^!K50y z*{`PEp(sxKbNr71e5#mK)O~Ly+1NivJ@2jS_U|VF0Rcw7N4=uR#X>du+`PP#(=6ch z50B@`AGGBX<@ffs_Nn|rk?ce(@0dtGVQM+{0UfWYLQOorj!t}pAt+y46Pfo#ulCai zphw5^Bg^hQ-ruWciX6>5wm+q(S9|otbX<0pBu>M~@&iP@v)yb}NGLw_@uJ5v%2&!^0&b>QD(vtF}1M|sOJ4u^i-Q1S(1uvVlH zT9i-MGQIT4Q*KkMwqTv&UqiXHR@hN0BgbcB?0v2rU`0nglTvq$%Z%CaUXVep2vnQc8rgN{;8R_8vL=gox#=58ms6SxTx~?uxb8S%=2I4 z=1k*976Jl-ZH8L&egeR0Ltw=CI87U`*IEgrj204?K+Q5r*nw(kP*r-%HFer7gX}^X z0h`}1@8C2YI>vszXS>sm*y-uh9lQhkaMp_#B}F>C?Glzl6iWdQck@Qo_4R7Y+6AUh z^OuSwO^2($P&^S|Mr)5S`RJ%)KV&&PdY^o`a7a@xLa+`8xaAS{S*hsVV64<|ZK{L-}lH>cl|F zy(-O{DbHcCsnV<$r>nagh4CGiL4#AjUsH4QM*w*EmTQHZf%%^bw4f@5eTpKFUc|T% z_ZVFzrJ?E`yJ}9Y;{Ae=xG*bJL{ffcn?h$~GL&1ndsdc_zL;!S?!hs>`h?hI7ACRK zK)sYF@-6z{PO`nwC|nV&4IF+Ty~qKP*m}B*_VH|gCX{GccTZ&1TB{zF$WzOfc#JB) z5?D?q)Fe$4h6BRm8RZK+@&dMU!d$f0fpilQ8JTM{QBY$&{>gs6Zt>U~u-9ud(*y7~ z%;9vwLZ6F&28}Kou{Sh4X6PUP4&`W-X#b9of!pbznsk1yaAc?nKw^~-s2tcN3x0*9 zHgcxi8}I#rd_biqbnL8h-Y?SmC7vBwXT+Ns`3iFE_(BoA8rwYpMSqzGYWR8cO=j6e zjS_-?H`ZV^b2-+9gELj#pWmxd9!<<(@1$s}^p`6u5agk<#=LzzV(n;DXM!V>4e;hd z!>m@a{!{+WJu$ybOD%ET*L}%Xo~j{DM*rmqbz-QCS*W7eAL<47)S zJabz=o;J`F*M4Te{bsQ5Gq+KpW947U`o1|5u|800V`o&KpDQlCpL8Dl@{ z=hosFvW|pf=Ie(BPe3#i}J`? 
zpO92TIfRyniY-=>5x9FC(3DO5euqQA>_2KmeEyGNwd0Me3-UAj_r?6=3jPE@g9~qN zbSKu9Fn3LG6sPiTBaC(MT3TGHeyXfb((|jt+11e?qW)U20IHX4H$831_lrf9BB<7i z5c~c@m8-K>&TW2y7bq5{YpPOsJiu0dI3jwmKxVQzn!S0{PtUUCva>qK)&5;3 zi~yi}S^js|PoF&t&d#QfBIhaquvXxB?#)5KgHm@WzSn99Ie_$Ea&W{@@>y~NvDjdm zIao`may-TB1rvdH0M>rrMx&3f$vZp^9ZX^(Vwo@zqamspcoXU3yH+aIp|;kZgISrb ztk&;F0?b)89L97yl9C2-1P;^$Mvi_X5kqs6W-{)4lN@GA(<{W^|*uFtml|CxIV z83F?totKHnZ`xHmW2mY2cZGqB5$pn5cf($~t?7`J^qAWI{d*t~)TlI*1JVIuMn;B^ z+s?B%iR&%orwHpBIuD2KTy>x2wI2aj9<t0ufCUnx_v25jy_ zDn0Edgg_n*ZKdNP&gicMr=yeZMjH*dUuUe9F7%o09$UUQIA8TuzA9V3(de3+eE#HU z()0yuzAyXWej;QUoR69a>4!r!aGH<`2$t0kKD%6=Ecz_K8H8{8k)6A;I<{Rg)B9hV zUZ13-P_(qRR#sN7hSB?XeBiZcTnvB8czM`O7!eVX1F&?^>E3JtSKH;M^v{y?BCcCy zZn)}ua+zU{Hx!=_1*kwI9vea}u8KI=m;_GyCFMe*JI=A!b={}gn>OtMj8g;6>dsxB zw!Rm%XBYeVf9sba{$1z>(Auwu1M(qAR1L*hRAZFIAR?1?h%EwnZ5=w5g^^o>bW4OV z1`-C!2Zo`@Bi#7%;kZtk;IjneweOoLpX$7~{^!yy!kGg~T&+ET-S0whNS)l=hNoOv zEvqR71?xps3tcp_CH#K8&yl`9vC8&K%8>~Js9gJ4zEZMXxiNs8qk*%`12BbCi-`gi z^W#6i9*H@EUqV8nzV=y21r@YAKHKze@a<+az%{DF4rbJ4(v@G*il;4YzgS&)<7)N- z3g(2DjNC(Moe##c)JO5YN%vOG{*(WcEuF3k$a3`DGO)#oV_%#=NgQN= z01J&3#2DBg*cpgEj<59&^Y)9wc3TjH+9P`G=F?ll;k;SOVykdkX zw^jP`u*?E@BuYylX@}Gb&I)RaYvCcWbU1tF9j>-Y;H8tf``Hc4=NM7Ac6U~BU`*#(v{lWTvvn#V*u$>)g0_N2EuN#=DBO5Ibj=m{| zuD1Qj&~W=6(grgnkX1_0Bn+~nhB%wm7tpq);U@U@HiqiLpn@nj_?=QHNA*fJS7!;G z*foo$jhwBu^|j}xBOpw;pRt166WP|0MI@a2^Xog)1UIZ&pc%ivA`?j%&VQj|t!bK; z-Vc3%7{)o^p2bjsM+@%}^1EzCijZRIDDI>B2I-?J!&-@S zvHHM0pFDw@7aNH`p1&VP8Ad$P5-2y%h{S7$ZjyPI@)!dZu;DRX8MQo zpgrrV;8#z_J05L%g8a9_nOh8c7R zBA_q?ya~%yOOc~fC}f82vZexuTw0f*nsNd;8ex3@6nom>UAUvlaTTIN+iqde*=9P{ zCGYL(?;!L;$RyAmkKy z*&FzRnf|$JhT!`NxxT^Id))Rxszr#~O=!UWR=8MNbh6jpS%1;*-PVJEii%}&9c94Ps%f{vv zb1tkCCrH-snu6PN*6}oKczDoOCTfLhFdH=3(bB^#`dT` z1GJwlB!U_QAbLP{c~pZ&vL)33^ep6kx@CC&#pj^)g3ht!=xy79PI&G&4Gjf6JjDao zc+V{&5vC`0_RbE!Dauu~oTo6I9FSP)@@+xKXD)4y2tBVu-UEA~x0=#8YVJRJl@@nb z|GgBXBiA7|@gS;pFlz92{D1j|aE&I+sN0*LoOCGN5nj}|~k$3zm5lm?k z3HRH4VPn{xaFFdFpTJ7@8Gm+kgMQH87w7 
za`$SyJRO68KvGT3>%YV4{nux^3d+jg0XgHf&F^3!;QjSkl>Wnl;0G6~k+gecj}Oe% z(|=UP0};6GtO#2Zaqiza+02tQS6OtzF#<+|vNxqtix2-`{Fr6v?~hVbT|eO{ytLdD zUdY(HG_mHRbI_orrrs4b)Fsl8BK61@=;&JViTh(dt5*1xg2*Hz?WYtAlk>)Kku28( z)1W;KHUD=PI(^YX)yy?O`dZs9Z@0WWT)n;CEsv(+#|hlpvcA9GM>prDP5!i?8`~+TgL|Sf|9$5p zSeRe0@*MF0lg0>C1qu^*Vr+qIGLbg_>e*8-Qhw&}PIx+_el9Xy z_-6Ba*tCj5!Al9z%T#n-n*`yOe|5J;fOHg<#G#Km1~6&BAG7c07~{lTruPB*IjEr~ zE@x}Y`WXXXaXFc&SH)JB0^8?;*io1xh0J#7+C(P@wT$*(HF^>~!BnwjD;gjX63<*x z&CGRH^p3e&$HoQlya=M){WR+>mJyn`c(qokT~lP!DkF6jcEu-YBJot=sa_i$8NK#T zQqn94wTTA|oVha>Q!_Snq4(J^k#+KR^~SfnfBb0KD-6&AfxT7^$Z{St6crU6vg9)n zUNpm=#H8~KY=5{<2Wa{UAOxHMIF}nZPrl_E;R}jU;m=K%B?iOEeH1Ehoh^5_BcihD z%gp=N)~4ygpb{Pj+vM~p?JA$lcRLCbcw%$pYiXw9Gc3(kjjgU+=56-uo@v|&tVIw= z%;;?sy}#)`=i@S=1K!p*{>V{=@%9w-EY%rVOb^JtH(Y&Hi{dUwYMkaf$j=S>FNQ#F zs2&&>M~F#4V*)q=;N_u{lbRq-g9a&J)hPl0KRP<15%c^4Xlmw;j%vVCpYw)%e)Y0o zMwj^ZZbA*0S)W<8*7;+1?=<=7)3_A?EXSVhZaTc^pU^istNucSvbY?2lNCne(0(%M zR#1+;ws+=vEpTB8UGCF7Z}5{4gH=e{Nq!_8?R_Yg+{hd}v7OhU)DED~=W=M>prTl~ zdG@|H6J1xp!F}@Qs8RTp>E2A`qq6S4KQ|62W~a#6^Zj{1A0s$FKW}JkJi5Dh%jUe$ z;QUz7KqB;&SP0ag4`E}!q!EKJ)ry0KD26Q8|E?kt;E8g2AfGzfE>I*68r6WlGk0b4 znH>237$bUnkGeGcv1WT}=H=4;{;PJ{#!qp{xXY4Lv6wqL4L6~^(a1DE)0-4>?l?3Z z!uOqxsO7bOF{Wo>s~@b;o!=+}afNiwl{G}AK%EnCtQ^(~ z6NAmB=zxm4DSVXtT1&?nqrbFS{hf!OHC@YoY7A$YS-0kFcP1`>5FurdjmY^A<5i`v z75@+0<(-*XnXCaDsYQ48mWy*D^i9{-zDi>9r<;!I^kE&3SeQ|Ke527~_nHErX+kG2&MS~+LE0-LpY$F(wa~8H{z3X{F3nvDo z=Wu=N>eN3ZEWbq!U+yi(2}{s{`Z4fdiLO0mteCqt+x?gI@T4c0;-h=!G7A39n}^ZP zsok=4x!%IV)q?fgngim*K_MQ;4=`_jgq@a*lnvsF<&^#}#t_pA6qIPddQ`BSj#nnA z`-u$~m<;X$g+T!I705jmj`Rgv)*u>R-&Yguz?ROh|1P5iw2OvjuQA0-`zq(p$AMhm zrq-R3LA>ewYy_d)KI_hA#2S?IMPvzd#c}C*XSd!H&}ueU@)Oj*&H$TZZPvCdUe?5z zrjGuMI9p2B9xL567Rt6pAa=N_zLtJB3aG^ZGbB$u-sr3hAhLpHw2t}^FrC2#UH=)K zVojkZOub#9Ix9bUf9&AOkbTn0Y<6(pQ)Hje##{|xz*$bS@Un8!pwi~lzIT& zY8Q|jcll3EI%6@I)rEzFH8R-mWON8ja6tzOMwf^U6Z~l&nUJ)qJ{&G)nmF|Udf)$HZ zP^_SHhgp`}KdpI5s_fL=AU%u+r+(~S2~zP7+E 
zrYc19uyq`IqVv;u2sfpFJUma+tz=+`fN$HlQ1V=W*uYDx7`6134ueNR9jRPUe?^1sQUrZZKD98Bnt!_J#zkH++paLluVR!~8apXZ^RZ)njU z15=-RX=gd5e}1|qlHpqo(yqBC<_hIRn9M8M)?q4T%9QMG=QLqu0F&6{>iKPgnj@tP zPIUZk_kYB%Ke(A+rOixCuK7!Qy2HkcicGA+KA!ybjG>MyG4{#OukGopOmVM|ZN`8Z z2PRPGxRX=7CLHiwc1~zlH8Q_Wd(J8QlKYz+X~DU@Y%g!`LsJnl4Ut^O3uL=~#yE|| zRuRYL2LV6WMdHeU{G$Q;1BrIYO6BtLP`e>?_YIa#KHP5SyH69zI~O@`sGW zavR}L3m5?AXlZ7hce#drA5gXVkJmUfbfUxh+9{_pO3-aoGBpvJ=O7!8)BY0qqOrOH(=}9 z;(y=#v93hEuMh~CJV`fU?|Zu=p2XGX$geB*UO$BDrO*1B^5LSAeaD>MQ?1_Kn84Lf zUt5H-D=1D`pI#}zRJPd4iJMK)_}lKtblOFi+fly|6H8CVNj_~gd-8Wi7l5O=3qSi` z0jXHPMaD5A!TnC;$mgQK!(Oe`^C@WFWrH2TW}Pl6lEzG63|x8pf!>Kk=*`eQb)2Yp(F z*rZqL4TkRT@w^QBqgR~!bLxJppB9E$(}R#{!FM+i*QVHlb#^Ent@=6ueqCQ>i;O#W~V=aIl zYVkf1Ath;S;jM8{M|AmAuRFT)J(oxlEmh6e8)MkTs=l05ZWit8C1Z0i zkF{D}5K|inDm>cw)C3CSRP&FZ4!Bnsrf=)dciZ({lnfLGN6eg2hZ{5n|HUMvqg`h%Yi{p@jKVHf1-Q{{)jMwr?nrjE7Ju&&qnKS zj=*X8iKhu@Blp4NumTn4ud(#?u1zyCpB_+LrBOk#F;w}+-7ZIGy~tricnOE}YQO0Q z9}yXreO7SR!KdzhK@FDtw3O!k*C!7mQF*QS<>z)C=KlPZ9HESTg{c#|0q)|saqNHA zVxnHZnh$(j#oD*i5R(5|#VcBqh@ha10K1nCw4#vx60Q^9MK&%-xe=4?EN^b|!zb_y z)Udf)u;?+57zRNXav0~W>8pYo(+`c!G{M0cs zRKsEquv@2F0=h2GmuwWv$cR;-6at!rUa{3>WnEP*FYp$AcQRjp`y!d03CvAIEhs34 zL0AC97pKD`@Yl4`2xQu%Qu6P9lIZ1n!ilq8(2P!rr9t;GEH27ebdOD&ea$v1m0AZV zkUCwf{h02?;<}i3{Mk;E`Loom(+;OiaWT@NMZ5xhF?GcH>C*xGRK|d#RrCr@1a6Lc zEyU7ZvR|QWl5rg8H2EY` z5J~;{n%{GOFZ#V;?OKXuAchJc0I?x+BJBSl-Ch5&v^lKiz4?M-GY;wRa?{4ktJUWx zUo8$Uz0HZ1;GSDY1T}tt>;4`6pUPLxc*_ieZq&6j=Ck3^HG4sthMtEd?c+YXELcz_ z!g{JXvNt+y36ri@nu=sGQXs_yhg63|PB;x1LMUfJ=q4zKnaDZcFuzk&2{gMPIq2?e zs*wkJ8|ED7X$K>8_atdLO+Qv0gZr`RxSjwd`;Gq--JUf^?1w`c)>fVnmpu99llxVrqU5^AB%=Che&V$b8CwNy>tgjpFYg?@tQ|g$*6Li+MmKum1&m5I&t9nZ|?TW#P zwN}Vux0Pvw*k)yta%N>ao_Ne+@#oo}l@)CR@6d+C0hJ4BXqaZD^=37fc1hKpg+vdC--cKk!?qdivt?nBpay}5&JEizNb z+p!7xIgQqR3(_EZewO@EvFtBiOTY_^R8nVLUSt-#vmzg|qaYD@6+RMQAQvcnHAr!e z(L#_lVtKS;gn3ZpL06g$?+54fi??NoU-mDNR9nMwl>;{YV9Z)MkT`hYHi_ur|M#1__b+Z;s563u9e9>fs9|$ zaL&(#>aA6;B1mF7Y{#VwN*ZUEmDLxhtr_rA}fzxhmeIB*` 
zNrbMi1Na?Je#OotE7=o{7wi(ze@WB$0QR(Cyy1^sM*NB^AXjw)`j89X!8%2k=n|?; z5su~^+?hydyh>gaahH<+#?g0H$C1=dT(?kF30V(x2JzcYHq75N$N_-FAXPxz&(b0@ z1MmF&dxm(_#{}laIUQ$HoidsYmgxT7)WXK2O|sl1b_U-OnlisDCZJZY;Xe5;V`rLc zhu86c2mvb$>&rcT7=_F#hu&s*oPbDYv#9v?)h{k+_bR>VddW9C(=EXo|gs3 zM}J9fD>hL#3}~)5F0JmEV?|K+Fkk4=C-@-{gc z)a}!6HqR6 zvQN%9Jmd!vhiVckgypV4>vO;{__5)@Ql~ezsya`(JPW=(mGFJNKLXkWPvM7!8@16S zyYIgewSD=|u$k-^SkPV|!T0zBK!*U31b`v5m6>lSS>5ykmUQQ5y>-f|(&gnnO5wrF z%-!Ss@2m5i2gXj89}1u_Bn`nv=2?Guw3_9qpTupUyx)KjX&l41uPVQ8y^nR;u&+Z8 zgPovScDxo`oe)ni{vY94mcF|MoW;p(p zim15O`|nP{=iNVd3cDS-?cKN^GGe~T@jv2`Xv?X{{-Vtf8oX?(WhTxmW?dxRHinh_ zB>Jj71&o=Zvc-3}!wb*Rn_0Df)VnE>>BI2= z!sswB@R*TJN9o{sj272t`3pbPh4^jZT>bB`8#90c9&iOf(`X%Kksj9>x}5%igi;2y zW082or=jaxc4albZ$rVmo7-Y`j88FJi$WA$diU#P?Av=GJY?S1Ak^7I0~HmO)}}Su zk5yI#4U}5mrYqW~%$U8{XC2(bzsi@zqzKHH)6B#xeExAPBJcf%KnJfe8rpkRRnK*ibH9&TRpNF^EEjDX=;!>B^r>}ER2SfhbB}UEc;YKWTZiW&LZLG z$qcjx?3L&w8;gYO+j8HEkac~|SrqcLW)`~j;W!%2)A54Yb0_DKP|h_`X{3T>fvGuFm)a?D`PS0um8$5TIZWompD)6X2YZ!LBO3qIp0CjL3zI)oKuT5JgKko zNdVl9(uZNr%SYoV|5rY93ombPA91S-le$vC%d_81d#qZyd*SkjG!hw?i_MNVl#T9t z@4DLV08t2DZG}>Cbl6dAIDcBU4^$W4nUIoJQ=X4}mb3~?V8KhnJ!>j#_XYC?rS8Ta zZ&sE}mVdxQ+0bd;3JX+JR{E1n(tc=vSZ|=#qB=idpE_ffi>`yhx{czFx(+v0?1aN5 zalGID{VkQQ+;ev)sIHVm+zun3nW2!uO66UUEYrNMmDyPP6y->5F4lSOFT`(WeRh$V zh*g1G zVys^FrQC|nY#e}5SM+CD{mY^JFr11OS5aJ4v7p0S4!LK714yz9PD7`I-ywghH-#ka zC@cq0Uu@U(^(|)Z;F7N_)Yz1%@<&P&wNN2dP&IH)AMl`Y3iSO>7pc3-`m{Z%u%lC3 zNb2?-wMOqmu~wN3aAPQY*C7-&2$jhC%W& zaF->*3)O;;CjK-6W)odOdroBj9(W9G z#>>ZQQDb7h8LRl|Y*Pcdc5FzB(qTu*;Yx`KEbdpSUeg8Y$H%he)KHY+?y+5Rlh7>r zqRkR8fxpN-$?}r`c}FC}r88AOZHbYLTqy2_usg3dofna^Y^=40$Tw#|g%lE-mG))tn#_-MI`+~`KW&W+x zDW;i*nODW!XJu=ipn{EbG6nJ655JrP^+H{)d73GcIrs#8U`O~M%9PZP3!A8@ z63v#J!!ge8qVJVt77k_VPpUZGf;>(DJ1cvf;H}XW-}4$GyQ;8%E- zBzyO$b#3N&C})?fUttnIyV^&NH|i#4W?uo4We*9hZw1>QU_HK?U*2zB+2@o>rm?@k%FZrXP9Y|BX%3enEbNS9OSbHqA(djeL&i{nLo2kERZ) zvNM7?B8+Oic^(J#M1tY#xbY-j*cHdn-3`SeFbKN)Iv>96K#k1+w7tG8^~ESd|E3>j 
zs@1#cX*DZnMx00S-xV11)r|ePchM^Woe%4qB$5esiUrAWPYr6tVdMixAwOeUZp*`U z6unHN1HvK2;zSAMTG;AT*7!H%oqugi4wT=GmZ$nQP~A7Ht?O5_AK*DCb`MzK7EhPsmeK3iZHI@h0;{E{P4NeyazuI2wG9CWt zWXoq$+kBLG^FhLxT%NK}NP-5LVTvdr9Uyu?O;TPhWX;o_aT)ULBaR>J%);Qr`O@?9 z*=1!&gN_&F(8V{%&YN#2-vUXXyOX}<%Hdykdfw`uh>0-8=5Z7ppN?w>FE(`M?b{yO zj$*($f*HP*F&XfE+Wo`zJ#}JBe;%tW&BB%p80{yQ*LH|C?kS2d`~1aw#hluEp==mQ z?gxd4FcrtHAV&qx<=|9{rOp1U>n#;({Pg-G)t&8y zIbh-BD~-I4I1X9=!9j43n-nn-)(2tUR-zXPgP5ipx~@Hx1e&zNDY%R3b+Ii|?GkbI z<$5bJQid_@1A=ds4|jhV^VpD}?ckZ%Raa_qX|zhkvB|y}PzuzVfDWD(N&g9+MHiI} z-LF<~XB~y9R_po|5Zvi&n%XYE)nr}Q{1!$SMYx{c;dpsAS#FJ>eD47oeeDyv^N-k| z>w1H!Lx45k+ZrLb>BBD#f2qD{y5ZSSgBeg?Cg$dczA>m{_G?0ATesXdh)x7xS8_p7 zx?Z+SP#6Z1_9x$PR!*8vEvk4KQ3#j|r4Ca!A6|t5;Sq7MM!2n>UXbQCOi-aFm*S&M0oA>}mFK9I$?{IqO_9Z>68EP-!|6Jz7=lpoP1aBK_f5KD;%)nEAAzUS|Xp%Gb@MF%im$dT$xL0*m^cTn=zGHmwseLT7^0&+ztOs zvTV-rv=)k=&G3Z0vIYE1L+VR#GEr*A9q!4%(N=?#gd%OHNLY??K#Q3GG0yF6Q0dpiXKr2mZ{c|4n}~dZLgdM+?)SX+?Djls zR`bMWPZh8^8F+uH1ZJWDPs}Iwo7bYL!VsgVukw>n(0HJlg81l`x^(5BHcdq^qE=fG zA9V!v$V;q^$Sdg?`SA8ZcA?Fs$fu&=BGqi}oZZhee^0`XHqA$XtmGh0AV467+~Ucy zJAO%O!1`y`IrtT}3yBFS9P;q%UoXw6bSaL(z@ZrW;h+QUplm8nU^0aDsV9Ek*|`g+ zE!kQY?M!2YqQ24h_$$`*K}prY{^i05t9$p~LgXJ~AMayz0fWYlhGNS4nVNv#*W-~V zqGn-4NOnOKOWX$2=$3eLJ^SPQL;Z#H|5es^M>V;0 zUsIGK9RWd_2-2&7NJsEeLQ$f$fK=%qO)zv26%Yi05V{B$iuB$gfClM3NN+~G6p0|c z{U+Y`d&~O%%3`f#o^#Hendi)$y*I;O*f|cCXacmvmzwnb*QEwAnf7;=>MK>4@0bnFhAKXxo3rWV@)DKZOd#|DVuY48!y zH5tu5*H5h+527-gxKjF8UH6BVzRsL<`>8S{Pp`;&OOltSDtsV%yDDs8Rc^&? 
zwf}*Bwyj>06v9{dKu7eYV9`Sz&e7q?Nt5H_<14;wxvA1=vZ|ZSU*Z-*9gt_2Br@Pn zC!N18oRfT=J?2Xi$TJ{xc0(-=NUtbf(Y$rF=LaV5*VFBm{pFW}M35p6`DWdeDaiTl zG@yLoYnuc;+{(Yav2ZMBb-N!GH+Q3E;Q|MDI`Q7%e$Ce91c6j;$!t}Lv*?Ug_LcN_ zyQeE7@Y@03?)_4addg^>J3b7nzI6@p40}=fwm-8gXJ`ioB}#q9Y_y*fIK5vPWk=|K zVs*{Es-M}Z$56()$UvU!Mv?Fl^Fv;aEqM{nZ->^cl`@veWnHtBdLTW$RkJ zaRk?A#NE5*=CeR?f3s>t8yI~l92h;q_&_hHag&MaT%}L#xbkm@oZFj;- z1n={-P`JvBM%~KK#X)dsZvQwh0pTT>G_<8gk-!N=4s1{Te+zq*((vC>c%ChM`rM4N}CfF}6`&;;N|T|Q#jcHs97icI|{R_ak3I$_a_0(`>ZY_9Qyqc`R2ae`Rk)r+sdOP1^!IJ}vDUOOdJ1;9Tbxhmlor_Wnf!y0K1@Bv%4cNP@1YM+Nc_^OqQ) zQBL_~wAb<8)ST-`M}rsNBB@6jcd%Ls+%M$k#f>7sq-c9PKCEYHL3C+;94MDoV<$;N&v3a2WI8$zFK~~`Y>uslfsamU`cMR z!_G)?zXiLVlOAJNaPcqL5E(qnoQmp88JREhE|-JQ5-RUtd9irv66ep8Q0Ro)m~M?8qdLN zJbYU81>yIA6&6}}A%*_NTi}{^k262ZnJ-eJJ_|FbDOKyii+PIgI9nD^l_Os6j5WB* zX+^?5GgCJS+qw5cwVx(c7hOvv-1u-=zg>>a!lQyP4k1L)W&?H&wX9aIdkYTpdYh9J z9wzKxw^=sMI$34U{vNnrJK#R!QaZ)l6a&ylx4kzhk{^jkPk0DNtkYmRusIa-Z|HL+ z!OFePz?JbyaXr2g>{;oS;wM`@bx)t%C$P@%4XK%4tMO+HXrrR7r(FS5msMOA!LfvO zExg~Z1MJwD`lF&Y*u0>CmB;i^CMgY2br#z*?g4Qta$~mM+ui1rG_j#C7WP-(OMeJ(8|km*D!(T^2_gwp9S;FE-n8^ zHsva`3Qr?VzhB%UGD{I5JJj*FYK|ox>j86vk=}xV>%am}C;=tf!O%g#IK>`2NWLE= z0-5wjyk#->G8E83FN@BECP01~^m9(7&+CiiA)c?S+iX(3WuyyMd0sd4;aU77ton4+ z?>l$e(EksXQb+iKjUAmjkj_l5k=pv}wp4VH(pcrc0C${x-5vQtp7=og6Jm|FSC?pr zcpK^-O=Ae~!Wbn@*?O1$HOZI|-y=auP2kQKWNm#Me-j*SBQ*wg=zm%WR9= zwXiPJiFB)flh1Pv0*$mm;!;_233qz}lunDQpefLIzU|!o2>Vl{+VS^&yaJaKEdSY-{$M z7GR=24{EC9sz;Udc#tPZ)L>7zm@xgmT7Vn9JmNRuU+bvR+XQQ>)p{3^ymIX^w+uUN z@ONuJ;VvXqs3c*mJOgO8p8ov$0avT*F4#iUXOmBm^!pnpyYP7!d%qPxq<#4}A`KJs z->sR{4B90+UC*pIl{fd;xWR1QE$i{1N)Q3!9}a{0mtSZKEN?E|-ErY^Q}&|y55Vlx zAoeD^w>n=*#)WRI`aoPN`GCDozZ6ReXF8Wfwc*d(w6XvwIXZ25xn)y&4KRKF&rd;?tp4PYr(6h?*AwX~TC;q1)j9|`%Y2{Ahz&euw= zv%HeeF!&H@q6v5&=QQQxeBkcxM3OZrPm%qob>MAOJKVS6*O2mRxO@pmMWRaWu{NBN z=ci6iEgAu>Lz}2QyLRDgIRaK(tCDV0TM#Z->iY;aPO(V3$grp|Y~{-N5#e&ljpQSX z@n&ZO+oAVUTV(mmc?&L<3nJaNmsEQ(-%m{oZRM}rPHB4Qo3^va5iq_H{ziDbJ?#bk 
z4O5$U>TcwjSXlPO`w%fwG7>gYw`-DJ+whBc6cplIN@s5>!0HKw$h(v#QMGN1ipdXr z+HaP?gWIa9vIi4g41S?H)5tWp--6wpn6}JzuNUAUhg%WnxSVu4LUM0E0;Cnvy zb#wo`^2xJnR4(I-Btqjp+lzMXlK^Ux0rtFHG?PP^z3h4O;D7?8h@Z89Iqpu|KmW?^!i!JCAn8<<=3)&1_s6o$bw`+w$Hc ztx7b#*ySXa%RgXw#eYWs!~2<#Rx1Ce!3Wq>c%tjX2~$(^iJxEQ#EDOL$Sr1k|c1^F$LFfDYPnD(z+N66G173@1mJXcgskM z@n5{Z-lVcsN9SFAnn61ddF7-I;Ls#d>agn{U~Qv`KM$0J#b0d-4Ee|?WlsC)^q1I0 zHaCCA{JPtwkgiuEpT_DVY_7xinLkF;8^ST?@>JwxU-*UK#J@N{-9PR)QHf$<n_w)%m-0_`dJxq<+PVq0(~H2Y!a)q-f*UtnBQzZthW)l|0z(+tV0K)hyQ3 z!A_&5l!o%wtwrkbe*8O86IOQ_xOGJ28!@3NAP(_i1l*dEK-Lre^9TMp6|4qD+)J-g z5qmz=8^Gw`sEd=ioRpjUcSht5%({TtmRsm?-7mT3tks{6nI+dS7$mDAxh+R# zRHY9ec0Sz6T+w!8BbL6(@RM*9MvvOT_UYB=*^qdwzF+UFn|@{Rugy@*d=}!dmiS+4 zwu#cREy5#thRSOX@tCd-QaS;;mF7J1vb{0@fCv5$V^{kKS?s&&zA|Lr=ng3P8ao29 zNcwbEdw$J~=#ipNKV)qCdp@NmM(ODC@r#A0j>ByqtZ2*Ju=n{Ov*D&qcZE)iu7y+C zDRs&JJ5f_(|HM2qiYYX!1#`?E=cTkkCRQA{{r#NUiQd`>4-G1C(Jr*2D}Ps zvPmva#36sLD$Eyx9ZU z1C=yPS1c5+UhO+(kFQ)A_jN(|EERXd z#tO`E7MsN$tM$YO8ymu%Zttt6C~7kRor_Rrn7y#~a-qIvRMc#cpDN#SGd*PCw>im| zu91kFwD#K^cN=|g@FxI*+GAV}y;A088n#?2Z+a3vBJR<3dw27s?gElN*Yqt>b>YU^ zK=Oe|z1Th|Kn#cedNx}Jn6y(*jy+E!cSN$Y7XVPYtv10MzVu_% zqZ02N#hdB5C{qfU7nmdErxc`$c0Z8J-D*dr_XnL1`EzAfXEn|Y4HrAx*PSdx=>0vW z-3c^ra&%~Y0Y!aBD8{v&lP#A=+zO{jMm12!YVH%>fU{iR9>yDu3Y;3WS4bc&3%>w zmklI~0={@Ci6kOimYe5R2@IO$q%qc?=2=_?|E>yJx z_v3DpOd)!`m3`!WRiI%CKhO@-Bq>QfUX7KKGg-v3UJXoH>arPV1hD2@ab2LMlL-)N zzH4hV4jxK_$d#pszdV;$mc&@ctAuZv={sJ3L63S1zHvE(H~QY$?axEU63c7W-L9=y zx4)gL+@4x>{~-XZC;~{q=obi?wzFdnsxuW0NP61H{vqDBJ^tg&817L_lMEOI3r2~;rf&+A{W`wo%YwYAKuhk9!i+9F@^(pqvRhF#=zXH1yGup29V0!zLDsYXJeGcoNciM z)ye>V)vw*2e*0_B@IA0d%o(j;Ub`if+&D8>0iTp7!l>SljLuqLToN72pa5c3V1V}s z8EHT{Tm;;;D2TSbcJap(t0A(7yHCXlesr$sgyE7VBE_X0$0*hFE*CBTeK#JH{MB#y z>MAeekNe#*az`?`&PMUMfG@!Vp{A?|syG_9&o80olbX1I#{J-Hn+p!_wdXk8izZa! 
zcNCn!b5)Xv`p7{iRGY3m*3FHHKpB1h8U*cI(oc_lLV^IN%-Z~JC=|UdW>AkB)qq+1 zBQhP9kLRu&*sd43O%`3xk_F#KYlqTOs=f(s8oO(iB*WIr2gRT+Bv@nuZJ$ye~L!X6LXn3*jcgrIIi#N}=h0a7|IxA}n~ z66hLUk5J##EJv)N5DGa6g;GR-a?B)WGE~*va-kTp-GiTI>f62yDSCJK$lPz=+vw@D zZ>Ky1d@W$yRG}IIrV%j1Tf;HvJV+5H%{cIC)F^E7q6)D}h1(1(E3pi50I`qQai3hy z)NG&crlxlYmnLBjg!)-b|E(lwN=DsZGpM$x*tf#0CTAbpAwzGPUuyu0D^dgVy%dkC zW=GwhIKYWsE|MfI`3L>gS0y?*0xKYv3NxXtZxsNM4y!&beWTErLRe_!K)f$u!l_On za{b$5bHx9{PJJ=*l50lywSxS$;RGCE^{_RzcP4-6$#;e9$>Br^^0O?ZyXw|FVddpp z?!?dJyd}U=Fpck2dfC8TSb+G5LU^NpAS!SDsWXt>e;CE?W-6Vdx|J;V>Asq1`%;0$ z_3BdTMc}{#h%%iUiCZ|p%2=buMbTESttP8?oBSahQqV+I8>TPh_SBLa38cT#$!xNn=P+tCRNTUtQ_05@p|*a_QA6 ziot!^AAQR(GtHkBw*MJpp!SL`>g*~|^`cnk6Z`Y%qKD*P=NzKMKPdpkRG@)+hYgwh z0CEa4$Nta~TV)}HQmG#gx+^rH&~?jBbUhmidDBzT@&+nATs5$Yl*eut@2x?K+#yAF zkRlKhKGz~EA>0SVbOGX0#kk5?RSVKy6aN6o*2yW_=c8-~zY~f9#XO`)15&h5Qsjpk z_wQ@}+XjVbw>PUi?|=NYy#VU+61vuas}9YUov^ZXD%HhZ!N6l3!=}SX>4{{-b75RW zFj8Pst;uKC`ogTEWyM(ubF2dj;l3U{iL|B0yN!Z!Q@z8dFc*SFfuP-uT>F~x#$ zq&}d-^WrmbjYi|NndH5WFyjqXX*QQrS~zC=D>1$%|46sjB;m^a#~!l>o^HMPpi7b{ ziuGShSRzGYxH(rKseoww6{2_tOFB4{>06salD?ZBi||*t`VteOEjyawx#=}>i?E0= zZp=WwUz(}_E2-gvqraP@***KSjaiHG_o{ZRW0%~Cn@HNtTq=>%Lq36FW}h9&1W7Zy zMkCSFYZ5wEs6!^(@S*LXcwy(#aw5|oo}}h))>K3At)3SYK^?rT6$Dq>;FtB&il#`3xqT5%Bs$NNKH(>l|aA>=>=n z)5sO(jOk?PX_#l}Zg_0WmS>CYgXYhtuyw4O$D%xHob;J#eZ$0)NUz?|ci1Lo`gpHi zNyX2lXN-9=2kK%WbLoi1#O-f zw}Q64ZV9^{yra4GK8bZwd0w;=*jbY>>mvjrJRh9)}wx z7>aAfMeQGgdbBzIh}}B3kr6hV*976ge@oyd5|jh~YagOaxCq3&Q3zv*tmr2@6??Yu zlPIXLwvg;_RS*)omecuBgTj?(?#Ih7KwMZJ^ZuiG4OY4eK4v4@AK^oH8N$hSJaOhMb_guW(t(Qu|Z!YS!OZk-o;}WQ8&HOz(J{tN_|5kxSq#ZTaOxGxEgXkpDX4JB~rNx=21YC&Gvk9VF zBw-TzVg*0kh(T2_Q3VOg*I$-)n_+`f*e!p#fExFUm7*YKA@(5Z4)>^5sJkVuy+yhw zggUDzk>h<3tRI$!CPfZccchY**mKR7(HrZRg;RPJ+o1Ph>*+98AouMXYuKsy-P)Y) zS}&s|jg;nPay9k`9uLX6x>#EdSs+E=&4*=Usl6cn7}iai_t$NhYw$qTf@$nn9GyAV z5K_bjWXao-G&2VoGC zzN<>2*IP4q?L9w!cVLhMiQ^3~NQTO3J>yP|fW52;0A2Jb^<2ae&B0yQR#c%YLoVN2 zmQ?zSdP2A&gaRcW1WuU{`r+a~@5}$ugZ>|}{=d(OPh7}4X**8uf&Vc;r1O`--7#); literal 0 HcmV?d00001 
diff --git a/src/main.cpp b/src/main.cpp index 7602d1f..79c54fb 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,7 @@ #include "testing_helpers.hpp" // The tests default to an array of size 1 << 8 = 256 -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 16; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d4732e8..b456f2f 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -11,8 +11,8 @@ #include /*! Block size used for CUDA kernel launch. */ -#define blockSize 128 -#define sectionSize 128 +#define blockSize 1024 +#define sectionSize 1024 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) // usage: checkCUDAError("a descriptive name of this error") From e9cafcc80241a34b77fd0097beefc46e85920bb7 Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Tue, 21 Sep 2021 11:16:12 -0400 Subject: [PATCH 20/27] update readme --- README.md | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index dec8797..23d5c79 100644 --- a/README.md +++ b/README.md @@ -6,24 +6,6 @@

---- -## Highlights - -XXXXX - - - -Tested on: - -``` - [SM 8.6 NVIDIA GeForce RTX 3080] - Max threads per block: 1024 - Shared memory per block: 49152 bytes - Max threads per SM: 1536 - Max blocks per SM: 16 - Max grid size: 2147483647, 65535, 65535 -``` - --- ## Features @@ -44,6 +26,12 @@ When the array size is under 20,000, CPU Scan performs better than other algorit Output when array size is 65536: ``` + [SM 8.6 NVIDIA GeForce RTX 3080] + Max threads per block: 1024 + Shared memory per block: 49152 bytes + Max threads per SM: 1536 + Max blocks per SM: 16 + Max grid size: 2147483647, 65535, 65535 **************** ** SCAN TESTS ** **************** From 4580221fae5e5e106331dd76e9bdcff90dcb0dcd Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Tue, 21 Sep 2021 14:23:26 -0400 Subject: [PATCH 21/27] patch #1 --- src/main.cpp | 12 ++++---- stream_compaction/common.h | 4 +-- stream_compaction/naive.cu | 57 ++++++++++++++++++++++++++++++++++---- 3 files changed, 59 insertions(+), 14 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 79c54fb..e511d7d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,7 @@ #include "testing_helpers.hpp" // The tests default to an array of size 1 << 8 = 256 -const int SIZE = 1 << 16; // feel free to change the size of array +const int SIZE = 1 << 20; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -101,7 +101,7 @@ int main(int argc, char* argv[]) { printCmpResult(NPOT, b, c); printf("\n"); - +#if 0 zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); @@ -115,12 +115,12 @@ int main(int argc, char* argv[]) { printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - +#endif zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); 
StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - // printArray(SIZE, c, true); + printArray(SIZE, c, false); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -141,14 +141,14 @@ int main(int argc, char* argv[]) { printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - printArray(SIZE, c, true); + // printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - printArray(NPOT, c, true); + // printArray(NPOT, c, true); printCmpResult(NPOT, b, c); diff --git a/stream_compaction/common.h b/stream_compaction/common.h index b456f2f..b784980 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -11,8 +11,8 @@ #include /*! Block size used for CUDA kernel launch. */ -#define blockSize 1024 -#define sectionSize 1024 +#define blockSize 256 +#define sectionSize 256 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) // usage: checkCUDAError("a descriptive name of this error") diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 8c038c4..25c6880 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -144,6 +144,8 @@ namespace StreamCompaction { } } + + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ @@ -157,6 +159,7 @@ namespace StreamCompaction { int* d_OutputExclusiveData; int* d_SumArray; int* d_SumArrayOutput; + int* d_SumArrayAx; cudaMalloc((void**)&d_InputData, size); checkCUDAError("cudaMalloc d_InputData failed!"); @@ -173,6 +176,9 @@ namespace StreamCompaction { cudaMalloc((void**)&d_SumArrayOutput, sumArraySize); checkCUDAError("cudaMalloc d_SumArrayOutput failed!"); + cudaMalloc((void**)&d_SumArrayAx, sumArraySize); + checkCUDAError("cudaMalloc d_SumArrayOutput failed!"); + cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice); dim3 dimGridArray((n + blockSize - 1) / blockSize, 1, 1); @@ -181,6 +187,10 @@ namespace StreamCompaction { dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1); dim3 dimBlockSumArray(blockSize, 1, 1); + // for testing + int* sumArray = new int[sumArrayNumEle]; + int* sumArrayOutput = new int[sumArrayNumEle]; + timer().startGpuTimer(); // First step: compute the scan result for individual sections // then, store their block sum to sumArray @@ -188,13 +198,50 @@ namespace StreamCompaction { d_OutputData, d_SumArray, n); checkCUDAError("kernNaiveGPUScanFirstStep failed!"); - //(); + // cudaDeviceSynchronize(); + + cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); + + cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); + + std::cout << '\n'; + for (int i = 0; i < n; i++) + { + std::cout << odata[i] << ' '; + if ((i + 1) % 8 == 0) { + std::cout << std::endl; + } + } + + std::cout << '\n'; + for (int i = 0; i < sumArrayNumEle; i++) + { + std::cout << sumArray[i] << ' '; + } + + std::cout << '\n'; // Second step: scan block sums - kernNaiveGPUScanSecondStep << > > ( - d_SumArray, d_SumArrayOutput, sumArrayNumEle); - checkCUDAError("kernNaiveGPUScanSecondStep failed!"); + kernNaiveGPUScanFirstStep << > > (d_SumArray, + d_SumArrayOutput, d_SumArrayAx, n); + + kernNaiveGPUScanThirdStep << > > 
( + d_SumArrayAx, d_SumArrayOutput, n); + + cudaMemcpy(sumArrayOutput, d_SumArrayOutput, sumArraySize, + cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); + printf("\n"); + + for (int i = 0; i < sumArrayNumEle; i++) + { + std::cout << sumArrayOutput[i] << ' '; + } + + printf("\n"); //cudaDeviceSynchronize(); // Third step: add scanned block sum i to all values of scanned block @@ -203,8 +250,6 @@ namespace StreamCompaction { d_SumArrayOutput, d_OutputData, n); checkCUDAError("kernNaiveGPUScanThirdStep failed!"); - // cudaDeviceSynchronize(); - // Last step: convertFromInclusiveToExclusive << > > ( d_OutputData, d_OutputExclusiveData, n); From d4b61dba239f2733c176c5bce660f60a905a9ca8 Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Tue, 21 Sep 2021 16:08:39 -0400 Subject: [PATCH 22/27] Check 2 --- src/main.cpp | 4 +- stream_compaction/common.h | 6 +- stream_compaction/naive.cu | 118 +++++++++++++++++++------------------ 3 files changed, 67 insertions(+), 61 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index e511d7d..014825e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,7 @@ #include "testing_helpers.hpp" // The tests default to an array of size 1 << 8 = 256 -const int SIZE = 1 << 20; // feel free to change the size of array +const int SIZE = 1 << 19; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -120,7 +120,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - printArray(SIZE, c, false); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan diff --git a/stream_compaction/common.h b/stream_compaction/common.h index b784980..8e2a01b 100644 --- a/stream_compaction/common.h +++ 
b/stream_compaction/common.h @@ -11,8 +11,10 @@ #include /*! Block size used for CUDA kernel launch. */ -#define blockSize 256 -#define sectionSize 256 +#define blockSize 512 +#define sectionSize 512 + +#define MAX_SUM_ARRAY_SIZE 1024 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) // usage: checkCUDAError("a descriptive name of this error") diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 25c6880..2cfa357 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -4,6 +4,9 @@ #include "naive.h" #include // testing +#include // for assert() + + namespace StreamCompaction { namespace Naive { @@ -14,8 +17,6 @@ namespace StreamCompaction { return timer; } - // write a kernel to convert from inclusive scan to exclusive scan - __global__ void convertFromInclusiveToExclusive(const int* inputArray, int* outputArray, int inputSize) { @@ -33,46 +34,6 @@ namespace StreamCompaction { } } - void unitTestConversion() - { - // for testing - int numObject = 8; - int size = numObject * sizeof(int); - int* toyExclusiveArray = new int[numObject]; - int* toyInclusiveArray = new int[numObject] {3, 4, 11, 11, 15, 16, 22, 25}; - - int* dev_toyExclusiveArray; - int* dev_toyInclusiveArray; - - cudaMalloc((void**)&dev_toyExclusiveArray, size); - checkCUDAError("cudaMalloc dev_toyExclusiveArray failed!"); - - cudaMalloc((void**)&dev_toyInclusiveArray, size); - checkCUDAError("cudaMalloc dev_toyInclusiveArray failed!"); - - cudaMemcpy(dev_toyInclusiveArray, toyInclusiveArray, size, - cudaMemcpyHostToDevice); - - dim3 dimGridArray((numObject + blockSize - 1) / blockSize, 1, 1); - dim3 dimBlockArray(blockSize, 1, 1); - convertFromInclusiveToExclusive <<>> ( - dev_toyInclusiveArray, dev_toyExclusiveArray, numObject); - - cudaMemcpy(toyExclusiveArray, dev_toyExclusiveArray, size, - cudaMemcpyDeviceToHost); - checkCUDAError("memCpy back failed!"); - - printf("\n"); - - for (int i = 0; i < numObject; i++) - { - std::cout 
<< toyExclusiveArray[i] << '\n'; - } - - printf("\n"); - - } - __device__ void computeScanToOutputArray(const int* inputArray, int* outputArray, int* XY, int inputSize) { @@ -130,7 +91,7 @@ namespace StreamCompaction { { // Each thread loads one value from the input array into shared // memory array XY - __shared__ int XY[sectionSize]; + __shared__ int XY[MAX_SUM_ARRAY_SIZE]; computeScanToOutputArray(inputArray, outputArray, XY, inputSize); } @@ -144,14 +105,13 @@ namespace StreamCompaction { } } - - /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { int size = n * sizeof(int); int sumArrayNumEle = (n + blockSize - 1) / blockSize; + assert(sumArrayNumEle <= 1024 && "Sum Array has more than 1024 elements!"); int sumArraySize = sumArrayNumEle * sizeof(int); int* d_InputData; @@ -184,8 +144,9 @@ namespace StreamCompaction { dim3 dimGridArray((n + blockSize - 1) / blockSize, 1, 1); dim3 dimBlockArray(blockSize, 1, 1); - dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1); - dim3 dimBlockSumArray(blockSize, 1, 1); + + dim3 dimGridSumArray(1, 1, 1); + dim3 dimBlockSumArray(sumArrayNumEle, 1, 1); // for testing int* sumArray = new int[sumArrayNumEle]; @@ -197,9 +158,8 @@ namespace StreamCompaction { kernNaiveGPUScanFirstStep << > > (d_InputData, d_OutputData, d_SumArray, n); checkCUDAError("kernNaiveGPUScanFirstStep failed!"); - - // cudaDeviceSynchronize(); - +#if 0 + cudaDeviceSynchronize(); cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost); checkCUDAError("memCpy back failed!"); @@ -222,13 +182,12 @@ namespace StreamCompaction { } std::cout << '\n'; - +#endif // Second step: scan block sums - kernNaiveGPUScanFirstStep << > > (d_SumArray, - d_SumArrayOutput, d_SumArrayAx, n); - - kernNaiveGPUScanThirdStep << > > ( - d_SumArrayAx, d_SumArrayOutput, n); + kernNaiveGPUScanSecondStep << > > ( + d_SumArray, d_SumArrayOutput, sumArrayNumEle); + 
checkCUDAError("kernNaiveGPUScanSecondStep failed!"); +#if 0 cudaMemcpy(sumArrayOutput, d_SumArrayOutput, sumArraySize, cudaMemcpyDeviceToHost); @@ -242,15 +201,18 @@ namespace StreamCompaction { } printf("\n"); - //cudaDeviceSynchronize(); +#endif // Third step: add scanned block sum i to all values of scanned block // i + 1 kernNaiveGPUScanThirdStep << > > ( d_SumArrayOutput, d_OutputData, n); checkCUDAError("kernNaiveGPUScanThirdStep failed!"); + // cudaDeviceSynchronize(); + // Last step: + convertFromInclusiveToExclusive << > > ( d_OutputData, d_OutputExclusiveData, n); checkCUDAError("convertFromInclusiveToExclusive failed!"); @@ -270,3 +232,45 @@ namespace StreamCompaction { } } } + +#if 0 +void unitTestConversion() +{ + // for testing + int numObject = 8; + int size = numObject * sizeof(int); + int* toyExclusiveArray = new int[numObject]; + int* toyInclusiveArray = new int[numObject] {3, 4, 11, 11, 15, 16, 22, 25}; + + int* dev_toyExclusiveArray; + int* dev_toyInclusiveArray; + + cudaMalloc((void**)&dev_toyExclusiveArray, size); + checkCUDAError("cudaMalloc dev_toyExclusiveArray failed!"); + + cudaMalloc((void**)&dev_toyInclusiveArray, size); + checkCUDAError("cudaMalloc dev_toyInclusiveArray failed!"); + + cudaMemcpy(dev_toyInclusiveArray, toyInclusiveArray, size, + cudaMemcpyHostToDevice); + + dim3 dimGridArray((numObject + blockSize - 1) / blockSize, 1, 1); + dim3 dimBlockArray(blockSize, 1, 1); + convertFromInclusiveToExclusive << > > ( + dev_toyInclusiveArray, dev_toyExclusiveArray, numObject); + + cudaMemcpy(toyExclusiveArray, dev_toyExclusiveArray, size, + cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); + + printf("\n"); + + for (int i = 0; i < numObject; i++) + { + std::cout << toyExclusiveArray[i] << '\n'; + } + + printf("\n"); + +} +#endif \ No newline at end of file From 4fc85a73d091ae3535545b98e8bda0fdd174c42c Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Tue, 21 Sep 2021 22:41:04 -0400 Subject: [PATCH 23/27] Asking 
extension --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 23d5c79..0bf7c8e 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,8 @@ --- +I would like to extend project 2 by one day, please. + ## Features - CPU Scan & Stream Compaction @@ -159,3 +161,11 @@ CUDA error (d:\dev\565\project2-stream-compaction\stream_compaction\naive.cu:84) > When a ```__syncthread()``` statement is placed in an if-statement, either all or none of the threads in a block execute the path that includes the __syncthreads(). PMPP p.59 +## Note + +- CPU sequential scan algorithms are linear algorithms and are extremely work-efficient. +- Expected speed: Thrust > GPU Efficient(Brent Kung) >= CPU > Naive GPU (koggle stone) + - Why is Naive GPU slower than CPU ? + - Naive GPU has control divergence in the first warp. Performance hit is worse for smaller block size. + - Naive GPU is not work-efficient. Naive GPU has NlogN - (N - 1), whereas CPU has only (N - 1) + From 86fe4eceaa64ec0b2031245fe6e8fdf64cfb1296 Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Wed, 22 Sep 2021 01:35:44 -0400 Subject: [PATCH 24/27] New Version --- README.md | 2 + src/main.cpp | 10 +- stream_compaction/common.h | 2 + stream_compaction/cpu.cu | 23 +- stream_compaction/efficient.cu | 403 ++++++++++++++++----------------- stream_compaction/naive.cu | 257 +++++++++------------ 6 files changed, 331 insertions(+), 366 deletions(-) diff --git a/README.md b/README.md index 0bf7c8e..04682fc 100644 --- a/README.md +++ b/README.md @@ -168,4 +168,6 @@ CUDA error (d:\dev\565\project2-stream-compaction\stream_compaction\naive.cu:84) - Why is Naive GPU slower than CPU ? - Naive GPU has control divergence in the first warp. Performance hit is worse for smaller block size. - Naive GPU is not work-efficient. Naive GPU has NlogN - (N - 1), whereas CPU has only (N - 1) + - Why is GPU Efficient quicker? + - reduction step takes N - 1 operations, distribution phase takes N operations. 
Overall, it is a work-efficient algorithm. diff --git a/src/main.cpp b/src/main.cpp index 014825e..c937ead 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -101,21 +101,21 @@ int main(int argc, char* argv[]) { printCmpResult(NPOT, b, c); printf("\n"); -#if 0 + zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); -#endif + zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); @@ -133,7 +133,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 8e2a01b..cb83569 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -10,11 +10,13 @@ #include #include +#if 0 /*! Block size used for CUDA kernel launch. */ #define blockSize 512 #define sectionSize 512 #define MAX_SUM_ARRAY_SIZE 1024 +#endif #define FILENAME (strrchr(__FILE__, '/') ? 
strrchr(__FILE__, '/') + 1 : __FILE__) // usage: checkCUDAError("a descriptive name of this error") diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 89d81f5..9880e84 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -14,7 +14,24 @@ namespace StreamCompaction { return timer; } - void scanWithoutTimer(int n, int* odata, const int* idata) { + /** + * A work-efficient CPU inclusive scan. + */ + void sequentialInclusiveScan(int n, int* odata, const int* idata) + { + int accumulator = idata[0]; + odata[0] = accumulator; + for (int i = 1; i < n; i++) + { + accumulator += idata[i]; + odata[i] = accumulator; + } + } + + /** + * A work-efficient CPU exclusive scan. + */ + void sequentialExclusiveScan(int n, int* odata, const int* idata) { odata[0] = 0; for (int j = 1; j < n; j++) { @@ -29,7 +46,7 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - scanWithoutTimer(n, odata, idata); + sequentialExclusiveScan(n, odata, idata); timer().endCpuTimer(); } @@ -90,7 +107,7 @@ namespace StreamCompaction { } // STEP 2: Run exclusive scan on tempArray - scanWithoutTimer(n, scanResult.get(), tempArray.get()); + sequentialExclusiveScan(n, scanResult.get(), tempArray.get()); // STEP 3: scatter for (int i = 0; i < n; i++) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index ea72d31..e3d8f15 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,9 @@ #include "common.h" #include "efficient.h" #include +#include + +#define SECTION_SIZE 1024 namespace StreamCompaction { namespace Efficient { @@ -30,265 +33,257 @@ namespace StreamCompaction { } } - __device__ void reductionStep(int *XY) + // lanuch this kernel with SECTION_SIZE / 2 threads in a block + __global__ void kernBrentKungScan(const int* X, int* Y, int* S, int inputSize) { + __shared__ int XY[SECTION_SIZE]; + // 2 * here responsible for handling multiple blocks + // 
now you only consider one block + int i = 2 * blockIdx.x * blockDim.x + threadIdx.x; + if (i < inputSize) + { + XY[threadIdx.x] = X[i]; + } + else { + XY[threadIdx.x] = 0; + } + if ((i + blockDim.x) < inputSize) + { + XY[threadIdx.x + blockDim.x] = X[i + blockDim.x]; + } + else { + XY[threadIdx.x + blockDim.x] = 0; + } + + // note here we have stride <= blockDim.x for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) { - // make sure that input is in place __syncthreads(); - int index = (threadIdx.x + 1) * stride * 2 - 1; - if (index < sectionSize) + int index = (threadIdx.x + 1) * 2 * stride - 1; + if (index < SECTION_SIZE) { XY[index] += XY[index - stride]; } } - } - __device__ void postScanStep(int* XY) - { - for (unsigned int stride = sectionSize / 4; stride > 0; stride /= 2) + for (int stride = SECTION_SIZE / 4; stride > 0; stride /= 2) { - // make sure that input is in place __syncthreads(); - int index = (threadIdx.x + 1) * stride * 2 - 1; - if ((index + stride) < sectionSize) + int index = (threadIdx.x + 1) * 2 * stride - 1; + if ((index + stride) < SECTION_SIZE) { XY[index + stride] += XY[index]; } } - } - __device__ void computeScanToOutputArray(const int* inputArray, int* outputArray, - int* XY, int inputSize) - { - int i = 2 * blockIdx.x * blockDim.x + threadIdx.x; - // each thread loads two input elements into the shared memory + __syncthreads(); if (i < inputSize) { - XY[threadIdx.x] = inputArray[i]; + Y[i] = XY[threadIdx.x]; } - if (i + blockDim.x < inputSize) - { - XY[threadIdx.x + blockDim.x] = inputArray[i + blockDim.x]; + else { + Y[i] = 0; } - reductionStep(XY); - postScanStep(XY); - // each thread write two elements into the output array - __syncthreads(); - if (i < inputSize) + if ((i + blockDim.x) < inputSize) { - outputArray[i] = XY[threadIdx.x]; + Y[i + blockDim.x] = XY[threadIdx.x + blockDim.x]; } - if (i + blockDim.x < inputSize) - { - outputArray[i + blockDim.x] = XY[threadIdx.x + blockDim.x]; + else { + Y[i + blockDim.x] = 0; 
} - } - - __global__ void kernWorkEfficientGPUScanFirstStep(const int* inputArray, - int* outputArray, int* SumArray, int inputSize) - { - __shared__ int XY[sectionSize]; - computeScanToOutputArray(inputArray, outputArray, XY, inputSize); // the last thread in the block should write the output value of - // the last XY element in the block to the blockIdx.x position of - // SumArray + // the last XY element in the block to the blockIdx.x position of + // SumArray - // make sure XY[sectionSize - 1] has the correct partial sum + // make sure XY[sectionSize - 1] has the correct partial sum __syncthreads(); if (threadIdx.x == blockDim.x - 1) { - SumArray[blockIdx.x] = XY[sectionSize - 1]; + S[blockIdx.x] = XY[SECTION_SIZE - 1]; } } - __global__ void kernWorkEfficientGPUScanSecondStep(const int* inputArray, - int* outputArray, int inputSize) + __global__ void kernBrentKungScan(const int* X, int* Y, int inputSize) { - __shared__ int XY[sectionSize]; - computeScanToOutputArray(inputArray, outputArray, XY, inputSize); - } + __shared__ int XY[SECTION_SIZE]; + // 2 * here responsible for handling multiple blocks + // now you only consider one block + int i = 2 * blockIdx.x * blockDim.x + threadIdx.x; + if (i < inputSize) + { + XY[threadIdx.x] = X[i]; + } + else { + XY[threadIdx.x] = 0; + } + if ((i + blockDim.x) < inputSize) + { + XY[threadIdx.x + blockDim.x] = X[i + blockDim.x]; + } + else { + XY[threadIdx.x + blockDim.x] = 0; + } + + // note here we have stride <= blockDim.x + for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) + { + __syncthreads(); + int index = (threadIdx.x + 1) * 2 * stride - 1; + if (index < SECTION_SIZE) + { + XY[index] += XY[index - stride]; + } + } + for (int stride = SECTION_SIZE / 4; stride > 0; stride /= 2) + { + __syncthreads(); + int index = (threadIdx.x + 1) * 2 * stride - 1; + if ((index + stride) < SECTION_SIZE) + { + XY[index + stride] += XY[index]; + } + } - __global__ void kernWorkEfficientGPUScanThirdStep(const int* 
inputArray, - int* outputArray, int inputSize) + __syncthreads(); + if (i < inputSize) + { + Y[i] = XY[threadIdx.x]; + } + else { + Y[i] = 0; + } + if ((i + blockDim.x) < inputSize) + { + Y[i + blockDim.x] = XY[threadIdx.x + blockDim.x]; + } + else { + Y[i + blockDim.x] = 0; + } + } + + __global__ void kernBrentKungScanAddUpSumArray(const int* S, + int* Y, int inputSize) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < inputSize && blockIdx.x > 0) { - outputArray[i] += inputArray[blockIdx.x - 1]; + Y[i] += S[blockIdx.x - 1]; } } - - void scanWithoutTimer(int n, int* odata, const int* idata) { - int size = n * sizeof(int); - int sumArrayNumEle = (n + blockSize - 1) / blockSize; - int sumArraySize = sumArrayNumEle * sizeof(int); - - int* d_InputData; - int* d_OutputData; - int* d_OutputExclusiveData; - int* d_SumArray; - int* d_SumArrayOutput; - - cudaMalloc((void**)&d_InputData, size); - checkCUDAError("cudaMalloc d_InputData failed!"); - - cudaMalloc((void**)&d_OutputData, size); - checkCUDAError("cudaMalloc d_OutputData failed!"); - - cudaMalloc((void**)&d_OutputExclusiveData, size); - checkCUDAError("cudaMalloc d_OutputExclusiveData failed!"); - - cudaMalloc((void**)&d_SumArray, sumArraySize); - checkCUDAError("cudaMalloc d_SumArray failed!"); - - cudaMalloc((void**)&d_SumArrayOutput, sumArraySize); - checkCUDAError("cudaMalloc d_SumArrayOutput failed!"); - - cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice); - - // Only need to launch a kernel with (blockSize / 2) in a block - // b/c each thread loads/stores two elements - dim3 dimGridArrayEfficient((n + (blockSize / 2) - 1) / (blockSize / 2), 1, 1); - dim3 dimBlockArrayEfficient((blockSize / 2), 1, 1); - - dim3 dimGridSumArray((sumArrayNumEle + (blockSize / 2) - 1) / (blockSize / 2), 1, 1); - dim3 dimBlockSumArray((blockSize / 2), 1, 1); - - dim3 dimGridArray((n + blockSize - 1) / blockSize, 1, 1); - dim3 dimBlockArray(blockSize, 1, 1); - - // timer().startGpuTimer(); - - // First step: 
compute the scan result for individual sections - // then, store their block sum to sumArray - kernWorkEfficientGPUScanFirstStep << > > (d_InputData, d_OutputData, - d_SumArray, n); - checkCUDAError("kernNaiveGPUScanFirstStep failed!"); - - // cudaDeviceSynchronize(); - - // Second step: scan block sums - kernWorkEfficientGPUScanSecondStep << > > ( - d_SumArray, d_SumArrayOutput, sumArrayNumEle); - checkCUDAError("kernNaiveGPUScanSecondStep failed!"); - - // cudaDeviceSynchronize(); - - // Third step: add scanned block sum i to all values of scanned block - // i + 1 - kernWorkEfficientGPUScanThirdStep << > > ( - d_SumArrayOutput, d_OutputData, n); - checkCUDAError("kernNaiveGPUScanThirdStep failed!"); - - // cudaDeviceSynchronize(); - - // Last step: - convertFromInclusiveToExclusive << > > ( - d_OutputData, d_OutputExclusiveData, n); - checkCUDAError("convertFromInclusiveToExclusive failed!"); - // timer().endGpuTimer(); - - cudaMemcpy(odata, d_OutputExclusiveData, size, cudaMemcpyDeviceToHost); - checkCUDAError("memCpy back failed!"); - - // cleanup - cudaFree(d_InputData); - cudaFree(d_OutputData); - cudaFree(d_OutputExclusiveData); - cudaFree(d_SumArray); - cudaFree(d_SumArrayOutput); - checkCUDAError("cudaFree failed!"); - } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { - int size = n * sizeof(int); - int sumArrayNumEle = (n + blockSize - 1) / blockSize; - int sumArraySize = sumArrayNumEle * sizeof(int); - - int* d_InputData; - int* d_OutputData; - int* d_OutputExclusiveData; - int* d_SumArray; - int* d_SumArrayOutput; - - cudaMalloc((void**)&d_InputData, size); - checkCUDAError("cudaMalloc d_InputData failed!"); - - cudaMalloc((void**)&d_OutputData, size); - checkCUDAError("cudaMalloc d_OutputData failed!"); - - cudaMalloc((void**)&d_OutputExclusiveData, size); - checkCUDAError("cudaMalloc d_OutputExclusiveData failed!"); - - cudaMalloc((void**)&d_SumArray, sumArraySize); - checkCUDAError("cudaMalloc d_SumArray failed!"); - - cudaMalloc((void**)&d_SumArrayOutput, sumArraySize); - checkCUDAError("cudaMalloc d_SumArrayOutput failed!"); - - cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice); - - // Only need to launch a kernel with (blockSize / 2) in a block - // b/c each thread loads/stores two elements - dim3 dimGridArrayEfficient((n + (blockSize / 2) - 1) / (blockSize / 2), 1, 1); - dim3 dimBlockArrayEfficient((blockSize / 2), 1, 1); - - dim3 dimGridSumArray((sumArrayNumEle + (blockSize / 2) - 1) / (blockSize / 2), 1, 1); - dim3 dimBlockSumArray((blockSize / 2), 1, 1); - - dim3 dimGridArray((n + blockSize - 1) / blockSize, 1, 1); - dim3 dimBlockArray(blockSize, 1, 1); + // n could be larger than SECTION_SIZE + int idataSizeBytes = n * sizeof(int); + int sumArraySizeBytes = (n / SECTION_SIZE) * sizeof(int); + + // MaxThreadsPerBlock: 1024. 
However, SECTION_SIZE / 2 is needed + // for kernBrentKungScan + assert(SECTION_SIZE <= 1024); + assert(n <= 524288); + + dim3 dimGridBrent((n + (SECTION_SIZE / 2) - 1) / (SECTION_SIZE / 2), 1, 1); + dim3 dimBlockBrent(SECTION_SIZE / 2, 1, 1); + + dim3 dimGridBrentSumArray(1, 1, 1); + dim3 dimBlockBrentSumArray(SECTION_SIZE / 2, 1, 1); + + dim3 dimGridArray((n + SECTION_SIZE - 1) / SECTION_SIZE, 1, 1); + dim3 dimBlockArray(SECTION_SIZE, 1, 1); + + int* d_X; + int* d_Y; + int* d_S; + int* d_SOut; + int* d_YExclusive; + cudaMalloc((void**)&d_X, idataSizeBytes); + checkCUDAError("cudaMalloc d_X failed!"); + cudaMalloc((void**)&d_Y, idataSizeBytes); + checkCUDAError("cudaMalloc d_Y failed!"); + cudaMalloc((void**)&d_YExclusive, idataSizeBytes); + checkCUDAError("cudaMalloc d_YExclusive failed!"); + cudaMalloc((void**)&d_S, sumArraySizeBytes); + checkCUDAError("cudaMalloc d_S failed!"); + cudaMalloc((void**)&d_SOut, sumArraySizeBytes); + checkCUDAError("cudaMalloc d_SOut failed!"); + + cudaMemcpy(d_X, idata, idataSizeBytes, cudaMemcpyHostToDevice); timer().startGpuTimer(); + kernBrentKungScan << > > (d_X, d_Y, d_S, n); + kernBrentKungScan << > > (d_S, d_SOut, n); + kernBrentKungScanAddUpSumArray << > > (d_SOut, d_Y, n); + convertFromInclusiveToExclusive << > > (d_Y, d_YExclusive, n); + timer().endGpuTimer(); - // First step: compute the scan result for individual sections - // then, store their block sum to sumArray - kernWorkEfficientGPUScanFirstStep <<> > (d_InputData, d_OutputData, - d_SumArray, n); - checkCUDAError("kernNaiveGPUScanFirstStep failed!"); - - // cudaDeviceSynchronize(); - - // Second step: scan block sums - kernWorkEfficientGPUScanSecondStep << >> ( - d_SumArray, d_SumArrayOutput, sumArrayNumEle); - checkCUDAError("kernNaiveGPUScanSecondStep failed!"); - - // cudaDeviceSynchronize(); - - // Third step: add scanned block sum i to all values of scanned block - // i + 1 - kernWorkEfficientGPUScanThirdStep << >> ( - d_SumArrayOutput, d_OutputData, n); - 
checkCUDAError("kernNaiveGPUScanThirdStep failed!"); - - // cudaDeviceSynchronize(); + cudaMemcpy(odata, d_YExclusive, idataSizeBytes, cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); - // Last step: - convertFromInclusiveToExclusive <<>> ( - d_OutputData, d_OutputExclusiveData, n); - checkCUDAError("convertFromInclusiveToExclusive failed!"); - timer().endGpuTimer(); + cudaFree(d_X); + cudaFree(d_Y); + cudaFree(d_S); + cudaFree(d_SOut); + cudaFree(d_YExclusive); + checkCUDAError("cudaFree failed!"); + } - cudaMemcpy(odata, d_OutputExclusiveData, size, cudaMemcpyDeviceToHost); + void scanWithoutTimer(int n, int* odata, const int* idata) { + // n could be larger than SECTION_SIZE + int idataSizeBytes = n * sizeof(int); + int sumArraySizeBytes = (n / SECTION_SIZE) * sizeof(int); + + // MaxThreadsPerBlock: 1024. However, SECTION_SIZE / 2 is needed + // for kernBrentKungScan + assert(SECTION_SIZE <= 1024); + assert(n <= 524288); + + dim3 dimGridBrent((n + (SECTION_SIZE / 2) - 1) / (SECTION_SIZE / 2), 1, 1); + dim3 dimBlockBrent(SECTION_SIZE / 2, 1, 1); + + dim3 dimGridBrentSumArray(1, 1, 1); + dim3 dimBlockBrentSumArray(SECTION_SIZE / 2, 1, 1); + + dim3 dimGridArray((n + SECTION_SIZE - 1) / SECTION_SIZE, 1, 1); + dim3 dimBlockArray(SECTION_SIZE, 1, 1); + + int* d_X; + int* d_Y; + int* d_S; + int* d_SOut; + int* d_YExclusive; + cudaMalloc((void**)&d_X, idataSizeBytes); + checkCUDAError("cudaMalloc d_X failed!"); + cudaMalloc((void**)&d_Y, idataSizeBytes); + checkCUDAError("cudaMalloc d_Y failed!"); + cudaMalloc((void**)&d_YExclusive, idataSizeBytes); + checkCUDAError("cudaMalloc d_YExclusive failed!"); + cudaMalloc((void**)&d_S, sumArraySizeBytes); + checkCUDAError("cudaMalloc d_S failed!"); + cudaMalloc((void**)&d_SOut, sumArraySizeBytes); + checkCUDAError("cudaMalloc d_SOut failed!"); + + cudaMemcpy(d_X, idata, idataSizeBytes, cudaMemcpyHostToDevice); + + kernBrentKungScan << > > (d_X, d_Y, d_S, n); + kernBrentKungScan << > > (d_S, d_SOut, n); + 
kernBrentKungScanAddUpSumArray << > > (d_SOut, d_Y, n); + convertFromInclusiveToExclusive << > > (d_Y, d_YExclusive, n); + + cudaMemcpy(odata, d_YExclusive, idataSizeBytes, cudaMemcpyDeviceToHost); checkCUDAError("memCpy back failed!"); - // cleanup - cudaFree(d_InputData); - cudaFree(d_OutputData); - cudaFree(d_OutputExclusiveData); - cudaFree(d_SumArray); - cudaFree(d_SumArrayOutput); + cudaFree(d_X); + cudaFree(d_Y); + cudaFree(d_S); + cudaFree(d_SOut); + cudaFree(d_YExclusive); checkCUDAError("cudaFree failed!"); } + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 2cfa357..14eac12 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -6,7 +6,7 @@ #include // testing #include // for assert() - +#define SECTION_SIZE 1024 namespace StreamCompaction { namespace Naive { @@ -16,7 +16,6 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - __global__ void convertFromInclusiveToExclusive(const int* inputArray, int* outputArray, int inputSize) { @@ -26,7 +25,7 @@ namespace StreamCompaction { // element and out-of-bound elements with 0. 
if (i < inputSize && i != 0) { - + outputArray[i] = inputArray[i - 1]; } else { @@ -34,200 +33,150 @@ namespace StreamCompaction { } } - __device__ void computeScanToOutputArray(const int* inputArray, int* outputArray, - int* XY, int inputSize) + __global__ void kernKoggeStoneScanAddUpSumArray(const int* S, + int* Y, int inputSize) + { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < inputSize && blockIdx.x > 0) + { + Y[i] += S[blockIdx.x - 1]; + } + } + + __global__ void kernKoggeStoneScan(int* X, int* Y, int* S, int inputSize) { + __shared__ int XY[SECTION_SIZE]; int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < inputSize) { - XY[threadIdx.x] = inputArray[i]; + XY[threadIdx.x] = X[i]; } else { XY[threadIdx.x] = 0; } - // perform naive scan + // performs iterative scan on XY + // note that it is stride < blockDim.x, not stride <= blockDim.x: + // if you have 16 elements, stride could only be 1,2,4,8 for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) { // make sure that input is in place __syncthreads(); - int previousValue = 0; - int previousIndex = threadIdx.x - stride; - if (previousIndex >= 0) + bool written = false; + int temp = 0; + if (threadIdx.x >= stride) { - previousValue = XY[previousIndex]; + temp = XY[threadIdx.x] + XY[threadIdx.x - stride]; + written = true; } - int temp = XY[threadIdx.x] + previousValue; // make sure previous output has been consumed __syncthreads(); - XY[threadIdx.x] = temp; + if (written) + { + XY[threadIdx.x] = temp; + } } - - // each thread writes its result into the output array - outputArray[i] = XY[threadIdx.x]; - } - - __global__ void kernNaiveGPUScanFirstStep(const int* inputArray, - int* outputArray, int* SumArray, int inputSize) - { - // Each thread loads one value from the input array into shared - // memory array XY - __shared__ int XY[sectionSize]; - computeScanToOutputArray(inputArray, outputArray, XY, inputSize); + Y[i] = XY[threadIdx.x]; // the last thread in the block should write the 
output value of // the last XY element in the block to the blockIdx.x position of // SumArray // make sure XY[sectionSize - 1] has the correct partial sum - __syncthreads(); + __syncthreads(); if (threadIdx.x == blockDim.x - 1) { - SumArray[blockIdx.x] = XY[sectionSize - 1]; + S[blockIdx.x] = XY[SECTION_SIZE - 1]; } } - - __global__ void kernNaiveGPUScanSecondStep(const int* inputArray, - int* outputArray, int inputSize) - { - // Each thread loads one value from the input array into shared - // memory array XY - __shared__ int XY[MAX_SUM_ARRAY_SIZE]; - computeScanToOutputArray(inputArray, outputArray, XY, inputSize); - } - - __global__ void kernNaiveGPUScanThirdStep(const int* inputArray, - int* outputArray, int inputSize) + __global__ void kernKoggeStoneScan(int* X, int* Y, int inputSize) { + __shared__ int XY[SECTION_SIZE]; int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < inputSize && blockIdx.x > 0) + if (i < inputSize) + { + XY[threadIdx.x] = X[i]; + } + else { + XY[threadIdx.x] = 0; + } + // performs iterative scan on XY + // note that it is stride < blockDim.x, not stride <= blockDim.x: + // if you have 16 elements, stride could only be 1,2,4,8 + for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) { - outputArray[i] += inputArray[blockIdx.x - 1]; + // make sure that input is in place + __syncthreads(); + bool written = false; + int temp = 0; + if (threadIdx.x >= stride) + { + temp = XY[threadIdx.x] + XY[threadIdx.x - stride]; + written = true; + } + // make sure previous output has been consumed + __syncthreads(); + if (written) + { + XY[threadIdx.x] = temp; + } } + Y[i] = XY[threadIdx.x]; } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { - int size = n * sizeof(int); - int sumArrayNumEle = (n + blockSize - 1) / blockSize; - assert(sumArrayNumEle <= 1024 && "Sum Array has more than 1024 elements!"); - int sumArraySize = sumArrayNumEle * sizeof(int); - - int* d_InputData; - int* d_OutputData; - int* d_OutputExclusiveData; - int* d_SumArray; - int* d_SumArrayOutput; - int* d_SumArrayAx; - - cudaMalloc((void**)&d_InputData, size); - checkCUDAError("cudaMalloc d_InputData failed!"); - - cudaMalloc((void**)&d_OutputData, size); - checkCUDAError("cudaMalloc d_OutputData failed!"); - - cudaMalloc((void**)&d_OutputExclusiveData, size); - checkCUDAError("cudaMalloc d_OutputExclusiveData failed!"); - - cudaMalloc((void**)&d_SumArray, sumArraySize); - checkCUDAError("cudaMalloc d_SumArray failed!"); - - cudaMalloc((void**)&d_SumArrayOutput, sumArraySize); - checkCUDAError("cudaMalloc d_SumArrayOutput failed!"); - - cudaMalloc((void**)&d_SumArrayAx, sumArraySize); - checkCUDAError("cudaMalloc d_SumArrayOutput failed!"); - - cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice); - - dim3 dimGridArray((n + blockSize - 1) / blockSize, 1, 1); - dim3 dimBlockArray(blockSize, 1, 1); - - - dim3 dimGridSumArray(1, 1, 1); - dim3 dimBlockSumArray(sumArrayNumEle, 1, 1); - - // for testing - int* sumArray = new int[sumArrayNumEle]; - int* sumArrayOutput = new int[sumArrayNumEle]; + // n could be larger than SECTION_SIZE + int idataSizeBytes = n * sizeof(int); + + int sumArraySizeBytes = (n / SECTION_SIZE) * sizeof(int); + + // MaxThreadsPerBlock: 1024 + assert(SECTION_SIZE <= 1024); + assert(n <= 1048576); // 2^20 + + dim3 dimGridKogge((n + SECTION_SIZE - 1) / SECTION_SIZE, 1, 1); + dim3 dimBlockKogge(SECTION_SIZE, 1, 1); + + dim3 dimGridKoggeSumArray(1, 1, 1); + dim3 dimBlockKoggeSumArray(SECTION_SIZE, 1, 1); + + int* d_X; + int* d_Y; + int* d_S; + int* d_SOut; + int* d_YExclusive; + cudaMalloc((void**)&d_X, idataSizeBytes); + checkCUDAError("cudaMalloc d_X 
failed!"); + cudaMalloc((void**)&d_Y, idataSizeBytes); + checkCUDAError("cudaMalloc d_Y failed!"); + cudaMalloc((void**)&d_YExclusive, idataSizeBytes); + checkCUDAError("cudaMalloc d_YExclusive failed!"); + cudaMalloc((void**)&d_S, sumArraySizeBytes); + checkCUDAError("cudaMalloc d_S failed!"); + cudaMalloc((void**)&d_SOut, sumArraySizeBytes); + checkCUDAError("cudaMalloc d_SOut failed!"); + + cudaMemcpy(d_X, idata, idataSizeBytes, cudaMemcpyHostToDevice); timer().startGpuTimer(); - // First step: compute the scan result for individual sections - // then, store their block sum to sumArray - kernNaiveGPUScanFirstStep << > > (d_InputData, - d_OutputData, d_SumArray, n); - checkCUDAError("kernNaiveGPUScanFirstStep failed!"); -#if 0 - cudaDeviceSynchronize(); - cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost); - checkCUDAError("memCpy back failed!"); - - cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost); - checkCUDAError("memCpy back failed!"); - - std::cout << '\n'; - for (int i = 0; i < n; i++) - { - std::cout << odata[i] << ' '; - if ((i + 1) % 8 == 0) { - std::cout << std::endl; - } - } - - std::cout << '\n'; - for (int i = 0; i < sumArrayNumEle; i++) - { - std::cout << sumArray[i] << ' '; - } - - std::cout << '\n'; -#endif - // Second step: scan block sums - kernNaiveGPUScanSecondStep << > > ( - d_SumArray, d_SumArrayOutput, sumArrayNumEle); - checkCUDAError("kernNaiveGPUScanSecondStep failed!"); -#if 0 - - cudaMemcpy(sumArrayOutput, d_SumArrayOutput, sumArraySize, - cudaMemcpyDeviceToHost); - checkCUDAError("memCpy back failed!"); - - printf("\n"); - - for (int i = 0; i < sumArrayNumEle; i++) - { - std::cout << sumArrayOutput[i] << ' '; - } - - printf("\n"); - -#endif - // Third step: add scanned block sum i to all values of scanned block - // i + 1 - kernNaiveGPUScanThirdStep << > > ( - d_SumArrayOutput, d_OutputData, n); - checkCUDAError("kernNaiveGPUScanThirdStep failed!"); - - // cudaDeviceSynchronize(); - - // Last step: - - 
convertFromInclusiveToExclusive << > > ( - d_OutputData, d_OutputExclusiveData, n); - checkCUDAError("convertFromInclusiveToExclusive failed!"); - + kernKoggeStoneScan <<>> (d_X, d_Y, d_S, n); + kernKoggeStoneScan <<>> (d_S, d_SOut, n); + kernKoggeStoneScanAddUpSumArray <<>> ( + d_SOut, d_Y, n); + convertFromInclusiveToExclusive << > > ( + d_Y, d_YExclusive, n); timer().endGpuTimer(); - cudaMemcpy(odata, d_OutputExclusiveData, size, cudaMemcpyDeviceToHost); + cudaMemcpy(odata, d_YExclusive, idataSizeBytes, cudaMemcpyDeviceToHost); checkCUDAError("memCpy back failed!"); - // cleanup - cudaFree(d_InputData); - cudaFree(d_OutputData); - cudaFree(d_OutputExclusiveData); - cudaFree(d_SumArray); - cudaFree(d_SumArrayOutput); + cudaFree(d_X); + cudaFree(d_Y); + cudaFree(d_S); + cudaFree(d_SOut); + cudaFree(d_YExclusive); checkCUDAError("cudaFree failed!"); } } From cd7a3575d5db9036bc2c1c11a56fe72be6078996 Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Wed, 22 Sep 2021 14:12:35 -0400 Subject: [PATCH 25/27] Update README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 04682fc..2807199 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,6 @@ --- -I would like to extend project 2 by one day, please. - ## Features - CPU Scan & Stream Compaction @@ -19,6 +17,8 @@ I would like to extend project 2 by one day, please. For all GPU Scan algorithms, I choose to implement inclusive Scan first, and then convert the result of inclusive Scan to exclusive Scan. This can be done in parallel with minimal code. +In this version, the partial sum array has a maximum size of 1024. Thus, the number of elements the Scan algorithm can handle is limited. 
+ ## Performance Analysis ![scan](images/scan.png) From 8c997fbb7c917a0924b3ed0763d23ab9e5b4d4b1 Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Wed, 22 Sep 2021 20:10:20 -0400 Subject: [PATCH 26/27] Naive Recursive Scan --- src/main.cpp | 6 +-- stream_compaction/naive.cu | 97 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 95 insertions(+), 8 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index c937ead..8c7d351 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,7 @@ #include "testing_helpers.hpp" // The tests default to an array of size 1 << 8 = 256 -const int SIZE = 1 << 19; // feel free to change the size of array +const int SIZE = 1 << 25; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -101,7 +101,7 @@ int main(int argc, char* argv[]) { printCmpResult(NPOT, b, c); printf("\n"); - +#if 0 zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); @@ -115,7 +115,7 @@ int main(int argc, char* argv[]) { printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - +#endif zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 14eac12..e635cef 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -123,6 +123,78 @@ namespace StreamCompaction { Y[i] = XY[threadIdx.x]; } + void scanRecursiveHelper(int n, int* odata, const int* idata) + { + int blockSize = (n + SECTION_SIZE - 1) / SECTION_SIZE; + int idataSizeBytes = n * sizeof(int); + + int sumArraySizeBytes = n <= 1024 ? 
n * sizeof(int) + : (n / SECTION_SIZE) * sizeof(int); + + dim3 dimGridKogge(blockSize, 1, 1); + dim3 dimBlockKogge(SECTION_SIZE, 1, 1); + + if (blockSize == 1) + { + int* d_X; + int* d_Y; + cudaMalloc((void**)&d_X, idataSizeBytes); + checkCUDAError("cudaMalloc d_X failed!"); + cudaMalloc((void**)&d_Y, idataSizeBytes); + checkCUDAError("cudaMalloc d_Y failed!"); + + cudaMemcpy(d_X, idata, idataSizeBytes, cudaMemcpyHostToDevice); + + kernKoggeStoneScan << > > ( + d_X, d_Y, n); + + cudaMemcpy(odata, d_Y, idataSizeBytes, cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); +#if 0 + std::cout << '\n'; + for (int i = 0; i < n; i++) + { + std::cout << odata[i] << '\n'; + } +#endif + cudaFree(d_X); + cudaFree(d_Y); + checkCUDAError("cudaFree failed!"); + } + else { + int* d_X; + int* d_Y; + cudaMalloc((void**)&d_X, idataSizeBytes); + checkCUDAError("cudaMalloc d_X failed!"); + cudaMalloc((void**)&d_Y, idataSizeBytes); + checkCUDAError("cudaMalloc d_Y failed!"); + int* d_S; + int* d_SOut; + cudaMalloc((void**)&d_S, sumArraySizeBytes); + checkCUDAError("cudaMalloc d_S failed!"); + cudaMalloc((void**)&d_SOut, sumArraySizeBytes); + checkCUDAError("cudaMalloc d_SOut failed!"); + + cudaMemcpy(d_X, idata, idataSizeBytes, cudaMemcpyHostToDevice); + checkCUDAError("memCpy back failed!"); + + kernKoggeStoneScan << > > (d_X, d_Y, d_S, n); + + scanRecursiveHelper(n / SECTION_SIZE, d_SOut, d_S); + kernKoggeStoneScanAddUpSumArray << > > ( + d_SOut, d_Y, n); + + cudaMemcpy(odata, d_Y, idataSizeBytes, cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); + + cudaFree(d_X); + cudaFree(d_Y); + cudaFree(d_S); + cudaFree(d_SOut); + checkCUDAError("cudaFree failed!"); + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ @@ -133,14 +205,16 @@ namespace StreamCompaction { int sumArraySizeBytes = (n / SECTION_SIZE) * sizeof(int); // MaxThreadsPerBlock: 1024 - assert(SECTION_SIZE <= 1024); - assert(n <= 1048576); // 2^20 + // assert(SECTION_SIZE <= 1024); + // assert(n <= 1048576); // 2^20 dim3 dimGridKogge((n + SECTION_SIZE - 1) / SECTION_SIZE, 1, 1); dim3 dimBlockKogge(SECTION_SIZE, 1, 1); - dim3 dimGridKoggeSumArray(1, 1, 1); - dim3 dimBlockKoggeSumArray(SECTION_SIZE, 1, 1); + // dim3 dimGridKoggeSumArray(1, 1, 1); + // dim3 dimBlockKoggeSumArray(SECTION_SIZE, 1, 1); + + int* sumArrayOutput = new int[n / SECTION_SIZE]; int* d_X; int* d_Y; @@ -162,7 +236,20 @@ namespace StreamCompaction { timer().startGpuTimer(); kernKoggeStoneScan <<>> (d_X, d_Y, d_S, n); - kernKoggeStoneScan <<>> (d_S, d_SOut, n); + scanRecursiveHelper(n / SECTION_SIZE, d_SOut, d_S); +#if 0 + cudaMemcpy(sumArrayOutput, d_SOut, sumArraySizeBytes, + cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); + + printf("\n"); + + for (int i = 0; i < n / SECTION_SIZE; i++) + { + std::cout << sumArrayOutput[i] << '\n'; + } +#endif + kernKoggeStoneScanAddUpSumArray <<>> ( d_SOut, d_Y, n); convertFromInclusiveToExclusive << > > ( From 0e07c174477e79dd66ec1f1a238522b3fda035a1 Mon Sep 17 00:00:00 2001 From: Zixin Zhang Date: Wed, 22 Sep 2021 20:14:27 -0400 Subject: [PATCH 27/27] Update README --- README.md | 4 +--- stream_compaction/naive.cu | 19 ------------------- 2 files changed, 1 insertion(+), 22 deletions(-) diff --git a/README.md b/README.md index 2807199..ea2d7b4 100644 --- a/README.md +++ b/README.md @@ -11,14 +11,12 @@ ## Features - CPU Scan & Stream Compaction -- Naive GPU Scan Algorithm Using Shared Memory +- Recusive Naive GPU Scan Algorithm Using Shared Memory - Work-Efficient GPU Scan Using Shared Memory & Stream Compaction - Thrust's Scan Algorithm For all GPU Scan algorithms, I choose to implement inclusive Scan first, and then convert the result of inclusive Scan to exclusive Scan. 
This can be done in parallel with minimal code. -In this version, the partial sum array has a maximum size of 1024. Thus, the number of elements the Scan algorithm can handle is limited. - ## Performance Analysis ![scan](images/scan.png) diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index e635cef..88b4501 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -204,16 +204,9 @@ namespace StreamCompaction { int sumArraySizeBytes = (n / SECTION_SIZE) * sizeof(int); - // MaxThreadsPerBlock: 1024 - // assert(SECTION_SIZE <= 1024); - // assert(n <= 1048576); // 2^20 - dim3 dimGridKogge((n + SECTION_SIZE - 1) / SECTION_SIZE, 1, 1); dim3 dimBlockKogge(SECTION_SIZE, 1, 1); - // dim3 dimGridKoggeSumArray(1, 1, 1); - // dim3 dimBlockKoggeSumArray(SECTION_SIZE, 1, 1); - int* sumArrayOutput = new int[n / SECTION_SIZE]; int* d_X; @@ -237,18 +230,6 @@ namespace StreamCompaction { timer().startGpuTimer(); kernKoggeStoneScan <<>> (d_X, d_Y, d_S, n); scanRecursiveHelper(n / SECTION_SIZE, d_SOut, d_S); -#if 0 - cudaMemcpy(sumArrayOutput, d_SOut, sumArraySizeBytes, - cudaMemcpyDeviceToHost); - checkCUDAError("memCpy back failed!"); - - printf("\n"); - - for (int i = 0; i < n / SECTION_SIZE; i++) - { - std::cout << sumArrayOutput[i] << '\n'; - } -#endif kernKoggeStoneScanAddUpSumArray <<>> ( d_SOut, d_Y, n);