diff --git a/README.md b/README.md index 0e38ddb..d7351a0 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,111 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Paul (San) Jewell + * [LinkedIn](https://www.linkedin.com/in/paul-jewell-2aba7379), [work website]( + https://www.biociphers.org/paul-jewell-lab-member), [personal website](https://gitlab.com/inklabapp), [twitter](https://twitter.com/inklabapp), etc. +* Tested on: Linux pop-os 5.11.0-7614-generic, i7-9750H CPU @ 2.60GHz 32GB, GeForce GTX 1650 Mobile / Max-Q 4GB -### (TODO: Your README) +### GPU scan and stream compaction demonstration -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +This repository contains comparisons of basic algorithms between a CPU serial implementation, my GPU CUDA implementation, +and similar functions provided by the 'thrust' library. +I demonstrate usage of the naive, buffered, work-efficient, and chunked work-efficient scan algorithms as described in +[this link](https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda). +The implementation is also broken down into a number of sub-functions for easier comprehension by the curious reader. + +### Performance comparison overview + +After manually testing a number of block sizes, I've found that a size of 64 works best for the efficient scan +implementation (same as for boids), and 512 (the largest I can do) works best for the naive implementation. (This +pretty much makes sense, as it is a largely processor-count-limited algorithm.) Thrust does not allow specifying a +manual block size that I could find, so I didn't do any testing there.
+ +![boidstats](img/chart1.png) + +For a first iteration, I was unable to run larger sizes without causing OOM on my local machine + (desktop completely crashed), so I tested up to 1mil only. Clearly something seems to have goofed in either + a single blocking line in my efficient implementation, or the measurement code location. I don't believe a simple + inefficiency would lead to the huge discrepancy between naive and efficient implementations here. The no-chunking + implementation should do the same amount of operations for the up and down sweep as the chunking implementation (ideal), + however, it will get the incorrect result. There is something very inefficient with the chunking that I've not yet been + able to work out. + +``` + **************** + ** SCAN TESTS ** + **************** + [ 29 18 29 17 11 3 19 31 3 49 25 31 31 ... 14 0 ] + ==== cpu scan, power-of-two ==== + elapsed time: 0.000686ms (std::chrono Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 23987 23987 ] + ==== cpu scan, non-power-of-two ==== + elapsed time: 0.000377ms (std::chrono Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 23950 23960 ] + passed + ==== naive scan, power-of-two ==== + elapsed time: 0.060032ms (CUDA Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 23987 23987 ] + ==== naive scan, non-power-of-two ==== + elapsed time: 0.055296ms (CUDA Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 23950 23960 ] + passed + ==== work-efficient scan no chunk, power-of-two ==== + elapsed time: 0.087648ms (CUDA Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 23987 23987 ] + passed + ==== work-efficient scan no chunk, non-power-of-two ==== + elapsed time: 0.041152ms (CUDA Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 23950 23960 ] + passed + ==== work-efficient scan, power-of-two ==== + elapsed time: 0.04512ms (CUDA Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 
23987 23987 ] + passed + ==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.058752ms (CUDA Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 23950 23960 ] + passed + ==== thrust scan, power-of-two ==== + elapsed time: 0.022528ms (CUDA Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 23987 23987 ] + passed + ==== thrust scan, non-power-of-two ==== + elapsed time: 0.021024ms (CUDA Measured) + [ 29 47 76 93 104 107 126 157 160 209 234 265 296 ... 23950 23960 ] + passed + + ***************************** + ** STREAM COMPACTION TESTS ** + ***************************** + [ 1 2 1 3 1 1 3 1 3 3 3 3 3 ... 0 0 ] + [ 1 2 1 3 1 1 3 1 3 3 3 3 3 ... 2 2 ] + ==== cpu compact without scan, power-of-two ==== + elapsed time: 0.002584ms (std::chrono Measured) + [ 1 2 1 3 1 1 3 1 3 3 3 3 3 ... 2 3 ] + passed + ==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.001953ms (std::chrono Measured) + [ 1 2 1 3 1 1 3 1 3 3 3 3 3 ... 2 2 ] + passed + ==== cpu compact with scan ==== + elapsed time: 0.004194ms (std::chrono Measured) + [ 1 2 1 3 1 1 3 1 3 3 3 3 3 ... 2 3 ] + passed + ==== cpu compact with scan, non-power-of-two ==== + elapsed time: 0.003411ms (std::chrono Measured) + [ 1 2 1 3 1 1 3 1 3 3 3 3 3 ... 2 2 ] + passed + ==== work-efficient compact, power-of-two ==== + elapsed time: 0.065216ms (CUDA Measured) + [ 1 2 1 3 1 1 3 1 3 3 3 3 3 ... 2 3 ] + passed + ==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.063552ms (CUDA Measured) + [ 1 2 1 3 1 1 3 1 3 3 3 3 3 ... 
2 2 ] + passed + sh: 1: pause: not found + + Process finished with exit code 0 +``` \ No newline at end of file diff --git a/img/chart1.png b/img/chart1.png new file mode 100644 index 0000000..cb51c9c Binary files /dev/null and b/img/chart1.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..a3de94a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,9 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +//const int SIZE = 1 << 8; // feel free to change the size of array +//const int SIZE = 10000; // feel free to change the size of array +const int SIZE = 1000; const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -29,6 +31,7 @@ int main(int argc, char* argv[]) { genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; + printArray(SIZE, a, true); // initialize b using StreamCompaction::CPU::scan you implement @@ -51,48 +54,102 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ + printArray(SIZE, c, true); - zeroArray(SIZE, c); + zeroArray(NPOT, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + + // For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + +// onesArray(SIZE, a); +// 
zeroArray(SIZE, c); +// printDesc("1s array for finding bugs"); +// StreamCompaction::Naive::scan(NPOT, c, a); +// printArray(NPOT, c); + +// onesArray(NPOT, a); +// zeroArray(NPOT, c); +// printDesc("1s array for finding bugs"); +// StreamCompaction::Naive::scan(NPOT, c, a); +// printArray(NPOT, c); +// +// onesArray(SIZE, a); +// zeroArray(SIZE, c); +// printDesc("1s array for finding bugs"); +// StreamCompaction::Efficient::scan(SIZE, c, a); +// printArray(SIZE, c); + + +// onesArray(3, a); +// a[0] = 4; +// a[1] = 4; +// a[2] = 1; +// zeroArray(3, c); +// printDesc("1s array for finding bugs"); +// StreamCompaction::Efficient::scanplain(3, c, a); +// printArray(3, c); +// +// onesArray(SIZE, a); +// zeroArray(SIZE, c); +// printDesc("1s array for finding bugs"); +// StreamCompaction::Efficient::scan(SIZE, c, a); +// printArray(SIZE, c); +// +// onesArray(SIZE, a); +// zeroArray(SIZE, c); +// printDesc("1s array for finding bugs"); +// StreamCompaction::Efficient::scan(SIZE, c, a); +// printArray(SIZE, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan no chunk, power-of-two"); + StreamCompaction::Efficient::scanplain(SIZE, c, a); + exclusive2inclusive(SIZE, c, a[SIZE-1]); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan no chunk, non-power-of-two"); + StreamCompaction::Efficient::scanplain(NPOT, c, a); + exclusive2inclusive(NPOT, c, a[NPOT-1]); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); + exclusive2inclusive(SIZE, c, a[SIZE-1]); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA 
Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); + exclusive2inclusive(NPOT, c, a[NPOT-1]); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -104,7 +161,16 @@ int main(int argc, char* argv[]) { genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; + //a[NPOT - 1] = 0; +// +// a[0] = 0; +// a[1] = 2; +// a[2] = 1; +// a[3] = 2; +// a[4] = 1; + printArray(SIZE, a, true); + printArray(NPOT, a, true); int count, expectedCount, expectedNPOT; @@ -133,18 +199,25 @@ int main(int argc, char* argv[]) { printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); + zeroArray(SIZE, c); + printDesc("cpu compact with scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithScan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = 
StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); - zeroArray(SIZE, c); + zeroArray(NPOT, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..c2ad78d 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -30,6 +30,13 @@ inline int ilog2ceil(int x) { return x == 1 ? 0 : ilog2(x - 1) + 1; } +inline void exclusive2inclusive(int n, int* scanned, int lastvalue){ + for(int i=0; i #include "cpu.h" - +#include "testing_helpers.hpp" #include "common.h" namespace StreamCompaction { @@ -20,6 +20,25 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + /* + * out[0] = in[0]; // assuming n > 0 + for (int k = 1; k < n; ++k) + out[k] = out[k – 1] + in[k]; + */ + if(n < 1) return; + + // inclusive + odata[0] = idata[0]; + for(int i=1; i0; --i){ +// odata[i] = odata[i-1]; +// } +// odata[0] = 0; + timer().endCpuTimer(); } @@ -30,9 +49,17 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int numNonZero = 0; + for(int i=0; i #include "common.h" #include "efficient.h" +#include + +#define checkCUDAErrorWithLine(msg) checkCUDAErrorFn(msg, __FILE__, __LINE__) +#define blockSize 64 namespace StreamCompaction { namespace Efficient { @@ -12,15 +16,267 @@ namespace 
StreamCompaction { return timer; } + __global__ void kernEfficientScanUp(int N, int d, int* odata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + int power_plus = (int) powf(2.0f, (float) d+1); + if (index % power_plus != 0) return; + + int power_ident = power_plus / 2; + //int power_ident = (int) powf(2.0f, (float) d); + + odata[index + power_plus - 1] += odata[index + power_ident - 1]; + + + } + + __global__ void kernEfficientScanReset(int N, int* odata) { + // should not be needed in the end + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index != 0) return; + odata[N-1] = 0; + } + + + + + __global__ void kernEfficientMoveLastValue(int N, int block_idx, const int* odata, int* auxArray) { + // should not be needed in the end + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index != 0) return; + auxArray[block_idx] = odata[N-1]; + + + } + + __global__ void kernEfficientScanDown(int N, int d, int* odata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + int power_plus = (int) powf(2.0f, (float) d+1); + if (index % power_plus != 0) return; + + int power_ident = power_plus / 2; + //int power_ident = (int) powf(2.0f, (float) d); + + int t = odata[index + power_ident - 1]; + + odata[index + power_ident - 1] = odata[index + power_plus - 1]; + odata[index + power_plus - 1] += t; + + + } + + __global__ void kernAddToAll(int N, int block_idx, const int* auxArr, int* odata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + odata[index] += auxArr[block_idx]; + } + + void _scan(int N, int* dev_odata) { + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + + + for(int d=0;d<=ilog2ceil(N)-1;d++){ + kernEfficientScanUp<<>>(N, d, dev_odata); + + } + + kernEfficientScanReset<<>>(N, dev_odata); + + for(int d=ilog2ceil(N)-1;d>=0;d--){ + kernEfficientScanDown<<>>(N, d, dev_odata); + + + } + + + } + + void _scanBlock(int N, int* dev_odata, 
int block_idx, int* aux_arr) { + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + int offset = N*block_idx; + + for(int d=0;d<=ilog2ceil(N)-1;d++){ + kernEfficientScanUp<<>>(N, d, dev_odata + offset); + + } + + cudaMemcpy(aux_arr + block_idx, dev_odata + offset + N - 1, sizeof(int), cudaMemcpyDeviceToDevice); + //cudaMemset(dev_odata + offset + N -1 , 0, sizeof(int)); + kernEfficientScanReset<<>>(N, dev_odata + offset); + + for(int d=ilog2ceil(N)-1;d>=0;d--){ + kernEfficientScanDown<<>>(N, d, dev_odata + offset); + + + } + + + } + + + int getNextPower(int _N){ + int N = 1; + while(N < _N){ + N *= 2; + } + return N; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + + + void _chunkedScanAutomatic(int N, int *odata){ + + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + // divide array into blocks + const int num_per_block = 1024; + int num_blocks = ceil(N / (double) num_per_block); + int num_scan_blocks = getNextPower(num_blocks); + + int* dev_auxarr; + cudaMalloc((void**)&dev_auxarr, num_scan_blocks * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_auxarr failed!"); + + + for(int block_idx=0; block_idx>>(num_per_block, block_idx, dev_auxarr, odata + offset); + } + + cudaFree(dev_auxarr); + } + void _chunkedScan(int N, int *odata, int *auxarr, int num_per_block, int num_blocks, int num_scan_blocks){ + + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + + + + + + for(int block_idx=0; block_idx>>(num_per_block, block_idx, auxarr, odata + offset); + } + + + + } + + + void scan(int _N, int *odata, const int *idata) { + + + int N = getNextPower(_N); + + + //dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + // divide array into blocks + const int num_per_block = 1024; + int num_blocks = ceil(N / (double) num_per_block); + int num_scan_blocks = getNextPower(num_blocks); 
+ + int* dev_auxarr; + cudaMalloc((void**)&dev_auxarr, num_scan_blocks * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_auxarr failed!"); + + // copy data to gpu buffer + int* dev_odata; + cudaMalloc((void**)&dev_odata, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_odata failed!"); + + cudaMemset(dev_odata, 0, sizeof(int) * N); + cudaMemcpy(dev_odata, idata, sizeof(int) * _N, cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + + _chunkedScan(N, dev_odata, dev_auxarr, num_per_block, num_blocks, num_scan_blocks); + + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, sizeof(int) * _N, cudaMemcpyDeviceToHost); + + cudaDeviceSynchronize(); + cudaFree(dev_odata); + cudaFree(dev_auxarr); + + } + + void scanplain(int _N, int *odata, const int *idata) { + + + int N = getNextPower(_N); + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + + // copy data to gpu buffer + int* dev_odata; + //int* dev_idata; + cudaMalloc((void**)&dev_odata, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_odata failed!"); + + cudaMemset(dev_odata, 0, sizeof(int) * N); + cudaMemcpy(dev_odata, idata, sizeof(int) * _N, cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + + _scan(N, dev_odata); + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, sizeof(int) * _N, cudaMemcpyDeviceToHost); + + cudaDeviceSynchronize(); + cudaFree(dev_odata); + } + + + __global__ void kernBooleanAssigner(int N, int* odata, const int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + odata[index] = idata[index] == 0 ? 0 : 1; + + } + + __global__ void kernScatter(int N, int* odata, const int* idata, const int* boolarr, const int* scanres ) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + if(!boolarr[index]) return; + + odata[scanres[index]] = idata[index]; + + + } + + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. 
@@ -30,11 +286,66 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ - int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; + int compact(int _N, int *odata, const int *idata) { + + int N = getNextPower(_N); + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + // copy data to gpu buffer + int* dev_odata; + int* dev_idata; + int* dev_1bool; + int* dev_2scanres; + int numElements = -1; + + + cudaMalloc((void**)&dev_odata, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_odata failed!"); + cudaMalloc((void**)&dev_idata, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_1bool, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_1bool failed!"); + cudaMalloc((void**)&dev_2scanres, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_2scanres failed!"); + + cudaMemset(dev_odata, 0, sizeof(int) * N); + cudaMemset(dev_idata, 0, sizeof(int) * N); + cudaMemset(dev_1bool, 0, sizeof(int) * N); + cudaMemset(dev_2scanres, 0, sizeof(int) * N); + + cudaMemcpy(dev_idata, idata, sizeof(int) * _N, cudaMemcpyHostToDevice); + //cudaMemcpy(dev_odata, odata, sizeof(int) * N, cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + + kernBooleanAssigner<<>>(N, dev_1bool, dev_idata); + cudaMemcpy(dev_2scanres, dev_1bool, sizeof(int) * _N, cudaMemcpyDeviceToDevice); + + + _chunkedScanAutomatic(_N, dev_2scanres); + + + kernScatter<<>>(N, dev_odata, dev_idata, dev_1bool, dev_2scanres); + cudaMemcpy(&numElements, dev_2scanres + (_N-1), sizeof(int), cudaMemcpyDeviceToHost); + + if(_N % 2 != 0){ + numElements++; + } + + cudaDeviceSynchronize(); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, sizeof(int) * _N, cudaMemcpyDeviceToHost); + + cudaDeviceSynchronize(); + cudaFree(dev_odata); + cudaFree(dev_idata); + cudaFree(dev_1bool); + 
cudaFree(dev_2scanres); + + + return numElements; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..2499d5e 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -7,6 +7,7 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); + void scanplain(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..22f690d 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -2,6 +2,11 @@ #include #include "common.h" #include "naive.h" +#include + +#define checkCUDAErrorWithLine(msg) checkCUDAErrorFn(msg, __FILE__, __LINE__) +#define blockSize 512 + namespace StreamCompaction { namespace Naive { @@ -12,14 +17,81 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + __global__ void kernNaiveScan(int N, int d, int* odata, const int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + +// int power = 1; +// if(d>1){ +// for(int i=0; i= power){ + + odata[index] = idata[index-power] + idata[index]; + + + }else{ + odata[index] = idata[index]; + } + + + } + + int getNextPower(int _N){ + int N = 1; + while(N < _N){ + N *= 2; + } + return N; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + void scan(int _N, int *odata, const int *idata) { + + int N = getNextPower(_N); + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + // copy data to gpu buffer + int* dev_odata; + int* dev_idata; + cudaMalloc((void**)&dev_odata, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_odata failed!"); + cudaMalloc((void**)&dev_idata, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_idata failed!"); + + cudaMemcpy(dev_idata, idata, sizeof(int) * _N, cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata, idata, sizeof(int) * _N, cudaMemcpyHostToDevice); + + + timer().startGpuTimer(); + + for(int d=1;d<=ilog2ceil(N);d++){ + int* tmp = dev_idata; + dev_idata = dev_odata; + dev_odata = tmp; + + kernNaiveScan<<>>(N, d, dev_odata, dev_idata); + + cudaDeviceSynchronize(); + + } + + cudaDeviceSynchronize(); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, sizeof(int) * _N, cudaMemcpyDeviceToHost); + + cudaDeviceSynchronize(); + cudaFree(dev_odata); + cudaFree(dev_idata); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..6e2685f 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -6,6 +6,9 @@ #include "common.h" #include "thrust.h" +#define checkCUDAErrorWithLine(msg) checkCUDAErrorFn(msg, __FILE__, __LINE__) +#define blockSize 128 + namespace StreamCompaction { namespace Thrust { using StreamCompaction::Common::PerformanceTimer; @@ -17,12 +20,41 @@ namespace StreamCompaction { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); + void scan(int N, int *odata, const int *idata) { + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + // copy data to gpu buffer + int* dev_odata; + int* dev_idata; + cudaMalloc((void**)&dev_odata, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_odata failed!"); + cudaMalloc((void**)&dev_idata, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_idata failed!"); + + cudaMemcpy(dev_odata, odata, sizeof(int) * N, cudaMemcpyHostToDevice); + cudaMemcpy(dev_idata, idata, sizeof(int) * N, cudaMemcpyHostToDevice); + + + timer().startGpuTimer(); + + thrust::device_ptr dev_thrust_odata(dev_odata); + thrust::device_ptr dev_thrust_idata(dev_idata); + + // TODO use `thrust::exclusive_scan` + // example: for device_vectors dv_in and dv_out: + // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::inclusive_scan(dev_thrust_idata, dev_thrust_idata+N, dev_thrust_odata); + + //cudaDeviceSynchronize(); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, sizeof(int) * N, cudaMemcpyDeviceToHost); + + cudaDeviceSynchronize(); + cudaFree(dev_odata); + cudaFree(dev_idata); + } } }