diff --git a/extras/ros-jazzy-gz-harmonic-install.sh b/extras/ros-jazzy-gz-harmonic-install.sh index 24d4309c..da522c1c 100644 --- a/extras/ros-jazzy-gz-harmonic-install.sh +++ b/extras/ros-jazzy-gz-harmonic-install.sh @@ -88,6 +88,7 @@ sudo apt update && apt install -y \ ros-$DIST-ros2-controllers \ ros-$DIST-teleop-tools \ ros-$DIST-urdfdom-py \ + ros-$DIST-marine-acoustic-msgs \ ros-dev-tools echo diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt index 40619b2d..3f0b87d2 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt @@ -1,24 +1,28 @@ cmake_minimum_required(VERSION 3.11.0 FATAL_ERROR) # Suppress developer warnings for the entire workspace. + set(CMAKE_SUPPRESS_DEVELOPER_WARNINGS TRUE CACHE BOOL "Suppress developer warnings" FORCE) # Set CMP0144 to NEW to ensure find_package # uses upper-case _ROOT variables. + cmake_policy(SET CMP0144 NEW) project(multibeam_sonar) +set(CUDA_ARCHITECTURE "60" CACHE STRING "Target CUDA SM version") + find_package(ament_cmake REQUIRED) find_package(CUDAToolkit QUIET) if(CUDAToolkit_FOUND) + enable_language(CUDA) find_package(CUDA REQUIRED) message(STATUS "CUDA found, enabling CUDA support.") include_directories(${CUDA_INCLUDE_DIRS}) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_60") - + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_${CUDA_ARCHITECTURE}") find_package(gz-cmake3 REQUIRED) find_package(gz-sim8 REQUIRED) find_package(gz-sensors8 REQUIRED) @@ -28,17 +32,10 @@ if(CUDAToolkit_FOUND) find_package(sensor_msgs REQUIRED) find_package(geometry_msgs REQUIRED) find_package(rosidl_default_generators REQUIRED) - find_package(PCL REQUIRED) - find_package(pcl_conversions REQUIRED) find_package(OpenCV REQUIRED) find_package(marine_acoustic_msgs REQUIRED) find_package(cv_bridge REQUIRED) - include_directories(${PCL_INCLUDE_DIRS}) - link_directories(${PCL_LIBRARY_DIRS}) - add_definitions(${PCL_DEFINITIONS}) - - # Set version variables set(GZ_MSGS_VER ${gz-msgs10_VERSION_MAJOR}) set(GZ_RENDERING_VER ${gz-rendering8_VERSION_MAJOR}) set(GZ_SENSORS_VER ${gz-sensors8_VERSION_MAJOR}) @@ -64,7 +61,9 @@ if(CUDAToolkit_FOUND) ) set_target_properties(${PROJECT_NAME} - PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + ) target_include_directories(${PROJECT_NAME} PUBLIC @@ -76,11 +75,17 @@ if(CUDAToolkit_FOUND) ${gz-sim${GZ_SIM_VER}_INCLUDE_DIRS} ) + find_library(CUBLAS_LIB cublas + HINTS ${CUDAToolkit_LIBRARY_DIR} /usr/local/cuda/lib64 /usr/lib/x86_64-linux-gnu /usr/lib) + if(NOT CUBLAS_LIB) + message(FATAL_ERROR "cuBLAS not found") + endif() + target_link_libraries(${PROJECT_NAME} - ${PCL_LIBRARIES} - ${CUDA_LIBRARIES} - ${CUDA_CUFFT_LIBRARIES} - ${CUBLAS_LIB}) + ${CUDA_LIBRARIES} + ${CUDA_CUFFT_LIBRARIES} + ${CUBLAS_LIB} + ) install(TARGETS ${PROJECT_NAME} DESTINATION lib/${PROJECT_NAME} @@ -95,22 +100,18 @@ if(CUDAToolkit_FOUND) rclcpp sensor_msgs rosidl_default_generators - PCL - pcl_conversions OpenCV marine_acoustic_msgs geometry_msgs cv_bridge ) - - # Environment hooks ament_environment_hooks( "${CMAKE_CURRENT_SOURCE_DIR}/hooks/${PROJECT_NAME}.dsv.in" ) else() - message(STATUS "CUDA Toolkit not found: Skipping CUDA-specific targets") + message(STATUS "CUDA Toolkit not found or disabled: Skipping CUDA-specific targets") endif() ament_package() diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.cc b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.cc index 3ef17146..310e9568 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.cc +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.cc @@ -48,11 +48,7 @@ #include "MultibeamSonarSensor.hh" #include "sonar_calculation_cuda.cuh" -#include -#include -#include -#include -#include +#include #include #include #include @@ -372,6 +368,9 @@ bool MultibeamSonarSensor::Implementation::InitializeBeamArrangement(MultibeamSo this->debugFlag = sensorElement->Get("debugFlag", false).first; gzmsg << "Debug: " << this->debugFlag << std::endl; + this->blazingFlag = sensorElement->Get("blazingSonarImage", false).first; + gzmsg << "BlazingSonarImage: " << this->blazingFlag << std::endl; + this->pointCloudTopicName = sensorElement->Get("pointCloudTopicName", "point_cloud").first; gzmsg << "pointCloudTopicName: " << this->pointCloudTopicName << std::endl; @@ -648,16 +647,6 @@ bool MultibeamSonarSensor::Implementation::InitializeBeamArrangement(MultibeamSo // -- Pre calculations for sonar -- // - // Random number generator - gzmsg << "Initializing random number generator..." << std::endl; - this->randImage = cv::Mat(this->pointMsg.height(), this->pointMsg.width(), CV_32FC2); - uint64 randN = static_cast(std::rand()); - gzmsg << "Random seed: " << randN << std::endl; - cv::theRNG().state = randN; - cv::RNG rng = cv::theRNG(); - rng.fill(this->randImage, cv::RNG::NORMAL, 0.0f, 1.0f); - gzmsg << "Random image generated with normal distribution." << std::endl; - // Hamming window gzmsg << "Computing Hamming window for " << this->nFreq << " frequencies." << std::endl; this->window = new float[this->nFreq]; @@ -1069,7 +1058,6 @@ void MultibeamSonarSensor::Implementation::ComputeSonarImage() CArray2D P_Beams = NpsGazeboSonar::sonar_calculation_wrapper( this->pointCloudImage, // cv::Mat& depth_image (the point cloud image) normal_image, // cv::Mat& normal_image - this->randImage, // cv::Mat& rand_image hPixelSize, // hPixelSize vPixelSize, // vPixelSize hFOV, // hFOV @@ -1093,7 +1081,9 @@ void MultibeamSonarSensor::Implementation::ComputeSonarImage() this->window, // _window this->beamCorrector, // _beamCorrector this->beamCorrectorSum, // _beamCorrectorSum - this->debugFlag); + this->debugFlag, // debugFlag + this->blazingFlag // _blazingFlag + ); // For calc time measure auto stop = std::chrono::high_resolution_clock::now(); diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.hh b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.hh index 7ccfec3d..b7790e3a 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.hh +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.hh @@ -160,6 +160,9 @@ private: int ray_nElevationRays; float * rangeVector; + // Sonar image parameters + bool blazingFlag; + // Debug flags and reflectivity bool debugFlag; bool constMu; diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu index 493ff1a5..f01da386 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu @@ -5,7 +5,7 @@ * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -35,10 +35,43 @@ #include #include +#include +#include +#include #include +#include + +// FOR DEBUG -- DEV VERSION +#include + +std::ofstream debugLog("debug_timings.txt"); #define BLOCK_SIZE 32 +// Existing SAFE_CALL (for cudaError_t) +#define SAFE_CALL(call, msg) _safe_cuda_call((call), msg, __FILE__, __LINE__) + +// New SAFE_CUBLAS_CALL (for cublasStatus_t) +#define SAFE_CUBLAS_CALL(call, msg) \ + { \ + cublasStatus_t err = (call); \ + if (err != CUBLAS_STATUS_SUCCESS) \ + { \ + fprintf(stderr, "CUBLAS Error: %s in %s at line %d: %d\n", msg, __FILE__, __LINE__, err); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define SAFE_CUFFT_CALL(call, msg) \ + { \ + cufftResult err = (call); \ + if (err != CUFFT_SUCCESS) \ + { \ + fprintf(stderr, "CUFFT Error: %s in %s at line %d: %d\n", msg, __FILE__, __LINE__, err); \ + exit(EXIT_FAILURE); \ + } \ + } + static inline void _safe_cuda_call( cudaError err, const char * msg, const char * file_name, const int line_number) { @@ -52,7 +85,29 @@ static inline void _safe_cuda_call( } } -#define SAFE_CALL(call, msg) _safe_cuda_call((call), (msg), __FILE__, __LINE__) +// Persistent GPU memory pointers (device) +static float * d_depth_image = nullptr; +static float * d_normal_image = nullptr; +static float * d_reflectivity_image = nullptr; +static float * d_ray_elevationAngles = nullptr; +static thrust::complex * d_P_Beams = nullptr; +static thrust::complex * P_Beams = nullptr; +static bool memory_initialized = false; +float * ray_elevationAngles = nullptr; + +float *P_Beams_Cor_real_h = nullptr, *P_Beams_Cor_imag_h = nullptr; +float *d_P_Beams_Cor_real = nullptr, *d_P_Beams_Cor_imag = nullptr; +float *d_P_Beams_Cor_F_real = nullptr, *d_P_Beams_Cor_F_imag = nullptr; +float *beamCorrector_lin_h = nullptr, *d_beamCorrector_lin = nullptr; + +int P_Beams_Cor_N; +int P_Beams_Cor_Bytes; +int beamCorrector_lin_N; +int beamCorrector_lin_Bytes; + +// FFT +cufftComplex * deviceInputData; +cufftComplex * deviceOutputData; /////////////////////////////////////////////////////////////////////////// // Incident Angle Calculation Function @@ -89,68 +144,53 @@ __device__ __host__ float unnormalized_sinc(float t) } } -/////////////////////////////////////////////////////////////////////////// -template -__global__ void column_sums_reduce( - const T * __restrict__ in, T * __restrict__ out, size_t width, size_t height) +__global__ void reduce_beams_kernel( + const thrust::complex * __restrict__ d_P_Beams, float * d_P_Beams_Cor_real, + float * d_P_Beams_Cor_imag, int nBeams, int nFreq, int nRaysSkipped) { - __shared__ T sdata[BLOCK_SIZE][BLOCK_SIZE + 1]; - size_t idx = threadIdx.x + blockDim.x * blockIdx.x; - size_t width_stride = gridDim.x * blockDim.x; - size_t full_width = (width & (~((unsigned long long)(BLOCK_SIZE - 1)))) + - ((width & (BLOCK_SIZE - 1)) ? BLOCK_SIZE : 0); // round up to next block - for (size_t w = idx; w < full_width; w += width_stride) - { // grid-stride loop across matrix width - sdata[threadIdx.y][threadIdx.x] = 0; - size_t in_ptr = w + threadIdx.y * width; - for (size_t h = threadIdx.y; h < height; h += BLOCK_SIZE) - { // block-stride loop across matrix height - sdata[threadIdx.y][threadIdx.x] += (w < width) ? in[in_ptr] : 0; - in_ptr += width * BLOCK_SIZE; - } - __syncthreads(); - T my_val = sdata[threadIdx.x][threadIdx.y]; - for (int i = warpSize >> 1; i > 0; i >>= 1) // warp-wise parallel sum reduction - { - my_val += __shfl_xor_sync(0xFFFFFFFFU, my_val, i); - } - __syncthreads(); - if (threadIdx.x == 0) - { - sdata[0][threadIdx.y] = my_val; - } - __syncthreads(); - if ((threadIdx.y == 0) && ((w) < width)) - { - out[w] = sdata[0][threadIdx.x]; - } + const int beam = blockIdx.y; + const int f = blockIdx.x; + + if (beam >= nBeams || f >= nFreq) + { + return; } -} -__global__ void gpu_matrix_mult(float * a, float * b, float * c, int m, int n, int k) -{ - int row = blockIdx.y * blockDim.y + threadIdx.y; - int col = blockIdx.x * blockDim.x + threadIdx.x; - float sum = 0; - if (col < k && row < m) + __shared__ float shared_real[BLOCK_SIZE]; + __shared__ float shared_imag[BLOCK_SIZE]; + + float sum_real = 0.0f; + float sum_imag = 0.0f; + + for (int ray = threadIdx.x; ray < nRaysSkipped; ray += blockDim.x) { - for (int i = 0; i < n; i++) - { - sum += a[row * n + i] * b[i * k + col]; - } - c[row * k + col] = sum; + int idx = beam * nFreq * nRaysSkipped + ray * nFreq + f; + + thrust::complex val = d_P_Beams[idx]; + sum_real += val.real(); + sum_imag += val.imag(); } -} -__global__ void gpu_diag_matrix_mult(float * Val, int * RowPtr, float * diagVals, int total_rows) -{ - const int row = threadIdx.x + blockIdx.x * blockDim.x; - if (row < total_rows) + shared_real[threadIdx.x] = sum_real; + shared_imag[threadIdx.x] = sum_imag; + + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) { - for (int i = RowPtr[row]; i < RowPtr[row + 1]; i++) + if (threadIdx.x < s) { - Val[i] = diagVals[row] * Val[i]; + shared_real[threadIdx.x] += shared_real[threadIdx.x + s]; + shared_imag[threadIdx.x] += shared_imag[threadIdx.x + s]; } + __syncthreads(); + } + + if (threadIdx.x == 0) + { + int output_idx = f * nBeams + beam; + d_P_Beams_Cor_real[output_idx] = shared_real[0]; + d_P_Beams_Cor_imag[output_idx] = shared_imag[0]; } } @@ -158,7 +198,7 @@ __global__ void gpu_diag_matrix_mult(float * Val, int * RowPtr, float * diagVals // Sonar Claculation Function __global__ void sonar_calculation( thrust::complex * P_Beams, float * depth_image, float * normal_image, int width, - int height, int depth_image_step, int normal_image_step, float * rand_image, int rand_image_step, + int height, int depth_image_step, int normal_image_step, unsigned long long seed, float * reflectivity_image, int reflectivity_image_step, float hPixelSize, float vPixelSize, float hFOV, float vFOV, float beam_azimuthAngleWidth, float beam_elevationAngleWidth, float ray_azimuthAngleWidth, float * ray_elevationAngles, float ray_elevationAngleWidth, @@ -176,7 +216,6 @@ __global__ void sonar_calculation( // Location of the image pixel const int depth_index = ray * depth_image_step / sizeof(float) + beam; const int normal_index = ray * normal_image_step / sizeof(float) + (3 * beam); - const int rand_index = ray * rand_image_step / sizeof(float) + (2 * beam); const int reflectivity_index = ray * reflectivity_image_step / sizeof(float) + beam; // Input parameters for ray processing @@ -189,7 +228,7 @@ __global__ void sonar_calculation( float azimuthBeamPattern = 1.0; float elevationBeamPattern = 1.0; // float elevationBeamPattern = abs(unnormalized_sinc(M_PI * 0.884 - // / (beam_elevationAngleWidth) * + // / (beam_elevationAngleWidth) * // sin(ray_elevationAngles[ray]))); // printf("angles %f", ray_elevationAngles[ray]); @@ -199,9 +238,13 @@ __global__ void sonar_calculation( acos(normal[2]); // compute_incidence(ray_azimuthAngle, ray_elevationAngle, normal); // ----- Point scattering model ------ // - // Gaussian noise generated using opencv RNG - float xi_z = rand_image[rand_index]; - float xi_y = rand_image[rand_index + 1]; + curandStatePhilox4_32_10_t state; + curand_init(seed, beam * height + ray, 0, &state); // seed, unique id, offset, state + + float4 xi = curand_normal4(&state); // standard 4 normal random values + + float xi_z = xi.x; + float xi_y = xi.y; // Calculate amplitude thrust::complex randomAmps = thrust::complex(xi_z / sqrt(2.0), xi_y / sqrt(2.0)); @@ -229,19 +272,25 @@ __global__ void sonar_calculation( float freq; if (nFreq % 2 == 0) { - freq = delta_f * (-nFreq / 2.0 + f * 1.0f + 1.0); + freq = __fdividef(delta_f * (-nFreq + 2.0f * (f + 1.0f)), 2.0f); } else { - freq = delta_f * (-(nFreq - 1) / 2.0 + f * 1.0f + 1.0); + freq = __fdividef(delta_f * (-(nFreq - 1) + 2.0f * (f + 1.0f)), 2.0f); } - float kw = 2.0 * M_PI * freq / soundSpeed; // wave vector - // Transmit spectrum, frequency domain - thrust::complex kernel = - exp(thrust::complex(0.0f, 2.0f * distance * kw)) * amplitude; - P_Beams[beam * nFreq * (int)(nRays / raySkips) + (int)(ray / raySkips) * nFreq + f] = - thrust::complex(kernel.real(), kernel.imag()); + float kw = __fdividef(2.0f * M_PI * freq, soundSpeed); // wave vector + + float phase = 2.0f * distance * kw; + float s, c; + __sincosf(phase, &s, &c); // intrinsic sine and cosine + + thrust::complex kernel(c, s); + kernel *= amplitude; + + int ray_index = ray / raySkips; // Map ray to reduced ray index + + P_Beams[beam * nFreq * (nRays / raySkips) + ray_index * nFreq + f] = kernel; } } } @@ -263,17 +312,40 @@ void check_cuda_init_wrapper(void) } } +void free_cuda_memory() +{ + SAFE_CALL(cudaFree(d_depth_image), "cudaFree failed for d_depth_image"); + SAFE_CALL(cudaFree(d_normal_image), "cudaFree failed for d_normal_image"); + SAFE_CALL(cudaFree(d_reflectivity_image), "cudaFree failed for d_reflectivity_image"); + SAFE_CALL(cudaFree(d_ray_elevationAngles), "cudaFree failed for d_ray_elevationAngles"); + SAFE_CALL(cudaFree(d_P_Beams), "cudaFree failed for d_P_Beams"); + SAFE_CALL(cudaFree(deviceOutputData), "cudaFree failed for OutputData"); + SAFE_CALL(cudaFree(deviceInputData), "cudaFree failed for InputData"); + SAFE_CALL(cudaFreeHost(P_Beams), "cudaFreeHost failed for P_Beams"); + SAFE_CALL(cudaFree(d_P_Beams_Cor_imag), "cudaFree failed for d_P_Beams_Cor_imag"); + SAFE_CALL(cudaFree(d_P_Beams_Cor_real), "cudaFree failed for d_P_Beams_Cor_real"); + SAFE_CALL(cudaFree(d_P_Beams_Cor_F_imag), "cudaFree failed for d_P_Beams_Cor_F_imag"); + SAFE_CALL(cudaFree(d_P_Beams_Cor_F_real), "cudaFree failed for d_P_Beams_Cor_F_real"); + SAFE_CALL(cudaFree(d_beamCorrector_lin), "cudaFree failed for d_beamCorrector_lin"); + SAFE_CALL(cudaFreeHost(P_Beams_Cor_real_h), "cudaFreeHost failed for P_Beams_Cor_real_h"); + SAFE_CALL(cudaFreeHost(P_Beams_Cor_imag_h), "cudaFreeHost failed for P_Beams_Cor_imag_h"); + SAFE_CALL(cudaFreeHost(beamCorrector_lin_h), "cudaFreeHost failed for beamCorrector_lin_h"); + memory_initialized = false; +} + // Sonar Claculation Function Wrapper CArray2D sonar_calculation_wrapper( - const cv::Mat & depth_image, const cv::Mat & normal_image, const cv::Mat & rand_image, - double _hPixelSize, double _vPixelSize, double _hFOV, double _vFOV, - double _beam_azimuthAngleWidth, double _beam_elevationAngleWidth, double _ray_azimuthAngleWidth, - float * _ray_elevationAngles, double _ray_elevationAngleWidth, double _soundSpeed, - double _maxDistance, double _sourceLevel, int _nBeams, int _nRays, int _raySkips, - double _sonarFreq, double _bandwidth, int _nFreq, const cv::Mat & reflectivity_image, - double _attenuation, float * window, float ** beamCorrector, float beamCorrectorSum, - bool debugFlag) + const cv::Mat & depth_image, const cv::Mat & normal_image, double _hPixelSize, double _vPixelSize, + double _hFOV, double _vFOV, double _beam_azimuthAngleWidth, double _beam_elevationAngleWidth, + double _ray_azimuthAngleWidth, float * _ray_elevationAngles, double _ray_elevationAngleWidth, + double _soundSpeed, double _maxDistance, double _sourceLevel, int _nBeams, int _nRays, + int _raySkips, double _sonarFreq, double _bandwidth, int _nFreq, + const cv::Mat & reflectivity_image, double _attenuation, float * window, float ** beamCorrector, + float beamCorrectorSum, bool debugFlag, bool blazingFlag) { + // Start the timer for the entire function + auto total_start_time = std::chrono::high_resolution_clock::now(); + auto start = std::chrono::high_resolution_clock::now(); auto stop = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(stop - start); @@ -318,36 +390,85 @@ CArray2D sonar_calculation_wrapper( // Calculate total number of bytes of input and output image const int depth_image_Bytes = depth_image.step * depth_image.rows; const int normal_image_Bytes = normal_image.step * normal_image.rows; - const int rand_image_Bytes = rand_image.step * rand_image.rows; const int reflectivity_image_Bytes = reflectivity_image.step * reflectivity_image.rows; const int ray_elevationAngles_Bytes = sizeof(float) * nRays; + const int P_Beams_N = nBeams * (int)(nRays / raySkips) * (nFreq + 1); + + P_Beams_Cor_N = nBeams * nFreq; + P_Beams_Cor_Bytes = sizeof(float) * P_Beams_Cor_N; + beamCorrector_lin_N = nBeams * nBeams; + beamCorrector_lin_Bytes = sizeof(float) * beamCorrector_lin_N; + + // FFT Parameters + const int DATASIZE = nFreq; + const int BATCH = nBeams; + + if (!memory_initialized) + { + SAFE_CALL( + cudaMalloc((void **)&d_depth_image, depth_image.step * depth_image.rows), "depth malloc"); + SAFE_CALL( + cudaMallocHost((void **)&ray_elevationAngles, sizeof(float) * nRays), + "MallocHost ray_elevationAngles"); + SAFE_CALL( + cudaMalloc((void **)&d_normal_image, normal_image.step * normal_image.rows), "normal malloc"); + SAFE_CALL( + cudaMalloc((void **)&d_reflectivity_image, reflectivity_image.step * reflectivity_image.rows), + "reflectivity malloc"); + SAFE_CALL( + cudaMalloc((void **)&d_ray_elevationAngles, sizeof(float) * nRays), "ray elevation malloc"); + SAFE_CALL( + cudaMallocHost((void **)&P_Beams, sizeof(thrust::complex) * P_Beams_N), + "P_Beams malloc host"); + SAFE_CALL( + cudaMalloc((void **)&d_P_Beams, sizeof(thrust::complex) * P_Beams_N), + "P_Beams malloc device"); + SAFE_CALL( + cudaMallocHost((void **)&P_Beams_Cor_real_h, P_Beams_Cor_Bytes), + "CUDA MallocHost Failed for P_Beams_Cor_real_h"); + SAFE_CALL( + cudaMallocHost((void **)&P_Beams_Cor_imag_h, P_Beams_Cor_Bytes), + "CUDA MallocHost Failed for P_Beams_Cor_imag_h"); + SAFE_CALL( + cudaMallocHost((void **)&beamCorrector_lin_h, beamCorrector_lin_Bytes), + "CUDA MallocHost Failed for beamCorrector_lin_h"); + SAFE_CALL( + cudaMalloc((void **)&d_P_Beams_Cor_real, P_Beams_Cor_Bytes), + "CUDA Malloc Failed for d_P_Beams_Cor_real"); + SAFE_CALL( + cudaMalloc((void **)&d_P_Beams_Cor_imag, P_Beams_Cor_Bytes), + "CUDA Malloc Failed for d_P_Beams_Cor_imag"); + SAFE_CALL( + cudaMalloc((void **)&d_P_Beams_Cor_F_real, P_Beams_Cor_Bytes), + "CUDA Malloc Failed for d_P_Beams_Cor_F_real"); + SAFE_CALL( + cudaMalloc((void **)&d_P_Beams_Cor_F_imag, P_Beams_Cor_Bytes), + "CUDA Malloc Failed for d_P_Beams_Cor_F_imag"); + SAFE_CALL( + cudaMalloc((void **)&d_beamCorrector_lin, beamCorrector_lin_Bytes), + "CUDA Malloc Failed for d_beamCorrector_lin"); + SAFE_CALL( + cudaMalloc((void **)&deviceInputData, DATASIZE * BATCH * sizeof(cufftComplex)), + "FFT CUDA Malloc Failed"); + SAFE_CALL( + cudaMalloc((void **)&deviceOutputData, DATASIZE * BATCH * sizeof(cufftComplex)), + "FFT CUDA Malloc Failed"); + + memory_initialized = true; + } - // Allocate device memory - float *d_depth_image, *d_normal_image, *d_rand_image, *d_reflectivity_image, *ray_elevationAngles, - *d_ray_elevationAngles; - SAFE_CALL(cudaMalloc((void **)&d_depth_image, depth_image_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_normal_image, normal_image_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_rand_image, rand_image_Bytes), "CUDA Malloc Failed"); - SAFE_CALL( - cudaMalloc((void **)&d_reflectivity_image, reflectivity_image_Bytes), "CUDA Malloc Failed"); - cudaMallocHost((void **)&ray_elevationAngles, ray_elevationAngles_Bytes); - SAFE_CALL( - cudaMalloc((void **)&d_ray_elevationAngles, ray_elevationAngles_Bytes), "CUDA Malloc Failed"); for (size_t ray = 0; ray < nRays; ray++) { ray_elevationAngles[ray] = _ray_elevationAngles[ray]; } - // Copy data from OpenCV input image to device memory + // Perform your cudaMemcpy calls SAFE_CALL( cudaMemcpy(d_depth_image, depth_image.ptr(), depth_image_Bytes, cudaMemcpyHostToDevice), "CUDA Memcpy Failed"); SAFE_CALL( cudaMemcpy(d_normal_image, normal_image.ptr(), normal_image_Bytes, cudaMemcpyHostToDevice), "CUDA Memcpy Failed"); - SAFE_CALL( - cudaMemcpy(d_rand_image, rand_image.ptr(), rand_image_Bytes, cudaMemcpyHostToDevice), - "CUDA Memcpy Failed"); SAFE_CALL( cudaMemcpy( d_reflectivity_image, reflectivity_image.ptr(), reflectivity_image_Bytes, @@ -366,244 +487,157 @@ CArray2D sonar_calculation_wrapper( const dim3 grid( (depth_image.cols + block.x - 1) / block.x, (depth_image.rows + block.y - 1) / block.y); - // Beam data array - thrust::complex * P_Beams; - thrust::complex * d_P_Beams; - const int P_Beams_N = nBeams * (int)(nRays / raySkips) * (nFreq + 1); - const int P_Beams_Bytes = sizeof(thrust::complex) * P_Beams_N; - SAFE_CALL(cudaMallocHost((void **)&P_Beams, P_Beams_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_P_Beams, P_Beams_Bytes), "CUDA Malloc Failed"); + // Random seed for blazing sonar image + unsigned long long seed; + if (blazingFlag) + { + seed = static_cast(time(NULL)); + } + else + { + seed = 1234; + } // Launch the beamor conversion kernel sonar_calculation<<>>( d_P_Beams, d_depth_image, d_normal_image, normal_image.cols, normal_image.rows, - depth_image.step, normal_image.step, d_rand_image, rand_image.step, d_reflectivity_image, - reflectivity_image.step, hPixelSize, vPixelSize, hFOV, vFOV, beam_azimuthAngleWidth, - beam_elevationAngleWidth, ray_azimuthAngleWidth, d_ray_elevationAngles, ray_elevationAngleWidth, - soundSpeed, sourceTerm, nBeams, nRays, raySkips, sonarFreq, delta_f, nFreq, bandwidth, - max_distance, attenuation, area_scaler); + depth_image.step, normal_image.step, seed, d_reflectivity_image, reflectivity_image.step, + hPixelSize, vPixelSize, hFOV, vFOV, beam_azimuthAngleWidth, beam_elevationAngleWidth, + ray_azimuthAngleWidth, d_ray_elevationAngles, ray_elevationAngleWidth, soundSpeed, sourceTerm, + nBeams, nRays, raySkips, sonarFreq, delta_f, nFreq, bandwidth, max_distance, attenuation, + area_scaler); // Synchronize to check for any kernel launch errors SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); - // Copy back data from destination device meory to OpenCV output image - SAFE_CALL( - cudaMemcpy(P_Beams, d_P_Beams, P_Beams_Bytes, cudaMemcpyDeviceToHost), "CUDA Memcpy Failed"); - - // Free GPU memory - cudaFree(d_depth_image); - cudaFree(d_normal_image); - cudaFree(d_rand_image); - cudaFree(d_reflectivity_image); - cudaFree(d_P_Beams); - cudaFree(d_ray_elevationAngles); - cudaFreeHost(ray_elevationAngles); - - // For calc time measure if (debugFlag) { stop = std::chrono::high_resolution_clock::now(); duration = std::chrono::duration_cast(stop - start); - printf( - "GPU Sonar Computation Time %lld/100 [s]\n", - static_cast(duration.count() / 10000)); + + long long dcount = duration.count(); + float ms = static_cast(dcount) / 1000.0f; + + printf("GPU Sonar Computation Time %lld/100 [s]\n", dcount / 10000); + printf("GPU Sonar Computation Time: %.3f ms\n", ms); + + debugLog << "GPU Sonar Computation Time " << dcount / 10000 << "/100 [s]\n"; + debugLog << "GPU Sonar Computation Time: " << ms << " ms\n"; + start = std::chrono::high_resolution_clock::now(); } // ########################################################// // ######### Summation, Culling and windowing #########// // ########################################################// - // Preallocate an array for return - CArray2D P_Beams_F(CArray(nFreq), nBeams); - // GPU grids and rows - unsigned int grid_rows, grid_cols; - dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); - - // GPU Ray summation using column sum - float *P_Ray_real, *P_Ray_imag; - float *d_P_Ray_real, *d_P_Ray_imag; - const int P_Ray_N = (int)(nRays / raySkips) * (nFreq); - const int P_Ray_Bytes = sizeof(float) * P_Ray_N; - float *P_Ray_F_real, *P_Ray_F_imag; - float *d_P_Ray_F_real, *d_P_Ray_F_imag; - const int P_Ray_F_N = (nFreq) * 1; - const int P_Ray_F_Bytes = sizeof(float) * P_Ray_F_N; - cudaMallocHost((void **)&P_Ray_real, P_Ray_Bytes); - cudaMallocHost((void **)&P_Ray_imag, P_Ray_Bytes); - cudaMallocHost((void **)&P_Ray_F_real, P_Ray_F_Bytes); - cudaMallocHost((void **)&P_Ray_F_imag, P_Ray_F_Bytes); - SAFE_CALL(cudaMalloc((void **)&d_P_Ray_real, P_Ray_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_P_Ray_imag, P_Ray_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_P_Ray_F_real, P_Ray_F_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_P_Ray_F_imag, P_Ray_F_Bytes), "CUDA Malloc Failed"); - - dim3 dimGrid_Ray((nFreq + BLOCK_SIZE - 1) / BLOCK_SIZE); - for (size_t beam = 0; beam < nBeams; beam++) - { - for (size_t ray = 0; ray < (int)(nRays / raySkips); ray++) - { - for (size_t f = 0; f < nFreq; f++) - { - P_Ray_real[ray * nFreq + f] = - P_Beams[beam * nFreq * (int)(nRays / raySkips) + ray * nFreq + f].real(); - P_Ray_imag[ray * nFreq + f] = - P_Beams[beam * nFreq * (int)(nRays / raySkips) + ray * nFreq + f].imag(); - } - } - - SAFE_CALL( - cudaMemcpy(d_P_Ray_real, P_Ray_real, P_Ray_Bytes, cudaMemcpyHostToDevice), - "CUDA Memcpy Failed"); - SAFE_CALL( - cudaMemcpy(d_P_Ray_imag, P_Ray_imag, P_Ray_Bytes, cudaMemcpyHostToDevice), - "CUDA Memcpy Failed"); - - column_sums_reduce<<>>( - d_P_Ray_real, d_P_Ray_F_real, nFreq, (int)(nRays / raySkips)); - column_sums_reduce<<>>( - d_P_Ray_imag, d_P_Ray_F_imag, nFreq, (int)(nRays / raySkips)); - - SAFE_CALL( - cudaMemcpy(P_Ray_F_real, d_P_Ray_F_real, P_Ray_F_Bytes, cudaMemcpyDeviceToHost), - "CUDA Memcpy Failed"); - SAFE_CALL( - cudaMemcpy(P_Ray_F_imag, d_P_Ray_F_imag, P_Ray_F_Bytes, cudaMemcpyDeviceToHost), - "CUDA Memcpy Failed"); - SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); + dim3 blockDim(BLOCK_SIZE); + dim3 gridDim(nFreq, nBeams); // one thread block per beam+freq pair - for (size_t f = 0; f < nFreq; f++) - { - P_Beams_F[beam][f] = Complex(P_Ray_F_real[f], P_Ray_F_imag[f]); - } - } - - // free memory - cudaFreeHost(P_Beams); - cudaFreeHost(P_Ray_real); - cudaFreeHost(P_Ray_imag); - cudaFreeHost(P_Ray_F_real); - cudaFreeHost(P_Ray_F_imag); - cudaFree(d_P_Ray_real); - cudaFree(d_P_Ray_imag); - cudaFree(d_P_Ray_F_real); - cudaFree(d_P_Ray_F_imag); + reduce_beams_kernel<<>>( + d_P_Beams, d_P_Beams_Cor_real, d_P_Beams_Cor_imag, nBeams, nFreq, nRays / raySkips); + SAFE_CALL(cudaGetLastError(), "reduce_beams_kernel launch failed"); + SAFE_CALL(cudaDeviceSynchronize(), "reduce_beams_kernel execution failed"); if (debugFlag) { stop = std::chrono::high_resolution_clock::now(); duration = std::chrono::duration_cast(stop - start); - printf( - "Sonar Ray Summation %lld/100 [s]\n", static_cast(duration.count() / 10000)); + + long long dcount = duration.count(); + + printf("Sonar Ray Summation %lld/100 [s]\n", dcount / 10000); + printf("Sonar Ray Summation Time: %.3f ms\n", static_cast(dcount) / 1000.0f); + + debugLog << "Sonar Ray Summation " << dcount / 10000 << "/100 [s]\n"; + debugLog << "Sonar Ray Summation Time: " << static_cast(dcount) / 1000.0f << " ms\n"; + start = std::chrono::high_resolution_clock::now(); } - // -------------- Beam culling correction -----------------// + // --- Beam culling correction --- // beamCorrector and beamCorrectorSum is precalculated at parent cpp - float *P_Beams_Cor_real, *P_Beams_Cor_imag; - // float *P_Beams_Cor_F_real, *P_Beams_Cor_F_imag; - float *P_Beams_Cor_real_tmp, *P_Beams_Cor_imag_tmp; - float *d_P_Beams_Cor_real, *d_P_Beams_Cor_imag; - float *d_P_Beams_Cor_F_real, *d_P_Beams_Cor_F_imag; - const int P_Beams_Cor_N = nBeams * nFreq; - const int P_Beams_Cor_Bytes = sizeof(float) * P_Beams_Cor_N; - cudaMallocHost((void **)&P_Beams_Cor_real, P_Beams_Cor_Bytes); - cudaMallocHost((void **)&P_Beams_Cor_imag, P_Beams_Cor_Bytes); - cudaMallocHost((void **)&P_Beams_Cor_real_tmp, P_Beams_Cor_Bytes); - cudaMallocHost((void **)&P_Beams_Cor_imag_tmp, P_Beams_Cor_Bytes); - // cudaMallocHost((void **)&P_Beams_Cor_F_real, P_Beams_Cor_Bytes); - // cudaMallocHost((void **)&P_Beams_Cor_F_imag, P_Beams_Cor_Bytes); - SAFE_CALL(cudaMalloc((void **)&d_P_Beams_Cor_real, P_Beams_Cor_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_P_Beams_Cor_imag, P_Beams_Cor_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_P_Beams_Cor_F_real, P_Beams_Cor_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_P_Beams_Cor_F_imag, P_Beams_Cor_Bytes), "CUDA Malloc Failed"); - - float *beamCorrector_lin, *d_beamCorrector_lin; - const int beamCorrector_lin_N = nBeams * nBeams; - const int beamCorrector_lin_Bytes = sizeof(float) * beamCorrector_lin_N; - cudaMallocHost((void **)&beamCorrector_lin, beamCorrector_lin_Bytes); - SAFE_CALL( - cudaMalloc((void **)&d_beamCorrector_lin, beamCorrector_lin_Bytes), "CUDA Malloc Failed"); + CArray2D P_Beams_F(CArray(nFreq), nBeams); + cublasHandle_t cublas_handle; + + cublasStatus_t stat = cublasCreate(&cublas_handle); + if (stat != CUBLAS_STATUS_SUCCESS) + { + std::cerr << "cuBLAS create failed with error: " << stat << std::endl; + return P_Beams_F; + } - // (nfreq x nBeams) * (nBeams x nBeams) = (nfreq x nBeams) for (size_t beam = 0; beam < nBeams; beam++) { - for (size_t f = 0; f < nFreq; f++) - { - P_Beams_Cor_real[f * nBeams + beam] = P_Beams_F[beam][f].real() * 1.0f; - P_Beams_Cor_imag[f * nBeams + beam] = P_Beams_F[beam][f].imag() * 1.0f; - } for (size_t beam_other = 0; beam_other < nBeams; beam_other++) { - beamCorrector_lin[beam_other * nBeams + beam] = beamCorrector[beam][beam_other]; + beamCorrector_lin_h[beam_other * nBeams + beam] = beamCorrector[beam][beam_other]; } } - SAFE_CALL( - cudaMemcpy(d_P_Beams_Cor_real, P_Beams_Cor_real, P_Beams_Cor_Bytes, cudaMemcpyHostToDevice), - "CUDA Memcpy Failed"); - SAFE_CALL( - cudaMemcpy(d_P_Beams_Cor_imag, P_Beams_Cor_imag, P_Beams_Cor_Bytes, cudaMemcpyHostToDevice), - "CUDA Memcpy Failed"); SAFE_CALL( cudaMemcpy( - d_beamCorrector_lin, beamCorrector_lin, beamCorrector_lin_Bytes, cudaMemcpyHostToDevice), - "CUDA Memcpy Failed"); + d_beamCorrector_lin, beamCorrector_lin_h, beamCorrector_lin_Bytes, cudaMemcpyHostToDevice), + "CUDA Memcpy Failed (d_beamCorrector_lin)"); + + // Define matrix dimensions and GEMM parameters + // M_gemm = rows of A (P_Beams_Cor_real/imag) = nFreq + // N_gemm = cols of B (beamCorrector_lin) and cols of C (result) = nBeams + // K_gemm = cols of A and rows of B = nBeams + const int M_gemm = nFreq; + const int N_gemm = nBeams; + const int K_gemm = nBeams; + + // Alpha and Beta for sgemm (float type) + const float alpha = 1.0f; + const float beta = 0.0f; // Setting beta to 0.0f to overwrite C + + // Real part + SAFE_CUBLAS_CALL( + cublasSgemm( + cublas_handle, + CUBLAS_OP_N, // op(A) + CUBLAS_OP_N, // op(B) + N_gemm, M_gemm, K_gemm, &alpha, + d_beamCorrector_lin, // Pointer to B_original (now A in cublasSgemm) + N_gemm, // lda: leading dimension of B_original + d_P_Beams_Cor_real, // Pointer to A_original (now B in cublasSgemm) + K_gemm, // ldb: leading dimension of A_original + &beta, + d_P_Beams_Cor_F_real, // Pointer to C_original + N_gemm), // ldc: leading dimension of C_original + "cublasSgemm Failed for real part"); + + // Imag part + SAFE_CUBLAS_CALL( + cublasSgemm( + cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, N_gemm, M_gemm, K_gemm, &alpha, d_beamCorrector_lin, + N_gemm, d_P_Beams_Cor_imag, K_gemm, &beta, d_P_Beams_Cor_F_imag, N_gemm), + "cublasSgemm Failed for imag part"); + + SAFE_CUBLAS_CALL(cublasDestroy_v2(cublas_handle), "CUBLAS_STATUS_ALLOC_FAILED"); - grid_rows = (nFreq + BLOCK_SIZE - 1) / BLOCK_SIZE; - grid_cols = (nBeams + BLOCK_SIZE - 1) / BLOCK_SIZE; - dim3 dimGrid_Beam(grid_cols, grid_rows); - - gpu_matrix_mult<<>>( - d_P_Beams_Cor_real, d_beamCorrector_lin, d_P_Beams_Cor_F_real, nFreq, nBeams, nBeams); - SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); - - gpu_matrix_mult<<>>( - d_P_Beams_Cor_imag, d_beamCorrector_lin, d_P_Beams_Cor_F_imag, nFreq, nBeams, nBeams); - SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); - - // Copy back data from destination device meory SAFE_CALL( - cudaMemcpy( - P_Beams_Cor_real_tmp, d_P_Beams_Cor_F_real, P_Beams_Cor_Bytes, cudaMemcpyDeviceToHost), - "CUDA Memcpy Failed"); + cudaMemcpy(P_Beams_Cor_real_h, d_P_Beams_Cor_F_real, P_Beams_Cor_Bytes, cudaMemcpyDeviceToHost), + "CUDA Memcpy Failed for P_Beams_Cor_real_h"); SAFE_CALL( - cudaMemcpy( - P_Beams_Cor_imag_tmp, d_P_Beams_Cor_F_imag, P_Beams_Cor_Bytes, cudaMemcpyDeviceToHost), - "CUDA Memcpy Failed"); - SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); + cudaMemcpy(P_Beams_Cor_imag_h, d_P_Beams_Cor_F_imag, P_Beams_Cor_Bytes, cudaMemcpyDeviceToHost), + "CUDA Memcpy Failed for P_Beams_Cor_imag_h"); - // Return - for (size_t beam = 0; beam < nBeams; beam++) - { - for (size_t f = 0; f < nFreq; f++) - { - P_Beams_F[beam][f] = Complex( - P_Beams_Cor_real_tmp[f * nBeams + beam] / beamCorrectorSum, - P_Beams_Cor_imag_tmp[f * nBeams + beam] / beamCorrectorSum); - } - } - - // Free memory - cudaFree(d_P_Beams_Cor_imag); - cudaFree(d_P_Beams_Cor_real); - cudaFree(d_P_Beams_Cor_F_imag); - cudaFree(d_P_Beams_Cor_F_real); - cudaFree(d_beamCorrector_lin); - cudaFreeHost(P_Beams_Cor_real); - cudaFreeHost(P_Beams_Cor_imag); - cudaFreeHost(P_Beams_Cor_real_tmp); - cudaFreeHost(P_Beams_Cor_imag_tmp); - cudaFreeHost(beamCorrector_lin); - - // For calc time measure if (debugFlag) { stop = std::chrono::high_resolution_clock::now(); duration = std::chrono::duration_cast(stop - start); - printf( - "GPU Window & Correction %lld/100 [s]\n", - static_cast(duration.count() / 10000)); + + long long dcount = duration.count(); + + printf("GPU Window & Correction %lld/100 [s]\n", dcount / 10000); + printf("GPU Window & Correction: %.3f ms\n", static_cast(dcount) / 1000.0f); + + // Write to file + debugLog << "GPU Window & Correction " << dcount / 10000 << "/100 [s]\n"; + debugLog << "GPU Window & Correction: " << static_cast(dcount) / 1000.0f << " ms\n"; + start = std::chrono::high_resolution_clock::now(); } @@ -611,31 +645,25 @@ CArray2D sonar_calculation_wrapper( // ################### FFT #####################// // #################################################// SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); - const int DATASIZE = nFreq; - const int BATCH = nBeams; + // --- Host side input data allocation and initialization cufftComplex * hostInputData = (cufftComplex *)malloc(DATASIZE * BATCH * sizeof(cufftComplex)); - for (int beam = 0; beam < BATCH; beam++) + start = std::chrono::high_resolution_clock::now(); + +#pragma omp parallel for collapse(2) + for (int beam = 0; beam < BATCH; ++beam) { - for (int f = 0; f < DATASIZE; f++) + for (int f = 0; f < DATASIZE; ++f) { - if (f < nFreq) - { - hostInputData[beam * DATASIZE + f] = - make_cuComplex(P_Beams_F[beam][f].real() * 1.0f, P_Beams_F[beam][f].imag() * 1.0f); - } - else - { - hostInputData[beam * DATASIZE + f] = (make_cuComplex(0.f, 0.f)); // zero padding - } + int idx = beam * DATASIZE + f; + const std::complex & val = P_Beams_F[beam][f]; + hostInputData[idx] = make_cuComplex( + P_Beams_Cor_real_h[f * nBeams + beam] / beamCorrectorSum, + P_Beams_Cor_imag_h[f * nBeams + beam] / beamCorrectorSum); } } // --- Device side input data allocation and initialization - cufftComplex * deviceInputData; - SAFE_CALL( - cudaMalloc((void **)&deviceInputData, DATASIZE * BATCH * sizeof(cufftComplex)), - "FFT CUDA Malloc Failed"); SAFE_CALL( cudaMemcpy( deviceInputData, hostInputData, DATASIZE * BATCH * sizeof(cufftComplex), @@ -645,10 +673,6 @@ CArray2D sonar_calculation_wrapper( // --- Host side output data allocation cufftComplex * hostOutputData = (cufftComplex *)malloc(DATASIZE * BATCH * sizeof(cufftComplex)); - // --- Device side output data allocation - cufftComplex * deviceOutputData; - cudaMalloc((void **)&deviceOutputData, DATASIZE * BATCH * sizeof(cufftComplex)); - // --- Batched 1D FFTs cufftHandle handle; int rank = 1; // --- 1D FFTs @@ -672,12 +696,6 @@ CArray2D sonar_calculation_wrapper( cudaMemcpyDeviceToHost), "FFT CUDA Memcopy Failed"); - cufftDestroy(handle); - cudaFree(deviceOutputData); - cudaFree(deviceInputData); - free(hostInputData); - free(hostOutputData); - for (int beam = 0; beam < BATCH; beam++) { for (int f = 0; f < nFreq; f++) @@ -688,15 +706,41 @@ CArray2D sonar_calculation_wrapper( } } - // For calc time measure + cufftDestroy(handle); + free(hostInputData); + free(hostOutputData); + if (debugFlag) { stop = std::chrono::high_resolution_clock::now(); duration = std::chrono::duration_cast(stop - start); - printf( - "GPU FFT Calc Time %lld/100 [s]\n", static_cast(duration.count() / 10000)); + + long long dcount = duration.count(); + float ms = static_cast(dcount) / 1000.0f; + + printf("GPU FFT Calc Time: %.3f ms\n", ms); + printf("GPU FFT Calc Time %lld/100 [s]\n", dcount / 10000); + + // Write to file + debugLog << "GPU FFT Calc Time: " << ms << " ms\n"; + debugLog << "GPU FFT Calc Time " << dcount / 10000 << "/100 [s]\n"; + + start = std::chrono::high_resolution_clock::now(); + } + + auto total_stop_time = std::chrono::high_resolution_clock::now(); + auto total_duration = + std::chrono::duration_cast(total_stop_time - total_start_time); + + if (debugFlag) + { + float ms = static_cast(total_duration.count()) / 1000.0f; + + printf("Total Sonar Calculation Wrapper Time: %.3f ms\n", ms); + + debugLog << "Total Sonar Calculation Wrapper Time: " << ms << " ms\n"; } return P_Beams_F; } -} // namespace NpsGazeboSonar +} // namespace NpsGazeboSonar \ No newline at end of file diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cuh b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cuh index 328a6b5b..651c05da 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cuh +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cuh @@ -40,14 +40,16 @@ typedef std::valarray CArray2D; /// \brief CUDA Device Check Function Wrapper void check_cuda_init_wrapper(void); +/// \brief CUDA Free Memory Function +void free_cuda_memory(); + /// \brief Sonar Claculation Function Wrapper CArray2D sonar_calculation_wrapper( - const cv::Mat & depth_image, const cv::Mat & normal_image, const cv::Mat & rand_image, - double _hPixelSize, double _vPixelSize, double _hFOV, double _vFOV, - double _beam_azimuthAngleWidth, double _beam_elevationAngleWidth, double _ray_azimuthAngleWidth, - float * _ray_elevationAngles, double _ray_elevationAngleWidth, double _soundSpeed, - double _maxDistance, double _sourceLevel, int _nBeams, int _nRays, int _raySkips, - double _sonarFreq, double _bandwidth, int _nFreq, const cv::Mat & reflectivity_image, - double _attenuation, float * _window, float ** _beamCorrector, float _beamCorrectorSum, - bool _debugFlag); + const cv::Mat & depth_image, const cv::Mat & normal_image, double _hPixelSize, double _vPixelSize, + double _hFOV, double _vFOV, double _beam_azimuthAngleWidth, double _beam_elevationAngleWidth, + double _ray_azimuthAngleWidth, float * _ray_elevationAngles, double _ray_elevationAngleWidth, + double _soundSpeed, double _maxDistance, double _sourceLevel, int _nBeams, int _nRays, + int _raySkips, double _sonarFreq, double _bandwidth, int _nFreq, + const cv::Mat & reflectivity_image, double _attenuation, float * _window, float ** _beamCorrector, + float _beamCorrectorSum, bool _debugFlag, bool _blazingFlag); } // namespace NpsGazeboSonar \ No newline at end of file diff --git a/models/dave_robot_models/description/bluerov2_heavy/model.sdf b/models/dave_robot_models/description/bluerov2_heavy/model.sdf index d5084caa..ab3f53fe 100644 --- a/models/dave_robot_models/description/bluerov2_heavy/model.sdf +++ b/models/dave_robot_models/description/bluerov2_heavy/model.sdf @@ -165,8 +165,9 @@ 10 10 0.02 - true - true + true + false + false 5 point_cloud diff --git a/models/dave_sensor_models/description/blueview_p900/model.sdf b/models/dave_sensor_models/description/blueview_p900/model.sdf index 3745c2e3..dff61f43 100644 --- a/models/dave_sensor_models/description/blueview_p900/model.sdf +++ b/models/dave_sensor_models/description/blueview_p900/model.sdf @@ -90,8 +90,9 @@ 10 10 0.02 - true - true + true + false + false 5 point_cloud