From 50cded8c18714d4cde062d5a0425494d25689cc8 Mon Sep 17 00:00:00 2001 From: helena_moyen Date: Thu, 24 Jul 2025 15:19:05 -0300 Subject: [PATCH 01/16] add sgemm acceleration --- .../multibeam_sonar/CMakeLists.txt | 9 +- .../multibeam_sonar/sonar_calculation_cuda.cu | 325 +++++++++++++----- 2 files changed, 246 insertions(+), 88 deletions(-) diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt index 4b0635ba..664a63ba 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt @@ -73,7 +73,14 @@ if(CUDAToolkit_FOUND) ${gz-transport${GZ_TRANSPORT_VER}_INCLUDE_DIRS} ${gz-sim${GZ_SIM_VER}_INCLUDE_DIRS} ) - target_link_libraries(${PROJECT_NAME} ${PCL_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES}) + + find_library(CUBLAS_LIB cublas + HINTS ${CUDAToolkit_LIBRARY_DIR} /usr/local/cuda/lib64 /usr/lib/x86_64-linux-gnu /usr/lib) + if(NOT CUBLAS_LIB) + message(FATAL_ERROR "cuBLAS not found") + endif() + + target_link_libraries(${PROJECT_NAME} ${PCL_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUBLAS_LIB}) install(TARGETS ${PROJECT_NAME} DESTINATION lib/${PROJECT_NAME} diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu index 493ff1a5..b955581d 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu @@ -5,7 +5,7 @@ * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -35,10 +35,35 @@ #include #include +#include #include #define BLOCK_SIZE 32 +// Existing SAFE_CALL (for cudaError_t) +#define SAFE_CALL(call, msg) _safe_cuda_call((call), msg, __FILE__, __LINE__) + +// New SAFE_CUBLAS_CALL (for cublasStatus_t) +#define SAFE_CUBLAS_CALL(call, msg) \ + { \ + cublasStatus_t err = (call); \ + if (err != CUBLAS_STATUS_SUCCESS) \ + { \ + fprintf(stderr, "CUBLAS Error: %s in %s at line %d: %d\n", msg, __FILE__, __LINE__, err); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define SAFE_CUFFT_CALL(call, msg) \ + { \ + cufftResult err = (call); \ + if (err != CUFFT_SUCCESS) \ + { \ + fprintf(stderr, "CUFFT Error: %s in %s at line %d: %d\n", msg, __FILE__, __LINE__, err); \ + exit(EXIT_FAILURE); \ + } \ + } + static inline void _safe_cuda_call( cudaError err, const char * msg, const char * file_name, const int line_number) { @@ -52,7 +77,8 @@ static inline void _safe_cuda_call( } } -#define SAFE_CALL(call, msg) _safe_cuda_call((call), (msg), __FILE__, __LINE__) +// REMOVED: Duplicate SAFE_CALL definition here. Keep only one. +// #define SAFE_CALL(call, msg) _safe_cuda_call((call), (msg), __FILE__, __LINE__) /////////////////////////////////////////////////////////////////////////// // Incident Angle Calculation Function @@ -127,7 +153,25 @@ __global__ void column_sums_reduce( } } -__global__ void gpu_matrix_mult(float * a, float * b, float * c, int m, int n, int k) +__global__ void gpu_matrix_mult(const __half * a, const __half * b, __half * c, int m, int n, int k) +{ + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + + if (col < k && row < m) + { + __half sum = __float2half(0.0f); + for (int i = 0; i < n; ++i) + { + __half valA = a[row * n + i]; + __half valB = b[i * k + col]; + sum = __hadd(sum, __hmul(valA, valB)); + } + c[row * k + col] = sum; + } +} + +__global__ void gpu_matrix_mult_float(float * a, float * b, float * c, int m, int n, int k) { int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; @@ -189,7 +233,7 @@ __global__ void sonar_calculation( float azimuthBeamPattern = 1.0; float elevationBeamPattern = 1.0; // float elevationBeamPattern = abs(unnormalized_sinc(M_PI * 0.884 - // / (beam_elevationAngleWidth) * + // / (beam_elevationAngleWidth) * // sin(ray_elevationAngles[ray]))); // printf("angles %f", ray_elevationAngles[ray]); @@ -274,6 +318,9 @@ CArray2D sonar_calculation_wrapper( double _attenuation, float * window, float ** beamCorrector, float beamCorrectorSum, bool debugFlag) { + // Start the timer for the entire function + auto total_start_time = std::chrono::high_resolution_clock::now(); + auto start = std::chrono::high_resolution_clock::now(); auto stop = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(stop - start); @@ -330,7 +377,9 @@ CArray2D sonar_calculation_wrapper( SAFE_CALL(cudaMalloc((void **)&d_rand_image, rand_image_Bytes), "CUDA Malloc Failed"); SAFE_CALL( cudaMalloc((void **)&d_reflectivity_image, reflectivity_image_Bytes), "CUDA Malloc Failed"); - cudaMallocHost((void **)&ray_elevationAngles, ray_elevationAngles_Bytes); + SAFE_CALL( + cudaMallocHost((void **)&ray_elevationAngles, ray_elevationAngles_Bytes), + "CUDA MallocHost Failed for ray_elevationAngles"); // Added SAFE_CALL SAFE_CALL( cudaMalloc((void **)&d_ray_elevationAngles, ray_elevationAngles_Bytes), "CUDA Malloc Failed"); for (size_t ray = 0; ray < nRays; ray++) @@ -416,7 +465,7 @@ CArray2D sonar_calculation_wrapper( // Preallocate an array for return CArray2D P_Beams_F(CArray(nFreq), nBeams); // GPU grids and rows - unsigned int grid_rows, grid_cols; + // unsigned int grid_rows, grid_cols; dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); // GPU Ray summation using column sum @@ -428,10 +477,16 @@ CArray2D sonar_calculation_wrapper( float *d_P_Ray_F_real, *d_P_Ray_F_imag; const int P_Ray_F_N = (nFreq) * 1; const int P_Ray_F_Bytes = sizeof(float) * P_Ray_F_N; - cudaMallocHost((void **)&P_Ray_real, P_Ray_Bytes); - cudaMallocHost((void **)&P_Ray_imag, P_Ray_Bytes); - cudaMallocHost((void **)&P_Ray_F_real, P_Ray_F_Bytes); - cudaMallocHost((void **)&P_Ray_F_imag, P_Ray_F_Bytes); + SAFE_CALL( + cudaMallocHost((void **)&P_Ray_real, P_Ray_Bytes), "CUDA MallocHost Failed for P_Ray_real"); + SAFE_CALL( + cudaMallocHost((void **)&P_Ray_imag, P_Ray_Bytes), "CUDA MallocHost Failed for P_Ray_imag"); + SAFE_CALL( + cudaMallocHost((void **)&P_Ray_F_real, P_Ray_F_Bytes), + "CUDA MallocHost Failed for P_Ray_F_real"); + SAFE_CALL( + cudaMallocHost((void **)&P_Ray_F_imag, P_Ray_F_Bytes), + "CUDA MallocHost Failed for P_Ray_F_imag"); SAFE_CALL(cudaMalloc((void **)&d_P_Ray_real, P_Ray_Bytes), "CUDA Malloc Failed"); SAFE_CALL(cudaMalloc((void **)&d_P_Ray_imag, P_Ray_Bytes), "CUDA Malloc Failed"); SAFE_CALL(cudaMalloc((void **)&d_P_Ray_F_real, P_Ray_F_Bytes), "CUDA Malloc Failed"); @@ -461,8 +516,10 @@ CArray2D sonar_calculation_wrapper( column_sums_reduce<<>>( d_P_Ray_real, d_P_Ray_F_real, nFreq, (int)(nRays / raySkips)); + SAFE_CALL(cudaDeviceSynchronize(), "column_sums_reduce real kernel launch failed"); column_sums_reduce<<>>( d_P_Ray_imag, d_P_Ray_F_imag, nFreq, (int)(nRays / raySkips)); + SAFE_CALL(cudaDeviceSynchronize(), "column_sums_reduce imag kernel launch failed"); SAFE_CALL( cudaMemcpy(P_Ray_F_real, d_P_Ray_F_real, P_Ray_F_Bytes, cudaMemcpyDeviceToHost), @@ -470,7 +527,6 @@ CArray2D sonar_calculation_wrapper( SAFE_CALL( cudaMemcpy(P_Ray_F_imag, d_P_Ray_F_imag, P_Ray_F_Bytes, cudaMemcpyDeviceToHost), "CUDA Memcpy Failed"); - SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); for (size_t f = 0; f < nFreq; f++) { @@ -498,103 +554,165 @@ CArray2D sonar_calculation_wrapper( start = std::chrono::high_resolution_clock::now(); } - // -------------- Beam culling correction -----------------// + // --- Beam culling correction --- // beamCorrector and beamCorrectorSum is precalculated at parent cpp - float *P_Beams_Cor_real, *P_Beams_Cor_imag; - // float *P_Beams_Cor_F_real, *P_Beams_Cor_F_imag; - float *P_Beams_Cor_real_tmp, *P_Beams_Cor_imag_tmp; - float *d_P_Beams_Cor_real, *d_P_Beams_Cor_imag; - float *d_P_Beams_Cor_F_real, *d_P_Beams_Cor_F_imag; + cudaEvent_t start_event, stop_event; + float elapsed_ms = 0.0f; + cudaEventCreate(&start_event); + cudaEventCreate(&stop_event); + + cublasHandle_t cublas_handle; + // Use SAFE_CUBLAS_CALL for cublas functions + SAFE_CUBLAS_CALL(cublasCreate_v2(&cublas_handle), "CUBLAS_STATUS_NOT_INITIALIZED"); + + float *P_Beams_Cor_real_h = nullptr, *P_Beams_Cor_imag_h = nullptr; + float *d_P_Beams_Cor_real = nullptr, *d_P_Beams_Cor_imag = nullptr; + float *d_P_Beams_Cor_F_real = nullptr, *d_P_Beams_Cor_F_imag = nullptr; + float *beamCorrector_lin_h = nullptr, *d_beamCorrector_lin = nullptr; + const int P_Beams_Cor_N = nBeams * nFreq; const int P_Beams_Cor_Bytes = sizeof(float) * P_Beams_Cor_N; - cudaMallocHost((void **)&P_Beams_Cor_real, P_Beams_Cor_Bytes); - cudaMallocHost((void **)&P_Beams_Cor_imag, P_Beams_Cor_Bytes); - cudaMallocHost((void **)&P_Beams_Cor_real_tmp, P_Beams_Cor_Bytes); - cudaMallocHost((void **)&P_Beams_Cor_imag_tmp, P_Beams_Cor_Bytes); - // cudaMallocHost((void **)&P_Beams_Cor_F_real, P_Beams_Cor_Bytes); - // cudaMallocHost((void **)&P_Beams_Cor_F_imag, P_Beams_Cor_Bytes); - SAFE_CALL(cudaMalloc((void **)&d_P_Beams_Cor_real, P_Beams_Cor_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_P_Beams_Cor_imag, P_Beams_Cor_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_P_Beams_Cor_F_real, P_Beams_Cor_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_P_Beams_Cor_F_imag, P_Beams_Cor_Bytes), "CUDA Malloc Failed"); - - float *beamCorrector_lin, *d_beamCorrector_lin; const int beamCorrector_lin_N = nBeams * nBeams; const int beamCorrector_lin_Bytes = sizeof(float) * beamCorrector_lin_N; - cudaMallocHost((void **)&beamCorrector_lin, beamCorrector_lin_Bytes); + + SAFE_CALL( + cudaMallocHost((void **)&P_Beams_Cor_real_h, P_Beams_Cor_Bytes), + "CUDA MallocHost Failed for P_Beams_Cor_real_h"); + SAFE_CALL( + cudaMallocHost((void **)&P_Beams_Cor_imag_h, P_Beams_Cor_Bytes), + "CUDA MallocHost Failed for P_Beams_Cor_imag_h"); + SAFE_CALL( + cudaMallocHost((void **)&beamCorrector_lin_h, beamCorrector_lin_Bytes), + "CUDA MallocHost Failed for beamCorrector_lin_h"); + + SAFE_CALL( + cudaMalloc((void **)&d_P_Beams_Cor_real, P_Beams_Cor_Bytes), + "CUDA Malloc Failed for d_P_Beams_Cor_real"); + SAFE_CALL( + cudaMalloc((void **)&d_P_Beams_Cor_imag, P_Beams_Cor_Bytes), + "CUDA Malloc Failed for d_P_Beams_Cor_imag"); + SAFE_CALL( + cudaMalloc((void **)&d_P_Beams_Cor_F_real, P_Beams_Cor_Bytes), + "CUDA Malloc Failed for d_P_Beams_Cor_F_real"); SAFE_CALL( - cudaMalloc((void **)&d_beamCorrector_lin, beamCorrector_lin_Bytes), "CUDA Malloc Failed"); + cudaMalloc((void **)&d_P_Beams_Cor_F_imag, P_Beams_Cor_Bytes), + "CUDA Malloc Failed for d_P_Beams_Cor_F_imag"); + SAFE_CALL( + cudaMalloc((void **)&d_beamCorrector_lin, beamCorrector_lin_Bytes), + "CUDA Malloc Failed for d_beamCorrector_lin"); - // (nfreq x nBeams) * (nBeams x nBeams) = (nfreq x nBeams) for (size_t beam = 0; beam < nBeams; beam++) { for (size_t f = 0; f < nFreq; f++) { - P_Beams_Cor_real[f * nBeams + beam] = P_Beams_F[beam][f].real() * 1.0f; - P_Beams_Cor_imag[f * nBeams + beam] = P_Beams_F[beam][f].imag() * 1.0f; + P_Beams_Cor_real_h[f * nBeams + beam] = P_Beams_F[beam][f].real(); + P_Beams_Cor_imag_h[f * nBeams + beam] = P_Beams_F[beam][f].imag(); } for (size_t beam_other = 0; beam_other < nBeams; beam_other++) { - beamCorrector_lin[beam_other * nBeams + beam] = beamCorrector[beam][beam_other]; + beamCorrector_lin_h[beam_other * nBeams + beam] = beamCorrector[beam][beam_other]; } } + // Copy data to device SAFE_CALL( - cudaMemcpy(d_P_Beams_Cor_real, P_Beams_Cor_real, P_Beams_Cor_Bytes, cudaMemcpyHostToDevice), - "CUDA Memcpy Failed"); + cudaMemcpy(d_P_Beams_Cor_real, P_Beams_Cor_real_h, P_Beams_Cor_Bytes, cudaMemcpyHostToDevice), + "CUDA Memcpy Failed (d_P_Beams_Cor_real)"); SAFE_CALL( - cudaMemcpy(d_P_Beams_Cor_imag, P_Beams_Cor_imag, P_Beams_Cor_Bytes, cudaMemcpyHostToDevice), - "CUDA Memcpy Failed"); + cudaMemcpy(d_P_Beams_Cor_imag, P_Beams_Cor_imag_h, P_Beams_Cor_Bytes, cudaMemcpyHostToDevice), + "CUDA Memcpy Failed (d_P_Beams_Cor_imag)"); SAFE_CALL( cudaMemcpy( - d_beamCorrector_lin, beamCorrector_lin, beamCorrector_lin_Bytes, cudaMemcpyHostToDevice), - "CUDA Memcpy Failed"); - - grid_rows = (nFreq + BLOCK_SIZE - 1) / BLOCK_SIZE; - grid_cols = (nBeams + BLOCK_SIZE - 1) / BLOCK_SIZE; - dim3 dimGrid_Beam(grid_cols, grid_rows); - - gpu_matrix_mult<<>>( - d_P_Beams_Cor_real, d_beamCorrector_lin, d_P_Beams_Cor_F_real, nFreq, nBeams, nBeams); - SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); - - gpu_matrix_mult<<>>( - d_P_Beams_Cor_imag, d_beamCorrector_lin, d_P_Beams_Cor_F_imag, nFreq, nBeams, nBeams); - SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); + d_beamCorrector_lin, beamCorrector_lin_h, beamCorrector_lin_Bytes, cudaMemcpyHostToDevice), + "CUDA Memcpy Failed (d_beamCorrector_lin)"); + + // Define matrix dimensions and GEMM parameters + // M_gemm = rows of A (P_Beams_Cor_real/imag) = nFreq + // N_gemm = cols of B (beamCorrector_lin) and cols of C (result) = nBeams + // K_gemm = cols of A and rows of B = nBeams + const int M_gemm = nFreq; + const int N_gemm = nBeams; + const int K_gemm = nBeams; + + // Alpha and Beta for sgemm (float type) + const float alpha = 1.0f; + const float beta = 0.0f; // Setting beta to 0.0f to overwrite C + + // Real part + cudaEventRecord(start_event); + SAFE_CUBLAS_CALL( + cublasSgemm( + cublas_handle, + CUBLAS_OP_N, // op(A) + CUBLAS_OP_N, // op(B) + N_gemm, M_gemm, K_gemm, &alpha, + d_beamCorrector_lin, // Pointer to B_original (now A in cublasSgemm) + N_gemm, // lda: leading dimension of B_original + d_P_Beams_Cor_real, // Pointer to A_original (now B in cublasSgemm) + K_gemm, // ldb: leading dimension of A_original + &beta, + d_P_Beams_Cor_F_real, // Pointer to C_original + N_gemm), // ldc: leading dimension of C_original + "cublasSgemm Failed for real part"); + + // Imag part + SAFE_CUBLAS_CALL( + cublasSgemm( + cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, N_gemm, M_gemm, K_gemm, &alpha, d_beamCorrector_lin, + N_gemm, d_P_Beams_Cor_imag, K_gemm, &beta, d_P_Beams_Cor_F_imag, N_gemm), + "cublasSgemm Failed for imag part"); + + cudaEventRecord(stop_event); + cudaEventSynchronize(stop_event); + cudaEventElapsedTime(&elapsed_ms, start_event, stop_event); + printf("cuBLAS cublasSgemm time: %.3f ms\n", elapsed_ms); + + // grid_rows = (nFreq + BLOCK_SIZE - 1) / BLOCK_SIZE; + // grid_cols = (nBeams + BLOCK_SIZE - 1) / BLOCK_SIZE; + // dim3 dimGrid_Beam(grid_cols, grid_rows); + // cudaEventRecord(start_event); + + // gpu_matrix_mult_float<<>>( + // d_P_Beams_Cor_real, d_beamCorrector_lin, d_P_Beams_Cor_F_real, nFreq, nBeams, nBeams); + // SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); + + // gpu_matrix_mult_float<<>>( + // d_P_Beams_Cor_imag, d_beamCorrector_lin, d_P_Beams_Cor_F_imag, nFreq, nBeams, nBeams); + // SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); + + // cudaEventRecord(stop_event); + // cudaEventSynchronize(stop_event); + // cudaEventElapsedTime(&elapsed_ms, start_event, stop_event); + // printf("Custom kernel time: %.3f ms\n", elapsed_ms); + cudaEventDestroy(start_event); + cudaEventDestroy(stop_event); + SAFE_CUBLAS_CALL(cublasDestroy_v2(cublas_handle), "CUBLAS_STATUS_ALLOC_FAILED"); - // Copy back data from destination device meory SAFE_CALL( - cudaMemcpy( - P_Beams_Cor_real_tmp, d_P_Beams_Cor_F_real, P_Beams_Cor_Bytes, cudaMemcpyDeviceToHost), - "CUDA Memcpy Failed"); + cudaMemcpy(P_Beams_Cor_real_h, d_P_Beams_Cor_F_real, P_Beams_Cor_Bytes, cudaMemcpyDeviceToHost), + "CUDA Memcpy Failed for P_Beams_Cor_real_h"); SAFE_CALL( - cudaMemcpy( - P_Beams_Cor_imag_tmp, d_P_Beams_Cor_F_imag, P_Beams_Cor_Bytes, cudaMemcpyDeviceToHost), - "CUDA Memcpy Failed"); - SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); + cudaMemcpy(P_Beams_Cor_imag_h, d_P_Beams_Cor_F_imag, P_Beams_Cor_Bytes, cudaMemcpyDeviceToHost), + "CUDA Memcpy Failed for P_Beams_Cor_imag_h"); - // Return for (size_t beam = 0; beam < nBeams; beam++) { for (size_t f = 0; f < nFreq; f++) { P_Beams_F[beam][f] = Complex( - P_Beams_Cor_real_tmp[f * nBeams + beam] / beamCorrectorSum, - P_Beams_Cor_imag_tmp[f * nBeams + beam] / beamCorrectorSum); + P_Beams_Cor_real_h[f * nBeams + beam] / beamCorrectorSum, + P_Beams_Cor_imag_h[f * nBeams + beam] / beamCorrectorSum); } } - // Free memory - cudaFree(d_P_Beams_Cor_imag); - cudaFree(d_P_Beams_Cor_real); - cudaFree(d_P_Beams_Cor_F_imag); - cudaFree(d_P_Beams_Cor_F_real); - cudaFree(d_beamCorrector_lin); - cudaFreeHost(P_Beams_Cor_real); - cudaFreeHost(P_Beams_Cor_imag); - cudaFreeHost(P_Beams_Cor_real_tmp); - cudaFreeHost(P_Beams_Cor_imag_tmp); - cudaFreeHost(beamCorrector_lin); + SAFE_CALL(cudaFree(d_P_Beams_Cor_imag), "cudaFree failed for d_P_Beams_Cor_imag"); + SAFE_CALL(cudaFree(d_P_Beams_Cor_real), "cudaFree failed for d_P_Beams_Cor_real"); + SAFE_CALL(cudaFree(d_P_Beams_Cor_F_imag), "cudaFree failed for d_P_Beams_Cor_F_imag"); + SAFE_CALL(cudaFree(d_P_Beams_Cor_F_real), "cudaFree failed for d_P_Beams_Cor_F_real"); + SAFE_CALL(cudaFree(d_beamCorrector_lin), "cudaFree failed for d_beamCorrector_lin"); + SAFE_CALL(cudaFreeHost(P_Beams_Cor_real_h), "cudaFreeHost failed for P_Beams_Cor_real_h"); + SAFE_CALL(cudaFreeHost(P_Beams_Cor_imag_h), "cudaFreeHost failed for P_Beams_Cor_imag_h"); + SAFE_CALL(cudaFreeHost(beamCorrector_lin_h), "cudaFreeHost failed for beamCorrector_lin_h"); // For calc time measure if (debugFlag) @@ -613,24 +731,46 @@ CArray2D sonar_calculation_wrapper( SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); const int DATASIZE = nFreq; const int BATCH = nBeams; + // --- Host side input data allocation and initialization cufftComplex * hostInputData = (cufftComplex *)malloc(DATASIZE * BATCH * sizeof(cufftComplex)); - for (int beam = 0; beam < BATCH; beam++) + start = std::chrono::high_resolution_clock::now(); + + // for (int beam = 0; beam < BATCH; beam++) + // { + // for (int f = 0; f < DATASIZE; f++) + // { + // if (f < nFreq) + // { + // hostInputData[beam * DATASIZE + f] = + // make_cuComplex(P_Beams_F[beam][f].real() * 1.0f, P_Beams_F[beam][f].imag() * 1.0f); + // } + // else + // { + // hostInputData[beam * DATASIZE + f] = (make_cuComplex(0.f, 0.f)); // zero padding + // } + // } + // } + +#pragma omp parallel for collapse(2) + for (int beam = 0; beam < BATCH; ++beam) { - for (int f = 0; f < DATASIZE; f++) + for (int f = 0; f < DATASIZE; ++f) { - if (f < nFreq) - { - hostInputData[beam * DATASIZE + f] = - make_cuComplex(P_Beams_F[beam][f].real() * 1.0f, P_Beams_F[beam][f].imag() * 1.0f); - } - else - { - hostInputData[beam * DATASIZE + f] = (make_cuComplex(0.f, 0.f)); // zero padding - } + int idx = beam * DATASIZE + f; + const std::complex & val = P_Beams_F[beam][f]; + hostInputData[idx] = make_cuComplex(val.real(), val.imag()); } } + if (debugFlag) + { + stop = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration = stop - start; + printf("GPU LOOP Computation Time: %.6f seconds\n", duration.count()); + start = std::chrono::high_resolution_clock::now(); + } + // --- Device side input data allocation and initialization cufftComplex * deviceInputData; SAFE_CALL( @@ -697,6 +837,17 @@ CArray2D sonar_calculation_wrapper( "GPU FFT Calc Time %lld/100 [s]\n", static_cast(duration.count() / 10000)); } + auto total_stop_time = std::chrono::high_resolution_clock::now(); + auto total_duration = + std::chrono::duration_cast(total_stop_time - total_start_time); + + if (debugFlag) + { + printf( + "Total Sonar Calculation Wrapper Time: %.3f ms\n", + static_cast(total_duration.count()) / 1000.0f); + } + return P_Beams_F; } -} // namespace NpsGazeboSonar +} // namespace NpsGazeboSonar \ No newline at end of file From 5aea3e1b1a7571708cd61232498ea6d89dc3c152 Mon Sep 17 00:00:00 2001 From: helena_moyen Date: Wed, 6 Aug 2025 14:58:13 -0300 Subject: [PATCH 02/16] add new summation kernel and reuse buffers --- .../multibeam_sonar/sonar_calculation_cuda.cu | 266 +++++++++--------- 1 file changed, 136 insertions(+), 130 deletions(-) diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu index b955581d..18fbe3f0 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu @@ -77,8 +77,18 @@ static inline void _safe_cuda_call( } } -// REMOVED: Duplicate SAFE_CALL definition here. Keep only one. -// #define SAFE_CALL(call, msg) _safe_cuda_call((call), (msg), __FILE__, __LINE__) +// Persistent GPU memory pointers (device) +static float * d_depth_image = nullptr; +static float * d_normal_image = nullptr; +static float * d_rand_image = nullptr; +static float * d_reflectivity_image = nullptr; +static float * d_ray_elevationAngles = nullptr; +static float * d_P_Beams_F_real = nullptr; +static float * d_P_Beams_F_imag = nullptr; +static thrust::complex * d_P_Beams = nullptr; +static thrust::complex * P_Beams = nullptr; +static bool memory_initialized = false; +float * ray_elevationAngles = nullptr; /////////////////////////////////////////////////////////////////////////// // Incident Angle Calculation Function @@ -115,44 +125,6 @@ __device__ __host__ float unnormalized_sinc(float t) } } -/////////////////////////////////////////////////////////////////////////// -template -__global__ void column_sums_reduce( - const T * __restrict__ in, T * __restrict__ out, size_t width, size_t height) -{ - __shared__ T sdata[BLOCK_SIZE][BLOCK_SIZE + 1]; - size_t idx = threadIdx.x + blockDim.x * blockIdx.x; - size_t width_stride = gridDim.x * blockDim.x; - size_t full_width = (width & (~((unsigned long long)(BLOCK_SIZE - 1)))) + - ((width & (BLOCK_SIZE - 1)) ? BLOCK_SIZE : 0); // round up to next block - for (size_t w = idx; w < full_width; w += width_stride) - { // grid-stride loop across matrix width - sdata[threadIdx.y][threadIdx.x] = 0; - size_t in_ptr = w + threadIdx.y * width; - for (size_t h = threadIdx.y; h < height; h += BLOCK_SIZE) - { // block-stride loop across matrix height - sdata[threadIdx.y][threadIdx.x] += (w < width) ? in[in_ptr] : 0; - in_ptr += width * BLOCK_SIZE; - } - __syncthreads(); - T my_val = sdata[threadIdx.x][threadIdx.y]; - for (int i = warpSize >> 1; i > 0; i >>= 1) // warp-wise parallel sum reduction - { - my_val += __shfl_xor_sync(0xFFFFFFFFU, my_val, i); - } - __syncthreads(); - if (threadIdx.x == 0) - { - sdata[0][threadIdx.y] = my_val; - } - __syncthreads(); - if ((threadIdx.y == 0) && ((w) < width)) - { - out[w] = sdata[0][threadIdx.x]; - } - } -} - __global__ void gpu_matrix_mult(const __half * a, const __half * b, __half * c, int m, int n, int k) { int row = blockIdx.y * blockDim.y + threadIdx.y; @@ -198,6 +170,56 @@ __global__ void gpu_diag_matrix_mult(float * Val, int * RowPtr, float * diagVals } } +__global__ void reduce_beams_kernel( + const thrust::complex * __restrict__ d_P_Beams, float * d_P_Beams_F_real, + float * d_P_Beams_F_imag, int nBeams, int nFreq, int nRaysSkipped) +{ + const int beam = blockIdx.y; + const int f = blockIdx.x; + + if (beam >= nBeams || f >= nFreq) + { + return; + } + + __shared__ float shared_real[BLOCK_SIZE]; + __shared__ float shared_imag[BLOCK_SIZE]; + + float sum_real = 0.0f; + float sum_imag = 0.0f; + + for (int ray = threadIdx.x; ray < nRaysSkipped; ray += blockDim.x) + { + int idx = beam * nFreq * nRaysSkipped + ray * nFreq + f; + + thrust::complex val = d_P_Beams[idx]; + sum_real += val.real(); + sum_imag += val.imag(); + } + + shared_real[threadIdx.x] = sum_real; + shared_imag[threadIdx.x] = sum_imag; + + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) + { + if (threadIdx.x < s) + { + shared_real[threadIdx.x] += shared_real[threadIdx.x + s]; + shared_imag[threadIdx.x] += shared_imag[threadIdx.x + s]; + } + __syncthreads(); + } + + if (threadIdx.x == 0) + { + int output_idx = beam * nFreq + f; + d_P_Beams_F_real[output_idx] = shared_real[0]; + d_P_Beams_F_imag[output_idx] = shared_imag[0]; + } +} + /////////////////////////////////////////////////////////////////////////// // Sonar Claculation Function __global__ void sonar_calculation( @@ -307,6 +329,20 @@ void check_cuda_init_wrapper(void) } } +void free_cuda_memory() +{ + cudaFree(d_depth_image); + cudaFree(d_normal_image); + cudaFree(d_rand_image); + cudaFree(d_reflectivity_image); + cudaFree(d_ray_elevationAngles); + cudaFree(d_P_Beams); + cudaFreeHost(P_Beams); + cudaFree(d_P_Beams_F_real); + cudaFree(d_P_Beams_F_imag); + memory_initialized = false; +} + // Sonar Claculation Function Wrapper CArray2D sonar_calculation_wrapper( const cv::Mat & depth_image, const cv::Mat & normal_image, const cv::Mat & rand_image, @@ -368,20 +404,36 @@ CArray2D sonar_calculation_wrapper( const int rand_image_Bytes = rand_image.step * rand_image.rows; const int reflectivity_image_Bytes = reflectivity_image.step * reflectivity_image.rows; const int ray_elevationAngles_Bytes = sizeof(float) * nRays; + const int P_Beams_N = nBeams * (int)(nRays / raySkips) * (nFreq + 1); + const int P_Beams_Bytes = sizeof(thrust::complex) * P_Beams_N; + + if (!memory_initialized) + { + SAFE_CALL( + cudaMalloc((void **)&d_depth_image, depth_image.step * depth_image.rows), "depth malloc"); + SAFE_CALL( + cudaMallocHost((void **)&ray_elevationAngles, sizeof(float) * nRays), + "MallocHost ray_elevationAngles"); + SAFE_CALL( + cudaMalloc((void **)&d_normal_image, normal_image.step * normal_image.rows), "normal malloc"); + SAFE_CALL(cudaMalloc((void **)&d_rand_image, rand_image.step * rand_image.rows), "rand malloc"); + SAFE_CALL( + cudaMalloc((void **)&d_reflectivity_image, reflectivity_image.step * reflectivity_image.rows), + "reflectivity malloc"); + SAFE_CALL( + cudaMalloc((void **)&d_ray_elevationAngles, sizeof(float) * nRays), "ray elevation malloc"); + SAFE_CALL( + cudaMallocHost((void **)&P_Beams, sizeof(thrust::complex) * P_Beams_N), + "P_Beams malloc host"); + SAFE_CALL( + cudaMalloc((void **)&d_P_Beams, sizeof(thrust::complex) * P_Beams_N), + "P_Beams malloc device"); + SAFE_CALL(cudaMalloc(&d_P_Beams_F_real, sizeof(float) * nBeams * nFreq), "beam real malloc"); + SAFE_CALL(cudaMalloc(&d_P_Beams_F_imag, sizeof(float) * nBeams * nFreq), "beam imag malloc"); + + memory_initialized = true; + } - // Allocate device memory - float *d_depth_image, *d_normal_image, *d_rand_image, *d_reflectivity_image, *ray_elevationAngles, - *d_ray_elevationAngles; - SAFE_CALL(cudaMalloc((void **)&d_depth_image, depth_image_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_normal_image, normal_image_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_rand_image, rand_image_Bytes), "CUDA Malloc Failed"); - SAFE_CALL( - cudaMalloc((void **)&d_reflectivity_image, reflectivity_image_Bytes), "CUDA Malloc Failed"); - SAFE_CALL( - cudaMallocHost((void **)&ray_elevationAngles, ray_elevationAngles_Bytes), - "CUDA MallocHost Failed for ray_elevationAngles"); // Added SAFE_CALL - SAFE_CALL( - cudaMalloc((void **)&d_ray_elevationAngles, ray_elevationAngles_Bytes), "CUDA Malloc Failed"); for (size_t ray = 0; ray < nRays; ray++) { ray_elevationAngles[ray] = _ray_elevationAngles[ray]; @@ -415,14 +467,6 @@ CArray2D sonar_calculation_wrapper( const dim3 grid( (depth_image.cols + block.x - 1) / block.x, (depth_image.rows + block.y - 1) / block.y); - // Beam data array - thrust::complex * P_Beams; - thrust::complex * d_P_Beams; - const int P_Beams_N = nBeams * (int)(nRays / raySkips) * (nFreq + 1); - const int P_Beams_Bytes = sizeof(thrust::complex) * P_Beams_N; - SAFE_CALL(cudaMallocHost((void **)&P_Beams, P_Beams_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_P_Beams, P_Beams_Bytes), "CUDA Malloc Failed"); - // Launch the beamor conversion kernel sonar_calculation<<>>( d_P_Beams, d_depth_image, d_normal_image, normal_image.cols, normal_image.rows, @@ -439,15 +483,6 @@ CArray2D sonar_calculation_wrapper( SAFE_CALL( cudaMemcpy(P_Beams, d_P_Beams, P_Beams_Bytes, cudaMemcpyDeviceToHost), "CUDA Memcpy Failed"); - // Free GPU memory - cudaFree(d_depth_image); - cudaFree(d_normal_image); - cudaFree(d_rand_image); - cudaFree(d_reflectivity_image); - cudaFree(d_P_Beams); - cudaFree(d_ray_elevationAngles); - cudaFreeHost(ray_elevationAngles); - // For calc time measure if (debugFlag) { @@ -462,11 +497,6 @@ CArray2D sonar_calculation_wrapper( // ########################################################// // ######### Summation, Culling and windowing #########// // ########################################################// - // Preallocate an array for return - CArray2D P_Beams_F(CArray(nFreq), nBeams); - // GPU grids and rows - // unsigned int grid_rows, grid_cols; - dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); // GPU Ray summation using column sum float *P_Ray_real, *P_Ray_imag; @@ -492,50 +522,35 @@ CArray2D sonar_calculation_wrapper( SAFE_CALL(cudaMalloc((void **)&d_P_Ray_F_real, P_Ray_F_Bytes), "CUDA Malloc Failed"); SAFE_CALL(cudaMalloc((void **)&d_P_Ray_F_imag, P_Ray_F_Bytes), "CUDA Malloc Failed"); - dim3 dimGrid_Ray((nFreq + BLOCK_SIZE - 1) / BLOCK_SIZE); + dim3 blockDim(BLOCK_SIZE); + dim3 gridDim(nFreq, nBeams); // one thread block per beam+freq pair - for (size_t beam = 0; beam < nBeams; beam++) - { - for (size_t ray = 0; ray < (int)(nRays / raySkips); ray++) - { - for (size_t f = 0; f < nFreq; f++) - { - P_Ray_real[ray * nFreq + f] = - P_Beams[beam * nFreq * (int)(nRays / raySkips) + ray * nFreq + f].real(); - P_Ray_imag[ray * nFreq + f] = - P_Beams[beam * nFreq * (int)(nRays / raySkips) + ray * nFreq + f].imag(); - } - } + reduce_beams_kernel<<>>( + d_P_Beams, d_P_Beams_F_real, d_P_Beams_F_imag, nBeams, nFreq, nRays / raySkips); + SAFE_CALL(cudaGetLastError(), "reduce_beams_kernel launch failed"); + SAFE_CALL(cudaDeviceSynchronize(), "reduce_beams_kernel execution failed"); - SAFE_CALL( - cudaMemcpy(d_P_Ray_real, P_Ray_real, P_Ray_Bytes, cudaMemcpyHostToDevice), - "CUDA Memcpy Failed"); - SAFE_CALL( - cudaMemcpy(d_P_Ray_imag, P_Ray_imag, P_Ray_Bytes, cudaMemcpyHostToDevice), - "CUDA Memcpy Failed"); - - column_sums_reduce<<>>( - d_P_Ray_real, d_P_Ray_F_real, nFreq, (int)(nRays / raySkips)); - SAFE_CALL(cudaDeviceSynchronize(), "column_sums_reduce real kernel launch failed"); - column_sums_reduce<<>>( - d_P_Ray_imag, d_P_Ray_F_imag, nFreq, (int)(nRays / raySkips)); - SAFE_CALL(cudaDeviceSynchronize(), "column_sums_reduce imag kernel launch failed"); + std::vector P_Beams_F_real(nBeams * nFreq); + std::vector P_Beams_F_imag(nBeams * nFreq); - SAFE_CALL( - cudaMemcpy(P_Ray_F_real, d_P_Ray_F_real, P_Ray_F_Bytes, cudaMemcpyDeviceToHost), - "CUDA Memcpy Failed"); - SAFE_CALL( - cudaMemcpy(P_Ray_F_imag, d_P_Ray_F_imag, P_Ray_F_Bytes, cudaMemcpyDeviceToHost), - "CUDA Memcpy Failed"); + cudaMemcpy( + P_Beams_F_real.data(), d_P_Beams_F_real, sizeof(float) * nBeams * nFreq, + cudaMemcpyDeviceToHost); + cudaMemcpy( + P_Beams_F_imag.data(), d_P_Beams_F_imag, sizeof(float) * nBeams * nFreq, + cudaMemcpyDeviceToHost); - for (size_t f = 0; f < nFreq; f++) + CArray2D P_Beams_F(CArray(nFreq), nBeams); + for (int beam = 0; beam < nBeams; beam++) + { + for (int f = 0; f < nFreq; f++) { - P_Beams_F[beam][f] = Complex(P_Ray_F_real[f], P_Ray_F_imag[f]); + P_Beams_F[beam][f] = + Complex(P_Beams_F_real[beam * nFreq + f], P_Beams_F_imag[beam * nFreq + f]); } } // free memory - cudaFreeHost(P_Beams); cudaFreeHost(P_Ray_real); cudaFreeHost(P_Ray_imag); cudaFreeHost(P_Ray_F_real); @@ -551,6 +566,7 @@ CArray2D sonar_calculation_wrapper( duration = std::chrono::duration_cast(stop - start); printf( "Sonar Ray Summation %lld/100 [s]\n", static_cast(duration.count() / 10000)); + printf("Sonar Ray Summation Time: %.3f ms\n", static_cast(duration.count()) / 1000.0f); start = std::chrono::high_resolution_clock::now(); } @@ -560,10 +576,16 @@ CArray2D sonar_calculation_wrapper( float elapsed_ms = 0.0f; cudaEventCreate(&start_event); cudaEventCreate(&stop_event); - cublasHandle_t cublas_handle; + + cublasStatus_t stat = cublasCreate(&cublas_handle); + if (stat != CUBLAS_STATUS_SUCCESS) + { + std::cerr << "cuBLAS create failed with error: " << stat << std::endl; + return P_Beams_F; + } + // Use SAFE_CUBLAS_CALL for cublas functions - SAFE_CUBLAS_CALL(cublasCreate_v2(&cublas_handle), "CUBLAS_STATUS_NOT_INITIALIZED"); float *P_Beams_Cor_real_h = nullptr, *P_Beams_Cor_imag_h = nullptr; float *d_P_Beams_Cor_real = nullptr, *d_P_Beams_Cor_imag = nullptr; @@ -666,24 +688,6 @@ CArray2D sonar_calculation_wrapper( cudaEventSynchronize(stop_event); cudaEventElapsedTime(&elapsed_ms, start_event, stop_event); printf("cuBLAS cublasSgemm time: %.3f ms\n", elapsed_ms); - - // grid_rows = (nFreq + BLOCK_SIZE - 1) / BLOCK_SIZE; - // grid_cols = (nBeams + BLOCK_SIZE - 1) / BLOCK_SIZE; - // dim3 dimGrid_Beam(grid_cols, grid_rows); - // cudaEventRecord(start_event); - - // gpu_matrix_mult_float<<>>( - // d_P_Beams_Cor_real, d_beamCorrector_lin, d_P_Beams_Cor_F_real, nFreq, nBeams, nBeams); - // SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); - - // gpu_matrix_mult_float<<>>( - // d_P_Beams_Cor_imag, d_beamCorrector_lin, d_P_Beams_Cor_F_imag, nFreq, nBeams, nBeams); - // SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); - - // cudaEventRecord(stop_event); - // cudaEventSynchronize(stop_event); - // cudaEventElapsedTime(&elapsed_ms, start_event, stop_event); - // printf("Custom kernel time: %.3f ms\n", elapsed_ms); cudaEventDestroy(start_event); cudaEventDestroy(stop_event); SAFE_CUBLAS_CALL(cublasDestroy_v2(cublas_handle), "CUBLAS_STATUS_ALLOC_FAILED"); @@ -722,6 +726,7 @@ CArray2D sonar_calculation_wrapper( printf( "GPU Window & Correction %lld/100 [s]\n", static_cast(duration.count() / 10000)); + printf("GPU Window & Correction: %.3f ms\n", static_cast(duration.count()) / 1000.0f); start = std::chrono::high_resolution_clock::now(); } @@ -833,6 +838,7 @@ CArray2D sonar_calculation_wrapper( { stop = std::chrono::high_resolution_clock::now(); duration = std::chrono::duration_cast(stop - start); + printf("GPU FFT Calc Time Time: %.3f ms\n", static_cast(duration.count()) / 1000.0f); printf( "GPU FFT Calc Time %lld/100 [s]\n", static_cast(duration.count() / 10000)); } From e92156f53951df5a3fcab0a14d2ad641c9f1ec0a Mon Sep 17 00:00:00 2001 From: helena_moyen Date: Tue, 12 Aug 2025 12:45:49 -0300 Subject: [PATCH 03/16] add free_memory to header --- .../multibeam_sonar/CMakeLists.txt | 27 +++++++++++-------- .../sonar_calculation_cuda.cuh | 3 +++ 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt index 664a63ba..1e450072 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt @@ -2,21 +2,22 @@ cmake_minimum_required(VERSION 3.11.0 FATAL_ERROR) # Suppress developer warnings for the entire workspace. set(CMAKE_SUPPRESS_DEVELOPER_WARNINGS TRUE CACHE BOOL "Suppress developer warnings" FORCE) -# Set CMP0144 to NEW to ensure find_package uses upper-case _ROOT variables. cmake_policy(SET CMP0144 NEW) project(multibeam_sonar) +set(CUDA_ARCHITECTURE "60" CACHE STRING "Target CUDA SM version") + find_package(ament_cmake REQUIRED) find_package(CUDAToolkit QUIET) -if(CUDAToolkit_FOUND) +if(BUILD_WITH_CUDA AND CUDAToolkit_FOUND) + enable_language(CUDA) find_package(CUDA REQUIRED) message(STATUS "CUDA found, enabling CUDA support.") include_directories(${CUDA_INCLUDE_DIRS}) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_60") - + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_${CUDA_ARCHITECTURE}") find_package(gz-cmake3 REQUIRED) find_package(gz-sim8 REQUIRED) find_package(gz-sensors8 REQUIRED) @@ -36,7 +37,6 @@ if(CUDAToolkit_FOUND) link_directories(${PCL_LIBRARY_DIRS}) add_definitions(${PCL_DEFINITIONS}) - # Set version variables set(GZ_MSGS_VER ${gz-msgs10_VERSION_MAJOR}) set(GZ_RENDERING_VER ${gz-rendering8_VERSION_MAJOR}) set(GZ_SENSORS_VER ${gz-sensors8_VERSION_MAJOR}) @@ -62,7 +62,9 @@ if(CUDAToolkit_FOUND) ) set_target_properties(${PROJECT_NAME} - PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + ) target_include_directories(${PROJECT_NAME} PUBLIC @@ -75,12 +77,17 @@ if(CUDAToolkit_FOUND) ) find_library(CUBLAS_LIB cublas - HINTS ${CUDAToolkit_LIBRARY_DIR} /usr/local/cuda/lib64 /usr/lib/x86_64-linux-gnu /usr/lib) + HINTS ${CUDAToolkit_LIBRARY_DIR} /usr/local/cuda/lib64 /usr/lib/x86_64-linux-gnu /usr/lib) if(NOT CUBLAS_LIB) message(FATAL_ERROR "cuBLAS not found") endif() - target_link_libraries(${PROJECT_NAME} ${PCL_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUBLAS_LIB}) + target_link_libraries(${PROJECT_NAME} + ${PCL_LIBRARIES} + ${CUDA_LIBRARIES} + ${CUDA_CUFFT_LIBRARIES} + ${CUBLAS_LIB} + ) install(TARGETS ${PROJECT_NAME} DESTINATION lib/${PROJECT_NAME} @@ -103,14 +110,12 @@ if(CUDAToolkit_FOUND) cv_bridge ) - - # Environment hooks ament_environment_hooks( "${CMAKE_CURRENT_SOURCE_DIR}/hooks/${PROJECT_NAME}.dsv.in" ) else() - message(STATUS "CUDA Toolkit not found: Skipping CUDA-specific targets") + message(STATUS "CUDA Toolkit not found or disabled: Skipping CUDA-specific targets") endif() ament_package() diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cuh b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cuh index 328a6b5b..ef78b07a 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cuh +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cuh @@ -40,6 +40,9 @@ typedef std::valarray CArray2D; /// \brief CUDA Device Check Function Wrapper void check_cuda_init_wrapper(void); +/// \brief CUDA Free Memory Function +void free_cuda_memory(); + /// \brief Sonar Claculation Function Wrapper CArray2D sonar_calculation_wrapper( const cv::Mat & depth_image, const cv::Mat & normal_image, const cv::Mat & rand_image, From edd14e0e645ac260bc4134130e60c1a1bbf356bf Mon Sep 17 00:00:00 2001 From: helena_moyen Date: Tue, 12 Aug 2025 16:30:10 -0300 Subject: [PATCH 04/16] change complex exp calculation --- .../multibeam_sonar/sonar_calculation_cuda.cu | 229 ++++++++++-------- 1 file changed, 129 insertions(+), 100 deletions(-) diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu index 18fbe3f0..662a054c 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu @@ -90,6 +90,25 @@ static thrust::complex * P_Beams = nullptr; static bool memory_initialized = false; float * ray_elevationAngles = nullptr; +float *P_Ray_real, *P_Ray_imag; +float *d_P_Ray_real, *d_P_Ray_imag; +int P_Ray_N; +int P_Ray_Bytes; +float *P_Ray_F_real, *P_Ray_F_imag; +float *d_P_Ray_F_real, *d_P_Ray_F_imag; +int P_Ray_F_N; +int P_Ray_F_Bytes; + +float *P_Beams_Cor_real_h = nullptr, *P_Beams_Cor_imag_h = nullptr; +float *d_P_Beams_Cor_real = nullptr, *d_P_Beams_Cor_imag = nullptr; +float *d_P_Beams_Cor_F_real = nullptr, *d_P_Beams_Cor_F_imag = nullptr; +float *beamCorrector_lin_h = nullptr, *d_beamCorrector_lin = nullptr; + +int P_Beams_Cor_N; +int P_Beams_Cor_Bytes; +int beamCorrector_lin_N; +int beamCorrector_lin_Bytes; + /////////////////////////////////////////////////////////////////////////// // Incident Angle Calculation Function // incidence angle is target's normal angle accounting for the ray's azimuth @@ -295,19 +314,24 @@ __global__ void sonar_calculation( float freq; if (nFreq % 2 == 0) { - freq = delta_f * (-nFreq / 2.0 + f * 1.0f + 1.0); + freq = delta_f * (-nFreq / 2.0f + f + 1.0f); } else { - freq = delta_f * (-(nFreq - 1) / 2.0 + f * 1.0f + 1.0); + freq = delta_f * (-(nFreq - 1) / 2.0f + f + 1.0f); } - float kw = 2.0 * M_PI * freq / soundSpeed; // wave vector + float kw = 2.0f * M_PI * freq / soundSpeed; // wave vector + + float phase = 2.0f * distance * kw; + float s, c; + __sincosf(phase, &s, &c); + + thrust::complex kernel(c, s); + kernel *= amplitude; + + int ray_index = ray / raySkips; // Map ray to reduced ray index - // Transmit spectrum, frequency domain - thrust::complex kernel = - exp(thrust::complex(0.0f, 2.0f * distance * kw)) * amplitude; - P_Beams[beam * nFreq * (int)(nRays / raySkips) + (int)(ray / raySkips) * nFreq + f] = - thrust::complex(kernel.real(), kernel.imag()); + P_Beams[beam * nFreq * (nRays / raySkips) + ray_index * nFreq + f] = kernel; } } } @@ -331,15 +355,32 @@ void check_cuda_init_wrapper(void) void free_cuda_memory() { - cudaFree(d_depth_image); - cudaFree(d_normal_image); - cudaFree(d_rand_image); - cudaFree(d_reflectivity_image); - cudaFree(d_ray_elevationAngles); - cudaFree(d_P_Beams); - cudaFreeHost(P_Beams); - cudaFree(d_P_Beams_F_real); - cudaFree(d_P_Beams_F_imag); + SAFE_CALL(cudaFree(d_depth_image), "cudaFree failed for d_depth_image"); + SAFE_CALL(cudaFree(d_normal_image), "cudaFree failed for d_normal_image"); + SAFE_CALL(cudaFree(d_rand_image), "cudaFree failed for d_rand_image"); + SAFE_CALL(cudaFree(d_reflectivity_image), "cudaFree failed for d_reflectivity_image"); + SAFE_CALL(cudaFree(d_ray_elevationAngles), "cudaFree failed for d_ray_elevationAngles"); + SAFE_CALL(cudaFree(d_P_Beams), "cudaFree failed for d_P_Beams"); + SAFE_CALL(cudaFreeHost(P_Beams), "cudaFreeHost failed for P_Beams"); + SAFE_CALL(cudaFree(d_P_Beams_F_real), "cudaFree failed for d_P_Beams_F_real"); + SAFE_CALL(cudaFree(d_P_Beams_F_imag), "cudaFree failed for d_P_Beams_F_imag"); + SAFE_CALL(cudaFreeHost(P_Ray_real), "cudaFreeHost failed for P_Ray_real"); + SAFE_CALL(cudaFreeHost(P_Ray_imag), "cudaFreeHost failed for P_Ray_imag"); + SAFE_CALL(cudaFreeHost(P_Ray_F_real), "cudaFreeHost failed for P_Ray_F_real"); + SAFE_CALL(cudaFreeHost(P_Ray_F_imag), "cudaFreeHost failed for P_Ray_F_imag"); + SAFE_CALL(cudaFree(d_P_Ray_real), "cudaFree failed for d_P_Ray_real"); + SAFE_CALL(cudaFree(d_P_Ray_imag), "cudaFree failed for d_P_Ray_imag"); + SAFE_CALL(cudaFree(d_P_Ray_F_real), "cudaFree failed for d_P_Ray_F_real"); + SAFE_CALL(cudaFree(d_P_Ray_F_imag), "cudaFree failed for d_P_Ray_F_imag"); + SAFE_CALL(cudaFree(d_P_Beams_Cor_imag), "cudaFree failed for d_P_Beams_Cor_imag"); + SAFE_CALL(cudaFree(d_P_Beams_Cor_real), "cudaFree failed for d_P_Beams_Cor_real"); + SAFE_CALL(cudaFree(d_P_Beams_Cor_F_imag), "cudaFree failed for d_P_Beams_Cor_F_imag"); + SAFE_CALL(cudaFree(d_P_Beams_Cor_F_real), "cudaFree failed for d_P_Beams_Cor_F_real"); + SAFE_CALL(cudaFree(d_beamCorrector_lin), "cudaFree failed for d_beamCorrector_lin"); + SAFE_CALL(cudaFreeHost(P_Beams_Cor_real_h), "cudaFreeHost failed for P_Beams_Cor_real_h"); + SAFE_CALL(cudaFreeHost(P_Beams_Cor_imag_h), "cudaFreeHost failed for P_Beams_Cor_imag_h"); + SAFE_CALL(cudaFreeHost(beamCorrector_lin_h), "cudaFreeHost failed for beamCorrector_lin_h"); + memory_initialized = false; } @@ -407,8 +448,19 @@ CArray2D sonar_calculation_wrapper( const int P_Beams_N = nBeams * (int)(nRays / raySkips) * (nFreq + 1); const int P_Beams_Bytes = sizeof(thrust::complex) * P_Beams_N; + P_Ray_N = (int)(nRays / raySkips) * (nFreq); + P_Ray_Bytes = sizeof(float) * P_Ray_N; + P_Ray_F_N = (nFreq) * 1; + P_Ray_F_Bytes = sizeof(float) * P_Ray_F_N; + + P_Beams_Cor_N = nBeams * nFreq; + P_Beams_Cor_Bytes = sizeof(float) * P_Beams_Cor_N; + beamCorrector_lin_N = nBeams * nBeams; + beamCorrector_lin_Bytes = sizeof(float) * beamCorrector_lin_N; + if (!memory_initialized) { + std::cout << "Initializing..." << std::endl; SAFE_CALL( cudaMalloc((void **)&d_depth_image, depth_image.step * depth_image.rows), "depth malloc"); SAFE_CALL( @@ -430,6 +482,49 @@ CArray2D sonar_calculation_wrapper( "P_Beams malloc device"); SAFE_CALL(cudaMalloc(&d_P_Beams_F_real, sizeof(float) * nBeams * nFreq), "beam real malloc"); SAFE_CALL(cudaMalloc(&d_P_Beams_F_imag, sizeof(float) * nBeams * nFreq), "beam imag malloc"); + std::cout << "Middle..." << std::endl; + + SAFE_CALL( + cudaMallocHost((void **)&P_Ray_real, P_Ray_Bytes), "CUDA MallocHost Failed for P_Ray_real"); + SAFE_CALL( + cudaMallocHost((void **)&P_Ray_imag, P_Ray_Bytes), "CUDA MallocHost Failed for P_Ray_imag"); + SAFE_CALL( + cudaMallocHost((void **)&P_Ray_F_real, P_Ray_F_Bytes), + "CUDA MallocHost Failed for P_Ray_F_real"); + SAFE_CALL( + cudaMallocHost((void **)&P_Ray_F_imag, P_Ray_F_Bytes), + "CUDA MallocHost Failed for P_Ray_F_imag"); + SAFE_CALL(cudaMalloc((void **)&d_P_Ray_real, P_Ray_Bytes), "CUDA Malloc Failed"); + SAFE_CALL(cudaMalloc((void **)&d_P_Ray_imag, P_Ray_Bytes), "CUDA Malloc Failed"); + SAFE_CALL(cudaMalloc((void **)&d_P_Ray_F_real, P_Ray_F_Bytes), "CUDA Malloc Failed"); + SAFE_CALL(cudaMalloc((void **)&d_P_Ray_F_imag, P_Ray_F_Bytes), "CUDA Malloc Failed"); + SAFE_CALL( + cudaMallocHost((void **)&P_Beams_Cor_real_h, P_Beams_Cor_Bytes), + "CUDA MallocHost Failed for P_Beams_Cor_real_h"); + SAFE_CALL( + cudaMallocHost((void **)&P_Beams_Cor_imag_h, P_Beams_Cor_Bytes), + "CUDA MallocHost Failed for P_Beams_Cor_imag_h"); + SAFE_CALL( + cudaMallocHost((void **)&beamCorrector_lin_h, beamCorrector_lin_Bytes), + "CUDA MallocHost Failed for beamCorrector_lin_h"); + std::cout << "DONE..." << std::endl; + + SAFE_CALL( + cudaMalloc((void **)&d_P_Beams_Cor_real, P_Beams_Cor_Bytes), + "CUDA Malloc Failed for d_P_Beams_Cor_real"); + SAFE_CALL( + cudaMalloc((void **)&d_P_Beams_Cor_imag, P_Beams_Cor_Bytes), + "CUDA Malloc Failed for d_P_Beams_Cor_imag"); + SAFE_CALL( + cudaMalloc((void **)&d_P_Beams_Cor_F_real, P_Beams_Cor_Bytes), + "CUDA Malloc Failed for d_P_Beams_Cor_F_real"); + SAFE_CALL( + cudaMalloc((void **)&d_P_Beams_Cor_F_imag, P_Beams_Cor_Bytes), + "CUDA Malloc Failed for d_P_Beams_Cor_F_imag"); + SAFE_CALL( + cudaMalloc((void **)&d_beamCorrector_lin, beamCorrector_lin_Bytes), + "CUDA Malloc Failed for d_beamCorrector_lin"); + std::cout << "realdone..." << std::endl; memory_initialized = true; } @@ -439,7 +534,7 @@ CArray2D sonar_calculation_wrapper( ray_elevationAngles[ray] = _ray_elevationAngles[ray]; } - // Copy data from OpenCV input image to device memory + // Perform your cudaMemcpy calls SAFE_CALL( cudaMemcpy(d_depth_image, depth_image.ptr(), depth_image_Bytes, cudaMemcpyHostToDevice), "CUDA Memcpy Failed"); @@ -467,6 +562,13 @@ CArray2D sonar_calculation_wrapper( const dim3 grid( (depth_image.cols + block.x - 1) / block.x, (depth_image.rows + block.y - 1) / block.y); + cudaEvent_t start1, stop1; + float milliseconds = 0; + + cudaEventCreate(&start1); + cudaEventCreate(&stop1); + cudaEventRecord(start1); + // Launch the beamor conversion kernel sonar_calculation<<>>( d_P_Beams, d_depth_image, d_normal_image, normal_image.cols, normal_image.rows, @@ -479,6 +581,15 @@ CArray2D sonar_calculation_wrapper( // Synchronize to check for any kernel launch errors SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); + cudaEventRecord(stop1); + cudaEventSynchronize(stop1); + + cudaEventElapsedTime(&milliseconds, start1, stop1); + cudaEventDestroy(start1); + cudaEventDestroy(stop1); + + printf("Total sonar calculation (kernel) time: %.3f ms\n", milliseconds); + // Copy back data from destination device meory to OpenCV output image SAFE_CALL( cudaMemcpy(P_Beams, d_P_Beams, P_Beams_Bytes, cudaMemcpyDeviceToHost), "CUDA Memcpy Failed"); @@ -498,30 +609,6 @@ CArray2D sonar_calculation_wrapper( // ######### Summation, Culling and windowing #########// // ########################################################// - // GPU Ray summation using column sum - float *P_Ray_real, *P_Ray_imag; - float *d_P_Ray_real, *d_P_Ray_imag; - const int P_Ray_N = (int)(nRays / raySkips) * (nFreq); - const int P_Ray_Bytes = sizeof(float) * P_Ray_N; - float *P_Ray_F_real, *P_Ray_F_imag; - float *d_P_Ray_F_real, *d_P_Ray_F_imag; - const int P_Ray_F_N = (nFreq) * 1; - const int P_Ray_F_Bytes = sizeof(float) * P_Ray_F_N; - SAFE_CALL( - cudaMallocHost((void **)&P_Ray_real, P_Ray_Bytes), "CUDA MallocHost Failed for P_Ray_real"); - SAFE_CALL( - cudaMallocHost((void **)&P_Ray_imag, P_Ray_Bytes), "CUDA MallocHost Failed for P_Ray_imag"); - SAFE_CALL( - cudaMallocHost((void **)&P_Ray_F_real, P_Ray_F_Bytes), - "CUDA MallocHost Failed for P_Ray_F_real"); - SAFE_CALL( - cudaMallocHost((void **)&P_Ray_F_imag, P_Ray_F_Bytes), - "CUDA MallocHost Failed for P_Ray_F_imag"); - SAFE_CALL(cudaMalloc((void **)&d_P_Ray_real, P_Ray_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_P_Ray_imag, P_Ray_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_P_Ray_F_real, P_Ray_F_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_P_Ray_F_imag, P_Ray_F_Bytes), "CUDA Malloc Failed"); - dim3 blockDim(BLOCK_SIZE); dim3 gridDim(nFreq, nBeams); // one thread block per beam+freq pair @@ -550,16 +637,6 @@ CArray2D sonar_calculation_wrapper( } } - // free memory - cudaFreeHost(P_Ray_real); - cudaFreeHost(P_Ray_imag); - cudaFreeHost(P_Ray_F_real); - cudaFreeHost(P_Ray_F_imag); - cudaFree(d_P_Ray_real); - cudaFree(d_P_Ray_imag); - cudaFree(d_P_Ray_F_real); - cudaFree(d_P_Ray_F_imag); - if (debugFlag) { stop = std::chrono::high_resolution_clock::now(); @@ -585,44 +662,6 @@ CArray2D sonar_calculation_wrapper( return P_Beams_F; } - // Use SAFE_CUBLAS_CALL for cublas functions - - float *P_Beams_Cor_real_h = nullptr, *P_Beams_Cor_imag_h = nullptr; - float *d_P_Beams_Cor_real = nullptr, *d_P_Beams_Cor_imag = nullptr; - float *d_P_Beams_Cor_F_real = nullptr, *d_P_Beams_Cor_F_imag = nullptr; - float *beamCorrector_lin_h = nullptr, *d_beamCorrector_lin = nullptr; - - const int P_Beams_Cor_N = nBeams * nFreq; - const int P_Beams_Cor_Bytes = sizeof(float) * P_Beams_Cor_N; - const int beamCorrector_lin_N = nBeams * nBeams; - const int beamCorrector_lin_Bytes = sizeof(float) * beamCorrector_lin_N; - - SAFE_CALL( - cudaMallocHost((void **)&P_Beams_Cor_real_h, P_Beams_Cor_Bytes), - "CUDA MallocHost Failed for P_Beams_Cor_real_h"); - SAFE_CALL( - cudaMallocHost((void **)&P_Beams_Cor_imag_h, P_Beams_Cor_Bytes), - "CUDA MallocHost Failed for P_Beams_Cor_imag_h"); - SAFE_CALL( - cudaMallocHost((void **)&beamCorrector_lin_h, beamCorrector_lin_Bytes), - "CUDA MallocHost Failed for beamCorrector_lin_h"); - - SAFE_CALL( - cudaMalloc((void **)&d_P_Beams_Cor_real, P_Beams_Cor_Bytes), - "CUDA Malloc Failed for d_P_Beams_Cor_real"); - SAFE_CALL( - cudaMalloc((void **)&d_P_Beams_Cor_imag, P_Beams_Cor_Bytes), - "CUDA Malloc Failed for d_P_Beams_Cor_imag"); - SAFE_CALL( - cudaMalloc((void **)&d_P_Beams_Cor_F_real, P_Beams_Cor_Bytes), - "CUDA Malloc Failed for d_P_Beams_Cor_F_real"); - SAFE_CALL( - cudaMalloc((void **)&d_P_Beams_Cor_F_imag, P_Beams_Cor_Bytes), - "CUDA Malloc Failed for d_P_Beams_Cor_F_imag"); - SAFE_CALL( - cudaMalloc((void **)&d_beamCorrector_lin, beamCorrector_lin_Bytes), - "CUDA Malloc Failed for d_beamCorrector_lin"); - for (size_t beam = 0; beam < nBeams; beam++) { for (size_t f = 0; f < nFreq; f++) @@ -708,16 +747,6 @@ CArray2D sonar_calculation_wrapper( P_Beams_Cor_imag_h[f * nBeams + beam] / beamCorrectorSum); } } - - SAFE_CALL(cudaFree(d_P_Beams_Cor_imag), "cudaFree failed for d_P_Beams_Cor_imag"); - SAFE_CALL(cudaFree(d_P_Beams_Cor_real), "cudaFree failed for d_P_Beams_Cor_real"); - SAFE_CALL(cudaFree(d_P_Beams_Cor_F_imag), "cudaFree failed for d_P_Beams_Cor_F_imag"); - SAFE_CALL(cudaFree(d_P_Beams_Cor_F_real), "cudaFree failed for d_P_Beams_Cor_F_real"); - SAFE_CALL(cudaFree(d_beamCorrector_lin), "cudaFree failed for d_beamCorrector_lin"); - SAFE_CALL(cudaFreeHost(P_Beams_Cor_real_h), "cudaFreeHost failed for P_Beams_Cor_real_h"); - SAFE_CALL(cudaFreeHost(P_Beams_Cor_imag_h), "cudaFreeHost failed for P_Beams_Cor_imag_h"); - SAFE_CALL(cudaFreeHost(beamCorrector_lin_h), "cudaFreeHost failed for beamCorrector_lin_h"); - // For calc time measure if (debugFlag) { From cb1f251f74ab565099bb7b61725d1a1238480a5f Mon Sep 17 00:00:00 2001 From: helena_moyen Date: Sat, 16 Aug 2025 02:44:04 -0300 Subject: [PATCH 05/16] cleanup and add debug --- .../multibeam_sonar/sonar_calculation_cuda.cu | 131 +++++++----------- 1 file changed, 52 insertions(+), 79 deletions(-) diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu index 662a054c..c5f05f50 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu @@ -38,6 +38,11 @@ #include #include +// FOR DEBUG -- DEV VERSION +#include + +std::ofstream debugLog("debug_timings.txt", std::ios::app); + #define BLOCK_SIZE 32 // Existing SAFE_CALL (for cudaError_t) @@ -144,51 +149,6 @@ __device__ __host__ float unnormalized_sinc(float t) } } -__global__ void gpu_matrix_mult(const __half * a, const __half * b, __half * c, int m, int n, int k) -{ - int row = blockIdx.y * blockDim.y + threadIdx.y; - int col = blockIdx.x * blockDim.x + threadIdx.x; - - if (col < k && row < m) - { - __half sum = __float2half(0.0f); - for (int i = 0; i < n; ++i) - { - __half valA = a[row * n + i]; - __half valB = b[i * k + col]; - sum = __hadd(sum, __hmul(valA, valB)); - } - c[row * k + col] = sum; - } -} - -__global__ void gpu_matrix_mult_float(float * a, float * b, float * c, int m, int n, int k) -{ - int row = blockIdx.y * blockDim.y + threadIdx.y; - int col = blockIdx.x * blockDim.x + threadIdx.x; - float sum = 0; - if (col < k && row < m) - { - for (int i = 0; i < n; i++) - { - sum += a[row * n + i] * b[i * k + col]; - } - c[row * k + col] = sum; - } -} - -__global__ void gpu_diag_matrix_mult(float * Val, int * RowPtr, float * diagVals, int total_rows) -{ - const int row = threadIdx.x + blockIdx.x * blockDim.x; - if (row < total_rows) - { - for (int i = RowPtr[row]; i < RowPtr[row + 1]; i++) - { - Val[i] = diagVals[row] * Val[i]; - } - } -} - __global__ void reduce_beams_kernel( const thrust::complex * __restrict__ d_P_Beams, float * d_P_Beams_F_real, float * d_P_Beams_F_imag, int nBeams, int nFreq, int nRaysSkipped) @@ -460,7 +420,6 @@ CArray2D sonar_calculation_wrapper( if (!memory_initialized) { - std::cout << "Initializing..." << std::endl; SAFE_CALL( cudaMalloc((void **)&d_depth_image, depth_image.step * depth_image.rows), "depth malloc"); SAFE_CALL( @@ -482,8 +441,6 @@ CArray2D sonar_calculation_wrapper( "P_Beams malloc device"); SAFE_CALL(cudaMalloc(&d_P_Beams_F_real, sizeof(float) * nBeams * nFreq), "beam real malloc"); SAFE_CALL(cudaMalloc(&d_P_Beams_F_imag, sizeof(float) * nBeams * nFreq), "beam imag malloc"); - std::cout << "Middle..." << std::endl; - SAFE_CALL( cudaMallocHost((void **)&P_Ray_real, P_Ray_Bytes), "CUDA MallocHost Failed for P_Ray_real"); SAFE_CALL( @@ -507,8 +464,6 @@ CArray2D sonar_calculation_wrapper( SAFE_CALL( cudaMallocHost((void **)&beamCorrector_lin_h, beamCorrector_lin_Bytes), "CUDA MallocHost Failed for beamCorrector_lin_h"); - std::cout << "DONE..." << std::endl; - SAFE_CALL( cudaMalloc((void **)&d_P_Beams_Cor_real, P_Beams_Cor_Bytes), "CUDA Malloc Failed for d_P_Beams_Cor_real"); @@ -524,8 +479,6 @@ CArray2D sonar_calculation_wrapper( SAFE_CALL( cudaMalloc((void **)&d_beamCorrector_lin, beamCorrector_lin_Bytes), "CUDA Malloc Failed for d_beamCorrector_lin"); - std::cout << "realdone..." << std::endl; - memory_initialized = true; } @@ -594,14 +547,20 @@ CArray2D sonar_calculation_wrapper( SAFE_CALL( cudaMemcpy(P_Beams, d_P_Beams, P_Beams_Bytes, cudaMemcpyDeviceToHost), "CUDA Memcpy Failed"); - // For calc time measure if (debugFlag) { stop = std::chrono::high_resolution_clock::now(); duration = std::chrono::duration_cast(stop - start); - printf( - "GPU Sonar Computation Time %lld/100 [s]\n", - static_cast(duration.count() / 10000)); + + long long dcount = duration.count(); + float ms = static_cast(dcount) / 1000.0f; + + printf("GPU Sonar Computation Time %lld/100 [s]\n", dcount / 10000); + printf("GPU Sonar Summation Time: %.3f ms\n", ms); + + debugLog << "GPU Sonar Computation Time " << dcount / 10000 << "/100 [s]\n"; + debugLog << "GPU Sonar Summation Time: " << ms << " ms\n"; + start = std::chrono::high_resolution_clock::now(); } @@ -641,9 +600,15 @@ CArray2D sonar_calculation_wrapper( { stop = std::chrono::high_resolution_clock::now(); duration = std::chrono::duration_cast(stop - start); - printf( - "Sonar Ray Summation %lld/100 [s]\n", static_cast(duration.count() / 10000)); - printf("Sonar Ray Summation Time: %.3f ms\n", static_cast(duration.count()) / 1000.0f); + + long long dcount = duration.count(); + + printf("Sonar Ray Summation %lld/100 [s]\n", dcount / 10000); + printf("Sonar Ray Summation Time: %.3f ms\n", static_cast(dcount) / 1000.0f); + + debugLog << "Sonar Ray Summation " << dcount / 10000 << "/100 [s]\n"; + debugLog << "Sonar Ray Summation Time: " << static_cast(dcount) / 1000.0f << " ms\n"; + start = std::chrono::high_resolution_clock::now(); } @@ -747,15 +712,21 @@ CArray2D sonar_calculation_wrapper( P_Beams_Cor_imag_h[f * nBeams + beam] / beamCorrectorSum); } } - // For calc time measure + if (debugFlag) { stop = std::chrono::high_resolution_clock::now(); duration = std::chrono::duration_cast(stop - start); - printf( - "GPU Window & Correction %lld/100 [s]\n", - static_cast(duration.count() / 10000)); - printf("GPU Window & Correction: %.3f ms\n", static_cast(duration.count()) / 1000.0f); + + long long dcount = duration.count(); + + printf("GPU Window & Correction %lld/100 [s]\n", dcount / 10000); + printf("GPU Window & Correction: %.3f ms\n", static_cast(dcount) / 1000.0f); + + // Write to file + debugLog << "GPU Window & Correction " << dcount / 10000 << "/100 [s]\n"; + debugLog << "GPU Window & Correction: " << static_cast(dcount) / 1000.0f << " ms\n"; + start = std::chrono::high_resolution_clock::now(); } @@ -797,14 +768,6 @@ CArray2D sonar_calculation_wrapper( } } - if (debugFlag) - { - stop = std::chrono::high_resolution_clock::now(); - std::chrono::duration duration = stop - start; - printf("GPU LOOP Computation Time: %.6f seconds\n", duration.count()); - start = std::chrono::high_resolution_clock::now(); - } - // --- Device side input data allocation and initialization cufftComplex * deviceInputData; SAFE_CALL( @@ -862,14 +825,22 @@ CArray2D sonar_calculation_wrapper( } } - // For calc time measure if (debugFlag) { stop = std::chrono::high_resolution_clock::now(); duration = std::chrono::duration_cast(stop - start); - printf("GPU FFT Calc Time Time: %.3f ms\n", static_cast(duration.count()) / 1000.0f); - printf( - "GPU FFT Calc Time %lld/100 [s]\n", static_cast(duration.count() / 10000)); + + long long dcount = duration.count(); + float ms = static_cast(dcount) / 1000.0f; + + printf("GPU FFT Calc Time: %.3f ms\n", ms); + printf("GPU FFT Calc Time %lld/100 [s]\n", dcount / 10000); + + // Write to file + debugLog << "GPU FFT Calc Time: " << ms << " ms\n"; + debugLog << "GPU FFT Calc Time " << dcount / 10000 << "/100 [s]\n"; + + start = std::chrono::high_resolution_clock::now(); } auto total_stop_time = std::chrono::high_resolution_clock::now(); @@ -878,9 +849,11 @@ CArray2D sonar_calculation_wrapper( if (debugFlag) { - printf( - "Total Sonar Calculation Wrapper Time: %.3f ms\n", - static_cast(total_duration.count()) / 1000.0f); + float ms = static_cast(total_duration.count()) / 1000.0f; + + printf("Total Sonar Calculation Wrapper Time: %.3f ms\n", ms); + + debugLog << "Total Sonar Calculation Wrapper Time: " << ms << " ms\n"; } return P_Beams_F; From aaf4a9361e3bb1c93d47175d769a27198009a2de Mon Sep 17 00:00:00 2001 From: helena_moyen Date: Wed, 20 Aug 2025 05:06:17 -0300 Subject: [PATCH 06/16] clear debug file --- .../multibeam_sonar/sonar_calculation_cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu index c5f05f50..42c46607 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu @@ -41,7 +41,7 @@ // FOR DEBUG -- DEV VERSION #include -std::ofstream debugLog("debug_timings.txt", std::ios::app); +std::ofstream debugLog("debug_timings.txt"); #define BLOCK_SIZE 32 From 64096c65f2aacf32b664053796ee9ac1be7ea1fc Mon Sep 17 00:00:00 2001 From: helena_moyen Date: Wed, 20 Aug 2025 16:15:16 -0300 Subject: [PATCH 07/16] correct debug --- .../multibeam_sonar/sonar_calculation_cuda.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu index 42c46607..113b824f 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu @@ -556,10 +556,10 @@ CArray2D sonar_calculation_wrapper( float ms = static_cast(dcount) / 1000.0f; printf("GPU Sonar Computation Time %lld/100 [s]\n", dcount / 10000); - printf("GPU Sonar Summation Time: %.3f ms\n", ms); + printf("GPU Sonar Computation Time: %.3f ms\n", ms); debugLog << "GPU Sonar Computation Time " << dcount / 10000 << "/100 [s]\n"; - debugLog << "GPU Sonar Summation Time: " << ms << " ms\n"; + debugLog << "GPU Sonar Computation Time: " << ms << " ms\n"; start = std::chrono::high_resolution_clock::now(); } From 3aaf0d05737eb1f315d35d1740f4c1a88b0c28b4 Mon Sep 17 00:00:00 2001 From: helena_moyen Date: Wed, 20 Aug 2025 17:00:04 -0300 Subject: [PATCH 08/16] remove memcpy for pbeams --- .../multibeam_sonar/sonar_calculation_cuda.cu | 6 ------ 1 file changed, 6 deletions(-) diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu index 113b824f..6eac839f 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu @@ -541,12 +541,6 @@ CArray2D sonar_calculation_wrapper( cudaEventDestroy(start1); cudaEventDestroy(stop1); - printf("Total sonar calculation (kernel) time: %.3f ms\n", milliseconds); - - // Copy back data from destination device meory to OpenCV output image - SAFE_CALL( - cudaMemcpy(P_Beams, d_P_Beams, P_Beams_Bytes, cudaMemcpyDeviceToHost), "CUDA Memcpy Failed"); - if (debugFlag) { stop = std::chrono::high_resolution_clock::now(); From b678bbf2cf384e398257afa89ca542401a4c6750 Mon Sep 17 00:00:00 2001 From: helena_moyen Date: Sat, 23 Aug 2025 19:13:05 -0300 Subject: [PATCH 09/16] cleanup and remove redundancy --- .../multibeam_sonar/sonar_calculation_cuda.cu | 167 +++--------------- 1 file changed, 27 insertions(+), 140 deletions(-) diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu index 6eac839f..34354ac8 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu @@ -95,15 +95,6 @@ static thrust::complex * P_Beams = nullptr; static bool memory_initialized = false; float * ray_elevationAngles = nullptr; -float *P_Ray_real, *P_Ray_imag; -float *d_P_Ray_real, *d_P_Ray_imag; -int P_Ray_N; -int P_Ray_Bytes; -float *P_Ray_F_real, *P_Ray_F_imag; -float *d_P_Ray_F_real, *d_P_Ray_F_imag; -int P_Ray_F_N; -int P_Ray_F_Bytes; - float *P_Beams_Cor_real_h = nullptr, *P_Beams_Cor_imag_h = nullptr; float *d_P_Beams_Cor_real = nullptr, *d_P_Beams_Cor_imag = nullptr; float *d_P_Beams_Cor_F_real = nullptr, *d_P_Beams_Cor_F_imag = nullptr; @@ -114,6 +105,10 @@ int P_Beams_Cor_Bytes; int beamCorrector_lin_N; int beamCorrector_lin_Bytes; +// FFT +cufftComplex * deviceInputData; +cufftComplex * deviceOutputData; + /////////////////////////////////////////////////////////////////////////// // Incident Angle Calculation Function // incidence angle is target's normal angle accounting for the ray's azimuth @@ -193,7 +188,7 @@ __global__ void reduce_beams_kernel( if (threadIdx.x == 0) { - int output_idx = beam * nFreq + f; + int output_idx = f * nBeams + beam; d_P_Beams_F_real[output_idx] = shared_real[0]; d_P_Beams_F_imag[output_idx] = shared_imag[0]; } @@ -321,17 +316,11 @@ void free_cuda_memory() SAFE_CALL(cudaFree(d_reflectivity_image), "cudaFree failed for d_reflectivity_image"); SAFE_CALL(cudaFree(d_ray_elevationAngles), "cudaFree failed for d_ray_elevationAngles"); SAFE_CALL(cudaFree(d_P_Beams), "cudaFree failed for d_P_Beams"); + SAFE_CALL(cudaFree(deviceOutputData), "cudaFree failed for OutputData"); + SAFE_CALL(cudaFree(deviceInputData), "cudaFree failed for InputData"); SAFE_CALL(cudaFreeHost(P_Beams), "cudaFreeHost failed for P_Beams"); SAFE_CALL(cudaFree(d_P_Beams_F_real), "cudaFree failed for d_P_Beams_F_real"); SAFE_CALL(cudaFree(d_P_Beams_F_imag), "cudaFree failed for d_P_Beams_F_imag"); - SAFE_CALL(cudaFreeHost(P_Ray_real), "cudaFreeHost failed for P_Ray_real"); - SAFE_CALL(cudaFreeHost(P_Ray_imag), "cudaFreeHost failed for P_Ray_imag"); - SAFE_CALL(cudaFreeHost(P_Ray_F_real), "cudaFreeHost failed for P_Ray_F_real"); - SAFE_CALL(cudaFreeHost(P_Ray_F_imag), "cudaFreeHost failed for P_Ray_F_imag"); - SAFE_CALL(cudaFree(d_P_Ray_real), "cudaFree failed for d_P_Ray_real"); - SAFE_CALL(cudaFree(d_P_Ray_imag), "cudaFree failed for d_P_Ray_imag"); - SAFE_CALL(cudaFree(d_P_Ray_F_real), "cudaFree failed for d_P_Ray_F_real"); - SAFE_CALL(cudaFree(d_P_Ray_F_imag), "cudaFree failed for d_P_Ray_F_imag"); SAFE_CALL(cudaFree(d_P_Beams_Cor_imag), "cudaFree failed for d_P_Beams_Cor_imag"); SAFE_CALL(cudaFree(d_P_Beams_Cor_real), "cudaFree failed for d_P_Beams_Cor_real"); SAFE_CALL(cudaFree(d_P_Beams_Cor_F_imag), "cudaFree failed for d_P_Beams_Cor_F_imag"); @@ -340,7 +329,6 @@ void free_cuda_memory() SAFE_CALL(cudaFreeHost(P_Beams_Cor_real_h), "cudaFreeHost failed for P_Beams_Cor_real_h"); SAFE_CALL(cudaFreeHost(P_Beams_Cor_imag_h), "cudaFreeHost failed for P_Beams_Cor_imag_h"); SAFE_CALL(cudaFreeHost(beamCorrector_lin_h), "cudaFreeHost failed for beamCorrector_lin_h"); - memory_initialized = false; } @@ -406,18 +394,16 @@ CArray2D sonar_calculation_wrapper( const int reflectivity_image_Bytes = reflectivity_image.step * reflectivity_image.rows; const int ray_elevationAngles_Bytes = sizeof(float) * nRays; const int P_Beams_N = nBeams * (int)(nRays / raySkips) * (nFreq + 1); - const int P_Beams_Bytes = sizeof(thrust::complex) * P_Beams_N; - - P_Ray_N = (int)(nRays / raySkips) * (nFreq); - P_Ray_Bytes = sizeof(float) * P_Ray_N; - P_Ray_F_N = (nFreq) * 1; - P_Ray_F_Bytes = sizeof(float) * P_Ray_F_N; P_Beams_Cor_N = nBeams * nFreq; P_Beams_Cor_Bytes = sizeof(float) * P_Beams_Cor_N; beamCorrector_lin_N = nBeams * nBeams; beamCorrector_lin_Bytes = sizeof(float) * beamCorrector_lin_N; + // FFT Parameters + const int DATASIZE = nFreq; + const int BATCH = nBeams; + if (!memory_initialized) { SAFE_CALL( @@ -441,20 +427,6 @@ CArray2D sonar_calculation_wrapper( "P_Beams malloc device"); SAFE_CALL(cudaMalloc(&d_P_Beams_F_real, sizeof(float) * nBeams * nFreq), "beam real malloc"); SAFE_CALL(cudaMalloc(&d_P_Beams_F_imag, sizeof(float) * nBeams * nFreq), "beam imag malloc"); - SAFE_CALL( - cudaMallocHost((void **)&P_Ray_real, P_Ray_Bytes), "CUDA MallocHost Failed for P_Ray_real"); - SAFE_CALL( - cudaMallocHost((void **)&P_Ray_imag, P_Ray_Bytes), "CUDA MallocHost Failed for P_Ray_imag"); - SAFE_CALL( - cudaMallocHost((void **)&P_Ray_F_real, P_Ray_F_Bytes), - "CUDA MallocHost Failed for P_Ray_F_real"); - SAFE_CALL( - cudaMallocHost((void **)&P_Ray_F_imag, P_Ray_F_Bytes), - "CUDA MallocHost Failed for P_Ray_F_imag"); - SAFE_CALL(cudaMalloc((void **)&d_P_Ray_real, P_Ray_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_P_Ray_imag, P_Ray_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_P_Ray_F_real, P_Ray_F_Bytes), "CUDA Malloc Failed"); - SAFE_CALL(cudaMalloc((void **)&d_P_Ray_F_imag, P_Ray_F_Bytes), "CUDA Malloc Failed"); SAFE_CALL( cudaMallocHost((void **)&P_Beams_Cor_real_h, P_Beams_Cor_Bytes), "CUDA MallocHost Failed for P_Beams_Cor_real_h"); @@ -479,6 +451,13 @@ CArray2D sonar_calculation_wrapper( SAFE_CALL( cudaMalloc((void **)&d_beamCorrector_lin, beamCorrector_lin_Bytes), "CUDA Malloc Failed for d_beamCorrector_lin"); + SAFE_CALL( + cudaMalloc((void **)&deviceInputData, DATASIZE * BATCH * sizeof(cufftComplex)), + "FFT CUDA Malloc Failed"); + SAFE_CALL( + cudaMalloc((void **)&deviceOutputData, DATASIZE * BATCH * sizeof(cufftComplex)), + "FFT CUDA Malloc Failed"); + memory_initialized = true; } @@ -515,13 +494,6 @@ CArray2D sonar_calculation_wrapper( const dim3 grid( (depth_image.cols + block.x - 1) / block.x, (depth_image.rows + block.y - 1) / block.y); - cudaEvent_t start1, stop1; - float milliseconds = 0; - - cudaEventCreate(&start1); - cudaEventCreate(&stop1); - cudaEventRecord(start1); - // Launch the beamor conversion kernel sonar_calculation<<>>( d_P_Beams, d_depth_image, d_normal_image, normal_image.cols, normal_image.rows, @@ -534,13 +506,6 @@ CArray2D sonar_calculation_wrapper( // Synchronize to check for any kernel launch errors SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); - cudaEventRecord(stop1); - cudaEventSynchronize(stop1); - - cudaEventElapsedTime(&milliseconds, start1, stop1); - cudaEventDestroy(start1); - cudaEventDestroy(stop1); - if (debugFlag) { stop = std::chrono::high_resolution_clock::now(); @@ -566,30 +531,10 @@ CArray2D sonar_calculation_wrapper( dim3 gridDim(nFreq, nBeams); // one thread block per beam+freq pair reduce_beams_kernel<<>>( - d_P_Beams, d_P_Beams_F_real, d_P_Beams_F_imag, nBeams, nFreq, nRays / raySkips); + d_P_Beams, d_P_Beams_Cor_real, d_P_Beams_Cor_imag, nBeams, nFreq, nRays / raySkips); SAFE_CALL(cudaGetLastError(), "reduce_beams_kernel launch failed"); SAFE_CALL(cudaDeviceSynchronize(), "reduce_beams_kernel execution failed"); - std::vector P_Beams_F_real(nBeams * nFreq); - std::vector P_Beams_F_imag(nBeams * nFreq); - - cudaMemcpy( - P_Beams_F_real.data(), d_P_Beams_F_real, sizeof(float) * nBeams * nFreq, - cudaMemcpyDeviceToHost); - cudaMemcpy( - P_Beams_F_imag.data(), d_P_Beams_F_imag, sizeof(float) * nBeams * nFreq, - cudaMemcpyDeviceToHost); - - CArray2D P_Beams_F(CArray(nFreq), nBeams); - for (int beam = 0; beam < nBeams; beam++) - { - for (int f = 0; f < nFreq; f++) - { - P_Beams_F[beam][f] = - Complex(P_Beams_F_real[beam * nFreq + f], P_Beams_F_imag[beam * nFreq + f]); - } - } - if (debugFlag) { stop = std::chrono::high_resolution_clock::now(); @@ -608,10 +553,7 @@ CArray2D sonar_calculation_wrapper( // --- Beam culling correction --- // beamCorrector and beamCorrectorSum is precalculated at parent cpp - cudaEvent_t start_event, stop_event; - float elapsed_ms = 0.0f; - cudaEventCreate(&start_event); - cudaEventCreate(&stop_event); + CArray2D P_Beams_F(CArray(nFreq), nBeams); cublasHandle_t cublas_handle; cublasStatus_t stat = cublasCreate(&cublas_handle); @@ -623,24 +565,12 @@ CArray2D sonar_calculation_wrapper( for (size_t beam = 0; beam < nBeams; beam++) { - for (size_t f = 0; f < nFreq; f++) - { - P_Beams_Cor_real_h[f * nBeams + beam] = P_Beams_F[beam][f].real(); - P_Beams_Cor_imag_h[f * nBeams + beam] = P_Beams_F[beam][f].imag(); - } for (size_t beam_other = 0; beam_other < nBeams; beam_other++) { beamCorrector_lin_h[beam_other * nBeams + beam] = beamCorrector[beam][beam_other]; } } - // Copy data to device - SAFE_CALL( - cudaMemcpy(d_P_Beams_Cor_real, P_Beams_Cor_real_h, P_Beams_Cor_Bytes, cudaMemcpyHostToDevice), - "CUDA Memcpy Failed (d_P_Beams_Cor_real)"); - SAFE_CALL( - cudaMemcpy(d_P_Beams_Cor_imag, P_Beams_Cor_imag_h, P_Beams_Cor_Bytes, cudaMemcpyHostToDevice), - "CUDA Memcpy Failed (d_P_Beams_Cor_imag)"); SAFE_CALL( cudaMemcpy( d_beamCorrector_lin, beamCorrector_lin_h, beamCorrector_lin_Bytes, cudaMemcpyHostToDevice), @@ -659,7 +589,6 @@ CArray2D sonar_calculation_wrapper( const float beta = 0.0f; // Setting beta to 0.0f to overwrite C // Real part - cudaEventRecord(start_event); SAFE_CUBLAS_CALL( cublasSgemm( cublas_handle, @@ -682,12 +611,6 @@ CArray2D sonar_calculation_wrapper( N_gemm, d_P_Beams_Cor_imag, K_gemm, &beta, d_P_Beams_Cor_F_imag, N_gemm), "cublasSgemm Failed for imag part"); - cudaEventRecord(stop_event); - cudaEventSynchronize(stop_event); - cudaEventElapsedTime(&elapsed_ms, start_event, stop_event); - printf("cuBLAS cublasSgemm time: %.3f ms\n", elapsed_ms); - cudaEventDestroy(start_event); - cudaEventDestroy(stop_event); SAFE_CUBLAS_CALL(cublasDestroy_v2(cublas_handle), "CUBLAS_STATUS_ALLOC_FAILED"); SAFE_CALL( @@ -697,16 +620,6 @@ CArray2D sonar_calculation_wrapper( cudaMemcpy(P_Beams_Cor_imag_h, d_P_Beams_Cor_F_imag, P_Beams_Cor_Bytes, cudaMemcpyDeviceToHost), "CUDA Memcpy Failed for P_Beams_Cor_imag_h"); - for (size_t beam = 0; beam < nBeams; beam++) - { - for (size_t f = 0; f < nFreq; f++) - { - P_Beams_F[beam][f] = Complex( - P_Beams_Cor_real_h[f * nBeams + beam] / beamCorrectorSum, - P_Beams_Cor_imag_h[f * nBeams + beam] / beamCorrectorSum); - } - } - if (debugFlag) { stop = std::chrono::high_resolution_clock::now(); @@ -728,29 +641,11 @@ CArray2D sonar_calculation_wrapper( // ################### FFT #####################// // #################################################// SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); - const int DATASIZE = nFreq; - const int BATCH = nBeams; // --- Host side input data allocation and initialization cufftComplex * hostInputData = (cufftComplex *)malloc(DATASIZE * BATCH * sizeof(cufftComplex)); start = std::chrono::high_resolution_clock::now(); - // for (int beam = 0; beam < BATCH; beam++) - // { - // for (int f = 0; f < DATASIZE; f++) - // { - // if (f < nFreq) - // { - // hostInputData[beam * DATASIZE + f] = - // make_cuComplex(P_Beams_F[beam][f].real() * 1.0f, P_Beams_F[beam][f].imag() * 1.0f); - // } - // else - // { - // hostInputData[beam * DATASIZE + f] = (make_cuComplex(0.f, 0.f)); // zero padding - // } - // } - // } - #pragma omp parallel for collapse(2) for (int beam = 0; beam < BATCH; ++beam) { @@ -758,15 +653,13 @@ CArray2D sonar_calculation_wrapper( { int idx = beam * DATASIZE + f; const std::complex & val = P_Beams_F[beam][f]; - hostInputData[idx] = make_cuComplex(val.real(), val.imag()); + hostInputData[idx] = make_cuComplex( + P_Beams_Cor_real_h[f * nBeams + beam] / beamCorrectorSum, + P_Beams_Cor_imag_h[f * nBeams + beam] / beamCorrectorSum); } } // --- Device side input data allocation and initialization - cufftComplex * deviceInputData; - SAFE_CALL( - cudaMalloc((void **)&deviceInputData, DATASIZE * BATCH * sizeof(cufftComplex)), - "FFT CUDA Malloc Failed"); SAFE_CALL( cudaMemcpy( deviceInputData, hostInputData, DATASIZE * BATCH * sizeof(cufftComplex), @@ -776,10 +669,6 @@ CArray2D sonar_calculation_wrapper( // --- Host side output data allocation cufftComplex * hostOutputData = (cufftComplex *)malloc(DATASIZE * BATCH * sizeof(cufftComplex)); - // --- Device side output data allocation - cufftComplex * deviceOutputData; - cudaMalloc((void **)&deviceOutputData, DATASIZE * BATCH * sizeof(cufftComplex)); - // --- Batched 1D FFTs cufftHandle handle; int rank = 1; // --- 1D FFTs @@ -803,12 +692,6 @@ CArray2D sonar_calculation_wrapper( cudaMemcpyDeviceToHost), "FFT CUDA Memcopy Failed"); - cufftDestroy(handle); - cudaFree(deviceOutputData); - cudaFree(deviceInputData); - free(hostInputData); - free(hostOutputData); - for (int beam = 0; beam < BATCH; beam++) { for (int f = 0; f < nFreq; f++) @@ -819,6 +702,10 @@ CArray2D sonar_calculation_wrapper( } } + cufftDestroy(handle); + free(hostInputData); + free(hostOutputData); + if (debugFlag) { stop = std::chrono::high_resolution_clock::now(); From 629271eb885a7652a07e894088a87a421d826e8e Mon Sep 17 00:00:00 2001 From: Woen-Sug Choi Date: Mon, 25 Aug 2025 14:32:00 +0900 Subject: [PATCH 10/16] minor fix --- gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt index 211eaba7..f9c64b66 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt @@ -16,7 +16,7 @@ set(CUDA_ARCHITECTURE "60" CACHE STRING "Target CUDA SM version") find_package(ament_cmake REQUIRED) find_package(CUDAToolkit QUIET) -if(BUILD_WITH_CUDA AND CUDAToolkit_FOUND) +if(CUDAToolkit_FOUND) enable_language(CUDA) find_package(CUDA REQUIRED) From e86b7e0912b216e66709b313ff3b82a30052eaca Mon Sep 17 00:00:00 2001 From: Woen-Sug Choi Date: Mon, 25 Aug 2025 15:22:11 +0900 Subject: [PATCH 11/16] fix blazing sonar image and add bool handle --- .../multibeam_sonar/MultibeamSonarSensor.cc | 18 +++---- .../multibeam_sonar/MultibeamSonarSensor.hh | 3 ++ .../multibeam_sonar/sonar_calculation_cuda.cu | 54 ++++++++++--------- .../sonar_calculation_cuda.cuh | 15 +++--- .../description/bluerov2_heavy/model.sdf | 5 +- .../description/blueview_p900/model.sdf | 5 +- 6 files changed, 51 insertions(+), 49 deletions(-) diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.cc b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.cc index 3ef17146..26f86b18 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.cc +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.cc @@ -372,6 +372,9 @@ bool MultibeamSonarSensor::Implementation::InitializeBeamArrangement(MultibeamSo this->debugFlag = sensorElement->Get("debugFlag", false).first; gzmsg << "Debug: " << this->debugFlag << std::endl; + this->blazingFlag = sensorElement->Get("blazingSonarImage", false).first; + gzmsg << "BlazingSonarImage: " << this->blazingFlag << std::endl; + this->pointCloudTopicName = sensorElement->Get("pointCloudTopicName", "point_cloud").first; gzmsg << "pointCloudTopicName: " << this->pointCloudTopicName << std::endl; @@ -648,16 +651,6 @@ bool MultibeamSonarSensor::Implementation::InitializeBeamArrangement(MultibeamSo // -- Pre calculations for sonar -- // - // Random number generator - gzmsg << "Initializing random number generator..." << std::endl; - this->randImage = cv::Mat(this->pointMsg.height(), this->pointMsg.width(), CV_32FC2); - uint64 randN = static_cast(std::rand()); - gzmsg << "Random seed: " << randN << std::endl; - cv::theRNG().state = randN; - cv::RNG rng = cv::theRNG(); - rng.fill(this->randImage, cv::RNG::NORMAL, 0.0f, 1.0f); - gzmsg << "Random image generated with normal distribution." << std::endl; - // Hamming window gzmsg << "Computing Hamming window for " << this->nFreq << " frequencies." << std::endl; this->window = new float[this->nFreq]; @@ -1069,7 +1062,6 @@ void MultibeamSonarSensor::Implementation::ComputeSonarImage() CArray2D P_Beams = NpsGazeboSonar::sonar_calculation_wrapper( this->pointCloudImage, // cv::Mat& depth_image (the point cloud image) normal_image, // cv::Mat& normal_image - this->randImage, // cv::Mat& rand_image hPixelSize, // hPixelSize vPixelSize, // vPixelSize hFOV, // hFOV @@ -1093,7 +1085,9 @@ void MultibeamSonarSensor::Implementation::ComputeSonarImage() this->window, // _window this->beamCorrector, // _beamCorrector this->beamCorrectorSum, // _beamCorrectorSum - this->debugFlag); + this->debugFlag, // debugFlag + this->blazingFlag // _blazingFlag + ); // For calc time measure auto stop = std::chrono::high_resolution_clock::now(); diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.hh b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.hh index 7ccfec3d..b7790e3a 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.hh +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.hh @@ -160,6 +160,9 @@ private: int ray_nElevationRays; float * rangeVector; + // Sonar image parameters + bool blazingFlag; + // Debug flags and reflectivity bool debugFlag; bool constMu; diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu index 34354ac8..4bef3e77 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu @@ -37,6 +37,7 @@ #include #include +#include // FOR DEBUG -- DEV VERSION #include @@ -85,7 +86,6 @@ static inline void _safe_cuda_call( // Persistent GPU memory pointers (device) static float * d_depth_image = nullptr; static float * d_normal_image = nullptr; -static float * d_rand_image = nullptr; static float * d_reflectivity_image = nullptr; static float * d_ray_elevationAngles = nullptr; static float * d_P_Beams_F_real = nullptr; @@ -198,7 +198,7 @@ __global__ void reduce_beams_kernel( // Sonar Claculation Function __global__ void sonar_calculation( thrust::complex * P_Beams, float * depth_image, float * normal_image, int width, - int height, int depth_image_step, int normal_image_step, float * rand_image, int rand_image_step, + int height, int depth_image_step, int normal_image_step, unsigned long long seed, float * reflectivity_image, int reflectivity_image_step, float hPixelSize, float vPixelSize, float hFOV, float vFOV, float beam_azimuthAngleWidth, float beam_elevationAngleWidth, float ray_azimuthAngleWidth, float * ray_elevationAngles, float ray_elevationAngleWidth, @@ -216,7 +216,6 @@ __global__ void sonar_calculation( // Location of the image pixel const int depth_index = ray * depth_image_step / sizeof(float) + beam; const int normal_index = ray * normal_image_step / sizeof(float) + (3 * beam); - const int rand_index = ray * rand_image_step / sizeof(float) + (2 * beam); const int reflectivity_index = ray * reflectivity_image_step / sizeof(float) + beam; // Input parameters for ray processing @@ -239,9 +238,10 @@ __global__ void sonar_calculation( acos(normal[2]); // compute_incidence(ray_azimuthAngle, ray_elevationAngle, normal); // ----- Point scattering model ------ // - // Gaussian noise generated using opencv RNG - float xi_z = rand_image[rand_index]; - float xi_y = rand_image[rand_index + 1]; + curandState state; + curand_init(seed, beam * height + ray, 0, &state); // seed, unique id, offset, state + float xi_z = curand_normal(&state); // standard normal random value + float xi_y = curand_normal(&state); // standard normal random value // Calculate amplitude thrust::complex randomAmps = thrust::complex(xi_z / sqrt(2.0), xi_y / sqrt(2.0)); @@ -312,7 +312,6 @@ void free_cuda_memory() { SAFE_CALL(cudaFree(d_depth_image), "cudaFree failed for d_depth_image"); SAFE_CALL(cudaFree(d_normal_image), "cudaFree failed for d_normal_image"); - SAFE_CALL(cudaFree(d_rand_image), "cudaFree failed for d_rand_image"); SAFE_CALL(cudaFree(d_reflectivity_image), "cudaFree failed for d_reflectivity_image"); SAFE_CALL(cudaFree(d_ray_elevationAngles), "cudaFree failed for d_ray_elevationAngles"); SAFE_CALL(cudaFree(d_P_Beams), "cudaFree failed for d_P_Beams"); @@ -334,14 +333,13 @@ void free_cuda_memory() // Sonar Claculation Function Wrapper CArray2D sonar_calculation_wrapper( - const cv::Mat & depth_image, const cv::Mat & normal_image, const cv::Mat & rand_image, - double _hPixelSize, double _vPixelSize, double _hFOV, double _vFOV, - double _beam_azimuthAngleWidth, double _beam_elevationAngleWidth, double _ray_azimuthAngleWidth, - float * _ray_elevationAngles, double _ray_elevationAngleWidth, double _soundSpeed, - double _maxDistance, double _sourceLevel, int _nBeams, int _nRays, int _raySkips, - double _sonarFreq, double _bandwidth, int _nFreq, const cv::Mat & reflectivity_image, - double _attenuation, float * window, float ** beamCorrector, float beamCorrectorSum, - bool debugFlag) + const cv::Mat & depth_image, const cv::Mat & normal_image, double _hPixelSize, double _vPixelSize, + double _hFOV, double _vFOV, double _beam_azimuthAngleWidth, double _beam_elevationAngleWidth, + double _ray_azimuthAngleWidth, float * _ray_elevationAngles, double _ray_elevationAngleWidth, + double _soundSpeed, double _maxDistance, double _sourceLevel, int _nBeams, int _nRays, + int _raySkips, double _sonarFreq, double _bandwidth, int _nFreq, + const cv::Mat & reflectivity_image, double _attenuation, float * window, float ** beamCorrector, + float beamCorrectorSum, bool debugFlag, bool blazingFlag) { // Start the timer for the entire function auto total_start_time = std::chrono::high_resolution_clock::now(); @@ -390,7 +388,6 @@ CArray2D sonar_calculation_wrapper( // Calculate total number of bytes of input and output image const int depth_image_Bytes = depth_image.step * depth_image.rows; const int normal_image_Bytes = normal_image.step * normal_image.rows; - const int rand_image_Bytes = rand_image.step * rand_image.rows; const int reflectivity_image_Bytes = reflectivity_image.step * reflectivity_image.rows; const int ray_elevationAngles_Bytes = sizeof(float) * nRays; const int P_Beams_N = nBeams * (int)(nRays / raySkips) * (nFreq + 1); @@ -413,7 +410,6 @@ CArray2D sonar_calculation_wrapper( "MallocHost ray_elevationAngles"); SAFE_CALL( cudaMalloc((void **)&d_normal_image, normal_image.step * normal_image.rows), "normal malloc"); - SAFE_CALL(cudaMalloc((void **)&d_rand_image, rand_image.step * rand_image.rows), "rand malloc"); SAFE_CALL( cudaMalloc((void **)&d_reflectivity_image, reflectivity_image.step * reflectivity_image.rows), "reflectivity malloc"); @@ -473,9 +469,6 @@ CArray2D sonar_calculation_wrapper( SAFE_CALL( cudaMemcpy(d_normal_image, normal_image.ptr(), normal_image_Bytes, cudaMemcpyHostToDevice), "CUDA Memcpy Failed"); - SAFE_CALL( - cudaMemcpy(d_rand_image, rand_image.ptr(), rand_image_Bytes, cudaMemcpyHostToDevice), - "CUDA Memcpy Failed"); SAFE_CALL( cudaMemcpy( d_reflectivity_image, reflectivity_image.ptr(), reflectivity_image_Bytes, @@ -494,14 +487,25 @@ CArray2D sonar_calculation_wrapper( const dim3 grid( (depth_image.cols + block.x - 1) / block.x, (depth_image.rows + block.y - 1) / block.y); + // Random seed for blazing sonar image + unsigned long long seed; + if (blazingFlag) + { + seed = static_cast(time(NULL)); + } + else + { + seed = 1234; + } + // Launch the beamor conversion kernel sonar_calculation<<>>( d_P_Beams, d_depth_image, d_normal_image, normal_image.cols, normal_image.rows, - depth_image.step, normal_image.step, d_rand_image, rand_image.step, d_reflectivity_image, - reflectivity_image.step, hPixelSize, vPixelSize, hFOV, vFOV, beam_azimuthAngleWidth, - beam_elevationAngleWidth, ray_azimuthAngleWidth, d_ray_elevationAngles, ray_elevationAngleWidth, - soundSpeed, sourceTerm, nBeams, nRays, raySkips, sonarFreq, delta_f, nFreq, bandwidth, - max_distance, attenuation, area_scaler); + depth_image.step, normal_image.step, seed, d_reflectivity_image, reflectivity_image.step, + hPixelSize, vPixelSize, hFOV, vFOV, beam_azimuthAngleWidth, beam_elevationAngleWidth, + ray_azimuthAngleWidth, d_ray_elevationAngles, ray_elevationAngleWidth, soundSpeed, sourceTerm, + nBeams, nRays, raySkips, sonarFreq, delta_f, nFreq, bandwidth, max_distance, attenuation, + area_scaler); // Synchronize to check for any kernel launch errors SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed"); diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cuh b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cuh index ef78b07a..651c05da 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cuh +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cuh @@ -45,12 +45,11 @@ void free_cuda_memory(); /// \brief Sonar Claculation Function Wrapper CArray2D sonar_calculation_wrapper( - const cv::Mat & depth_image, const cv::Mat & normal_image, const cv::Mat & rand_image, - double _hPixelSize, double _vPixelSize, double _hFOV, double _vFOV, - double _beam_azimuthAngleWidth, double _beam_elevationAngleWidth, double _ray_azimuthAngleWidth, - float * _ray_elevationAngles, double _ray_elevationAngleWidth, double _soundSpeed, - double _maxDistance, double _sourceLevel, int _nBeams, int _nRays, int _raySkips, - double _sonarFreq, double _bandwidth, int _nFreq, const cv::Mat & reflectivity_image, - double _attenuation, float * _window, float ** _beamCorrector, float _beamCorrectorSum, - bool _debugFlag); + const cv::Mat & depth_image, const cv::Mat & normal_image, double _hPixelSize, double _vPixelSize, + double _hFOV, double _vFOV, double _beam_azimuthAngleWidth, double _beam_elevationAngleWidth, + double _ray_azimuthAngleWidth, float * _ray_elevationAngles, double _ray_elevationAngleWidth, + double _soundSpeed, double _maxDistance, double _sourceLevel, int _nBeams, int _nRays, + int _raySkips, double _sonarFreq, double _bandwidth, int _nFreq, + const cv::Mat & reflectivity_image, double _attenuation, float * _window, float ** _beamCorrector, + float _beamCorrectorSum, bool _debugFlag, bool _blazingFlag); } // namespace NpsGazeboSonar \ No newline at end of file diff --git a/models/dave_robot_models/description/bluerov2_heavy/model.sdf b/models/dave_robot_models/description/bluerov2_heavy/model.sdf index d5084caa..ab3f53fe 100644 --- a/models/dave_robot_models/description/bluerov2_heavy/model.sdf +++ b/models/dave_robot_models/description/bluerov2_heavy/model.sdf @@ -165,8 +165,9 @@ 10 10 0.02 - true - true + true + false + false 5 point_cloud diff --git a/models/dave_sensor_models/description/blueview_p900/model.sdf b/models/dave_sensor_models/description/blueview_p900/model.sdf index 3745c2e3..dff61f43 100644 --- a/models/dave_sensor_models/description/blueview_p900/model.sdf +++ b/models/dave_sensor_models/description/blueview_p900/model.sdf @@ -90,8 +90,9 @@ 10 10 0.02 - true - true + true + false + false 5 point_cloud From 2394ee3fd1e169d45571e5aa4f34ddd17348b707 Mon Sep 17 00:00:00 2001 From: helena_moyen Date: Mon, 25 Aug 2025 03:35:31 -0300 Subject: [PATCH 12/16] clean up not used vectors --- .../multibeam_sonar/sonar_calculation_cuda.cu | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu index 4bef3e77..de8d3dd2 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu @@ -88,8 +88,6 @@ static float * d_depth_image = nullptr; static float * d_normal_image = nullptr; static float * d_reflectivity_image = nullptr; static float * d_ray_elevationAngles = nullptr; -static float * d_P_Beams_F_real = nullptr; -static float * d_P_Beams_F_imag = nullptr; static thrust::complex * d_P_Beams = nullptr; static thrust::complex * P_Beams = nullptr; static bool memory_initialized = false; @@ -145,8 +143,8 @@ __device__ __host__ float unnormalized_sinc(float t) } __global__ void reduce_beams_kernel( - const thrust::complex * __restrict__ d_P_Beams, float * d_P_Beams_F_real, - float * d_P_Beams_F_imag, int nBeams, int nFreq, int nRaysSkipped) + const thrust::complex * __restrict__ d_P_Beams, float * d_P_Beams_Cor_real, + float * d_P_Beams_Cor_imag, int nBeams, int nFreq, int nRaysSkipped) { const int beam = blockIdx.y; const int f = blockIdx.x; @@ -189,8 +187,8 @@ __global__ void reduce_beams_kernel( if (threadIdx.x == 0) { int output_idx = f * nBeams + beam; - d_P_Beams_F_real[output_idx] = shared_real[0]; - d_P_Beams_F_imag[output_idx] = shared_imag[0]; + d_P_Beams_Cor_real[output_idx] = shared_real[0]; + d_P_Beams_Cor_imag[output_idx] = shared_imag[0]; } } @@ -318,8 +316,6 @@ void free_cuda_memory() SAFE_CALL(cudaFree(deviceOutputData), "cudaFree failed for OutputData"); SAFE_CALL(cudaFree(deviceInputData), "cudaFree failed for InputData"); SAFE_CALL(cudaFreeHost(P_Beams), "cudaFreeHost failed for P_Beams"); - SAFE_CALL(cudaFree(d_P_Beams_F_real), "cudaFree failed for d_P_Beams_F_real"); - SAFE_CALL(cudaFree(d_P_Beams_F_imag), "cudaFree failed for d_P_Beams_F_imag"); SAFE_CALL(cudaFree(d_P_Beams_Cor_imag), "cudaFree failed for d_P_Beams_Cor_imag"); SAFE_CALL(cudaFree(d_P_Beams_Cor_real), "cudaFree failed for d_P_Beams_Cor_real"); SAFE_CALL(cudaFree(d_P_Beams_Cor_F_imag), "cudaFree failed for d_P_Beams_Cor_F_imag"); @@ -421,8 +417,6 @@ CArray2D sonar_calculation_wrapper( SAFE_CALL( cudaMalloc((void **)&d_P_Beams, sizeof(thrust::complex) * P_Beams_N), "P_Beams malloc device"); - SAFE_CALL(cudaMalloc(&d_P_Beams_F_real, sizeof(float) * nBeams * nFreq), "beam real malloc"); - SAFE_CALL(cudaMalloc(&d_P_Beams_F_imag, sizeof(float) * nBeams * nFreq), "beam imag malloc"); SAFE_CALL( cudaMallocHost((void **)&P_Beams_Cor_real_h, P_Beams_Cor_Bytes), "CUDA MallocHost Failed for P_Beams_Cor_real_h"); From 4ff3de58664ca3814031bc4d52ba5e61ba5a3b9d Mon Sep 17 00:00:00 2001 From: helena_moyen Date: Mon, 25 Aug 2025 04:39:31 -0300 Subject: [PATCH 13/16] remove pcl dependencies --- .../multibeam_sonar/CMakeLists.txt | 9 --------- .../multibeam_sonar/MultibeamSonarSensor.cc | 6 +----- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt index f9c64b66..3f0b87d2 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt @@ -32,16 +32,10 @@ if(CUDAToolkit_FOUND) find_package(sensor_msgs REQUIRED) find_package(geometry_msgs REQUIRED) find_package(rosidl_default_generators REQUIRED) - find_package(PCL REQUIRED) - find_package(pcl_conversions REQUIRED) find_package(OpenCV REQUIRED) find_package(marine_acoustic_msgs REQUIRED) find_package(cv_bridge REQUIRED) - include_directories(${PCL_INCLUDE_DIRS}) - link_directories(${PCL_LIBRARY_DIRS}) - add_definitions(${PCL_DEFINITIONS}) - set(GZ_MSGS_VER ${gz-msgs10_VERSION_MAJOR}) set(GZ_RENDERING_VER ${gz-rendering8_VERSION_MAJOR}) set(GZ_SENSORS_VER ${gz-sensors8_VERSION_MAJOR}) @@ -88,7 +82,6 @@ if(CUDAToolkit_FOUND) endif() target_link_libraries(${PROJECT_NAME} - ${PCL_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUBLAS_LIB} @@ -107,8 +100,6 @@ if(CUDAToolkit_FOUND) rclcpp sensor_msgs rosidl_default_generators - PCL - pcl_conversions OpenCV marine_acoustic_msgs geometry_msgs diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.cc b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.cc index 26f86b18..310e9568 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.cc +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/MultibeamSonarSensor.cc @@ -48,11 +48,7 @@ #include "MultibeamSonarSensor.hh" #include "sonar_calculation_cuda.cuh" -#include -#include -#include -#include -#include +#include #include #include #include From b62d18a747153b78cfc73a86e060f96c09e49466 Mon Sep 17 00:00:00 2001 From: helena_moyen Date: Mon, 25 Aug 2025 04:58:58 -0300 Subject: [PATCH 14/16] change curand state type --- .../multibeam_sonar/sonar_calculation_cuda.cu | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu index de8d3dd2..7991d0d6 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu @@ -236,10 +236,13 @@ __global__ void sonar_calculation( acos(normal[2]); // compute_incidence(ray_azimuthAngle, ray_elevationAngle, normal); // ----- Point scattering model ------ // - curandState state; + curandStatePhilox4_32_10_t state; curand_init(seed, beam * height + ray, 0, &state); // seed, unique id, offset, state - float xi_z = curand_normal(&state); // standard normal random value - float xi_y = curand_normal(&state); // standard normal random value + + float4 xi = curand_normal4(&state); // standard 4 normal random values + + float xi_z = xi.x; + float xi_y = xi.y; // Calculate amplitude thrust::complex randomAmps = thrust::complex(xi_z / sqrt(2.0), xi_y / sqrt(2.0)); From 0f20acd5608aa0e4da01a0eab0255ae530b50712 Mon Sep 17 00:00:00 2001 From: helena_moyen Date: Tue, 26 Aug 2025 19:00:50 -0300 Subject: [PATCH 15/16] replace functions to use gpu intrinsic ones --- .../multibeam_sonar/sonar_calculation_cuda.cu | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu index 7991d0d6..f01da386 100644 --- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu +++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_calculation_cuda.cu @@ -36,6 +36,8 @@ #include #include +#include +#include #include #include @@ -270,17 +272,18 @@ __global__ void sonar_calculation( float freq; if (nFreq % 2 == 0) { - freq = delta_f * (-nFreq / 2.0f + f + 1.0f); + freq = __fdividef(delta_f * (-nFreq + 2.0f * (f + 1.0f)), 2.0f); } else { - freq = delta_f * (-(nFreq - 1) / 2.0f + f + 1.0f); + freq = __fdividef(delta_f * (-(nFreq - 1) + 2.0f * (f + 1.0f)), 2.0f); } - float kw = 2.0f * M_PI * freq / soundSpeed; // wave vector + + float kw = __fdividef(2.0f * M_PI * freq, soundSpeed); // wave vector float phase = 2.0f * distance * kw; float s, c; - __sincosf(phase, &s, &c); + __sincosf(phase, &s, &c); // intrinsic sine and cosine thrust::complex kernel(c, s); kernel *= amplitude; From 3df3fbe3a2d0174a787924ace90f2562ffa0a41c Mon Sep 17 00:00:00 2001 From: Woen-Sug Choi Date: Wed, 3 Sep 2025 22:58:51 +0900 Subject: [PATCH 16/16] add ros-jazzy-marine-acoustic-msgs --- extras/ros-jazzy-gz-harmonic-install.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/extras/ros-jazzy-gz-harmonic-install.sh b/extras/ros-jazzy-gz-harmonic-install.sh index 24d4309c..da522c1c 100644 --- a/extras/ros-jazzy-gz-harmonic-install.sh +++ b/extras/ros-jazzy-gz-harmonic-install.sh @@ -88,6 +88,7 @@ sudo apt update && apt install -y \ ros-$DIST-ros2-controllers \ ros-$DIST-teleop-tools \ ros-$DIST-urdfdom-py \ + ros-$DIST-marine-acoustic-msgs \ ros-dev-tools echo