1818
1919class ucp_perf_cuda_request_manager {
2020public:
21- __device__ ucp_perf_cuda_request_manager (size_t size) : m_size(size)
21+ __device__
22+ ucp_perf_cuda_request_manager (size_t size, ucp_device_request *requests) :
23+ m_size (size), m_requests(requests)
2224 {
2325 assert (m_size <= CAPACITY);
2426 for (size_t i = 0 ; i < m_size; ++i) {
@@ -54,7 +56,7 @@ public:
5456 __device__ ucp_device_request_t &get_request ()
5557 {
5658 assert (get_pending_count () < m_size);
57- size_t index = ucx_bitset_ffs (m_pending, m_size, 0 );
59+ size_t index = ucx_bitset_ffns (m_pending, m_size, 0 );
5860 UCX_BIT_SET (m_pending, index);
5961 return m_requests[index];
6062 }
@@ -69,7 +71,7 @@ private:
6971 static const size_t CAPACITY = 128 ;
7072
7173 size_t m_size;
72- ucp_device_request_t m_requests[CAPACITY] ;
74+ ucp_device_request_t * m_requests;
7375 uint8_t m_pending[UCX_BITSET_SIZE(CAPACITY)];
7476};
7577
@@ -81,24 +83,29 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
8183 const void *address, uint64_t remote_address,
8284 size_t length)
8385{
86+ extern __shared__ ucp_device_request requests[];
8487 ucx_perf_cuda_time_t last_report_time = ucx_perf_cuda_get_time_ns ();
8588 ucx_perf_counter_t max_iters = ctx.max_iters ;
8689 uint64_t *sn = ucx_perf_cuda_get_sn (address, length);
87- ucp_perf_cuda_request_manager request_mgr (ctx.max_outstanding );
90+ ucp_device_request *thread_requests =
91+ &requests[ctx.max_outstanding * threadIdx .x ];
92+ ucp_perf_cuda_request_manager request_mgr (ctx.max_outstanding ,
93+ thread_requests);
8894 ucs_status_t status;
8995
9096 for (ucx_perf_counter_t idx = 0 ; idx < max_iters; idx++) {
9197 while (request_mgr.get_pending_count () >= ctx.max_outstanding ) {
9298 status = request_mgr.progress <level>(1 );
93- if (status != UCS_OK ) {
99+ if (UCS_STATUS_IS_ERR ( status) ) {
94100 break ;
95101 }
96102 }
97103
98104 *sn = idx + 1 ;
99105 ucp_device_request_t &req = request_mgr.get_request ();
100106 status = ucp_device_put_single<level>(mem_list, mem_list_index, address,
101- remote_address, length, 0 , &req);
107+ remote_address, length,
108+ UCP_DEVICE_FLAG_NODELAY, &req);
102109 if (status != UCS_OK) {
103110 break ;
104111 }
@@ -109,7 +116,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
109116
110117 while (request_mgr.get_pending_count () > 0 ) {
111118 status = request_mgr.progress <level>(max_iters);
112- if (status != UCS_OK ) {
119+ if (UCS_STATUS_IS_ERR ( status) ) {
113120 break ;
114121 }
115122 }
@@ -135,17 +142,19 @@ ucp_perf_cuda_put_single(ucp_device_mem_list_handle_h mem_list,
135142 unsigned mem_list_index, const void *address,
136143 uint64_t remote_address, size_t length)
137144{
138- ucp_device_request_t req;
145+ extern __shared__ ucp_device_request requests[];
146+ ucp_device_request *req = &requests[threadIdx .x ];
139147 ucs_status_t status;
140148
141149 status = ucp_device_put_single<level>(mem_list, mem_list_index, address,
142- remote_address, length, 0 , &req);
150+ remote_address, length,
151+ UCP_DEVICE_FLAG_NODELAY, req);
143152 if (status != UCS_OK) {
144153 return status;
145154 }
146155
147156 do {
148- status = ucp_device_progress_req<level>(& req);
157+ status = ucp_device_progress_req<level>(req);
149158 } while (status == UCS_INPROGRESS);
150159
151160 return status;
@@ -220,8 +229,9 @@ public:
220229 ucp_perf_barrier (&m_perf);
221230 ucx_perf_test_start_clock (&m_perf);
222231
223- ucp_perf_cuda_put_multi_latency_kernel
224- <UCS_DEVICE_LEVEL_THREAD><<<1 , thread_count>>> (
232+ ucp_perf_cuda_put_multi_latency_kernel<UCS_DEVICE_LEVEL_THREAD>
233+ <<<1 , thread_count,
234+ thread_count * sizeof (ucp_device_request)>>> (
225235 gpu_ctx (), handle.get (), 0 , m_perf.send_buffer ,
226236 m_perf.ucp .remote_addr , length, m_perf.recv_buffer , my_index);
227237 CUDA_CALL_RET (UCS_ERR_NO_DEVICE, cudaGetLastError);
@@ -250,10 +260,12 @@ public:
250260 }
251261
252262 unsigned thread_count = m_perf.params .device_thread_count ;
253- ucp_perf_cuda_put_multi_bw_kernel
254- <UCS_DEVICE_LEVEL_THREAD><<<1 , thread_count>>> (
255- gpu_ctx (), handle.get (), 0 , m_perf.send_buffer ,
256- m_perf.ucp .remote_addr , length);
263+ ucp_perf_cuda_put_multi_bw_kernel<UCS_DEVICE_LEVEL_THREAD>
264+ <<<1 , thread_count,
265+ thread_count * m_perf.params.max_outstanding *
266+ sizeof (ucp_device_request)>>> (
267+ gpu_ctx (), handle.get (), 0 , m_perf.send_buffer ,
268+ m_perf.ucp .remote_addr , length);
257269 CUDA_CALL_RET (UCS_ERR_NO_DEVICE, cudaGetLastError);
258270 wait_for_kernel (length);
259271 } else if (my_index == 0 ) {
0 commit comments