openucx
diff --git a/AUTHORS b/AUTHORS
Lines changed: 1 addition & 0 deletions
diff --git a/contrib/configure-devel b/contrib/configure-devel
Lines changed: 1 addition & 1 deletion
diff --git a/contrib/configure-opt b/contrib/configure-opt
Lines changed: 1 addition & 1 deletion
diff --git a/contrib/configure-prof b/contrib/configure-prof
Lines changed: 1 addition & 1 deletion
diff --git a/contrib/configure-release b/contrib/configure-release
Lines changed: 1 addition & 1 deletion
diff --git a/contrib/configure-release-mt b/contrib/configure-release-mt
Lines changed: 1 addition & 1 deletion
diff --git a/contrib/test_jenkins.sh b/contrib/test_jenkins.sh
Lines changed: 10 additions & 2 deletions
diff --git a/contrib/ucx_perftest_config/test_types_ucp_device_cuda b/contrib/ucx_perftest_config/test_types_ucp_device_cuda
Lines changed: 4 additions & 2 deletions
diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh
Lines changed: 4 additions & 2 deletions
diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
Lines changed: 28 additions & 16 deletions
@@ -59,6 +59,7 @@ Keisuke Fukuda <[email protected]>
 Ken Raffenetti <[email protected]>
 Khaled Hamidouche <[email protected]>
 Konstantin Belousov <[email protected]>
+Laurin Martins <[email protected]>
 Leonid Genkin <[email protected]>
 Lior Paz <[email protected]>
 Luis E. Pena <[email protected]>
 
@@ -11,7 +11,7 @@
 #
 
 basedir=$(cd $(dirname $0) && pwd)
-$basedir/../configure \
+"$basedir/../configure" \
 	--enable-gtest \
 	--enable-examples \
 	--enable-test-apps \
 
@@ -11,6 +11,6 @@
 #
 
 basedir=$(cd $(dirname $0) && pwd)
-$basedir/configure-release \
+"$basedir/configure-release" \
 	--enable-optimizations \
 	"$@"
@@ -11,7 +11,7 @@
 #
 
 basedir=$(cd $(dirname $0) && pwd)
-$basedir/../configure \
+"$basedir/../configure" \
 	--disable-logging \
 	--disable-debug \
 	--disable-assertions \
 
@@ -11,7 +11,7 @@
 #
 
 basedir=$(cd $(dirname $0) && pwd)
-$basedir/../configure \
+"$basedir/../configure" \
 	--disable-logging \
 	--disable-debug \
 	--disable-assertions \
 
@@ -11,6 +11,6 @@
 #
 
 basedir=$(cd $(dirname $0) && pwd)
-$basedir/../contrib/configure-release \
+"$basedir/../contrib/configure-release" \
 	--enable-mt \
 	"$@"
@@ -657,10 +657,18 @@ run_ucx_perftest_cuda_device() {
 	ucx_perftest="$ucx_inst/bin/ucx_perftest"
 	ucp_test_args="-b $ucx_inst_ptest/test_types_ucp_device_cuda"
 
-	# TODO: Run on all GPUs
+	# TODO: Run on all GPU and NIC combinations
 	ucp_client_args="-a cuda:0 $(hostname)"
+	gda_tls="cuda_copy,rc,rc_gda"
+	cuda_ipc_tls="cuda_copy,rc,cuda_ipc"
 
-	run_client_server_app "$ucx_perftest" "$ucp_test_args" "$ucp_client_args" 0 0
+	# TODO: Run with cuda_ipc_tls
+	for tls in "$gda_tls"
+	do
+		export UCX_TLS=${tls}
+		run_client_server_app "$ucx_perftest" "$ucp_test_args" "$ucp_client_args" 0 0
+	done
+	unset UCX_TLS
 }
 
 #
 
@@ -2,6 +2,8 @@
 # UCP basic device cuda tests
 #
 ucp_device_cuda_bw_1k_1thread         -t ucp_put_multi_bw -m cuda -s 1024 -n 10000
-ucp_device_cuda_bw_1k_128threads      -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 -T 128
+# TODO - Restore the thread count to 128 (to match the test name) after adjusting perftest.
+ucp_device_cuda_bw_1k_128threads      -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 -T 32
 ucp_device_cuda_lat_1k_1thread        -t ucp_put_multi_lat -m cuda -s 1024 -n 10000
-ucp_device_cuda_lat_1k_128threads     -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 -T 128
+# TODO - Restore the thread count to 128 (to match the test name) after adjusting perftest.
+ucp_device_cuda_lat_1k_128threads     -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 -T 32
@@ -79,9 +79,11 @@ UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) {
     return count;
 }
 
-UCS_F_DEVICE size_t ucx_bitset_ffs(const uint8_t *set, size_t bits, size_t from) {
+UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits,
+                                    size_t from)
+{
     for (size_t i = from; i < bits; i++) {
-        if (UCX_BIT_GET(set, i)) {
+        if (!UCX_BIT_GET(set, i)) {
             return i;
         }
     }
 
@@ -18,7 +18,9 @@
 
 class ucp_perf_cuda_request_manager {
 public:
-    __device__ ucp_perf_cuda_request_manager(size_t size) : m_size(size)
+    __device__
+    ucp_perf_cuda_request_manager(size_t size, ucp_device_request *requests) :
+        m_size(size), m_requests(requests)
     {
         assert(m_size <= CAPACITY);
         for (size_t i = 0; i < m_size; ++i) {
@@ -54,7 +56,7 @@ public:
     __device__ ucp_device_request_t &get_request()
     {
         assert(get_pending_count() < m_size);
-        size_t index = ucx_bitset_ffs(m_pending, m_size, 0);
+        size_t index = ucx_bitset_ffns(m_pending, m_size, 0);
         UCX_BIT_SET(m_pending, index);
         return m_requests[index];
     }
@@ -69,7 +71,7 @@ private:
     static const size_t CAPACITY = 128;
 
     size_t               m_size;
-    ucp_device_request_t m_requests[CAPACITY];
+    ucp_device_request_t *m_requests;
     uint8_t              m_pending[UCX_BITSET_SIZE(CAPACITY)];
 };
 
@@ -81,24 +83,29 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
                                   const void *address, uint64_t remote_address,
                                   size_t length)
 {
+    extern __shared__ ucp_device_request requests[];
     ucx_perf_cuda_time_t last_report_time = ucx_perf_cuda_get_time_ns();
     ucx_perf_counter_t max_iters          = ctx.max_iters;
     uint64_t *sn                          = ucx_perf_cuda_get_sn(address, length);
-    ucp_perf_cuda_request_manager request_mgr(ctx.max_outstanding);
+    ucp_device_request *thread_requests =
+            &requests[ctx.max_outstanding * threadIdx.x];
+    ucp_perf_cuda_request_manager request_mgr(ctx.max_outstanding,
+                                              thread_requests);
     ucs_status_t status;
 
     for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) {
         while (request_mgr.get_pending_count() >= ctx.max_outstanding) {
             status = request_mgr.progress<level>(1);
-            if (status != UCS_OK) {
+            if (UCS_STATUS_IS_ERR(status)) {
                 break;
             }
         }
 
         *sn = idx + 1;
         ucp_device_request_t &req = request_mgr.get_request();
         status = ucp_device_put_single<level>(mem_list, mem_list_index, address,
-                                              remote_address, length, 0, &req);
+                                              remote_address, length,
+                                              UCP_DEVICE_FLAG_NODELAY, &req);
         if (status != UCS_OK) {
             break;
         }
@@ -109,7 +116,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
 
     while (request_mgr.get_pending_count() > 0) {
         status = request_mgr.progress<level>(max_iters);
-        if (status != UCS_OK) {
+        if (UCS_STATUS_IS_ERR(status)) {
             break;
         }
     }
@@ -135,17 +142,19 @@ ucp_perf_cuda_put_single(ucp_device_mem_list_handle_h mem_list,
                          unsigned mem_list_index, const void *address,
                          uint64_t remote_address, size_t length)
 {
-    ucp_device_request_t req;
+    extern __shared__ ucp_device_request requests[];
+    ucp_device_request *req = &requests[threadIdx.x];
     ucs_status_t status;
 
     status = ucp_device_put_single<level>(mem_list, mem_list_index, address,
-                                          remote_address, length, 0, &req);
+                                          remote_address, length,
+                                          UCP_DEVICE_FLAG_NODELAY, req);
     if (status != UCS_OK) {
         return status;
     }
 
     do {
-        status = ucp_device_progress_req<level>(&req);
+        status = ucp_device_progress_req<level>(req);
     } while (status == UCS_INPROGRESS);
 
     return status;
@@ -220,8 +229,9 @@ public:
         ucp_perf_barrier(&m_perf);
         ucx_perf_test_start_clock(&m_perf);
 
-        ucp_perf_cuda_put_multi_latency_kernel
-            <UCS_DEVICE_LEVEL_THREAD><<<1, thread_count>>>(
+        ucp_perf_cuda_put_multi_latency_kernel<UCS_DEVICE_LEVEL_THREAD>
+            <<<1, thread_count,
+               thread_count * sizeof(ucp_device_request)>>>(
                 gpu_ctx(), handle.get(), 0, m_perf.send_buffer,
                 m_perf.ucp.remote_addr, length, m_perf.recv_buffer, my_index);
         CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError);
@@ -250,10 +260,12 @@ public:
             }
 
             unsigned thread_count = m_perf.params.device_thread_count;
-            ucp_perf_cuda_put_multi_bw_kernel
-                <UCS_DEVICE_LEVEL_THREAD><<<1, thread_count>>>(
-                    gpu_ctx(), handle.get(), 0, m_perf.send_buffer,
-                    m_perf.ucp.remote_addr, length);
+            ucp_perf_cuda_put_multi_bw_kernel<UCS_DEVICE_LEVEL_THREAD>
+                    <<<1, thread_count,
+                       thread_count * m_perf.params.max_outstanding *
+                               sizeof(ucp_device_request)>>>(
+                            gpu_ctx(), handle.get(), 0, m_perf.send_buffer,
+                            m_perf.ucp.remote_addr, length);
             CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError);
             wait_for_kernel(length);
         } else if (my_index == 0) {
Original file line number	Diff line number	Diff line change
`@@ -11,7 +11,7 @@`
`11`	`11`	`#`
`12`	`12`
`13`	`13`	`basedir=$(cd $(dirname $0) && pwd)`
`14`		`-$basedir/../configure \`
	`14`	`+"$basedir/../configure" \`
`15`	`15`	`--enable-gtest \`
`16`	`16`	`--enable-examples \`
`17`	`17`	`--enable-test-apps \`
Original file line number	Diff line number	Diff line change
`@@ -11,6 +11,6 @@`
`11`	`11`	`#`
`12`	`12`
`13`	`13`	`basedir=$(cd $(dirname $0) && pwd)`
`14`		`-$basedir/configure-release \`
	`14`	`+"$basedir/configure-release" \`
`15`	`15`	`--enable-optimizations \`
`16`	`16`	`"$@"`
Original file line number	Diff line number	Diff line change
`@@ -79,9 +79,11 @@ UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) {`
`79`	`79`	`return count;`
`80`	`80`	`}`
`81`	`81`
`82`		`-UCS_F_DEVICE size_t ucx_bitset_ffs(const uint8_t *set, size_t bits, size_t from) {`
	`82`	`+UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits,`
	`83`	`+ size_t from)`
	`84`	`+{`
`83`	`85`	`for (size_t i = from; i < bits; i++) {`
`84`		`- if (UCX_BIT_GET(set, i)) {`
	`86`	`+ if (!UCX_BIT_GET(set, i)) {`
`85`	`87`	`return i;`
`86`	`88`	`}`
`87`	`89`	`}`