Skip to content

Commit d928b7c

Browse files
committed
CONTRIB/CONFIGURE: Update configure scripts to handle spaces in basedir
1 parent 29831d3 commit d928b7c

File tree

23 files changed

+482
-146
lines changed

23 files changed

+482
-146
lines changed

AUTHORS

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,7 @@ Keisuke Fukuda <[email protected]>
5959
Ken Raffenetti <[email protected]>
6060
Khaled Hamidouche <[email protected]>
6161
Konstantin Belousov <[email protected]>
62+
Laurin Martins <[email protected]>
6263
Leonid Genkin <[email protected]>
6364
6465
Luis E. Pena <[email protected]>

contrib/configure-devel

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
#
1212

1313
basedir=$(cd $(dirname $0) && pwd)
14-
$basedir/../configure \
14+
"$basedir/../configure" \
1515
--enable-gtest \
1616
--enable-examples \
1717
--enable-test-apps \

contrib/configure-opt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,6 @@
1111
#
1212

1313
basedir=$(cd $(dirname $0) && pwd)
14-
$basedir/configure-release \
14+
"$basedir/configure-release" \
1515
--enable-optimizations \
1616
"$@"

contrib/configure-prof

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
#
1212

1313
basedir=$(cd $(dirname $0) && pwd)
14-
$basedir/../configure \
14+
"$basedir/../configure" \
1515
--disable-logging \
1616
--disable-debug \
1717
--disable-assertions \

contrib/configure-release

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
#
1212

1313
basedir=$(cd $(dirname $0) && pwd)
14-
$basedir/../configure \
14+
"$basedir/../configure" \
1515
--disable-logging \
1616
--disable-debug \
1717
--disable-assertions \

contrib/configure-release-mt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,6 @@
1111
#
1212

1313
basedir=$(cd $(dirname $0) && pwd)
14-
$basedir/../contrib/configure-release \
14+
"$basedir/../contrib/configure-release" \
1515
--enable-mt \
1616
"$@"

contrib/test_jenkins.sh

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -657,10 +657,18 @@ run_ucx_perftest_cuda_device() {
657657
ucx_perftest="$ucx_inst/bin/ucx_perftest"
658658
ucp_test_args="-b $ucx_inst_ptest/test_types_ucp_device_cuda"
659659

660-
# TODO: Run on all GPUs
660+
# TODO: Run on all GPUs & NICs combinations
661661
ucp_client_args="-a cuda:0 $(hostname)"
662+
gda_tls="cuda_copy,rc,rc_gda"
663+
cuda_ipc_tls="cuda_copy,rc,cuda_ipc"
662664

663-
run_client_server_app "$ucx_perftest" "$ucp_test_args" "$ucp_client_args" 0 0
665+
# TODO: Run with cuda_ipc_tls
666+
for tls in "$gda_tls"
667+
do
668+
export UCX_TLS=${tls}
669+
run_client_server_app "$ucx_perftest" "$ucp_test_args" "$ucp_client_args" 0 0
670+
done
671+
unset UCX_TLS
664672
}
665673

666674
#

contrib/ucx_perftest_config/test_types_ucp_device_cuda

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,8 @@
22
# UCP basic device cuda tests
33
#
44
ucp_device_cuda_bw_1k_1thread -t ucp_put_multi_bw -m cuda -s 1024 -n 10000
5-
ucp_device_cuda_bw_1k_128threads -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 -T 128
5+
# TODO - Increase number of threads after adjusting perftest.
6+
ucp_device_cuda_bw_1k_128threads -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 -T 32
67
ucp_device_cuda_lat_1k_1thread -t ucp_put_multi_lat -m cuda -s 1024 -n 10000
7-
ucp_device_cuda_lat_1k_128threads -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 -T 128
8+
# TODO - Increase number of threads after adjusting perftest.
9+
ucp_device_cuda_lat_1k_128threads -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 -T 32

src/tools/perf/cuda/cuda_kernel.cuh

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -79,9 +79,11 @@ UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) {
7979
return count;
8080
}
8181

82-
UCS_F_DEVICE size_t ucx_bitset_ffs(const uint8_t *set, size_t bits, size_t from) {
82+
UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits,
83+
size_t from)
84+
{
8385
for (size_t i = from; i < bits; i++) {
84-
if (UCX_BIT_GET(set, i)) {
86+
if (!UCX_BIT_GET(set, i)) {
8587
return i;
8688
}
8789
}

src/tools/perf/cuda/ucp_cuda_kernel.cu

Lines changed: 28 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,9 @@
1818

1919
class ucp_perf_cuda_request_manager {
2020
public:
21-
__device__ ucp_perf_cuda_request_manager(size_t size) : m_size(size)
21+
__device__
22+
ucp_perf_cuda_request_manager(size_t size, ucp_device_request *requests) :
23+
m_size(size), m_requests(requests)
2224
{
2325
assert(m_size <= CAPACITY);
2426
for (size_t i = 0; i < m_size; ++i) {
@@ -54,7 +56,7 @@ public:
5456
__device__ ucp_device_request_t &get_request()
5557
{
5658
assert(get_pending_count() < m_size);
57-
size_t index = ucx_bitset_ffs(m_pending, m_size, 0);
59+
size_t index = ucx_bitset_ffns(m_pending, m_size, 0);
5860
UCX_BIT_SET(m_pending, index);
5961
return m_requests[index];
6062
}
@@ -69,7 +71,7 @@ private:
6971
static const size_t CAPACITY = 128;
7072

7173
size_t m_size;
72-
ucp_device_request_t m_requests[CAPACITY];
74+
ucp_device_request_t *m_requests;
7375
uint8_t m_pending[UCX_BITSET_SIZE(CAPACITY)];
7476
};
7577

@@ -81,24 +83,29 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
8183
const void *address, uint64_t remote_address,
8284
size_t length)
8385
{
86+
extern __shared__ ucp_device_request requests[];
8487
ucx_perf_cuda_time_t last_report_time = ucx_perf_cuda_get_time_ns();
8588
ucx_perf_counter_t max_iters = ctx.max_iters;
8689
uint64_t *sn = ucx_perf_cuda_get_sn(address, length);
87-
ucp_perf_cuda_request_manager request_mgr(ctx.max_outstanding);
90+
ucp_device_request *thread_requests =
91+
&requests[ctx.max_outstanding * threadIdx.x];
92+
ucp_perf_cuda_request_manager request_mgr(ctx.max_outstanding,
93+
thread_requests);
8894
ucs_status_t status;
8995

9096
for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) {
9197
while (request_mgr.get_pending_count() >= ctx.max_outstanding) {
9298
status = request_mgr.progress<level>(1);
93-
if (status != UCS_OK) {
99+
if (UCS_STATUS_IS_ERR(status)) {
94100
break;
95101
}
96102
}
97103

98104
*sn = idx + 1;
99105
ucp_device_request_t &req = request_mgr.get_request();
100106
status = ucp_device_put_single<level>(mem_list, mem_list_index, address,
101-
remote_address, length, 0, &req);
107+
remote_address, length,
108+
UCP_DEVICE_FLAG_NODELAY, &req);
102109
if (status != UCS_OK) {
103110
break;
104111
}
@@ -109,7 +116,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
109116

110117
while (request_mgr.get_pending_count() > 0) {
111118
status = request_mgr.progress<level>(max_iters);
112-
if (status != UCS_OK) {
119+
if (UCS_STATUS_IS_ERR(status)) {
113120
break;
114121
}
115122
}
@@ -135,17 +142,19 @@ ucp_perf_cuda_put_single(ucp_device_mem_list_handle_h mem_list,
135142
unsigned mem_list_index, const void *address,
136143
uint64_t remote_address, size_t length)
137144
{
138-
ucp_device_request_t req;
145+
extern __shared__ ucp_device_request requests[];
146+
ucp_device_request *req = &requests[threadIdx.x];
139147
ucs_status_t status;
140148

141149
status = ucp_device_put_single<level>(mem_list, mem_list_index, address,
142-
remote_address, length, 0, &req);
150+
remote_address, length,
151+
UCP_DEVICE_FLAG_NODELAY, req);
143152
if (status != UCS_OK) {
144153
return status;
145154
}
146155

147156
do {
148-
status = ucp_device_progress_req<level>(&req);
157+
status = ucp_device_progress_req<level>(req);
149158
} while (status == UCS_INPROGRESS);
150159

151160
return status;
@@ -220,8 +229,9 @@ public:
220229
ucp_perf_barrier(&m_perf);
221230
ucx_perf_test_start_clock(&m_perf);
222231

223-
ucp_perf_cuda_put_multi_latency_kernel
224-
<UCS_DEVICE_LEVEL_THREAD><<<1, thread_count>>>(
232+
ucp_perf_cuda_put_multi_latency_kernel<UCS_DEVICE_LEVEL_THREAD>
233+
<<<1, thread_count,
234+
thread_count * sizeof(ucp_device_request)>>>(
225235
gpu_ctx(), handle.get(), 0, m_perf.send_buffer,
226236
m_perf.ucp.remote_addr, length, m_perf.recv_buffer, my_index);
227237
CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError);
@@ -250,10 +260,12 @@ public:
250260
}
251261

252262
unsigned thread_count = m_perf.params.device_thread_count;
253-
ucp_perf_cuda_put_multi_bw_kernel
254-
<UCS_DEVICE_LEVEL_THREAD><<<1, thread_count>>>(
255-
gpu_ctx(), handle.get(), 0, m_perf.send_buffer,
256-
m_perf.ucp.remote_addr, length);
263+
ucp_perf_cuda_put_multi_bw_kernel<UCS_DEVICE_LEVEL_THREAD>
264+
<<<1, thread_count,
265+
thread_count * m_perf.params.max_outstanding *
266+
sizeof(ucp_device_request)>>>(
267+
gpu_ctx(), handle.get(), 0, m_perf.send_buffer,
268+
m_perf.ucp.remote_addr, length);
257269
CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError);
258270
wait_for_kernel(length);
259271
} else if (my_index == 0) {

0 commit comments

Comments
 (0)