From 439e60d79c56a31a96ba3bfc8c4650cdfec62a48 Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Tue, 24 Sep 2024 13:45:18 +0000 Subject: [PATCH 1/4] [Bugfix] Implement acquire/release polyfill for Pascal --- csrc/custom_all_reduce.cuh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index 632b579c55af..67279c020740 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -131,15 +131,26 @@ DINLINE O downcast(array_t val) { } static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag), "l"(flag_addr)); +#else + asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag), + "l"(flag_addr)); +#endif } static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) { FlagType flag; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 asm volatile("ld.acquire.sys.global.u32 %0, [%1];" : "=r"(flag) : "l"(flag_addr)); +#else + asm volatile("ld.volatile.sys.global.u32 %0, [%1]; membar.gl;" + : "=r"(flag) + : "l"(flag_addr)); +#endif return flag; } From 68307b358af2abfda895a26dc3423d54168f2d19 Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Tue, 24 Sep 2024 16:41:25 +0000 Subject: [PATCH 2/4] drop .sys modifier from ld --- csrc/custom_all_reduce.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index 67279c020740..a2f7e4330000 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -147,7 +147,7 @@ static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) { : "=r"(flag) : "l"(flag_addr)); #else - asm volatile("ld.volatile.sys.global.u32 %0, [%1]; membar.gl;" + asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;" : "=r"(flag) : "l"(flag_addr)); #endif From 3c4d07031c449f6b8c8145851713a4442844bac8 Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Tue, 24 Sep 2024 22:35:05 +0000 Subject: [PATCH 3/4] fix test --- csrc/custom_all_reduce_test.cu | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/csrc/custom_all_reduce_test.cu b/csrc/custom_all_reduce_test.cu index c8b5d0a013f6..9d83a001c77a 100644 --- a/csrc/custom_all_reduce_test.cu +++ b/csrc/custom_all_reduce_test.cu @@ -44,7 +44,14 @@ } while (0) __global__ void dummy_kernel() { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms +#else + for (int i = 0; i < 100; i++) { + long long int start = clock64(); + while (clock64() - start < 1000000); // something like 100ms + } +#endif } template From eb4bea2549c149c547d4c320d3142a300d2a826d Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Tue, 24 Sep 2024 23:41:52 +0000 Subject: [PATCH 4/4] fix "sleep" time --- csrc/custom_all_reduce_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/custom_all_reduce_test.cu b/csrc/custom_all_reduce_test.cu index 9d83a001c77a..376687e91cfd 100644 --- a/csrc/custom_all_reduce_test.cu +++ b/csrc/custom_all_reduce_test.cu @@ -49,7 +49,7 @@ __global__ void dummy_kernel() { #else for (int i = 0; i < 100; i++) { long long int start = clock64(); - while (clock64() - start < 1000000); // something like 100ms + while (clock64() - start < 150000000); // approximately 98.4ms on P40 } #endif }