@@ -164,8 +164,9 @@ skip_tests = []
# Skip the test suites of optional libraries that are not usable in this run.
has_cudnn() || push!(skip_tests, "cudnn")
has_cusolvermg() || push!(skip_tests, "cusolvermg")
has_nvml() || push!(skip_tests, "nvml")
# cutensor is skipped when the library is missing, the toolkit or device is too
# old, or when running under compute-sanitizer.
if !has_cutensor() || CUDA.version() < v"10.1" || first(picks).cap < v"7.0" || do_sanitize
    push!(skip_tests, "cutensor")
end
if do_sanitize
    # XXX: some library tests fail under compute-sanitizer
    # Kept separate from the cutensor availability check so that cusparse is
    # only skipped for sanitizer runs, not whenever cutensor is unavailable.
    push!(skip_tests, "cusparse")
end
170171is_debug = ccall (:jl_is_debugbuild , Cint, ()) != 0
171172if first (picks). cap < v " 7.0"
@@ -199,6 +200,30 @@ else
199200 all_tests = copy (tests)
200201end
201202
# handle compute-sanitizer

# Layout of the structure exchanged with the getrlimit/setrlimit C calls below.
# NOTE(review): assumes the C `rlim_t` type is `unsigned long` on the targeted
# platforms -- confirm before using this outside of the Linux-only branch.
struct rlimit
    cur::Culong
    max::Culong
end
# Resource identifier passed to getrlimit/setrlimit; only consumed inside the
# `Sys.islinux()` branch below.
const RLIMIT_NOFILE = 7
if do_sanitize
    sanitizer = CUDA.compute_sanitizer()
    @info "Running under $(readchomp(`$sanitizer --version`))"

    # bump the per-process file descriptor limit to work around NVIDIA bug #3273266.
    # this value will be inherited by child processes.
    if Sys.islinux()
        local limit
        limit = Ref{rlimit}()
        ret = ccall(:getrlimit, Cint, (Cint, Ptr{rlimit}), RLIMIT_NOFILE, limit)
        systemerror(:getrlimit, ret != 0)
        @warn "Bumping file descriptor limit from $(Int(limit[].cur)) to $(Int(limit[].max))"
        # raise the current limit up to the maximum the process is allowed
        limit[] = rlimit(limit[].max, limit[].max)
        ret = ccall(:setrlimit, Cint, (Cint, Ptr{rlimit}), RLIMIT_NOFILE, limit)
        # report a failure against the call that actually failed
        systemerror(:setrlimit, ret != 0)
    end
end
226+
202227# add workers
203228const test_exeflags = Base. julia_cmd ()
204229filter! (test_exeflags. exec) do c
@@ -214,9 +239,7 @@ const test_exename = popfirst!(test_exeflags.exec)
214239function addworker (X; kwargs... )
215240 exename = if do_sanitize
216241 sanitizer = CUDA. compute_sanitizer ()
217- @info " Running under $(readchomp (` $sanitizer --version` )) "
218- # NVIDIA bug 3263616: compute-sanitizer crashes when generating host backtraces
219- ` $sanitizer --tool $sanitize_tool --launch-timeout=0 --show-backtrace=no --target-processes=all --report-api-errors=no $test_exename `
242+ ` $sanitizer --tool $sanitize_tool --launch-timeout=0 --target-processes=all --report-api-errors=no $test_exename `
220243 else
221244 test_exename
222245 end
353376 push! (all_tasks, current_task ())
354377 while length (tests) > 0
355378 test = popfirst! (tests)
356- local resp
379+
380+ # sometimes a worker failed, and we need to spawn a new one
381+ if p === nothing
382+ p = addworker (1 )[1 ]
383+ end
357384 wrkr = p
385+
386+ local resp
358387 snoop = do_snoop ? mktemp () : (nothing , nothing )
359388
360389 # tests that muck with the context should not be timed with CUDA events,
380409 # the worker encountered some failure, recycle it
381410 # so future tests get a fresh environment
382411 rmprocs (wrkr, waitfor= 30 )
383- p = addworker ( 1 )[ 1 ]
412+ p = nothing
384413 else
385414 print_testworker_stats (test, wrkr, resp)
386415 end
0 commit comments