-
-
Notifications
You must be signed in to change notification settings - Fork 5.7k
Closed
Labels
bugIndicates an unexpected problem or unintended behaviorIndicates an unexpected problem or unintended behaviormultithreadingBase.Threads and related functionalityBase.Threads and related functionalitysystem:macAffects only macOSAffects only macOS
Description
The following MWE, using Metal.jl to launch code on a foreign thread, causes Julia to crash:
using Metal
# taken from GCBenchmarks
mutable struct ListNode
key::Int64
next::ListNode
ListNode() = new()
ListNode(x)= new(x)
ListNode(x,y) = new(x,y);
end
function list(n=128)
start::ListNode = ListNode(1)
current::ListNode = start
for i = 2:(n*1024^2)
current = ListNode(i,current)
end
return current.key
end
function main()
println("Creating garbage")
x = list()
@time GC.gc(true)
# launch a kernel that will schedule a callback from a foreign thread
@metal identity(nothing)
# invoke the GC so that the callback happens during GC
GC.gc(true)
end
main()[90832] signal (10.1): Bus error: 10
in expression starting at /Users/tim/Julia/pkg/Metal/wip.jl:32
maybe_collect at /Users/tim/Julia/src/julia/src/gc.c:957
jl_gc_pool_alloc_inner at /Users/tim/Julia/src/julia/src/gc.c:1323
jl_gc_pool_alloc_noinline at /Users/tim/Julia/src/julia/src/gc.c:1383
jl_gc_alloc_ at /Users/tim/Julia/src/julia/src/julia_internal.h:467
jl_init_root_task at /Users/tim/Julia/src/julia/src/task.c:1637
ijl_adopt_thread at /Users/tim/Julia/src/julia/src/threading.c:418
unknown function (ip: 0x165f582b7)
-[MTLToolsCommandBuffer invokeCompletedHandlers] at /System/Library/PrivateFrameworks/MetalTools.framework/Versions/A/MetalTools (unknown line)
Allocations: 143060159 (Pool: 143050503; Big: 9656); GC: 21
zsh: bus error /Users/tim/Julia/src/julia/build/dev/usr/bin/julia-debug -t1 --project wip.jl
Running under LLDB reveals:
(lldb) bt all
thread #1, queue = 'com.apple.main-thread'
frame #0: 0x0000000101e7ca8c libjulia-internal-debug.1.10.dylib`gc_heap_snapshot_record_array_edge(from=0x000000010731a8c0, to=0x000000016fdfaac0) at gc-heap-snapshot.h:68:1
frame #1: 0x0000000101e749a8 libjulia-internal-debug.1.10.dylib`gc_mark_objarray(ptls=0x0000000100842800, obj_parent=0x000000010731a8c0, obj_begin=0x00000001068728e0, obj_end=0x0000000106872938, step=1, nptr=62) at gc.c:2014:17
frame #2: 0x0000000101e7fe1c libjulia-internal-debug.1.10.dylib`gc_queue_remset at gc.c:2628:17
frame #3: 0x0000000101e7eee0 libjulia-internal-debug.1.10.dylib`gc_queue_remset(ptls=0x0000000100842800, ptls2=0x0000000100842800) at gc.c:2984:9
frame #4: 0x0000000101e78e2c libjulia-internal-debug.1.10.dylib`_jl_gc_collect(ptls=0x0000000100842800, collection=JL_GC_FULL) at gc.c:3196:17
frame #5: 0x0000000101e789c8 libjulia-internal-debug.1.10.dylib`ijl_gc_collect(collection=JL_GC_FULL) at gc.c:3523:13
frame #6: 0x0000000147dac488 JIT(0x147da4000) at gcutils.jl:129
frame #7: 0x0000000147dac508 JIT(0x147da4000)
frame #8: 0x0000000101df84b0 libjulia-internal-debug.1.10.dylib`_jl_invoke(F=0x000000014507d118, args=0x000000016fdfb6a8, nargs=0, mfunc=0x0000000107b1b170, world=30688) at gf.c:2888:23
frame #9: 0x0000000101df8554 libjulia-internal-debug.1.10.dylib`ijl_apply_generic(F=0x000000014507d118, args=0x000000016fdfb6a8, nargs=0) at gf.c:3070:12
frame #10: 0x0000000101e22168 libjulia-internal-debug.1.10.dylib`jl_apply(args=0x000000016fdfb6a0, nargs=1) at julia.h:1961:12
frame #11: 0x0000000101e21e78 libjulia-internal-debug.1.10.dylib`do_call(args=0x00000001056efaf8, nargs=1, s=0x000000016fdfbc30) at interpreter.c:125:26
frame #12: 0x0000000101e20410 libjulia-internal-debug.1.10.dylib`eval_value(e=0x000000010574db50, s=0x000000016fdfbc30) at interpreter.c:222:16
frame #13: 0x0000000101e2170c libjulia-internal-debug.1.10.dylib`eval_stmt_value(stmt=0x000000010574db50, s=0x000000016fdfbc30) at interpreter.c:173:23
frame #14: 0x0000000101e1f390 libjulia-internal-debug.1.10.dylib`eval_body(stmts=0x00000001056efa90, s=0x000000016fdfbc30, ip=0, toplevel=1) at interpreter.c:602:21
frame #15: 0x0000000101e1fd04 libjulia-internal-debug.1.10.dylib`jl_interpret_toplevel_thunk(m=0x00000001268cfef0, src=0x0000000104745d10) at interpreter.c:760:21
frame #16: 0x0000000101e4b684 libjulia-internal-debug.1.10.dylib`jl_toplevel_eval_flex(m=0x00000001268cfef0, e=0x0000000104259d50, fast=1, expanded=0) at toplevel.c:922:18
frame #17: 0x0000000101e4b19c libjulia-internal-debug.1.10.dylib`jl_toplevel_eval_flex(m=0x00000001268cfef0, e=0x0000000104259e10, fast=1, expanded=0) at toplevel.c:865:19
frame #18: 0x0000000101e4ced8 libjulia-internal-debug.1.10.dylib`ijl_toplevel_eval(m=0x00000001268cfef0, v=0x0000000104259e10) at toplevel.c:931:12
frame #19: 0x0000000101e4d1e0 libjulia-internal-debug.1.10.dylib`ijl_toplevel_eval_in(m=0x00000001268cfef0, ex=0x0000000104259e10) at toplevel.c:981:13
frame #20: 0x000000012255babc sys-debug.dylib`japi1_include_string_81225 at boot.jl:383
frame #21: 0x0000000101dec8d4 libjulia-internal-debug.1.10.dylib`jl_fptr_args(f=0x0000000122fc78b0, args=0x000000016fdfc760, nargs=4, m=0x0000000123da2b90) at gf.c:2531:12
frame #22: 0x0000000101df83c4 libjulia-internal-debug.1.10.dylib`_jl_invoke(F=0x0000000122fc78b0, args=0x000000016fdfc760, nargs=4, mfunc=0x0000000123da2b40, world=30638) at gf.c:2869:35
frame #23: 0x0000000101df8554 libjulia-internal-debug.1.10.dylib`ijl_apply_generic(F=0x0000000122fc78b0, args=0x000000016fdfc760, nargs=4) at gf.c:3070:12
frame #24: 0x00000001223ddbdc sys-debug.dylib`japi1__include_81232 at loading.jl:2040
frame #25: 0x0000000121c075ac sys-debug.dylib`julia_include_48400 at Base.jl:488
frame #26: 0x0000000121fda854 sys-debug.dylib`jfptr_include_48401 + 16
frame #27: 0x0000000101df83c4 libjulia-internal-debug.1.10.dylib`_jl_invoke(F=0x0000000122948620, args=0x000000016fdfe1b8, nargs=2, mfunc=0x0000000122948890, world=30638) at gf.c:2869:35
frame #28: 0x0000000101df8554 libjulia-internal-debug.1.10.dylib`ijl_apply_generic(F=0x0000000122948620, args=0x000000016fdfe1b8, nargs=2) at gf.c:3070:12
frame #29: 0x0000000121b65e70 sys-debug.dylib`julia_exec_options_82744 at client.jl:307
frame #30: 0x0000000121b66494 sys-debug.dylib`julia__start_82889 at client.jl:541
frame #31: 0x0000000121f3b008 sys-debug.dylib`jfptr__start_82890 + 8
frame #32: 0x0000000101df83c4 libjulia-internal-debug.1.10.dylib`_jl_invoke(F=0x0000000122914ad0, args=0x000000016fdfe930, nargs=0, mfunc=0x0000000122914960, world=30638) at gf.c:2869:35
frame #33: 0x0000000101df8554 libjulia-internal-debug.1.10.dylib`ijl_apply_generic(F=0x0000000122914ad0, args=0x000000016fdfe930, nargs=0) at gf.c:3070:12
frame #34: 0x0000000101e9103c libjulia-internal-debug.1.10.dylib`jl_apply(args=0x000000016fdfe928, nargs=1) at julia.h:1961:12
frame #35: 0x0000000101e92498 libjulia-internal-debug.1.10.dylib`true_main(argc=1, argv=0x000000016fdfecc8) at jlapi.c:582:13
frame #36: 0x0000000101e92338 libjulia-internal-debug.1.10.dylib`jl_repl_entrypoint(argc=1, argv=0x000000016fdfecb0) at jlapi.c:734:15
frame #37: 0x000000010008afcc libjulia-debug.1.dylib`jl_load_repl(argc=4, argv=0x000000016fdfecb0) at loader_lib.c:563:12
frame #38: 0x0000000100003f64 julia-debug`main(argc=4, argv=0x000000016fdfecb0) at loader_exe.c:58:15
frame #39: 0x00000001a19f3f28 dyld`start + 2236
* thread #9, queue = 'com.Metal.CompletionQueueDispatch', stop reason = EXC_BAD_ACCESS (code=2, address=0x1000e0008)
* frame #0: 0x0000000101e7a490 libjulia-internal-debug.1.10.dylib`maybe_collect(ptls=0x000000016b00a200) at gc.c:957:9
frame #1: 0x0000000101e739d4 libjulia-internal-debug.1.10.dylib`jl_gc_pool_alloc_inner(ptls=0x000000016b00a200, pool_offset=1832, osize=400) at gc.c:1323:5
frame #2: 0x0000000101e73cf8 libjulia-internal-debug.1.10.dylib`jl_gc_pool_alloc_noinline(ptls=0x000000016b00a200, pool_offset=1832, osize=400) at gc.c:1383:12
frame #3: 0x0000000101e292b4 libjulia-internal-debug.1.10.dylib`jl_gc_alloc_(ptls=0x000000016b00a200, sz=376, ty=0x0000000126cd63d0) at julia_internal.h:467:13
frame #4: 0x0000000101e295c8 libjulia-internal-debug.1.10.dylib`jl_init_root_task(ptls=0x000000016b00a200, stack_lo=0x000000017c25b000, stack_hi=0x000000017c25a300) at task.c:1637:33
frame #5: 0x0000000101e6c9e0 libjulia-internal-debug.1.10.dylib`ijl_adopt_thread at threading.c:418:21
frame #6: 0x000000016ba9c2b8 JIT(0x16ba94000)
frame #7: 0x00000001a2513078 MetalTools`-[MTLToolsCommandBuffer invokeCompletedHandlers] + 100
frame #8: 0x00000001a2513078 MetalTools`-[MTLToolsCommandBuffer invokeCompletedHandlers] + 100
frame #9: 0x00000001ab1a0498 Metal`MTLDispatchListApply + 52
frame #10: 0x00000001ab1a0894 Metal`-[_MTLCommandBuffer didCompleteWithStartTime:endTime:error:] + 524
frame #11: 0x00000001bea45cc4 IOGPU`-[IOGPUMetalCommandBuffer didCompleteWithStartTime:endTime:error:] + 220
frame #12: 0x00000001ab1a052c Metal`-[_MTLCommandQueue commandBufferDidComplete:startTime:completionTime:error:] + 108
frame #13: 0x00000001bea4f190 IOGPU`IOGPUNotificationQueueDispatchAvailableCompletionNotifications + 128
frame #14: 0x00000001bea4f29c IOGPU`__IOGPUNotificationQueueSetDispatchQueue_block_invoke + 64
frame #15: 0x00000001a1b9c4c0 libdispatch.dylib`_dispatch_client_callout4 + 20
frame #16: 0x00000001a1bb8ed8 libdispatch.dylib`_dispatch_mach_msg_invoke + 468
frame #17: 0x00000001a1ba3960 libdispatch.dylib`_dispatch_lane_serial_drain + 372
frame #18: 0x00000001a1bb9c24 libdispatch.dylib`_dispatch_mach_invoke + 448
frame #19: 0x00000001a1ba3960 libdispatch.dylib`_dispatch_lane_serial_drain + 372
frame #20: 0x00000001a1ba462c libdispatch.dylib`_dispatch_lane_invoke + 436
frame #21: 0x00000001a1ba3960 libdispatch.dylib`_dispatch_lane_serial_drain + 372
frame #22: 0x00000001a1ba45f8 libdispatch.dylib`_dispatch_lane_invoke + 384
frame #23: 0x00000001a1baf244 libdispatch.dylib`_dispatch_workloop_worker_thread + 648
frame #24: 0x00000001a1d48074 libsystem_pthread.dylib`_pthread_wqthread + 288
I'm not familiar with the thread adoption code, or how it acquires the necessary locks, but my guess is that the allocation that happens during task set-up triggers the usual GC safepoint which we fail to catch because we haven't installed signal handlers yet. @JeffBezanson mentions that the jl_gc_unsafe_enter at the start of jl_adopt_thread should have taken care about that though.
Happy to create a MWE without Metal if anybody can suggest a timer-like API to start threads using a callback.
x-ref JuliaGPU/Metal.jl#138
Metadata
Metadata
Assignees
Labels
bugIndicates an unexpected problem or unintended behaviorIndicates an unexpected problem or unintended behaviormultithreadingBase.Threads and related functionalityBase.Threads and related functionalitysystem:macAffects only macOSAffects only macOS