diff --git a/Make.inc b/Make.inc index def5f40f97926..eac6d325d6d59 100644 --- a/Make.inc +++ b/Make.inc @@ -195,7 +195,7 @@ endif JULIA_VERSION := $(shell cat $(JULIAHOME)/VERSION) JULIA_MAJOR_VERSION := $(shell echo $(JULIA_VERSION) | cut -d'-' -f 1 | cut -d'.' -f 1) JULIA_MINOR_VERSION := $(shell echo $(JULIA_VERSION) | cut -d'-' -f 1 | cut -d'.' -f 2) -JULIA_PATCH_VERSION := $(shell echo $(JULIA_VERSION) | cut -d'-' -f 1 | cut -d'.' -f 3) +JULIA_PATCH_VERSION := $(shell echo $(JULIA_VERSION) | cut -d'-' -f 1 | cut -d'+' -f 1 | cut -d'.' -f 3) # libjulia's SONAME will follow the format libjulia.so.$(SOMAJOR). Before v1.0.0, # SOMAJOR will be a two-decimal value, e.g. libjulia.so.0.5, whereas at and beyond diff --git a/NEWS.md b/NEWS.md index 695867f2e4f63..18a5b0e620e7d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -38,6 +38,7 @@ Compiler/Runtime improvements * All uses of the `@pure` macro in `Base` have been replaced with the now-preferred `Base.@assume_effects` ([#44776]). * `invoke(f, invokesig, args...)` calls to a less-specific method than would normally be chosen for `f(args...)` are no longer spuriously invalidated when loading package precompile files ([#46010]). +* The mark phase of the Garbage Collector is now multi-threaded ([#48600]). Command-line option changes --------------------------- @@ -49,6 +50,8 @@ Command-line option changes number of interactive threads to create (`auto` currently means 1) ([#42302]). * New option `--heap-size-hint=<size>` suggests a size limit to invoke garbage collection more eagerly. The size may be specified in bytes, kilobytes (1000k), megabytes (300M), or gigabytes (1.5G) ([#45369]). +* New option `--gcthreads` to set how many threads will be used by the Garbage Collector ([#48600]). + The default is set to `N/2` where `N` is the number of worker threads (`--threads`) used by Julia.
Multi-threading changes ----------------------- diff --git a/VERSION b/VERSION index 77fee73a8cf94..7bede500b4e81 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.9.3 +1.9.4+RAI diff --git a/base/options.jl b/base/options.jl index b9fa5b6ec5508..fb043672dc19a 100644 --- a/base/options.jl +++ b/base/options.jl @@ -11,6 +11,7 @@ struct JLOptions cpu_target::Ptr{UInt8} nthreadpools::Int16 nthreads::Int16 + ngcthreads::Int16 nthreads_per_pool::Ptr{Int16} nprocs::Int32 machine_file::Ptr{UInt8} diff --git a/base/partr.jl b/base/partr.jl index c77a24bdcc003..f84d7bd1a37b7 100644 --- a/base/partr.jl +++ b/base/partr.jl @@ -50,7 +50,7 @@ function multiq_sift_down(heap::taskheap, idx::Int32) child = Int(child) child > length(heap.tasks) && break if isassigned(heap.tasks, child) && - heap.tasks[child].priority < heap.tasks[idx].priority + heap.tasks[child].priority <= heap.tasks[idx].priority t = heap.tasks[idx] heap.tasks[idx] = heap.tasks[child] heap.tasks[child] = t diff --git a/base/reflection.jl b/base/reflection.jl index 1d8aa84cb3ad3..376a66c3ac8e3 100644 --- a/base/reflection.jl +++ b/base/reflection.jl @@ -1092,6 +1092,7 @@ struct CodegenParams prefer_specsig::Cint gnu_pubnames::Cint debug_info_kind::Cint + safepoint_on_entry::Cint lookup::Ptr{Cvoid} @@ -1100,12 +1101,14 @@ struct CodegenParams function CodegenParams(; track_allocations::Bool=true, code_coverage::Bool=true, prefer_specsig::Bool=false, gnu_pubnames=true, debug_info_kind::Cint = default_debug_info_kind(), + safepoint_on_entry::Bool=true, lookup::Ptr{Cvoid}=cglobal(:jl_rettype_inferred), generic_context = nothing) return new( Cint(track_allocations), Cint(code_coverage), Cint(prefer_specsig), Cint(gnu_pubnames), debug_info_kind, + Cint(safepoint_on_entry), lookup, generic_context) end end diff --git a/base/task.jl b/base/task.jl index 92598159e999f..d19189fa3cb5e 100644 --- a/base/task.jl +++ b/base/task.jl @@ -688,7 +688,7 @@ end ## scheduler and work queue -struct IntrusiveLinkedListSynchronized{T} +mutable struct IntrusiveLinkedListSynchronized{T} queue::IntrusiveLinkedList{T} lock::Threads.SpinLock IntrusiveLinkedListSynchronized{T}() where {T} = new(IntrusiveLinkedList{T}(), Threads.SpinLock()) diff --git a/base/threadingconstructs.jl b/base/threadingconstructs.jl index 0c6563c54f99f..ce216f290ae12 100644 --- a/base/threadingconstructs.jl +++ b/base/threadingconstructs.jl @@ -136,6 +136,13 @@ function threadpooltids(pool::Symbol) end end +""" + Threads.ngcthreads() -> Int + +Returns the number of GC threads currently configured. 
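+
+For example, in a default single-threaded session (started without the
+`--gcthreads` option) one GC thread is reported (illustrative output):
+
+```julia-repl
+julia> Threads.ngcthreads()
+1
+```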
+""" +ngcthreads() = Int(unsafe_load(cglobal(:jl_n_gcthreads, Cint))) + 1 + function threading_run(fun, static) ccall(:jl_enter_threaded_region, Cvoid, ()) n = threadpoolsize() diff --git a/base/timing.jl b/base/timing.jl index c994889d8902c..375ff7aaea60e 100644 --- a/base/timing.jl +++ b/base/timing.jl @@ -20,10 +20,13 @@ struct GC_Num max_memory ::Int64 time_to_safepoint ::Int64 max_time_to_safepoint ::Int64 + total_time_to_safepoint ::Int64 sweep_time ::Int64 mark_time ::Int64 total_sweep_time ::Int64 total_mark_time ::Int64 + last_full_sweep ::Int64 + last_incremental_sweep ::Int64 end gc_num() = ccall(:jl_gc_num, GC_Num, ()) @@ -96,6 +99,13 @@ function gc_live_bytes() Int(ccall(:jl_gc_live_bytes, Int64, ())) + num.allocd + num.deferred_alloc end +# must be kept in sync with the value from `src/julia_threads.h`` +const JL_GC_N_MAX_POOLS = 51 +function gc_page_utilization_data() + page_utilization_raw = cglobal(:jl_gc_page_utilization_stats, Float64) + return Base.unsafe_wrap(Array, page_utilization_raw, JL_GC_N_MAX_POOLS, own=false) +end + """ Base.jit_total_bytes() diff --git a/contrib/generate_precompile.jl b/contrib/generate_precompile.jl index 11a9c1b552061..b1b098a5581b2 100644 --- a/contrib/generate_precompile.jl +++ b/contrib/generate_precompile.jl @@ -1,7 +1,7 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license if Threads.maxthreadid() != 1 - @warn "Running this file with multiple Julia threads may lead to a build error" Base.maxthreadid() + @warn "Running this file with multiple Julia threads may lead to a build error" Base.Threads.maxthreadid() end if Base.isempty(Base.ARGS) || Base.ARGS[1] !== "0" @@ -245,7 +245,7 @@ const HELP_PROMPT = "help?> " function generate_precompile_statements() start_time = time_ns() - debug_output = devnull # or stdout + debug_output = stdout sysimg = Base.unsafe_string(Base.JLOptions().image_file) # Extract the precompile statements from the precompile file diff --git a/deps/checksums/Pkg-397c8bf20ef70a78247d4cbd3d59f28a3116c884.tar.gz/md5 b/deps/checksums/Pkg-397c8bf20ef70a78247d4cbd3d59f28a3116c884.tar.gz/md5 new file mode 100644 index 0000000000000..fb4a9537dda0d --- /dev/null +++ b/deps/checksums/Pkg-397c8bf20ef70a78247d4cbd3d59f28a3116c884.tar.gz/md5 @@ -0,0 +1 @@ +d9630229362baeff301c473bb3e05905 diff --git a/deps/checksums/Pkg-397c8bf20ef70a78247d4cbd3d59f28a3116c884.tar.gz/sha512 b/deps/checksums/Pkg-397c8bf20ef70a78247d4cbd3d59f28a3116c884.tar.gz/sha512 new file mode 100644 index 0000000000000..e6e1af11a228e --- /dev/null +++ b/deps/checksums/Pkg-397c8bf20ef70a78247d4cbd3d59f28a3116c884.tar.gz/sha512 @@ -0,0 +1 @@ +d8d421d9cce00fddaddbe88ae02076c58f5a669064b197092ceab6051c26fb3aab42f03be3f945dc5dbf376ee0d0a29c33208121e20b3e613a92311ca171828d diff --git a/deps/checksums/Pkg-ffe4615b1e4e39b818a49bb1a06467932d5eaf51.tar.gz/md5 b/deps/checksums/Pkg-ffe4615b1e4e39b818a49bb1a06467932d5eaf51.tar.gz/md5 deleted file mode 100644 index 9b7ee38b00368..0000000000000 --- a/deps/checksums/Pkg-ffe4615b1e4e39b818a49bb1a06467932d5eaf51.tar.gz/md5 +++ /dev/null @@ -1 +0,0 @@ -0a5ac5f1302352d902b3acd4523c6cef diff --git a/deps/checksums/Pkg-ffe4615b1e4e39b818a49bb1a06467932d5eaf51.tar.gz/sha512 b/deps/checksums/Pkg-ffe4615b1e4e39b818a49bb1a06467932d5eaf51.tar.gz/sha512 deleted file mode 100644 index 71efa5b40fb74..0000000000000 --- a/deps/checksums/Pkg-ffe4615b1e4e39b818a49bb1a06467932d5eaf51.tar.gz/sha512 +++ /dev/null @@ -1 +0,0 @@ 
-69993723d04457c3c6268b1c4844ebed2563e56a0d3cc25e1a0048029491ba423e6808f53a66cf879ed39323112cda07344e9c678e2d6abc2d7444e4e5336c2f diff --git a/doc/man/julia.1 b/doc/man/julia.1 index 383c588c58dae..fa9f641b1e76f 100644 --- a/doc/man/julia.1 +++ b/doc/man/julia.1 @@ -118,6 +118,11 @@ supported (Linux and Windows). If this is not supported (macOS) or process affinity is not configured, it uses the number of CPU threads. +.TP +--gcthreads +Enable N GC threads; if unspecified, defaults to half of the +number of compute worker threads. + .TP -p, --procs {N|auto} Integer value N launches N additional local worker processes `auto` launches as many workers diff --git a/doc/src/base/multi-threading.md b/doc/src/base/multi-threading.md index 4932aef4cc938..fb75b21479707 100644 --- a/doc/src/base/multi-threading.md +++ b/doc/src/base/multi-threading.md @@ -10,6 +10,7 @@ Base.Threads.nthreads Base.Threads.threadpool Base.Threads.nthreadpools Base.Threads.threadpoolsize +Base.Threads.ngcthreads ``` See also [Multi-Threading](@ref man-multithreading). diff --git a/doc/src/manual/command-line-interface.md b/doc/src/manual/command-line-interface.md index 54c56a354c7a3..781a77a33dadb 100644 --- a/doc/src/manual/command-line-interface.md +++ b/doc/src/manual/command-line-interface.md @@ -106,8 +106,9 @@ The following is a complete list of command-line switches available when launchi |`-e`, `--eval <expr>` |Evaluate `<expr>`| |`-E`, `--print <expr>` |Evaluate `<expr>` and display the result| |`-L`, `--load <file>` |Load `<file>` immediately on all processors| -|`-t`, `--threads {N\|auto`} |Enable N threads; `auto` tries to infer a useful default number of threads to use but the exact behavior might change in the future. Currently, `auto` uses the number of CPUs assigned to this julia process based on the OS-specific affinity assignment interface, if supported (Linux and Windows). If this is not supported (macOS) or process affinity is not configured, it uses the number of CPU threads.| -|`-p`, `--procs {N\|auto`} |Integer value N launches N additional local worker processes; `auto` launches as many workers as the number of local CPU threads (logical cores)| +|`-t`, `--threads {N\|auto}` |Enable N threads; `auto` tries to infer a useful default number of threads to use but the exact behavior might change in the future. Currently, `auto` uses the number of CPUs assigned to this julia process based on the OS-specific affinity assignment interface, if supported (Linux and Windows). If this is not supported (macOS) or process affinity is not configured, it uses the number of CPU threads.| +| `--gcthreads {N}` |Enable N GC threads; if unspecified, defaults to half of the number of compute worker threads.| +|`-p`, `--procs {N\|auto}` |Integer value N launches N additional local worker processes; `auto` launches as many workers as the number of local CPU threads (logical cores)| |`--machine-file <file>` |Run processes on hosts listed in `<file>`| |`-i` |Interactive mode; REPL runs and `isinteractive()` is true| |`-q`, `--quiet` |Quiet startup: no banner, suppress REPL warnings| diff --git a/doc/src/manual/environment-variables.md b/doc/src/manual/environment-variables.md index f29e5b7aaf8f7..68fb79fc0a9a6 100644 --- a/doc/src/manual/environment-variables.md +++ b/doc/src/manual/environment-variables.md @@ -315,6 +315,14 @@ then spinning threads never sleep. Otherwise, `$JULIA_THREAD_SLEEP_THRESHOLD` is interpreted as an unsigned 64-bit integer (`uint64_t`) and gives, in nanoseconds, the amount of time after which spinning threads should sleep.
+### [`JULIA_NUM_GC_THREADS`](@id env-gc-threads) + +Sets the number of threads used by garbage collection. If unspecified, it defaults to +half of the number of worker threads. + +!!! compat "Julia 1.10" + This environment variable was added in Julia 1.10. + ### `JULIA_EXCLUSIVE` If set to anything besides `0`, then Julia's thread policy is consistent with diff --git a/doc/src/manual/multi-threading.md b/doc/src/manual/multi-threading.md index f8c6b040941f4..e1cb17bd5b93f 100644 --- a/doc/src/manual/multi-threading.md +++ b/doc/src/manual/multi-threading.md @@ -72,6 +72,15 @@ julia> Threads.threadid() three processes have 2 threads enabled. For more fine grained control over worker threads use [`addprocs`](@ref) and pass `-t`/`--threads` as `exeflags`. +### Multiple GC Threads + +The Garbage Collector (GC) can use multiple threads. By default it uses half the number +of compute worker threads; this can be configured with the `--gcthreads` command line argument +(e.g. `julia --threads=8 --gcthreads=4`) or the [`JULIA_NUM_GC_THREADS`](@ref env-gc-threads) environment variable. + +!!! compat "Julia 1.10" + The `--gcthreads` command line argument requires at least Julia 1.10. + ## [Threadpools](@id man-threadpools) When a program's threads are busy with many tasks to run, tasks may experience diff --git a/doc/src/manual/profile.md b/doc/src/manual/profile.md index 5b18f57a186be..60e2d8042ee18 100644 --- a/doc/src/manual/profile.md +++ b/doc/src/manual/profile.md @@ -310,7 +310,122 @@ the amount of memory allocated by each line of code. ### Line-by-Line Allocation Tracking -To measure allocation line-by-line, start Julia with the `--track-allocation=<setting>` command-line +While [`@time`](@ref) logs high-level stats about memory usage and garbage collection over the course +of evaluating an expression, it can be useful to log each garbage collection event, to get an +intuitive sense of how often the garbage collector is running, how long it's running each time, +and how much garbage it collects each time. This can be enabled with +[`GC.enable_logging(true)`](@ref), which causes Julia to log to stderr every time +a garbage collection happens. + +### [Allocation Profiler](@id allocation-profiler) + +!!! compat "Julia 1.8" + This functionality requires at least Julia 1.8. + +The allocation profiler records the stack trace, type, and size of each +allocation while it is running. It can be invoked with +[`Profile.Allocs.@profile`](@ref). + +This information about the allocations is returned as an array of `Alloc` +objects, wrapped in an `AllocResults` object. The best way to visualize these is +currently with the [PProf.jl](https://github.com/JuliaPerf/PProf.jl) and +[ProfileCanvas.jl](https://github.com/pfitzseb/ProfileCanvas.jl) packages, which +can visualize the call stacks which are making the most allocations. + +The allocation profiler does have significant overhead, so a `sample_rate` +argument can be passed to speed it up by making it skip some allocations. +Passing `sample_rate=1.0` will make it record everything (which is slow); +`sample_rate=0.1` will record only 10% of the allocations (faster), etc. + +!!! compat "Julia 1.11" + + Older versions of Julia could not capture types in all cases. In older versions of + Julia, if you see an allocation of type `Profile.Allocs.UnknownType`, it means that + the profiler doesn't know what type of object was allocated. This mainly happened when + the allocation was coming from generated code produced by the compiler.
See + [issue #43688](https://github.com/JuliaLang/julia/issues/43688) for more info. + + Since Julia 1.11, all allocations should have a type reported. + +For more details on how to use this tool, please see the following talk from JuliaCon 2022: +https://www.youtube.com/watch?v=BFvpwC8hEWQ + +##### Allocation Profiler Example + +In this simple example, we use PProf to visualize the alloc profile. You could use another +visualization tool instead. We collect the profile (specifying a sample rate), then we visualize it. +```julia +using Profile, PProf +Profile.Allocs.clear() +Profile.Allocs.@profile sample_rate=0.0001 my_function() +PProf.Allocs.pprof() +``` + +Here is a more in-depth example, showing how we can tune the sample rate. A +good number of samples to aim for is around 1 to 10 thousand. Too many, and the +profile visualizer can get overwhelmed, and profiling will be slow. Too few, +and you don't have a representative sample. + + +```julia-repl +julia> import Profile + +julia> @time my_function() # Estimate allocations from a second run of the function + 0.110018 seconds (1.50 M allocations: 58.725 MiB, 17.17% gc time) +500000 + +julia> Profile.Allocs.clear() + +julia> Profile.Allocs.@profile sample_rate=0.001 begin # 1.5 M * 0.001 = ~1.5K allocs. + my_function() + end +500000 + +julia> prof = Profile.Allocs.fetch(); # If you want, you can also manually inspect the results. + +julia> length(prof.allocs) # Confirm we have the expected number of allocations. +1515 + +julia> using PProf # Now, visualize with an external tool, like PProf or ProfileCanvas. + +julia> PProf.Allocs.pprof(prof; from_c=false) # You can optionally pass in a previously fetched profile result. +Analyzing 1515 allocation samples... 100%|████████████████████████████████| Time: 0:00:00 +Main binary filename not available. +Serving web UI on http://localhost:62261 +"alloc-profile.pb.gz" +``` +Then you can view the profile by navigating to http://localhost:62261, and the profile is saved to disk. +See the PProf package for more options. + +##### Allocation Profiling Tips + +As stated above, aim for around 1-10 thousand samples in your profile. + +Note that we are uniformly sampling in the space of _all allocations_, and are not weighting +our samples by the size of the allocation. So a given allocation profile may not give a +representative profile of where most bytes are allocated in your program, unless you had set +`sample_rate=1`. + +Allocations can come from users directly constructing objects, but can also come from inside +the runtime or be inserted into compiled code to handle type instability. Looking at the +"source code" view can be helpful to isolate them, and then other external tools such as +[`Cthulhu.jl`](https://github.com/JuliaDebug/Cthulhu.jl) can be useful for identifying the +cause of the allocation. + +##### Allocation Profile Visualization Tools + +There are several profiling visualization tools now that can all display Allocation +Profiles. Here is a small list of some of the main ones we know about: +- [PProf.jl](https://github.com/JuliaPerf/PProf.jl) +- [ProfileCanvas.jl](https://github.com/pfitzseb/ProfileCanvas.jl) +- VSCode's built-in profile visualizer (`@profview_allocs`) [docs needed] +- Viewing the results directly in the REPL + - You can inspect the results in the REPL via [`Profile.Allocs.fetch()`](@ref), to view + the stacktrace and type of each allocation, as in the sketch below.
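+
+For example, here is a minimal sketch of inspecting a fetched profile in the REPL,
+tallying the sampled allocations by type (it assumes each `Profile.Allocs.Alloc`
+carries `type`, `size`, and `stacktrace` fields, per the `Profile.Allocs` docs):
+
+```julia
+using Profile
+
+prof = Profile.Allocs.fetch()
+# count how many sampled allocations there were of each type
+counts = Dict{Type,Int}()
+for a in prof.allocs
+    counts[a.type] = get(counts, a.type, 0) + 1
+end
+# show the most frequently allocated types first
+sort(collect(counts); by = last, rev = true)
+```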
+ +#### Line-by-Line Allocation Tracking + +An alternative way to measure allocations is to start Julia with the `--track-allocation=<setting>` command-line option, for which you can choose `none` (the default, do not measure allocation), `user` (measure memory allocation everywhere except Julia's core code), or `all` (measure memory allocation at each line of Julia code). Allocation gets measured for each line of compiled code. When you quit diff --git a/src/Makefile b/src/Makefile index a6939f629412f..2a66e3a0dccac 100644 --- a/src/Makefile +++ b/src/Makefile @@ -43,8 +43,8 @@ endif SRCS := \ jltypes gf typemap smallintset ast builtins module interpreter symbol \ dlload sys init task array staticdata toplevel jl_uv datatype \ - simplevector runtime_intrinsics precompile jloptions \ - threading partr stackwalk gc gc-debug gc-pages gc-stacks gc-alloc-profiler method \ + simplevector runtime_intrinsics precompile jloptions mtarraylist \ + threading partr stackwalk gc gc-debug gc-pages gc-stacks gc-alloc-profiler gc-page-profiler method \ jlapi signal-handling safepoint timing subtype rtutils gc-heap-snapshot \ crc32c APInt-C processor ircode opaque_closure codegen-stubs coverage runtime_ccall @@ -99,7 +99,7 @@ ifeq ($(USE_SYSTEM_LIBUV),0) UV_HEADERS += uv.h UV_HEADERS += uv/*.h endif -PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) +PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) ifeq ($(OS),WINNT) PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,win32_ucontext.h) endif @@ -353,7 +353,7 @@ $(BUILDDIR)/julia_version.h: $(JULIAHOME)/VERSION @echo "#ifndef JL_VERSION_H" >> $@.$(JULIA_BUILD_MODE).tmp @echo "#define JL_VERSION_H" >> $@.$(JULIA_BUILD_MODE).tmp @echo "#define JULIA_VERSION_STRING" \"$(JULIA_VERSION)\" >> $@.$(JULIA_BUILD_MODE).tmp - @echo $(JULIA_VERSION) | awk 'BEGIN {FS="[.,-]"} \ + @echo $(JULIA_VERSION) | awk 'BEGIN {FS="[.,+-]"} \ {print "#define JULIA_VERSION_MAJOR " $$1 "\n" \ "#define JULIA_VERSION_MINOR " $$2 "\n" \ "#define JULIA_VERSION_PATCH " $$3 ; \ diff --git a/src/cgutils.cpp b/src/cgutils.cpp index c091111f31617..8ce84acb30901 100644 --- a/src/cgutils.cpp +++ b/src/cgutils.cpp @@ -3933,7 +3933,6 @@ static Value *emit_defer_signal(jl_codectx_t &ctx) return ctx.builder.CreateInBoundsGEP(ctx.types().T_sigatomic, ptls, ArrayRef(offset), "jl_defer_signal"); } - #ifndef JL_NDEBUG static int compare_cgparams(const jl_cgparams_t *a, const jl_cgparams_t *b) { diff --git a/src/codegen.cpp b/src/codegen.cpp index 6bd0b0d16865a..a4773acb3fbea 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -1279,6 +1279,7 @@ extern "C" { 1, #endif (int) DICompileUnit::DebugEmissionKind::FullDebug, + 1, jl_rettype_inferred, NULL }; } @@ -7805,7 +7806,11 @@ static jl_llvm_functions_t ctx.builder.CreateAlignedStore(load_world, world_age_field, Align(sizeof(size_t))); } - // step 11b. Do codegen in control flow order + // step 11b. Emit the entry safepoint + if (JL_FEAT_TEST(ctx, safepoint_on_entry)) + emit_gc_safepoint(ctx.builder, get_current_ptls(ctx), ctx.tbaa().tbaa_const); + + // step 11c.
Do codegen in control flow order std::vector workstack; std::map BB; std::map come_from_bb; diff --git a/src/codegen_shared.h b/src/codegen_shared.h index 329cc567e8c5f..e0edb792d7645 100644 --- a/src/codegen_shared.h +++ b/src/codegen_shared.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -233,20 +234,39 @@ static inline void emit_signal_fence(llvm::IRBuilder<> &builder) builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SyncScope::SingleThread); } -static inline void emit_gc_safepoint(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::MDNode *tbaa) +static inline void emit_gc_safepoint(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::MDNode *tbaa, bool final = false) { + using namespace llvm; + llvm::Value *signal_page = get_current_signal_page_from_ptls(builder, ptls, tbaa); emit_signal_fence(builder); - builder.CreateLoad(getSizeTy(builder.getContext()), get_current_signal_page_from_ptls(builder, ptls, tbaa), true); + Module *M = builder.GetInsertBlock()->getModule(); + LLVMContext &C = builder.getContext(); + // inline jlsafepoint_func->realize(M) + if (final) { + auto T_size = getSizeTy(builder.getContext()); + builder.CreateLoad(T_size, signal_page, true); + } + else { + Function *F = M->getFunction("julia.safepoint"); + if (!F) { + auto T_size = getSizeTy(builder.getContext()); + auto T_psize = T_size->getPointerTo(); + FunctionType *FT = FunctionType::get(Type::getVoidTy(C), {T_psize}, false); + F = Function::Create(FT, Function::ExternalLinkage, "julia.safepoint", M); + F->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + } + builder.CreateCall(F, {signal_page}); + } emit_signal_fence(builder); } -static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state, llvm::Value *old_state) +static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state, llvm::Value *old_state, bool final) { using namespace llvm; Type *T_int8 = state->getType(); - ptls = emit_bitcast_with_builder(builder, ptls, builder.getInt8PtrTy()); + llvm::Value *ptls_i8 = emit_bitcast_with_builder(builder, ptls, builder.getInt8PtrTy()); Constant *offset = ConstantInt::getSigned(builder.getInt32Ty(), offsetof(jl_tls_states_t, gc_state)); - Value *gc_state = builder.CreateInBoundsGEP(T_int8, ptls, ArrayRef(offset), "gc_state"); + Value *gc_state = builder.CreateInBoundsGEP(T_int8, ptls_i8, ArrayRef(offset), "gc_state"); if (old_state == nullptr) { old_state = builder.CreateLoad(T_int8, gc_state); cast(old_state)->setOrdering(AtomicOrdering::Monotonic); @@ -266,38 +286,38 @@ static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::V passBB, exitBB); builder.SetInsertPoint(passBB); MDNode *tbaa = get_tbaa_const(builder.getContext()); - emit_gc_safepoint(builder, ptls, tbaa); + emit_gc_safepoint(builder, ptls, tbaa, final); builder.CreateBr(exitBB); builder.SetInsertPoint(exitBB); return old_state; } -static inline llvm::Value *emit_gc_unsafe_enter(llvm::IRBuilder<> &builder, llvm::Value *ptls) +static inline llvm::Value *emit_gc_unsafe_enter(llvm::IRBuilder<> &builder, llvm::Value *ptls, bool final) { using namespace llvm; Value *state = builder.getInt8(0); - return emit_gc_state_set(builder, ptls, state, nullptr); + return emit_gc_state_set(builder, ptls, state, nullptr, final); } -static inline llvm::Value *emit_gc_unsafe_leave(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state) +static inline llvm::Value 
*emit_gc_unsafe_leave(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state, bool final) { using namespace llvm; Value *old_state = builder.getInt8(0); - return emit_gc_state_set(builder, ptls, state, old_state); + return emit_gc_state_set(builder, ptls, state, old_state, final); } -static inline llvm::Value *emit_gc_safe_enter(llvm::IRBuilder<> &builder, llvm::Value *ptls) +static inline llvm::Value *emit_gc_safe_enter(llvm::IRBuilder<> &builder, llvm::Value *ptls, bool final) { using namespace llvm; Value *state = builder.getInt8(JL_GC_STATE_SAFE); - return emit_gc_state_set(builder, ptls, state, nullptr); + return emit_gc_state_set(builder, ptls, state, nullptr, final); } -static inline llvm::Value *emit_gc_safe_leave(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state) +static inline llvm::Value *emit_gc_safe_leave(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state, bool final) { using namespace llvm; Value *old_state = builder.getInt8(JL_GC_STATE_SAFE); - return emit_gc_state_set(builder, ptls, state, old_state); + return emit_gc_state_set(builder, ptls, state, old_state, final); } // Compatibility shims for LLVM attribute APIs that were renamed in LLVM 14. diff --git a/src/gc-alloc-profiler.h b/src/gc-alloc-profiler.h index 3fd8bf4388a0a..fcd8e45caa2d8 100644 --- a/src/gc-alloc-profiler.h +++ b/src/gc-alloc-profiler.h @@ -35,6 +35,7 @@ void _maybe_record_alloc_to_profile(jl_value_t *val, size_t size, jl_datatype_t extern int g_alloc_profile_enabled; +// This should only be used from _deprecated_ code paths. We shouldn't see UNKNOWN anymore. #define jl_gc_unknown_type_tag ((jl_datatype_t*)0xdeadaa03) static inline void maybe_record_alloc_to_profile(jl_value_t *val, size_t size, jl_datatype_t *typ) JL_NOTSAFEPOINT { diff --git a/src/gc-debug.c b/src/gc-debug.c index 3f60ca17e0dc4..78768d5802824 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -27,19 +27,16 @@ jl_gc_pagemeta_t *jl_gc_page_metadata(void *data) // the end of the page. 
JL_DLLEXPORT jl_taggedvalue_t *jl_gc_find_taggedvalue_pool(char *p, size_t *osize_p) { - if (!page_metadata(p)) + if (!gc_alloc_map_is_set(p)) // Not in the pool return NULL; - struct jl_gc_metadata_ext info = page_metadata_ext(p); + jl_gc_pagemeta_t *meta = page_metadata(p); char *page_begin = gc_page_data(p) + GC_PAGE_OFFSET; // In the page header if (p < page_begin) return NULL; size_t ofs = p - page_begin; - // Check if this is a free page - if (!(info.pagetable0->allocmap[info.pagetable0_i32] & (uint32_t)(1 << info.pagetable0_i))) - return NULL; - int osize = info.meta->osize; + int osize = meta->osize; // Shouldn't be needed, just in case if (osize == 0) return NULL; @@ -111,44 +108,14 @@ static void gc_clear_mark_page(jl_gc_pagemeta_t *pg, int bits) } } -static void gc_clear_mark_pagetable0(pagetable0_t *pagetable0, int bits) -{ - for (int pg_i = 0; pg_i < REGION0_PG_COUNT / 32; pg_i++) { - uint32_t line = pagetable0->allocmap[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - gc_clear_mark_page(pagetable0->meta[pg_i * 32 + j], bits); - } - } - } - } -} - -static void gc_clear_mark_pagetable1(pagetable1_t *pagetable1, int bits) -{ - for (int pg_i = 0; pg_i < REGION1_PG_COUNT / 32; pg_i++) { - uint32_t line = pagetable1->allocmap0[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - gc_clear_mark_pagetable0(pagetable1->meta0[pg_i * 32 + j], bits); - } - } - } - } -} - -static void gc_clear_mark_pagetable(int bits) +static void gc_clear_mark_outer(int bits) { - for (int pg_i = 0; pg_i < (REGION2_PG_COUNT + 31) / 32; pg_i++) { - uint32_t line = memory_map.allocmap1[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - gc_clear_mark_pagetable1(memory_map.meta1[pg_i * 32 + j], bits); - } - } + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom); + while (pg != NULL) { + gc_clear_mark_page(pg, bits); + pg = pg->next; } } } @@ -184,7 +151,7 @@ static void clear_mark(int bits) v = v->next; } - gc_clear_mark_pagetable(bits); + gc_clear_mark_outer(bits); } static void restore(void) @@ -198,21 +165,32 @@ static void restore(void) static void gc_verify_track(jl_ptls_t ptls) { - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; + // `gc_verify_track` is limited to single-threaded GC + if (jl_n_gcthreads != 0) + return; do { - jl_gc_mark_sp_t sp; - gc_mark_sp_init(gc_cache, &sp); + jl_gc_markqueue_t mq; + jl_gc_markqueue_t *mq2 = &ptls->mark_queue; + ws_queue_t *cq = &mq.chunk_queue; + ws_queue_t *q = &mq.ptr_queue; + jl_atomic_store_relaxed(&cq->top, 0); + jl_atomic_store_relaxed(&cq->bottom, 0); + jl_atomic_store_relaxed(&cq->array, jl_atomic_load_relaxed(&mq2->chunk_queue.array)); + jl_atomic_store_relaxed(&q->top, 0); + jl_atomic_store_relaxed(&q->bottom, 0); + jl_atomic_store_relaxed(&q->array, jl_atomic_load_relaxed(&mq2->ptr_queue.array)); + arraylist_new(&mq.reclaim_set, 32); arraylist_push(&lostval_parents_done, lostval); jl_safe_printf("Now looking for %p =======\n", lostval); clear_mark(GC_CLEAN); - gc_mark_queue_all_roots(ptls, &sp); - gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - for (int i = 0; i < gc_n_threads; i++) { + gc_mark_queue_all_roots(ptls, &mq); + gc_mark_finlist(&mq, &to_finalize, 0); + for (int i = 0; i < gc_n_threads;i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); + gc_mark_finlist(&mq, 
&ptls2->finalizers, 0); } - gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); - gc_mark_loop(ptls, sp); + gc_mark_finlist(&mq, &finalizer_list_marked, 0); + gc_mark_loop_serial_(ptls, &mq); if (lostval_parents.len == 0) { jl_safe_printf("Could not find the missing link. We missed a toplevel root. This is odd.\n"); break; @@ -246,22 +224,35 @@ static void gc_verify_track(jl_ptls_t ptls) void gc_verify(jl_ptls_t ptls) { - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; - jl_gc_mark_sp_t sp; - gc_mark_sp_init(gc_cache, &sp); + // `gc_verify` is limited to single-threaded GC + if (jl_n_gcthreads != 0) { + jl_safe_printf("Warn. GC verify disabled in multi-threaded GC\n"); + return; + } + jl_gc_markqueue_t mq; + jl_gc_markqueue_t *mq2 = &ptls->mark_queue; + ws_queue_t *cq = &mq.chunk_queue; + ws_queue_t *q = &mq.ptr_queue; + jl_atomic_store_relaxed(&cq->top, 0); + jl_atomic_store_relaxed(&cq->bottom, 0); + jl_atomic_store_relaxed(&cq->array, jl_atomic_load_relaxed(&mq2->chunk_queue.array)); + jl_atomic_store_relaxed(&q->top, 0); + jl_atomic_store_relaxed(&q->bottom, 0); + jl_atomic_store_relaxed(&q->array, jl_atomic_load_relaxed(&mq2->ptr_queue.array)); + arraylist_new(&mq.reclaim_set, 32); lostval = NULL; lostval_parents.len = 0; lostval_parents_done.len = 0; clear_mark(GC_CLEAN); gc_verifying = 1; - gc_mark_queue_all_roots(ptls, &sp); - gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - for (int i = 0; i < gc_n_threads; i++) { + gc_mark_queue_all_roots(ptls, &mq); + gc_mark_finlist(&mq, &to_finalize, 0); + for (int i = 0; i < gc_n_threads;i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); + gc_mark_finlist(&mq, &ptls2->finalizers, 0); } - gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); - gc_mark_loop(ptls, sp); + gc_mark_finlist(&mq, &finalizer_list_marked, 0); + gc_mark_loop_serial_(ptls, &mq); int clean_len = bits_save[GC_CLEAN].len; for(int i = 0; i < clean_len + bits_save[GC_OLD].len; i++) { jl_taggedvalue_t *v = (jl_taggedvalue_t*)bits_save[i >= clean_len ? GC_OLD : GC_CLEAN].items[i >= clean_len ? 
i - clean_len : i]; @@ -500,7 +491,7 @@ int jl_gc_debug_check_other(void) return gc_debug_alloc_check(&jl_gc_debug_env.other); } -void jl_gc_debug_print_status(void) +void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT { uint64_t pool_count = jl_gc_debug_env.pool.num; uint64_t other_count = jl_gc_debug_env.other.num; @@ -509,7 +500,7 @@ void jl_gc_debug_print_status(void) pool_count + other_count, pool_count, other_count, gc_num.pause); } -void jl_gc_debug_critical_error(void) +void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT { jl_gc_debug_print_status(); if (!jl_gc_debug_env.wait_for_debugger) @@ -537,7 +528,6 @@ void gc_scrub_record_task(jl_task_t *t) JL_NO_ASAN static void gc_scrub_range(char *low, char *high) { - jl_ptls_t ptls = jl_current_task->ptls; jl_jmp_buf *old_buf = jl_get_safe_restore(); jl_jmp_buf buf; if (jl_setjmp(buf, 0)) { @@ -556,14 +546,6 @@ JL_NO_ASAN static void gc_scrub_range(char *low, char *high) // Make sure the sweep rebuild the freelist pg->has_marked = 1; pg->has_young = 1; - // Find the age bit - char *page_begin = gc_page_data(tag) + GC_PAGE_OFFSET; - int obj_id = (((char*)tag) - page_begin) / osize; - uint8_t *ages = pg->ages + obj_id / 8; - // Force this to be a young object to save some memory - // (especially on 32bit where it's more likely to have pointer-like - // bit patterns) - *ages &= ~(1 << (obj_id % 8)); memset(tag, 0xff, osize); // set mark to GC_MARKED (young and marked) tag->bits.gc = GC_MARKED; @@ -762,45 +744,37 @@ void gc_final_pause_end(int64_t t0, int64_t tend) static void gc_stats_pagetable0(pagetable0_t *pagetable0, unsigned *p0) { - for (int pg_i = 0; pg_i < REGION0_PG_COUNT / 32; pg_i++) { - uint32_t line = pagetable0->allocmap[pg_i] | pagetable0->freemap[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - (*p0)++; - } - } + for (int pg_i = 0; pg_i < REGION0_PG_COUNT; pg_i++) { + uint8_t meta = pagetable0->meta[pg_i]; + assert(meta == GC_PAGE_UNMAPPED || meta == GC_PAGE_ALLOCATED || + meta == GC_PAGE_LAZILY_FREED || meta == GC_PAGE_FREED); + if (meta != GC_PAGE_UNMAPPED) { + (*p0)++; } } } static void gc_stats_pagetable1(pagetable1_t *pagetable1, unsigned *p1, unsigned *p0) { - for (int pg_i = 0; pg_i < REGION1_PG_COUNT / 32; pg_i++) { - uint32_t line = pagetable1->allocmap0[pg_i] | pagetable1->freemap0[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - (*p1)++; - gc_stats_pagetable0(pagetable1->meta0[pg_i * 32 + j], p0); - } - } + for (int pg_i = 0; pg_i < REGION1_PG_COUNT; pg_i++) { + pagetable0_t *pagetable0 = pagetable1->meta0[pg_i]; + if (pagetable0 == NULL) { + continue; } + (*p1)++; + gc_stats_pagetable0(pagetable0, p0); } } static void gc_stats_pagetable(unsigned *p2, unsigned *p1, unsigned *p0) { - for (int pg_i = 0; pg_i < (REGION2_PG_COUNT + 31) / 32; pg_i++) { - uint32_t line = memory_map.allocmap1[pg_i] | memory_map.freemap1[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - (*p2)++; - gc_stats_pagetable1(memory_map.meta1[pg_i * 32 + j], p1, p0); - } - } + for (int pg_i = 0; pg_i < REGION2_PG_COUNT; pg_i++) { + pagetable1_t *pagetable1 = alloc_map.meta1[pg_i]; + if (pagetable1 == NULL) { + continue; } + (*p2)++; + gc_stats_pagetable1(pagetable1, p1, p0); } } @@ -809,7 +783,7 @@ void jl_print_gc_stats(JL_STREAM *s) #ifdef _OS_LINUX_ malloc_stats(); #endif - double ptime = jl_clock_now() - process_t0; + double ptime = jl_hrtime() - process_t0; jl_safe_printf("exec time\t%.5f sec\n", ptime); if (gc_num.pause > 0) { jl_safe_printf("gc 
time \t%.5f sec (%2.1f%%) in %d (%d full) collections\n", @@ -1030,7 +1004,7 @@ void jl_gc_debug_init(void) #endif #ifdef GC_FINAL_STATS - process_t0 = jl_clock_now(); + process_t0 = jl_hrtime(); #endif } @@ -1152,7 +1126,7 @@ void gc_stats_big_obj(void) static int64_t poolobj_sizes[4]; static int64_t empty_pages; -static void gc_count_pool_page(jl_gc_pagemeta_t *pg) +static void gc_count_pool_page(jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT { int osize = pg->osize; char *data = pg->data; @@ -1171,44 +1145,16 @@ static void gc_count_pool_page(jl_gc_pagemeta_t *pg) } } -static void gc_count_pool_pagetable0(pagetable0_t *pagetable0) -{ - for (int pg_i = 0; pg_i < REGION0_PG_COUNT / 32; pg_i++) { - uint32_t line = pagetable0->allocmap[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - gc_count_pool_page(pagetable0->meta[pg_i * 32 + j]); - } - } - } - } -} - -static void gc_count_pool_pagetable1(pagetable1_t *pagetable1) -{ - for (int pg_i = 0; pg_i < REGION1_PG_COUNT / 32; pg_i++) { - uint32_t line = pagetable1->allocmap0[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - gc_count_pool_pagetable0(pagetable1->meta0[pg_i * 32 + j]); - } - } - } - } -} - static void gc_count_pool_pagetable(void) { - for (int pg_i = 0; pg_i < (REGION2_PG_COUNT + 31) / 32; pg_i++) { - uint32_t line = memory_map.allocmap1[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - gc_count_pool_pagetable1(memory_map.meta1[pg_i * 32 + j]); - } + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom); + while (pg != NULL) { + if (gc_alloc_map_is_set(pg->data)) { + gc_count_pool_page(pg); } + pg = pg->next; } } } @@ -1264,139 +1210,6 @@ int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT return (slot - start) / elsize; } -// Print a backtrace from the bottom (start) of the mark stack up to `sp` -// `pc_offset` will be added to `sp` for convenience in the debugger. -NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_mark_sp_t sp, int pc_offset) -{ - jl_jmp_buf *old_buf = jl_get_safe_restore(); - jl_jmp_buf buf; - jl_set_safe_restore(&buf); - if (jl_setjmp(buf, 0) != 0) { - jl_safe_printf("\n!!! ERROR when unwinding gc mark loop -- ABORTING !!!\n"); - jl_set_safe_restore(old_buf); - return; - } - void **top = sp.pc + pc_offset; - jl_gc_mark_data_t *data_top = sp.data; - sp.data = ptls->gc_cache.data_stack; - sp.pc = ptls->gc_cache.pc_stack; - int isroot = 1; - while (sp.pc < top) { - void *pc = *sp.pc; - const char *prefix = isroot ? 
"r--" : " `-"; - isroot = 0; - if (pc == gc_mark_label_addrs[GC_MARK_L_marked_obj]) { - gc_mark_marked_obj_t *data = gc_repush_markdata(&sp, gc_mark_marked_obj_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: Root object: %p :: %p (bits: %d)\n of type ", - (void*)data, (void*)data->obj, (void*)data->tag, (int)data->bits); - jl_((void*)data->tag); - isroot = 1; - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_scan_only]) { - gc_mark_marked_obj_t *data = gc_repush_markdata(&sp, gc_mark_marked_obj_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: Queued root: %p :: %p (bits: %d)\n of type ", - (void*)data, (void*)data->obj, (void*)data->tag, (int)data->bits); - jl_((void*)data->tag); - isroot = 1; - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_finlist]) { - gc_mark_finlist_t *data = gc_repush_markdata(&sp, gc_mark_finlist_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: Finalizer list from %p to %p\n", - (void*)data, (void*)data->begin, (void*)data->end); - isroot = 1; - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_objarray]) { - gc_mark_objarray_t *data = gc_repush_markdata(&sp, gc_mark_objarray_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: %s Array in object %p :: %p -- [%p, %p)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (void*)data->begin, (void*)data->end); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_obj8]) { - gc_mark_obj8_t *data = gc_repush_markdata(&sp, gc_mark_obj8_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); - uint8_t *desc = (uint8_t*)jl_dt_layout_ptrs(vt->layout); - jl_safe_printf("%p: %s Object (8bit) %p :: %p -- [%d, %d)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (int)(data->begin - desc), (int)(data->end - desc)); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_obj16]) { - gc_mark_obj16_t *data = gc_repush_markdata(&sp, gc_mark_obj16_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); - uint16_t *desc = (uint16_t*)jl_dt_layout_ptrs(vt->layout); - jl_safe_printf("%p: %s Object (16bit) %p :: %p -- [%d, %d)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (int)(data->begin - desc), (int)(data->end - desc)); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_obj32]) { - gc_mark_obj32_t *data = gc_repush_markdata(&sp, gc_mark_obj32_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); - uint32_t *desc = (uint32_t*)jl_dt_layout_ptrs(vt->layout); - jl_safe_printf("%p: %s Object (32bit) %p :: %p -- [%d, %d)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (int)(data->begin - desc), 
(int)(data->end - desc)); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_stack]) { - gc_mark_stackframe_t *data = gc_repush_markdata(&sp, gc_mark_stackframe_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: %s Stack frame %p -- %d of %d (%s)\n", - (void*)data, prefix, (void*)data->s, (int)data->i, - (int)data->nroots >> 1, - (data->nroots & 1) ? "indirect" : "direct"); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_module_binding]) { - // module_binding - gc_mark_binding_t *data = gc_repush_markdata(&sp, gc_mark_binding_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: %s Module (bindings) %p (bits %d) -- [%p, %p)\n", - (void*)data, prefix, (void*)data->parent, (int)data->bits, - (void*)data->begin, (void*)data->end); - } - else { - jl_safe_printf("Unknown pc %p --- ABORTING !!!\n", pc); - break; - } - } - jl_set_safe_restore(old_buf); -} - static int gc_logging_enabled = 0; JL_DLLEXPORT void jl_enable_gc_logging(int enable) { diff --git a/src/gc-page-profiler.c b/src/gc-page-profiler.c new file mode 100644 index 0000000000000..5af1c3d014770 --- /dev/null +++ b/src/gc-page-profiler.c @@ -0,0 +1,167 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#include "gc-page-profiler.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// whether page profiling is enabled +int page_profile_enabled; +// number of pages written +size_t page_profile_pages_written; +// stream to write page profile to +ios_t *page_profile_stream; +// mutex for page profile +uv_mutex_t page_profile_lock; + +gc_page_profiler_serializer_t gc_page_serializer_create(void) JL_NOTSAFEPOINT +{ + gc_page_profiler_serializer_t serializer; + if (__unlikely(page_profile_enabled)) { + arraylist_new(&serializer.typestrs, GC_PAGE_SZ); + } + else { + serializer.typestrs.len = 0; + } + return serializer; +} + +void gc_page_serializer_init(gc_page_profiler_serializer_t *serializer, + jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT +{ + if (__unlikely(page_profile_enabled)) { + serializer->typestrs.len = 0; + serializer->data = (char *)pg->data; + serializer->osize = pg->osize; + } +} + +void gc_page_serializer_destroy(gc_page_profiler_serializer_t *serializer) JL_NOTSAFEPOINT +{ + if (__unlikely(page_profile_enabled)) { + arraylist_free(&serializer->typestrs); + } +} + +void gc_page_serializer_write(gc_page_profiler_serializer_t *serializer, + const char *str) JL_NOTSAFEPOINT +{ + if (__unlikely(page_profile_enabled)) { + arraylist_push(&serializer->typestrs, (void *)str); + } +} + +void gc_enable_page_profile(void) JL_NOTSAFEPOINT +{ + page_profile_enabled = 1; +} + +void gc_disable_page_profile(void) JL_NOTSAFEPOINT +{ + page_profile_enabled = 0; +} + +int gc_page_profile_is_enabled(void) JL_NOTSAFEPOINT +{ + return page_profile_enabled; +} + +void gc_page_profile_write_preamble(gc_page_profiler_serializer_t *serializer) + JL_NOTSAFEPOINT +{ + if (__unlikely(page_profile_enabled)) { + char str[GC_TYPE_STR_MAXLEN]; + snprintf(str, GC_TYPE_STR_MAXLEN, + "{\"address\": \"%p\",\"object_size\": %d,\"objects\": [", + serializer->data, serializer->osize); + ios_write(page_profile_stream, str, strlen(str)); + } +} + +void gc_page_profile_write_epilogue(gc_page_profiler_serializer_t *serializer) + JL_NOTSAFEPOINT +{ + if (__unlikely(page_profile_enabled)) { + const char *str = "]}"; + 
ios_write(page_profile_stream, str, strlen(str)); + } +} + +void gc_page_profile_write_comma(gc_page_profiler_serializer_t *serializer) JL_NOTSAFEPOINT +{ + if (__unlikely(page_profile_enabled)) { + // write comma if not first page + if (page_profile_pages_written > 0) { + const char *str = ","; + ios_write(page_profile_stream, str, strlen(str)); + } + } +} + +void gc_page_profile_write_to_file(gc_page_profiler_serializer_t *serializer) + JL_NOTSAFEPOINT +{ + if (__unlikely(page_profile_enabled)) { + // write to file + uv_mutex_lock(&page_profile_lock); + gc_page_profile_write_comma(serializer); + gc_page_profile_write_preamble(serializer); + char str[GC_TYPE_STR_MAXLEN]; + for (size_t i = 0; i < serializer->typestrs.len; i++) { + const char *name = (const char *)serializer->typestrs.items[i]; + if (name == GC_SERIALIZER_EMPTY) { + snprintf(str, GC_TYPE_STR_MAXLEN, "\"empty\","); + } + else if (name == GC_SERIALIZER_GARBAGE) { + snprintf(str, GC_TYPE_STR_MAXLEN, "\"garbage\","); + } + else { + snprintf(str, GC_TYPE_STR_MAXLEN, "\"%s\",", name); + } + // remove trailing comma for last element + if (i == serializer->typestrs.len - 1) { + str[strlen(str) - 1] = '\0'; + } + ios_write(page_profile_stream, str, strlen(str)); + } + gc_page_profile_write_epilogue(serializer); + page_profile_pages_written++; + uv_mutex_unlock(&page_profile_lock); + } +} + +void gc_page_profile_write_json_preamble(ios_t *stream) JL_NOTSAFEPOINT +{ + if (__unlikely(page_profile_enabled)) { + uv_mutex_lock(&page_profile_lock); + const char *str = "{\"pages\": ["; + ios_write(stream, str, strlen(str)); + uv_mutex_unlock(&page_profile_lock); + } +} + +void gc_page_profile_write_json_epilogue(ios_t *stream) JL_NOTSAFEPOINT +{ + if (__unlikely(page_profile_enabled)) { + uv_mutex_lock(&page_profile_lock); + const char *str = "]}"; + ios_write(stream, str, strlen(str)); + uv_mutex_unlock(&page_profile_lock); + } +} + +JL_DLLEXPORT void jl_gc_take_page_profile(ios_t *stream) +{ + gc_enable_page_profile(); + page_profile_pages_written = 0; + page_profile_stream = stream; + gc_page_profile_write_json_preamble(stream); + jl_gc_collect(JL_GC_FULL); + gc_page_profile_write_json_epilogue(stream); + gc_disable_page_profile(); +} + +#ifdef __cplusplus +} +#endif diff --git a/src/gc-page-profiler.h b/src/gc-page-profiler.h new file mode 100644 index 0000000000000..b103e23905ba5 --- /dev/null +++ b/src/gc-page-profiler.h @@ -0,0 +1,63 @@ +// This file is a part of Julia. 
License is MIT: https://julialang.org/license + +#ifndef GC_PAGE_PROFILER_H +#define GC_PAGE_PROFILER_H + +#include "gc.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GC_TYPE_STR_MAXLEN (512) + +typedef struct { + arraylist_t typestrs; + char *data; + int osize; +} gc_page_profiler_serializer_t; + +// mutex for page profile +extern uv_mutex_t page_profile_lock; + +// Serializer functions +gc_page_profiler_serializer_t gc_page_serializer_create(void) JL_NOTSAFEPOINT; +void gc_page_serializer_init(gc_page_profiler_serializer_t *serializer, jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT; +void gc_page_serializer_destroy(gc_page_profiler_serializer_t *serializer) JL_NOTSAFEPOINT; +void gc_page_serializer_write(gc_page_profiler_serializer_t *serializer, const char *str) JL_NOTSAFEPOINT; +// Page profile functions +#define GC_SERIALIZER_EMPTY ((const char *)0x1) +#define GC_SERIALIZER_GARBAGE ((const char *)0x2) +STATIC_INLINE void gc_page_profile_write_empty_page(gc_page_profiler_serializer_t *serializer, + int enabled) JL_NOTSAFEPOINT +{ + if (__unlikely(enabled)) { + gc_page_serializer_write(serializer, GC_SERIALIZER_EMPTY); + } +} +STATIC_INLINE void gc_page_profile_write_garbage(gc_page_profiler_serializer_t *serializer, + int enabled) JL_NOTSAFEPOINT +{ + if (__unlikely(enabled)) { + gc_page_serializer_write(serializer, GC_SERIALIZER_GARBAGE); + } +} +STATIC_INLINE void gc_page_profile_write_live_obj(gc_page_profiler_serializer_t *serializer, + jl_taggedvalue_t *v, + int enabled) JL_NOTSAFEPOINT +{ + if (__unlikely(enabled)) { + const char *name = jl_typeof_str(jl_valueof(v)); + gc_page_serializer_write(serializer, name); + } +} +void gc_enable_page_profile(void) JL_NOTSAFEPOINT; +void gc_disable_page_profile(void) JL_NOTSAFEPOINT; +int gc_page_profile_is_enabled(void) JL_NOTSAFEPOINT; +void gc_page_profile_write_to_file(gc_page_profiler_serializer_t *serializer) JL_NOTSAFEPOINT; + +#ifdef __cplusplus +} +#endif + +#endif // GC_PAGE_PROFILER_H diff --git a/src/gc-pages.c b/src/gc-pages.c index d579eb0cd4fbb..f015b5de2295e 100644 --- a/src/gc-pages.c +++ b/src/gc-pages.c @@ -19,7 +19,31 @@ extern "C" { #define MIN_BLOCK_PG_ALLOC (1) // 16 KB static int block_pg_cnt = DEFAULT_BLOCK_PG_ALLOC; -static size_t current_pg_count = 0; +static _Atomic(size_t) current_pg_count = 0; + +// Julia allocates large blocks (64M) with mmap. These are never +// released back but the underlying physical memory may be released +// with calls to madvise(MADV_DONTNEED). +// These large blocks are used to allocated jl_page_size sized +// pages, that are tracked by current_pg_count. +static uint64_t poolmem_bytes_allocated = 0; +static uint64_t poolmem_blocks_allocated_total = 0; + + +JL_DLLEXPORT uint64_t jl_poolmem_blocks_allocated_total(void) +{ + return poolmem_blocks_allocated_total; +} + +JL_DLLEXPORT uint64_t jl_poolmem_bytes_allocated(void) +{ + return poolmem_bytes_allocated; +} + +JL_DLLEXPORT uint64_t jl_current_pg_count(void) +{ + return (uint64_t)jl_atomic_load(¤t_pg_count); +} void jl_gc_init_page(void) { @@ -33,7 +57,7 @@ void jl_gc_init_page(void) // Try to allocate a memory block for multiple pages // Return `NULL` if allocation failed. Result is aligned to `GC_PAGE_SZ`. 
-static char *jl_gc_try_alloc_pages(int pg_cnt) JL_NOTSAFEPOINT +char *jl_gc_try_alloc_pages_(int pg_cnt) JL_NOTSAFEPOINT { size_t pages_sz = GC_PAGE_SZ * pg_cnt; #ifdef _OS_WINDOWS_ @@ -48,6 +72,12 @@ static char *jl_gc_try_alloc_pages(int pg_cnt) JL_NOTSAFEPOINT MAP_NORESERVE | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) return NULL; + poolmem_bytes_allocated += pages_sz; + poolmem_blocks_allocated_total++; + +#ifdef MADV_NOHUGEPAGE + madvise(mem, pages_sz, MADV_NOHUGEPAGE); +#endif #endif if (GC_PAGE_SZ > jl_page_size) // round data pointer up to the nearest gc_page_data-aligned @@ -63,13 +93,12 @@ static char *jl_gc_try_alloc_pages(int pg_cnt) JL_NOTSAFEPOINT // smaller `MIN_BLOCK_PG_ALLOC` a `jl_memory_exception` is thrown. // Assumes `gc_perm_lock` is acquired, the lock is released before the // exception is thrown. -static jl_gc_pagemeta_t *jl_gc_alloc_new_page(void) JL_NOTSAFEPOINT +char *jl_gc_try_alloc_pages(void) JL_NOTSAFEPOINT { - // try to allocate a large block of memory (or a small one) - unsigned pg, pg_cnt = block_pg_cnt; + unsigned pg_cnt = block_pg_cnt; char *mem = NULL; while (1) { - if (__likely((mem = jl_gc_try_alloc_pages(pg_cnt)))) + if (__likely((mem = jl_gc_try_alloc_pages_(pg_cnt)))) break; size_t min_block_pg_alloc = MIN_BLOCK_PG_ALLOC; if (GC_PAGE_SZ * min_block_pg_alloc < jl_page_size) @@ -86,204 +115,71 @@ static jl_gc_pagemeta_t *jl_gc_alloc_new_page(void) JL_NOTSAFEPOINT jl_throw(jl_memory_exception); } } - - // now need to insert these pages into the pagetable metadata - // if any allocation fails, this just stops recording more pages from that point - // and will free (munmap) the remainder - jl_gc_pagemeta_t *page_meta = - (jl_gc_pagemeta_t*)jl_gc_perm_alloc_nolock(pg_cnt * sizeof(jl_gc_pagemeta_t), 1, - sizeof(void*), 0); - pg = 0; - if (page_meta) { - for (; pg < pg_cnt; pg++) { - struct jl_gc_metadata_ext info; - uint32_t msk; - unsigned i; - pagetable1_t **ppagetable1; - pagetable0_t **ppagetable0; - jl_gc_pagemeta_t **pmeta; - - char *ptr = mem + (GC_PAGE_SZ * pg); - page_meta[pg].data = ptr; - - // create & store the level 2 / outermost info - i = REGION_INDEX(ptr); - info.pagetable_i = i % 32; - info.pagetable_i32 = i / 32; - msk = (1u << info.pagetable_i); - if ((memory_map.freemap1[info.pagetable_i32] & msk) == 0) - memory_map.freemap1[info.pagetable_i32] |= msk; // has free - info.pagetable1 = *(ppagetable1 = &memory_map.meta1[i]); - if (!info.pagetable1) { - info.pagetable1 = (pagetable1_t*)jl_gc_perm_alloc_nolock(sizeof(pagetable1_t), 1, - sizeof(void*), 0); - *ppagetable1 = info.pagetable1; - if (!info.pagetable1) - break; - } - - // create & store the level 1 info - i = REGION1_INDEX(ptr); - info.pagetable1_i = i % 32; - info.pagetable1_i32 = i / 32; - msk = (1u << info.pagetable1_i); - if ((info.pagetable1->freemap0[info.pagetable1_i32] & msk) == 0) - info.pagetable1->freemap0[info.pagetable1_i32] |= msk; // has free - info.pagetable0 = *(ppagetable0 = &info.pagetable1->meta0[i]); - if (!info.pagetable0) { - info.pagetable0 = (pagetable0_t*)jl_gc_perm_alloc_nolock(sizeof(pagetable0_t), 1, - sizeof(void*), 0); - *ppagetable0 = info.pagetable0; - if (!info.pagetable0) - break; - } - - // create & store the level 0 / page info - i = REGION0_INDEX(ptr); - info.pagetable0_i = i % 32; - info.pagetable0_i32 = i / 32; - msk = (1u << info.pagetable0_i); - info.pagetable0->freemap[info.pagetable0_i32] |= msk; // is free - pmeta = &info.pagetable0->meta[i]; - info.meta = (*pmeta = &page_meta[pg]); - } - } - - if (pg < pg_cnt) { 
-#ifndef _OS_WINDOWS_ - // Trim the allocation to only cover the region - // that we successfully created the metadata for. - // This is not supported by the Windows kernel, - // so we have to just skip it there and just lose these virtual addresses. - munmap(mem + LLT_ALIGN(GC_PAGE_SZ * pg, jl_page_size), - GC_PAGE_SZ * pg_cnt - LLT_ALIGN(GC_PAGE_SZ * pg, jl_page_size)); -#endif - if (pg == 0) { - uv_mutex_unlock(&gc_perm_lock); - jl_throw(jl_memory_exception); - } - } - return page_meta; + return mem; } // get a new page, either from the freemap // or from the kernel if none are available NOINLINE jl_gc_pagemeta_t *jl_gc_alloc_page(void) JL_NOTSAFEPOINT { - struct jl_gc_metadata_ext info; - uv_mutex_lock(&gc_perm_lock); - int last_errno = errno; #ifdef _OS_WINDOWS_ DWORD last_error = GetLastError(); #endif - // scan over memory_map page-table for existing allocated but unused pages - for (info.pagetable_i32 = memory_map.lb; info.pagetable_i32 < (REGION2_PG_COUNT + 31) / 32; info.pagetable_i32++) { - uint32_t freemap1 = memory_map.freemap1[info.pagetable_i32]; - for (info.pagetable_i = 0; freemap1; info.pagetable_i++, freemap1 >>= 1) { - unsigned next = ffs_u32(freemap1); - info.pagetable_i += next; - freemap1 >>= next; - info.pagetable1 = memory_map.meta1[info.pagetable_i + info.pagetable_i32 * 32]; - // repeat over page-table level 1 - for (info.pagetable1_i32 = info.pagetable1->lb; info.pagetable1_i32 < REGION1_PG_COUNT / 32; info.pagetable1_i32++) { - uint32_t freemap0 = info.pagetable1->freemap0[info.pagetable1_i32]; - for (info.pagetable1_i = 0; freemap0; info.pagetable1_i++, freemap0 >>= 1) { - unsigned next = ffs_u32(freemap0); - info.pagetable1_i += next; - freemap0 >>= next; - info.pagetable0 = info.pagetable1->meta0[info.pagetable1_i + info.pagetable1_i32 * 32]; - // repeat over page-table level 0 - for (info.pagetable0_i32 = info.pagetable0->lb; info.pagetable0_i32 < REGION0_PG_COUNT / 32; info.pagetable0_i32++) { - uint32_t freemap = info.pagetable0->freemap[info.pagetable0_i32]; - if (freemap) { - info.pagetable0_i = ffs_u32(freemap); - info.meta = info.pagetable0->meta[info.pagetable0_i + info.pagetable0_i32 * 32]; - assert(info.meta->data); - // new pages available starting at min of lb and pagetable_i32 - if (memory_map.lb < info.pagetable_i32) - memory_map.lb = info.pagetable_i32; - if (info.pagetable1->lb < info.pagetable1_i32) - info.pagetable1->lb = info.pagetable1_i32; - if (info.pagetable0->lb < info.pagetable0_i32) - info.pagetable0->lb = info.pagetable0_i32; - goto have_free_page; // break out of all of these loops - } - } - info.pagetable1->freemap0[info.pagetable1_i32] &= ~(uint32_t)(1u << info.pagetable1_i); // record that this was full - } - } - memory_map.freemap1[info.pagetable_i32] &= ~(uint32_t)(1u << info.pagetable_i); // record that this was full - } - } + jl_gc_pagemeta_t *meta = NULL; - // no existing pages found, allocate a new one - { - jl_gc_pagemeta_t *meta = jl_gc_alloc_new_page(); - info = page_metadata_ext(meta->data); - assert(meta == info.meta); - // new pages are now available starting at max of lb and pagetable_i32 - if (memory_map.lb > info.pagetable_i32) - memory_map.lb = info.pagetable_i32; - if (info.pagetable1->lb > info.pagetable1_i32) - info.pagetable1->lb = info.pagetable1_i32; - if (info.pagetable0->lb > info.pagetable0_i32) - info.pagetable0->lb = info.pagetable0_i32; + // try to get page from `pool_clean` + meta = pop_lf_back(&global_page_pool_clean); + if (meta != NULL) { + gc_alloc_map_set(meta->data, GC_PAGE_ALLOCATED); + goto 
exit; } -have_free_page: - // in-use pages are now ending at min of ub and pagetable_i32 - if (memory_map.ub < info.pagetable_i32) - memory_map.ub = info.pagetable_i32; - if (info.pagetable1->ub < info.pagetable1_i32) - info.pagetable1->ub = info.pagetable1_i32; - if (info.pagetable0->ub < info.pagetable0_i32) - info.pagetable0->ub = info.pagetable0_i32; - - // mark this entry as in-use and not free - info.pagetable0->freemap[info.pagetable0_i32] &= ~(uint32_t)(1u << info.pagetable0_i); - info.pagetable0->allocmap[info.pagetable0_i32] |= (uint32_t)(1u << info.pagetable0_i); - info.pagetable1->allocmap0[info.pagetable1_i32] |= (uint32_t)(1u << info.pagetable1_i); - memory_map.allocmap1[info.pagetable_i32] |= (uint32_t)(1u << info.pagetable_i); + // try to get page from `pool_freed` + meta = pop_lf_back(&global_page_pool_freed); + if (meta != NULL) { + gc_alloc_map_set(meta->data, GC_PAGE_ALLOCATED); + goto exit; + } + uv_mutex_lock(&gc_perm_lock); + // another thread may have allocated a large block while we were waiting... + meta = pop_lf_back(&global_page_pool_clean); + if (meta != NULL) { + uv_mutex_unlock(&gc_perm_lock); + gc_alloc_map_set(meta->data, GC_PAGE_ALLOCATED); + goto exit; + } + // must map a new set of pages + char *data = jl_gc_try_alloc_pages(); + meta = (jl_gc_pagemeta_t*)malloc_s(block_pg_cnt * sizeof(jl_gc_pagemeta_t)); + for (int i = 0; i < block_pg_cnt; i++) { + jl_gc_pagemeta_t *pg = &meta[i]; + pg->data = data + GC_PAGE_SZ * i; + gc_alloc_map_maybe_create(pg->data); + if (i == 0) { + gc_alloc_map_set(pg->data, GC_PAGE_ALLOCATED); + } + else { + push_lf_back(&global_page_pool_clean, pg); + } + } + uv_mutex_unlock(&gc_perm_lock); +exit: #ifdef _OS_WINDOWS_ - VirtualAlloc(info.meta->data, GC_PAGE_SZ, MEM_COMMIT, PAGE_READWRITE); -#endif -#ifdef _OS_WINDOWS_ + VirtualAlloc(meta->data, GC_PAGE_SZ, MEM_COMMIT, PAGE_READWRITE); SetLastError(last_error); #endif errno = last_errno; - current_pg_count++; - gc_final_count_page(current_pg_count); - uv_mutex_unlock(&gc_perm_lock); - return info.meta; + jl_atomic_fetch_add(&current_pg_count, 1); + return meta; } // return a page to the freemap allocator -void jl_gc_free_page(void *p) JL_NOTSAFEPOINT +void jl_gc_free_page(jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT { - // update the allocmap and freemap to indicate this contains a free entry - struct jl_gc_metadata_ext info = page_metadata_ext(p); - uint32_t msk; - msk = (uint32_t)(1u << info.pagetable0_i); - assert(!(info.pagetable0->freemap[info.pagetable0_i32] & msk)); - assert(info.pagetable0->allocmap[info.pagetable0_i32] & msk); - info.pagetable0->allocmap[info.pagetable0_i32] &= ~msk; - info.pagetable0->freemap[info.pagetable0_i32] |= msk; - - msk = (uint32_t)(1u << info.pagetable1_i); - assert(info.pagetable1->allocmap0[info.pagetable1_i32] & msk); - if ((info.pagetable1->freemap0[info.pagetable1_i32] & msk) == 0) - info.pagetable1->freemap0[info.pagetable1_i32] |= msk; - - msk = (uint32_t)(1u << info.pagetable_i); - assert(memory_map.allocmap1[info.pagetable_i32] & msk); - if ((memory_map.freemap1[info.pagetable_i32] & msk) == 0) - memory_map.freemap1[info.pagetable_i32] |= msk; - - free(info.meta->ages); - info.meta->ages = NULL; - + void *p = pg->data; + gc_alloc_map_set((char*)p, GC_PAGE_FREED); // tell the OS we don't need these pages right now size_t decommit_size = GC_PAGE_SZ; if (GC_PAGE_SZ < jl_page_size) { @@ -293,16 +189,15 @@ void jl_gc_free_page(void *p) JL_NOTSAFEPOINT void *otherp = (void*)((uintptr_t)p & ~(jl_page_size - 1)); // round down to the nearest physical page
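
/*
 * The rewritten jl_gc_alloc_page()/jl_gc_free_page() above route pages through
 * the lock-free stacks `global_page_pool_clean`, `global_page_pool_freed` and
 * `global_page_pool_lazily_freed` via push_lf_back()/pop_lf_back(), whose
 * implementation is not part of this hunk. A minimal sketch of the underlying
 * idea -- a Treiber stack threaded through the page-metadata `next` links --
 * using C11 atomics. The names below are illustrative only, and a production
 * version must also defend against ABA (the real GC only recycles pages at
 * controlled points, which sidesteps the worst of it):
 */
#include <stdatomic.h>
#include <stddef.h>

typedef struct page_meta {
    struct page_meta *next;
} page_meta_t;

typedef struct {
    _Atomic(page_meta_t *) bottom; // top of the stack
} page_stack_t;

static void page_stack_push(page_stack_t *s, page_meta_t *pg)
{
    page_meta_t *old = atomic_load_explicit(&s->bottom, memory_order_relaxed);
    do {
        pg->next = old; // link the new node above the current head
    } while (!atomic_compare_exchange_weak_explicit(
                 &s->bottom, &old, pg,
                 memory_order_release, memory_order_relaxed));
}

static page_meta_t *page_stack_pop(page_stack_t *s)
{
    page_meta_t *old = atomic_load_explicit(&s->bottom, memory_order_acquire);
    do {
        if (old == NULL)
            return NULL; // pool is empty; caller falls back to mapping fresh pages
    } while (!atomic_compare_exchange_weak_explicit(
                 &s->bottom, &old, old->next,
                 memory_order_acquire, memory_order_acquire));
    return old;
}
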
p = otherp; while (n_pages--) { - struct jl_gc_metadata_ext info = page_metadata_ext(otherp); - msk = (uint32_t)(1u << info.pagetable0_i); - if (info.pagetable0->allocmap[info.pagetable0_i32] & msk) - goto no_decommit; + if (gc_alloc_map_is_set((char*)otherp)) { + return; + } otherp = (void*)((char*)otherp + GC_PAGE_SZ); } } #ifdef _OS_WINDOWS_ VirtualFree(p, decommit_size, MEM_DECOMMIT); -#elif defined(MADV_FREE) +#elif 0 static int supports_madv_free = 1; if (supports_madv_free) { if (madvise(p, decommit_size, MADV_FREE) == -1) { @@ -316,20 +211,8 @@ void jl_gc_free_page(void *p) JL_NOTSAFEPOINT #else madvise(p, decommit_size, MADV_DONTNEED); #endif - /* TODO: Should we leave this poisoned and rather allow the GC to read poisoned pointers from - * the page when it sweeps pools? - */ msan_unpoison(p, decommit_size); - -no_decommit: - // new pages are now available starting at max of lb and pagetable_i32 - if (memory_map.lb > info.pagetable_i32) - memory_map.lb = info.pagetable_i32; - if (info.pagetable1->lb > info.pagetable1_i32) - info.pagetable1->lb = info.pagetable1_i32; - if (info.pagetable0->lb > info.pagetable0_i32) - info.pagetable0->lb = info.pagetable0_i32; - current_pg_count--; + jl_atomic_fetch_add(&current_pg_count, -1); } #ifdef __cplusplus diff --git a/src/gc-stacks.c b/src/gc-stacks.c index b35c1722c82ff..693cb8d0eadf0 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -61,6 +61,9 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT munmap(stk, bufsz); return MAP_FAILED; } +#ifdef MADV_NOHUGEPAGE + madvise(stk, bufsz, MADV_NOHUGEPAGE); +#endif #endif jl_atomic_fetch_add(&num_stack_mappings, 1); return stk; @@ -73,6 +76,10 @@ static void free_stack(void *stkbuf, size_t bufsz) } #endif +JL_DLLEXPORT uint32_t jl_get_num_stack_mappings(void) +{ + return jl_atomic_load_relaxed(&num_stack_mappings); +} const unsigned pool_sizes[] = { 128 * 1024, @@ -112,7 +119,7 @@ static void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) if (bufsz <= pool_sizes[JL_N_STACK_POOLS - 1]) { unsigned pool_id = select_pool(bufsz); if (pool_sizes[pool_id] == bufsz) { - arraylist_push(&ptls->heap.free_stacks[pool_id], stkbuf); + small_arraylist_push(&ptls->heap.free_stacks[pool_id], stkbuf); return; } } @@ -141,7 +148,7 @@ void jl_release_task_stack(jl_ptls_t ptls, jl_task_t *task) #ifdef _COMPILER_ASAN_ENABLED_ __asan_unpoison_stack_memory((uintptr_t)stkbuf, bufsz); #endif - arraylist_push(&ptls->heap.free_stacks[pool_id], stkbuf); + small_arraylist_push(&ptls->heap.free_stacks[pool_id], stkbuf); } } } @@ -156,9 +163,9 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO if (ssize <= pool_sizes[JL_N_STACK_POOLS - 1]) { unsigned pool_id = select_pool(ssize); ssize = pool_sizes[pool_id]; - arraylist_t *pool = &ptls->heap.free_stacks[pool_id]; + small_arraylist_t *pool = &ptls->heap.free_stacks[pool_id]; if (pool->len > 0) { - stk = arraylist_pop(pool); + stk = small_arraylist_pop(pool); } } else { @@ -177,8 +184,8 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO } *bufsz = ssize; if (owner) { - arraylist_t *live_tasks = &ptls->heap.live_tasks; - arraylist_push(live_tasks, owner); + small_arraylist_t *live_tasks = &ptls->heap.live_tasks; + mtarraylist_push(live_tasks, owner); } return stk; } @@ -202,7 +209,7 @@ void sweep_stack_pools(void) // free half of stacks that remain unused since last sweep for (int p = 0; p < JL_N_STACK_POOLS; p++) { - arraylist_t *al = &ptls2->heap.free_stacks[p]; + small_arraylist_t *al =
&ptls2->heap.free_stacks[p]; size_t n_to_free; if (al->len > MIN_STACK_MAPPINGS_PER_POOL) { n_to_free = al->len / 2; @@ -213,12 +220,12 @@ void sweep_stack_pools(void) n_to_free = 0; } for (int n = 0; n < n_to_free; n++) { - void *stk = arraylist_pop(al); + void *stk = small_arraylist_pop(al); free_stack(stk, pool_sizes[p]); } } - arraylist_t *live_tasks = &ptls2->heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->heap.live_tasks; size_t n = 0; size_t ndel = 0; size_t l = live_tasks->len; @@ -261,24 +268,52 @@ void sweep_stack_pools(void) JL_DLLEXPORT jl_array_t *jl_live_tasks(void) { - jl_task_t *ct = jl_current_task; - jl_ptls_t ptls = ct->ptls; - arraylist_t *live_tasks = &ptls->heap.live_tasks; - size_t i, j, l; - jl_array_t *a; - do { - l = live_tasks->len; - a = jl_alloc_vec_any(l + 1); // may gc, changing the number of tasks - } while (l + 1 < live_tasks->len); - l = live_tasks->len; - void **lst = live_tasks->items; - j = 0; - ((void**)jl_array_data(a))[j++] = ptls->root_task; - for (i = 0; i < l; i++) { - if (((jl_task_t*)lst[i])->stkbuf != NULL) - ((void**)jl_array_data(a))[j++] = lst[i]; + size_t nthreads = jl_atomic_load_acquire(&jl_n_threads); + jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states); + size_t l = 0; // l is not reset on restart, so we keep getting more aggressive at making a big enough list every time it fails +restart: + for (size_t i = 0; i < nthreads; i++) { + // skip GC threads since they don't have tasks + if (gc_first_tid <= i && i < gc_first_tid + jl_n_gcthreads) { + continue; + } + jl_ptls_t ptls2 = allstates[i]; + if (ptls2 == NULL) + continue; + small_arraylist_t *live_tasks = &ptls2->heap.live_tasks; + size_t n = mtarraylist_length(live_tasks); + l += n + (ptls2->root_task->stkbuf != NULL); + } + l += l / 20; // add 5% for margin of estimation error + jl_array_t *a = jl_alloc_vec_any(l); // may gc, changing the number of tasks and forcing us to reload everything + nthreads = jl_atomic_load_acquire(&jl_n_threads); + allstates = jl_atomic_load_relaxed(&jl_all_tls_states); + size_t j = 0; + for (size_t i = 0; i < nthreads; i++) { + // skip GC threads since they don't have tasks + if (gc_first_tid <= i && i < gc_first_tid + jl_n_gcthreads) { + continue; + } + jl_ptls_t ptls2 = allstates[i]; + if (ptls2 == NULL) + continue; + jl_task_t *t = ptls2->root_task; + if (t->stkbuf != NULL) { + if (j == l) + goto restart; + ((void**)jl_array_data(a))[j++] = t; + } + small_arraylist_t *live_tasks = &ptls2->heap.live_tasks; + size_t n = mtarraylist_length(live_tasks); + for (size_t i = 0; i < n; i++) { + jl_task_t *t = (jl_task_t*)mtarraylist_get(live_tasks, i); + if (t->stkbuf != NULL) { + if (j == l) + goto restart; + ((void**)jl_array_data(a))[j++] = t; + } + } } - l = jl_array_len(a); if (j < l) { JL_GC_PUSH1(&a); jl_array_del_end(a, l - j); diff --git a/src/gc.c b/src/gc.c index 1df6db4986a2f..e8cfae27de0da 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1,6 +1,7 @@ // This file is a part of Julia.
License is MIT: https://julialang.org/license #include "gc.h" +#include "gc-page-profiler.h" #include "julia_gcext.h" #include "julia_assert.h" #ifdef __GLIBC__ @@ -11,6 +12,22 @@ extern "C" { #endif +// Number of threads currently running the GC mark-loop +_Atomic(int) gc_n_threads_marking; +// Number of threads sweeping +_Atomic(int) gc_n_threads_sweeping; +// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping +_Atomic(jl_gc_page_stack_t *) gc_allocd_scratch; +// `tid` of mutator thread that triggered GC +_Atomic(int) gc_master_tid; +// `tid` of first GC thread +int gc_first_tid; +// Mutex/cond used to synchronize sleep/wakeup of GC threads +uv_mutex_t gc_threads_lock; +uv_cond_t gc_threads_cond; +// Mutex used to coordinate entry of GC threads in the mark loop +uv_mutex_t gc_queue_observer_lock; + // Linked list of callback functions typedef void (*jl_gc_cb_func_t)(void); @@ -26,6 +43,7 @@ static jl_gc_callback_list_t *gc_cblist_pre_gc; static jl_gc_callback_list_t *gc_cblist_post_gc; static jl_gc_callback_list_t *gc_cblist_notify_external_alloc; static jl_gc_callback_list_t *gc_cblist_notify_external_free; +static jl_gc_callback_list_t *gc_cblist_notify_gc_pressure; #define gc_invoke_callbacks(ty, list, args) \ do { \ @@ -112,16 +130,13 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_fre jl_gc_deregister_callback(&gc_cblist_notify_external_free, (jl_gc_cb_func_t)cb); } -// Save/restore local mark stack to/from thread-local storage. - -STATIC_INLINE void export_gc_state(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) { - ptls->gc_mark_sp = *sp; -} - -STATIC_INLINE void import_gc_state(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) { - // Has the stack been reallocated in the meantime? - *sp = ptls->gc_mark_sp; -} +JL_DLLEXPORT void jl_gc_set_cb_notify_gc_pressure(jl_gc_cb_notify_gc_pressure_t cb, int enable) + { + if (enable) + jl_gc_register_callback(&gc_cblist_notify_gc_pressure, (jl_gc_cb_func_t)cb); + else + jl_gc_deregister_callback(&gc_cblist_notify_gc_pressure, (jl_gc_cb_func_t)cb); + } // Protect all access to `finalizer_list_marked` and `to_finalize`. // For accessing `ptls->finalizers`, the lock is needed if a thread @@ -179,8 +194,6 @@ JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) return jl_buff_tag; } -pagetable_t memory_map; - // List of marked big objects. Not per-thread. Accessed only by master thread. bigval_t *big_objects_marked = NULL; @@ -327,16 +340,16 @@ void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads) jl_wake_libuv(); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - // This acquire load pairs with the release stores - // in the signal handler of safepoint so we are sure that - // all the stores on those threads are visible. - // We're currently also using atomic store release in mutator threads - // (in jl_gc_state_set), but we may want to use signals to flush the - // memory operations on those threads lazily instead. - while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) - jl_cpu_pause(); // yield? + if (ptls2 != NULL) { + // This acquire load pairs with the release stores + // in the signal handler of safepoint so we are sure that + // all the stores on those threads are visible. + // We're currently also using atomic store release in mutator threads + // (in jl_gc_state_set), but we may want to use signals to flush the + // memory operations on those threads lazily instead. 
+ while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) + jl_cpu_pause(); // yield? + } } } @@ -644,7 +657,7 @@ void jl_gc_run_all_finalizers(jl_task_t *ct) schedule_all_finalizers(&finalizer_list_marked); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) + if (ptls2 != NULL) schedule_all_finalizers(&ptls2->finalizers); } // unlock here because `run_finalizers` locks this @@ -720,7 +733,7 @@ JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o) gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) + if (ptls2 != NULL) finalize_object(&ptls2->finalizers, o, &copied_list, jl_atomic_load_relaxed(&ct->tid) != i); } finalize_object(&finalizer_list_marked, o, &copied_list, 0); @@ -760,12 +773,13 @@ static void gc_sweep_foreign_objs(void) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) + if (ptls2 != NULL) gc_sweep_foreign_objs_in_list(&ptls2->sweep_objs); } } // GC knobs and self-measurement variables +static int under_memory_pressure; static int64_t last_gc_total_bytes = 0; // max_total_memory is a suggestion. We try very hard to stay @@ -802,7 +816,7 @@ static int mark_reset_age = 0; * * <-[(quick)sweep]- * | - * ----> GC_OLD <--[(quick)sweep && age>promotion]-- + * ----> GC_OLD <--[(quick)sweep]------------------- * | | | * | | GC_MARKED (in remset) | * | | ^ | | @@ -819,9 +833,9 @@ static int mark_reset_age = 0; * ========= above this line objects are old ========= | * | * ----[new]------> GC_CLEAN ------[mark]-----------> GC_MARKED - * | ^ | - * <-[(quick)sweep]--- | | - * --[(quick)sweep && age<=promotion]--- + * | + * <-[(quick)sweep]--- + * */ // A quick sweep is a sweep where `!sweep_full` @@ -835,18 +849,10 @@ static int mark_reset_age = 0; // When a write barrier triggers, the offending marked object is both queued, // so as not to trigger the barrier again, and put in the remset. - -#define PROMOTE_AGE 1 -// this cannot be increased as is without changing : -// - sweep_page which is specialized for 1bit age -// - the size of the age storage in jl_gc_pagemeta_t - - static int64_t scanned_bytes; // young bytes scanned while marking static int64_t perm_scanned_bytes; // old bytes scanned while marking int prev_sweep_full = 1; - -#define inc_sat(v,s) v = (v) >= s ? s : (v)+1 +int current_sweep_full = 0; // Full collection heuristics static int64_t live_bytes = 0; @@ -893,7 +899,7 @@ static void gc_sync_all_caches_nolock(jl_ptls_t ptls) assert(gc_n_threads); for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2) + if (ptls2 != NULL) gc_sync_cache_nolock(ptls, &ptls2->gc_cache); } } @@ -912,21 +918,13 @@ STATIC_INLINE void gc_queue_big_marked(jl_ptls_t ptls, bigval_t *hdr, ptls->gc_cache.nbig_obj = nobj + 1; } -// `gc_setmark_tag` can be called concurrently on multiple threads. -// In all cases, the function atomically sets the mark bits and returns -// the GC bits set as well as if the tag was unchanged by this thread. -// All concurrent calls on the same object are guaranteed to be setting the -// bits to the same value. -// For normal objects, this is the bits with only `GC_MARKED` changed to `1` -// For buffers, this is the bits of the owner object. -// For `mark_reset_age`, this is `GC_MARKED` with `GC_OLD` cleared. 
-// The return value is `1` if the object was not marked before. -// Returning `0` can happen if another thread marked it in parallel. -STATIC_INLINE int gc_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode, - uintptr_t tag, uint8_t *bits) JL_NOTSAFEPOINT -{ - assert(!gc_marked(tag)); +// Atomically set the mark bit for object and return whether it was previously unmarked +FORCE_INLINE int gc_try_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode) JL_NOTSAFEPOINT +{ assert(gc_marked(mark_mode)); + uintptr_t tag = o->header; + if (gc_marked(tag)) + return 0; if (mark_reset_age) { // Reset the object as if it was just allocated mark_mode = GC_MARKED; @@ -938,7 +936,6 @@ STATIC_INLINE int gc_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode, tag = tag | mark_mode; assert((tag & 0x3) == mark_mode); } - *bits = mark_mode; tag = jl_atomic_exchange_relaxed((_Atomic(uintptr_t)*)&o->header, tag); verify_val(jl_valueof(o)); return !gc_marked(tag); @@ -949,7 +946,7 @@ STATIC_INLINE int gc_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode, STATIC_INLINE void gc_setmark_big(jl_ptls_t ptls, jl_taggedvalue_t *o, uint8_t mark_mode) JL_NOTSAFEPOINT { - assert(!page_metadata(o)); + assert(!gc_alloc_map_is_set((char*)o)); bigval_t *hdr = bigval_header(o); if (mark_mode == GC_OLD_MARKED) { ptls->gc_cache.perm_scanned_bytes += hdr->sz & ~3; @@ -960,9 +957,8 @@ STATIC_INLINE void gc_setmark_big(jl_ptls_t ptls, jl_taggedvalue_t *o, // We can't easily tell if the object is old or being promoted // from the gc bits but if the `age` is `0` then the object // must be already on a young list. - if (mark_reset_age && hdr->age) { + if (mark_reset_age) { // Reset the object as if it was just allocated - hdr->age = 0; gc_queue_big_marked(ptls, hdr, 1); } } @@ -973,13 +969,11 @@ STATIC_INLINE void gc_setmark_big(jl_ptls_t ptls, jl_taggedvalue_t *o, // This function should be called exactly once during marking for each pool // object being marked to update the page metadata. 
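
/*
 * gc_try_setmark_tag() above claims an object for marking with a relaxed
 * atomic exchange: every racing marker writes the same tagged header, so
 * exactly one of them observes the previous value as unmarked and wins the
 * right to scan the object. A minimal model of that claim protocol, assuming
 * C11 atomics; the real function also folds the old-generation/age bits and
 * mark_reset_age into the tag, which this sketch elides:
 */
#include <stdatomic.h>
#include <stdint.h>

#define MARK_BIT ((uintptr_t)1) // stand-in for GC_MARKED in the low tag bits

static int try_claim(_Atomic(uintptr_t) *header)
{
    uintptr_t tag = atomic_load_explicit(header, memory_order_relaxed);
    if (tag & MARK_BIT)
        return 0; // already marked; cheap early exit
    // The exchange is idempotent: all racing threads store the same value,
    // so only the thread that swapped out an unmarked tag "wins" the object.
    uintptr_t prev = atomic_exchange_explicit(header, tag | MARK_BIT,
                                              memory_order_relaxed);
    return (prev & MARK_BIT) == 0;
}
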
STATIC_INLINE void gc_setmark_pool_(jl_ptls_t ptls, jl_taggedvalue_t *o, - uint8_t mark_mode, - jl_gc_pagemeta_t *page) JL_NOTSAFEPOINT + uint8_t mark_mode, jl_gc_pagemeta_t *page) JL_NOTSAFEPOINT { #ifdef MEMDEBUG gc_setmark_big(ptls, o, mark_mode); #else - jl_assume(page); if (mark_mode == GC_OLD_MARKED) { ptls->gc_cache.perm_scanned_bytes += page->osize; static_assert(sizeof(_Atomic(uint16_t)) == sizeof(page->nold), ""); @@ -989,10 +983,6 @@ STATIC_INLINE void gc_setmark_pool_(jl_ptls_t ptls, jl_taggedvalue_t *o, ptls->gc_cache.scanned_bytes += page->osize; if (mark_reset_age) { page->has_young = 1; - char *page_begin = gc_page_data(o) + GC_PAGE_OFFSET; - int obj_id = (((char*)o) - page_begin) / page->osize; - uint8_t *ages = page->ages + obj_id / 8; - jl_atomic_fetch_and_relaxed((_Atomic(uint8_t)*)ages, ~(1 << (obj_id % 8))); } } objprofile_count(jl_typeof(jl_valueof(o)), @@ -1004,7 +994,7 @@ STATIC_INLINE void gc_setmark_pool_(jl_ptls_t ptls, jl_taggedvalue_t *o, STATIC_INLINE void gc_setmark_pool(jl_ptls_t ptls, jl_taggedvalue_t *o, uint8_t mark_mode) JL_NOTSAFEPOINT { - gc_setmark_pool_(ptls, o, mark_mode, page_metadata(o)); + gc_setmark_pool_(ptls, o, mark_mode, page_metadata((char*)o)); } STATIC_INLINE void gc_setmark(jl_ptls_t ptls, jl_taggedvalue_t *o, @@ -1021,18 +1011,15 @@ STATIC_INLINE void gc_setmark(jl_ptls_t ptls, jl_taggedvalue_t *o, STATIC_INLINE void gc_setmark_buf_(jl_ptls_t ptls, void *o, uint8_t mark_mode, size_t minsz) JL_NOTSAFEPOINT { jl_taggedvalue_t *buf = jl_astaggedvalue(o); - uintptr_t tag = buf->header; - if (gc_marked(tag)) - return; - uint8_t bits; + uint8_t bits = (gc_old(buf->header) && !mark_reset_age) ? GC_OLD_MARKED : GC_MARKED; // If the object is larger than the max pool size it can't be a pool object. // This should be accurate most of the time but there might be corner cases // where the size estimate is a little off so we do a pool lookup to make // sure.
- if (__likely(gc_setmark_tag(buf, mark_mode, tag, &bits)) && !gc_verifying) { + if (__likely(gc_try_setmark_tag(buf, mark_mode)) && !gc_verifying) { if (minsz <= GC_MAX_SZCLASS) { jl_gc_pagemeta_t *page = page_metadata(buf); - if (page) { + if (page != NULL) { gc_setmark_pool_(ptls, buf, bits, page); return; } @@ -1046,38 +1033,7 @@ void gc_setmark_buf(jl_ptls_t ptls, void *o, uint8_t mark_mode, size_t minsz) JL gc_setmark_buf_(ptls, o, mark_mode, minsz); } -void jl_gc_force_mark_old(jl_ptls_t ptls, jl_value_t *v) JL_NOTSAFEPOINT -{ - jl_taggedvalue_t *o = jl_astaggedvalue(v); - jl_datatype_t *dt = (jl_datatype_t*)jl_typeof(v); - size_t dtsz = jl_datatype_size(dt); - if (o->bits.gc == GC_OLD_MARKED) - return; - o->bits.gc = GC_OLD_MARKED; - if (dt == jl_simplevector_type) { - size_t l = jl_svec_len(v); - dtsz = l * sizeof(void*) + sizeof(jl_svec_t); - } - else if (dt->name == jl_array_typename) { - jl_array_t *a = (jl_array_t*)v; - if (!a->flags.pooled) - dtsz = GC_MAX_SZCLASS + 1; - } - else if (dt == jl_module_type) { - dtsz = sizeof(jl_module_t); - } - else if (dt == jl_task_type) { - dtsz = sizeof(jl_task_t); - } - else if (dt == jl_symbol_type) { - return; - } - gc_setmark(ptls, o, GC_OLD_MARKED, dtsz); - if (dt->layout->npointers != 0) - jl_gc_queue_root(v); -} - -static inline void maybe_collect(jl_ptls_t ptls) +STATIC_INLINE void maybe_collect(jl_ptls_t ptls) { if (jl_atomic_load_relaxed(&ptls->gc_num.allocd) >= 0 || jl_gc_debug_check_other()) { jl_gc_collect(JL_GC_AUTO); @@ -1095,7 +1051,7 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type); wr->value = value; // NOTE: wb not needed here - arraylist_push(&ptls->heap.weak_refs, wr); + small_arraylist_push(&ptls->heap.weak_refs, wr); return wr; } @@ -1104,14 +1060,14 @@ static void clear_weak_refs(void) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - size_t n, l = ptls2->heap.weak_refs.len; - void **lst = ptls2->heap.weak_refs.items; - for (n = 0; n < l; n++) { - jl_weakref_t *wr = (jl_weakref_t*)lst[n]; - if (!gc_marked(jl_astaggedvalue(wr->value)->bits.gc)) - wr->value = (jl_value_t*)jl_nothing; + if (ptls2 != NULL) { + size_t n, l = ptls2->heap.weak_refs.len; + void **lst = ptls2->heap.weak_refs.items; + for (n = 0; n < l; n++) { + jl_weakref_t *wr = (jl_weakref_t*)lst[n]; + if (!gc_marked(jl_astaggedvalue(wr->value)->bits.gc)) + wr->value = (jl_value_t*)jl_nothing; + } } } } @@ -1121,27 +1077,27 @@ static void sweep_weak_refs(void) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - size_t n = 0; - size_t ndel = 0; - size_t l = ptls2->heap.weak_refs.len; - void **lst = ptls2->heap.weak_refs.items; - if (l == 0) - continue; - while (1) { - jl_weakref_t *wr = (jl_weakref_t*)lst[n]; - if (gc_marked(jl_astaggedvalue(wr)->bits.gc)) - n++; - else - ndel++; - if (n >= l - ndel) - break; - void *tmp = lst[n]; - lst[n] = lst[n + ndel]; - lst[n + ndel] = tmp; + if (ptls2 != NULL) { + size_t n = 0; + size_t ndel = 0; + size_t l = ptls2->heap.weak_refs.len; + void **lst = ptls2->heap.weak_refs.items; + if (l == 0) + continue; + while (1) { + jl_weakref_t *wr = (jl_weakref_t*)lst[n]; + if (gc_marked(jl_astaggedvalue(wr)->bits.gc)) + n++; + else + ndel++; + if (n >= l - ndel) + break; + void *tmp = lst[n]; + lst[n] = lst[n + ndel]; + lst[n + ndel] = tmp; + } + 
ptls2->heap.weak_refs.len -= ndel; } - ptls2->heap.weak_refs.len -= ndel; } } @@ -1149,7 +1105,7 @@ static void sweep_weak_refs(void) // big value list // Size includes the tag and the tag is not cleared!! -static inline jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) +STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) { maybe_collect(ptls); size_t offs = offsetof(bigval_t, header); @@ -1172,19 +1128,24 @@ static inline jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) memset(v, 0xee, allocsz); #endif v->sz = allocsz; - v->age = 0; gc_big_object_link(v, &ptls->heap.big_objects); return jl_valueof(&v->header); } -// Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code. +// Deprecated version, supported for legacy code. JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz) { jl_value_t *val = jl_gc_big_alloc_inner(ptls, sz); - maybe_record_alloc_to_profile(val, sz, jl_gc_unknown_type_tag); return val; } +// Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code. +JL_DLLEXPORT jl_value_t *jl_gc_big_alloc_instrumented(jl_ptls_t ptls, size_t sz, jl_value_t *type) +{ + jl_value_t *val = jl_gc_big_alloc_inner(ptls, sz); + maybe_record_alloc_to_profile(val, sz, (jl_datatype_t*)type); + return val; +} // This wrapper exists only to prevent `jl_gc_big_alloc_inner` from being inlined into // its callers. We provide an external-facing interface for callers, and inline `jl_gc_big_alloc_inner` @@ -1204,16 +1165,8 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT int old_bits = bits; if (gc_marked(bits)) { pv = &v->next; - int age = v->age; - if (age >= PROMOTE_AGE || bits == GC_OLD_MARKED) { - if (sweep_full || bits == GC_MARKED) { - bits = GC_OLD; - } - } - else { - inc_sat(age, PROMOTE_AGE); - v->age = age; - bits = GC_CLEAN; + if (sweep_full || bits == GC_MARKED) { + bits = GC_OLD; } v->bits.gc = bits; } @@ -1242,9 +1195,8 @@ static void sweep_big(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - sweep_big_list(sweep_full, &ptls2->heap.big_objects); + if (ptls2 != NULL) + sweep_big_list(sweep_full, &ptls2->heap.big_objects); } if (sweep_full) { bigval_t **last_next = sweep_big_list(sweep_full, &big_objects_marked); @@ -1313,9 +1265,15 @@ static void reset_thread_gc_counts(void) JL_NOTSAFEPOINT gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls = gc_all_tls_states[i]; - if (ptls) { - memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); + if (ptls != NULL) { + // don't reset `pool_live_bytes` here jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); + jl_atomic_store_relaxed(&ptls->gc_num.freed, 0); + jl_atomic_store_relaxed(&ptls->gc_num.malloc, 0); + jl_atomic_store_relaxed(&ptls->gc_num.realloc, 0); + jl_atomic_store_relaxed(&ptls->gc_num.poolalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_num.bigalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_num.freecall, 0); } } } @@ -1362,81 +1320,75 @@ static void sweep_malloced_arrays(void) JL_NOTSAFEPOINT assert(gc_n_threads); for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2 == NULL) - continue; - mallocarray_t *ma = ptls2->heap.mallocarrays; - mallocarray_t **pma = &ptls2->heap.mallocarrays; - while (ma != NULL) { - mallocarray_t *nxt = 
ma->next; - int bits = jl_astaggedvalue(ma->a)->bits.gc; - if (gc_marked(bits)) { - pma = &ma->next; - } - else { - *pma = nxt; - assert(ma->a->flags.how == 2); - jl_gc_free_array(ma->a); - ma->next = ptls2->heap.mafreelist; - ptls2->heap.mafreelist = ma; + if (ptls2 != NULL) { + mallocarray_t *ma = ptls2->heap.mallocarrays; + mallocarray_t **pma = &ptls2->heap.mallocarrays; + while (ma != NULL) { + mallocarray_t *nxt = ma->next; + int bits = jl_astaggedvalue(ma->a)->bits.gc; + if (gc_marked(bits)) { + pma = &ma->next; + } + else { + *pma = nxt; + assert(ma->a->flags.how == 2); + jl_gc_free_array(ma->a); + ma->next = ptls2->heap.mafreelist; + ptls2->heap.mafreelist = ma; + } + gc_time_count_mallocd_array(bits); + ma = nxt; } - gc_time_count_mallocd_array(bits); - ma = nxt; } } gc_time_mallocd_array_end(); } // pool allocation -static inline jl_taggedvalue_t *reset_page(jl_ptls_t ptls2, const jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_taggedvalue_t *fl) JL_NOTSAFEPOINT +STATIC_INLINE jl_taggedvalue_t *gc_reset_page(jl_ptls_t ptls2, const jl_gc_pool_t *p, jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT { assert(GC_PAGE_OFFSET >= sizeof(void*)); pg->nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / p->osize; pg->pool_n = p - ptls2->heap.norm_pools; - memset(pg->ages, 0, GC_PAGE_SZ / 8 / p->osize + 1); jl_taggedvalue_t *beg = (jl_taggedvalue_t*)(pg->data + GC_PAGE_OFFSET); - jl_taggedvalue_t *next = (jl_taggedvalue_t*)pg->data; - if (fl == NULL) { - next->next = NULL; - } - else { - // Insert free page after first page. - // This prevents unnecessary fragmentation from multiple pages - // being allocated from at the same time. Instead, objects will - // only ever be allocated from the first object in the list. - // This is specifically being relied on by the implementation - // of jl_gc_internal_obj_base_ptr() so that the function does - // not have to traverse the entire list. - jl_taggedvalue_t *flpage = (jl_taggedvalue_t *)gc_page_data(fl); - next->next = flpage->next; - flpage->next = beg; - beg = fl; - } pg->has_young = 0; pg->has_marked = 0; - pg->fl_begin_offset = -1; - pg->fl_end_offset = -1; pg->prev_nold = 0; pg->nold = 0; + pg->fl_begin_offset = UINT16_MAX; + pg->fl_end_offset = UINT16_MAX; return beg; } +jl_gc_page_stack_t global_page_pool_lazily_freed; +jl_gc_page_stack_t global_page_pool_clean; +jl_gc_page_stack_t global_page_pool_freed; +pagetable_t alloc_map; + // Add a new page to the pool. Discards any pages in `p->newpages` before. -static NOINLINE jl_taggedvalue_t *add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT +static NOINLINE jl_taggedvalue_t *gc_add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT { // Do not pass in `ptls` as argument. This slows down the fast path // in pool_alloc significantly jl_ptls_t ptls = jl_current_task->ptls; - jl_gc_pagemeta_t *pg = jl_gc_alloc_page(); + jl_gc_pagemeta_t *pg = pop_lf_back(&ptls->page_metadata_buffered); + if (pg != NULL) { + gc_alloc_map_set(pg->data, GC_PAGE_ALLOCATED); + } + else { + pg = jl_gc_alloc_page(); + } pg->osize = p->osize; - pg->ages = (uint8_t*)malloc_s(GC_PAGE_SZ / 8 / p->osize + 1); pg->thread_n = ptls->tid; - jl_taggedvalue_t *fl = reset_page(ptls, p, pg, NULL); + set_page_metadata(pg); + push_lf_back(&ptls->page_metadata_allocd, pg); + jl_taggedvalue_t *fl = gc_reset_page(ptls, p, pg); p->newpages = fl; return fl; } // Size includes the tag and the tag is not cleared!! 
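
/*
 * jl_gc_pool_alloc_inner() below keeps two fast paths per size class: pop the
 * head of the pool's freelist, or bump-allocate from the partially used
 * `newpages` page; only when both fail does it fall through to gc_add_page()
 * and, from there, jl_gc_alloc_page(). A simplified sketch of that ordering,
 * reusing names from the diff; page-metadata updates (nfree/has_young) and
 * GC accounting are elided:
 */
static jl_taggedvalue_t *pool_alloc_sketch(jl_gc_pool_t *p, int osize)
{
    // 1. reuse a cell from the freelist left behind by the last sweep
    jl_taggedvalue_t *v = p->freelist;
    if (v != NULL) {
        p->freelist = v->next;
        return v;
    }
    // 2. bump-allocate from the current fresh page, if one is attached
    //    and the next cell still fits inside it
    v = p->newpages;
    if (v == NULL ||
        (char*)v + osize > gc_page_data((char*)v - 1) + GC_PAGE_SZ) {
        // 3. slow path: attach a brand new page and hand out its first cell
        v = gc_add_page(p);
    }
    p->newpages = (jl_taggedvalue_t*)((char*)v + osize);
    return v;
}
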
-static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, +STATIC_INLINE jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, int osize) { // Use the pool offset instead of the pool address as the argument @@ -1450,17 +1402,19 @@ static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset maybe_collect(ptls); jl_atomic_store_relaxed(&ptls->gc_num.allocd, jl_atomic_load_relaxed(&ptls->gc_num.allocd) + osize); + jl_atomic_store_relaxed(&ptls->gc_num.pool_live_bytes, + jl_atomic_load_relaxed(&ptls->gc_num.pool_live_bytes) + osize); jl_atomic_store_relaxed(&ptls->gc_num.poolalloc, jl_atomic_load_relaxed(&ptls->gc_num.poolalloc) + 1); // first try to use the freelist jl_taggedvalue_t *v = p->freelist; - if (v) { + if (v != NULL) { jl_taggedvalue_t *next = v->next; p->freelist = next; if (__unlikely(gc_page_data(v) != gc_page_data(next))) { // we only update pg's fields when the freelist changes page // since pg's metadata is likely not in cache - jl_gc_pagemeta_t *pg = jl_assume(page_metadata(v)); + jl_gc_pagemeta_t *pg = jl_assume(page_metadata_unsafe(v)); assert(pg->osize == p->osize); pg->nfree = 0; pg->has_young = 1; @@ -1474,19 +1428,16 @@ static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset // If there's no pages left or the current page is used up, // we need to use the slow path. char *cur_page = gc_page_data((char*)v - 1); - if (__unlikely(!v || cur_page + GC_PAGE_SZ < (char*)next)) { - if (v) { + if (__unlikely(v == NULL || cur_page + GC_PAGE_SZ < (char*)next)) { + if (v != NULL) { // like the freelist case, // but only update the page metadata when it is full - jl_gc_pagemeta_t *pg = jl_assume(page_metadata((char*)v - 1)); + jl_gc_pagemeta_t *pg = jl_assume(page_metadata_unsafe((char*)v - 1)); assert(pg->osize == p->osize); pg->nfree = 0; pg->has_young = 1; - v = *(jl_taggedvalue_t**)cur_page; } - // Not an else!! - if (!v) - v = add_page(p); + v = gc_add_page(p); next = (jl_taggedvalue_t*)((char*)v + osize); } p->newpages = next; @@ -1494,15 +1445,22 @@ static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset return jl_valueof(v); } -// Instrumented version of jl_gc_pool_alloc_inner, called into by LLVM-generated code. +// Deprecated version, supported for legacy code. JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, int osize) { jl_value_t *val = jl_gc_pool_alloc_inner(ptls, pool_offset, osize); - maybe_record_alloc_to_profile(val, osize, jl_gc_unknown_type_tag); return val; } +// Instrumented version of jl_gc_pool_alloc_inner, called into by LLVM-generated code. +JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc_instrumented(jl_ptls_t ptls, int pool_offset, + int osize, jl_value_t* type) +{ + jl_value_t *val = jl_gc_pool_alloc_inner(ptls, pool_offset, osize); + maybe_record_alloc_to_profile(val, osize, (jl_datatype_t*)type); + return val; +} // This wrapper exists only to prevent `jl_gc_pool_alloc_inner` from being inlined into // its callers. 
We provide an external-facing interface for callers, and inline `jl_gc_pool_alloc_inner` @@ -1523,99 +1481,126 @@ int jl_gc_classify_pools(size_t sz, int *osize) // sweep phase -int64_t lazy_freed_pages = 0; +gc_fragmentation_stat_t gc_page_fragmentation_stats[JL_GC_N_POOLS]; +JL_DLLEXPORT double jl_gc_page_utilization_stats[JL_GC_N_MAX_POOLS]; + +STATIC_INLINE void gc_update_page_fragmentation_data(jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT +{ + gc_fragmentation_stat_t *stats = &gc_page_fragmentation_stats[pg->pool_n]; + jl_atomic_fetch_add(&stats->n_freed_objs, pg->nfree); + jl_atomic_fetch_add(&stats->n_pages_allocd, 1); +} + +STATIC_INLINE void gc_dump_page_utilization_data(void) JL_NOTSAFEPOINT +{ + for (int i = 0; i < JL_GC_N_POOLS; i++) { + gc_fragmentation_stat_t *stats = &gc_page_fragmentation_stats[i]; + double utilization = 1.0; + size_t n_freed_objs = jl_atomic_load_relaxed(&stats->n_freed_objs); + size_t n_pages_allocd = jl_atomic_load_relaxed(&stats->n_pages_allocd); + if (n_pages_allocd != 0) { + utilization -= ((double)n_freed_objs * (double)jl_gc_sizeclasses[i]) / (double)n_pages_allocd / (double)GC_PAGE_SZ; + } + jl_gc_page_utilization_stats[i] = utilization; + jl_atomic_store_relaxed(&stats->n_freed_objs, 0); + jl_atomic_store_relaxed(&stats->n_pages_allocd, 0); + } +} + +int64_t buffered_pages = 0; // Returns pointer to terminal pointer of list rooted at *pfl. -static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_taggedvalue_t **pfl, int sweep_full, int osize) JL_NOTSAFEPOINT +static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_gc_page_stack_t *allocd, jl_gc_page_stack_t *buffered, + jl_gc_pagemeta_t *pg, int osize) JL_NOTSAFEPOINT { char *data = pg->data; - uint8_t *ages = pg->ages; - jl_taggedvalue_t *v = (jl_taggedvalue_t*)(data + GC_PAGE_OFFSET); - char *lim = (char*)v + GC_PAGE_SZ - GC_PAGE_OFFSET - osize; + jl_taggedvalue_t *v0 = (jl_taggedvalue_t*)(data + GC_PAGE_OFFSET); + char *lim = data + GC_PAGE_SZ - osize; + char *lim_newpages = data + GC_PAGE_SZ; + if (gc_page_data((char*)p->newpages - 1) == data) { + lim_newpages = (char*)p->newpages; + } size_t old_nfree = pg->nfree; size_t nfree; + // avoid loading a global variable in the hot path + int page_profile_enabled = gc_page_profile_is_enabled(); + gc_page_serializer_init(s, pg); + int re_use_page = 1; + int keep_as_local_buffer = 0; int freedall = 1; int pg_skpd = 1; if (!pg->has_marked) { + re_use_page = 0; // lazy version: (empty) if the whole page was already unused, free it (return it to the pool) // eager version: (freedall) free page as soon as possible // the eager one uses less memory. // FIXME - need to do accounting on a per-thread basis // on quick sweeps, keep a few pages empty but allocated for performance - if (!sweep_full && lazy_freed_pages <= default_collect_interval / GC_PAGE_SZ) { - jl_ptls_t ptls2 = gc_all_tls_states[pg->thread_n]; - jl_taggedvalue_t *begin = reset_page(ptls2, p, pg, p->newpages); - p->newpages = begin; - begin->next = (jl_taggedvalue_t*)0; - lazy_freed_pages++; - } - else { - jl_gc_free_page(data); + if (!current_sweep_full && buffered_pages <= default_collect_interval / GC_PAGE_SZ) { + buffered_pages++; + keep_as_local_buffer = 1; } nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / osize; + gc_page_profile_write_empty_page(s, page_profile_enabled); goto done; } // For quick sweep, we might be able to skip the page if the page doesn't // have any young live cell before marking. 
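
/*
 * Worked example for gc_dump_page_utilization_data() above, assuming
 * GC_PAGE_SZ is 16 KiB (16384 bytes), as in stock Julia builds of this era:
 * for the 64-byte size class, 10 swept pages hold 10 * 16384 = 163840 bytes
 * of capacity; if the sweep freed 1280 objects, the freed share is
 * 1280 * 64 / 163840 = 0.5, so the reported utilization is 1.0 - 0.5 = 0.5,
 * i.e. half of the pages' capacity holds live objects. Note the formula
 * divides by the full GC_PAGE_SZ and ignores the GC_PAGE_OFFSET header
 * slack, so the figure is a close approximation rather than exact.
 */
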
- if (!sweep_full && !pg->has_young) { + if (!current_sweep_full && !pg->has_young) { assert(!prev_sweep_full || pg->prev_nold >= pg->nold); if (!prev_sweep_full || pg->prev_nold == pg->nold) { - // the position of the freelist begin/end in this page - // is stored in its metadata - if (pg->fl_begin_offset != (uint16_t)-1) { - *pfl = page_pfl_beg(pg); - pfl = (jl_taggedvalue_t**)page_pfl_end(pg); - } freedall = 0; nfree = pg->nfree; + gc_page_profile_write_empty_page(s, page_profile_enabled); goto done; } } pg_skpd = 0; - { // scope to avoid clang goto errors + { // scope to avoid clang goto errors int has_marked = 0; int has_young = 0; int16_t prev_nold = 0; int pg_nfree = 0; + jl_taggedvalue_t *fl = NULL; + jl_taggedvalue_t **pfl = &fl; jl_taggedvalue_t **pfl_begin = NULL; - uint8_t msk = 1; // mask for the age bit in the current age byte + // collect page profile + jl_taggedvalue_t *v = v0; + if (page_profile_enabled) { + while ((char*)v <= lim) { + int bits = v->bits.gc; + if (!gc_marked(bits) || (char*)v >= lim_newpages) { + gc_page_profile_write_garbage(s, page_profile_enabled); + } + else { + gc_page_profile_write_live_obj(s, v, page_profile_enabled); + } + v = (jl_taggedvalue_t*)((char*)v + osize); + } + v = v0; + } + // sweep the page while ((char*)v <= lim) { int bits = v->bits.gc; - if (!gc_marked(bits)) { + // if an object is past `lim_newpages` then we can guarantee it's garbage + if (!gc_marked(bits) || (char*)v >= lim_newpages) { *pfl = v; pfl = &v->next; - pfl_begin = pfl_begin ? pfl_begin : pfl; + pfl_begin = (pfl_begin != NULL) ? pfl_begin : pfl; pg_nfree++; - *ages &= ~msk; } else { // marked young or old - if (*ages & msk || bits == GC_OLD_MARKED) { // old enough - // `!age && bits == GC_OLD_MARKED` is possible for - // non-first-class objects like `jl_binding_t` - if (sweep_full || bits == GC_MARKED) { - bits = v->bits.gc = GC_OLD; // promote - } - prev_nold++; - } - else { - assert(bits == GC_MARKED); - bits = v->bits.gc = GC_CLEAN; // unmark - has_young = 1; + if (current_sweep_full || bits == GC_MARKED) { // old enough + bits = v->bits.gc = GC_OLD; // promote } + prev_nold++; has_marked |= gc_marked(bits); - *ages |= msk; freedall = 0; } v = (jl_taggedvalue_t*)((char*)v + osize); - msk <<= 1; - if (!msk) { - msk = 1; - ages++; - } } - assert(!freedall); pg->has_marked = has_marked; pg->has_young = has_young; @@ -1624,12 +1609,12 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t pg->fl_end_offset = (char*)pfl - data; } else { - pg->fl_begin_offset = -1; - pg->fl_end_offset = -1; + pg->fl_begin_offset = UINT16_MAX; + pg->fl_end_offset = UINT16_MAX; } pg->nfree = pg_nfree; - if (sweep_full) { + if (current_sweep_full) { pg->nold = 0; pg->prev_nold = prev_nold; } @@ -1637,97 +1622,36 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t nfree = pg->nfree; done: + if (re_use_page) { + push_lf_back(allocd, pg); + } + else { + gc_alloc_map_set(pg->data, GC_PAGE_LAZILY_FREED); + if (keep_as_local_buffer) { + push_lf_back(buffered, pg); + } + else { + push_lf_back(&global_page_pool_lazily_freed, pg); + } + } + gc_page_profile_write_to_file(s); + gc_update_page_fragmentation_data(pg); gc_time_count_page(freedall, pg_skpd); - gc_num.freed += (nfree - old_nfree) * osize; - return pfl; + jl_ptls_t ptls = gc_all_tls_states[pg->thread_n]; + jl_atomic_fetch_add(&ptls->gc_num.pool_live_bytes, GC_PAGE_SZ - GC_PAGE_OFFSET - nfree * osize); + jl_atomic_fetch_add((_Atomic(int64_t) *)&gc_num.freed, (nfree - old_nfree) * 
osize); } // the actual sweeping over all allocated pages in a memory pool -static inline void sweep_pool_page(jl_taggedvalue_t ***pfl, jl_gc_pagemeta_t *pg, int sweep_full) JL_NOTSAFEPOINT +STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_page_stack_t *allocd, jl_gc_page_stack_t *lazily_freed, + jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT { int p_n = pg->pool_n; int t_n = pg->thread_n; jl_ptls_t ptls2 = gc_all_tls_states[t_n]; jl_gc_pool_t *p = &ptls2->heap.norm_pools[p_n]; int osize = pg->osize; - pfl[t_n * JL_GC_N_POOLS + p_n] = sweep_page(p, pg, pfl[t_n * JL_GC_N_POOLS + p_n], sweep_full, osize); -} - -// sweep over a pagetable0 for all allocated pages -static inline int sweep_pool_pagetable0(jl_taggedvalue_t ***pfl, pagetable0_t *pagetable0, int sweep_full) JL_NOTSAFEPOINT -{ - unsigned ub = 0; - unsigned alloc = 0; - for (unsigned pg_i = 0; pg_i <= pagetable0->ub; pg_i++) { - uint32_t line = pagetable0->allocmap[pg_i]; - unsigned j; - if (!line) - continue; - ub = pg_i; - alloc = 1; - for (j = 0; line; j++, line >>= 1) { - unsigned next = ffs_u32(line); - j += next; - line >>= next; - jl_gc_pagemeta_t *pg = pagetable0->meta[pg_i * 32 + j]; - sweep_pool_page(pfl, pg, sweep_full); - } - } - pagetable0->ub = ub; - return alloc; -} - -// sweep over pagetable1 for all pagetable0 that may contain allocated pages -static inline int sweep_pool_pagetable1(jl_taggedvalue_t ***pfl, pagetable1_t *pagetable1, int sweep_full) JL_NOTSAFEPOINT -{ - unsigned ub = 0; - unsigned alloc = 0; - for (unsigned pg_i = 0; pg_i <= pagetable1->ub; pg_i++) { - uint32_t line = pagetable1->allocmap0[pg_i]; - unsigned j; - for (j = 0; line; j++, line >>= 1) { - unsigned next = ffs_u32(line); - j += next; - line >>= next; - pagetable0_t *pagetable0 = pagetable1->meta0[pg_i * 32 + j]; - if (pagetable0 && !sweep_pool_pagetable0(pfl, pagetable0, sweep_full)) - pagetable1->allocmap0[pg_i] &= ~(1 << j); // no allocations found, remember that for next time - } - if (pagetable1->allocmap0[pg_i]) { - ub = pg_i; - alloc = 1; - } - } - pagetable1->ub = ub; - return alloc; -} - -// sweep over all memory for all pagetable1 that may contain allocated pages -static void sweep_pool_pagetable(jl_taggedvalue_t ***pfl, int sweep_full) JL_NOTSAFEPOINT -{ - if (REGION2_PG_COUNT == 1) { // compile-time optimization - pagetable1_t *pagetable1 = memory_map.meta1[0]; - if (pagetable1) - sweep_pool_pagetable1(pfl, pagetable1, sweep_full); - return; - } - unsigned ub = 0; - for (unsigned pg_i = 0; pg_i <= memory_map.ub; pg_i++) { - uint32_t line = memory_map.allocmap1[pg_i]; - unsigned j; - for (j = 0; line; j++, line >>= 1) { - unsigned next = ffs_u32(line); - j += next; - line >>= next; - pagetable1_t *pagetable1 = memory_map.meta1[pg_i * 32 + j]; - if (pagetable1 && !sweep_pool_pagetable1(pfl, pagetable1, sweep_full)) - memory_map.allocmap1[pg_i] &= ~(1 << j); // no allocations found, remember that for next time - } - if (memory_map.allocmap1[pg_i]) { - ub = pg_i; - } - } - memory_map.ub = ub; + gc_sweep_page(s, p, allocd, lazily_freed, pg, osize); } // sweep over all memory that is being used and not in a pool @@ -1739,7 +1663,7 @@ static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_NOTSAFEPOINT { - assert(pg->fl_begin_offset != (uint16_t)-1); + assert(pg->fl_begin_offset != UINT16_MAX); char *cur_pg = gc_page_data(last); // Fast path for page that has no allocation jl_taggedvalue_t *fl_beg = 
(jl_taggedvalue_t*)(cur_pg + pg->fl_begin_offset); @@ -1753,11 +1677,72 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_ pg->nfree = nfree; } +void gc_sweep_wake_all(void) +{ + uv_mutex_lock(&gc_threads_lock); + for (int i = gc_first_tid; i < gc_first_tid + jl_n_gcthreads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + jl_atomic_fetch_add(&ptls2->gc_sweeps_requested, 1); + } + uv_cond_broadcast(&gc_threads_cond); + uv_mutex_unlock(&gc_threads_lock); +} + +void gc_sweep_wait_for_all(void) +{ + jl_atomic_store(&gc_allocd_scratch, NULL); + while (jl_atomic_load_relaxed(&gc_n_threads_sweeping) != 0) { + jl_cpu_pause(); + } +} + +void gc_sweep_pool_parallel(void) +{ + jl_atomic_fetch_add(&gc_n_threads_sweeping, 1); + jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch); + if (allocd_scratch != NULL) { + gc_page_profiler_serializer_t serializer = gc_page_serializer_create(); + while (1) { + int found_pg = 0; + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) { + continue; + } + jl_gc_page_stack_t *allocd = &allocd_scratch[t_i]; + jl_gc_pagemeta_t *pg = pop_lf_back(&ptls2->page_metadata_allocd); + if (pg == NULL) { + continue; + } + gc_sweep_pool_page(&serializer, allocd, &ptls2->page_metadata_buffered, pg); + found_pg = 1; + } + if (!found_pg) { + break; + } + } + gc_page_serializer_destroy(&serializer); + } + jl_atomic_fetch_add(&gc_n_threads_sweeping, -1); +} + +void gc_free_pages(void) +{ + while (1) { + jl_gc_pagemeta_t *pg = pop_lf_back(&global_page_pool_lazily_freed); + if (pg == NULL) { + break; + } + jl_gc_free_page(pg); + push_lf_back(&global_page_pool_freed, pg); + } +} + // setup the data-structures for a sweep over all memory pools -static void gc_sweep_pool(int sweep_full) +static void gc_sweep_pool(void) { gc_time_pool_start(); - lazy_freed_pages = 0; + buffered_pages = 0; // For the benefit of the analyzer, which doesn't know that gc_n_threads // doesn't change over the course of this function @@ -1777,11 +1762,12 @@ static void gc_sweep_pool(int sweep_full) } continue; } + jl_atomic_store_relaxed(&ptls2->gc_num.pool_live_bytes, 0); for (int i = 0; i < JL_GC_N_POOLS; i++) { jl_gc_pool_t *p = &ptls2->heap.norm_pools[i]; jl_taggedvalue_t *last = p->freelist; - if (last) { - jl_gc_pagemeta_t *pg = jl_assume(page_metadata(last)); + if (last != NULL) { + jl_gc_pagemeta_t *pg = jl_assume(page_metadata_unsafe(last)); gc_pool_sync_nfree(pg, last); pg->has_young = 1; } @@ -1789,31 +1775,74 @@ static void gc_sweep_pool(int sweep_full) pfl[t_i * JL_GC_N_POOLS + i] = &p->freelist; last = p->newpages; - if (last) { + if (last != NULL) { char *last_p = (char*)last; - jl_gc_pagemeta_t *pg = jl_assume(page_metadata(last_p - 1)); + jl_gc_pagemeta_t *pg = jl_assume(page_metadata_unsafe(last_p - 1)); assert(last_p - gc_page_data(last_p - 1) >= GC_PAGE_OFFSET); pg->nfree = (GC_PAGE_SZ - (last_p - gc_page_data(last_p - 1))) / p->osize; pg->has_young = 1; } - p->newpages = NULL; + } + jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_buffered.bottom); + while (pg != NULL) { + jl_gc_pagemeta_t *pg2 = pg->next; + buffered_pages++; + pg = pg2; } } // the actual sweeping - sweep_pool_pagetable(pfl, sweep_full); + jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)alloca(n_threads * sizeof(jl_gc_page_stack_t)); + memset(tmp, 0, n_threads * sizeof(jl_gc_page_stack_t)); + jl_atomic_store(&gc_allocd_scratch, tmp); + gc_sweep_wake_all(); + gc_sweep_pool_parallel(); + 
gc_sweep_wait_for_all(); - // null out terminal pointers of free lists for (int t_i = 0; t_i < n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2 == NULL) + if (ptls2 != NULL) { + ptls2->page_metadata_allocd = tmp[t_i]; + for (int i = 0; i < JL_GC_N_POOLS; i++) { + jl_gc_pool_t *p = &ptls2->heap.norm_pools[i]; + p->newpages = NULL; + } + } + } + + // merge free lists + for (int t_i = 0; t_i < n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) { continue; - for (int i = 0; i < JL_GC_N_POOLS; i++) { - *pfl[t_i * JL_GC_N_POOLS + i] = NULL; + } + jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom); + while (pg != NULL) { + jl_gc_pagemeta_t *pg2 = pg->next; + if (pg->fl_begin_offset != UINT16_MAX) { + char *cur_pg = pg->data; + jl_taggedvalue_t *fl_beg = (jl_taggedvalue_t*)(cur_pg + pg->fl_begin_offset); + jl_taggedvalue_t *fl_end = (jl_taggedvalue_t*)(cur_pg + pg->fl_end_offset); + *pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = fl_beg; + pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = &fl_end->next; + } + pg = pg2; + } + } + + // null out terminal pointers of free lists + for (int t_i = 0; t_i < n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 != NULL) { + for (int i = 0; i < JL_GC_N_POOLS; i++) { + *pfl[t_i * JL_GC_N_POOLS + i] = NULL; + } } } - gc_time_pool_end(sweep_full); + gc_free_pages(); + gc_dump_page_utilization_data(); + gc_time_pool_end(current_sweep_full); } static void gc_sweep_perm_alloc(void) @@ -1893,7 +1922,7 @@ static void *volatile gc_findval; // for usage from gdb, for finding the gc-root // Handle the case where the stack is only partially copied. STATIC_INLINE uintptr_t gc_get_stack_addr(void *_addr, uintptr_t offset, - uintptr_t lb, uintptr_t ub) + uintptr_t lb, uintptr_t ub) JL_NOTSAFEPOINT { uintptr_t addr = (uintptr_t)_addr; if (addr >= lb && addr < ub) @@ -1902,929 +1931,682 @@ STATIC_INLINE uintptr_t gc_get_stack_addr(void *_addr, uintptr_t offset, } STATIC_INLINE uintptr_t gc_read_stack(void *_addr, uintptr_t offset, - uintptr_t lb, uintptr_t ub) + uintptr_t lb, uintptr_t ub) JL_NOTSAFEPOINT { uintptr_t real_addr = gc_get_stack_addr(_addr, offset, lb, ub); return *(uintptr_t*)real_addr; } JL_NORETURN NOINLINE void gc_assert_datatype_fail(jl_ptls_t ptls, jl_datatype_t *vt, - jl_gc_mark_sp_t sp) + jl_gc_markqueue_t *mq) JL_NOTSAFEPOINT { jl_safe_printf("GC error (probable corruption) :\n"); jl_gc_debug_print_status(); jl_(vt); jl_gc_debug_critical_error(); - gc_mark_loop_unwind(ptls, sp, 0); abort(); } -// This stores the label address in the mark loop function. -// We can't directly store that to a global array so we need some hack to get that. -// See the call to `gc_mark_loop` in init with a `NULL` `ptls`. 
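
/*
 * gc_sweep_wake_all() / gc_sweep_wait_for_all() above pair with a GC-thread
 * worker loop that is not part of this hunk. A plausible sketch of that
 * worker side, assuming each GC thread sleeps on gc_threads_cond and
 * re-checks its gc_sweeps_requested counter under gc_threads_lock
 * (illustrative only; the real thread function also services mark work):
 */
static void gc_threadfun_sketch(jl_ptls_t ptls)
{
    while (1) {
        uv_mutex_lock(&gc_threads_lock);
        // sleep until a collection requests help with sweeping
        while (jl_atomic_load_relaxed(&ptls->gc_sweeps_requested) == 0)
            uv_cond_wait(&gc_threads_cond, &gc_threads_lock);
        jl_atomic_fetch_add(&ptls->gc_sweeps_requested, -1);
        uv_mutex_unlock(&gc_threads_lock);
        // drain pages from every thread's allocd stack; when the last
        // worker leaves, gc_sweep_wait_for_all() stops spinning
        gc_sweep_pool_parallel();
    }
}
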
-void *gc_mark_label_addrs[_GC_MARK_L_MAX]; - -// Double the local mark stack (both pc and data) -static void NOINLINE gc_mark_stack_resize(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) JL_NOTSAFEPOINT -{ - jl_gc_mark_data_t *old_data = gc_cache->data_stack; - void **pc_stack = sp->pc_start; - size_t stack_size = (char*)sp->pc_end - (char*)pc_stack; - ptrdiff_t datadiff = (char*)sp->data - (char*)old_data; - gc_cache->data_stack = (jl_gc_mark_data_t *)realloc_s(old_data, stack_size * 2 * sizeof(jl_gc_mark_data_t)); - sp->data = (jl_gc_mark_data_t *)((char*)gc_cache->data_stack + datadiff); - - sp->pc_start = gc_cache->pc_stack = (void**)realloc_s(pc_stack, stack_size * 2 * sizeof(void*)); - gc_cache->pc_stack_end = sp->pc_end = sp->pc_start + stack_size * 2; - sp->pc = sp->pc_start + (sp->pc - pc_stack); -} - -// Push a work item to the stack. The type of the work item is marked with `pc`. -// The data needed is in `data` and is of size `data_size`. -// If there isn't enough space on the stack, the stack will be resized with the stack -// lock held. The caller should invalidate any local cache of the stack addresses that's not -// in `gc_cache` or `sp` -// The `sp` will be updated on return if `inc` is true. -STATIC_INLINE void gc_mark_stack_push(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - void *pc, void *data, size_t data_size, int inc) JL_NOTSAFEPOINT +// Check if `nptr` is tagged for `old + refyoung`, +// Push the object to the remset and update the `nptr` counter if necessary. +STATIC_INLINE void gc_mark_push_remset(jl_ptls_t ptls, jl_value_t *obj, + uintptr_t nptr) JL_NOTSAFEPOINT { - assert(data_size <= sizeof(jl_gc_mark_data_t)); - if (__unlikely(sp->pc == sp->pc_end)) - gc_mark_stack_resize(gc_cache, sp); - *sp->pc = pc; - memcpy(sp->data, data, data_size); - if (inc) { - sp->data = (jl_gc_mark_data_t *)(((char*)sp->data) + data_size); - sp->pc++; + if (__unlikely((nptr & 0x3) == 0x3)) { + ptls->heap.remset_nptr += nptr >> 2; + arraylist_t *remset = ptls->heap.remset; + size_t len = remset->len; + if (__unlikely(len >= remset->max)) { + arraylist_push(remset, obj); + } + else { + remset->len = len + 1; + remset->items[len] = obj; + } } } -// Check if the reference is non-NULL and atomically set the mark bit. -// Update `*nptr`, which is the `nptr` field of the parent item, if the object is young. -// Return the tag (with GC bits cleared) and the GC bits in `*ptag` and `*pbits`. -// Return whether the object needs to be scanned / have metadata updated. -STATIC_INLINE int gc_try_setmark(jl_value_t *obj, uintptr_t *nptr, - uintptr_t *ptag, uint8_t *pbits) JL_NOTSAFEPOINT +// Push a work item to the queue +STATIC_INLINE void gc_ptr_queue_push(jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT { - if (!obj) - return 0; - jl_taggedvalue_t *o = jl_astaggedvalue(obj); - uintptr_t tag = o->header; - if (!gc_marked(tag)) { - uint8_t bits; - int res = gc_setmark_tag(o, GC_MARKED, tag, &bits); - if (!gc_old(bits)) - *nptr = *nptr | 1; - *ptag = tag & ~(uintptr_t)0xf; - *pbits = bits; - return __likely(res); - } - else if (!gc_old(tag)) { - *nptr = *nptr | 1; - } - return 0; + ws_array_t *old_a = ws_queue_push(&mq->ptr_queue, &obj, sizeof(jl_value_t*)); + // Put `old_a` in `reclaim_set` to be freed after the mark phase + if (__unlikely(old_a != NULL)) + arraylist_push(&mq->reclaim_set, old_a); } -// Queue a finalizer list to be scanned in the mark loop. Start marking from index `start`. 
-void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - arraylist_t *list, size_t start) +// Pop from the mark queue +STATIC_INLINE jl_value_t *gc_ptr_queue_pop(jl_gc_markqueue_t *mq) JL_NOTSAFEPOINT { - size_t len = list->len; - if (len <= start) - return; - jl_value_t **items = (jl_value_t**)list->items; - gc_mark_finlist_t markdata = {items + start, items + len}; - gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[GC_MARK_L_finlist], - &markdata, sizeof(markdata), 1); + jl_value_t *v = NULL; + ws_queue_pop(&mq->ptr_queue, &v, sizeof(jl_value_t*)); + return v; } -// Queue a object to be scanned. The object should already be marked and the GC metadata -// should already be updated for it. Only scanning of the object should be performed. -STATIC_INLINE void gc_mark_queue_scan_obj(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - jl_value_t *obj) +// Steal from `mq2` +STATIC_INLINE jl_value_t *gc_ptr_queue_steal_from(jl_gc_markqueue_t *mq2) JL_NOTSAFEPOINT { - jl_taggedvalue_t *o = jl_astaggedvalue(obj); - uintptr_t tag = o->header; - uint8_t bits = tag & 0xf; - tag = tag & ~(uintptr_t)0xf; - gc_mark_marked_obj_t data = {obj, tag, bits}; - gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[GC_MARK_L_scan_only], - &data, sizeof(data), 1); + jl_value_t *v = NULL; + ws_queue_steal_from(&mq2->ptr_queue, &v, sizeof(jl_value_t*)); + return v; } -// Mark and queue a object to be scanned. -// The object will be marked atomically which can also happen concurrently. -// It will be queued if the object wasn't marked already (or concurrently by another thread) -// Returns whether the object is young. -STATIC_INLINE int gc_mark_queue_obj(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, void *_obj) JL_NOTSAFEPOINT +// Push chunk `*c` into chunk queue +STATIC_INLINE void gc_chunkqueue_push(jl_gc_markqueue_t *mq, jl_gc_chunk_t *c) JL_NOTSAFEPOINT { - jl_value_t *obj = (jl_value_t*)jl_assume(_obj); - uintptr_t nptr = 0; - uintptr_t tag = 0; - uint8_t bits = 0; - if (!gc_try_setmark(obj, &nptr, &tag, &bits)) - return (int)nptr; - gc_mark_marked_obj_t data = {obj, tag, bits}; - gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[GC_MARK_L_marked_obj], - &data, sizeof(data), 1); - return (int)nptr; + ws_array_t *old_a = ws_queue_push(&mq->chunk_queue, c, sizeof(jl_gc_chunk_t)); + // Put `old_a` in `reclaim_set` to be freed after the mark phase + if (__unlikely(old_a != NULL)) + arraylist_push(&mq->reclaim_set, old_a); } -int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_value_t *obj) +// Pop chunk from chunk queue +STATIC_INLINE jl_gc_chunk_t gc_chunkqueue_pop(jl_gc_markqueue_t *mq) JL_NOTSAFEPOINT { - return gc_mark_queue_obj(gc_cache, sp, obj); + jl_gc_chunk_t c = {.cid = GC_empty_chunk}; + ws_queue_pop(&mq->chunk_queue, &c, sizeof(jl_gc_chunk_t)); + return c; } -JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) +// Steal chunk from `mq2` +STATIC_INLINE jl_gc_chunk_t gc_chunkqueue_steal_from(jl_gc_markqueue_t *mq2) JL_NOTSAFEPOINT { - return gc_mark_queue_obj(&ptls->gc_cache, &ptls->gc_mark_sp, obj); + jl_gc_chunk_t c = {.cid = GC_empty_chunk}; + ws_queue_steal_from(&mq2->chunk_queue, &c, sizeof(jl_gc_chunk_t)); + return c; } -JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, - jl_value_t **objs, size_t nobjs) +// Enqueue an unmarked obj. 
last bit of `nptr` is set if `_obj` is young +STATIC_INLINE void gc_try_claim_and_push(jl_gc_markqueue_t *mq, void *_obj, + uintptr_t *nptr) JL_NOTSAFEPOINT { - gc_mark_objarray_t data = { parent, objs, objs + nobjs, 1, - jl_astaggedvalue(parent)->bits.gc & 2 }; - gc_mark_stack_push(&ptls->gc_cache, &ptls->gc_mark_sp, - gc_mark_label_addrs[GC_MARK_L_objarray], - &data, sizeof(data), 1); + if (_obj == NULL) + return; + jl_value_t *obj = (jl_value_t *)jl_assume(_obj); + jl_taggedvalue_t *o = jl_astaggedvalue(obj); + if (!gc_old(o->header) && nptr) + *nptr |= 1; + if (gc_try_setmark_tag(o, GC_MARKED)) + gc_ptr_queue_push(mq, obj); } - -// Check if `nptr` is tagged for `old + refyoung`, -// Push the object to the remset and update the `nptr` counter if necessary. -STATIC_INLINE void gc_mark_push_remset(jl_ptls_t ptls, jl_value_t *obj, uintptr_t nptr) JL_NOTSAFEPOINT +// Mark object with 8bit field descriptors +STATIC_INLINE jl_value_t *gc_mark_obj8(jl_ptls_t ptls, char *obj8_parent, uint8_t *obj8_begin, + uint8_t *obj8_end, uintptr_t nptr) JL_NOTSAFEPOINT { - if (__unlikely((nptr & 0x3) == 0x3)) { - ptls->heap.remset_nptr += nptr >> 2; - arraylist_t *remset = ptls->heap.remset; - size_t len = remset->len; - if (__unlikely(len >= remset->max)) { - arraylist_push(remset, obj); - } - else { - remset->len = len + 1; - remset->items[len] = obj; + (void)jl_assume(obj8_begin < obj8_end); + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t **slot = NULL; + jl_value_t *new_obj = NULL; + for (; obj8_begin < obj8_end; obj8_begin++) { + slot = &((jl_value_t**)obj8_parent)[*obj8_begin]; + new_obj = *slot; + if (new_obj != NULL) { + verify_parent2("object", obj8_parent, slot, "field(%d)", + gc_slot_to_fieldidx(obj8_parent, slot, (jl_datatype_t*)jl_typeof(obj8_parent))); + if (obj8_begin + 1 != obj8_end) { + gc_try_claim_and_push(mq, new_obj, &nptr); + } + else { + // Unroll marking of last item to avoid pushing + // and popping it right away + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + nptr |= !gc_old(o->header); + if (!gc_try_setmark_tag(o, GC_MARKED)) new_obj = NULL; + } + gc_heap_snapshot_record_object_edge((jl_value_t*)obj8_parent, slot); } } + gc_mark_push_remset(ptls, (jl_value_t *)obj8_parent, nptr); + return new_obj; } -// Scan a dense array of object references, see `gc_mark_objarray_t` -STATIC_INLINE int gc_mark_scan_objarray(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, - gc_mark_objarray_t *objary, - jl_value_t **begin, jl_value_t **end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) +// Mark object with 16bit field descriptors +STATIC_INLINE jl_value_t *gc_mark_obj16(jl_ptls_t ptls, char *obj16_parent, uint16_t *obj16_begin, + uint16_t *obj16_end, uintptr_t nptr) JL_NOTSAFEPOINT { - (void)jl_assume(objary == (gc_mark_objarray_t*)sp->data); - for (; begin < end; begin += objary->step) { - *pnew_obj = *begin; - if (*pnew_obj) { - verify_parent2("obj array", objary->parent, begin, "elem(%d)", - gc_slot_to_arrayidx(objary->parent, begin)); - gc_heap_snapshot_record_array_edge(objary->parent, begin); - } - if (!gc_try_setmark(*pnew_obj, &objary->nptr, ptag, pbits)) - continue; - begin += objary->step; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - objary->begin = begin; - gc_repush_markdata(sp, gc_mark_objarray_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. 
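+    // Same traversal pattern as `gc_mark_obj8` above: every field except the
+    // last is pushed onto the mark queue, while the last unmarked field is
+    // returned to the caller, which continues with it directly (see the
+    // `goto mark_obj` loop in `gc_mark_outrefs` below); this saves one queue
+    // push/pop round-trip per object on the common path.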
- gc_mark_push_remset(ptls, objary->parent, objary->nptr); - } - return 1; - } - gc_mark_push_remset(ptls, objary->parent, objary->nptr); - return 0; -} - -// Scan a sparse array of object references, see `gc_mark_objarray_t` -STATIC_INLINE int gc_mark_scan_array8(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, - gc_mark_array8_t *ary8, - jl_value_t **begin, jl_value_t **end, - uint8_t *elem_begin, uint8_t *elem_end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) -{ - (void)jl_assume(ary8 == (gc_mark_array8_t*)sp->data); - size_t elsize = ((jl_array_t*)ary8->elem.parent)->elsize / sizeof(jl_value_t*); - for (; begin < end; begin += elsize) { - for (; elem_begin < elem_end; elem_begin++) { - jl_value_t **slot = &begin[*elem_begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("array", ary8->elem.parent, slot, "elem(%d)", - gc_slot_to_arrayidx(ary8->elem.parent, begin)); - gc_heap_snapshot_record_array_edge(ary8->elem.parent, slot); - } - if (!gc_try_setmark(*pnew_obj, &ary8->elem.nptr, ptag, pbits)) - continue; - elem_begin++; - // Found an object to mark - if (elem_begin < elem_end) { - // Haven't done with this one yet. Update the content and push it back - ary8->elem.begin = elem_begin; - ary8->begin = begin; - gc_repush_markdata(sp, gc_mark_array8_t); + (void)jl_assume(obj16_begin < obj16_end); + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t **slot = NULL; + jl_value_t *new_obj = NULL; + for (; obj16_begin < obj16_end; obj16_begin++) { + slot = &((jl_value_t **)obj16_parent)[*obj16_begin]; + new_obj = *slot; + if (new_obj != NULL) { + verify_parent2("object", obj16_parent, slot, "field(%d)", + gc_slot_to_fieldidx(obj16_parent, slot, (jl_datatype_t*)jl_typeof(obj16_parent))); + if (obj16_begin + 1 != obj16_end) { + gc_try_claim_and_push(mq, new_obj, &nptr); } else { - begin += elsize; - if (begin < end) { - // Haven't done with this array yet. Reset the content and push it back - ary8->elem.begin = ary8->rebegin; - ary8->begin = begin; - gc_repush_markdata(sp, gc_mark_array8_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. 
- gc_mark_push_remset(ptls, ary8->elem.parent, ary8->elem.nptr); - } + // Unroll marking of last item to avoid pushing + // and popping it right away + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + nptr |= !gc_old(o->header); + if (!gc_try_setmark_tag(o, GC_MARKED)) new_obj = NULL; } - return 1; + gc_heap_snapshot_record_object_edge((jl_value_t*)obj16_parent, slot); } - elem_begin = ary8->rebegin; - } - gc_mark_push_remset(ptls, ary8->elem.parent, ary8->elem.nptr); - return 0; -} - -// Scan a sparse array of object references, see `gc_mark_objarray_t` -STATIC_INLINE int gc_mark_scan_array16(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, - gc_mark_array16_t *ary16, - jl_value_t **begin, jl_value_t **end, - uint16_t *elem_begin, uint16_t *elem_end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) -{ - (void)jl_assume(ary16 == (gc_mark_array16_t*)sp->data); - size_t elsize = ((jl_array_t*)ary16->elem.parent)->elsize / sizeof(jl_value_t*); - for (; begin < end; begin += elsize) { - for (; elem_begin < elem_end; elem_begin++) { - jl_value_t **slot = &begin[*elem_begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("array", ary16->elem.parent, slot, "elem(%d)", - gc_slot_to_arrayidx(ary16->elem.parent, begin)); - gc_heap_snapshot_record_array_edge(ary16->elem.parent, slot); - } - if (!gc_try_setmark(*pnew_obj, &ary16->elem.nptr, ptag, pbits)) - continue; - elem_begin++; - // Found an object to mark - if (elem_begin < elem_end) { - // Haven't done with this one yet. Update the content and push it back - ary16->elem.begin = elem_begin; - ary16->begin = begin; - gc_repush_markdata(sp, gc_mark_array16_t); + } + gc_mark_push_remset(ptls, (jl_value_t *)obj16_parent, nptr); + return new_obj; +} + +// Mark object with 32bit field descriptors +STATIC_INLINE jl_value_t *gc_mark_obj32(jl_ptls_t ptls, char *obj32_parent, uint32_t *obj32_begin, + uint32_t *obj32_end, uintptr_t nptr) JL_NOTSAFEPOINT +{ + (void)jl_assume(obj32_begin < obj32_end); + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t **slot = NULL; + jl_value_t *new_obj = NULL; + for (; obj32_begin < obj32_end; obj32_begin++) { + slot = &((jl_value_t **)obj32_parent)[*obj32_begin]; + new_obj = *slot; + if (new_obj != NULL) { + verify_parent2("object", obj32_parent, slot, "field(%d)", + gc_slot_to_fieldidx(obj32_parent, slot, (jl_datatype_t*)jl_typeof(obj32_parent))); + if (obj32_begin + 1 != obj32_end) { + gc_try_claim_and_push(mq, new_obj, &nptr); } else { - begin += elsize; - if (begin < end) { - // Haven't done with this array yet. Reset the content and push it back - ary16->elem.begin = ary16->rebegin; - ary16->begin = begin; - gc_repush_markdata(sp, gc_mark_array16_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. - gc_mark_push_remset(ptls, ary16->elem.parent, ary16->elem.nptr); - } + // Unroll marking of last item to avoid pushing + // and popping it right away + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + nptr |= !gc_old(o->header); + if (!gc_try_setmark_tag(o, GC_MARKED)) new_obj = NULL; } - return 1; + gc_heap_snapshot_record_object_edge((jl_value_t*)obj32_parent, slot); } - elem_begin = ary16->rebegin; } - gc_mark_push_remset(ptls, ary16->elem.parent, ary16->elem.nptr); - return 0; -} - - -// Scan an object with 8bits field descriptors. 
see `gc_mark_obj8_t`
-STATIC_INLINE int gc_mark_scan_obj8(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark_obj8_t *obj8,
-                                    char *parent, uint8_t *begin, uint8_t *end,
-                                    jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits)
-{
-    (void)jl_assume(obj8 == (gc_mark_obj8_t*)sp->data);
-    (void)jl_assume(begin < end);
-    for (; begin < end; begin++) {
-        jl_value_t **slot = &((jl_value_t**)parent)[*begin];
-        *pnew_obj = *slot;
-        if (*pnew_obj) {
-            verify_parent2("object", parent, slot, "field(%d)",
-                           gc_slot_to_fieldidx(parent, slot, (jl_datatype_t*)jl_typeof(parent)));
-            gc_heap_snapshot_record_object_edge((jl_value_t*)parent, slot);
+    gc_mark_push_remset(ptls, (jl_value_t *)obj32_parent, nptr);
+    return new_obj;
+}
+
+// Mark object array
+STATIC_INLINE void gc_mark_objarray(jl_ptls_t ptls, jl_value_t *obj_parent, jl_value_t **obj_begin,
+                                    jl_value_t **obj_end, uint32_t step, uintptr_t nptr) JL_NOTSAFEPOINT
+{
+    jl_gc_markqueue_t *mq = &ptls->mark_queue;
+    jl_value_t *new_obj;
+    // Decide whether we need to chunk the object array
+    (void)jl_assume(step > 0);
+    if ((nptr & 0x2) == 0x2) {
+        // pre-scan this object: most of this object should be old, so look for
+        // the first young object before starting this chunk
+        // (this also would be valid for young objects, but probably less beneficial)
+        for (; obj_begin < obj_end; obj_begin += step) {
+            jl_value_t **slot = obj_begin;
+            new_obj = *slot;
+            if (new_obj != NULL) {
+                verify_parent2("obj array", obj_parent, obj_begin, "elem(%d)",
+                               gc_slot_to_arrayidx(obj_parent, obj_begin));
+                jl_taggedvalue_t *o = jl_astaggedvalue(new_obj);
+                if (!gc_old(o->header))
+                    nptr |= 1;
+                if (!gc_marked(o->header))
+                    break;
+                gc_heap_snapshot_record_array_edge(obj_parent, slot);
+            }
        }
-        if (!gc_try_setmark(*pnew_obj, &obj8->nptr, ptag, pbits))
-            continue;
-        begin++;
-        // Found an object to mark
-        if (begin < end) {
-            // Haven't done with this one yet. Update the content and push it back
-            obj8->begin = begin;
-            gc_repush_markdata(sp, gc_mark_obj8_t);
+    }
+    size_t too_big = (obj_end - obj_begin) / GC_CHUNK_BATCH_SIZE > step; // use this order of operations to avoid idiv
+    jl_value_t **scan_end = obj_end;
+    int pushed_chunk = 0;
+    if (too_big) {
+        scan_end = obj_begin + step * GC_CHUNK_BATCH_SIZE;
+        // case 1: array owner is young, so we won't need to scan through all its elements
+        // to know that we will never need to push it to the remset. it's fine
+        // to create a chunk with "incorrect" `nptr` and push it to the chunk-queue
+        // ASAP in order to expose as much parallelism as possible
+        // case 2: lowest two bits of `nptr` are already set to 0x3, so won't change after
+        // scanning the array elements
+        if ((nptr & 0x2) != 0x2 || (nptr & 0x3) == 0x3) {
+            jl_gc_chunk_t c = {GC_objary_chunk, obj_parent, scan_end, obj_end, NULL, NULL, step, nptr};
+            gc_chunkqueue_push(mq, &c);
+            pushed_chunk = 1;
        }
-        else {
-            // Finished scanning this one, finish up by checking the GC invariance
-            // and let the next item replacing the current one directly.
-            gc_mark_push_remset(ptls, obj8->parent, obj8->nptr);
-        }
-        return 1;
-    }
-    gc_mark_push_remset(ptls, obj8->parent, obj8->nptr);
-    return 0;
-}
-
-// Scan an object with 16bits field descriptors. 
see `gc_mark_obj16_t` -STATIC_INLINE int gc_mark_scan_obj16(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark_obj16_t *obj16, - char *parent, uint16_t *begin, uint16_t *end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) JL_NOTSAFEPOINT -{ - (void)jl_assume(obj16 == (gc_mark_obj16_t*)sp->data); - (void)jl_assume(begin < end); - for (; begin < end; begin++) { - jl_value_t **slot = &((jl_value_t**)parent)[*begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("object", parent, slot, "field(%d)", - gc_slot_to_fieldidx(parent, slot, (jl_datatype_t*)jl_typeof(parent))); - gc_heap_snapshot_record_object_edge((jl_value_t*)parent, slot); - } - if (!gc_try_setmark(*pnew_obj, &obj16->nptr, ptag, pbits)) - continue; - begin++; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - obj16->begin = begin; - gc_repush_markdata(sp, gc_mark_obj16_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. - gc_mark_push_remset(ptls, obj16->parent, obj16->nptr); - } - return 1; - } - gc_mark_push_remset(ptls, obj16->parent, obj16->nptr); - return 0; -} - -// Scan an object with 32bits field descriptors. see `gc_mark_obj32_t` -STATIC_INLINE int gc_mark_scan_obj32(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark_obj32_t *obj32, - char *parent, uint32_t *begin, uint32_t *end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) -{ - (void)jl_assume(obj32 == (gc_mark_obj32_t*)sp->data); - (void)jl_assume(begin < end); - for (; begin < end; begin++) { - jl_value_t **slot = &((jl_value_t**)parent)[*begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("object", parent, slot, "field(%d)", - gc_slot_to_fieldidx(parent, slot, (jl_datatype_t*)jl_typeof(parent))); - gc_heap_snapshot_record_object_edge((jl_value_t*)parent, slot); + } + for (; obj_begin < scan_end; obj_begin += step) { + jl_value_t **slot = obj_begin; + new_obj = *obj_begin; + if (new_obj != NULL) { + verify_parent2("obj array", obj_parent, obj_begin, "elem(%d)", + gc_slot_to_arrayidx(obj_parent, obj_begin)); + gc_try_claim_and_push(mq, new_obj, &nptr); + gc_heap_snapshot_record_array_edge(obj_parent, slot); } - if (!gc_try_setmark(*pnew_obj, &obj32->nptr, ptag, pbits)) - continue; - begin++; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - obj32->begin = begin; - gc_repush_markdata(sp, gc_mark_obj32_t); + } + if (too_big) { + if (!pushed_chunk) { + jl_gc_chunk_t c = {GC_objary_chunk, obj_parent, scan_end, obj_end, NULL, NULL, step, nptr}; + gc_chunkqueue_push(mq, &c); } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. 
-            gc_mark_push_remset(ptls, obj32->parent, obj32->nptr);
+    }
+    else {
+        gc_mark_push_remset(ptls, obj_parent, nptr);
+    }
+}
+
+// Mark array with 8bit field descriptors
+STATIC_INLINE void gc_mark_array8(jl_ptls_t ptls, jl_value_t *ary8_parent, jl_value_t **ary8_begin,
+                                  jl_value_t **ary8_end, uint8_t *elem_begin, uint8_t *elem_end,
+                                  uintptr_t nptr) JL_NOTSAFEPOINT
+{
+    jl_gc_markqueue_t *mq = &ptls->mark_queue;
+    jl_value_t *new_obj;
+    size_t elsize = ((jl_array_t *)ary8_parent)->elsize / sizeof(jl_value_t *);
+    assert(elsize > 0);
+    // Decide whether we need to chunk the array
+    if ((nptr & 0x2) == 0x2) {
+        // pre-scan this object: most of this object should be old, so look for
+        // the first young object before starting this chunk
+        // (this also would be valid for young objects, but probably less beneficial)
+        for (; ary8_begin < ary8_end; ary8_begin += elsize) {
+            int early_end = 0;
+            for (uint8_t *pindex = elem_begin; pindex < elem_end; pindex++) {
+                jl_value_t **slot = &ary8_begin[*pindex];
+                new_obj = *slot;
+                if (new_obj != NULL) {
+                    verify_parent2("array", ary8_parent, &new_obj, "elem(%d)",
+                                   gc_slot_to_arrayidx(ary8_parent, ary8_begin));
+                    jl_taggedvalue_t *o = jl_astaggedvalue(new_obj);
+                    if (!gc_old(o->header))
+                        nptr |= 1;
+                    if (!gc_marked(o->header)) {
+                        early_end = 1;
+                        break;
+                    }
+                    gc_heap_snapshot_record_array_edge(ary8_parent, slot);
+                }
+            }
+            if (early_end)
+                break;
        }
-        return 1;
    }
-    gc_mark_push_remset(ptls, obj32->parent, obj32->nptr);
-    return 0;
-}
-
-#if defined(__GNUC__) && !defined(_OS_EMSCRIPTEN_)
-# define gc_mark_laddr(name) (&&name)
-# define gc_mark_jmp(ptr) goto *(ptr)
-#else
-#define gc_mark_laddr(name) ((void*)(uintptr_t)GC_MARK_L_##name)
-#define gc_mark_jmp(ptr) do {                                   \
-        switch ((int)(uintptr_t)ptr) {                          \
-        case GC_MARK_L_marked_obj:                              \
-            goto marked_obj;                                    \
-        case GC_MARK_L_scan_only:                               \
-            goto scan_only;                                     \
-        case GC_MARK_L_finlist:                                 \
-            goto finlist;                                       \
-        case GC_MARK_L_objarray:                                \
-            goto objarray;                                      \
-        case GC_MARK_L_array8:                                  \
-            goto array8;                                        \
-        case GC_MARK_L_array16:                                 \
-            goto array16;                                       \
-        case GC_MARK_L_obj8:                                    \
-            goto obj8;                                          \
-        case GC_MARK_L_obj16:                                   \
-            goto obj16;                                         \
-        case GC_MARK_L_obj32:                                   \
-            goto obj32;                                         \
-        case GC_MARK_L_stack:                                   \
-            goto stack;                                         \
-        case GC_MARK_L_excstack:                                \
-            goto excstack;                                      \
-        case GC_MARK_L_module_binding:                          \
-            goto module_binding;                                \
-        default:                                                \
-            abort();                                            \
-        }                                                       \
-    } while (0)
-#endif
-
-// This is the main marking loop.
-// It uses an iterative (mostly) Depth-first search (DFS) to mark all the objects.
-// Instead of using the native stack, two stacks are manually maintained,
-// one (fixed-size) pc stack which stores the return address and one (variable-size)
-// data stack which stores the local variables needed by the scanning code.
-// Using a manually maintained stack has a few advantages
-//
-// 1. We can resize the stack as we go and never worry about stack overflow
-//    This is especitally useful when enters the GC in a deep call stack.
-//    It also removes the very deep GC call stack in a profile.
-// 2. We can minimize the number of local variables to save on the stack.
-//    This includes minimizing the sizes of the stack frames and only saving variables
-//    that have been changed before making "function calls" (i.e. `goto mark;`)
-// 3. We can perform end-of-loop tail-call optimization for common cases.
-// 4. The marking can be interrupted more easily since all the states are maintained
-//    in a well-defined format already.
-//    This will be useful if we want to have incremental marking again.
-// 5. 
The frames can be stolen by another thread more easily and it is not necessary -// to copy works to be stolen to another queue. Useful for parallel marking. -// (Will still require synchronization in stack popping of course.) -// 6. A flat function (i.e. no or very few function calls) also give the compiler -// opportunity to keep more states in registers that doesn't have to be spilled as often. -// -// We use two stacks so that the thief on another thread can steal the fixed sized pc stack -// and use that to figure out the size of the struct on the variable size data stack. -// -// The main disadvantages are that we bypass some stack-based CPU optimizations including the -// stack engine and return address prediction. -// Using two stacks also double the number of operations on the stack pointer -// though we still only need to use one of them (the pc stack pointer) for bounds check. -// In general, it seems that the reduction of stack memory ops and instructions count -// have a larger positive effect on the performance. =) - -// As a general guide we do not want to make non-inlined function calls in this function -// if possible since a large number of registers has to be spilled when that happens. -// This is especially true on on X86 which doesn't have many (any?) -// callee saved general purpose registers. -// (OTOH, the spill will likely make use of the stack engine which is otherwise idle so -// the performance impact is minimum as long as it's not in the hottest path) - -// There are three external entry points to the loop, corresponding to label -// `marked_obj`, `scan_only` and `finlist` (see the corresponding functions -// `gc_mark_queue_obj`, `gc_mark_queue_scan_obj` and `gc_mark_queue_finlist` above). -// The scanning of the object starts with `goto mark`, which updates the metadata and scans -// the object whose information is stored in `new_obj`, `tag` and `bits`. -// The branches in `mark` will dispatch the object to one of the scan "loop"s to be scanned -// as either a normal julia object or one of the special objects with specific storage format. -// Each of the scan "loop" will perform a DFS of the object in the following way -// -// 1. When encountering an pointer (julia object reference) slots, load, perform NULL check -// and atomically set the mark bits to determine if the object needs to be scanned. -// 2. If yes, it'll push itself back onto the mark stack (after updating fields that are changed) -// using `gc_repush_markdata` to increment the stack pointers. -// This step can also be replaced by a tail call by finishing up the marking of the current -// object when the end of the current object is reached. -// 3. Jump to `mark`. The marking of the current object will be resumed after the child is -// scanned by popping the stack frame back. -// -// Some of the special object scannings use BFS to simplify the code (Task and Module). - -// The jumps from the dispatch to the scan "loop"s are done by first pushing a frame -// to the stacks while only increment the data stack pointer before jumping to the loop -// This way the scan "loop" gets exactly what it expects after a stack pop. -// Additional optimizations are done for some of the common cases by skipping -// the unnecessary data stack pointer increment and the load from the stack -// (i.e. store to load forwarding). See `objary_loaded`, `obj8_loaded` and `obj16_loaded`. 
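+    // Arithmetic behind the `too_big` test below: `ary8_end - ary8_begin`
+    // counts pointer-sized slots, i.e. nelems * elsize, so dividing by
+    // GC_CHUNK_BATCH_SIZE first and then comparing against `elsize` asks,
+    // up to integer-division rounding, whether nelems exceeds
+    // GC_CHUNK_BATCH_SIZE without ever dividing by `elsize`. For example,
+    // assuming a batch size of 4096 and elsize == 4, an array of 6000
+    // elements (24000 slots) is "too big": the first 4096 elements are
+    // scanned inline and the remaining 1904 are pushed as a GC_ary8_chunk
+    // that other threads can steal.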
-JL_EXTENSION NOINLINE void gc_mark_loop(jl_ptls_t ptls, jl_gc_mark_sp_t sp)
-{
-    if (__unlikely(ptls == NULL)) {
-        gc_mark_label_addrs[GC_MARK_L_marked_obj] = gc_mark_laddr(marked_obj);
-        gc_mark_label_addrs[GC_MARK_L_scan_only] = gc_mark_laddr(scan_only);
-        gc_mark_label_addrs[GC_MARK_L_finlist] = gc_mark_laddr(finlist);
-        gc_mark_label_addrs[GC_MARK_L_objarray] = gc_mark_laddr(objarray);
-        gc_mark_label_addrs[GC_MARK_L_array8] = gc_mark_laddr(array8);
-        gc_mark_label_addrs[GC_MARK_L_array16] = gc_mark_laddr(array16);
-        gc_mark_label_addrs[GC_MARK_L_obj8] = gc_mark_laddr(obj8);
-        gc_mark_label_addrs[GC_MARK_L_obj16] = gc_mark_laddr(obj16);
-        gc_mark_label_addrs[GC_MARK_L_obj32] = gc_mark_laddr(obj32);
-        gc_mark_label_addrs[GC_MARK_L_stack] = gc_mark_laddr(stack);
-        gc_mark_label_addrs[GC_MARK_L_excstack] = gc_mark_laddr(excstack);
-        gc_mark_label_addrs[GC_MARK_L_module_binding] = gc_mark_laddr(module_binding);
-        return;
+    size_t too_big = (ary8_end - ary8_begin) / GC_CHUNK_BATCH_SIZE > elsize; // use this order of operations to avoid idiv
+    jl_value_t **scan_end = ary8_end;
+    int pushed_chunk = 0;
+    if (too_big) {
+        scan_end = ary8_begin + elsize * GC_CHUNK_BATCH_SIZE;
+        // case 1: array owner is young, so we won't need to scan through all its elements
+        // to know that we will never need to push it to the remset. it's fine
+        // to create a chunk with "incorrect" `nptr` and push it to the chunk-queue
+        // ASAP in order to expose as much parallelism as possible
+        // case 2: lowest two bits of `nptr` are already set to 0x3, so won't change after
+        // scanning the array elements
+        if ((nptr & 0x2) != 0x2 || (nptr & 0x3) == 0x3) {
+            jl_gc_chunk_t c = {GC_ary8_chunk, ary8_parent, scan_end, ary8_end, elem_begin, elem_end, 0, nptr};
+            gc_chunkqueue_push(mq, &c);
+            pushed_chunk = 1;
+        }
    }
-
-    jl_value_t *new_obj = NULL;
-    uintptr_t tag = 0;
-    uint8_t bits = 0;
-    int meta_updated = 0;
-
-    gc_mark_objarray_t *objary;
-    jl_value_t **objary_begin;
-    jl_value_t **objary_end;
-
-    gc_mark_array8_t *ary8;
-    gc_mark_array16_t *ary16;
-
-    gc_mark_obj8_t *obj8;
-    char *obj8_parent;
-    uint8_t *obj8_begin;
-    uint8_t *obj8_end;
-
-    gc_mark_obj16_t *obj16;
-    char *obj16_parent;
-    uint16_t *obj16_begin;
-    uint16_t *obj16_end;
-
-pop:
-    if (sp.pc == sp.pc_start) {
-        // TODO: stealing form another thread
-        return;
+    for (; ary8_begin < scan_end; ary8_begin += elsize) {
+        for (uint8_t *pindex = elem_begin; pindex < elem_end; pindex++) {
+            jl_value_t **slot = &ary8_begin[*pindex];
+            new_obj = *slot;
+            if (new_obj != NULL) {
+                verify_parent2("array", ary8_parent, &new_obj, "elem(%d)",
+                               gc_slot_to_arrayidx(ary8_parent, ary8_begin));
+                gc_try_claim_and_push(mq, new_obj, &nptr);
+                gc_heap_snapshot_record_array_edge(ary8_parent, slot);
+            }
+        }
    }
-    sp.pc--;
-    gc_mark_jmp(*sp.pc); // computed goto
-
-marked_obj: {
-        // An object that has been marked and needs have metadata updated and scanned.
-        gc_mark_marked_obj_t *obj = gc_pop_markdata(&sp, gc_mark_marked_obj_t);
-        new_obj = obj->obj;
-        tag = obj->tag;
-        bits = obj->bits;
-        goto mark;
-    }
-
-scan_only: {
-        // An object that has been marked and needs to be scanned. 
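+    // If the chunk was not pushed eagerly above (i.e. an old owner whose
+    // young-reference bit could still change while scanning), push it now
+    // that the scanned prefix has settled the low bits of `nptr`.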
- gc_mark_marked_obj_t *obj = gc_pop_markdata(&sp, gc_mark_marked_obj_t); - new_obj = obj->obj; - tag = obj->tag; - bits = obj->bits; - meta_updated = 1; - goto mark; - } - -objarray: - objary = gc_pop_markdata(&sp, gc_mark_objarray_t); - objary_begin = objary->begin; - objary_end = objary->end; -objarray_loaded: - if (gc_mark_scan_objarray(ptls, &sp, objary, objary_begin, objary_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -array8: - ary8 = gc_pop_markdata(&sp, gc_mark_array8_t); - objary_begin = ary8->begin; - objary_end = ary8->end; - obj8_begin = ary8->elem.begin; - obj8_end = ary8->elem.end; -array8_loaded: - if (gc_mark_scan_array8(ptls, &sp, ary8, objary_begin, objary_end, obj8_begin, obj8_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -array16: - ary16 = gc_pop_markdata(&sp, gc_mark_array16_t); - objary_begin = ary16->begin; - objary_end = ary16->end; - obj16_begin = ary16->elem.begin; - obj16_end = ary16->elem.end; -array16_loaded: - if (gc_mark_scan_array16(ptls, &sp, ary16, objary_begin, objary_end, obj16_begin, obj16_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -obj8: - obj8 = gc_pop_markdata(&sp, gc_mark_obj8_t); - obj8_parent = (char*)obj8->parent; - obj8_begin = obj8->begin; - obj8_end = obj8->end; -obj8_loaded: - if (gc_mark_scan_obj8(ptls, &sp, obj8, obj8_parent, obj8_begin, obj8_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -obj16: - obj16 = gc_pop_markdata(&sp, gc_mark_obj16_t); - obj16_parent = (char*)obj16->parent; - obj16_begin = obj16->begin; - obj16_end = obj16->end; -obj16_loaded: - if (gc_mark_scan_obj16(ptls, &sp, obj16, obj16_parent, obj16_begin, obj16_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -obj32: { - gc_mark_obj32_t *obj32 = gc_pop_markdata(&sp, gc_mark_obj32_t); - char *parent = (char*)obj32->parent; - uint32_t *begin = obj32->begin; - uint32_t *end = obj32->end; - if (gc_mark_scan_obj32(ptls, &sp, obj32, parent, begin, end, &new_obj, &tag, &bits)) - goto mark; - goto pop; + if (too_big) { + if (!pushed_chunk) { + jl_gc_chunk_t c = {GC_ary8_chunk, ary8_parent, scan_end, ary8_end, elem_begin, elem_end, 0, nptr}; + gc_chunkqueue_push(mq, &c); + } } - -stack: { - // Scan the stack. see `gc_mark_stackframe_t` - // The task object this stack belongs to is being scanned separately as a normal - // 8bit field descriptor object. 
-        gc_mark_stackframe_t *stack = gc_pop_markdata(&sp, gc_mark_stackframe_t);
-        jl_gcframe_t *s = stack->s;
-        uint32_t i = stack->i;
-        uint32_t nroots = stack->nroots;
-        uintptr_t offset = stack->offset;
-        uintptr_t lb = stack->lb;
-        uintptr_t ub = stack->ub;
-        uint32_t nr = nroots >> 2;
-        uintptr_t nptr = 0;
-        while (1) {
-            jl_value_t ***rts = (jl_value_t***)(((void**)s) + 2);
-            for (; i < nr; i++) {
-                if (nroots & 1) {
-                    void **slot = (void**)gc_read_stack(&rts[i], offset, lb, ub);
-                    new_obj = (jl_value_t*)gc_read_stack(slot, offset, lb, ub);
-                }
-                else {
-                    new_obj = (jl_value_t*)gc_read_stack(&rts[i], offset, lb, ub);
-                    if (gc_ptr_tag(new_obj, 1)) {
-                        // handle tagged pointers in finalizer list
-                        new_obj = gc_ptr_clear_tag(new_obj, 1);
-                        // skip over the finalizer fptr
-                        i++;
+    else {
+        gc_mark_push_remset(ptls, ary8_parent, nptr);
+    }
+}
+
+// Mark array with 16bit field descriptors
+STATIC_INLINE void gc_mark_array16(jl_ptls_t ptls, jl_value_t *ary16_parent, jl_value_t **ary16_begin,
+                                   jl_value_t **ary16_end, uint16_t *elem_begin, uint16_t *elem_end,
+                                   uintptr_t nptr) JL_NOTSAFEPOINT
+{
+    jl_gc_markqueue_t *mq = &ptls->mark_queue;
+    jl_value_t *new_obj;
+    size_t elsize = ((jl_array_t *)ary16_parent)->elsize / sizeof(jl_value_t *);
+    assert(elsize > 0);
+    // Decide whether we need to chunk the array
+    if ((nptr & 0x2) == 0x2) {
+        // pre-scan this object: most of this object should be old, so look for
+        // the first young object before starting this chunk
+        // (this also would be valid for young objects, but probably less beneficial)
+        for (; ary16_begin < ary16_end; ary16_begin += elsize) {
+            int early_end = 0;
+            for (uint16_t *pindex = elem_begin; pindex < elem_end; pindex++) {
+                jl_value_t **slot = &ary16_begin[*pindex];
+                new_obj = *slot;
+                if (new_obj != NULL) {
+                    verify_parent2("array", ary16_parent, &new_obj, "elem(%d)",
+                                   gc_slot_to_arrayidx(ary16_parent, ary16_begin));
+                    jl_taggedvalue_t *o = jl_astaggedvalue(new_obj);
+                    if (!gc_old(o->header))
+                        nptr |= 1;
+                    if (!gc_marked(o->header)) {
+                        early_end = 1;
+                        break;
                    }
-                    if (gc_ptr_tag(new_obj, 2))
-                        continue;
-                }
-                if (!gc_try_setmark(new_obj, &nptr, &tag, &bits))
-                    continue;
-                gc_heap_snapshot_record_frame_to_object_edge(s, new_obj);
-                i++;
-                if (i < nr) {
-                    // Haven't done with this one yet. 
Update the content and push it back - stack->i = i; - gc_repush_markdata(&sp, gc_mark_stackframe_t); - } - // TODO stack addresses needs copy stack handling - else if ((s = (jl_gcframe_t*)gc_read_stack(&s->prev, offset, lb, ub))) { - gc_heap_snapshot_record_frame_to_frame_edge(stack->s, s); - stack->s = s; - stack->i = 0; - uintptr_t new_nroots = gc_read_stack(&s->nroots, offset, lb, ub); - assert(new_nroots <= UINT32_MAX); - stack->nroots = (uint32_t)new_nroots; - gc_repush_markdata(&sp, gc_mark_stackframe_t); + gc_heap_snapshot_record_array_edge(ary16_parent, slot); } - goto mark; } - s = (jl_gcframe_t*)gc_read_stack(&s->prev, offset, lb, ub); - // walk up one stack frame - if (s != 0) { - gc_heap_snapshot_record_frame_to_frame_edge(stack->s, s); - stack->s = s; - i = 0; - uintptr_t new_nroots = gc_read_stack(&s->nroots, offset, lb, ub); - assert(new_nroots <= UINT32_MAX); - nroots = stack->nroots = (uint32_t)new_nroots; - nr = nroots >> 2; - continue; + if (early_end) + break; + } + } + size_t too_big = (ary16_end - ary16_begin) / GC_CHUNK_BATCH_SIZE > elsize; // use this order of operations to avoid idiv + jl_value_t **scan_end = ary16_end; + int pushed_chunk = 0; + if (too_big) { + scan_end = ary16_begin + elsize * GC_CHUNK_BATCH_SIZE; + // case 1: array owner is young, so we won't need to scan through all its elements + // to know that we will never need to push it to the remset. it's fine + // to create a chunk with "incorrect" `nptr` and push it to the chunk-queue + // ASAP in order to expose as much parallelism as possible + // case 2: lowest two bits of `nptr` are already set to 0x3, so won't change after + // scanning the array elements + if ((nptr & 0x2) != 0x2 || (nptr & 0x3) == 0x3) { + jl_gc_chunk_t c = {GC_ary16_chunk, ary16_parent, scan_end, ary16_end, elem_begin, elem_end, elsize, nptr}; + gc_chunkqueue_push(mq, &c); + pushed_chunk = 1; + } + } + for (; ary16_begin < scan_end; ary16_begin += elsize) { + for (uint16_t *pindex = elem_begin; pindex < elem_end; pindex++) { + jl_value_t **slot = &ary16_begin[*pindex]; + new_obj = *slot; + if (new_obj != NULL) { + verify_parent2("array", ary16_parent, &new_obj, "elem(%d)", + gc_slot_to_arrayidx(ary16_parent, ary16_begin)); + gc_try_claim_and_push(mq, new_obj, &nptr); + gc_heap_snapshot_record_array_edge(ary16_parent, slot); } - goto pop; } } + if (too_big) { + if (!pushed_chunk) { + jl_gc_chunk_t c = {GC_ary16_chunk, ary16_parent, scan_end, ary16_end, elem_begin, elem_end, elsize, nptr}; + gc_chunkqueue_push(mq, &c); + } + } + else { + gc_mark_push_remset(ptls, ary16_parent, nptr); + } +} -excstack: { - // Scan an exception stack - gc_mark_excstack_t *stackitr = gc_pop_markdata(&sp, gc_mark_excstack_t); - jl_excstack_t *excstack = stackitr->s; - size_t itr = stackitr->itr; - size_t bt_index = stackitr->bt_index; - size_t jlval_index = stackitr->jlval_index; - while (itr > 0) { - size_t bt_size = jl_excstack_bt_size(excstack, itr); - jl_bt_element_t *bt_data = jl_excstack_bt_data(excstack, itr); - for (; bt_index < bt_size; bt_index += jl_bt_entry_size(bt_data + bt_index)) { - jl_bt_element_t *bt_entry = bt_data + bt_index; - if (jl_bt_is_native(bt_entry)) - continue; - // Found an extended backtrace entry: iterate over any - // GC-managed values inside. 
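+// Sketch of `jl_gc_chunk_t` as used by the chunk constructors above (see gc.h
+// for the authoritative definition): {cid, parent, begin, end, elem_begin,
+// elem_end, step, nptr}. A chunk is a suspended traversal of a large object;
+// `gc_mark_chunk` below simply re-enters the matching `gc_mark_*` routine
+// with this saved state.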
- size_t njlvals = jl_bt_num_jlvals(bt_entry); - while (jlval_index < njlvals) { - new_obj = jl_bt_entry_jlvalue(bt_entry, jlval_index); - gc_heap_snapshot_record_frame_to_object_edge(bt_entry, new_obj); - uintptr_t nptr = 0; - jlval_index += 1; - if (gc_try_setmark(new_obj, &nptr, &tag, &bits)) { - stackitr->itr = itr; - stackitr->bt_index = bt_index; - stackitr->jlval_index = jlval_index; - gc_repush_markdata(&sp, gc_mark_excstack_t); - goto mark; - } - } - jlval_index = 0; - } - // The exception comes last - mark it - new_obj = jl_excstack_exception(excstack, itr); - gc_heap_snapshot_record_frame_to_object_edge(excstack, new_obj); - itr = jl_excstack_next(excstack, itr); - bt_index = 0; - jlval_index = 0; - uintptr_t nptr = 0; - if (gc_try_setmark(new_obj, &nptr, &tag, &bits)) { - stackitr->itr = itr; - stackitr->bt_index = bt_index; - stackitr->jlval_index = jlval_index; - gc_repush_markdata(&sp, gc_mark_excstack_t); - goto mark; - } +// Mark chunk of large array +STATIC_INLINE void gc_mark_chunk(jl_ptls_t ptls, jl_gc_markqueue_t *mq, jl_gc_chunk_t *c) JL_NOTSAFEPOINT +{ + switch (c->cid) { + case GC_objary_chunk: { + jl_value_t *obj_parent = c->parent; + jl_value_t **obj_begin = c->begin; + jl_value_t **obj_end = c->end; + uint32_t step = c->step; + uintptr_t nptr = c->nptr; + gc_mark_objarray(ptls, obj_parent, obj_begin, obj_end, step, + nptr); + break; + } + case GC_ary8_chunk: { + jl_value_t *ary8_parent = c->parent; + jl_value_t **ary8_begin = c->begin; + jl_value_t **ary8_end = c->end; + uint8_t *elem_begin = (uint8_t *)c->elem_begin; + uint8_t *elem_end = (uint8_t *)c->elem_end; + uintptr_t nptr = c->nptr; + gc_mark_array8(ptls, ary8_parent, ary8_begin, ary8_end, elem_begin, elem_end, + nptr); + break; + } + case GC_ary16_chunk: { + jl_value_t *ary16_parent = c->parent; + jl_value_t **ary16_begin = c->begin; + jl_value_t **ary16_end = c->end; + uint16_t *elem_begin = (uint16_t *)c->elem_begin; + uint16_t *elem_end = (uint16_t *)c->elem_end; + uintptr_t nptr = c->nptr; + gc_mark_array16(ptls, ary16_parent, ary16_begin, ary16_end, elem_begin, elem_end, + nptr); + break; + } + case GC_finlist_chunk: { + jl_value_t **fl_begin = c->begin; + jl_value_t **fl_end = c->end; + gc_mark_finlist_(mq, fl_begin, fl_end); + break; + } + default: { + // `empty-chunk` should be checked by caller + jl_safe_printf("GC internal error: chunk mismatch\n"); + abort(); } - goto pop; } +} -module_binding: { - // Scan a module. 
see `gc_mark_binding_t` - // Other fields of the module will be scanned after the bindings are scanned - gc_mark_binding_t *binding = gc_pop_markdata(&sp, gc_mark_binding_t); - jl_binding_t **begin = binding->begin; - jl_binding_t **end = binding->end; - uint8_t mbits = binding->bits; - for (; begin < end; begin += 2) { - jl_binding_t *b = *begin; - if (b == (jl_binding_t*)HT_NOTFOUND) - continue; - if (jl_object_in_image((jl_value_t*)b)) { - jl_taggedvalue_t *buf = jl_astaggedvalue(b); - uintptr_t tag = buf->header; - uint8_t bits; - if (!gc_marked(tag)) - gc_setmark_tag(buf, GC_OLD_MARKED, tag, &bits); +// Mark gc frame +STATIC_INLINE void gc_mark_stack(jl_ptls_t ptls, jl_gcframe_t *s, uint32_t nroots, uintptr_t offset, + uintptr_t lb, uintptr_t ub) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t *new_obj; + uint32_t nr = nroots >> 2; + while (1) { + jl_value_t ***rts = (jl_value_t ***)(((void **)s) + 2); + for (uint32_t i = 0; i < nr; i++) { + if (nroots & 1) { + void **slot = (void **)gc_read_stack(&rts[i], offset, lb, ub); + new_obj = (jl_value_t *)gc_read_stack(slot, offset, lb, ub); } else { - gc_setmark_buf_(ptls, b, mbits, sizeof(jl_binding_t)); - } - void *vb = jl_astaggedvalue(b); - verify_parent1("module", binding->parent, &vb, "binding_buff"); - // Record the size used for the box for non-const bindings - gc_heap_snapshot_record_module_to_binding(binding->parent, b); - (void)vb; - jl_value_t *ty = jl_atomic_load_relaxed(&b->ty); - if (ty && ty != (jl_value_t*)jl_any_type) { - verify_parent2("module", binding->parent, - &b->ty, "binding(%s)", jl_symbol_name(b->name)); - if (gc_try_setmark(ty, &binding->nptr, &tag, &bits)) { - new_obj = ty; - gc_repush_markdata(&sp, gc_mark_binding_t); - goto mark; - } - } - jl_value_t *value = jl_atomic_load_relaxed(&b->value); - jl_value_t *globalref = jl_atomic_load_relaxed(&b->globalref); - if (value) { - verify_parent2("module", binding->parent, - &b->value, "binding(%s)", jl_symbol_name(b->name)); - if (gc_try_setmark(value, &binding->nptr, &tag, &bits)) { - new_obj = value; - begin += 2; - binding->begin = begin; - gc_repush_markdata(&sp, gc_mark_binding_t); - uintptr_t gr_tag; - uint8_t gr_bits; - if (gc_try_setmark(globalref, &binding->nptr, &gr_tag, &gr_bits)) { - gc_mark_marked_obj_t data = {globalref, gr_tag, gr_bits}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(marked_obj), - &data, sizeof(data), 1); - } - goto mark; + new_obj = (jl_value_t *)gc_read_stack(&rts[i], offset, lb, ub); + if (gc_ptr_tag(new_obj, 1)) { + // handle tagged pointers in finalizer list + new_obj = (jl_value_t *)gc_ptr_clear_tag(new_obj, 1); + // skip over the finalizer fptr + i++; } + if (gc_ptr_tag(new_obj, 2)) + continue; } - if (gc_try_setmark(globalref, &binding->nptr, &tag, &bits)) { - begin += 2; - binding->begin = begin; - gc_repush_markdata(&sp, gc_mark_binding_t); - new_obj = globalref; - goto mark; + if (new_obj != NULL) { + gc_try_claim_and_push(mq, new_obj, NULL); + gc_heap_snapshot_record_frame_to_object_edge(s, new_obj); } } - jl_module_t *m = binding->parent; - int scanparent = gc_try_setmark((jl_value_t*)m->parent, &binding->nptr, &tag, &bits); - size_t nusings = m->usings.len; - if (nusings) { - // this is only necessary because bindings for "using" modules - // are added only when accessed. therefore if a module is replaced - // after "using" it but before accessing it, this array might - // contain the only reference. 
-            objary_begin = (jl_value_t**)m->usings.items;
-            objary_end = objary_begin + nusings;
-            gc_mark_objarray_t data = {(jl_value_t*)m, objary_begin, objary_end, 1, binding->nptr};
-            gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray),
-                               &data, sizeof(data), 0);
-            if (!scanparent) {
-                objary = (gc_mark_objarray_t*)sp.data;
-                goto objarray_loaded;
+        jl_gcframe_t *sprev = (jl_gcframe_t *)gc_read_stack(&s->prev, offset, lb, ub);
+        if (sprev == NULL)
+            break;
+        gc_heap_snapshot_record_frame_to_frame_edge(s, sprev);
+        s = sprev;
+        uintptr_t new_nroots = gc_read_stack(&s->nroots, offset, lb, ub);
+        assert(new_nroots <= UINT32_MAX);
+        nroots = (uint32_t)new_nroots;
+        nr = nroots >> 2;
+    }
+}
+
+// Mark exception stack
+STATIC_INLINE void gc_mark_excstack(jl_ptls_t ptls, jl_excstack_t *excstack, size_t itr) JL_NOTSAFEPOINT
+{
+    jl_gc_markqueue_t *mq = &ptls->mark_queue;
+    jl_value_t *new_obj;
+    while (itr > 0) {
+        size_t bt_size = jl_excstack_bt_size(excstack, itr);
+        jl_bt_element_t *bt_data = jl_excstack_bt_data(excstack, itr);
+        for (size_t bt_index = 0; bt_index < bt_size;
+             bt_index += jl_bt_entry_size(bt_data + bt_index)) {
+            jl_bt_element_t *bt_entry = bt_data + bt_index;
+            if (jl_bt_is_native(bt_entry))
+                continue;
+            // Found an extended backtrace entry: iterate over any
+            // GC-managed values inside.
+            size_t njlvals = jl_bt_num_jlvals(bt_entry);
+            for (size_t jlval_index = 0; jlval_index < njlvals; jlval_index++) {
+                new_obj = jl_bt_entry_jlvalue(bt_entry, jlval_index);
+                gc_try_claim_and_push(mq, new_obj, NULL);
+                gc_heap_snapshot_record_frame_to_object_edge(bt_entry, new_obj);
+            }
+        }
+        // The exception comes last - mark it
+        new_obj = jl_excstack_exception(excstack, itr);
+        itr = jl_excstack_next(excstack, itr);
+        gc_try_claim_and_push(mq, new_obj, NULL);
+        gc_heap_snapshot_record_frame_to_object_edge(excstack, new_obj);
+    }
+}
+
+// Mark module binding
+STATIC_INLINE void gc_mark_module_binding(jl_ptls_t ptls, jl_module_t *mb_parent, jl_binding_t **mb_begin,
+                                          jl_binding_t **mb_end, uintptr_t nptr,
+                                          uint8_t bits) JL_NOTSAFEPOINT
+{
+    jl_gc_markqueue_t *mq = &ptls->mark_queue;
+    for (; mb_begin < mb_end; mb_begin += 2) {
+        jl_binding_t *b = *mb_begin;
+        if (b == (jl_binding_t *)HT_NOTFOUND)
+            continue;
+        if (jl_object_in_image((jl_value_t*)b)) {
+            jl_taggedvalue_t *buf = jl_astaggedvalue(b);
+            gc_try_setmark_tag(buf, GC_OLD_MARKED);
        }
        else {
-            gc_mark_push_remset(ptls, (jl_value_t*)m, binding->nptr);
+            gc_setmark_buf_(ptls, b, bits, sizeof(jl_binding_t));
        }
-        if (scanparent) {
-            new_obj = (jl_value_t*)m->parent;
-            goto mark;
+        void *vb = jl_astaggedvalue(b);
+        verify_parent1("module", mb_parent, &vb, "binding_buff");
+        // Record the size used for the box for non-const bindings
+        gc_heap_snapshot_record_module_to_binding(mb_parent, b);
+        (void)vb;
+        jl_value_t *ty = jl_atomic_load_relaxed(&b->ty);
+        if (ty && ty != (jl_value_t*)jl_any_type) {
+            verify_parent2("module", mb_parent,
+                           &b->ty, "binding(%s)", jl_symbol_name(b->name));
+            gc_try_claim_and_push(mq, ty, &nptr);
        }
+        jl_value_t *value = jl_atomic_load_relaxed(&b->value);
+        if (value) {
+            verify_parent2("module", mb_parent,
+                           &b->value, "binding(%s)", jl_symbol_name(b->name));
+            gc_try_claim_and_push(mq, value, &nptr);
+        }
+        jl_value_t *globalref = jl_atomic_load_relaxed(&b->globalref);
+        gc_try_claim_and_push(mq, globalref, &nptr);
+    }
+    gc_try_claim_and_push(mq, (jl_value_t *)mb_parent->parent, &nptr);
+    size_t nusings = mb_parent->usings.len;
+    if (nusings > 0) {
+        // this is only necessary because bindings for "using" modules
+        // are added only when accessed. therefore if a module is replaced
+        // after "using" it but before accessing it, this array might
+        // contain the only reference.
+        jl_value_t *obj_parent = (jl_value_t *)mb_parent;
+        jl_value_t **objary_begin = (jl_value_t **)mb_parent->usings.items;
+        jl_value_t **objary_end = objary_begin + nusings;
+        gc_mark_objarray(ptls, obj_parent, objary_begin, objary_end, 1, nptr);
    }
+    else {
+        gc_mark_push_remset(ptls, (jl_value_t *)mb_parent, nptr);
+    }
+}

-finlist: {
-        // Scan a finalizer (or format compatible) list. see `gc_mark_finlist_t`
-        gc_mark_finlist_t *finlist = gc_pop_markdata(&sp, gc_mark_finlist_t);
-        jl_value_t **begin = finlist->begin;
-        jl_value_t **end = finlist->end;
-        for (; begin < end; begin++) {
-            new_obj = *begin;
-            if (__unlikely(!new_obj))
-                continue;
-            if (gc_ptr_tag(new_obj, 1)) {
-                new_obj = (jl_value_t*)gc_ptr_clear_tag(new_obj, 1);
-                begin++;
-                assert(begin < end);
-            }
-            if (gc_ptr_tag(new_obj, 2))
-                continue;
-            uintptr_t nptr = 0;
-            if (!gc_try_setmark(new_obj, &nptr, &tag, &bits))
-                continue;
-            begin++;
-            // Found an object to mark
-            if (begin < end) {
-                // Haven't done with this one yet. Update the content and push it back
-                finlist->begin = begin;
-                gc_repush_markdata(&sp, gc_mark_finlist_t);
-            }
-            goto mark;
+void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t **fl_end)
+{
+    jl_value_t *new_obj;
+    // Decide whether we need to chunk the finalizer list
+    size_t nrefs = (fl_end - fl_begin);
+    if (nrefs > GC_CHUNK_BATCH_SIZE) {
+        jl_gc_chunk_t c = {GC_finlist_chunk, NULL, fl_begin + GC_CHUNK_BATCH_SIZE, fl_end, 0, 0, 0, 0};
+        gc_chunkqueue_push(mq, &c);
+        fl_end = fl_begin + GC_CHUNK_BATCH_SIZE;
+    }
+    for (; fl_begin < fl_end; fl_begin++) {
+        new_obj = *fl_begin;
+        if (__unlikely(!new_obj))
+            continue;
+        if (gc_ptr_tag(new_obj, 1)) {
+            new_obj = (jl_value_t *)gc_ptr_clear_tag(new_obj, 1);
+            fl_begin++;
+            assert(fl_begin < fl_end);
        }
+        if (gc_ptr_tag(new_obj, 2))
+            continue;
+        gc_try_claim_and_push(mq, new_obj, NULL);
    }
+}

-mark: {
-        // Generic scanning entry point.
-        // Expects `new_obj`, `tag` and `bits` to be set correctly.
-#ifdef JL_DEBUG_BUILD
+// Mark finalizer list (or list of objects following same format)
+void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start)
+{
+    size_t len = list->len;
+    if (len <= start)
+        return;
+    jl_value_t **fl_begin = (jl_value_t **)list->items + start;
+    jl_value_t **fl_end = (jl_value_t **)list->items + len;
+    gc_mark_finlist_(mq, fl_begin, fl_end);
+}
+
+JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj)
+{
+    int may_claim = gc_try_setmark_tag(jl_astaggedvalue(obj), GC_MARKED);
+    if (may_claim)
+        gc_ptr_queue_push(&ptls->mark_queue, obj);
+    return may_claim;
+}
+
+JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent,
+                                            jl_value_t **objs, size_t nobjs)
+{
+    uintptr_t nptr = (nobjs << 2) | (jl_astaggedvalue(parent)->bits.gc & 2);
+    gc_mark_objarray(ptls, parent, objs, objs + nobjs, 1, nptr);
+}
+
+// Enqueue and mark all outgoing references from `new_obj` which have not been marked
+// yet. 
`meta_updated` is mostly used to make sure we don't update metadata twice for +// objects which have been enqueued into the `remset` +FORCE_INLINE void gc_mark_outrefs(jl_ptls_t ptls, jl_gc_markqueue_t *mq, void *_new_obj, + int meta_updated) +{ + jl_value_t *new_obj = (jl_value_t *)_new_obj; + mark_obj: { + #ifdef JL_DEBUG_BUILD if (new_obj == gc_findval) jl_raise_debugger(); -#endif + #endif jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); - jl_datatype_t *vt = (jl_datatype_t*)tag; - int foreign_alloc = 0; + jl_datatype_t *vt = (jl_datatype_t *)(o->header & ~(uintptr_t)0xf); + uint8_t bits = (gc_old(o->header) && !mark_reset_age) ? GC_OLD_MARKED : GC_MARKED; int update_meta = __likely(!meta_updated && !gc_verifying); - if (update_meta && o->bits.in_image) { + int foreign_alloc = 0; + if (update_meta && jl_object_in_image(new_obj)) { foreign_alloc = 1; update_meta = 0; } - meta_updated = 0; // Symbols are always marked assert(vt != jl_symbol_type); if (vt == jl_simplevector_type) { size_t l = jl_svec_len(new_obj); jl_value_t **data = jl_svec_data(new_obj); - size_t dtsz = l * sizeof(void*) + sizeof(jl_svec_t); + size_t dtsz = l * sizeof(void *) + sizeof(jl_svec_t); if (update_meta) gc_setmark(ptls, o, bits, dtsz); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, dtsz); + jl_value_t *objary_parent = new_obj; + jl_value_t **objary_begin = data; + jl_value_t **objary_end = data + l; + uint32_t step = 1; uintptr_t nptr = (l << 2) | (bits & GC_OLD); - objary_begin = data; - objary_end = data + l; - gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, 1, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &markdata, sizeof(markdata), 0); - objary = (gc_mark_objarray_t*)sp.data; - goto objarray_loaded; + gc_mark_objarray(ptls, objary_parent, objary_begin, objary_end, step, nptr); } else if (vt->name == jl_array_typename) { - jl_array_t *a = (jl_array_t*)new_obj; + jl_array_t *a = (jl_array_t *)new_obj; jl_array_flags_t flags = a->flags; if (update_meta) { if (flags.pooled) @@ -2832,9 +2614,10 @@ mark: { else gc_setmark_big(ptls, o, bits); } - else if (foreign_alloc) + else if (foreign_alloc) { objprofile_count(vt, bits == GC_OLD_MARKED, sizeof(jl_array_t)); - if (flags.how ==0){ + } + if (flags.how == 0) { void *data_ptr = (char*)a + sizeof(jl_array_t) +jl_array_ndimwords(a->flags.ndims) * sizeof(size_t); gc_heap_snapshot_record_hidden_edge(new_obj, data_ptr, jl_array_nbytes(a), 2); } @@ -2862,102 +2645,81 @@ mark: { else if (flags.how == 3) { jl_value_t *owner = jl_array_data_owner(a); uintptr_t nptr = (1 << 2) | (bits & GC_OLD); + gc_try_claim_and_push(mq, owner, &nptr); gc_heap_snapshot_record_internal_array_edge(new_obj, owner); - int markowner = gc_try_setmark(owner, &nptr, &tag, &bits); gc_mark_push_remset(ptls, new_obj, nptr); - if (markowner) { - new_obj = owner; - goto mark; - } - goto pop; + return; } - if (a->data == NULL || jl_array_len(a) == 0) - goto pop; + if (!a->data || jl_array_len(a) == 0) + return; if (flags.ptrarray) { - if ((jl_datatype_t*)jl_tparam0(vt) == jl_symbol_type) - goto pop; + if ((jl_datatype_t *)jl_tparam0(vt) == jl_symbol_type) + return; size_t l = jl_array_len(a); + jl_value_t *objary_parent = new_obj; + jl_value_t **objary_begin = (jl_value_t **)a->data; + jl_value_t **objary_end = objary_begin + l; + uint32_t step = 1; uintptr_t nptr = (l << 2) | (bits & GC_OLD); - objary_begin = (jl_value_t**)a->data; - objary_end = objary_begin + l; - gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, 
1, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &markdata, sizeof(markdata), 0); - objary = (gc_mark_objarray_t*)sp.data; - goto objarray_loaded; + gc_mark_objarray(ptls, objary_parent, objary_begin, objary_end, step, nptr); } else if (flags.hasptr) { - jl_datatype_t *et = (jl_datatype_t*)jl_tparam0(vt); + jl_datatype_t *et = (jl_datatype_t *)jl_tparam0(vt); const jl_datatype_layout_t *layout = et->layout; unsigned npointers = layout->npointers; - unsigned elsize = a->elsize / sizeof(jl_value_t*); + unsigned elsize = a->elsize / sizeof(jl_value_t *); size_t l = jl_array_len(a); + jl_value_t *objary_parent = new_obj; + jl_value_t **objary_begin = (jl_value_t **)a->data; + jl_value_t **objary_end = objary_begin + l * elsize; + uint32_t step = elsize; uintptr_t nptr = ((l * npointers) << 2) | (bits & GC_OLD); - objary_begin = (jl_value_t**)a->data; - objary_end = objary_begin + l * elsize; if (npointers == 1) { // TODO: detect anytime time stride is uniform? objary_begin += layout->first_ptr; - gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, elsize, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &markdata, sizeof(markdata), 0); - objary = (gc_mark_objarray_t*)sp.data; - goto objarray_loaded; + gc_mark_objarray(ptls, objary_parent, objary_begin, objary_end, step, nptr); } else if (layout->fielddesc_type == 0) { - obj8_begin = (uint8_t*)jl_dt_layout_ptrs(layout); - obj8_end = obj8_begin + npointers; - gc_mark_array8_t markdata = {objary_begin, objary_end, obj8_begin, {new_obj, obj8_begin, obj8_end, nptr}}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(array8), - &markdata, sizeof(markdata), 0); - ary8 = (gc_mark_array8_t*)sp.data; - goto array8_loaded; + uint8_t *obj8_begin = (uint8_t *)jl_dt_layout_ptrs(layout); + uint8_t *obj8_end = obj8_begin + npointers; + gc_mark_array8(ptls, objary_parent, objary_begin, objary_end, obj8_begin, + obj8_end, nptr); } else if (layout->fielddesc_type == 1) { - obj16_begin = (uint16_t*)jl_dt_layout_ptrs(layout); - obj16_end = obj16_begin + npointers; - gc_mark_array16_t markdata = {objary_begin, objary_end, obj16_begin, {new_obj, obj16_begin, obj16_end, nptr}}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(array16), - &markdata, sizeof(markdata), 0); - ary16 = (gc_mark_array16_t*)sp.data; - goto array16_loaded; + uint16_t *obj16_begin = (uint16_t *)jl_dt_layout_ptrs(layout); + uint16_t *obj16_end = obj16_begin + npointers; + gc_mark_array16(ptls, objary_parent, objary_begin, objary_end, obj16_begin, + obj16_end, nptr); } else { assert(0 && "unimplemented"); } } - goto pop; } else if (vt == jl_module_type) { if (update_meta) gc_setmark(ptls, o, bits, sizeof(jl_module_t)); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, sizeof(jl_module_t)); - jl_module_t *m = (jl_module_t*)new_obj; - jl_binding_t **table = (jl_binding_t**)m->bindings.table; - size_t bsize = m->bindings.size; - uintptr_t nptr = ((bsize + m->usings.len + 1) << 2) | (bits & GC_OLD); - gc_mark_binding_t markdata = {m, table + 1, table + bsize, nptr, bits}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(module_binding), - &markdata, sizeof(markdata), 0); - sp.data = (jl_gc_mark_data_t *)(((char*)sp.data) + sizeof(markdata)); - goto module_binding; + jl_module_t *mb_parent = (jl_module_t *)new_obj; + jl_binding_t **mb_begin = (jl_binding_t **)mb_parent->bindings.table + 1; + size_t bsize = mb_parent->bindings.size; + jl_binding_t **mb_end = (jl_binding_t 
**)mb_parent->bindings.table + bsize; + uintptr_t nptr = ((bsize + mb_parent->usings.len + 1) << 2) | (bits & GC_OLD); + gc_mark_module_binding(ptls, mb_parent, mb_begin, mb_end, nptr, bits); } else if (vt == jl_task_type) { if (update_meta) gc_setmark(ptls, o, bits, sizeof(jl_task_t)); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, sizeof(jl_task_t)); - jl_task_t *ta = (jl_task_t*)new_obj; + jl_task_t *ta = (jl_task_t *)new_obj; gc_scrub_record_task(ta); if (gc_cblist_task_scanner) { - export_gc_state(ptls, &sp); int16_t tid = jl_atomic_load_relaxed(&ta->tid); - gc_invoke_callbacks(jl_gc_cb_task_scanner_t, - gc_cblist_task_scanner, - (ta, tid != -1 && ta == gc_all_tls_states[tid]->root_task)); - import_gc_state(ptls, &sp); + gc_invoke_callbacks(jl_gc_cb_task_scanner_t, gc_cblist_task_scanner, + (ta, tid != -1 && ta == gc_all_tls_states[tid]->root_task)); } -#ifdef COPY_STACKS + #ifdef COPY_STACKS void *stkbuf = ta->stkbuf; if (stkbuf && ta->copy_stack) { gc_setmark_buf_(ptls, stkbuf, bits, ta->bufsz); @@ -2966,14 +2728,14 @@ mark: { // TODO: edge to stack data // TODO: synthetic node for stack data (how big is it?) } -#endif + #endif jl_gcframe_t *s = ta->gcstack; size_t nroots; uintptr_t offset = 0; uintptr_t lb = 0; uintptr_t ub = (uintptr_t)-1; -#ifdef COPY_STACKS - if (stkbuf && ta->copy_stack && ta->ptls == NULL) { + #ifdef COPY_STACKS + if (stkbuf && ta->copy_stack && !ta->ptls) { int16_t tid = jl_atomic_load_relaxed(&ta->tid); assert(tid >= 0); jl_ptls_t ptls2 = gc_all_tls_states[tid]; @@ -2981,38 +2743,38 @@ mark: { lb = ub - ta->copy_stack; offset = (uintptr_t)stkbuf - lb; } -#endif - if (s) { + #endif + if (s != NULL) { nroots = gc_read_stack(&s->nroots, offset, lb, ub); gc_heap_snapshot_record_task_to_frame_edge(ta, s); - assert(nroots <= UINT32_MAX); - gc_mark_stackframe_t stackdata = {s, 0, (uint32_t)nroots, offset, lb, ub}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(stack), - &stackdata, sizeof(stackdata), 1); + gc_mark_stack(ptls, s, (uint32_t)nroots, offset, lb, ub); } if (ta->excstack) { - gc_heap_snapshot_record_task_to_frame_edge(ta, ta->excstack); - gc_setmark_buf_(ptls, ta->excstack, bits, sizeof(jl_excstack_t) + - sizeof(uintptr_t)*ta->excstack->reserved_size); - gc_mark_excstack_t stackdata = {ta->excstack, ta->excstack->top, 0, 0}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(excstack), - &stackdata, sizeof(stackdata), 1); + jl_excstack_t *excstack = ta->excstack; + gc_heap_snapshot_record_task_to_frame_edge(ta, excstack); + size_t itr = ta->excstack->top; + gc_setmark_buf_(ptls, excstack, bits, + sizeof(jl_excstack_t) + + sizeof(uintptr_t) * excstack->reserved_size); + gc_mark_excstack(ptls, excstack, itr); } const jl_datatype_layout_t *layout = jl_task_type->layout; assert(layout->fielddesc_type == 0); assert(layout->nfields > 0); uint32_t npointers = layout->npointers; - obj8_begin = (uint8_t*)jl_dt_layout_ptrs(layout); - obj8_end = obj8_begin + npointers; + char *obj8_parent = (char *)ta; + uint8_t *obj8_begin = (uint8_t *)jl_dt_layout_ptrs(layout); + uint8_t *obj8_end = obj8_begin + npointers; // assume tasks always reference young objects: set lowest bit uintptr_t nptr = (npointers << 2) | 1 | bits; - gc_mark_obj8_t markdata = {new_obj, obj8_begin, obj8_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj8), - &markdata, sizeof(markdata), 0); - obj8 = (gc_mark_obj8_t*)sp.data; - obj8_parent = (char*)ta; - goto obj8_loaded; + new_obj = gc_mark_obj8(ptls, obj8_parent, obj8_begin, obj8_end, 
nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_ptr_queue_push(mq, new_obj); + } } else if (vt == jl_string_type) { size_t dtsz = jl_string_len(new_obj) + sizeof(size_t) + 1; @@ -3020,140 +2782,457 @@ mark: { gc_setmark(ptls, o, bits, dtsz); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, dtsz); - goto pop; } else { if (__unlikely(!jl_is_datatype(vt))) - gc_assert_datatype_fail(ptls, vt, sp); + gc_assert_datatype_fail(ptls, vt, mq); size_t dtsz = jl_datatype_size(vt); if (update_meta) gc_setmark(ptls, o, bits, dtsz); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, dtsz); if (vt == jl_weakref_type) - goto pop; + return; const jl_datatype_layout_t *layout = vt->layout; uint32_t npointers = layout->npointers; if (npointers == 0) - goto pop; - uintptr_t nptr = npointers << 2 | (bits & GC_OLD); - assert((layout->nfields > 0 || layout->fielddesc_type == 3) && "opaque types should have been handled specially"); + return; + uintptr_t nptr = (npointers << 2 | (bits & GC_OLD)); + assert((layout->nfields > 0 || layout->fielddesc_type == 3) && + "opaque types should have been handled specially"); if (layout->fielddesc_type == 0) { - obj8_parent = (char*)new_obj; - obj8_begin = (uint8_t*)jl_dt_layout_ptrs(layout); - obj8_end = obj8_begin + npointers; + char *obj8_parent = (char *)new_obj; + uint8_t *obj8_begin = (uint8_t *)jl_dt_layout_ptrs(layout); + uint8_t *obj8_end = obj8_begin + npointers; assert(obj8_begin < obj8_end); - gc_mark_obj8_t markdata = {new_obj, obj8_begin, obj8_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj8), - &markdata, sizeof(markdata), 0); - obj8 = (gc_mark_obj8_t*)sp.data; - goto obj8_loaded; + new_obj = gc_mark_obj8(ptls, obj8_parent, obj8_begin, obj8_end, nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_ptr_queue_push(mq, new_obj); + } } else if (layout->fielddesc_type == 1) { - obj16_parent = (char*)new_obj; - obj16_begin = (uint16_t*)jl_dt_layout_ptrs(layout); - obj16_end = obj16_begin + npointers; + char *obj16_parent = (char *)new_obj; + uint16_t *obj16_begin = (uint16_t *)jl_dt_layout_ptrs(layout); + uint16_t *obj16_end = obj16_begin + npointers; assert(obj16_begin < obj16_end); - gc_mark_obj16_t markdata = {new_obj, obj16_begin, obj16_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj16), - &markdata, sizeof(markdata), 0); - obj16 = (gc_mark_obj16_t*)sp.data; - goto obj16_loaded; + new_obj = gc_mark_obj16(ptls, obj16_parent, obj16_begin, obj16_end, nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_ptr_queue_push(mq, new_obj); + } } else if (layout->fielddesc_type == 2) { // This is very uncommon // Do not do store to load forwarding to save some code size - uint32_t *obj32_begin = (uint32_t*)jl_dt_layout_ptrs(layout); + char *obj32_parent = (char *)new_obj; + uint32_t *obj32_begin = (uint32_t *)jl_dt_layout_ptrs(layout); uint32_t *obj32_end = obj32_begin + npointers; - gc_mark_obj32_t markdata = {new_obj, obj32_begin, obj32_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj32), - &markdata, sizeof(markdata), 0); - sp.data = (jl_gc_mark_data_t *)(((char*)sp.data) + sizeof(markdata)); - goto obj32; + assert(obj32_begin < obj32_end); + new_obj = gc_mark_obj32(ptls, obj32_parent, obj32_begin, obj32_end, nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_ptr_queue_push(mq, new_obj); + } } else { assert(layout->fielddesc_type == 3); - 
jl_fielddescdyn_t *desc = (jl_fielddescdyn_t*)jl_dt_layout_fields(layout); + jl_fielddescdyn_t *desc = (jl_fielddescdyn_t *)jl_dt_layout_fields(layout); int old = jl_astaggedvalue(new_obj)->bits.gc & 2; - export_gc_state(ptls, &sp); uintptr_t young = desc->markfunc(ptls, new_obj); - import_gc_state(ptls, &sp); if (old && young) gc_mark_push_remset(ptls, new_obj, young * 4 + 3); + } + } + } +} + +// Used in gc-debug +void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq) +{ + while (1) { + void *new_obj = (void *)gc_ptr_queue_pop(&ptls->mark_queue); + // No more objects to mark + if (__unlikely(new_obj == NULL)) { + return; + } + gc_mark_outrefs(ptls, mq, new_obj, 0); + } +} + +// Drain items from worker's own chunkqueue +void gc_drain_own_chunkqueue(jl_ptls_t ptls, jl_gc_markqueue_t *mq) +{ + jl_gc_chunk_t c = {.cid = GC_empty_chunk}; + do { + c = gc_chunkqueue_pop(mq); + if (c.cid != GC_empty_chunk) { + gc_mark_chunk(ptls, mq, &c); + gc_mark_loop_serial_(ptls, mq); + } + } while (c.cid != GC_empty_chunk); +} + +// Main mark loop. Stack (allocated on the heap) of `jl_value_t *` +// is used to keep track of processed items. Maintaining this stack (instead of +// the native one) avoids stack overflow when marking deep objects and +// makes it easier to implement parallel marking via work-stealing +JL_EXTENSION NOINLINE void gc_mark_loop_serial(jl_ptls_t ptls) +{ + gc_mark_loop_serial_(ptls, &ptls->mark_queue); + gc_drain_own_chunkqueue(ptls, &ptls->mark_queue); +} + +void gc_mark_and_steal(jl_ptls_t ptls) +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_gc_markqueue_t *mq_master = NULL; + int master_tid = jl_atomic_load(&gc_master_tid); + if (master_tid == -1) { + return; + } + mq_master = &gc_all_tls_states[master_tid]->mark_queue; + void *new_obj; + jl_gc_chunk_t c; + pop : { + new_obj = gc_ptr_queue_pop(mq); + if (new_obj != NULL) { + goto mark; + } + c = gc_chunkqueue_pop(mq); + if (c.cid != GC_empty_chunk) { + gc_mark_chunk(ptls, mq, &c); + goto pop; + } + goto steal; + } + mark : { + gc_mark_outrefs(ptls, mq, new_obj, 0); + goto pop; + } + // Note that for the stealing heuristics, we try to + // steal chunks much more aggressively than pointers, + // since we know chunks will likely expand into a lot + // of work for the mark loop + steal : { + // Try to steal chunk from random GC thread + for (int i = 0; i < 4 * jl_n_gcthreads; i++) { + uint32_t v = gc_first_tid + cong(UINT64_MAX, UINT64_MAX, &ptls->rngseed) % jl_n_gcthreads; + jl_gc_markqueue_t *mq2 = &gc_all_tls_states[v]->mark_queue; + c = gc_chunkqueue_steal_from(mq2); + if (c.cid != GC_empty_chunk) { + gc_mark_chunk(ptls, mq, &c); + goto pop; + } + } + // Sequentially walk GC threads to try to steal chunk + for (int i = gc_first_tid; i < gc_first_tid + jl_n_gcthreads; i++) { + jl_gc_markqueue_t *mq2 = &gc_all_tls_states[i]->mark_queue; + c = gc_chunkqueue_steal_from(mq2); + if (c.cid != GC_empty_chunk) { + gc_mark_chunk(ptls, mq, &c); + goto pop; + } + } + // Try to steal chunk from master thread + if (mq_master != NULL) { + c = gc_chunkqueue_steal_from(mq_master); + if (c.cid != GC_empty_chunk) { + gc_mark_chunk(ptls, mq, &c); + goto pop; + } + } + // Try to steal pointer from random GC thread + for (int i = 0; i < 4 * jl_n_gcthreads; i++) { + uint32_t v = gc_first_tid + cong(UINT64_MAX, UINT64_MAX, &ptls->rngseed) % jl_n_gcthreads; + jl_gc_markqueue_t *mq2 = &gc_all_tls_states[v]->mark_queue; + new_obj = gc_ptr_queue_steal_from(mq2); + if (new_obj != NULL) + goto mark; + } + // Sequentially walk GC threads to try to steal
pointer + for (int i = gc_first_tid; i < gc_first_tid + jl_n_gcthreads; i++) { + jl_gc_markqueue_t *mq2 = &gc_all_tls_states[i]->mark_queue; + new_obj = gc_ptr_queue_steal_from(mq2); + if (new_obj != NULL) + goto mark; + } + // Try to steal pointer from master thread + if (mq_master != NULL) { + new_obj = gc_ptr_queue_steal_from(mq_master); + if (new_obj != NULL) + goto mark; + } + } +} + +size_t gc_count_work_in_queue(jl_ptls_t ptls) JL_NOTSAFEPOINT +{ + // assume each chunk is worth 256 units of work and each pointer + // is worth 1 unit of work + size_t work = 256 * (jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.bottom) - + jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.top)); + work += (jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.bottom) - + jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.top)); + return work; +} + +/** + * Correctness argument for the mark-loop termination protocol. + * + * Safety properties: + * - No work items shall be in any thread's queues when `gc_mark_loop_barrier` observes + * that `gc_n_threads_marking` is zero. + * + * - No work item shall be stolen from the master thread (i.e. mutator thread which started + * GC and which helped the `jl_n_gcthreads` - 1 threads to mark) after + * `gc_mark_loop_barrier` observes that `gc_n_threads_marking` is zero. This property is + * necessary because we call `gc_mark_loop_serial` after marking the finalizer list in + * `_jl_gc_collect`, and want to ensure that we have the serial mark-loop semantics there, + * and that no work is stolen from us at that point. + * + * Proof: + * - Suppose the master thread observes that `gc_n_threads_marking` is zero in + * `gc_mark_loop_barrier` and there is a work item left in one thread's queue at that point. + * Since threads try to steal from all threads' queues, this implies that all threads must + * have tried to steal from the queue which still has a work item left, but failed to do so, + * which violates the semantics of Chase-Lev's work-stealing queue. + * + * - Let E1 be the event "master thread writes -1 to gc_master_tid" and E2 be the event + * "master thread observes that `gc_n_threads_marking` is zero". Since we're using + * sequentially consistent atomics, E1 => E2. Now suppose one thread which is spinning in + * `gc_should_mark` tries to enter the mark-loop after E2. In order to do so, it must + * increment `gc_n_threads_marking` to 1 in an event E3, and then read `gc_master_tid` in an + * event E4. Since we're using sequentially consistent atomics, E3 => E4. Since we observed + * `gc_n_threads_marking` as zero in E2, then E2 => E3, and we conclude E1 => E4, so that + * the thread which is spinning in `gc_should_mark` must observe that `gc_master_tid` is -1 + * and therefore won't enter the mark-loop. 
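+ *
+ * To make the interleaving concrete, the same argument can be laid out as a
+ * timeline (an illustrative restatement of the proof above, nothing more;
+ * program order runs top to bottom within each column):
+ *
+ *   master, in gc_mark_loop_barrier:        worker, in gc_should_mark:
+ *   E1: store gc_master_tid = -1
+ *   E2: load gc_n_threads_marking == 0
+ *                                           E3: fetch_add gc_n_threads_marking, +1
+ *                                           E4: load gc_master_tid == -1
+ *
+ * E1 => E2 and E3 => E4 hold by program order; E2 => E3 holds because an
+ * earlier E3 would have made E2's load observe a nonzero count. Chaining
+ * these gives E1 => E4, so the worker reads -1 in E4 and backs out without
+ * entering the mark-loop.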
+ */ + +int gc_should_mark(jl_ptls_t ptls) +{ + int should_mark = 0; + int n_threads_marking = jl_atomic_load(&gc_n_threads_marking); + // fast path + if (n_threads_marking == 0) { + return 0; + } + uv_mutex_lock(&gc_queue_observer_lock); + while (1) { + int tid = jl_atomic_load(&gc_master_tid); + // fast path + if (tid == -1) { + break; + } + n_threads_marking = jl_atomic_load(&gc_n_threads_marking); + // fast path + if (n_threads_marking == 0) { + break; + } + size_t work = gc_count_work_in_queue(gc_all_tls_states[tid]); + for (tid = gc_first_tid; tid < gc_first_tid + jl_n_gcthreads; tid++) { + work += gc_count_work_in_queue(gc_all_tls_states[tid]); + } + // if there is a lot of work left, enter the mark loop + if (work >= 16 * n_threads_marking) { + jl_atomic_fetch_add(&gc_n_threads_marking, 1); + should_mark = 1; + break; + } + jl_cpu_pause(); + } + uv_mutex_unlock(&gc_queue_observer_lock); + return should_mark; +} + +void gc_wake_all_for_marking(jl_ptls_t ptls) +{ + jl_atomic_store(&gc_master_tid, ptls->tid); + uv_mutex_lock(&gc_threads_lock); + jl_atomic_fetch_add(&gc_n_threads_marking, 1); + uv_cond_broadcast(&gc_threads_cond); + uv_mutex_unlock(&gc_threads_lock); +} + +void gc_mark_loop_parallel(jl_ptls_t ptls, int master) +{ + if (master) { + gc_wake_all_for_marking(ptls); + gc_mark_and_steal(ptls); + jl_atomic_fetch_add(&gc_n_threads_marking, -1); + } + while (1) { + int should_mark = gc_should_mark(ptls); + if (!should_mark) { + break; + } + gc_mark_and_steal(ptls); + jl_atomic_fetch_add(&gc_n_threads_marking, -1); + } +} + +void gc_mark_loop(jl_ptls_t ptls) +{ + if (jl_n_gcthreads == 0 || gc_heap_snapshot_enabled) { + gc_mark_loop_serial(ptls); + } + else { + gc_mark_loop_parallel(ptls, 1); + } +} + +void gc_mark_loop_barrier(void) +{ + jl_atomic_store(&gc_master_tid, -1); + while (jl_atomic_load(&gc_n_threads_marking) != 0) { + jl_cpu_pause(); + } +} + +void gc_mark_clean_reclaim_sets(void) +{ + // Clean up `reclaim-sets` + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + arraylist_t *reclaim_set2 = &ptls2->mark_queue.reclaim_set; + ws_array_t *a = NULL; + while ((a = (ws_array_t *)arraylist_pop(reclaim_set2)) != NULL) { + free(a->buffer); + free(a); + } } } -static void jl_gc_queue_thread_local(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - jl_ptls_t ptls2) +static void gc_premark(jl_ptls_t ptls2) +{ + arraylist_t *remset = ptls2->heap.remset; + ptls2->heap.remset = ptls2->heap.last_remset; + ptls2->heap.last_remset = remset; + ptls2->heap.remset->len = 0; + ptls2->heap.remset_nptr = 0; + // avoid counting remembered objects & bindings twice + // in `perm_scanned_bytes` + size_t len = remset->len; + void **items = remset->items; + for (size_t i = 0; i < len; i++) { + jl_value_t *item = (jl_value_t*)items[i]; + objprofile_count(jl_typeof(item), 2, 0); + jl_astaggedvalue(item)->bits.gc = GC_OLD_MARKED; + } + len = ptls2->heap.rem_bindings.len; + items = ptls2->heap.rem_bindings.items; + for (size_t i = 0; i < len; i++) { + void *ptr = items[i]; + jl_astaggedvalue(ptr)->bits.gc = GC_OLD_MARKED; + } +} + +static void gc_queue_thread_local(jl_gc_markqueue_t *mq, jl_ptls_t ptls2) { jl_task_t *task; task = ptls2->root_task; - if (task) { - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "root task"); } task = jl_atomic_load_relaxed(&ptls2->current_task); - if (task) { - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + 
gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "current task"); } task = ptls2->next_task; - if (task) { - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "next task"); } task = ptls2->previous_task; - if (task) { // shouldn't be necessary, but no reason not to - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "previous task"); } if (ptls2->previous_exception) { - gc_mark_queue_obj(gc_cache, sp, ptls2->previous_exception); + gc_try_claim_and_push(mq, ptls2->previous_exception, NULL); gc_heap_snapshot_record_root((jl_value_t*)ptls2->previous_exception, "previous exception"); } } +static void gc_queue_bt_buf(jl_gc_markqueue_t *mq, jl_ptls_t ptls2) +{ + jl_bt_element_t *bt_data = ptls2->bt_data; + size_t bt_size = ptls2->bt_size; + for (size_t i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) { + jl_bt_element_t *bt_entry = bt_data + i; + if (jl_bt_is_native(bt_entry)) + continue; + size_t njlvals = jl_bt_num_jlvals(bt_entry); + for (size_t j = 0; j < njlvals; j++) + gc_try_claim_and_push(mq, jl_bt_entry_jlvalue(bt_entry, j), NULL); + } +} + +static void gc_queue_remset(jl_ptls_t ptls, jl_ptls_t ptls2) +{ + size_t len = ptls2->heap.last_remset->len; + void **items = ptls2->heap.last_remset->items; + for (size_t i = 0; i < len; i++) { + // Objects in the `remset` are already marked, + // so a `gc_try_claim_and_push` wouldn't work here + gc_mark_outrefs(ptls, &ptls->mark_queue, (jl_value_t *)items[i], 1); + } + int n_bnd_refyoung = 0; + len = ptls2->heap.rem_bindings.len; + items = ptls2->heap.rem_bindings.items; + for (size_t i = 0; i < len; i++) { + jl_binding_t *ptr = (jl_binding_t*)items[i]; + uintptr_t bnd_refyoung = 0; + jl_value_t *v = jl_atomic_load_relaxed(&ptr->value); + gc_try_claim_and_push(&ptls->mark_queue, v, &bnd_refyoung); + jl_value_t *ty = jl_atomic_load_relaxed(&ptr->ty); + gc_try_claim_and_push(&ptls->mark_queue, ty, &bnd_refyoung); + jl_value_t *globalref = jl_atomic_load_relaxed(&ptr->globalref); + gc_try_claim_and_push(&ptls->mark_queue, globalref, &bnd_refyoung); + if (bnd_refyoung) { + items[n_bnd_refyoung] = ptr; + n_bnd_refyoung++; + } + } + ptls2->heap.rem_bindings.len = n_bnd_refyoung; +} + extern jl_value_t *cmpswap_names JL_GLOBALLY_ROOTED; // mark the initial root set -static void mark_roots(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) +static void gc_mark_roots(jl_gc_markqueue_t *mq) { // modules - gc_mark_queue_obj(gc_cache, sp, jl_main_module); + gc_try_claim_and_push(mq, jl_main_module, NULL); gc_heap_snapshot_record_root((jl_value_t*)jl_main_module, "main_module"); - // invisible builtin values - if (jl_an_empty_vec_any != NULL) - gc_mark_queue_obj(gc_cache, sp, jl_an_empty_vec_any); - if (jl_module_init_order != NULL) - gc_mark_queue_obj(gc_cache, sp, jl_module_init_order); + gc_try_claim_and_push(mq, jl_an_empty_vec_any, NULL); + gc_try_claim_and_push(mq, jl_module_init_order, NULL); for (size_t i = 0; i < jl_current_modules.size; i += 2) { if (jl_current_modules.table[i + 1] != HT_NOTFOUND) { - gc_mark_queue_obj(gc_cache, sp, jl_current_modules.table[i]); + gc_try_claim_and_push(mq, jl_current_modules.table[i], NULL); gc_heap_snapshot_record_root((jl_value_t*)jl_current_modules.table[i], "top level module"); } } - gc_mark_queue_obj(gc_cache, sp, jl_anytuple_type_type); + gc_try_claim_and_push(mq, 
jl_anytuple_type_type, NULL); for (size_t i = 0; i < N_CALL_CACHE; i++) { jl_typemap_entry_t *v = jl_atomic_load_relaxed(&call_cache[i]); - if (v != NULL) { - gc_mark_queue_obj(gc_cache, sp, v); - } - } - if (jl_all_methods != NULL) { - gc_mark_queue_obj(gc_cache, sp, jl_all_methods); + gc_try_claim_and_push(mq, v, NULL); } - if (_jl_debug_method_invalidation != NULL) - gc_mark_queue_obj(gc_cache, sp, _jl_debug_method_invalidation); - + gc_try_claim_and_push(mq, jl_all_methods, NULL); + gc_try_claim_and_push(mq, _jl_debug_method_invalidation, NULL); // constants - gc_mark_queue_obj(gc_cache, sp, jl_emptytuple_type); - if (cmpswap_names != NULL) - gc_mark_queue_obj(gc_cache, sp, cmpswap_names); - gc_mark_queue_obj(gc_cache, sp, jl_global_roots_table); + gc_try_claim_and_push(mq, jl_emptytuple_type, NULL); + gc_try_claim_and_push(mq, cmpswap_names, NULL); + gc_try_claim_and_push(mq, jl_global_roots_table, NULL); } // find unmarked objects that need to be finalized from the finalizer list "list". @@ -3283,79 +3362,23 @@ JL_DLLEXPORT int64_t jl_gc_sync_total_bytes(int64_t offset) JL_NOTSAFEPOINT return newtb - oldtb; } -JL_DLLEXPORT int64_t jl_gc_live_bytes(void) -{ - return live_bytes; -} - -static void jl_gc_premark(jl_ptls_t ptls2) -{ - arraylist_t *remset = ptls2->heap.remset; - ptls2->heap.remset = ptls2->heap.last_remset; - ptls2->heap.last_remset = remset; - ptls2->heap.remset->len = 0; - ptls2->heap.remset_nptr = 0; - - // avoid counting remembered objects & bindings twice - // in `perm_scanned_bytes` - size_t len = remset->len; - void **items = remset->items; - for (size_t i = 0; i < len; i++) { - jl_value_t *item = (jl_value_t*)items[i]; - objprofile_count(jl_typeof(item), 2, 0); - jl_astaggedvalue(item)->bits.gc = GC_OLD_MARKED; - } - len = ptls2->heap.rem_bindings.len; - items = ptls2->heap.rem_bindings.items; - for (size_t i = 0; i < len; i++) { - void *ptr = items[i]; - jl_astaggedvalue(ptr)->bits.gc = GC_OLD_MARKED; - } -} - -static void jl_gc_queue_remset(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_ptls_t ptls2) +JL_DLLEXPORT int64_t jl_gc_pool_live_bytes(void) { - size_t len = ptls2->heap.last_remset->len; - void **items = ptls2->heap.last_remset->items; - for (size_t i = 0; i < len; i++) - gc_mark_queue_scan_obj(gc_cache, sp, (jl_value_t*)items[i]); - int n_bnd_refyoung = 0; - len = ptls2->heap.rem_bindings.len; - items = ptls2->heap.rem_bindings.items; - for (size_t i = 0; i < len; i++) { - jl_binding_t *ptr = (jl_binding_t*)items[i]; - // A null pointer can happen here when the binding is cleaned up - // as an exception is thrown after it was already queued (#10221) - int bnd_refyoung = 0; - jl_value_t *v = jl_atomic_load_relaxed(&ptr->value); - if (v != NULL && gc_mark_queue_obj(gc_cache, sp, v)) - bnd_refyoung = 1; - jl_value_t *ty = jl_atomic_load_relaxed(&ptr->ty); - if (ty != NULL && gc_mark_queue_obj(gc_cache, sp, ty)) - bnd_refyoung = 1; - jl_value_t *globalref = jl_atomic_load_relaxed(&ptr->globalref); - if (globalref != NULL && gc_mark_queue_obj(gc_cache, sp, globalref)) - bnd_refyoung = 1; - if (bnd_refyoung) { - items[n_bnd_refyoung] = ptr; - n_bnd_refyoung++; + int n_threads = jl_atomic_load_acquire(&jl_n_threads); + jl_ptls_t *all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + int64_t pool_live_bytes = 0; + for (int i = 0; i < n_threads; i++) { + jl_ptls_t ptls2 = all_tls_states[i]; + if (ptls2 != NULL) { + pool_live_bytes += jl_atomic_load_relaxed(&ptls2->gc_num.pool_live_bytes); } } - ptls2->heap.rem_bindings.len = n_bnd_refyoung; + 
return pool_live_bytes; } -static void jl_gc_queue_bt_buf(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_ptls_t ptls2) +JL_DLLEXPORT int64_t jl_gc_live_bytes(void) { - jl_bt_element_t *bt_data = ptls2->bt_data; - size_t bt_size = ptls2->bt_size; - for (size_t i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) { - jl_bt_element_t *bt_entry = bt_data + i; - if (jl_bt_is_native(bt_entry)) - continue; - size_t njlvals = jl_bt_num_jlvals(bt_entry); - for (size_t j = 0; j < njlvals; j++) - gc_mark_queue_obj(gc_cache, sp, jl_bt_entry_jlvalue(bt_entry, j)); - } + return live_bytes; } size_t jl_maxrss(void); @@ -3365,9 +3388,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) { combine_thread_gc_counts(&gc_num); - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; - jl_gc_mark_sp_t sp; - gc_mark_sp_init(gc_cache, &sp); + jl_gc_markqueue_t *mq = &ptls->mark_queue; uint64_t gc_start_time = jl_hrtime(); int64_t last_perm_scanned_bytes = perm_scanned_bytes; @@ -3379,33 +3400,39 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) - jl_gc_premark(ptls2); + gc_premark(ptls2); } assert(gc_n_threads); + int single_threaded = (jl_n_gcthreads == 0 || gc_heap_snapshot_enabled); for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2 == NULL) - continue; - // 2.1. mark every object in the `last_remsets` and `rem_binding` - jl_gc_queue_remset(gc_cache, &sp, ptls2); - // 2.2. mark every thread local root - jl_gc_queue_thread_local(gc_cache, &sp, ptls2); - // 2.3. mark any managed objects in the backtrace buffer - // TODO: treat these as roots for gc_heap_snapshot_record - jl_gc_queue_bt_buf(gc_cache, &sp, ptls2); + jl_gc_markqueue_t *mq2 = mq; + jl_ptls_t ptls_gc_thread = NULL; + if (!single_threaded) { + ptls_gc_thread = gc_all_tls_states[gc_first_tid + t_i % jl_n_gcthreads]; + mq2 = &ptls_gc_thread->mark_queue; + } + if (ptls2 != NULL) { + // 2.1. mark every thread local root + gc_queue_thread_local(mq2, ptls2); + // 2.2. mark any managed objects in the backtrace buffer + // TODO: treat these as roots for gc_heap_snapshot_record + gc_queue_bt_buf(mq2, ptls2); + // 2.3. mark every object in the `last_remsets` and `rem_binding` + gc_queue_remset(single_threaded ? ptls : ptls_gc_thread, ptls2); + } } // 3. 
walk roots - mark_roots(gc_cache, &sp); + gc_mark_roots(mq); if (gc_cblist_root_scanner) { - export_gc_state(ptls, &sp); gc_invoke_callbacks(jl_gc_cb_root_scanner_t, gc_cblist_root_scanner, (collection)); - import_gc_state(ptls, &sp); } - gc_mark_loop(ptls, sp); - gc_mark_sp_init(gc_cache, &sp); + gc_mark_loop(ptls); + gc_mark_loop_barrier(); + gc_mark_clean_reclaim_sets(); gc_num.since_sweep += gc_num.allocd; JL_PROBE_GC_MARK_END(scanned_bytes, perm_scanned_bytes); gc_settime_premark_end(); @@ -3426,9 +3453,8 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - sweep_finalizer_list(&ptls2->finalizers); + if (ptls2 != NULL) + sweep_finalizer_list(&ptls2->finalizers); } if (prev_sweep_full) { sweep_finalizer_list(&finalizer_list_marked); @@ -3437,15 +3463,13 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); + if (ptls2 != NULL) + gc_mark_finlist(mq, &ptls2->finalizers, 0); } - gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, orig_marked_len); + gc_mark_finlist(mq, &finalizer_list_marked, orig_marked_len); // "Flush" the mark stack before flipping the reset_age bit // so that the objects are not incorrectly reset. - gc_mark_loop(ptls, sp); - gc_mark_sp_init(gc_cache, &sp); + gc_mark_loop_serial(ptls); // Conservative marking relies on age to tell allocated objects // and freelist entries apart. mark_reset_age = !jl_gc_conservative_gc_support_enabled(); @@ -3453,9 +3477,10 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // `to_finalize` list. These objects are only reachable from this list // and should not be referenced by any old objects so this won't break // the GC invariant. - gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - gc_mark_loop(ptls, sp); + gc_mark_finlist(mq, &to_finalize, 0); + gc_mark_loop_serial(ptls); mark_reset_age = 0; + gc_settime_postmark_end(); // Flush everything in mark cache @@ -3480,9 +3505,8 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - nptr += ptls2->heap.remset_nptr; + if (ptls2 != NULL) + nptr += ptls2->heap.remset_nptr; } // many pointers in the intergen frontier => "quick" mark is not quick @@ -3503,7 +3527,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) size_t maxmem = 0; #ifdef _P64 // on a big memory machine, increase max_collect_interval to totalmem / nthreads / 2 - maxmem = total_mem / gc_n_threads / 2; + maxmem = total_mem / (gc_n_threads - jl_n_gcthreads) / 2; #endif if (maxmem < max_collect_interval) maxmem = max_collect_interval; @@ -3535,16 +3559,17 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) last_long_collect_interval = gc_num.interval; } scanned_bytes = 0; - // 5. start sweeping + // 6. 
start sweeping uint64_t start_sweep_time = jl_hrtime(); JL_PROBE_GC_SWEEP_BEGIN(sweep_full); + current_sweep_full = sweep_full; sweep_weak_refs(); sweep_stack_pools(); gc_sweep_foreign_objs(); gc_sweep_other(ptls, sweep_full); gc_scrub(); gc_verify_tags(); - gc_sweep_pool(sweep_full); + gc_sweep_pool(); if (sweep_full) gc_sweep_perm_alloc(); JL_PROBE_GC_SWEEP_END(); @@ -3554,9 +3579,15 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) uint64_t sweep_time = gc_end_time - start_sweep_time; gc_num.total_sweep_time += sweep_time; gc_num.sweep_time = sweep_time; + if (sweep_full) { + gc_num.last_full_sweep = gc_end_time; + } + else { + gc_num.last_incremental_sweep = gc_end_time; + } // sweeping is over - // 6. if it is a quick sweep, put back the remembered objects in queued state + // 7. if it is a quick sweep, put back the remembered objects in queued state // so that we don't trigger the barrier again on them. assert(gc_n_threads); for (int t_i = 0; t_i < gc_n_threads; t_i++) { @@ -3590,7 +3621,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) } #endif - _report_gc_finished(pause, gc_num.freed, sweep_full, recollect); gc_final_pause_end(gc_start_time, gc_end_time); @@ -3608,19 +3638,19 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (collection == JL_GC_AUTO) { //If we aren't freeing enough or are seeing lots and lots of pointers let it increase faster - if(!not_freed_enough || large_frontier) { + if (!not_freed_enough || large_frontier) { int64_t tot = 2 * (live_bytes + gc_num.since_sweep) / 3; if (gc_num.interval > tot) { gc_num.interval = tot; last_long_collect_interval = tot; } + } // If the current interval is larger than half the live data decrease the interval - } else { + else { int64_t half = (live_bytes / 2); if (gc_num.interval > half) gc_num.interval = half; } - // But never go below default if (gc_num.interval < default_collect_interval) gc_num.interval = default_collect_interval; } @@ -3633,6 +3663,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) else { // We can't stay under our goal so let's go back to // the minimum interval and hope things get better + under_memory_pressure = 1; gc_num.interval = default_collect_interval; } } @@ -3704,6 +3735,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) if (duration > gc_num.max_time_to_safepoint) gc_num.max_time_to_safepoint = duration; gc_num.time_to_safepoint = duration; + gc_num.total_time_to_safepoint += duration; gc_invoke_callbacks(jl_gc_cb_pre_gc_t, gc_cblist_pre_gc, (collection)); @@ -3735,22 +3767,27 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) gc_invoke_callbacks(jl_gc_cb_post_gc_t, gc_cblist_post_gc, (collection)); + + if (under_memory_pressure) { + gc_invoke_callbacks(jl_gc_cb_notify_gc_pressure_t, + gc_cblist_notify_gc_pressure, ()); + } + under_memory_pressure = 0; #ifdef _OS_WINDOWS_ SetLastError(last_error); #endif errno = last_errno; } -void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) +void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq) { - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; assert(gc_n_threads); for (size_t i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) - jl_gc_queue_thread_local(gc_cache, sp, ptls2); + if (ptls2 != NULL) + gc_queue_thread_local(mq, ptls2); } - mark_roots(gc_cache, sp); + gc_mark_roots(mq); } // allocator entry points @@ -3770,8 +3807,10 @@ void 
jl_init_thread_heap(jl_ptls_t ptls) p[i].freelist = NULL; p[i].newpages = NULL; } - arraylist_new(&heap->weak_refs, 0); - arraylist_new(&heap->live_tasks, 0); + small_arraylist_new(&heap->weak_refs, 0); + small_arraylist_new(&heap->live_tasks, 0); + for (int i = 0; i < JL_N_STACK_POOLS; i++) + small_arraylist_new(&heap->free_stacks[i], 0); heap->mallocarrays = NULL; heap->mafreelist = NULL; heap->big_objects = NULL; @@ -3787,10 +3826,20 @@ void jl_init_thread_heap(jl_ptls_t ptls) gc_cache->perm_scanned_bytes = 0; gc_cache->scanned_bytes = 0; gc_cache->nbig_obj = 0; - size_t init_size = 1024; - gc_cache->pc_stack = (void**)malloc_s(init_size * sizeof(void*)); - gc_cache->pc_stack_end = gc_cache->pc_stack + init_size; - gc_cache->data_stack = (jl_gc_mark_data_t *)malloc_s(init_size * sizeof(jl_gc_mark_data_t)); + + // Initialize GC mark-queue + jl_gc_markqueue_t *mq = &ptls->mark_queue; + ws_queue_t *cq = &mq->chunk_queue; + ws_array_t *wsa = create_ws_array(GC_CHUNK_QUEUE_INIT_SIZE, sizeof(jl_gc_chunk_t)); + jl_atomic_store_relaxed(&cq->top, 0); + jl_atomic_store_relaxed(&cq->bottom, 0); + jl_atomic_store_relaxed(&cq->array, wsa); + ws_queue_t *q = &mq->ptr_queue; + ws_array_t *wsa2 = create_ws_array(GC_PTR_QUEUE_INIT_SIZE, sizeof(jl_value_t *)); + jl_atomic_store_relaxed(&q->top, 0); + jl_atomic_store_relaxed(&q->bottom, 0); + jl_atomic_store_relaxed(&q->array, wsa2); + arraylist_new(&mq->reclaim_set, 32); memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); @@ -3801,8 +3850,12 @@ void jl_gc_init(void) { JL_MUTEX_INIT(&heapsnapshot_lock); JL_MUTEX_INIT(&finalizers_lock); + uv_mutex_init(&page_profile_lock); uv_mutex_init(&gc_cache_lock); uv_mutex_init(&gc_perm_lock); + uv_mutex_init(&gc_threads_lock); + uv_cond_init(&gc_threads_cond); + uv_mutex_init(&gc_queue_observer_lock); jl_gc_init_page(); jl_gc_debug_init(); @@ -3834,13 +3887,11 @@ void jl_gc_init(void) #endif if (jl_options.heap_size_hint) jl_gc_set_max_memory(jl_options.heap_size_hint); - - jl_gc_mark_sp_t sp = {NULL, NULL, NULL, NULL}; - gc_mark_loop(NULL, sp); t_start = jl_hrtime(); } -void jl_gc_set_max_memory(uint64_t max_mem) { +JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem) +{ if (max_mem > 0 && max_mem < (uint64_t)1 << (sizeof(memsize_t) * 8 - 1)) { max_total_memory = max_mem; @@ -3864,7 +3915,7 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) { jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); jl_atomic_store_relaxed(&ptls->gc_num.allocd, @@ -3879,7 +3930,7 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) { jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); jl_atomic_store_relaxed(&ptls->gc_num.allocd, @@ -3895,7 +3946,7 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; free(p); - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; jl_atomic_store_relaxed(&ptls->gc_num.freed, jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz); @@ -3908,7 +3959,7 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size { jl_gcframe_t **pgcstack = jl_get_pgcstack(); 
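/* The counted allocator entry points that follow (jl_gc_counted_malloc and
   friends) share one pattern: GC bookkeeping happens only when the caller is
   managed code, i.e. a GC frame stack exists (pgcstack != NULL) and the task
   has a nonzero world_age. In that case the allocating variants first give the
   collector a chance to run via maybe_collect() and then charge the size to
   this thread's gc_num counters; the free variant only adjusts the counters.
   Outside managed code they reduce to the plain C allocator. */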
jl_task_t *ct = jl_current_task; - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); if (sz < old) @@ -4047,7 +4098,10 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds SetLastError(last_error); #endif errno = last_errno; - maybe_record_alloc_to_profile((jl_value_t*)b, sz, jl_gc_unknown_type_tag); + // gc_managed_realloc_ is currently used exclusively for resizing array buffers. + if (allocsz > oldsz) { + maybe_record_alloc_to_profile((jl_value_t*)b, allocsz - oldsz, (jl_datatype_t*)jl_buff_tag); + } return b; } @@ -4088,7 +4142,6 @@ jl_value_t *jl_gc_realloc_string(jl_value_t *s, size_t sz) // old pointer. bigval_t *newbig = (bigval_t*)gc_managed_realloc_(ptls, hdr, allocsz, oldsz, 1, s, 0); newbig->sz = allocsz; - newbig->age = 0; gc_big_object_link(newbig, &ptls->heap.big_objects); jl_value_t *snew = jl_valueof(&newbig->header); *(size_t*)snew = sz; @@ -4164,6 +4217,9 @@ void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offs errno = last_errno; if (__unlikely(pool == MAP_FAILED)) return NULL; +#ifdef MADV_NOHUGEPAGE + madvise(pool, GC_PERM_POOL_SIZE, MADV_NOHUGEPAGE); +#endif #endif gc_perm_pool = (uintptr_t)pool; gc_perm_end = gc_perm_pool + GC_PERM_POOL_SIZE; @@ -4259,7 +4315,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) { p = (char *) p - 1; jl_gc_pagemeta_t *meta = page_metadata(p); - if (meta && meta->ages) { + if (meta) { char *page = gc_page_data(p); // offset within page. size_t off = (char *)p - page; @@ -4285,7 +4341,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) jl_gc_pool_t *pool = gc_all_tls_states[meta->thread_n]->heap.norm_pools + meta->pool_n; - if (meta->fl_begin_offset == (uint16_t) -1) { + if (meta->fl_begin_offset == UINT16_MAX) { // case 2: this is a page on the newpages list jl_taggedvalue_t *newpages = pool->newpages; // Check if the page is being allocated from via newpages @@ -4294,7 +4350,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) char *data = gc_page_data(newpages); if (data != meta->data) { // Pages on newpages form a linked list where only the - // first one is allocated from (see reset_page()). + // first one is allocated from (see gc_reset_page()). // All other pages are empty. return NULL; } @@ -4322,7 +4378,6 @@ JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) // entries and 1 for live objects. The above subcases arise // because allocating a cell will not update the age bit, so we // need extra logic for pages that have been allocated from. - unsigned obj_id = (off - off2) / osize; // We now distinguish between the second and third subcase. // Freelist entries are consumed in ascending order. Anything // before the freelist pointer was either live during the last @@ -4330,11 +4385,9 @@ JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) if (gc_page_data(cell) == gc_page_data(pool->freelist) && (char *)cell < (char *)pool->freelist) goto valid_object; - // We know now that the age bit reflects liveness status during - // the last sweep and that the cell has not been reused since. - if (!(meta->ages[obj_id / 8] & (1 << (obj_id % 8)))) { - return NULL; - } + // already skipped marked or old objects above, so here + // the age bits are 0, thus the object is on the freelist + return NULL; // Not a freelist entry, therefore a valid object. 
valid_object: // We have to treat objects with type `jl_buff_tag` differently, diff --git a/src/gc.h b/src/gc.h index 7b02df69abbc1..ff52269b73af9 100644 --- a/src/gc.h +++ b/src/gc.h @@ -31,8 +31,12 @@ extern "C" { #endif +#ifdef GC_SMALL_PAGE +#define GC_PAGE_LG2 12 // log2(size of a page) +#else #define GC_PAGE_LG2 14 // log2(size of a page) -#define GC_PAGE_SZ (1 << GC_PAGE_LG2) // 16k +#endif +#define GC_PAGE_SZ (1 << GC_PAGE_LG2) #define GC_PAGE_OFFSET (JL_HEAP_ALIGNMENT - (sizeof(jl_taggedvalue_t) % JL_HEAP_ALIGNMENT)) #define jl_malloc_tag ((void*)0xdeadaa01) @@ -42,7 +46,6 @@ extern "C" { typedef struct { uint64_t num; uint64_t next; - uint64_t min; uint64_t interv; uint64_t max; @@ -77,179 +80,49 @@ typedef struct { uint64_t max_memory; uint64_t time_to_safepoint; uint64_t max_time_to_safepoint; + uint64_t total_time_to_safepoint; uint64_t sweep_time; uint64_t mark_time; uint64_t total_sweep_time; uint64_t total_mark_time; + uint64_t last_full_sweep; + uint64_t last_incremental_sweep; } jl_gc_num_t; -enum { - GC_MARK_L_marked_obj, - GC_MARK_L_scan_only, - GC_MARK_L_finlist, - GC_MARK_L_objarray, - GC_MARK_L_array8, - GC_MARK_L_array16, - GC_MARK_L_obj8, - GC_MARK_L_obj16, - GC_MARK_L_obj32, - GC_MARK_L_stack, - GC_MARK_L_excstack, - GC_MARK_L_module_binding, - _GC_MARK_L_MAX -}; - -// The following structs (`gc_mark_*_t`) contain iterator state used for the -// scanning of various object types. -// -// The `nptr` member records the number of pointers slots referenced by -// an object to be used in the full collection heuristics as well as whether the object -// references young objects. -// `nptr >> 2` is the number of pointers fields referenced by the object. -// The lowest bit of `nptr` is set if the object references young object. -// The 2nd lowest bit of `nptr` is the GC old bits of the object after marking. -// A `0x3` in the low bits means that the object needs to be in the remset. - -// An generic object that's marked and needs to be scanned -// The metadata might need update too (depend on the PC) -typedef struct { - jl_value_t *obj; // The object - uintptr_t tag; // The tag with the GC bits masked out - uint8_t bits; // The GC bits after tagging (`bits & 1 == 1`) -} gc_mark_marked_obj_t; - -// An object array. This can come from an array, svec, or the using array or a module -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - jl_value_t **begin; // The first slot to be scanned. - jl_value_t **end; // The end address (after the last slot to be scanned) - uint32_t step; // Number of pointers to jump between marks - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_objarray_t; - -// A normal object with 8bits field descriptors -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - uint8_t *begin; // Current field descriptor. - uint8_t *end; // End of field descriptor. - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_obj8_t; - -// A normal object with 16bits field descriptors -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - uint16_t *begin; // Current field descriptor. - uint16_t *end; // End of field descriptor. - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_obj16_t; - -// A normal object with 32bits field descriptors -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - uint32_t *begin; // Current field descriptor. - uint32_t *end; // End of field descriptor. 
- uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_obj32_t; - -typedef struct { - jl_value_t **begin; // The first slot to be scanned. - jl_value_t **end; // The end address (after the last slot to be scanned) - uint8_t *rebegin; - gc_mark_obj8_t elem; -} gc_mark_array8_t; - -typedef struct { - jl_value_t **begin; // The first slot to be scanned. - jl_value_t **end; // The end address (after the last slot to be scanned) - uint16_t *rebegin; - gc_mark_obj16_t elem; -} gc_mark_array16_t; - -// Stack frame -typedef struct { - jl_gcframe_t *s; // The current stack frame - uint32_t i; // The current slot index in the frame - uint32_t nroots; // `nroots` fields in the frame - // Parameters to mark the copy_stack range. - uintptr_t offset; - uintptr_t lb; - uintptr_t ub; -} gc_mark_stackframe_t; - -// Exception stack data -typedef struct { - jl_excstack_t *s; // Stack of exceptions - size_t itr; // Iterator into exception stack - size_t bt_index; // Current backtrace buffer entry index - size_t jlval_index; // Index into GC managed values for current bt entry -} gc_mark_excstack_t; - -// Module bindings. This is also the beginning of module scanning. -// The loop will start marking other references in a module after the bindings are marked -typedef struct { - jl_module_t *parent; // The parent module to trigger write barrier on. - jl_binding_t **begin; // The first slot to be scanned. - jl_binding_t **end; // The end address (after the last slot to be scanned) - uintptr_t nptr; // See notes about `nptr` above. - uint8_t bits; // GC bits of the module (the bits to mark the binding buffer with) -} gc_mark_binding_t; - -// Finalizer (or object) list -typedef struct { - jl_value_t **begin; - jl_value_t **end; -} gc_mark_finlist_t; - -// This is used to determine the max size of the data objects on the data stack. -// We'll use this size to determine the size of the data stack corresponding to a -// PC stack size. Since the data objects are not all of the same size, we'll waste -// some memory on the data stack this way but that size is unlikely going to be significant. -union _jl_gc_mark_data { - gc_mark_marked_obj_t marked; - gc_mark_objarray_t objarray; - gc_mark_array8_t array8; - gc_mark_array16_t array16; - gc_mark_obj8_t obj8; - gc_mark_obj16_t obj16; - gc_mark_obj32_t obj32; - gc_mark_stackframe_t stackframe; - gc_mark_excstack_t excstackframe; - gc_mark_binding_t binding; - gc_mark_finlist_t finlist; -}; - -// Pop a data struct from the mark data stack (i.e. decrease the stack pointer) -// This should be used after dispatch and therefore the pc stack pointer is already popped from -// the stack. -STATIC_INLINE void *gc_pop_markdata_(jl_gc_mark_sp_t *sp, size_t size) -{ - jl_gc_mark_data_t *data = (jl_gc_mark_data_t *)(((char*)sp->data) - size); - sp->data = data; - return data; -} -#define gc_pop_markdata(sp, type) ((type*)gc_pop_markdata_(sp, sizeof(type))) - -// Re-push a frame to the mark stack (both data and pc) -// The data and pc are expected to be on the stack (or updated in place) already. -// Mainly useful to pause the current scanning in order to scan an new object. 
-STATIC_INLINE void *gc_repush_markdata_(jl_gc_mark_sp_t *sp, size_t size) JL_NOTSAFEPOINT -{ - jl_gc_mark_data_t *data = sp->data; - sp->pc++; - sp->data = (jl_gc_mark_data_t *)(((char*)sp->data) + size); - return data; -} -#define gc_repush_markdata(sp, type) ((type*)gc_repush_markdata_(sp, sizeof(type))) +// Array chunks (work items representing suffixes of +// large arrays of pointers left to be marked) + +typedef enum { + GC_empty_chunk = 0, // for sentinel representing no items left in chunk queue + GC_objary_chunk, // for chunk of object array + GC_ary8_chunk, // for chunk of array with 8 bit field descriptors + GC_ary16_chunk, // for chunk of array with 16 bit field descriptors + GC_finlist_chunk, // for chunk of finalizer list +} gc_chunk_id_t; + +typedef struct _jl_gc_chunk_t { + gc_chunk_id_t cid; + struct _jl_value_t *parent; // array owner + struct _jl_value_t **begin; // pointer to first element that needs scanning + struct _jl_value_t **end; // pointer to last element that needs scanning + void *elem_begin; // used to scan pointers within objects when marking `ary8` or `ary16` + void *elem_end; // used to scan pointers within objects when marking `ary8` or `ary16` + uint32_t step; // step-size used when marking objarray + uintptr_t nptr; // (`nptr` & 0x1) if array has young element and (`nptr` & 0x2) if array owner is old +} jl_gc_chunk_t; + +#define GC_CHUNK_BATCH_SIZE (1 << 16) // maximum number of references that can be processed + // without creating a chunk + +#define GC_PTR_QUEUE_INIT_SIZE (1 << 18) // initial size of queue of `jl_value_t *` +#define GC_CHUNK_QUEUE_INIT_SIZE (1 << 14) // initial size of chunk-queue // layout for big (>2k) objects JL_EXTENSION typedef struct _bigval_t { struct _bigval_t *next; struct _bigval_t **prev; // pointer to the next field of the prev entry - union { - size_t sz; - uintptr_t age : 2; - }; + size_t sz; #ifdef _P64 // Add padding so that the value is 64-byte aligned // (8 pointers of 8 bytes each) - (4 other pointers in struct) void *_padding[8 - 4]; @@ -275,7 +148,10 @@ typedef struct _mallocarray_t { } mallocarray_t; // pool page metadata -typedef struct { +typedef struct _jl_gc_pagemeta_t { + // next metadata structure in per-thread list + // or in one of the `jl_gc_global_page_pool_t` + struct _jl_gc_pagemeta_t *next; // index of pool that owns this page uint8_t pool_n; // Whether any cell in the page is marked @@ -302,36 +178,74 @@ typedef struct { // number of free objects in this page. // invalid if pool that owns this page is allocating objects from this page. uint16_t nfree; - uint16_t osize; // size of each object in this page + uint16_t osize; // size of each object in this page uint16_t fl_begin_offset; // offset of first free object in this page uint16_t fl_end_offset; // offset of last free object in this page uint16_t thread_n; // thread id of the heap that owns this page char *data; - uint8_t *ages; } jl_gc_pagemeta_t; -// Page layout: -// Newpage freelist: sizeof(void*) -// Padding: GC_PAGE_OFFSET - sizeof(void*) -// Blocks: osize * n -// Tag: sizeof(jl_taggedvalue_t) -// Data: <= osize - sizeof(jl_taggedvalue_t) +extern jl_gc_page_stack_t global_page_pool_lazily_freed; +extern jl_gc_page_stack_t global_page_pool_clean; +extern jl_gc_page_stack_t global_page_pool_freed; + +// Lock-free stack implementation taken +// from Herlihy's "The Art of Multiprocessor Programming" +// XXX: this is not a general-purpose lock-free stack. 
We can +// get away with just using a CAS and not implementing some ABA +// prevention mechanism since once a node is popped from the +// `jl_gc_page_stack_t`, it may only be pushed back to them +// in the sweeping phase, which also doesn't push a node into the +// same stack after it's popped + +STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT +{ + while (1) { + jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom); + elt->next = old_back; + if (jl_atomic_cmpswap(&pool->bottom, &old_back, elt)) { + break; + } + jl_cpu_pause(); + } +} + +STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT +{ + while (1) { + jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom); + if (old_back == NULL) { + return NULL; + } + if (jl_atomic_cmpswap(&pool->bottom, &old_back, old_back->next)) { + return old_back; + } + jl_cpu_pause(); + } +} + +typedef struct { + _Atomic(size_t) n_freed_objs; + _Atomic(size_t) n_pages_allocd; +} gc_fragmentation_stat_t; -// Memory map: -// The complete address space is divided up into a multi-level page table. -// The three levels have similar but slightly different structures: -// - pagetable0_t: the bottom/leaf level (covers the contiguous addresses) -// - pagetable1_t: the middle level -// - pagetable2_t: the top/leaf level (covers the entire virtual address space) -// Corresponding to these similar structures is a large amount of repetitive -// code that is nearly the same but not identical. It could be made less -// repetitive with C macros, but only at the cost of debuggability. The specialized -// structure of this representation allows us to partially unroll and optimize -// various conditions at each level. - -// The following constants define the branching factors at each level. -// The constants and GC_PAGE_LG2 must therefore sum to sizeof(void*). -// They should all be multiples of 32 (sizeof(uint32_t)) except that REGION2_PG_COUNT may also be 1. 
+#ifdef GC_SMALL_PAGE +#ifdef _P64 +#define REGION0_PG_COUNT (1 << 16) +#define REGION1_PG_COUNT (1 << 18) +#define REGION2_PG_COUNT (1 << 18) +#define REGION0_INDEX(p) (((uintptr_t)(p) >> 12) & 0xFFFF) // shift by GC_PAGE_LG2 +#define REGION1_INDEX(p) (((uintptr_t)(p) >> 28) & 0x3FFFF) +#define REGION_INDEX(p) (((uintptr_t)(p) >> 46) & 0x3FFFF) +#else +#define REGION0_PG_COUNT (1 << 10) +#define REGION1_PG_COUNT (1 << 10) +#define REGION2_PG_COUNT (1 << 0) +#define REGION0_INDEX(p) (((uintptr_t)(p) >> 12) & 0x3FF) // shift by GC_PAGE_LG2 +#define REGION1_INDEX(p) (((uintptr_t)(p) >> 22) & 0x3FF) +#define REGION_INDEX(p) (0) +#endif +#else #ifdef _P64 #define REGION0_PG_COUNT (1 << 16) #define REGION1_PG_COUNT (1 << 16) @@ -347,39 +261,122 @@ typedef struct { #define REGION1_INDEX(p) (((uintptr_t)(p) >> 22) & 0x3FF) #define REGION_INDEX(p) (0) #endif +#endif // define the representation of the levels of the page-table (0 to 2) typedef struct { - jl_gc_pagemeta_t *meta[REGION0_PG_COUNT]; - uint32_t allocmap[REGION0_PG_COUNT / 32]; - uint32_t freemap[REGION0_PG_COUNT / 32]; - // store a lower bound of the first free page in each region - int lb; - // an upper bound of the last non-free page - int ub; + uint8_t meta[REGION0_PG_COUNT]; } pagetable0_t; typedef struct { pagetable0_t *meta0[REGION1_PG_COUNT]; - uint32_t allocmap0[REGION1_PG_COUNT / 32]; - uint32_t freemap0[REGION1_PG_COUNT / 32]; - // store a lower bound of the first free page in each region - int lb; - // an upper bound of the last non-free page - int ub; } pagetable1_t; typedef struct { pagetable1_t *meta1[REGION2_PG_COUNT]; - uint32_t allocmap1[(REGION2_PG_COUNT + 31) / 32]; - uint32_t freemap1[(REGION2_PG_COUNT + 31) / 32]; - // store a lower bound of the first free page in each region - int lb; - // an upper bound of the last non-free page - int ub; } pagetable_t; -#ifdef __clang_gcanalyzer__ +#define GC_PAGE_UNMAPPED 0 +#define GC_PAGE_ALLOCATED 1 +#define GC_PAGE_LAZILY_FREED 2 +#define GC_PAGE_FREED 3 + +extern pagetable_t alloc_map; + +STATIC_INLINE uint8_t gc_alloc_map_is_set(char *_data) JL_NOTSAFEPOINT +{ + uintptr_t data = ((uintptr_t)_data); + unsigned i; + i = REGION_INDEX(data); + pagetable1_t *r1 = alloc_map.meta1[i]; + if (r1 == NULL) + return 0; + i = REGION1_INDEX(data); + pagetable0_t *r0 = r1->meta0[i]; + if (r0 == NULL) + return 0; + i = REGION0_INDEX(data); + return (r0->meta[i] == GC_PAGE_ALLOCATED); +} + +STATIC_INLINE void gc_alloc_map_set(char *_data, uint8_t v) JL_NOTSAFEPOINT +{ + uintptr_t data = ((uintptr_t)_data); + unsigned i; + i = REGION_INDEX(data); + pagetable1_t *r1 = alloc_map.meta1[i]; + assert(r1 != NULL); + i = REGION1_INDEX(data); + pagetable0_t *r0 = r1->meta0[i]; + assert(r0 != NULL); + i = REGION0_INDEX(data); + r0->meta[i] = v; +} + +STATIC_INLINE void gc_alloc_map_maybe_create(char *_data) JL_NOTSAFEPOINT +{ + uintptr_t data = ((uintptr_t)_data); + unsigned i; + i = REGION_INDEX(data); + pagetable1_t *r1 = alloc_map.meta1[i]; + if (r1 == NULL) { + r1 = (pagetable1_t*)calloc_s(sizeof(pagetable1_t)); + alloc_map.meta1[i] = r1; + } + i = REGION1_INDEX(data); + pagetable0_t *r0 = r1->meta0[i]; + if (r0 == NULL) { + r0 = (pagetable0_t*)calloc_s(sizeof(pagetable0_t)); + r1->meta0[i] = r0; + } +} + +// Page layout: +// Metadata pointer: sizeof(jl_gc_pagemeta_t*) +// Padding: GC_PAGE_OFFSET - sizeof(jl_gc_pagemeta_t*) +// Blocks: osize * n +// Tag: sizeof(jl_taggedvalue_t) +// Data: <= osize - sizeof(jl_taggedvalue_t) + +STATIC_INLINE char *gc_page_data(void *x) JL_NOTSAFEPOINT +{ + return 
(char*)(((uintptr_t)x >> GC_PAGE_LG2) << GC_PAGE_LG2); +} + +STATIC_INLINE jl_gc_pagemeta_t *page_metadata_unsafe(void *_data) JL_NOTSAFEPOINT +{ + return *(jl_gc_pagemeta_t**)(gc_page_data(_data)); +} + +STATIC_INLINE jl_gc_pagemeta_t *page_metadata(void *_data) JL_NOTSAFEPOINT +{ + if (!gc_alloc_map_is_set((char*)_data)) { + return NULL; + } + return page_metadata_unsafe(_data); +} + +STATIC_INLINE void set_page_metadata(jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT +{ + *(jl_gc_pagemeta_t**)(pg->data) = pg; +} + +STATIC_INLINE void push_page_metadata_back(jl_gc_pagemeta_t **ppg, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT +{ + elt->next = *ppg; + *ppg = elt; +} + +STATIC_INLINE jl_gc_pagemeta_t *pop_page_metadata_back(jl_gc_pagemeta_t **ppg) JL_NOTSAFEPOINT +{ + jl_gc_pagemeta_t *v = *ppg; + if (*ppg != NULL) { + *ppg = (*ppg)->next; + } + return v; +} + +#ifdef __clang_gcanalyzer__ /* clang may not have __builtin_ffs */ unsigned ffs_u32(uint32_t bitvec) JL_NOTSAFEPOINT; #else STATIC_INLINE unsigned ffs_u32(uint32_t bitvec) @@ -389,11 +386,11 @@ STATIC_INLINE unsigned ffs_u32(uint32_t bitvec) #endif extern jl_gc_num_t gc_num; -extern pagetable_t memory_map; extern bigval_t *big_objects_marked; extern arraylist_t finalizer_list_marked; extern arraylist_t to_finalize; -extern int64_t lazy_freed_pages; +extern int64_t buffered_pages; +extern int gc_first_tid; extern int gc_n_threads; extern jl_ptls_t* gc_all_tls_states; @@ -402,12 +399,6 @@ STATIC_INLINE bigval_t *bigval_header(jl_taggedvalue_t *o) JL_NOTSAFEPOINT return container_of(o, bigval_t, header); } -// round an address inside a gcpage's data to its beginning -STATIC_INLINE char *gc_page_data(void *x) JL_NOTSAFEPOINT -{ - return (char*)(((uintptr_t)x >> GC_PAGE_LG2) << GC_PAGE_LG2); -} - STATIC_INLINE jl_taggedvalue_t *page_pfl_beg(jl_gc_pagemeta_t *p) JL_NOTSAFEPOINT { return (jl_taggedvalue_t*)(p->data + p->fl_begin_offset); @@ -445,52 +436,6 @@ STATIC_INLINE void *gc_ptr_clear_tag(void *v, uintptr_t mask) JL_NOTSAFEPOINT NOINLINE uintptr_t gc_get_stack_ptr(void); -STATIC_INLINE jl_gc_pagemeta_t *page_metadata(void *_data) JL_NOTSAFEPOINT -{ - uintptr_t data = ((uintptr_t)_data); - unsigned i; - i = REGION_INDEX(data); - pagetable1_t *r1 = memory_map.meta1[i]; - if (!r1) - return NULL; - i = REGION1_INDEX(data); - pagetable0_t *r0 = r1->meta0[i]; - if (!r0) - return NULL; - i = REGION0_INDEX(data); - return r0->meta[i]; -} - -struct jl_gc_metadata_ext { - pagetable1_t *pagetable1; - pagetable0_t *pagetable0; - jl_gc_pagemeta_t *meta; - unsigned pagetable_i32, pagetable_i; - unsigned pagetable1_i32, pagetable1_i; - unsigned pagetable0_i32, pagetable0_i; -}; - -STATIC_INLINE struct jl_gc_metadata_ext page_metadata_ext(void *_data) JL_NOTSAFEPOINT -{ - uintptr_t data = (uintptr_t)_data; - struct jl_gc_metadata_ext info; - unsigned i; - i = REGION_INDEX(data); - info.pagetable_i = i % 32; - info.pagetable_i32 = i / 32; - info.pagetable1 = memory_map.meta1[i]; - i = REGION1_INDEX(data); - info.pagetable1_i = i % 32; - info.pagetable1_i32 = i / 32; - info.pagetable0 = info.pagetable1->meta0[i]; - i = REGION0_INDEX(data); - info.pagetable0_i = i % 32; - info.pagetable0_i32 = i / 32; - info.meta = info.pagetable0->meta[i]; - assert(info.meta); - return info; -} - STATIC_INLINE void gc_big_object_unlink(const bigval_t *hdr) JL_NOTSAFEPOINT { *hdr->prev = hdr->next; @@ -508,28 +453,26 @@ STATIC_INLINE void gc_big_object_link(bigval_t *hdr, bigval_t **list) JL_NOTSAFE *list = hdr; } -STATIC_INLINE void gc_mark_sp_init(jl_gc_mark_cache_t *gc_cache, 
jl_gc_mark_sp_t *sp) -{ - sp->pc = gc_cache->pc_stack; - sp->data = gc_cache->data_stack; - sp->pc_start = gc_cache->pc_stack; - sp->pc_end = gc_cache->pc_stack_end; -} - -void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_mark_sp_t *sp); -void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - arraylist_t *list, size_t start); -void gc_mark_loop(jl_ptls_t ptls, jl_gc_mark_sp_t sp); +extern uv_mutex_t gc_threads_lock; +extern uv_cond_t gc_threads_cond; +extern _Atomic(int) gc_n_threads_marking; +extern _Atomic(int) gc_n_threads_sweeping; +void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq); +void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t **fl_end) JL_NOTSAFEPOINT; +void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) JL_NOTSAFEPOINT; +void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq); +void gc_mark_loop_serial(jl_ptls_t ptls); +void gc_mark_loop_parallel(jl_ptls_t ptls, int master); +void gc_sweep_pool_parallel(void); +void gc_free_pages(void); void sweep_stack_pools(void); void jl_gc_debug_init(void); -extern void *gc_mark_label_addrs[_GC_MARK_L_MAX]; - // GC pages -void jl_gc_init_page(void); +void jl_gc_init_page(void) JL_NOTSAFEPOINT; NOINLINE jl_gc_pagemeta_t *jl_gc_alloc_page(void) JL_NOTSAFEPOINT; -void jl_gc_free_page(void *p) JL_NOTSAFEPOINT; +void jl_gc_free_page(jl_gc_pagemeta_t *p) JL_NOTSAFEPOINT; // GC debug @@ -609,7 +552,6 @@ static inline void gc_verify_tags(void) } #endif - #ifdef GC_VERIFY extern jl_value_t *lostval; void gc_verify(jl_ptls_t ptls); @@ -650,10 +592,9 @@ extern int gc_verifying; #define gc_verifying (0) #endif - int gc_slot_to_fieldidx(void *_obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT; int gc_slot_to_arrayidx(void *_obj, void *begin) JL_NOTSAFEPOINT; -NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_mark_sp_t sp, int pc_offset); +NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_markqueue_t *mq, int offset) JL_NOTSAFEPOINT; #ifdef GC_DEBUG_ENV JL_DLLEXPORT extern jl_gc_debug_env_t jl_gc_debug_env; @@ -717,6 +658,7 @@ void gc_count_pool(void); size_t jl_array_nbytes(jl_array_t *a) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_enable_gc_logging(int enable); +JL_DLLEXPORT uint32_t jl_get_num_stack_mappings(void); void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect) JL_NOTSAFEPOINT; #ifdef __cplusplus diff --git a/src/init.c b/src/init.c index 522f16041d566..ea344fb9e1e62 100644 --- a/src/init.c +++ b/src/init.c @@ -804,6 +804,8 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel) _finish_julia_init(rel, ptls, ct); } +void jl_init_heartbeat(void); + static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_task_t *ct) { jl_resolve_sysimg_location(rel); @@ -840,9 +842,12 @@ static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_ if (jl_base_module == NULL) { // nthreads > 1 requires code in Base jl_atomic_store_relaxed(&jl_n_threads, 1); + jl_n_gcthreads = 0; } jl_start_threads(); + jl_init_heartbeat(); + jl_gc_enable(1); if (jl_options.image_file && (!jl_generating_output() || jl_options.incremental) && jl_module_init_order) { diff --git a/src/interpreter.c b/src/interpreter.c index 1f9c416d99b1b..6f546db9bbbb1 100644 --- a/src/interpreter.c +++ b/src/interpreter.c @@ -65,7 +65,8 @@ extern void JL_GC_ENABLEFRAME(interpreter_state*) JL_NOTSAFEPOINT; // we define this separately so that we can populate the frame before we add it to the backtrace // it's recommended 
to mark the containing function with NOINLINE, though not essential #define JL_GC_ENABLEFRAME(frame) \ - ((void**)&frame[1])[0] = __builtin_frame_address(0); + jl_signal_fence(); \ + ((void**)&frame[1])[0] = __builtin_frame_address(0); #endif diff --git a/src/jl_exported_data.inc b/src/jl_exported_data.inc index fee57ed60dd7a..f28ecefbded4a 100644 --- a/src/jl_exported_data.inc +++ b/src/jl_exported_data.inc @@ -132,6 +132,7 @@ #define JL_EXPORTED_DATA_SYMBOLS(XX) \ XX(jl_n_threadpools, int) \ XX(jl_n_threads, _Atomic(int)) \ + XX(jl_n_gcthreads, int) \ XX(jl_options, jl_options_t) \ XX(jl_task_gcstack_offset, int) \ XX(jl_task_ptls_offset, int) \ diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index 7f29176e67755..7699dc8cdb904 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -160,6 +160,7 @@ XX(jl_gc_alloc_3w) \ XX(jl_gc_alloc_typed) \ XX(jl_gc_big_alloc) \ + XX(jl_gc_big_alloc_instrumented) \ XX(jl_gc_collect) \ XX(jl_gc_conservative_gc_support_enabled) \ XX(jl_gc_counted_calloc) \ @@ -176,6 +177,7 @@ XX(jl_gc_get_max_memory) \ XX(jl_gc_internal_obj_base_ptr) \ XX(jl_gc_is_enabled) \ + XX(jl_gc_pool_live_bytes) \ XX(jl_gc_live_bytes) \ XX(jl_gc_managed_malloc) \ XX(jl_gc_managed_realloc) \ @@ -186,6 +188,7 @@ XX(jl_gc_new_weakref_th) \ XX(jl_gc_num) \ XX(jl_gc_pool_alloc) \ + XX(jl_gc_pool_alloc_instrumented) \ XX(jl_gc_queue_multiroot) \ XX(jl_gc_queue_root) \ XX(jl_gc_safepoint) \ @@ -196,6 +199,7 @@ XX(jl_gc_set_cb_pre_gc) \ XX(jl_gc_set_cb_root_scanner) \ XX(jl_gc_set_cb_task_scanner) \ + XX(jl_gc_set_max_memory) \ XX(jl_gc_sync_total_bytes) \ XX(jl_gc_total_hrtime) \ XX(jl_gdblookup) \ diff --git a/src/jloptions.c b/src/jloptions.c index 988078c7c58d9..812543a09399e 100644 --- a/src/jloptions.c +++ b/src/jloptions.c @@ -40,6 +40,7 @@ JL_DLLEXPORT void jl_init_options(void) NULL, // cpu_target ("native", "core2", etc...) 
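    /* nthreadpools, nthreads, and the new ngcthreads below start at 0 and are
       filled in during option parsing; ngcthreads == 0 means --gcthreads was not
       passed, in which case (per the --gcthreads help text further down) the
       runtime falls back to half the number of compute threads. */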
0, // nthreadpools 0, // nthreads + 0, // ngcthreads NULL, // nthreads_per_pool 0, // nprocs NULL, // machine_file @@ -129,6 +130,7 @@ static const char opts[] = " interface if supported (Linux and Windows) or to the number of CPU\n" " threads if not supported (MacOS) or if process affinity is not\n" " configured, and sets M to 1.\n" + " --gcthreads=N Use N threads for GC, set to half of the number of compute threads if unspecified.\n" " -p, --procs {N|auto} Integer value N launches N additional local worker processes\n" " \"auto\" launches as many workers as the number of local CPU threads (logical cores)\n" " --machine-file <file> Run processes on hosts listed in <file>\n\n" @@ -253,7 +255,8 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) opt_strip_metadata, opt_strip_ir, opt_heap_size_hint, - opt_permalloc_pkgimg + opt_permalloc_pkgimg, + opt_gc_threads, }; static const char* const shortopts = "+vhqH:e:E:L:J:C:it:p:O:g:"; static const struct option longopts[] = { @@ -278,6 +281,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) { "cpu-target", required_argument, 0, 'C' }, { "procs", required_argument, 0, 'p' }, { "threads", required_argument, 0, 't' }, + { "gcthreads", required_argument, 0, opt_gc_threads }, { "machine-file", required_argument, 0, opt_machine_file }, { "project", optional_argument, 0, opt_project }, { "color", required_argument, 0, opt_color }, @@ -811,8 +815,6 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) break; } jl_options.heap_size_hint = (uint64_t)(value * multiplier); - - jl_gc_set_max_memory(jl_options.heap_size_hint); } } if (jl_options.heap_size_hint == 0) @@ -827,6 +829,13 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) else jl_errorf("julia: invalid argument to --permalloc-pkgimg={yes|no} (%s)", optarg); break; + case opt_gc_threads: + errno = 0; + long ngcthreads = strtol(optarg, &endptr, 10); + if (errno != 0 || optarg == endptr || *endptr != 0 || ngcthreads < 1 || ngcthreads >= INT16_MAX) + jl_errorf("julia: --gcthreads=<n>; n must be an integer >= 1"); + jl_options.ngcthreads = (int16_t)ngcthreads; + break; default: jl_errorf("julia: unhandled option -- %c\n" "This is a bug, please report it.", c); diff --git a/src/jloptions.h b/src/jloptions.h index 0b72b4c3be062..93f6d321f38d6 100644 --- a/src/jloptions.h +++ b/src/jloptions.h @@ -15,6 +15,7 @@ typedef struct { const char *cpu_target; int8_t nthreadpools; int16_t nthreads; + int16_t ngcthreads; const int16_t *nthreads_per_pool; int32_t nprocs; const char *machine_file; diff --git a/src/julia.h b/src/julia.h index 0a88633905324..a5d3259ad0d3f 100644 --- a/src/julia.h +++ b/src/julia.h @@ -927,6 +927,8 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, struct _jl_task_t *owner) JL_N JL_DLLEXPORT void jl_free_stack(void *stkbuf, size_t bufsz); JL_DLLEXPORT void jl_gc_use(jl_value_t *a); JL_DLLEXPORT uint64_t jl_gc_get_max_memory(void); +// Set GC memory trigger in bytes for greedy memory collecting +JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem); JL_DLLEXPORT void jl_clear_malloc_data(void); @@ -968,6 +970,11 @@ JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz, int isaligned, jl_value_t *owner); JL_DLLEXPORT void jl_gc_safepoint(void); +void *mtarraylist_get(small_arraylist_t *_a, size_t idx) JL_NOTSAFEPOINT; +size_t mtarraylist_length(small_arraylist_t *_a) JL_NOTSAFEPOINT; +void mtarraylist_add(small_arraylist_t *_a, void *elt, size_t idx) JL_NOTSAFEPOINT; +void mtarraylist_push(small_arraylist_t *_a, void *elt) 
JL_NOTSAFEPOINT; + // object accessors ----------------------------------------------------------- #define jl_svec_len(t) (((jl_svec_t*)(t))->length) @@ -1684,6 +1691,7 @@ JL_DLLEXPORT jl_sym_t *jl_get_ARCH(void) JL_NOTSAFEPOINT; JL_DLLEXPORT jl_value_t *jl_get_libllvm(void) JL_NOTSAFEPOINT; extern JL_DLLIMPORT int jl_n_threadpools; extern JL_DLLIMPORT _Atomic(int) jl_n_threads; +extern JL_DLLIMPORT int jl_n_gcthreads; extern JL_DLLIMPORT int *jl_n_threads_per_pool; // environment entries @@ -2241,9 +2249,11 @@ typedef struct { // controls the emission of debug-info. mirrors the clang options int gnu_pubnames; // can we emit the gnu pubnames debuginfo - int debug_info_kind; // Enum for line-table-only, line-directives-only, + int debug_info_kind; // Enum for line-table-only, line-directives-only, // limited, standalone + int safepoint_on_entry; // Emit a safepoint on entry to each function + // Cache access. Default: jl_rettype_inferred. jl_codeinstance_lookup_t lookup; diff --git a/src/julia_gcext.h b/src/julia_gcext.h index 27f0a6b5ec11c..e7cb57d622a78 100644 --- a/src/julia_gcext.h +++ b/src/julia_gcext.h @@ -34,6 +34,10 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_external_alloc(jl_gc_cb_notify_external_al JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_free_t cb, int enable); +// Memory pressure callback +typedef void (*jl_gc_cb_notify_gc_pressure_t)(void); +JL_DLLEXPORT void jl_gc_set_cb_notify_gc_pressure(jl_gc_cb_notify_gc_pressure_t cb, int enable); + // Types for custom mark and sweep functions. typedef uintptr_t (*jl_markfunc_t)(jl_ptls_t, jl_value_t *obj); typedef void (*jl_sweepfunc_t)(jl_value_t *obj); diff --git a/src/julia_internal.h b/src/julia_internal.h index e5419efd2a2ac..b932cc82be6ea 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -199,6 +199,8 @@ JL_DLLEXPORT void jl_lock_profile(void) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_unlock_profile(void) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_lock_profile_wr(void) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_unlock_profile_wr(void) JL_NOTSAFEPOINT; +int jl_lock_stackwalk(void) JL_NOTSAFEPOINT; +void jl_unlock_stackwalk(int lockret) JL_NOTSAFEPOINT; // number of cycles since power-on static inline uint64_t cycleclock(void) JL_NOTSAFEPOINT @@ -335,7 +337,6 @@ void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset) JL_NOTSAFEPOINT; void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, unsigned offset) JL_NOTSAFEPOINT; -void jl_gc_force_mark_old(jl_ptls_t ptls, jl_value_t *v); void gc_sweep_sysimg(void); @@ -358,24 +359,48 @@ static const int jl_gc_sizeclasses[] = { 144, 160, 176, 192, 208, 224, 240, 256, // the following tables are computed for maximum packing efficiency via the formula: - // pg = 2^14 + // pg = GC_SMALL_PAGE ? 
2^12 : 2^14 // sz = (div.(pg-8, rng).÷16)*16; hcat(sz, (pg-8).÷sz, pg .- (pg-8).÷sz.*sz)' +#ifdef GC_SMALL_PAGE + // rng = 15:-1:2 (14 pools) + 272, 288, 304, 336, 368, 400, 448, 496, 576, 672, 816, 1008, 1360, 2032 +// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, /pool +// 16, 64, 144, 64, 48, 96, 64, 128, 64, 64, 16, 64, 16, 32, bytes lost +#else // rng = 60:-4:32 (8 pools) 272, 288, 304, 336, 368, 400, 448, 496, -// 60, 56, 53, 48, 44, 40, 36, 33, /pool -// 64, 256, 272, 256, 192, 384, 256, 16, bytes lost +// 60, 56, 53, 48, 44, 40, 36, 33, /pool +// 64, 256, 272, 256, 192, 384, 256, 16, bytes lost // rng = 30:-2:16 (8 pools) 544, 576, 624, 672, 736, 816, 896, 1008, -// 30, 28, 26, 24, 22, 20, 18, 16, /pool -// 64, 256, 160, 256, 192, 64, 256, 256, bytes lost +// 30, 28, 26, 24, 22, 20, 18, 16, /pool +// 64, 256, 160, 256, 192, 64, 256, 256, bytes lost // rng = 15:-1:8 (8 pools) 1088, 1168, 1248, 1360, 1488, 1632, 1808, 2032 -// 15, 14, 13, 12, 11, 10, 9, 8, /pool -// 64, 32, 160, 64, 16, 64, 112, 128, bytes lost +// 15, 14, 13, 12, 11, 10, 9, 8, /pool +// 64, 32, 160, 64, 16, 64, 112, 128, bytes lost +#endif }; +#ifdef GC_SMALL_PAGE +#ifdef _P64 +# define JL_GC_N_POOLS 39 +#elif MAX_ALIGN == 8 +# define JL_GC_N_POOLS 40 +#else +# define JL_GC_N_POOLS 41 +#endif +#else +#ifdef _P64 +# define JL_GC_N_POOLS 49 +#elif MAX_ALIGN == 8 +# define JL_GC_N_POOLS 50 +#else +# define JL_GC_N_POOLS 51 +#endif +#endif static_assert(sizeof(jl_gc_sizeclasses) / sizeof(jl_gc_sizeclasses[0]) == JL_GC_N_POOLS, ""); STATIC_INLINE int jl_gc_alignment(size_t sz) @@ -402,7 +427,12 @@ JL_DLLEXPORT int jl_alignment(size_t sz); // the following table is computed as: // [searchsortedfirst(jl_gc_sizeclasses, i) - 1 for i = 0:16:jl_gc_sizeclasses[end]] -static const uint8_t szclass_table[] = {0, 1, 3, 5, 7, 9, 11, 13, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 43, 43, 43, 43, 43, 44, 44, 44, 44, 44, 44, 44, 45, 45, 45, 45, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48}; +static const uint8_t szclass_table[] = +#ifdef GC_SMALL_PAGE + {0,1,3,5,7,9,11,13,15,17,18,19,20,21,22,23,24,25,26,27,28,28,29,29,30,30,31,31,31,32,32,32,33,33,33,33,33,34,34,34,34,34,34,35,35,35,35,35,35,35,35,35,36,36,36,36,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38}; +#else + {0,1,3,5,7,9,11,13,15,17,18,19,20,21,22,23,24,25,26,27,28,28,29,29,30,30,31,31,31,32,32,32,33,33,33,34,34,35,35,35,36,36,36,37,37,37,37,38,38,38,38,38,39,39,39,39,39,40,40,40,40,40,40,40,41,41,41,41,41,42,42,42,42,42,43,43,43,43,43,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,46,46,46,46,46,46,46,46,46,47,47,47,47,47,47,47,47,47,47,47,48,48,48,48,48,48,48,48,48,48,48,48,48,48}; +#endif static_assert(sizeof(szclass_table) == 128, ""); STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass(unsigned sz) @@ -560,9 +590,6 @@ void jl_gc_run_all_finalizers(jl_task_t *ct); void jl_release_task_stack(jl_ptls_t ptls, jl_task_t *task); void jl_gc_add_finalizer_(jl_ptls_t ptls, void *v, void *f) JL_NOTSAFEPOINT; -// Set GC memory trigger in bytes for greedy memory collecting -void 
jl_gc_set_max_memory(uint64_t max_mem); - JL_DLLEXPORT void jl_gc_queue_binding(jl_binding_t *bnd) JL_NOTSAFEPOINT; void gc_setmark_buf(jl_ptls_t ptls, void *buf, uint8_t, size_t) JL_NOTSAFEPOINT; @@ -582,8 +609,8 @@ STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOT } } -void jl_gc_debug_print_status(void); -JL_DLLEXPORT void jl_gc_debug_critical_error(void); +void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT; void jl_print_gc_stats(JL_STREAM *s); void jl_gc_reset_alloc_count(void); uint32_t jl_get_gs_ctr(void); @@ -1173,14 +1200,17 @@ size_t rec_backtrace_ctx_dwarf(jl_bt_element_t *bt_data, size_t maxsize, bt_cont #endif JL_DLLEXPORT jl_value_t *jl_get_backtrace(void); void jl_critical_error(int sig, int si_code, bt_context_t *context, jl_task_t *ct); -JL_DLLEXPORT void jl_raise_debugger(void); +JL_DLLEXPORT void jl_raise_debugger(void) JL_NOTSAFEPOINT; int jl_getFunctionInfo(jl_frame_t **frames, uintptr_t pointer, int skipC, int noInline) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_gdblookup(void* ip) JL_NOTSAFEPOINT; -void jl_print_native_codeloc(uintptr_t ip) JL_NOTSAFEPOINT; -void jl_print_bt_entry_codeloc(jl_bt_element_t *bt_data) JL_NOTSAFEPOINT; +void jl_print_native_codeloc(char *pre_str, uintptr_t ip) JL_NOTSAFEPOINT; +void jl_print_bt_entry_codeloc(int sig, jl_bt_element_t *bt_data) JL_NOTSAFEPOINT; #ifdef _OS_WINDOWS_ JL_DLLEXPORT void jl_refresh_dbg_module_list(void); #endif +int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx) JL_NOTSAFEPOINT; +void jl_thread_resume(int tid) JL_NOTSAFEPOINT; + // *to is NULL or malloc'd pointer, from is allowed to be NULL STATIC_INLINE char *jl_copy_str(char **to, const char *from) JL_NOTSAFEPOINT { diff --git a/src/julia_threads.h b/src/julia_threads.h index 847465b363a2e..f69f9dd4baacf 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -5,6 +5,7 @@ #define JL_THREADS_H #include "julia_atomics.h" +#include "work-stealing-queue.h" #ifndef _OS_WINDOWS_ #include "pthread.h" #endif @@ -106,7 +107,7 @@ typedef struct { // handle to reference an OS thread #ifdef _OS_WINDOWS_ -typedef DWORD jl_thread_t; +typedef HANDLE jl_thread_t; #else typedef pthread_t jl_thread_t; #endif @@ -127,6 +128,7 @@ typedef struct { typedef struct { _Atomic(int64_t) allocd; + _Atomic(int64_t) pool_live_bytes; _Atomic(int64_t) freed; _Atomic(uint64_t) malloc; _Atomic(uint64_t) realloc; @@ -137,10 +139,10 @@ typedef struct { typedef struct { // variable for tracking weak references - arraylist_t weak_refs; + small_arraylist_t weak_refs; // live tasks started on this thread // that are holding onto a stack from the pool - arraylist_t live_tasks; + small_arraylist_t live_tasks; // variables for tracking malloc'd arrays struct _mallocarray_t *mallocarrays; @@ -158,29 +160,18 @@ typedef struct { arraylist_t *last_remset; // variables for allocating objects from pools -#ifdef _P64 -# define JL_GC_N_POOLS 49 -#elif MAX_ALIGN == 8 -# define JL_GC_N_POOLS 50 -#else -# define JL_GC_N_POOLS 51 -#endif - jl_gc_pool_t norm_pools[JL_GC_N_POOLS]; +#define JL_GC_N_MAX_POOLS 51 // conservative. must be kept in sync with `src/julia_internal.h` + jl_gc_pool_t norm_pools[JL_GC_N_MAX_POOLS]; #define JL_N_STACK_POOLS 16 - arraylist_t free_stacks[JL_N_STACK_POOLS]; + small_arraylist_t free_stacks[JL_N_STACK_POOLS]; } jl_thread_heap_t; -// Cache of thread local change to global metadata during GC -// This is sync'd after marking. 
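The deleted lines above and the markqueue hunk that follows swap the old explicit mark stack (single-threaded `pc`/`data` stacks) for `jl_gc_markqueue_t`, whose `chunk_queue` and `ptr_queue` are `ws_queue_t` work-stealing queues from the newly included `work-stealing-queue.h`: each GC thread drains its own queue and steals from a peer when it runs dry. A rough, mutex-based C sketch of that owner-pop/thief-steal contract follows (the real `ws_queue_t` is a lock-free Chase-Lev-style deque; all `demo_*` names are illustrative, not the runtime's API):

#include <pthread.h>
#include <stddef.h>

#define DEMO_WS_CAP 1024

// Stand-in for ws_queue_t: one owner pushes/pops at `bottom`,
// idle mark threads steal from `top`.
typedef struct {
    void *buf[DEMO_WS_CAP];
    size_t top, bottom;
    pthread_mutex_t lock;
} demo_ws_queue_t; // init: demo_ws_queue_t q = { .lock = PTHREAD_MUTEX_INITIALIZER };

static void demo_ws_push(demo_ws_queue_t *q, void *obj)
{
    pthread_mutex_lock(&q->lock);
    q->buf[q->bottom++ % DEMO_WS_CAP] = obj; // sketch: no overflow handling
    pthread_mutex_unlock(&q->lock);
}

static void *demo_ws_pop(demo_ws_queue_t *q) // owner side: LIFO
{
    void *obj = NULL;
    pthread_mutex_lock(&q->lock);
    if (q->bottom > q->top)
        obj = q->buf[--q->bottom % DEMO_WS_CAP];
    pthread_mutex_unlock(&q->lock);
    return obj;
}

static void *demo_ws_steal(demo_ws_queue_t *q) // thief side: FIFO
{
    void *obj = NULL;
    pthread_mutex_lock(&q->lock);
    if (q->bottom > q->top)
        obj = q->buf[q->top++ % DEMO_WS_CAP];
    pthread_mutex_unlock(&q->lock);
    return obj;
}

Owner pops are LIFO, which keeps recently pushed objects hot in cache while tracing; steals are FIFO from the opposite end, which reduces contention and tends to hand thieves the oldest, largest units of remaining work.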
-typedef union _jl_gc_mark_data jl_gc_mark_data_t; - typedef struct { - void **pc; // Current stack address for the pc (up growing) - jl_gc_mark_data_t *data; // Current stack address for the data (up growing) - void **pc_start; // Cached value of `gc_cache->pc_stack` - void **pc_end; // Cached value of `gc_cache->pc_stack_end` -} jl_gc_mark_sp_t; + ws_queue_t chunk_queue; + ws_queue_t ptr_queue; + arraylist_t reclaim_set; +} jl_gc_markqueue_t; typedef struct { // thread local increment of `perm_scanned_bytes` @@ -198,12 +189,14 @@ typedef struct { // this makes sure that a single objects can only appear once in // the lists (the mark bit cannot be flipped to `0` without sweeping) void *big_obj[1024]; - void **pc_stack; - void **pc_stack_end; - jl_gc_mark_data_t *data_stack; } jl_gc_mark_cache_t; struct _jl_bt_element_t; +struct _jl_gc_pagemeta_t; + +typedef struct { + _Atomic(struct _jl_gc_pagemeta_t *) bottom; +} jl_gc_page_stack_t; // This includes all the thread local states we care about for a thread. // Changes to TLS field types must be reflected in codegen. @@ -266,9 +259,12 @@ typedef struct _jl_tls_states_t { #endif jl_thread_t system_id; arraylist_t finalizers; + jl_gc_page_stack_t page_metadata_allocd; + jl_gc_page_stack_t page_metadata_buffered; + jl_gc_markqueue_t mark_queue; jl_gc_mark_cache_t gc_cache; arraylist_t sweep_objs; - jl_gc_mark_sp_t gc_mark_sp; + _Atomic(int64_t) gc_sweeps_requested; // Saved exception for previous *external* API call or NULL if cleared. // Access via jl_exception_occurred(). struct _jl_value_t *previous_exception; diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index 1da37a249fbd2..30a5d9a59f676 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -27,6 +27,7 @@ STATISTIC(GetGCFrameSlotCount, "Number of lowered getGCFrameSlotFunc intrinsics" STATISTIC(GCAllocBytesCount, "Number of lowered GCAllocBytesFunc intrinsics"); STATISTIC(QueueGCRootCount, "Number of lowered queueGCRootFunc intrinsics"); STATISTIC(QueueGCBindingCount, "Number of lowered queueGCBindingFunc intrinsics"); +STATISTIC(SafepointCount, "Number of lowered safepoint intrinsics"); using namespace llvm; @@ -72,6 +73,9 @@ struct FinalLowerGC: private JuliaPassContext { // Lowers a `julia.queue_gc_binding` intrinsic. Value *lowerQueueGCBinding(CallInst *target, Function &F); + + // Lowers a `julia.safepoint` intrinsic. 
+ Value *lowerSafepoint(CallInst *target, Function &F); }; Value *FinalLowerGC::lowerNewGCFrame(CallInst *target, Function &F) @@ -202,15 +206,28 @@ Value *FinalLowerGC::lowerQueueGCBinding(CallInst *target, Function &F) return target; } +Value *FinalLowerGC::lowerSafepoint(CallInst *target, Function &F) +{ + ++SafepointCount; + assert(target->arg_size() == 1); + IRBuilder<> builder(target->getContext()); + builder.SetInsertPoint(target); + auto T_size = getSizeTy(builder.getContext()); + Value* signal_page = target->getOperand(0); + Value* load = builder.CreateLoad(T_size, signal_page, true); + return load; +} + Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) { ++GCAllocBytesCount; - assert(target->arg_size() == 2); + assert(target->arg_size() == 3); CallInst *newI; IRBuilder<> builder(target); builder.SetCurrentDebugLocation(target->getDebugLoc()); auto ptls = target->getArgOperand(0); + auto type = target->getArgOperand(2); Attribute derefAttr; if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) { @@ -221,19 +238,19 @@ Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) if (offset < 0) { newI = builder.CreateCall( bigAllocFunc, - { ptls, ConstantInt::get(getSizeTy(F.getContext()), sz + sizeof(void*)) }); + { ptls, ConstantInt::get(getSizeTy(F.getContext()), sz + sizeof(void*)), type }); derefAttr = Attribute::getWithDereferenceableBytes(F.getContext(), sz + sizeof(void*)); } else { auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), offset); auto pool_osize = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize); - newI = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize }); + newI = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize, type }); derefAttr = Attribute::getWithDereferenceableBytes(F.getContext(), osize); } } else { auto size = builder.CreateZExtOrTrunc(target->getArgOperand(1), getSizeTy(F.getContext())); size = builder.CreateAdd(size, ConstantInt::get(getSizeTy(F.getContext()), sizeof(void*))); - newI = builder.CreateCall(allocTypedFunc, { ptls, size, ConstantPointerNull::get(Type::getInt8PtrTy(F.getContext())) }); + newI = builder.CreateCall(allocTypedFunc, { ptls, size, type }); derefAttr = Attribute::getWithDereferenceableBytes(F.getContext(), sizeof(void*)); } newI->setAttributes(newI->getCalledFunction()->getAttributes()); @@ -316,16 +333,20 @@ static void replaceInstruction( bool FinalLowerGC::runOnFunction(Function &F) { - LLVM_DEBUG(dbgs() << "FINAL GC LOWERING: Processing function " << F.getName() << "\n"); // Check availability of functions again since they might have been deleted. initFunctions(*F.getParent()); - if (!pgcstack_getter && !adoptthread_func) + if (!pgcstack_getter && !adoptthread_func) { + LLVM_DEBUG(dbgs() << "FINAL GC LOWERING: Skipping function " << F.getName() << "\n"); return false; + } // Look for a call to 'julia.get_pgcstack'. pgcstack = getPGCstack(F); - if (!pgcstack) + if (!pgcstack) { + LLVM_DEBUG(dbgs() << "FINAL GC LOWERING: Skipping function " << F.getName() << " no pgcstack\n"); return false; + } + LLVM_DEBUG(dbgs() << "FINAL GC LOWERING: Processing function " << F.getName() << "\n"); // Acquire intrinsic functions. 
auto newGCFrameFunc = getOrNull(jl_intrinsics::newGCFrame); @@ -335,6 +356,7 @@ bool FinalLowerGC::runOnFunction(Function &F) auto GCAllocBytesFunc = getOrNull(jl_intrinsics::GCAllocBytes); auto queueGCRootFunc = getOrNull(jl_intrinsics::queueGCRoot); auto queueGCBindingFunc = getOrNull(jl_intrinsics::queueGCBinding); + auto safepointFunc = getOrNull(jl_intrinsics::safepoint); // Lower all calls to supported intrinsics. for (BasicBlock &BB : F) { @@ -346,6 +368,7 @@ bool FinalLowerGC::runOnFunction(Function &F) } Value *callee = CI->getCalledOperand(); + assert(callee); if (callee == newGCFrameFunc) { replaceInstruction(CI, lowerNewGCFrame(CI, F), it); @@ -370,6 +393,10 @@ bool FinalLowerGC::runOnFunction(Function &F) else if (callee == queueGCBindingFunc) { replaceInstruction(CI, lowerQueueGCBinding(CI, F), it); } + else if (callee == safepointFunc) { + lowerSafepoint(CI, F); + it = CI->eraseFromParent(); + } else { ++it; } diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index eaba9c7b10d98..9c8959ae7874a 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2324,22 +2324,6 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { IRBuilder<> builder(CI); builder.SetCurrentDebugLocation(CI->getDebugLoc()); - // Create a call to the `julia.gc_alloc_bytes` intrinsic, which is like - // `julia.gc_alloc_obj` except it doesn't set the tag. - auto allocBytesIntrinsic = getOrDeclare(jl_intrinsics::GCAllocBytes); - auto ptlsLoad = get_current_ptls_from_task(builder, CI->getArgOperand(0), tbaa_gcframe); - auto ptls = builder.CreateBitCast(ptlsLoad, Type::getInt8PtrTy(builder.getContext())); - auto newI = builder.CreateCall( - allocBytesIntrinsic, - { - ptls, - builder.CreateIntCast( - CI->getArgOperand(1), - allocBytesIntrinsic->getFunctionType()->getParamType(1), - false) - }); - newI->takeName(CI); - // LLVM alignment/bit check is not happy about addrspacecast and refuse // to remove write barrier because of it. // We pretty much only load using `T_size` so try our best to strip @@ -2378,7 +2362,36 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { builder.CreateAlignmentAssumption(DL, tag, 16); } } - // Set the tag. + + // Create a call to the `julia.gc_alloc_bytes` intrinsic, which is like + // `julia.gc_alloc_obj` except it specializes the call based on the constant + // size of the object to allocate, to save one indirection, and doesn't set + // the type tag. (Note that if the size is not a constant, it will call + // gc_alloc_obj, and will redundantly set the tag.) + auto allocBytesIntrinsic = getOrDeclare(jl_intrinsics::GCAllocBytes); + auto ptlsLoad = get_current_ptls_from_task(builder, CI->getArgOperand(0), tbaa_gcframe); + auto ptls = builder.CreateBitCast(ptlsLoad, Type::getInt8PtrTy(builder.getContext())); + auto newI = builder.CreateCall( + allocBytesIntrinsic, + { + ptls, + builder.CreateIntCast( + CI->getArgOperand(1), + allocBytesIntrinsic->getFunctionType()->getParamType(1), + false), + builder.CreatePtrToInt(tag, T_size), + }); + newI->takeName(CI); + + // Now, finally, set the tag. We do this in IR instead of in the C alloc + // function, to provide possible optimization opportunities. (I think? TBH + // the most recent editor of this code is not entirely clear on why we + // prefer to set the tag in the generated code. 
Providing optimization + opportunities is the most likely reason; the tradeoff is slightly + larger code size and increased compilation time, compiling this + instruction at every allocation site, rather than once in the C alloc + function.) + auto &M = *builder.GetInsertBlock()->getModule(); StoreInst *store = builder.CreateAlignedStore( tag, EmitTagPtr(builder, tag_type, newI), Align(sizeof(size_t))); store->setOrdering(AtomicOrdering::Unordered); diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index fa3437ffdce48..91850ebe8df07 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -119,6 +119,13 @@ namespace jl_intrinsics { static const char *POP_GC_FRAME_NAME = "julia.pop_gc_frame"; static const char *QUEUE_GC_ROOT_NAME = "julia.queue_gc_root"; static const char *QUEUE_GC_BINDING_NAME = "julia.queue_gc_binding"; + static const char *SAFEPOINT_NAME = "julia.safepoint"; + + static auto T_size_t(const JuliaPassContext &context) { + return sizeof(size_t) == sizeof(uint32_t) ? + Type::getInt32Ty(context.getLLVMContext()) : + Type::getInt64Ty(context.getLLVMContext()); + } // Annotates a function with attributes suitable for GC allocation // functions. Specifically, the return value is marked noalias and nonnull. @@ -150,9 +157,8 @@ namespace jl_intrinsics { FunctionType::get( context.T_prjlvalue, { Type::getInt8PtrTy(context.getLLVMContext()), - sizeof(size_t) == sizeof(uint32_t) ? - Type::getInt32Ty(context.getLLVMContext()) : - Type::getInt64Ty(context.getLLVMContext()) }, + T_size_t(context), + T_size_t(context) }, // type false), Function::ExternalLinkage, GC_ALLOC_BYTES_NAME); @@ -224,15 +230,37 @@ namespace jl_intrinsics { intrinsic->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); return intrinsic; }); + + const IntrinsicDescription safepoint( + SAFEPOINT_NAME, + [](const JuliaPassContext &context) { + auto T_size = getSizeTy(context.getLLVMContext()); + auto T_psize = T_size->getPointerTo(); + auto intrinsic = Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + {T_psize}, + false), + Function::ExternalLinkage, + SAFEPOINT_NAME); + intrinsic->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + return intrinsic; + }); } namespace jl_well_known { - static const char *GC_BIG_ALLOC_NAME = XSTR(jl_gc_big_alloc); - static const char *GC_POOL_ALLOC_NAME = XSTR(jl_gc_pool_alloc); + static const char *GC_BIG_ALLOC_NAME = XSTR(jl_gc_big_alloc_instrumented); + static const char *GC_POOL_ALLOC_NAME = XSTR(jl_gc_pool_alloc_instrumented); static const char *GC_QUEUE_ROOT_NAME = XSTR(jl_gc_queue_root); static const char *GC_QUEUE_BINDING_NAME = XSTR(jl_gc_queue_binding); static const char *GC_ALLOC_TYPED_NAME = XSTR(jl_gc_alloc_typed); + static auto T_size_t(const JuliaPassContext &context) { + return sizeof(size_t) == sizeof(uint32_t) ? + Type::getInt32Ty(context.getLLVMContext()) : + Type::getInt64Ty(context.getLLVMContext()); + } + using jl_intrinsics::addGCAllocAttributes; const WellKnownFunctionDescription GCBigAlloc( @@ -242,9 +270,8 @@ namespace jl_well_known { FunctionType::get( context.T_prjlvalue, { Type::getInt8PtrTy(context.getLLVMContext()), - sizeof(size_t) == sizeof(uint32_t) ? 
- Type::getInt32Ty(context.getLLVMContext()) : - Type::getInt64Ty(context.getLLVMContext()) }, + T_size_t(context), + T_size_t(context) }, false), Function::ExternalLinkage, GC_BIG_ALLOC_NAME); @@ -258,7 +285,7 @@ namespace jl_well_known { auto poolAllocFunc = Function::Create( FunctionType::get( context.T_prjlvalue, - { Type::getInt8PtrTy(context.getLLVMContext()), Type::getInt32Ty(context.getLLVMContext()), Type::getInt32Ty(context.getLLVMContext()) }, + { Type::getInt8PtrTy(context.getLLVMContext()), Type::getInt32Ty(context.getLLVMContext()), Type::getInt32Ty(context.getLLVMContext()), T_size_t(context) }, false), Function::ExternalLinkage, GC_POOL_ALLOC_NAME); @@ -301,10 +328,8 @@ namespace jl_well_known { FunctionType::get( context.T_prjlvalue, { Type::getInt8PtrTy(context.getLLVMContext()), - sizeof(size_t) == sizeof(uint32_t) ? - Type::getInt32Ty(context.getLLVMContext()) : - Type::getInt64Ty(context.getLLVMContext()), - Type::getInt8PtrTy(context.getLLVMContext()) }, + T_size_t(context), + T_size_t(context) }, // type false), Function::ExternalLinkage, GC_ALLOC_TYPED_NAME); diff --git a/src/llvm-pass-helpers.h b/src/llvm-pass-helpers.h index f25f9181ddb18..e54f39c05ba59 100644 --- a/src/llvm-pass-helpers.h +++ b/src/llvm-pass-helpers.h @@ -129,6 +129,9 @@ namespace jl_intrinsics { // `julia.queue_gc_binding`: an intrinsic that queues a binding for GC. extern const IntrinsicDescription queueGCBinding; + + // `julia.safepoint`: an intrinsic that triggers a GC safepoint. + extern const IntrinsicDescription safepoint; } // A namespace for well-known Julia runtime function descriptions. diff --git a/src/llvm-ptls.cpp b/src/llvm-ptls.cpp index a39a73c5393a2..be4cc3a1edf2a 100644 --- a/src/llvm-ptls.cpp +++ b/src/llvm-ptls.cpp @@ -207,7 +207,7 @@ void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter, IRBuilder<> builder(fastTerm->getParent()); fastTerm->removeFromParent(); MDNode *tbaa = tbaa_gcframe; - Value *prior = emit_gc_unsafe_enter(builder, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, pgcstack), tbaa)); + Value *prior = emit_gc_unsafe_enter(builder, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, pgcstack), tbaa), true); builder.Insert(fastTerm); phi->addIncoming(pgcstack, fastTerm->getParent()); // emit pre-return cleanup @@ -219,7 +219,7 @@ void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter, for (auto &BB : *pgcstack->getParent()->getParent()) { if (isa<ReturnInst>(BB.getTerminator())) { IRBuilder<> builder(BB.getTerminator()); - emit_gc_unsafe_leave(builder, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, phi), tbaa), last_gc_state); + emit_gc_unsafe_leave(builder, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, phi), tbaa), last_gc_state, true); } } } diff --git a/src/mach_dyld_atfork.tbd b/src/mach_dyld_atfork.tbd index 9a5d18099dbcf..c2cda4417ec38 100644 --- a/src/mach_dyld_atfork.tbd +++ b/src/mach_dyld_atfork.tbd @@ -21,5 +21,6 @@ install-name: '/usr/lib/libSystem.B.dylib' exports: - targets: [ arm64-macos, arm64e-macos, x86_64-macos, x86_64-maccatalyst, arm64-maccatalyst, arm64e-maccatalyst ] - symbols: [ __dyld_atfork_parent, __dyld_atfork_prepare ] + symbols: [ __dyld_atfork_parent, __dyld_atfork_prepare, + __dyld_dlopen_atfork_parent, __dyld_dlopen_atfork_prepare ] ... 
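Taken together, the pass-helper declarations above and `FinalLowerGC::lowerSafepoint` earlier make a GC poll nothing more than a volatile load from the safepoint page, which the runtime can arm with page protection to stop the world. A freestanding sketch of that mechanism under stated assumptions (POSIX `mmap`/`mprotect` only; `demo_*` names are hypothetical, and the real runtime parks faulting threads in its segfault handler rather than crashing):

#include <stddef.h>
#include <sys/mman.h>

static size_t *demo_signal_page; // stand-in for the runtime's safepoint page

static void demo_safepoint_init(void)
{
    demo_signal_page = mmap(NULL, 4096, PROT_READ,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
}

// What `julia.safepoint` lowers to: a volatile load the optimizer must keep,
// mirroring builder.CreateLoad(T_size, signal_page, /*isVolatile=*/true).
static inline void demo_safepoint(void)
{
    *(volatile size_t *)demo_signal_page;
}

// Collector side: arming the page makes every poll fault, so each mutator
// thread traps into the (not shown) signal handler and waits there.
static void demo_stop_the_world(void)
{
    mprotect(demo_signal_page, 4096, PROT_NONE);
}

static void demo_resume_the_world(void)
{
    mprotect(demo_signal_page, 4096, PROT_READ);
}

This is also why the new `safepoint_on_entry` codegen parameter (see the `src/julia.h` hunk earlier) is cheap to enable: the poll emitted at each function entry is a single load from an always-mapped page until a collection is actually requested.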
diff --git a/src/mtarraylist.c b/src/mtarraylist.c new file mode 100644 index 0000000000000..8bad44797dab4 --- /dev/null +++ b/src/mtarraylist.c @@ -0,0 +1,81 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#include "julia.h" +#include "julia_internal.h" +#include "julia_assert.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// this file provides some alternate API functions for small_arraylist (push and add) +// which can be safely observed from other threads concurrently +// only a single writer thread is permitted (or writers must hold a mutex) +// but there can be any number of observers + +typedef struct { + _Atomic(uint32_t) len; + uint32_t max; + _Atomic(_Atomic(void*)*) items; + _Atomic(void*) _space[SMALL_AL_N_INLINE]; +} small_mtarraylist_t; + +// change capacity to at least newlen +static void mtarraylist_resizeto(small_mtarraylist_t *a, size_t len, size_t newlen) JL_NOTSAFEPOINT +{ + size_t max = a->max; + if (newlen > max) { + size_t nm = max * 2; + if (nm == 0) + nm = 1; + while (newlen > nm) + nm *= 2; + void *olditems = (void*)jl_atomic_load_relaxed(&a->items); + void *p = calloc_s(nm * sizeof(void*)); + memcpy(p, olditems, len * sizeof(void*)); + jl_atomic_store_release(&a->items, (_Atomic(void*)*)p); + a->max = nm; + if (olditems != (void*)&a->_space[0]) { + jl_task_t *ct = jl_current_task; + jl_gc_add_quiescent(ct->ptls, (void**)olditems, free); + } + } +} + +// single-threaded +void mtarraylist_push(small_arraylist_t *_a, void *elt) +{ + small_mtarraylist_t *a = (small_mtarraylist_t*)_a; + size_t len = jl_atomic_load_relaxed(&a->len); + mtarraylist_resizeto(a, len, len + 1); + jl_atomic_store_release(&jl_atomic_load_relaxed(&a->items)[len], elt); + jl_atomic_store_release(&a->len, len + 1); +} + +// single-threaded +void mtarraylist_add(small_arraylist_t *_a, void *elt, size_t idx) +{ + small_mtarraylist_t *a = (small_mtarraylist_t*)_a; + size_t len = jl_atomic_load_relaxed(&a->len); + mtarraylist_resizeto(a, len, idx + 1); + jl_atomic_store_release(&jl_atomic_load_relaxed(&a->items)[idx], elt); + if (jl_atomic_load_relaxed(&a->len) < idx + 1) + jl_atomic_store_release(&a->len, idx + 1); +} + +// concurrent-safe +size_t mtarraylist_length(small_arraylist_t *_a) +{ + small_mtarraylist_t *a = (small_mtarraylist_t*)_a; + return jl_atomic_load_relaxed(&a->len); +} + +// concurrent-safe +void *mtarraylist_get(small_arraylist_t *_a, size_t idx) +{ + small_mtarraylist_t *a = (small_mtarraylist_t*)_a; + size_t len = jl_atomic_load_acquire(&a->len); + if (idx >= len) + return NULL; + return jl_atomic_load_relaxed(&jl_atomic_load_relaxed(&a->items)[idx]); +} diff --git a/src/options.h b/src/options.h index 82b71431ecea0..1ff0f0ce545bd 100644 --- a/src/options.h +++ b/src/options.h @@ -81,6 +81,11 @@ // Automatic Instrumenting Profiler //#define ENABLE_TIMINGS +// pool allocator configuration options + +// GC_SMALL_PAGE allocates objects in 4k pages +// #define GC_SMALL_PAGE + // method dispatch profiling -------------------------------------------------- @@ -134,10 +139,16 @@ // threadpools specification #define THREADPOOLS_NAME "JULIA_THREADPOOLS" +// GC threads +#define NUM_GC_THREADS_NAME "JULIA_NUM_GC_THREADS" + // affinitization behavior #define MACHINE_EXCLUSIVE_NAME "JULIA_EXCLUSIVE" #define DEFAULT_MACHINE_EXCLUSIVE 0 +// heartbeats +#define JL_HEARTBEAT_THREAD + // partr -- parallel tasks runtime options ------------------------------------ // multiq diff --git a/src/partr.c b/src/partr.c index bbcf5048cace2..c86dae7239097 --- 
a/src/partr.c +++ b/src/partr.c @@ -78,7 +78,7 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA // GC functions used extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, - jl_gc_mark_sp_t *sp, jl_value_t *obj) JL_NOTSAFEPOINT; + jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; // parallel task runtime // --- @@ -108,7 +108,48 @@ void jl_init_threadinginfra(void) void JL_NORETURN jl_finish_task(jl_task_t *t); -// thread function: used by all except the main thread +static inline int may_mark(void) JL_NOTSAFEPOINT +{ + return (jl_atomic_load(&gc_n_threads_marking) > 0); +} + +static inline int may_sweep(jl_ptls_t ptls) JL_NOTSAFEPOINT +{ + return (jl_atomic_load(&ptls->gc_sweeps_requested) > 0); +} + +// gc thread function +void jl_gc_threadfun(void *arg) +{ + jl_threadarg_t *targ = (jl_threadarg_t*)arg; + + // initialize this thread (set tid and create heap) + jl_ptls_t ptls = jl_init_threadtls(targ->tid); + + // wait for all threads + jl_gc_state_set(ptls, JL_GC_STATE_WAITING, 0); + uv_barrier_wait(targ->barrier); + + // free the thread argument here + free(targ); + + while (1) { + uv_mutex_lock(&gc_threads_lock); + while (!may_mark() && !may_sweep(ptls)) { + uv_cond_wait(&gc_threads_cond, &gc_threads_lock); + } + uv_mutex_unlock(&gc_threads_lock); + if (may_mark()) { + gc_mark_loop_parallel(ptls, 0); + } + if (may_sweep(ptls)) { // not an else! + gc_sweep_pool_parallel(); + jl_atomic_fetch_add(&ptls->gc_sweeps_requested, -1); + } + } +} + +// thread function: used by all mutator threads except the main thread void jl_threadfun(void *arg) { jl_threadarg_t *targ = (jl_threadarg_t*)arg; @@ -419,7 +460,6 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, uv_mutex_lock(&ptls->sleep_lock); while (may_sleep(ptls)) { uv_cond_wait(&ptls->wake_signal, &ptls->sleep_lock); - // TODO: help with gc work here, if applicable } assert(jl_atomic_load_relaxed(&ptls->sleep_check_state) == not_sleeping); uv_mutex_unlock(&ptls->sleep_lock); diff --git a/src/signal-handling.c b/src/signal-handling.c index 391a97055af84..7137cafdef9c4 100644 --- a/src/signal-handling.c +++ b/src/signal-handling.c @@ -474,7 +474,7 @@ void jl_critical_error(int sig, int si_code, bt_context_t *context, jl_task_t *c *bt_size = n = rec_backtrace_ctx(bt_data, JL_MAX_BT_SIZE, context, NULL); } for (i = 0; i < n; i += jl_bt_entry_size(bt_data + i)) { - jl_print_bt_entry_codeloc(bt_data + i); + jl_print_bt_entry_codeloc(sig, bt_data + i); } jl_gc_debug_print_status(); jl_gc_debug_critical_error(); diff --git a/src/signals-mach.c b/src/signals-mach.c index 8ac7e5301d7ad..a772ecbd0a901 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -36,6 +36,9 @@ extern int _keymgr_set_lockmode_processwide_ptr(unsigned int key, unsigned int m extern void _dyld_atfork_prepare(void) __attribute__((weak_import)); extern void _dyld_atfork_parent(void) __attribute__((weak_import)); //extern void _dyld_fork_child(void) __attribute__((weak_import)); +extern void _dyld_dlopen_atfork_prepare(void) __attribute__((weak_import)); +extern void _dyld_dlopen_atfork_parent(void) __attribute__((weak_import)); +//extern void _dyld_dlopen_atfork_child(void) __attribute__((weak_import)); static void attach_exception_port(thread_port_t thread, int segv_only); @@ -381,12 +384,12 @@ static void attach_exception_port(thread_port_t thread, int segv_only) HANDLE_MACH_ERROR("thread_set_exception_ports", ret); } -static int jl_thread_suspend_and_get_state2(int tid, 
host_thread_state_t *ctx) +static int jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx) JL_NOTSAFEPOINT { jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; if (ptls2 == NULL) // this thread is not alive return 0; - jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL; + jl_task_t *ct2 = jl_atomic_load_relaxed(&ptls2->current_task); if (ct2 == NULL) // this thread is already dead return 0; @@ -404,18 +407,18 @@ static int jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx) return 1; } -static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t **ctx) +int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx) { (void)timeout; - static host_thread_state_t state; + host_thread_state_t state; if (!jl_thread_suspend_and_get_state2(tid, &state)) { - *ctx = NULL; - return; + return 0; } - *ctx = (unw_context_t*)&state; + *ctx = *(unw_context_t*)&state; + return 1; } -static void jl_thread_resume(int tid, int sig) +void jl_thread_resume(int tid) { jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; mach_port_t thread = pthread_mach_thread_np(ptls2->system_id); @@ -568,7 +571,12 @@ static int jl_lock_profile_mach(int dlsymlock) // workaround for old keymgr bugs void *unused = NULL; int keymgr_locked = _keymgr_get_and_lock_processwide_ptr_2(KEYMGR_GCC3_DW2_OBJ_LIST, &unused) == 0; - // workaround for new dlsym4 bugs (API and bugs introduced in macOS 12.1) + // workaround for new dlsym4 bugs in the workaround for dlsym bugs: _dyld_atfork_prepare + // acquires its locks in the wrong order, but fortunately we happen to be able to guard it + // with this call to force it to prevent that TSAN violation from causing a deadlock + if (dlsymlock && _dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) + _dyld_dlopen_atfork_prepare(); + // workaround for new dlsym4 bugs (API and bugs introduced circa macOS 12.1) if (dlsymlock && _dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) _dyld_atfork_prepare(); return keymgr_locked; @@ -576,15 +584,24 @@ static int jl_lock_profile_mach(int dlsymlock) static void jl_unlock_profile_mach(int dlsymlock, int keymgr_locked) { - if (dlsymlock && _dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) \ - _dyld_atfork_parent(); \ + if (dlsymlock && _dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) + _dyld_atfork_parent(); + if (dlsymlock && _dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) + _dyld_dlopen_atfork_parent(); if (keymgr_locked) _keymgr_unlock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST); jl_unlock_profile(); } -#define jl_lock_profile() int keymgr_locked = jl_lock_profile_mach(1) -#define jl_unlock_profile() jl_unlock_profile_mach(1, keymgr_locked) +int jl_lock_stackwalk(void) +{ + return jl_lock_profile_mach(1); +} + +void jl_unlock_stackwalk(int lockret) +{ + jl_unlock_profile_mach(1, lockret); +} void *mach_profile_listener(void *arg) { @@ -615,15 +632,19 @@ void *mach_profile_listener(void *arg) break; } + if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) + _dyld_dlopen_atfork_prepare(); if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) _dyld_atfork_prepare(); // briefly acquire the dlsym lock host_thread_state_t state; - if (!jl_thread_suspend_and_get_state2(i, &state)) - continue; + int valid_thread = jl_thread_suspend_and_get_state2(i, &state); unw_context_t *uc = (unw_context_t*)&state; if (_dyld_atfork_prepare != NULL && 
_dyld_atfork_parent != NULL) _dyld_atfork_parent(); // quickly release the dlsym lock - + if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) + _dyld_dlopen_atfork_parent(); + if (!valid_thread) + continue; if (running) { #ifdef LLVMLIBUNWIND /* @@ -677,7 +698,7 @@ void *mach_profile_listener(void *arg) bt_data_prof[bt_size_cur++].uintptr = 0; } // We're done! Resume the thread. - jl_thread_resume(i, 0); + jl_thread_resume(i); } jl_unlock_profile_mach(0, keymgr_locked); if (running) { diff --git a/src/signals-unix.c b/src/signals-unix.c index 8d02aa96a8586..f38389913aa59 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -293,6 +293,18 @@ int exc_reg_is_write_fault(uintptr_t esr) { #include "signals-mach.c" #else +int jl_lock_stackwalk(void) +{ + jl_lock_profile(); + return 0; +} + +void jl_unlock_stackwalk(int lockret) +{ + (void)lockret; + jl_unlock_profile(); +} + #if defined(_OS_LINUX_) && (defined(_CPU_X86_64_) || defined(_CPU_X86_)) int is_write_fault(void *context) { @@ -386,12 +398,12 @@ JL_NO_ASAN static void segv_handler(int sig, siginfo_t *info, void *context) } #if !defined(JL_DISABLE_LIBUNWIND) -static unw_context_t *signal_context; +static bt_context_t *signal_context; pthread_mutex_t in_signal_lock; static pthread_cond_t exit_signal_cond; static pthread_cond_t signal_caught_cond; -static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t **ctx) +int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx) { struct timespec ts; clock_gettime(CLOCK_REALTIME, &ts); @@ -401,9 +413,8 @@ static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL; if (ct2 == NULL) { // this thread is not alive or already dead - *ctx = NULL; pthread_mutex_unlock(&in_signal_lock); - return; + return 0; } jl_atomic_store_release(&ptls2->signal_request, 1); pthread_kill(ptls2->system_id, SIGUSR2); @@ -412,9 +423,8 @@ static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t if (err == ETIMEDOUT) { sig_atomic_t request = 1; if (jl_atomic_cmpswap(&ptls2->signal_request, &request, 0)) { - *ctx = NULL; pthread_mutex_unlock(&in_signal_lock); - return; + return 0; } // Request is either now 0 (meaning the other thread is waiting for // exit_signal_cond already), @@ -431,15 +441,16 @@ static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t // checking it is 0, and add an acquire barrier for good measure) int request = jl_atomic_load_acquire(&ptls2->signal_request); assert(request == 0); (void) request; - *ctx = signal_context; + jl_atomic_store_release(&ptls2->signal_request, 1); // prepare to resume normally + *ctx = *signal_context; + return 1; } -static void jl_thread_resume(int tid, int sig) +void jl_thread_resume(int tid) { jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; - jl_atomic_store_release(&ptls2->signal_request, sig == -1 ? 
3 : 1); pthread_cond_broadcast(&exit_signal_cond); - pthread_cond_wait(&signal_caught_cond, &in_signal_lock); // wait for thread to acknowledge + pthread_cond_wait(&signal_caught_cond, &in_signal_lock); // wait for thread to acknowledge (so that signal_request doesn't get mixed up) // The other thread is waiting to leave exit_signal_cond (verify that here by // checking it is 0, and add an acquire barrier for good measure) int request = jl_atomic_load_acquire(&ptls2->signal_request); @@ -474,14 +485,14 @@ CFI_NORETURN static void jl_exit_thread0(int signo, jl_bt_element_t *bt_data, size_t bt_size) { jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; - unw_context_t *signal_context; + bt_context_t signal_context; // This also makes sure `sleep` is aborted. - jl_thread_suspend_and_get_state(0, 30, &signal_context); - if (signal_context != NULL) { + if (jl_thread_suspend_and_get_state(0, 30, &signal_context)) { thread0_exit_signo = signo; ptls2->bt_size = bt_size; // <= JL_MAX_BT_SIZE memcpy(ptls2->bt_data, bt_data, ptls2->bt_size * sizeof(bt_data[0])); - jl_thread_resume(0, -1); // resume with message 3 (call jl_exit_thread0_cb) + jl_atomic_store_release(&ptls2->signal_request, 3); + jl_thread_resume(0); // resume with message 3 (call jl_exit_thread0_cb) } else { // thread 0 is gone? just do the exit ourself @@ -877,11 +888,11 @@ static void *signal_listener(void *arg) int nthreads = jl_atomic_load_acquire(&jl_n_threads); bt_size = 0; #if !defined(JL_DISABLE_LIBUNWIND) - unw_context_t *signal_context; + bt_context_t signal_context; // sample each thread, round-robin style in reverse order // (so that thread zero gets notified last) if (critical || profile) { - jl_lock_profile(); + int lockret = jl_lock_stackwalk(); int *randperm; if (profile) randperm = profile_get_randperm(nthreads); @@ -889,8 +900,7 @@ static void *signal_listener(void *arg) // Stop the threads in the random or reverse round-robin order. int i = profile ? 
randperm[idx] : idx; // notify thread to stop - jl_thread_suspend_and_get_state(i, 1, &signal_context); - if (signal_context == NULL) + if (!jl_thread_suspend_and_get_state(i, 1, &signal_context)) continue; // do backtrace on thread contexts for critical signals @@ -898,7 +908,7 @@ static void *signal_listener(void *arg) if (critical) { bt_size += rec_backtrace_ctx(bt_data + bt_size, JL_MAX_BT_SIZE / nthreads - 1, - signal_context, NULL); + &signal_context, NULL); bt_data[bt_size++].uintptr = 0; } @@ -920,7 +930,7 @@ static void *signal_listener(void *arg) } else { // Get backtrace data bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, - bt_size_max - bt_size_cur - 1, signal_context, NULL); + bt_size_max - bt_size_cur - 1, &signal_context, NULL); } jl_set_safe_restore(old_buf); @@ -945,9 +955,9 @@ static void *signal_listener(void *arg) } // notify thread to resume - jl_thread_resume(i, sig); + jl_thread_resume(i); } - jl_unlock_profile(); + jl_unlock_stackwalk(lockret); } #ifndef HAVE_MACH if (profile && running) { @@ -985,7 +995,7 @@ static void *signal_listener(void *arg) jl_safe_printf("\nsignal (%d): %s\n", sig, strsignal(sig)); size_t i; for (i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) { - jl_print_bt_entry_codeloc(bt_data + i); + jl_print_bt_entry_codeloc(-1, bt_data + i); } } } diff --git a/src/signals-win.c b/src/signals-win.c index 5dd6b34558ca6..d1f83d6bfdcc4 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -327,7 +327,7 @@ LONG WINAPI jl_exception_handler(struct _EXCEPTION_POINTERS *ExceptionInfo) jl_safe_printf("UNKNOWN"); break; } jl_safe_printf(" at 0x%Ix -- ", (size_t)ExceptionInfo->ExceptionRecord->ExceptionAddress); - jl_print_native_codeloc((uintptr_t)ExceptionInfo->ExceptionRecord->ExceptionAddress); + jl_print_native_codeloc("", (uintptr_t)ExceptionInfo->ExceptionRecord->ExceptionAddress); jl_critical_error(0, 0, ExceptionInfo->ContextRecord, ct); static int recursion = 0; @@ -344,6 +344,54 @@ JL_DLLEXPORT void jl_install_sigint_handler(void) static volatile HANDLE hBtThread = 0; +int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx) +{ + (void)timeout; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + if (ptls2 == NULL) // this thread is not alive + return 0; + jl_task_t *ct2 = jl_atomic_load_relaxed(&ptls2->current_task); + if (ct2 == NULL) // this thread is already dead + return 0; + HANDLE hThread = ptls2->system_id; + if ((DWORD)-1 == SuspendThread(hThread)) + return 0; + assert(sizeof(*ctx) == sizeof(CONTEXT)); + memset(ctx, 0, sizeof(CONTEXT)); + ctx->ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; + if (!GetThreadContext(hThread, ctx)) { + if ((DWORD)-1 == ResumeThread(hThread)) + abort(); + return 0; + } + return 1; +} + +void jl_thread_resume(int tid) +{ + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + HANDLE hThread = ptls2->system_id; + if ((DWORD)-1 == ResumeThread(hThread)) { + fputs("failed to resume main thread! 
aborting.", stderr); + abort(); + } +} + +int jl_lock_stackwalk(void) +{ + uv_mutex_lock(&jl_in_stackwalk); + jl_lock_profile(); + return 0; +} + +void jl_unlock_stackwalk(int lockret) +{ + (void)lockret; + jl_unlock_profile(); + uv_mutex_unlock(&jl_in_stackwalk); +} + + static DWORD WINAPI profile_bt( LPVOID lparam ) { // Note: illegal to use jl_* functions from this thread except for profiling-specific functions @@ -357,58 +405,45 @@ static DWORD WINAPI profile_bt( LPVOID lparam ) continue; } else { - uv_mutex_lock(&jl_in_stackwalk); - jl_lock_profile(); - if ((DWORD)-1 == SuspendThread(hMainThread)) { - fputs("failed to suspend main thread. aborting profiling.", stderr); - break; - } + // TODO: bring this up to parity with other OS by adding loop over tid here + int lockret = jl_lock_stackwalk(); CONTEXT ctxThread; - memset(&ctxThread, 0, sizeof(CONTEXT)); - ctxThread.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; - if (!GetThreadContext(hMainThread, &ctxThread)) { - fputs("failed to get context from main thread. aborting profiling.", stderr); + if (!jl_thread_suspend_and_get_state(0, 0, &ctxThread)) { + jl_unlock_stackwalk(lockret); + fputs("failed to suspend main thread. aborting profiling.", stderr); jl_profile_stop_timer(); + break; } - else { - // Get backtrace data - bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, - bt_size_max - bt_size_cur - 1, &ctxThread, NULL); + // Get backtrace data + bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, + bt_size_max - bt_size_cur - 1, &ctxThread, NULL); - jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; // given only profiling hMainThread + jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; // given only profiling hMainThread - // store threadid but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; + // store threadid but add 1 as 0 is preserved to indicate end of block + bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; - // store task id (never null) - bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); + // store task id (never null) + bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); - // store cpu cycle clock - bt_data_prof[bt_size_cur++].uintptr = cycleclock(); + // store cpu cycle clock + bt_data_prof[bt_size_cur++].uintptr = cycleclock(); - // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1; + // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block + bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1; - // Mark the end of this block with two 0's - bt_data_prof[bt_size_cur++].uintptr = 0; - bt_data_prof[bt_size_cur++].uintptr = 0; - } - jl_unlock_profile(); - uv_mutex_unlock(&jl_in_stackwalk); - if ((DWORD)-1 == ResumeThread(hMainThread)) { - jl_profile_stop_timer(); - fputs("failed to resume main thread! 
aborting.", stderr); - jl_gc_debug_critical_error(); - abort(); - } + // Mark the end of this block with two 0's + bt_data_prof[bt_size_cur++].uintptr = 0; + bt_data_prof[bt_size_cur++].uintptr = 0; + jl_unlock_stackwalk(lockret); + jl_thread_resume(0); jl_check_profile_autostop(); } } } - jl_unlock_profile(); uv_mutex_unlock(&jl_in_stackwalk); jl_profile_stop_timer(); - hBtThread = 0; + hBtThread = NULL; return 0; } diff --git a/src/stackwalk.c b/src/stackwalk.c index 6c5eb6f4537cc..d6c71a2f1b01b 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -612,22 +612,25 @@ JL_DLLEXPORT jl_value_t *jl_lookup_code_address(void *ip, int skipC) return rs; } -static void jl_safe_print_codeloc(const char* func_name, const char* file_name, +static void jl_safe_print_codeloc(const char *pre_str, + const char* func_name, const char* file_name, int line, int inlined) JL_NOTSAFEPOINT { const char *inlined_str = inlined ? " [inlined]" : ""; if (line != -1) { - jl_safe_printf("%s at %s:%d%s\n", func_name, file_name, line, inlined_str); + jl_safe_printf("%s%s at %s:%d%s\n", + pre_str, func_name, file_name, line, inlined_str); } else { - jl_safe_printf("%s at %s (unknown line)%s\n", func_name, file_name, inlined_str); + jl_safe_printf("%s%s at %s (unknown line)%s\n", + pre_str, func_name, file_name, inlined_str); } } // Print function, file and line containing native instruction pointer `ip` by // looking up debug info. Prints multiple such frames when `ip` points to // inlined code. -void jl_print_native_codeloc(uintptr_t ip) JL_NOTSAFEPOINT +void jl_print_native_codeloc(char *pre_str, uintptr_t ip) JL_NOTSAFEPOINT { // This function is not allowed to reference any TLS variables since // it can be called from an unmanaged thread on OSX. @@ -639,10 +642,11 @@ void jl_print_native_codeloc(uintptr_t ip) JL_NOTSAFEPOINT for (i = 0; i < n; i++) { jl_frame_t frame = frames[i]; if (!frame.func_name) { - jl_safe_printf("unknown function (ip: %p)\n", (void*)ip); + jl_safe_printf("%sunknown function (ip: %p)\n", pre_str, (void*)ip); } else { - jl_safe_print_codeloc(frame.func_name, frame.file_name, frame.line, frame.inlined); + jl_safe_print_codeloc(pre_str, frame.func_name, + frame.file_name, frame.line, frame.inlined); free(frame.func_name); free(frame.file_name); } @@ -651,10 +655,17 @@ void jl_print_native_codeloc(uintptr_t ip) JL_NOTSAFEPOINT } // Print code location for backtrace buffer entry at *bt_entry -void jl_print_bt_entry_codeloc(jl_bt_element_t *bt_entry) JL_NOTSAFEPOINT +void jl_print_bt_entry_codeloc(int sig, jl_bt_element_t *bt_entry) JL_NOTSAFEPOINT { + char sig_str[32], pre_str[64]; + sig_str[0] = '\0'; + if (sig != -1) { + snprintf(sig_str, 32, "signal (%d) ", sig); + } + snprintf(pre_str, 64, "%sthread (%d) ", sig_str, jl_threadid() + 1); + if (jl_bt_is_native(bt_entry)) { - jl_print_native_codeloc(bt_entry[0].uintptr); + jl_print_native_codeloc(pre_str, bt_entry[0].uintptr); } else if (jl_bt_entry_tag(bt_entry) == JL_BT_INTERP_FRAME_TAG) { size_t ip = jl_bt_entry_header(bt_entry); @@ -680,7 +691,7 @@ void jl_print_bt_entry_codeloc(jl_bt_element_t *bt_entry) JL_NOTSAFEPOINT method = (jl_value_t*)((jl_method_t*)method)->name; if (jl_is_symbol(method)) func_name = jl_symbol_name((jl_sym_t*)method); - jl_safe_print_codeloc(func_name, jl_symbol_name(locinfo->file), + jl_safe_print_codeloc(pre_str, func_name, jl_symbol_name(locinfo->file), locinfo->line, locinfo->inlined_at); debuginfoloc = locinfo->inlined_at; } @@ -854,7 +865,7 @@ _os_ptr_munge(uintptr_t ptr) extern bt_context_t 
*jl_to_bt_context(void *sigctx); -void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT +static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT { jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; @@ -863,222 +874,242 @@ void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0); return; } - if (t->copy_stack || !t->started || t->stkbuf == NULL) - return; - int16_t old = -1; - if (!jl_atomic_cmpswap(&t->tid, &old, ptls->tid) && old != ptls->tid) - return; bt_context_t *context = NULL; -#if defined(_OS_WINDOWS_) bt_context_t c; - memset(&c, 0, sizeof(c)); - _JUMP_BUFFER *mctx = (_JUMP_BUFFER*)&t->ctx.ctx.uc_mcontext; + int16_t old = -1; + while (!jl_atomic_cmpswap(&t->tid, &old, ptls->tid) && old != ptls->tid) { + int lockret = jl_lock_stackwalk(); + // if this task is already running somewhere, we need to stop the thread it is running on and query its state + if (!jl_thread_suspend_and_get_state(old, 0, &c)) { + jl_unlock_stackwalk(lockret); + return; + } + jl_unlock_stackwalk(lockret); + if (jl_atomic_load_relaxed(&t->tid) == old) { + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[old]; + if (ptls2->previous_task == t || // we might print the wrong stack here, since we can't know whether we executed the swapcontext yet or not, but it at least avoids trying to access the state inside uc_mcontext which might not be set yet + (ptls2->previous_task == NULL && jl_atomic_load_relaxed(&ptls2->current_task) == t)) { // this case should be always accurate + // use the thread context for the unwind state + context = &c; + } + break; + } + // got the wrong thread stopped, try again + jl_thread_resume(old); + } + if (context == NULL && (!t->copy_stack && t->started && t->stkbuf != NULL)) { + // need to read the context from the task stored state +#if defined(_OS_WINDOWS_) + memset(&c, 0, sizeof(c)); + _JUMP_BUFFER *mctx = (_JUMP_BUFFER*)&t->ctx.ctx.uc_mcontext; #if defined(_CPU_X86_64_) - c.Rbx = mctx->Rbx; - c.Rsp = mctx->Rsp; - c.Rbp = mctx->Rbp; - c.Rsi = mctx->Rsi; - c.Rdi = mctx->Rdi; - c.R12 = mctx->R12; - c.R13 = mctx->R13; - c.R14 = mctx->R14; - c.R15 = mctx->R15; - c.Rip = mctx->Rip; - memcpy(&c.Xmm6, &mctx->Xmm6, 10 * sizeof(mctx->Xmm6)); // Xmm6-Xmm15 + c.Rbx = mctx->Rbx; + c.Rsp = mctx->Rsp; + c.Rbp = mctx->Rbp; + c.Rsi = mctx->Rsi; + c.Rdi = mctx->Rdi; + c.R12 = mctx->R12; + c.R13 = mctx->R13; + c.R14 = mctx->R14; + c.R15 = mctx->R15; + c.Rip = mctx->Rip; + memcpy(&c.Xmm6, &mctx->Xmm6, 10 * sizeof(mctx->Xmm6)); // Xmm6-Xmm15 #else - c.Eip = mctx->Eip; - c.Esp = mctx->Esp; - c.Ebp = mctx->Ebp; + c.Eip = mctx->Eip; + c.Esp = mctx->Esp; + c.Ebp = mctx->Ebp; #endif - context = &c; + context = &c; #elif defined(JL_HAVE_UNW_CONTEXT) - context = &t->ctx.ctx; + context = &t->ctx.ctx; #elif defined(JL_HAVE_UCONTEXT) - context = jl_to_bt_context(&t->ctx.ctx); + context = jl_to_bt_context(&t->ctx.ctx); #elif defined(JL_HAVE_ASM) - bt_context_t c; - memset(&c, 0, sizeof(c)); - #if defined(_OS_LINUX_) && defined(__GLIBC__) - __jmp_buf *mctx = &t->ctx.ctx.uc_mcontext->__jmpbuf; - mcontext_t *mc = &c.uc_mcontext; - #if defined(_CPU_X86_) - // https://github.com/bminor/glibc/blame/master/sysdeps/i386/__longjmp.S - // https://github.com/bminor/glibc/blame/master/sysdeps/i386/jmpbuf-offsets.h - // https://github.com/bminor/musl/blame/master/src/setjmp/i386/longjmp.s - mc->gregs[REG_EBX] = (*mctx)[0]; - mc->gregs[REG_ESI] = (*mctx)[1]; - mc->gregs[REG_EDI] = (*mctx)[2]; - mc->gregs[REG_EBP] = 
(*mctx)[3]; - mc->gregs[REG_ESP] = (*mctx)[4]; - mc->gregs[REG_EIP] = (*mctx)[5]; - // ifdef PTR_DEMANGLE ? - mc->gregs[REG_ESP] = ptr_demangle(mc->gregs[REG_ESP]); - mc->gregs[REG_EIP] = ptr_demangle(mc->gregs[REG_EIP]); - context = &c; - #elif defined(_CPU_X86_64_) - // https://github.com/bminor/glibc/blame/master/sysdeps/x86_64/__longjmp.S - // https://github.com/bminor/glibc/blame/master/sysdeps/x86_64/jmpbuf-offsets.h - // https://github.com/bminor/musl/blame/master/src/setjmp/x86_64/setjmp.s - mc->gregs[REG_RBX] = (*mctx)[0]; - mc->gregs[REG_RBP] = (*mctx)[1]; - mc->gregs[REG_R12] = (*mctx)[2]; - mc->gregs[REG_R13] = (*mctx)[3]; - mc->gregs[REG_R14] = (*mctx)[4]; - mc->gregs[REG_R15] = (*mctx)[5]; - mc->gregs[REG_RSP] = (*mctx)[6]; - mc->gregs[REG_RIP] = (*mctx)[7]; - // ifdef PTR_DEMANGLE ? - mc->gregs[REG_RBP] = ptr_demangle(mc->gregs[REG_RBP]); - mc->gregs[REG_RSP] = ptr_demangle(mc->gregs[REG_RSP]); - mc->gregs[REG_RIP] = ptr_demangle(mc->gregs[REG_RIP]); - context = &c; - #elif defined(_CPU_ARM_) - // https://github.com/bminor/glibc/blame/master/sysdeps/arm/__longjmp.S - // https://github.com/bminor/glibc/blame/master/sysdeps/arm/include/bits/setjmp.h - // https://github.com/bminor/musl/blame/master/src/setjmp/arm/longjmp.S - mc->arm_sp = (*mctx)[0]; - mc->arm_lr = (*mctx)[1]; - mc->arm_r4 = (*mctx)[2]; // aka v1 - mc->arm_r5 = (*mctx)[3]; // aka v2 - mc->arm_r6 = (*mctx)[4]; // aka v3 - mc->arm_r7 = (*mctx)[5]; // aka v4 - mc->arm_r8 = (*mctx)[6]; // aka v5 - mc->arm_r9 = (*mctx)[7]; // aka v6 aka sb - mc->arm_r10 = (*mctx)[8]; // aka v7 aka sl - mc->arm_fp = (*mctx)[10]; // aka v8 aka r11 - // ifdef PTR_DEMANGLE ? - mc->arm_sp = ptr_demangle(mc->arm_sp); - mc->arm_lr = ptr_demangle(mc->arm_lr); - mc->arm_pc = mc->arm_lr; - context = &c; - #elif defined(_CPU_AARCH64_) - // https://github.com/bminor/glibc/blame/master/sysdeps/aarch64/__longjmp.S - // https://github.com/bminor/glibc/blame/master/sysdeps/aarch64/jmpbuf-offsets.h - // https://github.com/bminor/musl/blame/master/src/setjmp/aarch64/longjmp.s - // https://github.com/libunwind/libunwind/blob/ec171c9ba7ea3abb2a1383cee2988a7abd483a1f/src/aarch64/unwind_i.h#L62 - unw_fpsimd_context_t *mcfp = (unw_fpsimd_context_t*)&mc->__reserved; - mc->regs[19] = (*mctx)[0]; - mc->regs[20] = (*mctx)[1]; - mc->regs[21] = (*mctx)[2]; - mc->regs[22] = (*mctx)[3]; - mc->regs[23] = (*mctx)[4]; - mc->regs[24] = (*mctx)[5]; - mc->regs[25] = (*mctx)[6]; - mc->regs[26] = (*mctx)[7]; - mc->regs[27] = (*mctx)[8]; - mc->regs[28] = (*mctx)[9]; - mc->regs[29] = (*mctx)[10]; // aka fp - mc->regs[30] = (*mctx)[11]; // aka lr - // Yes, they did skip 12 why writing the code originally; and, no, I do not know why. - mc->sp = (*mctx)[13]; - mcfp->vregs[7] = (*mctx)[14]; // aka d8 - mcfp->vregs[8] = (*mctx)[15]; // aka d9 - mcfp->vregs[9] = (*mctx)[16]; // aka d10 - mcfp->vregs[10] = (*mctx)[17]; // aka d11 - mcfp->vregs[11] = (*mctx)[18]; // aka d12 - mcfp->vregs[12] = (*mctx)[19]; // aka d13 - mcfp->vregs[13] = (*mctx)[20]; // aka d14 - mcfp->vregs[14] = (*mctx)[21]; // aka d15 - // ifdef PTR_DEMANGLE ? 
- mc->sp = ptr_demangle(mc->sp); - mc->regs[30] = ptr_demangle(mc->regs[30]); - mc->pc = mc->regs[30]; - context = &c; - #else - #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown linux") - (void)mc; - (void)c; - (void)mctx; - #endif - #elif defined(_OS_DARWIN_) - sigjmp_buf *mctx = &t->ctx.ctx.uc_mcontext; - #if defined(_CPU_X86_64_) - // from https://github.com/apple/darwin-libplatform/blob/main/src/setjmp/x86_64/_setjmp.s - x86_thread_state64_t *mc = (x86_thread_state64_t*)&c; - mc->__rbx = ((uint64_t*)mctx)[0]; - mc->__rbp = ((uint64_t*)mctx)[1]; - mc->__rsp = ((uint64_t*)mctx)[2]; - mc->__r12 = ((uint64_t*)mctx)[3]; - mc->__r13 = ((uint64_t*)mctx)[4]; - mc->__r14 = ((uint64_t*)mctx)[5]; - mc->__r15 = ((uint64_t*)mctx)[6]; - mc->__rip = ((uint64_t*)mctx)[7]; - // added in libsystem_plaform 177.200.16 (macOS Mojave 10.14.3) - // prior to that _os_ptr_munge_token was (hopefully) typically 0, - // so x ^ 0 == x and this is a no-op - mc->__rbp = _OS_PTR_UNMUNGE(mc->__rbp); - mc->__rsp = _OS_PTR_UNMUNGE(mc->__rsp); - mc->__rip = _OS_PTR_UNMUNGE(mc->__rip); - context = &c; - #elif defined(_CPU_AARCH64_) - // from https://github.com/apple/darwin-libplatform/blob/main/src/setjmp/arm64/setjmp.s - // https://github.com/apple/darwin-xnu/blob/main/osfmk/mach/arm/_structs.h - // https://github.com/llvm/llvm-project/blob/7714e0317520207572168388f22012dd9e152e9e/libunwind/src/Registers.hpp -> Registers_arm64 - arm_thread_state64_t *mc = (arm_thread_state64_t*)&c; - mc->__x[19] = ((uint64_t*)mctx)[0]; - mc->__x[20] = ((uint64_t*)mctx)[1]; - mc->__x[21] = ((uint64_t*)mctx)[2]; - mc->__x[22] = ((uint64_t*)mctx)[3]; - mc->__x[23] = ((uint64_t*)mctx)[4]; - mc->__x[24] = ((uint64_t*)mctx)[5]; - mc->__x[25] = ((uint64_t*)mctx)[6]; - mc->__x[26] = ((uint64_t*)mctx)[7]; - mc->__x[27] = ((uint64_t*)mctx)[8]; - mc->__x[28] = ((uint64_t*)mctx)[9]; - mc->__x[10] = ((uint64_t*)mctx)[10]; - mc->__x[11] = ((uint64_t*)mctx)[11]; - mc->__x[12] = ((uint64_t*)mctx)[12]; - // 13 is reserved/unused - double *mcfp = (double*)&mc[1]; - mcfp[7] = ((uint64_t*)mctx)[14]; // aka d8 - mcfp[8] = ((uint64_t*)mctx)[15]; // aka d9 - mcfp[9] = ((uint64_t*)mctx)[16]; // aka d10 - mcfp[10] = ((uint64_t*)mctx)[17]; // aka d11 - mcfp[11] = ((uint64_t*)mctx)[18]; // aka d12 - mcfp[12] = ((uint64_t*)mctx)[19]; // aka d13 - mcfp[13] = ((uint64_t*)mctx)[20]; // aka d14 - mcfp[14] = ((uint64_t*)mctx)[21]; // aka d15 - mc->__fp = _OS_PTR_UNMUNGE(mc->__x[10]); - mc->__lr = _OS_PTR_UNMUNGE(mc->__x[11]); - mc->__x[12] = _OS_PTR_UNMUNGE(mc->__x[12]); - mc->__sp = mc->__x[12]; - // libunwind is broken for signed-pointers, but perhaps best not to leave the signed pointer lying around either - mc->__pc = ptrauth_strip(mc->__lr, 0); - mc->__pad = 0; // aka __ra_sign_state = not signed - context = &c; - #else - #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown darwin") - (void)mctx; - (void)c; - #endif - #elif defined(_OS_FREEBSD_) && defined(_CPU_X86_64_) - sigjmp_buf *mctx = &t->ctx.ctx.uc_mcontext; - mcontext_t *mc = &c.uc_mcontext; - // https://github.com/freebsd/freebsd-src/blob/releng/13.1/lib/libc/amd64/gen/_setjmp.S - mc->mc_rip = ((long*)mctx)[0]; - mc->mc_rbx = ((long*)mctx)[1]; - mc->mc_rsp = ((long*)mctx)[2]; - mc->mc_rbp = ((long*)mctx)[3]; - mc->mc_r12 = ((long*)mctx)[4]; - mc->mc_r13 = ((long*)mctx)[5]; - mc->mc_r14 = ((long*)mctx)[6]; - mc->mc_r15 = ((long*)mctx)[7]; - context = &c; - #else - #pragma message("jl_rec_backtrace not defined for 
ASM/SETJMP on unknown system")
-    (void)c;
-    #endif
+        memset(&c, 0, sizeof(c));
+    #if defined(_OS_LINUX_) && defined(__GLIBC__)
+        __jmp_buf *mctx = &t->ctx.ctx.uc_mcontext->__jmpbuf;
+        mcontext_t *mc = &c.uc_mcontext;
+    #if defined(_CPU_X86_)
+        // https://github.com/bminor/glibc/blame/master/sysdeps/i386/__longjmp.S
+        // https://github.com/bminor/glibc/blame/master/sysdeps/i386/jmpbuf-offsets.h
+        // https://github.com/bminor/musl/blame/master/src/setjmp/i386/longjmp.s
+        mc->gregs[REG_EBX] = (*mctx)[0];
+        mc->gregs[REG_ESI] = (*mctx)[1];
+        mc->gregs[REG_EDI] = (*mctx)[2];
+        mc->gregs[REG_EBP] = (*mctx)[3];
+        mc->gregs[REG_ESP] = (*mctx)[4];
+        mc->gregs[REG_EIP] = (*mctx)[5];
+        // ifdef PTR_DEMANGLE ?
+        mc->gregs[REG_ESP] = ptr_demangle(mc->gregs[REG_ESP]);
+        mc->gregs[REG_EIP] = ptr_demangle(mc->gregs[REG_EIP]);
+        context = &c;
+    #elif defined(_CPU_X86_64_)
+        // https://github.com/bminor/glibc/blame/master/sysdeps/x86_64/__longjmp.S
+        // https://github.com/bminor/glibc/blame/master/sysdeps/x86_64/jmpbuf-offsets.h
+        // https://github.com/bminor/musl/blame/master/src/setjmp/x86_64/setjmp.s
+        mc->gregs[REG_RBX] = (*mctx)[0];
+        mc->gregs[REG_RBP] = (*mctx)[1];
+        mc->gregs[REG_R12] = (*mctx)[2];
+        mc->gregs[REG_R13] = (*mctx)[3];
+        mc->gregs[REG_R14] = (*mctx)[4];
+        mc->gregs[REG_R15] = (*mctx)[5];
+        mc->gregs[REG_RSP] = (*mctx)[6];
+        mc->gregs[REG_RIP] = (*mctx)[7];
+        // ifdef PTR_DEMANGLE ?
+        mc->gregs[REG_RBP] = ptr_demangle(mc->gregs[REG_RBP]);
+        mc->gregs[REG_RSP] = ptr_demangle(mc->gregs[REG_RSP]);
+        mc->gregs[REG_RIP] = ptr_demangle(mc->gregs[REG_RIP]);
+        context = &c;
+    #elif defined(_CPU_ARM_)
+        // https://github.com/bminor/glibc/blame/master/sysdeps/arm/__longjmp.S
+        // https://github.com/bminor/glibc/blame/master/sysdeps/arm/include/bits/setjmp.h
+        // https://github.com/bminor/musl/blame/master/src/setjmp/arm/longjmp.S
+        mc->arm_sp = (*mctx)[0];
+        mc->arm_lr = (*mctx)[1];
+        mc->arm_r4 = (*mctx)[2]; // aka v1
+        mc->arm_r5 = (*mctx)[3]; // aka v2
+        mc->arm_r6 = (*mctx)[4]; // aka v3
+        mc->arm_r7 = (*mctx)[5]; // aka v4
+        mc->arm_r8 = (*mctx)[6]; // aka v5
+        mc->arm_r9 = (*mctx)[7]; // aka v6 aka sb
+        mc->arm_r10 = (*mctx)[8]; // aka v7 aka sl
+        mc->arm_fp = (*mctx)[10]; // aka v8 aka r11
+        // ifdef PTR_DEMANGLE ?
+        mc->arm_sp = ptr_demangle(mc->arm_sp);
+        mc->arm_lr = ptr_demangle(mc->arm_lr);
+        mc->arm_pc = mc->arm_lr;
+        context = &c;
+    #elif defined(_CPU_AARCH64_)
+        // https://github.com/bminor/glibc/blame/master/sysdeps/aarch64/__longjmp.S
+        // https://github.com/bminor/glibc/blame/master/sysdeps/aarch64/jmpbuf-offsets.h
+        // https://github.com/bminor/musl/blame/master/src/setjmp/aarch64/longjmp.s
+        // https://github.com/libunwind/libunwind/blob/ec171c9ba7ea3abb2a1383cee2988a7abd483a1f/src/aarch64/unwind_i.h#L62
+        unw_fpsimd_context_t *mcfp = (unw_fpsimd_context_t*)&mc->__reserved;
+        mc->regs[19] = (*mctx)[0];
+        mc->regs[20] = (*mctx)[1];
+        mc->regs[21] = (*mctx)[2];
+        mc->regs[22] = (*mctx)[3];
+        mc->regs[23] = (*mctx)[4];
+        mc->regs[24] = (*mctx)[5];
+        mc->regs[25] = (*mctx)[6];
+        mc->regs[26] = (*mctx)[7];
+        mc->regs[27] = (*mctx)[8];
+        mc->regs[28] = (*mctx)[9];
+        mc->regs[29] = (*mctx)[10]; // aka fp
+        mc->regs[30] = (*mctx)[11]; // aka lr
+        // Yes, they did skip 12 when writing the code originally; and, no, I do not know why.
+        mc->sp = (*mctx)[13];
+        mcfp->vregs[7] = (*mctx)[14]; // aka d8
+        mcfp->vregs[8] = (*mctx)[15]; // aka d9
+        mcfp->vregs[9] = (*mctx)[16]; // aka d10
+        mcfp->vregs[10] = (*mctx)[17]; // aka d11
+        mcfp->vregs[11] = (*mctx)[18]; // aka d12
+        mcfp->vregs[12] = (*mctx)[19]; // aka d13
+        mcfp->vregs[13] = (*mctx)[20]; // aka d14
+        mcfp->vregs[14] = (*mctx)[21]; // aka d15
+        // ifdef PTR_DEMANGLE ?
+        mc->sp = ptr_demangle(mc->sp);
+        mc->regs[30] = ptr_demangle(mc->regs[30]);
+        mc->pc = mc->regs[30];
+        context = &c;
+    #else
+    #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown linux")
+        (void)mc;
+        (void)c;
+        (void)mctx;
+    #endif
+    #elif defined(_OS_DARWIN_)
+        sigjmp_buf *mctx = &t->ctx.ctx.uc_mcontext;
+    #if defined(_CPU_X86_64_)
+        // from https://github.com/apple/darwin-libplatform/blob/main/src/setjmp/x86_64/_setjmp.s
+        x86_thread_state64_t *mc = (x86_thread_state64_t*)&c;
+        mc->__rbx = ((uint64_t*)mctx)[0];
+        mc->__rbp = ((uint64_t*)mctx)[1];
+        mc->__rsp = ((uint64_t*)mctx)[2];
+        mc->__r12 = ((uint64_t*)mctx)[3];
+        mc->__r13 = ((uint64_t*)mctx)[4];
+        mc->__r14 = ((uint64_t*)mctx)[5];
+        mc->__r15 = ((uint64_t*)mctx)[6];
+        mc->__rip = ((uint64_t*)mctx)[7];
+        // added in libsystem_platform 177.200.16 (macOS Mojave 10.14.3)
+        // prior to that _os_ptr_munge_token was (hopefully) typically 0,
+        // so x ^ 0 == x and this is a no-op
+        mc->__rbp = _OS_PTR_UNMUNGE(mc->__rbp);
+        mc->__rsp = _OS_PTR_UNMUNGE(mc->__rsp);
+        mc->__rip = _OS_PTR_UNMUNGE(mc->__rip);
+        context = &c;
+    #elif defined(_CPU_AARCH64_)
+        // from https://github.com/apple/darwin-libplatform/blob/main/src/setjmp/arm64/setjmp.s
+        // https://github.com/apple/darwin-xnu/blob/main/osfmk/mach/arm/_structs.h
+        // https://github.com/llvm/llvm-project/blob/7714e0317520207572168388f22012dd9e152e9e/libunwind/src/Registers.hpp -> Registers_arm64
+        arm_thread_state64_t *mc = (arm_thread_state64_t*)&c;
+        mc->__x[19] = ((uint64_t*)mctx)[0];
+        mc->__x[20] = ((uint64_t*)mctx)[1];
+        mc->__x[21] = ((uint64_t*)mctx)[2];
+        mc->__x[22] = ((uint64_t*)mctx)[3];
+        mc->__x[23] = ((uint64_t*)mctx)[4];
+        mc->__x[24] = ((uint64_t*)mctx)[5];
+        mc->__x[25] = ((uint64_t*)mctx)[6];
+        mc->__x[26] = ((uint64_t*)mctx)[7];
+        mc->__x[27] = ((uint64_t*)mctx)[8];
+        mc->__x[28] = ((uint64_t*)mctx)[9];
+        mc->__x[10] = ((uint64_t*)mctx)[10];
+        mc->__x[11] = ((uint64_t*)mctx)[11];
+        mc->__x[12] = ((uint64_t*)mctx)[12];
+        // 13 is reserved/unused
+        double *mcfp = (double*)&mc[1];
+        mcfp[7] = ((uint64_t*)mctx)[14]; // aka d8
+        mcfp[8] = ((uint64_t*)mctx)[15]; // aka d9
+        mcfp[9] = ((uint64_t*)mctx)[16]; // aka d10
+        mcfp[10] = ((uint64_t*)mctx)[17]; // aka d11
+        mcfp[11] = ((uint64_t*)mctx)[18]; // aka d12
+        mcfp[12] = ((uint64_t*)mctx)[19]; // aka d13
+        mcfp[13] = ((uint64_t*)mctx)[20]; // aka d14
+        mcfp[14] = ((uint64_t*)mctx)[21]; // aka d15
+        mc->__fp = _OS_PTR_UNMUNGE(mc->__x[10]);
+        mc->__lr = _OS_PTR_UNMUNGE(mc->__x[11]);
+        mc->__x[12] = _OS_PTR_UNMUNGE(mc->__x[12]);
+        mc->__sp = mc->__x[12];
+        // libunwind is broken for signed-pointers, but perhaps best not to leave the signed pointer lying around either
+        mc->__pc = ptrauth_strip(mc->__lr, 0);
+        mc->__pad = 0; // aka __ra_sign_state = not signed
+        context = &c;
+    #else
+    #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown darwin")
+        (void)mctx;
+        (void)c;
+    #endif
+    #elif defined(_OS_FREEBSD_) && defined(_CPU_X86_64_)
+        sigjmp_buf *mctx = &t->ctx.ctx.uc_mcontext;
+        mcontext_t *mc = &c.uc_mcontext;
+        // https://github.com/freebsd/freebsd-src/blob/releng/13.1/lib/libc/amd64/gen/_setjmp.S
+        mc->mc_rip = ((long*)mctx)[0];
+        mc->mc_rbx = ((long*)mctx)[1];
+        mc->mc_rsp = ((long*)mctx)[2];
+        mc->mc_rbp = ((long*)mctx)[3];
+        mc->mc_r12 = ((long*)mctx)[4];
+        mc->mc_r13 = ((long*)mctx)[5];
+        mc->mc_r14 = ((long*)mctx)[6];
+        mc->mc_r15 = ((long*)mctx)[7];
+        context = &c;
+    #else
+    #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown system")
+        (void)c;
+    #endif
 #elif defined(JL_HAVE_ASYNCIFY)
-    #pragma message("jl_rec_backtrace not defined for ASYNCIFY")
+    #pragma message("jl_rec_backtrace not defined for ASYNCIFY")
 #elif defined(JL_HAVE_SIGALTSTACK)
-    #pragma message("jl_rec_backtrace not defined for SIGALTSTACK")
+    #pragma message("jl_rec_backtrace not defined for SIGALTSTACK")
 #else
-    #pragma message("jl_rec_backtrace not defined for unknown task system")
+    #pragma message("jl_rec_backtrace not defined for unknown task system")
 #endif
+    }
     if (context)
-        ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, context, t->gcstack);
+        ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, context, t->gcstack);
     if (old == -1)
         jl_atomic_store_relaxed(&t->tid, old);
+    else if (old != ptls->tid)
+        jl_thread_resume(old);
 }
 
 //--------------------------------------------------
@@ -1086,7 +1117,9 @@ void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT
 JL_DLLEXPORT void jl_gdblookup(void* ip)
 {
-    jl_print_native_codeloc((uintptr_t)ip);
+    char pre_str[64];
+    snprintf(pre_str, 64, "thread (%d) ", jl_threadid() + 1);
+    jl_print_native_codeloc(pre_str, (uintptr_t)ip);
 }
 
 // Print backtrace for current exception in catch block
@@ -1101,11 +1134,11 @@ JL_DLLEXPORT void jlbacktrace(void) JL_NOTSAFEPOINT
     size_t i, bt_size = jl_excstack_bt_size(s, s->top);
     jl_bt_element_t *bt_data = jl_excstack_bt_data(s, s->top);
     for (i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) {
-        jl_print_bt_entry_codeloc(bt_data + i);
+        jl_print_bt_entry_codeloc(-1, bt_data + i);
     }
 }
 
-// Print backtrace for specified task
+// Print backtrace for specified task via jl_safe_printf to stderr
 JL_DLLEXPORT void jlbacktracet(jl_task_t *t) JL_NOTSAFEPOINT
 {
     jl_task_t *ct = jl_current_task;
@@ -1114,7 +1147,7 @@ JL_DLLEXPORT void jlbacktracet(jl_task_t *t) JL_NOTSAFEPOINT
     size_t i, bt_size = ptls->bt_size;
     jl_bt_element_t *bt_data = ptls->bt_data;
     for (i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) {
-        jl_print_bt_entry_codeloc(bt_data + i);
+        jl_print_bt_entry_codeloc(-1, bt_data + i);
     }
 }
 
@@ -1123,46 +1156,69 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT
     jlbacktrace();
 }
 
-// Print backtraces for all live tasks, for all threads.
-// WARNING: this is dangerous and can crash if used outside of gdb, if
-// all of Julia's threads are not stopped!
+extern int gc_first_tid;
+
+// Print backtraces for all live tasks, for all threads, via jl_safe_printf to stderr
 JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT
 {
     size_t nthreads = jl_atomic_load_acquire(&jl_n_threads);
     jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states);
+    int ctid = jl_threadid() + 1;
+    jl_safe_printf("thread (%d) ++++ Task backtraces\n", ctid);
     for (size_t i = 0; i < nthreads; i++) {
+        // skip GC threads since they don't have tasks
+        if (gc_first_tid <= i && i < gc_first_tid + jl_n_gcthreads) {
+            continue;
+        }
         jl_ptls_t ptls2 = allstates[i];
-        arraylist_t *live_tasks = &ptls2->heap.live_tasks;
-        size_t n = live_tasks->len;
-        jl_safe_printf("==== Thread %d created %zu live tasks\n",
-                       ptls2->tid + 1, n + 1);
-        jl_safe_printf(" ---- Root task (%p)\n", ptls2->root_task);
-        jl_safe_printf(" (sticky: %d, started: %d, state: %d, tid: %d)\n",
-                       ptls2->root_task->sticky, ptls2->root_task->started,
-                       jl_atomic_load_relaxed(&ptls2->root_task->_state),
-                       jl_atomic_load_relaxed(&ptls2->root_task->tid) + 1);
-        jlbacktracet(ptls2->root_task);
-
-        void **lst = live_tasks->items;
-        for (size_t j = 0; j < live_tasks->len; j++) {
-            jl_task_t *t = (jl_task_t *)lst[j];
+        if (ptls2 == NULL) {
+            continue;
+        }
+        small_arraylist_t *live_tasks = &ptls2->heap.live_tasks;
+        size_t n = mtarraylist_length(live_tasks);
+        int t_state = JL_TASK_STATE_DONE;
+        jl_task_t *t = ptls2->root_task;
+        if (t != NULL)
+            t_state = jl_atomic_load_relaxed(&t->_state);
+        jl_safe_printf("thread (%d) ==== Thread %d created %zu live tasks\n",
+                       ctid, ptls2->tid + 1, n + (t_state != JL_TASK_STATE_DONE));
+        if (show_done || t_state != JL_TASK_STATE_DONE) {
+            jl_safe_printf("thread (%d) ---- Root task (%p)\n", ctid, ptls2->root_task);
+            if (t != NULL) {
+                jl_safe_printf("thread (%d) (sticky: %d, started: %d, state: %d, tid: %d)\n",
+                               ctid, t->sticky, t->started, t_state,
+                               jl_atomic_load_relaxed(&t->tid) + 1);
+                if (t->stkbuf != NULL) {
+                    jlbacktracet(t);
+                }
+                else {
+                    jl_safe_printf("thread (%d) no stack\n", ctid);
+                }
+            }
+            jl_safe_printf("thread (%d) ---- End root task\n", ctid);
+        }
+
+        for (size_t j = 0; j < n; j++) {
+            jl_task_t *t = (jl_task_t*)mtarraylist_get(live_tasks, j);
+            if (t == NULL)
+                continue;
             int t_state = jl_atomic_load_relaxed(&t->_state);
-            if (!show_done && t_state == JL_TASK_STATE_DONE) {
+            if (!show_done && t_state == JL_TASK_STATE_DONE)
                 continue;
-            }
-            jl_safe_printf(" ---- Task %zu (%p)\n", j + 1, t);
-            jl_safe_printf(" (sticky: %d, started: %d, state: %d, tid: %d)\n",
-                           t->sticky, t->started, t_state,
+            jl_safe_printf("thread (%d) ---- Task %zu (%p)\n", ctid, j + 1, t);
+            // n.b. this information might not be consistent with the stack printing after it, since it could start running or change tid, etc.
+ jl_safe_printf("thread (%d) (sticky: %d, started: %d, state: %d, tid: %d)\n", + ctid, t->sticky, t->started, t_state, jl_atomic_load_relaxed(&t->tid) + 1); if (t->stkbuf != NULL) jlbacktracet(t); else - jl_safe_printf(" no stack\n"); - jl_safe_printf(" ---- End task %zu\n", j + 1); + jl_safe_printf("thread (%d) no stack\n", ctid); + jl_safe_printf("thread (%d) ---- End task %zu\n", ctid, j + 1); } - jl_safe_printf("==== End thread %d\n", ptls2->tid + 1); + jl_safe_printf("thread (%d) ==== End thread %d\n", ctid, ptls2->tid + 1); } - jl_safe_printf("==== Done\n"); + jl_safe_printf("thread (%d) ++++ Done\n", ctid); } #ifdef __cplusplus diff --git a/src/support/MurmurHash3.c b/src/support/MurmurHash3.c index 43bd015ddd69f..1f7056d0e7e56 100644 --- a/src/support/MurmurHash3.c +++ b/src/support/MurmurHash3.c @@ -12,8 +12,6 @@ //----------------------------------------------------------------------------- // Platform-specific functions and macros -#define FORCE_INLINE inline __attribute__((always_inline)) - static inline uint32_t rotl32 ( uint32_t x, int8_t r ) { return (x << r) | (x >> (32 - r)); diff --git a/src/support/dtypes.h b/src/support/dtypes.h index d49ae0b22b5f9..0e528b5cc9b56 100644 --- a/src/support/dtypes.h +++ b/src/support/dtypes.h @@ -117,6 +117,7 @@ typedef intptr_t ssize_t; #define LLT_FREE(x) free(x) #define STATIC_INLINE static inline +#define FORCE_INLINE static inline __attribute__((always_inline)) #if defined(_OS_WINDOWS_) && !defined(_COMPILER_GCC_) # define NOINLINE __declspec(noinline) @@ -331,6 +332,23 @@ STATIC_INLINE void jl_store_unaligned_i16(void *ptr, uint16_t val) JL_NOTSAFEPOI memcpy(ptr, &val, 2); } +STATIC_INLINE void *calloc_s(size_t sz) JL_NOTSAFEPOINT { + int last_errno = errno; +#ifdef _OS_WINDOWS_ + DWORD last_error = GetLastError(); +#endif + void *p = calloc(sz == 0 ? 
1 : sz, 1);
+    if (p == NULL) {
+        perror("(julia) calloc");
+        abort();
+    }
+#ifdef _OS_WINDOWS_
+    SetLastError(last_error);
+#endif
+    errno = last_errno;
+    return p;
+}
+
 STATIC_INLINE void *malloc_s(size_t sz) JL_NOTSAFEPOINT {
     int last_errno = errno;
 #ifdef _OS_WINDOWS_
diff --git a/src/threading.c b/src/threading.c
index 8500279220825..6c17abf2c36be 100644
--- a/src/threading.c
+++ b/src/threading.c
@@ -305,6 +305,8 @@ static uv_mutex_t tls_lock; // controls write-access to these variables:
 _Atomic(jl_ptls_t*) jl_all_tls_states JL_GLOBALLY_ROOTED;
 int jl_all_tls_states_size;
 static uv_cond_t cond;
+// concurrent reads are permitted, using the same pattern as mtsmall_arraylist;
+// it is implemented separately because direct use of jl_all_tls_states is already widely prevalent as an API
 
 // return calling thread's ID
 JL_DLLEXPORT int16_t jl_threadid(void)
@@ -338,7 +340,7 @@ jl_ptls_t jl_init_threadtls(int16_t tid)
 #ifndef _OS_WINDOWS_
     pthread_setspecific(jl_task_exit_key, (void*)ptls);
 #endif
-    ptls->system_id = (jl_thread_t)(uintptr_t)uv_thread_self();
+    ptls->system_id = uv_thread_self();
     ptls->rngseed = jl_rand();
     if (tid == 0)
         ptls->disable_gc = 1;
@@ -373,10 +375,10 @@ jl_ptls_t jl_init_threadtls(int16_t tid)
     uv_cond_init(&ptls->wake_signal);
 
     uv_mutex_lock(&tls_lock);
-    jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states);
     if (tid == -1)
         tid = jl_atomic_load_relaxed(&jl_n_threads);
     ptls->tid = tid;
+    jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states);
     if (jl_all_tls_states_size <= tid) {
         int i, newsize = jl_all_tls_states_size + tid + 2;
         jl_ptls_t *newpptls = (jl_ptls_t*)calloc(newsize, sizeof(jl_ptls_t));
@@ -562,6 +564,8 @@ static void jl_check_tls(void)
 JL_DLLEXPORT const int jl_tls_elf_support = 0;
 #endif
 
+extern int gc_first_tid;
+
 // interface to Julia; sets up to make the runtime thread-safe
 void jl_init_threading(void)
 {
@@ -614,13 +618,32 @@ void jl_init_threading(void)
         }
     }
 
-    jl_all_tls_states_size = nthreads + nthreadsi;
+    int16_t ngcthreads = jl_options.ngcthreads - 1;
+    if (ngcthreads == -1 &&
+        (cp = getenv(NUM_GC_THREADS_NAME))) { // ENV[NUM_GC_THREADS_NAME] specified
+
+        ngcthreads = (uint64_t)strtol(cp, NULL, 10) - 1;
+    }
+    if (ngcthreads == -1) {
+        // if `--gcthreads` was not specified, set the number of GC threads
+        // to the number of compute threads
+        if (nthreads <= 1) {
+            ngcthreads = 0;
+        }
+        else {
+            ngcthreads = nthreads - 1;
+        }
+    }
+
+    jl_all_tls_states_size = nthreads + nthreadsi + ngcthreads;
     jl_n_threads_per_pool = (int*)malloc_s(2 * sizeof(int));
     jl_n_threads_per_pool[0] = nthreadsi;
     jl_n_threads_per_pool[1] = nthreads;
 
     jl_atomic_store_release(&jl_all_tls_states, (jl_ptls_t*)calloc(jl_all_tls_states_size, sizeof(jl_ptls_t)));
     jl_atomic_store_release(&jl_n_threads, jl_all_tls_states_size);
+    jl_n_gcthreads = ngcthreads;
+    gc_first_tid = nthreads;
 }
 
 static uv_barrier_t thread_init_done;
@@ -628,6 +651,7 @@ static uv_barrier_t thread_init_done;
 void jl_start_threads(void)
 {
     int nthreads = jl_atomic_load_relaxed(&jl_n_threads);
+    int ngcthreads = jl_n_gcthreads;
     int cpumasksize = uv_cpumask_size();
     char *cp;
     int i, exclusive;
@@ -660,15 +684,23 @@ void jl_start_threads(void)
 
     // create threads
     uv_barrier_init(&thread_init_done, nthreads);
 
+    // GC/System threads need to be after the worker threads.
+    int nworker_threads = nthreads - ngcthreads;
+
     for (i = 1; i < nthreads; ++i) {
         jl_threadarg_t *t = (jl_threadarg_t *)malloc_s(sizeof(jl_threadarg_t)); // ownership will be passed to the thread
         t->tid = i;
         t->barrier = &thread_init_done;
-        uv_thread_create(&uvtid, jl_threadfun, t);
-        if (exclusive) {
-            mask[i] = 1;
-            uv_thread_setaffinity(&uvtid, mask, NULL, cpumasksize);
-            mask[i] = 0;
+        if (i < nworker_threads) {
+            uv_thread_create(&uvtid, jl_threadfun, t);
+            if (exclusive) {
+                mask[i] = 1;
+                uv_thread_setaffinity(&uvtid, mask, NULL, cpumasksize);
+                mask[i] = 0;
+            }
+        }
+        else {
+            uv_thread_create(&uvtid, jl_gc_threadfun, t);
         }
         uv_thread_detach(&uvtid);
     }
@@ -816,6 +848,224 @@ JL_DLLEXPORT int jl_alignment(size_t sz)
     return jl_gc_alignment(sz);
 }
 
+// Heartbeat mechanism for Julia's task scheduler
+// ---
+// Start a thread that does not participate in running Julia's tasks. This
+// thread simply sleeps until the heartbeat mechanism is enabled. When
+// enabled, the heartbeat thread enters a loop in which it blocks waiting
+// for the specified heartbeat interval. If, within that interval,
+// `jl_heartbeat()` is *not* called at least once, then the thread calls
+// `jl_print_task_backtraces(0)`.
+
+#ifdef JL_HEARTBEAT_THREAD
+
+#include <time.h>
+
+volatile int heartbeat_enabled;
+uv_sem_t heartbeat_on_sem,  // jl_heartbeat_enable -> thread
+         heartbeat_off_sem; // thread -> jl_heartbeat_enable
+int heartbeat_interval_s,
+    tasks_after_n,
+    reset_tasks_after_n;
+int tasks_showed, n_hbs_missed, n_hbs_recvd;
+_Atomic(int) heartbeats;
+
+JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT;
+void jl_heartbeat_threadfun(void *arg);
+
+// start the heartbeat thread with heartbeats disabled
+void jl_init_heartbeat(void)
+{
+    uv_thread_t uvtid;
+    heartbeat_enabled = 0;
+    uv_sem_init(&heartbeat_on_sem, 0);
+    uv_sem_init(&heartbeat_off_sem, 0);
+    uv_thread_create(&uvtid, jl_heartbeat_threadfun, NULL);
+    uv_thread_detach(&uvtid);
+}
+
+// enable/disable heartbeats
+// heartbeat_s: interval within which jl_heartbeat() must be called
+// show_tasks_after_n: number of heartbeats missed before printing task backtraces
+// reset_after_n: number of heartbeats after which to reset
+//
+// When disabling heartbeats, the heartbeat thread must wake up,
+// find out that heartbeats are now disabled, and reset. For now, we
+// handle this by preventing re-enabling of heartbeats until this
+// completes.
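Before the definition below, a sketch of how an embedder might drive this heartbeat API. This is illustrative only: it assumes a build with JL_HEARTBEAT_THREAD defined, and do_some_work() is a hypothetical stand-in for the application's workload.

// Sketch: drive the heartbeat API from a host application.
extern int jl_heartbeat_enable(int heartbeat_s, int show_tasks_after_n,
                               int reset_after_n);
extern void jl_heartbeat(void);
extern void do_some_work(void); // hypothetical application workload

void monitored_loop(void)
{
    // 5s interval; print task backtraces after 3 missed beats;
    // clear the "already reported" state once 2 beats arrive again
    if (jl_heartbeat_enable(5, 3, 2) != 0)
        return; // already enabled, or the heartbeat thread is not ready
    for (;;) {
        do_some_work();
        jl_heartbeat(); // must run at least once per 5s window
    }
}

If the loop stalls (deadlock, runaway task) so that no beat arrives within the interval, the heartbeat thread reports the loss and, after enough misses, prints backtraces for all live tasks.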
+JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int show_tasks_after_n, + int reset_after_n) +{ + if (heartbeat_s <= 0) { + heartbeat_enabled = 0; + heartbeat_interval_s = tasks_after_n = reset_tasks_after_n = 0; + } + else { + // must disable before enabling + if (heartbeat_enabled) { + return -1; + } + // heartbeat thread must be ready + if (uv_sem_trywait(&heartbeat_off_sem) != 0) { + return -1; + } + + jl_atomic_store_relaxed(&heartbeats, 0); + heartbeat_interval_s = heartbeat_s; + tasks_after_n = show_tasks_after_n; + reset_tasks_after_n = reset_after_n; + tasks_showed = 0; + n_hbs_missed = 0; + n_hbs_recvd = 0; + heartbeat_enabled = 1; + uv_sem_post(&heartbeat_on_sem); // wake the heartbeat thread + } + return 0; +} + +// heartbeat +JL_DLLEXPORT void jl_heartbeat(void) +{ + jl_atomic_fetch_add(&heartbeats, 1); +} + +// sleep the thread for the specified interval +void sleep_for(int secs, int nsecs) +{ + struct timespec rqtp, rmtp; + rqtp.tv_sec = secs; + rqtp.tv_nsec = nsecs; + rmtp.tv_sec = 0; + rmtp.tv_nsec = 0; + for (; ;) { + // this suspends the thread so we aren't using CPU + if (nanosleep(&rqtp, &rmtp) == 0) { + return; + } + // TODO: else if (errno == EINTR) + // this could be SIGTERM and we should shutdown but how to find out? + rqtp = rmtp; + } +} + +// check for heartbeats and maybe report loss +uint8_t check_heartbeats(uint8_t gc_state) +{ + int hb = jl_atomic_exchange(&heartbeats, 0); + + if (hb <= 0) { + // we didn't get a heartbeat + n_hbs_recvd = 0; + n_hbs_missed++; + + // if we've printed task backtraces already, do nothing + if (!tasks_showed) { + // otherwise, at least show this message + jl_safe_printf("==== heartbeat loss (%ds) ====\n", + n_hbs_missed * heartbeat_interval_s); + // if we've missed enough heartbeats, print task backtraces + if (n_hbs_missed >= tasks_after_n) { + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + + // exit GC-safe region to report then re-enter + jl_gc_safe_leave(ptls, gc_state); + jl_print_task_backtraces(0); + gc_state = jl_gc_safe_enter(ptls); + + // we printed task backtraces + tasks_showed = 1; + } + } + } + else { + // got a heartbeat + n_hbs_recvd++; + // if we'd printed task backtraces, check for reset + if (tasks_showed && n_hbs_recvd >= reset_tasks_after_n) { + tasks_showed = 0; + jl_safe_printf("==== heartbeats recovered (lost for %ds) ====\n", + n_hbs_missed * heartbeat_interval_s); + } + n_hbs_missed = 0; + } + + return gc_state; +} + +// heartbeat thread function +void jl_heartbeat_threadfun(void *arg) +{ + int s = 59, ns = 1e9 - 1, rs; + uint64_t t0, tchb; + + // We need a TLS because backtraces are accumulated into ptls->bt_size + // and ptls->bt_data, so we need to call jl_adopt_thread(). + jl_adopt_thread(); + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + + // Don't hold up GC, this thread doesn't participate. + uint8_t gc_state = jl_gc_safe_enter(ptls); + + for (;;) { + if (!heartbeat_enabled) { + // post the off semaphore to indicate we're ready to enable + uv_sem_post(&heartbeat_off_sem); + + // sleep the thread here; this semaphore is posted in + // jl_heartbeat_enable() + uv_sem_wait(&heartbeat_on_sem); + + // Set the sleep duration. 
+ s = heartbeat_interval_s - 1; + ns = 1e9 - 1; + continue; + } + + // heartbeat is enabled; sleep, waiting for the desired interval + sleep_for(s, ns); + + // if heartbeats were turned off while we were sleeping, reset + if (!heartbeat_enabled) { + continue; + } + + // check if any heartbeats have happened, report as appropriate + t0 = jl_hrtime(); + gc_state = check_heartbeats(gc_state); + tchb = jl_hrtime() - t0; + + // adjust the next sleep duration based on how long the heartbeat + // check took + rs = 1; + while (tchb > 1e9) { + rs++; + tchb -= 1e9; + } + s = heartbeat_interval_s - rs; + ns = 1e9 - tchb; + } +} + +#else // !JL_HEARTBEAT_THREAD + +void jl_init_heartbeat(void) +{ +} + +JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int show_tasks_after_n, + int reset_after_n) +{ + return -1; +} + +JL_DLLEXPORT void jl_heartbeat(void) +{ +} + +#endif // JL_HEARTBEAT_THREAD + #ifdef __cplusplus } #endif diff --git a/src/threading.h b/src/threading.h index 9fd63f0fd188d..1cf2d4a9d3711 100644 --- a/src/threading.h +++ b/src/threading.h @@ -25,6 +25,7 @@ jl_ptls_t jl_init_threadtls(int16_t tid); // provided by a threading infrastructure void jl_init_threadinginfra(void); +void jl_gc_threadfun(void *arg); void jl_threadfun(void *arg); #ifdef __cplusplus diff --git a/src/work-stealing-queue.h b/src/work-stealing-queue.h new file mode 100644 index 0000000000000..38429e02886e9 --- /dev/null +++ b/src/work-stealing-queue.h @@ -0,0 +1,102 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#ifndef WORK_STEALING_QUEUE_H +#define WORK_STEALING_QUEUE_H + +#include "julia_atomics.h" +#include "assert.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// ======= +// Chase and Lev's work-stealing queue, optimized for +// weak memory models by Le et al. +// +// * Chase D., Lev Y. Dynamic Circular Work-Stealing queue +// * Le N. M. et al. 
Correct and Efficient Work-Stealing for +// Weak Memory Models +// ======= + +typedef struct { + char *buffer; + int32_t capacity; + int32_t mask; +} ws_array_t; + +static inline ws_array_t *create_ws_array(size_t capacity, int32_t eltsz) JL_NOTSAFEPOINT +{ + ws_array_t *a = (ws_array_t *)malloc_s(sizeof(ws_array_t)); + a->buffer = (char *)malloc_s(capacity * eltsz); + a->capacity = capacity; + a->mask = capacity - 1; + return a; +} + +typedef struct { + _Atomic(int64_t) top; + _Atomic(int64_t) bottom; + _Atomic(ws_array_t *) array; +} ws_queue_t; + +static inline ws_array_t *ws_queue_push(ws_queue_t *q, void *elt, int32_t eltsz) JL_NOTSAFEPOINT +{ + int64_t b = jl_atomic_load_relaxed(&q->bottom); + int64_t t = jl_atomic_load_acquire(&q->top); + ws_array_t *ary = jl_atomic_load_relaxed(&q->array); + ws_array_t *old_ary = NULL; + if (__unlikely(b - t > ary->capacity - 1)) { + ws_array_t *new_ary = create_ws_array(2 * ary->capacity, eltsz); + for (int i = 0; i < ary->capacity; i++) { + memcpy(new_ary->buffer + ((t + i) & new_ary->mask) * eltsz, ary->buffer + ((t + i) & ary->mask) * eltsz, eltsz); + } + jl_atomic_store_release(&q->array, new_ary); + old_ary = ary; + ary = new_ary; + } + memcpy(ary->buffer + (b & ary->mask) * eltsz, elt, eltsz); + jl_fence_release(); + jl_atomic_store_relaxed(&q->bottom, b + 1); + return old_ary; +} + +static inline void ws_queue_pop(ws_queue_t *q, void *dest, int32_t eltsz) JL_NOTSAFEPOINT +{ + int64_t b = jl_atomic_load_relaxed(&q->bottom) - 1; + ws_array_t *ary = jl_atomic_load_relaxed(&q->array); + jl_atomic_store_relaxed(&q->bottom, b); + jl_fence(); + int64_t t = jl_atomic_load_relaxed(&q->top); + if (__likely(t <= b)) { + memcpy(dest, ary->buffer + (b & ary->mask) * eltsz, eltsz); + if (t == b) { + if (!jl_atomic_cmpswap(&q->top, &t, t + 1)) + memset(dest, 0, eltsz); + jl_atomic_store_relaxed(&q->bottom, b + 1); + } + } + else { + memset(dest, 0, eltsz); + jl_atomic_store_relaxed(&q->bottom, b + 1); + } +} + +static inline void ws_queue_steal_from(ws_queue_t *q, void *dest, int32_t eltsz) JL_NOTSAFEPOINT +{ + int64_t t = jl_atomic_load_acquire(&q->top); + jl_fence(); + int64_t b = jl_atomic_load_acquire(&q->bottom); + if (t < b) { + ws_array_t *ary = jl_atomic_load_relaxed(&q->array); + memcpy(dest, ary->buffer + (t & ary->mask) * eltsz, eltsz); + if (!jl_atomic_cmpswap(&q->top, &t, t + 1)) + memset(dest, 0, eltsz); + } +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/stdlib/Distributed/src/cluster.jl b/stdlib/Distributed/src/cluster.jl index 554a3d9185080..d8039ba160d66 100644 --- a/stdlib/Distributed/src/cluster.jl +++ b/stdlib/Distributed/src/cluster.jl @@ -1345,9 +1345,10 @@ function process_opts(opts) end # Propagate --threads to workers - threads = get_threads_spec(opts) + threads = opts.nthreads > 0 ? `--threads=$(opts.nthreads)` : `` + gcthreads = opts.ngcthreads > 0 ? 
`--gcthreads=$(opts.ngcthreads)` : ``
 
-    exeflags = `$threads`
+    exeflags = `$threads $gcthreads`
 
     # add processors
     if opts.nprocs > 0
diff --git a/stdlib/Pkg.version b/stdlib/Pkg.version
index 3279234710531..c7cc917268a90 100644
--- a/stdlib/Pkg.version
+++ b/stdlib/Pkg.version
@@ -1,4 +1,4 @@
 PKG_BRANCH = release-1.9
-PKG_SHA1 = ffe4615b1e4e39b818a49bb1a06467932d5eaf51
+PKG_SHA1 = 397c8bf20ef70a78247d4cbd3d59f28a3116c884
 PKG_GIT_URL := https://github.com/JuliaLang/Pkg.jl.git
 PKG_TAR_URL = https://api.github.com/repos/JuliaLang/Pkg.jl/tarball/$1
diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index 6dd2a12205b66..482c2c0a97ad3 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -1275,6 +1275,21 @@ function take_heap_snapshot(all_one::Bool=false)
     return take_heap_snapshot(f, all_one)
 end
 
+"""
+    Profile.take_page_profile(io::IOStream)
+    Profile.take_page_profile(filepath::String)
+
+Write a JSON snapshot of the pages from Julia's pool allocator, recording, for every pool-allocated object, whether it is garbage or else its type.
+"""
+function take_page_profile(io::IOStream)
+    Base.@_lock_ios(io, ccall(:jl_gc_take_page_profile, Cvoid, (Ptr{Cvoid},), io.handle))
+end
+function take_page_profile(filepath::String)
+    open(filepath, "w") do io
+        take_page_profile(io)
+    end
+    return filepath
+end
 
 include("Allocs.jl")
diff --git a/stdlib/Profile/test/allocs.jl b/stdlib/Profile/test/allocs.jl
index c2ec7d2f6cb54..ae0cbab945f01 100644
--- a/stdlib/Profile/test/allocs.jl
+++ b/stdlib/Profile/test/allocs.jl
@@ -121,3 +121,34 @@ end
     @test length(prof.allocs) >= 1
     @test length([a for a in prof.allocs if a.type == String]) >= 1
 end
+
+@testset "alloc profiler catches allocs from codegen" begin
+    @eval begin
+        struct MyType x::Int; y::Int end
+        Base.:(+)(n::Number, x::MyType) = n + x.x + x.y
+        foo(a, x) = a[1] + x
+        wrapper(a) = foo(a, MyType(0,1))
+    end
+    a = Any[1,2,3]
+    # warmup
+    wrapper(a)
+
+    @eval Allocs.@profile sample_rate=1 wrapper($a)
+
+    prof = Allocs.fetch()
+    Allocs.clear()
+
+    @test length(prof.allocs) >= 1
+    @test length([a for a in prof.allocs if a.type == MyType]) >= 1
+end
+
+@testset "alloc profiler catches allocs from buffer resize" begin
+    a = Int[]
+    Allocs.@profile sample_rate=1 for _ in 1:100; push!(a, 1); end
+
+    prof = Allocs.fetch()
+    Allocs.clear()
+
+    @test length(prof.allocs) >= 1
+    @test length([a for a in prof.allocs if a.type == Profile.Allocs.BufferType]) >= 1
+end
diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl
index 6ff4890004f62..01a596278d74f 100644
--- a/stdlib/Profile/test/runtests.jl
+++ b/stdlib/Profile/test/runtests.jl
@@ -287,4 +287,14 @@ end
     rm(tmpdir, force = true, recursive = true)
 end
 
+@testset "PageProfile" begin
+    fname = "$(getpid())_$(time_ns())"
+    fpath = joinpath(tempdir(), fname)
+    Profile.take_page_profile(fpath)
+    open(fpath) do fs
+        @test readline(fs) != ""
+    end
+    rm(fpath)
+end
+
 include("allocs.jl")
diff --git a/test/choosetests.jl b/test/choosetests.jl
index 334ef051a0fe6..7d426221180f5 100644
--- a/test/choosetests.jl
+++ b/test/choosetests.jl
@@ -21,7 +21,7 @@ const TESTNAMES = [
         "combinatorics", "sysinfo", "env", "rounding", "ranges", "mod2pi",
         "euler", "show", "client",
         "errorshow", "sets", "goto", "llvmcall", "llvmcall2", "ryu",
-        "some", "meta", "stacktraces", "docs",
+        "some", "meta", "stacktraces", "docs", "gc",
        "misc", "threads", "stress", "binaryplatforms", "atexit",
        "enums", "cmdlineargs", "int", "interpreter",
        "checked", "bitset", "floatfuncs",
"precompile", diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl index 7f86534f923b3..f73a7854fd2f1 100644 --- a/test/cmdlineargs.jl +++ b/test/cmdlineargs.jl @@ -269,6 +269,24 @@ let exename = `$(Base.julia_cmd()) --startup-file=no --color=no` @test p.exitcode == 1 && p.termsignal == 0 end + # --gcthreads + code = "print(Threads.ngcthreads())" + cpu_threads = ccall(:jl_effective_threads, Int32, ()) + @test (cpu_threads == 1 ? "1" : string(div(cpu_threads, 2))) == + read(`$exename --threads auto -e $code`, String) == + read(`$exename --threads=auto -e $code`, String) == + read(`$exename -tauto -e $code`, String) == + read(`$exename -t auto -e $code`, String) + for nt in (nothing, "1") + withenv("JULIA_NUM_GC_THREADS" => nt) do + @test read(`$exename --gcthreads=2 -e $code`, String) == "2" + end + end + + withenv("JULIA_NUM_GC_THREADS" => 2) do + @test read(`$exename -e $code`, String) == "2" + end + # --machine-file # this does not check that machine file works, # only that the filename gets correctly passed to the option struct diff --git a/test/compiler/codegen.jl b/test/compiler/codegen.jl index e4e107351c57f..4c9c7e97a710b 100644 --- a/test/compiler/codegen.jl +++ b/test/compiler/codegen.jl @@ -15,9 +15,12 @@ function libjulia_codegen_name() is_debug_build ? "libjulia-codegen-debug" : "libjulia-codegen" end -# `_dump_function` might be more efficient but it doesn't really matter here... -get_llvm(@nospecialize(f), @nospecialize(t), raw=true, dump_module=false, optimize=true) = - sprint(code_llvm, f, t, raw, dump_module, optimize) +# The tests below assume a certain format and safepoint_on_entry=true breaks that. +function get_llvm(@nospecialize(f), @nospecialize(t), raw=true, dump_module=false, optimize=true) + params = Base.CodegenParams(safepoint_on_entry=false) + d = InteractiveUtils._dump_function(f, t, false, false, !raw, dump_module, :att, optimize, :none, false, params) + sprint(print, d) +end if !is_debug_build && opt_level > 0 # Make sure getptls call is removed at IR level with optimization on diff --git a/test/gc.jl b/test/gc.jl new file mode 100644 index 0000000000000..9cc9d753dfc09 --- /dev/null +++ b/test/gc.jl @@ -0,0 +1,18 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +using Test + +function run_gctest(file) + let cmd = `$(Base.julia_cmd()) --depwarn=error --rr-detach --startup-file=no $file` + for test_nthreads in (1, 2, 4) + new_env = copy(ENV) + new_env["JULIA_NUM_THREADS"] = string(test_nthreads) + new_env["JULIA_NUM_GC_THREADS"] = string(test_nthreads) + @time run(pipeline(setenv(cmd, new_env), stdout = stdout, stderr = stderr)) + end + end +end + +@time run_gctest("gc/binarytree.jl") +@time run_gctest("gc/linkedlist.jl") +@time run_gctest("gc/objarray.jl") diff --git a/test/gc/binarytree.jl b/test/gc/binarytree.jl new file mode 100644 index 0000000000000..3089e2d2ce869 --- /dev/null +++ b/test/gc/binarytree.jl @@ -0,0 +1,53 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +module BinaryTreeMutable + +# Adopted from +# https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/binarytrees.html#binarytrees + +using Base.Threads +using Printf + +mutable struct Node + l::Union{Nothing, Node} + r::Union{Nothing, Node} +end + +function make(n::Int) + return n === 0 ? Node(nothing, nothing) : Node(make(n-1), make(n-1)) +end + +function check(node::Node) + return 1 + (node.l === nothing ? 
0 : check(node.l) + check(node.r)) +end + +function binary_trees(io, n::Int) + @printf io "stretch tree of depth %jd\t check: %jd\n" n+1 check(make(n+1)) + + long_tree = make(n) + minDepth = 4 + resultSize = div((n - minDepth), 2) + 1 + results = Vector{String}(undef, resultSize) + Threads.@threads for depth in minDepth:2:n + c = 0 + niter = 1 << (n - depth + minDepth) + for _ in 1:niter + c += check(make(depth)) + end + index = div((depth - minDepth),2) + 1 + results[index] = @sprintf "%jd\t trees of depth %jd\t check: %jd\n" niter depth c + end + + for i in results + write(io, i) + end + + @printf io "long lived tree of depth %jd\t check: %jd\n" n check(long_tree) +end + +end #module + +using .BinaryTreeMutable + +BinaryTreeMutable.binary_trees(devnull, 20) +GC.gc() diff --git a/test/gc/linkedlist.jl b/test/gc/linkedlist.jl new file mode 100644 index 0000000000000..c447a9680326d --- /dev/null +++ b/test/gc/linkedlist.jl @@ -0,0 +1,21 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +mutable struct ListNode + key::Int64 + next::ListNode + ListNode() = new() + ListNode(x)= new(x) + ListNode(x,y) = new(x,y); +end + +function list(n=128) + start::ListNode = ListNode(1) + current::ListNode = start + for i = 2:(n*1024^2) + current = ListNode(i,current) + end + return current.key +end + +_ = list() +GC.gc() diff --git a/test/gc/objarray.jl b/test/gc/objarray.jl new file mode 100644 index 0000000000000..4b4cb67c42eac --- /dev/null +++ b/test/gc/objarray.jl @@ -0,0 +1,35 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +using Random: seed! +seed!(1) + +abstract type Cell end + +struct CellA<:Cell + a::Ref{Int} +end + +struct CellB<:Cell + b::String +end + +function fillcells!(mc::Array{Cell}) + for ind in eachindex(mc) + mc[ind] = ifelse(rand() > 0.5, CellA(ind), CellB(string(ind))) + end + return mc +end + +function work(size) + mcells = Array{Cell}(undef, size, size) + fillcells!(mcells) +end + +function run(maxsize) + Threads.@threads for i in 1:maxsize + work(i*500) + end +end + +run(4) +GC.gc() diff --git a/test/llvmpasses/alloc-opt-gcframe.jl b/test/llvmpasses/alloc-opt-gcframe.jl index 3b5fc3a51a606..ad4be12be0840 100644 --- a/test/llvmpasses/alloc-opt-gcframe.jl +++ b/test/llvmpasses/alloc-opt-gcframe.jl @@ -14,11 +14,11 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" # CHECK-LABEL: @return_obj # CHECK-NOT: @julia.gc_alloc_obj # CHECK: %current_task = getelementptr inbounds {}*, {}** %gcstack, i64 -12 -# CHECK-NEXT: [[ptls_field:%.*]] = getelementptr inbounds {}*, {}** %current_task, i64 15 +# CHECK: [[ptls_field:%.*]] = getelementptr inbounds {}*, {}** %current_task, i64 15 # CHECK-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 # CHECK-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** # CHECK-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* -# CHECK-NEXT: %v = call noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc(i8* [[ptls_i8]], i32 [[SIZE_T:[0-9]+]], i32 16) +# CHECK-NEXT: %v = call noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc_instrumented(i8* [[ptls_i8]], i32 [[SIZE_T:[0-9]+]], i32 16, i64 {{.*}} @tag {{.*}}) # CHECK: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* {{.*}} unordered, align 8, !tbaa !4 println(""" define {} addrspace(10)* @return_obj() { @@ -260,8 +260,8 @@ L3: """) # CHECK-LABEL: }{{$}} -# CHECK: declare noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc(i8*, -# CHECK: declare noalias nonnull {} 
addrspace(10)* @ijl_gc_big_alloc(i8*, +# CHECK: declare noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc_instrumented(i8*, +# CHECK: declare noalias nonnull {} addrspace(10)* @ijl_gc_big_alloc_instrumented(i8*, println(""" declare void @external_function() declare {}*** @julia.get_pgcstack() diff --git a/test/llvmpasses/final-lower-gc.ll b/test/llvmpasses/final-lower-gc.ll index 4af43f748020b..840c911d0874f 100644 --- a/test/llvmpasses/final-lower-gc.ll +++ b/test/llvmpasses/final-lower-gc.ll @@ -13,7 +13,7 @@ declare noalias nonnull {} addrspace(10)** @julia.new_gc_frame(i32) declare void @julia.push_gc_frame({} addrspace(10)**, i32) declare {} addrspace(10)** @julia.get_gc_frame_slot({} addrspace(10)**, i32) declare void @julia.pop_gc_frame({} addrspace(10)**) -declare noalias nonnull {} addrspace(10)* @julia.gc_alloc_bytes(i8*, i64) #0 +declare noalias nonnull {} addrspace(10)* @julia.gc_alloc_bytes(i8*, i64, i64) #0 attributes #0 = { allocsize(1) } @@ -59,8 +59,8 @@ top: %pgcstack = call {}*** @julia.get_pgcstack() %ptls = call {}*** @julia.ptls_states() %ptls_i8 = bitcast {}*** %ptls to i8* -; CHECK: %v = call noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc - %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* %ptls_i8, i64 8) +; CHECK: %v = call noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc_instrumented + %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* %ptls_i8, i64 8, i64 12341234) %0 = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* %1 = getelementptr {} addrspace(10)*, {} addrspace(10)* addrspace(10)* %0, i64 -1 store {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* %1, align 8, !tbaa !0 @@ -74,8 +74,8 @@ top: %ptls = call {}*** @julia.ptls_states() %ptls_i8 = bitcast {}*** %ptls to i8* ; CHECK: %0 = add i64 %size, 8 -; CHECK: %v = call noalias nonnull {} addrspace(10)* @ijl_gc_alloc_typed(i8* %ptls_i8, i64 %0, i8* null) - %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* %ptls_i8, i64 %size) +; CHECK: %v = call noalias nonnull {} addrspace(10)* @ijl_gc_alloc_typed(i8* %ptls_i8, i64 %0, i64 12341234) + %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* %ptls_i8, i64 %size, i64 12341234) %0 = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* %1 = getelementptr {} addrspace(10)*, {} addrspace(10)* addrspace(10)* %0, i64 -1 store {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* %1, align 8, !tbaa !0 diff --git a/test/llvmpasses/late-lower-gc-addrspaces.ll b/test/llvmpasses/late-lower-gc-addrspaces.ll index 7497febf1e846..7bb8c76b07e63 100644 --- a/test/llvmpasses/late-lower-gc-addrspaces.ll +++ b/test/llvmpasses/late-lower-gc-addrspaces.ll @@ -49,7 +49,7 @@ top: ; CHECK-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 ; CHECK-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; CHECK-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* -; CHECK-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8) +; CHECK-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) ; CHECK-NEXT: [[V2:%.*]] = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* ; CHECK-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(10)* [[V2]], i64 -1 ; CHECK-NEXT: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* [[V_HEADROOM]] unordered, align 8, !tbaa !4 @@ -74,7 +74,7 @@ top: ; CHECK-NEXT: [[ptls_load:%.*]] = load 
{}*, {}** [[ptls_field]], align 8, !tbaa !0 ; CHECK-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; CHECK-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* -; CHECK-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8) +; CHECK-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) ; CHECK-NEXT: [[V2:%.*]] = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* ; CHECK-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(10)* [[V2]], i64 -1 ; CHECK-NEXT: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* [[V_HEADROOM]] unordered, align 8, !tbaa !4 diff --git a/test/llvmpasses/late-lower-gc.ll b/test/llvmpasses/late-lower-gc.ll index 65a67c78d7810..77599290f8ef7 100644 --- a/test/llvmpasses/late-lower-gc.ll +++ b/test/llvmpasses/late-lower-gc.ll @@ -46,7 +46,7 @@ top: ; CHECK-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 ; CHECK-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; CHECK-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* -; CHECK-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8) +; CHECK-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) ; CHECK-NEXT: [[V2:%.*]] = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* ; CHECK-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(10)* [[V2]], i64 -1 ; CHECK-NEXT: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* [[V_HEADROOM]] unordered, align 8, !tbaa !4 @@ -71,7 +71,7 @@ top: ; CHECK-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 ; CHECK-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; CHECK-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* -; CHECK-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8) +; CHECK-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) ; CHECK-NEXT: [[V2:%.*]] = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* ; CHECK-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(10)* [[V2]], i64 -1 ; CHECK-NEXT: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* [[V_HEADROOM]] unordered, align 8, !tbaa !4 @@ -154,7 +154,7 @@ define void @decayar([2 x {} addrspace(10)* addrspace(11)*] %ar) { %l0 = load {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %e0 %e1 = extractvalue [2 x {} addrspace(10)* addrspace(11)*] %ar, 1 %l1 = load {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %e1 - %r = call i32 @callee_root({} addrspace(10)* %l0, {} addrspace(10)* %l1) + %r = call i32 @callee_root({} addrspace(10)* %l0, {} addrspace(10)* %l1) ret void } diff --git a/test/llvmpasses/pipeline-o0.jl b/test/llvmpasses/pipeline-o0.jl index ff9cd0aace704..3cbd5a9174cc2 100644 --- a/test/llvmpasses/pipeline-o0.jl +++ b/test/llvmpasses/pipeline-o0.jl @@ -9,7 +9,7 @@ include(joinpath("..", "testhelpers", "llvmpasses.jl")) # CHECK-NOT: julia.get_pgcstack # CHECK: asm # CHECK-NOT: julia.gc_alloc_obj -# CHECK: ijl_gc_pool_alloc +# CHECK: ijl_gc_pool_alloc_instrumented # COM: we want something vaguely along the lines of asm load from the fs register -> allocate bytes 
function simple() Ref(0) diff --git a/test/threads.jl b/test/threads.jl index fb684b275e864..1a0dbeb2d4dbf 100644 --- a/test/threads.jl +++ b/test/threads.jl @@ -327,3 +327,9 @@ end @test_throws ArgumentError @macroexpand(@threads 1) # arg isn't an Expr @test_throws ArgumentError @macroexpand(@threads if true 1 end) # arg doesn't start with for end + +@testset "num_stack_mappings metric" begin + @test @ccall(jl_get_num_stack_mappings()::Cint) >= 1 + # There must be at least two: one for the root test task and one for the async task: + @test fetch(@async(@ccall(jl_get_num_stack_mappings()::Cint))) >= 2 +end
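As a closing illustration, here is a usage sketch for the Chase-Lev deque added in src/work-stealing-queue.h earlier in this patch. It is a sketch under stated assumptions: ws_queue_init and ws_queue_example are illustrative helpers, not part of the patch, and the file is meant to be compiled inside the Julia source tree, where malloc_s and the jl_atomic_* helpers are available.

#include <string.h>
#include "work-stealing-queue.h"

// Illustrative-only init helper; the runtime initializes its queues
// elsewhere. Capacity must be a power of two for the index mask to work.
static void ws_queue_init(ws_queue_t *q, size_t capacity, int32_t eltsz)
{
    jl_atomic_store_relaxed(&q->top, 0);
    jl_atomic_store_relaxed(&q->bottom, 0);
    jl_atomic_store_relaxed(&q->array, create_ws_array(capacity, eltsz));
}

void ws_queue_example(void)
{
    ws_queue_t q;
    ws_queue_init(&q, 16, sizeof(void*));

    // Owner thread: push on the bottom end. A non-NULL return is the
    // old, outgrown array; it cannot be freed immediately because
    // concurrent thieves may still be reading from it, so reclamation
    // must be deferred (the GC keeps such arrays alive elsewhere).
    void *elt = (void*)0x1;
    ws_array_t *old = ws_queue_push(&q, &elt, sizeof(void*));
    (void)old;

    // Owner thread: pop from the same end (LIFO); `dest` is zeroed
    // when the queue is empty or the race for the last element is lost.
    void *out = NULL;
    ws_queue_pop(&q, &out, sizeof(void*));

    // Any other thread: steal from the top end (FIFO); `dest` is
    // zeroed if the CAS on `top` fails against a competing consumer.
    ws_queue_steal_from(&q, &out, sizeof(void*));
}

The design choice is the usual one for work stealing: the owner works the bottom of its own deque (LIFO, cache-friendly, uncontended in the common case) while GC threads steal from the top (FIFO), so synchronization is only needed when the two ends meet on the last element.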