diff --git a/Make.inc b/Make.inc index def5f40f97926..eac6d325d6d59 100644 --- a/Make.inc +++ b/Make.inc @@ -195,7 +195,7 @@ endif JULIA_VERSION := $(shell cat $(JULIAHOME)/VERSION) JULIA_MAJOR_VERSION := $(shell echo $(JULIA_VERSION) | cut -d'-' -f 1 | cut -d'.' -f 1) JULIA_MINOR_VERSION := $(shell echo $(JULIA_VERSION) | cut -d'-' -f 1 | cut -d'.' -f 2) -JULIA_PATCH_VERSION := $(shell echo $(JULIA_VERSION) | cut -d'-' -f 1 | cut -d'.' -f 3) +JULIA_PATCH_VERSION := $(shell echo $(JULIA_VERSION) | cut -d'-' -f 1 | cut -d'+' -f 1 | cut -d'.' -f 3) # libjulia's SONAME will follow the format libjulia.so.$(SOMAJOR). Before v1.0.0, # SOMAJOR will be a two-decimal value, e.g. libjulia.so.0.5, whereas at and beyond diff --git a/NEWS.md b/NEWS.md index 695867f2e4f63..18a5b0e620e7d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -38,6 +38,7 @@ Compiler/Runtime improvements * All uses of the `@pure` macro in `Base` have been replaced with the now-preferred `Base.@assume_effects` ([#44776]). * `invoke(f, invokesig, args...)` calls to a less-specific method than would normally be chosen for `f(args...)` are no longer spuriously invalidated when loading package precompile files ([#46010]). +* The mark phase of the Garbage Collector is now multi-threaded ([#48600]). Command-line option changes --------------------------- @@ -49,6 +50,8 @@ Command-line option changes number of interactive threads to create (`auto` currently means 1) ([#42302]). * New option `--heap-size-hint=<size>` suggests a size limit to invoke garbage collection more eagerly. The size may be specified in bytes, kilobytes (1000k), megabytes (300M), or gigabytes (1.5G) ([#45369]). +* New option `--gcthreads` to set how many threads will be used by the Garbage Collector ([#48600]). + The default is set to `N/2` where `N` is the number of worker threads (`--threads`) used by Julia.
Multi-threading changes ----------------------- diff --git a/VERSION b/VERSION index 77fee73a8cf94..7bede500b4e81 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.9.3 +1.9.4+RAI diff --git a/base/options.jl b/base/options.jl index b9fa5b6ec5508..fb043672dc19a 100644 --- a/base/options.jl +++ b/base/options.jl @@ -11,6 +11,7 @@ struct JLOptions cpu_target::Ptr{UInt8} nthreadpools::Int16 nthreads::Int16 + ngcthreads::Int16 nthreads_per_pool::Ptr{Int16} nprocs::Int32 machine_file::Ptr{UInt8} diff --git a/base/partr.jl b/base/partr.jl index c77a24bdcc003..f84d7bd1a37b7 100644 --- a/base/partr.jl +++ b/base/partr.jl @@ -50,7 +50,7 @@ function multiq_sift_down(heap::taskheap, idx::Int32) child = Int(child) child > length(heap.tasks) && break if isassigned(heap.tasks, child) && - heap.tasks[child].priority < heap.tasks[idx].priority + heap.tasks[child].priority <= heap.tasks[idx].priority t = heap.tasks[idx] heap.tasks[idx] = heap.tasks[child] heap.tasks[child] = t diff --git a/base/reflection.jl b/base/reflection.jl index 1d8aa84cb3ad3..376a66c3ac8e3 100644 --- a/base/reflection.jl +++ b/base/reflection.jl @@ -1092,6 +1092,7 @@ struct CodegenParams prefer_specsig::Cint gnu_pubnames::Cint debug_info_kind::Cint + safepoint_on_entry::Cint lookup::Ptr{Cvoid} @@ -1100,12 +1101,14 @@ struct CodegenParams function CodegenParams(; track_allocations::Bool=true, code_coverage::Bool=true, prefer_specsig::Bool=false, gnu_pubnames=true, debug_info_kind::Cint = default_debug_info_kind(), + safepoint_on_entry::Bool=true, lookup::Ptr{Cvoid}=cglobal(:jl_rettype_inferred), generic_context = nothing) return new( Cint(track_allocations), Cint(code_coverage), Cint(prefer_specsig), Cint(gnu_pubnames), debug_info_kind, + Cint(safepoint_on_entry), lookup, generic_context) end end diff --git a/base/task.jl b/base/task.jl index 92598159e999f..d19189fa3cb5e 100644 --- a/base/task.jl +++ b/base/task.jl @@ -688,7 +688,7 @@ end ## scheduler and work queue -struct IntrusiveLinkedListSynchronized{T} +mutable struct IntrusiveLinkedListSynchronized{T} queue::IntrusiveLinkedList{T} lock::Threads.SpinLock IntrusiveLinkedListSynchronized{T}() where {T} = new(IntrusiveLinkedList{T}(), Threads.SpinLock()) diff --git a/base/threadingconstructs.jl b/base/threadingconstructs.jl index 0c6563c54f99f..ce216f290ae12 100644 --- a/base/threadingconstructs.jl +++ b/base/threadingconstructs.jl @@ -136,6 +136,13 @@ function threadpooltids(pool::Symbol) end end +""" + Threads.ngcthreads() -> Int + +Returns the number of GC threads currently configured. 
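+
+For example, in a default single-threaded session (started without the
+`--gcthreads` option) one GC thread is reported (illustrative output):
+
+```julia-repl
+julia> Threads.ngcthreads()
+1
+```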
+""" +ngcthreads() = Int(unsafe_load(cglobal(:jl_n_gcthreads, Cint))) + 1 + function threading_run(fun, static) ccall(:jl_enter_threaded_region, Cvoid, ()) n = threadpoolsize() diff --git a/base/timing.jl b/base/timing.jl index c994889d8902c..375ff7aaea60e 100644 --- a/base/timing.jl +++ b/base/timing.jl @@ -20,10 +20,13 @@ struct GC_Num max_memory ::Int64 time_to_safepoint ::Int64 max_time_to_safepoint ::Int64 + total_time_to_safepoint ::Int64 sweep_time ::Int64 mark_time ::Int64 total_sweep_time ::Int64 total_mark_time ::Int64 + last_full_sweep ::Int64 + last_incremental_sweep ::Int64 end gc_num() = ccall(:jl_gc_num, GC_Num, ()) @@ -96,6 +99,13 @@ function gc_live_bytes() Int(ccall(:jl_gc_live_bytes, Int64, ())) + num.allocd + num.deferred_alloc end +# must be kept in sync with the value from `src/julia_threads.h`` +const JL_GC_N_MAX_POOLS = 51 +function gc_page_utilization_data() + page_utilization_raw = cglobal(:jl_gc_page_utilization_stats, Float64) + return Base.unsafe_wrap(Array, page_utilization_raw, JL_GC_N_MAX_POOLS, own=false) +end + """ Base.jit_total_bytes() diff --git a/contrib/generate_precompile.jl b/contrib/generate_precompile.jl index 11a9c1b552061..b1b098a5581b2 100644 --- a/contrib/generate_precompile.jl +++ b/contrib/generate_precompile.jl @@ -1,7 +1,7 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license if Threads.maxthreadid() != 1 - @warn "Running this file with multiple Julia threads may lead to a build error" Base.maxthreadid() + @warn "Running this file with multiple Julia threads may lead to a build error" Base.Threads.maxthreadid() end if Base.isempty(Base.ARGS) || Base.ARGS[1] !== "0" @@ -245,7 +245,7 @@ const HELP_PROMPT = "help?> " function generate_precompile_statements() start_time = time_ns() - debug_output = devnull # or stdout + debug_output = stdout sysimg = Base.unsafe_string(Base.JLOptions().image_file) # Extract the precompile statements from the precompile file diff --git a/deps/checksums/Pkg-397c8bf20ef70a78247d4cbd3d59f28a3116c884.tar.gz/md5 b/deps/checksums/Pkg-397c8bf20ef70a78247d4cbd3d59f28a3116c884.tar.gz/md5 new file mode 100644 index 0000000000000..fb4a9537dda0d --- /dev/null +++ b/deps/checksums/Pkg-397c8bf20ef70a78247d4cbd3d59f28a3116c884.tar.gz/md5 @@ -0,0 +1 @@ +d9630229362baeff301c473bb3e05905 diff --git a/deps/checksums/Pkg-397c8bf20ef70a78247d4cbd3d59f28a3116c884.tar.gz/sha512 b/deps/checksums/Pkg-397c8bf20ef70a78247d4cbd3d59f28a3116c884.tar.gz/sha512 new file mode 100644 index 0000000000000..e6e1af11a228e --- /dev/null +++ b/deps/checksums/Pkg-397c8bf20ef70a78247d4cbd3d59f28a3116c884.tar.gz/sha512 @@ -0,0 +1 @@ +d8d421d9cce00fddaddbe88ae02076c58f5a669064b197092ceab6051c26fb3aab42f03be3f945dc5dbf376ee0d0a29c33208121e20b3e613a92311ca171828d diff --git a/deps/checksums/Pkg-ffe4615b1e4e39b818a49bb1a06467932d5eaf51.tar.gz/md5 b/deps/checksums/Pkg-ffe4615b1e4e39b818a49bb1a06467932d5eaf51.tar.gz/md5 deleted file mode 100644 index 9b7ee38b00368..0000000000000 --- a/deps/checksums/Pkg-ffe4615b1e4e39b818a49bb1a06467932d5eaf51.tar.gz/md5 +++ /dev/null @@ -1 +0,0 @@ -0a5ac5f1302352d902b3acd4523c6cef diff --git a/deps/checksums/Pkg-ffe4615b1e4e39b818a49bb1a06467932d5eaf51.tar.gz/sha512 b/deps/checksums/Pkg-ffe4615b1e4e39b818a49bb1a06467932d5eaf51.tar.gz/sha512 deleted file mode 100644 index 71efa5b40fb74..0000000000000 --- a/deps/checksums/Pkg-ffe4615b1e4e39b818a49bb1a06467932d5eaf51.tar.gz/sha512 +++ /dev/null @@ -1 +0,0 @@ 
-69993723d04457c3c6268b1c4844ebed2563e56a0d3cc25e1a0048029491ba423e6808f53a66cf879ed39323112cda07344e9c678e2d6abc2d7444e4e5336c2f diff --git a/doc/man/julia.1 b/doc/man/julia.1 index 383c588c58dae..fa9f641b1e76f 100644 --- a/doc/man/julia.1 +++ b/doc/man/julia.1 @@ -118,6 +118,11 @@ supported (Linux and Windows). If this is not supported (macOS) or process affinity is not configured, it uses the number of CPU threads. +.TP +--gcthreads +Enable N GC threads; if unspecified, defaults to half of the +number of compute worker threads. + .TP -p, --procs {N|auto} Integer value N launches N additional local worker processes `auto` launches as many workers diff --git a/doc/src/base/multi-threading.md b/doc/src/base/multi-threading.md index 4932aef4cc938..fb75b21479707 100644 --- a/doc/src/base/multi-threading.md +++ b/doc/src/base/multi-threading.md @@ -10,6 +10,7 @@ Base.Threads.nthreads Base.Threads.threadpool Base.Threads.nthreadpools Base.Threads.threadpoolsize +Base.Threads.ngcthreads ``` See also [Multi-Threading](@ref man-multithreading). diff --git a/doc/src/manual/command-line-interface.md b/doc/src/manual/command-line-interface.md index 54c56a354c7a3..781a77a33dadb 100644 --- a/doc/src/manual/command-line-interface.md +++ b/doc/src/manual/command-line-interface.md @@ -106,8 +106,9 @@ The following is a complete list of command-line switches available when launchi |`-e`, `--eval <expr>` |Evaluate `<expr>`| |`-E`, `--print <expr>` |Evaluate `<expr>` and display the result| |`-L`, `--load <file>` |Load `<file>` immediately on all processors| -|`-t`, `--threads {N\|auto`} |Enable N threads; `auto` tries to infer a useful default number of threads to use but the exact behavior might change in the future. Currently, `auto` uses the number of CPUs assigned to this julia process based on the OS-specific affinity assignment interface, if supported (Linux and Windows). If this is not supported (macOS) or process affinity is not configured, it uses the number of CPU threads.| -|`-p`, `--procs {N\|auto`} |Integer value N launches N additional local worker processes; `auto` launches as many workers as the number of local CPU threads (logical cores)| +|`-t`, `--threads {N\|auto}` |Enable N threads; `auto` tries to infer a useful default number of threads to use but the exact behavior might change in the future. Currently, `auto` uses the number of CPUs assigned to this julia process based on the OS-specific affinity assignment interface, if supported (Linux and Windows). If this is not supported (macOS) or process affinity is not configured, it uses the number of CPU threads.| +| `--gcthreads {N}` |Enable N GC threads; if unspecified, defaults to half of the number of compute worker threads.| +|`-p`, `--procs {N\|auto}` |Integer value N launches N additional local worker processes; `auto` launches as many workers as the number of local CPU threads (logical cores)| |`--machine-file <file>` |Run processes on hosts listed in `<file>`| |`-i` |Interactive mode; REPL runs and `isinteractive()` is true| |`-q`, `--quiet` |Quiet startup: no banner, suppress REPL warnings| diff --git a/doc/src/manual/environment-variables.md b/doc/src/manual/environment-variables.md index f29e5b7aaf8f7..68fb79fc0a9a6 100644 --- a/doc/src/manual/environment-variables.md +++ b/doc/src/manual/environment-variables.md @@ -315,6 +315,14 @@ then spinning threads never sleep. Otherwise, `$JULIA_THREAD_SLEEP_THRESHOLD` is interpreted as an unsigned 64-bit integer (`uint64_t`) and gives, in nanoseconds, the amount of time after which spinning threads should sleep.
+### [`JULIA_NUM_GC_THREADS`](@id env-gc-threads) + +Sets the number of threads used by garbage collection. If unspecified, it defaults to +half of the number of worker threads. + +!!! compat "Julia 1.10" + This environment variable was added in Julia 1.10. + ### `JULIA_EXCLUSIVE` If set to anything besides `0`, then Julia's thread policy is consistent with diff --git a/doc/src/manual/multi-threading.md b/doc/src/manual/multi-threading.md index f8c6b040941f4..e1cb17bd5b93f 100644 --- a/doc/src/manual/multi-threading.md +++ b/doc/src/manual/multi-threading.md @@ -72,6 +72,15 @@ julia> Threads.threadid() three processes have 2 threads enabled. For more fine grained control over worker threads use [`addprocs`](@ref) and pass `-t`/`--threads` as `exeflags`. +### Multiple GC Threads + +The Garbage Collector (GC) can use multiple threads. By default it uses half the number +of compute worker threads; this can be configured with the `--gcthreads` command line argument +(e.g. `julia --threads=8 --gcthreads=4`) or the [`JULIA_NUM_GC_THREADS`](@ref env-gc-threads) environment variable. + +!!! compat "Julia 1.10" + The `--gcthreads` command line argument requires at least Julia 1.10. + ## [Threadpools](@id man-threadpools) When a program's threads are busy with many tasks to run, tasks may experience diff --git a/doc/src/manual/profile.md b/doc/src/manual/profile.md index 5b18f57a186be..60e2d8042ee18 100644 --- a/doc/src/manual/profile.md +++ b/doc/src/manual/profile.md @@ -310,7 +310,122 @@ the amount of memory allocated by each line of code. ### Line-by-Line Allocation Tracking -To measure allocation line-by-line, start Julia with the `--track-allocation=<setting>` command-line +While [`@time`](@ref) logs high-level stats about memory usage and garbage collection over the course +of evaluating an expression, it can be useful to log each garbage collection event, to get an +intuitive sense of how often the garbage collector is running, how long it's running each time, +and how much garbage it collects each time. This can be enabled with +[`GC.enable_logging(true)`](@ref), which causes Julia to log to stderr every time +a garbage collection happens. + +### [Allocation Profiler](@id allocation-profiler) + +!!! compat "Julia 1.8" + This functionality requires at least Julia 1.8. + +The allocation profiler records the stack trace, type, and size of each +allocation while it is running. It can be invoked with +[`Profile.Allocs.@profile`](@ref). + +This information about the allocations is returned as an array of `Alloc` +objects, wrapped in an `AllocResults` object. The best way to visualize these is +currently with the [PProf.jl](https://github.com/JuliaPerf/PProf.jl) and +[ProfileCanvas.jl](https://github.com/pfitzseb/ProfileCanvas.jl) packages, which +can visualize the call stacks which are making the most allocations. + +The allocation profiler does have significant overhead, so a `sample_rate` +argument can be passed to speed it up by making it skip some allocations. +Passing `sample_rate=1.0` will make it record everything (which is slow); +`sample_rate=0.1` will record only 10% of the allocations (faster), etc. + +!!! compat "Julia 1.11" + + Older versions of Julia could not capture types in all cases. In older versions of + Julia, if you see an allocation of type `Profile.Allocs.UnknownType`, it means that + the profiler doesn't know what type of object was allocated. This mainly happened when + the allocation was coming from generated code produced by the compiler.
See + [issue #43688](https://github.com/JuliaLang/julia/issues/43688) for more info. + + Since Julia 1.11, all allocations should have a type reported. + +For more details on how to use this tool, please see the following talk from JuliaCon 2022: +https://www.youtube.com/watch?v=BFvpwC8hEWQ + +##### Allocation Profiler Example + +In this simple example, we use PProf to visualize the alloc profile. You could use another +visualization tool instead. We collect the profile (specifying a sample rate), then we visualize it. +```julia +using Profile, PProf +Profile.Allocs.clear() +Profile.Allocs.@profile sample_rate=0.0001 my_function() +PProf.Allocs.pprof() +``` + +Here is a more in-depth example, showing how we can tune the sample rate. A +good number of samples to aim for is around 1 to 10 thousand. Too many, and the +profile visualizer can get overwhelmed, and profiling will be slow. Too few, +and you don't have a representative sample. + + +```julia-repl +julia> import Profile + +julia> @time my_function() # Estimate allocations from a second run of the function + 0.110018 seconds (1.50 M allocations: 58.725 MiB, 17.17% gc time) +500000 + +julia> Profile.Allocs.clear() + +julia> Profile.Allocs.@profile sample_rate=0.001 begin # 1.5 M * 0.001 = ~1.5K allocs. + my_function() + end +500000 + +julia> prof = Profile.Allocs.fetch(); # If you want, you can also manually inspect the results. + +julia> length(prof.allocs) # Confirm we have the expected number of allocations. +1515 + +julia> using PProf # Now, visualize with an external tool, like PProf or ProfileCanvas. + +julia> PProf.Allocs.pprof(prof; from_c=false) # You can optionally pass in a previously fetched profile result. +Analyzing 1515 allocation samples... 100%|████████████████████████████████| Time: 0:00:00 +Main binary filename not available. +Serving web UI on http://localhost:62261 +"alloc-profile.pb.gz" +``` +Then you can view the profile by navigating to http://localhost:62261, and the profile is saved to disk. +See the PProf package for more options. + +##### Allocation Profiling Tips + +As stated above, aim for around 1-10 thousand samples in your profile. + +Note that we are uniformly sampling in the space of _all allocations_, and are not weighting +our samples by the size of the allocation. So a given allocation profile may not give a +representative profile of where most bytes are allocated in your program, unless you had set +`sample_rate=1`. + +Allocations can come from users directly constructing objects, but can also come from inside +the runtime or be inserted into compiled code to handle type instability. Looking at the +"source code" view can be helpful to isolate them, and then other external tools such as +[`Cthulhu.jl`](https://github.com/JuliaDebug/Cthulhu.jl) can be useful for identifying the +cause of the allocation. + +##### Allocation Profile Visualization Tools + +There are several profiling visualization tools now that can all display Allocation +Profiles. Here is a small list of some of the main ones we know about: +- [PProf.jl](https://github.com/JuliaPerf/PProf.jl) +- [ProfileCanvas.jl](https://github.com/pfitzseb/ProfileCanvas.jl) +- VSCode's built-in profile visualizer (`@profview_allocs`) [docs needed] +- Viewing the results directly in the REPL + - You can inspect the results in the REPL via [`Profile.Allocs.fetch()`](@ref), to view + the stacktrace and type of each allocation, as in the sketch below.
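+
+For example, here is a minimal sketch of inspecting a fetched profile in the REPL,
+tallying the sampled allocations by type (it assumes each `Profile.Allocs.Alloc`
+carries `type`, `size`, and `stacktrace` fields, per the `Profile.Allocs` docs):
+
+```julia
+using Profile
+
+prof = Profile.Allocs.fetch()
+# count how many sampled allocations there were of each type
+counts = Dict{Type,Int}()
+for a in prof.allocs
+    counts[a.type] = get(counts, a.type, 0) + 1
+end
+# show the most frequently allocated types first
+sort(collect(counts); by = last, rev = true)
+```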
+ +#### Line-by-Line Allocation Tracking + +An alternative way to measure allocations is to start Julia with the `--track-allocation=<setting>` command-line option, for which you can choose `none` (the default, do not measure allocation), `user` (measure memory allocation everywhere except Julia's core code), or `all` (measure memory allocation at each line of Julia code). Allocation gets measured for each line of compiled code. When you quit diff --git a/src/Makefile b/src/Makefile index a6939f629412f..2a66e3a0dccac 100644 --- a/src/Makefile +++ b/src/Makefile @@ -43,8 +43,8 @@ endif SRCS := \ jltypes gf typemap smallintset ast builtins module interpreter symbol \ dlload sys init task array staticdata toplevel jl_uv datatype \ - simplevector runtime_intrinsics precompile jloptions \ - threading partr stackwalk gc gc-debug gc-pages gc-stacks gc-alloc-profiler method \ + simplevector runtime_intrinsics precompile jloptions mtarraylist \ + threading partr stackwalk gc gc-debug gc-pages gc-stacks gc-alloc-profiler gc-page-profiler method \ jlapi signal-handling safepoint timing subtype rtutils gc-heap-snapshot \ crc32c APInt-C processor ircode opaque_closure codegen-stubs coverage runtime_ccall @@ -99,7 +99,7 @@ ifeq ($(USE_SYSTEM_LIBUV),0) UV_HEADERS += uv.h UV_HEADERS += uv/*.h endif -PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) +PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) ifeq ($(OS),WINNT) PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,win32_ucontext.h) endif @@ -353,7 +353,7 @@ $(BUILDDIR)/julia_version.h: $(JULIAHOME)/VERSION @echo "#ifndef JL_VERSION_H" >> $@.$(JULIA_BUILD_MODE).tmp @echo "#define JL_VERSION_H" >> $@.$(JULIA_BUILD_MODE).tmp @echo "#define JULIA_VERSION_STRING" \"$(JULIA_VERSION)\" >> $@.$(JULIA_BUILD_MODE).tmp - @echo $(JULIA_VERSION) | awk 'BEGIN {FS="[.,-]"} \ + @echo $(JULIA_VERSION) | awk 'BEGIN {FS="[.,+-]"} \ {print "#define JULIA_VERSION_MAJOR " $$1 "\n" \ "#define JULIA_VERSION_MINOR " $$2 "\n" \ "#define JULIA_VERSION_PATCH " $$3 ; \ diff --git a/src/cgutils.cpp b/src/cgutils.cpp index c091111f31617..8ce84acb30901 100644 --- a/src/cgutils.cpp +++ b/src/cgutils.cpp @@ -3933,7 +3933,6 @@ static Value *emit_defer_signal(jl_codectx_t &ctx) return ctx.builder.CreateInBoundsGEP(ctx.types().T_sigatomic, ptls, ArrayRef(offset), "jl_defer_signal"); } - #ifndef JL_NDEBUG static int compare_cgparams(const jl_cgparams_t *a, const jl_cgparams_t *b) { diff --git a/src/codegen.cpp b/src/codegen.cpp index 6bd0b0d16865a..a4773acb3fbea 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -1279,6 +1279,7 @@ extern "C" { 1, #endif (int) DICompileUnit::DebugEmissionKind::FullDebug, + 1, jl_rettype_inferred, NULL }; } @@ -7805,7 +7806,11 @@ static jl_llvm_functions_t ctx.builder.CreateAlignedStore(load_world, world_age_field, Align(sizeof(size_t))); } - // step 11b. Do codegen in control flow order + // step 11b. Emit the entry safepoint + if (JL_FEAT_TEST(ctx, safepoint_on_entry)) + emit_gc_safepoint(ctx.builder, get_current_ptls(ctx), ctx.tbaa().tbaa_const); + + // step 11c.
Do codegen in control flow order std::vector workstack; std::map BB; std::map come_from_bb; diff --git a/src/codegen_shared.h b/src/codegen_shared.h index 329cc567e8c5f..e0edb792d7645 100644 --- a/src/codegen_shared.h +++ b/src/codegen_shared.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -233,20 +234,39 @@ static inline void emit_signal_fence(llvm::IRBuilder<> &builder) builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SyncScope::SingleThread); } -static inline void emit_gc_safepoint(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::MDNode *tbaa) +static inline void emit_gc_safepoint(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::MDNode *tbaa, bool final = false) { + using namespace llvm; + llvm::Value *signal_page = get_current_signal_page_from_ptls(builder, ptls, tbaa); emit_signal_fence(builder); - builder.CreateLoad(getSizeTy(builder.getContext()), get_current_signal_page_from_ptls(builder, ptls, tbaa), true); + Module *M = builder.GetInsertBlock()->getModule(); + LLVMContext &C = builder.getContext(); + // inline jlsafepoint_func->realize(M) + if (final) { + auto T_size = getSizeTy(builder.getContext()); + builder.CreateLoad(T_size, signal_page, true); + } + else { + Function *F = M->getFunction("julia.safepoint"); + if (!F) { + auto T_size = getSizeTy(builder.getContext()); + auto T_psize = T_size->getPointerTo(); + FunctionType *FT = FunctionType::get(Type::getVoidTy(C), {T_psize}, false); + F = Function::Create(FT, Function::ExternalLinkage, "julia.safepoint", M); + F->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + } + builder.CreateCall(F, {signal_page}); + } emit_signal_fence(builder); } -static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state, llvm::Value *old_state) +static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state, llvm::Value *old_state, bool final) { using namespace llvm; Type *T_int8 = state->getType(); - ptls = emit_bitcast_with_builder(builder, ptls, builder.getInt8PtrTy()); + llvm::Value *ptls_i8 = emit_bitcast_with_builder(builder, ptls, builder.getInt8PtrTy()); Constant *offset = ConstantInt::getSigned(builder.getInt32Ty(), offsetof(jl_tls_states_t, gc_state)); - Value *gc_state = builder.CreateInBoundsGEP(T_int8, ptls, ArrayRef(offset), "gc_state"); + Value *gc_state = builder.CreateInBoundsGEP(T_int8, ptls_i8, ArrayRef(offset), "gc_state"); if (old_state == nullptr) { old_state = builder.CreateLoad(T_int8, gc_state); cast(old_state)->setOrdering(AtomicOrdering::Monotonic); @@ -266,38 +286,38 @@ static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::V passBB, exitBB); builder.SetInsertPoint(passBB); MDNode *tbaa = get_tbaa_const(builder.getContext()); - emit_gc_safepoint(builder, ptls, tbaa); + emit_gc_safepoint(builder, ptls, tbaa, final); builder.CreateBr(exitBB); builder.SetInsertPoint(exitBB); return old_state; } -static inline llvm::Value *emit_gc_unsafe_enter(llvm::IRBuilder<> &builder, llvm::Value *ptls) +static inline llvm::Value *emit_gc_unsafe_enter(llvm::IRBuilder<> &builder, llvm::Value *ptls, bool final) { using namespace llvm; Value *state = builder.getInt8(0); - return emit_gc_state_set(builder, ptls, state, nullptr); + return emit_gc_state_set(builder, ptls, state, nullptr, final); } -static inline llvm::Value *emit_gc_unsafe_leave(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state) +static inline llvm::Value 
*emit_gc_unsafe_leave(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state, bool final) { using namespace llvm; Value *old_state = builder.getInt8(0); - return emit_gc_state_set(builder, ptls, state, old_state); + return emit_gc_state_set(builder, ptls, state, old_state, final); } -static inline llvm::Value *emit_gc_safe_enter(llvm::IRBuilder<> &builder, llvm::Value *ptls) +static inline llvm::Value *emit_gc_safe_enter(llvm::IRBuilder<> &builder, llvm::Value *ptls, bool final) { using namespace llvm; Value *state = builder.getInt8(JL_GC_STATE_SAFE); - return emit_gc_state_set(builder, ptls, state, nullptr); + return emit_gc_state_set(builder, ptls, state, nullptr, final); } -static inline llvm::Value *emit_gc_safe_leave(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state) +static inline llvm::Value *emit_gc_safe_leave(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state, bool final) { using namespace llvm; Value *old_state = builder.getInt8(JL_GC_STATE_SAFE); - return emit_gc_state_set(builder, ptls, state, old_state); + return emit_gc_state_set(builder, ptls, state, old_state, final); } // Compatibility shims for LLVM attribute APIs that were renamed in LLVM 14. diff --git a/src/gc-alloc-profiler.h b/src/gc-alloc-profiler.h index 3fd8bf4388a0a..fcd8e45caa2d8 100644 --- a/src/gc-alloc-profiler.h +++ b/src/gc-alloc-profiler.h @@ -35,6 +35,7 @@ void _maybe_record_alloc_to_profile(jl_value_t *val, size_t size, jl_datatype_t extern int g_alloc_profile_enabled; +// This should only be used from _deprecated_ code paths. We shouldn't see UNKNOWN anymore. #define jl_gc_unknown_type_tag ((jl_datatype_t*)0xdeadaa03) static inline void maybe_record_alloc_to_profile(jl_value_t *val, size_t size, jl_datatype_t *typ) JL_NOTSAFEPOINT { diff --git a/src/gc-debug.c b/src/gc-debug.c index 3f60ca17e0dc4..78768d5802824 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -27,19 +27,16 @@ jl_gc_pagemeta_t *jl_gc_page_metadata(void *data) // the end of the page. 
JL_DLLEXPORT jl_taggedvalue_t *jl_gc_find_taggedvalue_pool(char *p, size_t *osize_p) { - if (!page_metadata(p)) + if (!gc_alloc_map_is_set(p)) // Not in the pool return NULL; - struct jl_gc_metadata_ext info = page_metadata_ext(p); + jl_gc_pagemeta_t *meta = page_metadata(p); char *page_begin = gc_page_data(p) + GC_PAGE_OFFSET; // In the page header if (p < page_begin) return NULL; size_t ofs = p - page_begin; - // Check if this is a free page - if (!(info.pagetable0->allocmap[info.pagetable0_i32] & (uint32_t)(1 << info.pagetable0_i))) - return NULL; - int osize = info.meta->osize; + int osize = meta->osize; // Shouldn't be needed, just in case if (osize == 0) return NULL; @@ -111,44 +108,14 @@ static void gc_clear_mark_page(jl_gc_pagemeta_t *pg, int bits) } } -static void gc_clear_mark_pagetable0(pagetable0_t *pagetable0, int bits) -{ - for (int pg_i = 0; pg_i < REGION0_PG_COUNT / 32; pg_i++) { - uint32_t line = pagetable0->allocmap[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - gc_clear_mark_page(pagetable0->meta[pg_i * 32 + j], bits); - } - } - } - } -} - -static void gc_clear_mark_pagetable1(pagetable1_t *pagetable1, int bits) -{ - for (int pg_i = 0; pg_i < REGION1_PG_COUNT / 32; pg_i++) { - uint32_t line = pagetable1->allocmap0[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - gc_clear_mark_pagetable0(pagetable1->meta0[pg_i * 32 + j], bits); - } - } - } - } -} - -static void gc_clear_mark_pagetable(int bits) +static void gc_clear_mark_outer(int bits) { - for (int pg_i = 0; pg_i < (REGION2_PG_COUNT + 31) / 32; pg_i++) { - uint32_t line = memory_map.allocmap1[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - gc_clear_mark_pagetable1(memory_map.meta1[pg_i * 32 + j], bits); - } - } + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom); + while (pg != NULL) { + gc_clear_mark_page(pg, bits); + pg = pg->next; } } } @@ -184,7 +151,7 @@ static void clear_mark(int bits) v = v->next; } - gc_clear_mark_pagetable(bits); + gc_clear_mark_outer(bits); } static void restore(void) @@ -198,21 +165,32 @@ static void restore(void) static void gc_verify_track(jl_ptls_t ptls) { - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; + // `gc_verify_track` is limited to single-threaded GC + if (jl_n_gcthreads != 0) + return; do { - jl_gc_mark_sp_t sp; - gc_mark_sp_init(gc_cache, &sp); + jl_gc_markqueue_t mq; + jl_gc_markqueue_t *mq2 = &ptls->mark_queue; + ws_queue_t *cq = &mq.chunk_queue; + ws_queue_t *q = &mq.ptr_queue; + jl_atomic_store_relaxed(&cq->top, 0); + jl_atomic_store_relaxed(&cq->bottom, 0); + jl_atomic_store_relaxed(&cq->array, jl_atomic_load_relaxed(&mq2->chunk_queue.array)); + jl_atomic_store_relaxed(&q->top, 0); + jl_atomic_store_relaxed(&q->bottom, 0); + jl_atomic_store_relaxed(&q->array, jl_atomic_load_relaxed(&mq2->ptr_queue.array)); + arraylist_new(&mq.reclaim_set, 32); arraylist_push(&lostval_parents_done, lostval); jl_safe_printf("Now looking for %p =======\n", lostval); clear_mark(GC_CLEAN); - gc_mark_queue_all_roots(ptls, &sp); - gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - for (int i = 0; i < gc_n_threads; i++) { + gc_mark_queue_all_roots(ptls, &mq); + gc_mark_finlist(&mq, &to_finalize, 0); + for (int i = 0; i < gc_n_threads;i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); + gc_mark_finlist(&mq, 
&ptls2->finalizers, 0); } - gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); - gc_mark_loop(ptls, sp); + gc_mark_finlist(&mq, &finalizer_list_marked, 0); + gc_mark_loop_serial_(ptls, &mq); if (lostval_parents.len == 0) { jl_safe_printf("Could not find the missing link. We missed a toplevel root. This is odd.\n"); break; @@ -246,22 +224,35 @@ static void gc_verify_track(jl_ptls_t ptls) void gc_verify(jl_ptls_t ptls) { - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; - jl_gc_mark_sp_t sp; - gc_mark_sp_init(gc_cache, &sp); + // `gc_verify` is limited to single-threaded GC + if (jl_n_gcthreads != 0) { + jl_safe_printf("Warn. GC verify disabled in multi-threaded GC\n"); + return; + } + jl_gc_markqueue_t mq; + jl_gc_markqueue_t *mq2 = &ptls->mark_queue; + ws_queue_t *cq = &mq.chunk_queue; + ws_queue_t *q = &mq.ptr_queue; + jl_atomic_store_relaxed(&cq->top, 0); + jl_atomic_store_relaxed(&cq->bottom, 0); + jl_atomic_store_relaxed(&cq->array, jl_atomic_load_relaxed(&mq2->chunk_queue.array)); + jl_atomic_store_relaxed(&q->top, 0); + jl_atomic_store_relaxed(&q->bottom, 0); + jl_atomic_store_relaxed(&q->array, jl_atomic_load_relaxed(&mq2->ptr_queue.array)); + arraylist_new(&mq.reclaim_set, 32); lostval = NULL; lostval_parents.len = 0; lostval_parents_done.len = 0; clear_mark(GC_CLEAN); gc_verifying = 1; - gc_mark_queue_all_roots(ptls, &sp); - gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - for (int i = 0; i < gc_n_threads; i++) { + gc_mark_queue_all_roots(ptls, &mq); + gc_mark_finlist(&mq, &to_finalize, 0); + for (int i = 0; i < gc_n_threads;i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); + gc_mark_finlist(&mq, &ptls2->finalizers, 0); } - gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); - gc_mark_loop(ptls, sp); + gc_mark_finlist(&mq, &finalizer_list_marked, 0); + gc_mark_loop_serial_(ptls, &mq); int clean_len = bits_save[GC_CLEAN].len; for(int i = 0; i < clean_len + bits_save[GC_OLD].len; i++) { jl_taggedvalue_t *v = (jl_taggedvalue_t*)bits_save[i >= clean_len ? GC_OLD : GC_CLEAN].items[i >= clean_len ? 
i - clean_len : i]; @@ -500,7 +491,7 @@ int jl_gc_debug_check_other(void) return gc_debug_alloc_check(&jl_gc_debug_env.other); } -void jl_gc_debug_print_status(void) +void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT { uint64_t pool_count = jl_gc_debug_env.pool.num; uint64_t other_count = jl_gc_debug_env.other.num; @@ -509,7 +500,7 @@ void jl_gc_debug_print_status(void) pool_count + other_count, pool_count, other_count, gc_num.pause); } -void jl_gc_debug_critical_error(void) +void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT { jl_gc_debug_print_status(); if (!jl_gc_debug_env.wait_for_debugger) @@ -537,7 +528,6 @@ void gc_scrub_record_task(jl_task_t *t) JL_NO_ASAN static void gc_scrub_range(char *low, char *high) { - jl_ptls_t ptls = jl_current_task->ptls; jl_jmp_buf *old_buf = jl_get_safe_restore(); jl_jmp_buf buf; if (jl_setjmp(buf, 0)) { @@ -556,14 +546,6 @@ JL_NO_ASAN static void gc_scrub_range(char *low, char *high) // Make sure the sweep rebuild the freelist pg->has_marked = 1; pg->has_young = 1; - // Find the age bit - char *page_begin = gc_page_data(tag) + GC_PAGE_OFFSET; - int obj_id = (((char*)tag) - page_begin) / osize; - uint8_t *ages = pg->ages + obj_id / 8; - // Force this to be a young object to save some memory - // (especially on 32bit where it's more likely to have pointer-like - // bit patterns) - *ages &= ~(1 << (obj_id % 8)); memset(tag, 0xff, osize); // set mark to GC_MARKED (young and marked) tag->bits.gc = GC_MARKED; @@ -762,45 +744,37 @@ void gc_final_pause_end(int64_t t0, int64_t tend) static void gc_stats_pagetable0(pagetable0_t *pagetable0, unsigned *p0) { - for (int pg_i = 0; pg_i < REGION0_PG_COUNT / 32; pg_i++) { - uint32_t line = pagetable0->allocmap[pg_i] | pagetable0->freemap[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - (*p0)++; - } - } + for (int pg_i = 0; pg_i < REGION0_PG_COUNT; pg_i++) { + uint8_t meta = pagetable0->meta[pg_i]; + assert(meta == GC_PAGE_UNMAPPED || meta == GC_PAGE_ALLOCATED || + meta == GC_PAGE_LAZILY_FREED || meta == GC_PAGE_FREED); + if (meta != GC_PAGE_UNMAPPED) { + (*p0)++; } } } static void gc_stats_pagetable1(pagetable1_t *pagetable1, unsigned *p1, unsigned *p0) { - for (int pg_i = 0; pg_i < REGION1_PG_COUNT / 32; pg_i++) { - uint32_t line = pagetable1->allocmap0[pg_i] | pagetable1->freemap0[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - (*p1)++; - gc_stats_pagetable0(pagetable1->meta0[pg_i * 32 + j], p0); - } - } + for (int pg_i = 0; pg_i < REGION1_PG_COUNT; pg_i++) { + pagetable0_t *pagetable0 = pagetable1->meta0[pg_i]; + if (pagetable0 == NULL) { + continue; } + (*p1)++; + gc_stats_pagetable0(pagetable0, p0); } } static void gc_stats_pagetable(unsigned *p2, unsigned *p1, unsigned *p0) { - for (int pg_i = 0; pg_i < (REGION2_PG_COUNT + 31) / 32; pg_i++) { - uint32_t line = memory_map.allocmap1[pg_i] | memory_map.freemap1[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - (*p2)++; - gc_stats_pagetable1(memory_map.meta1[pg_i * 32 + j], p1, p0); - } - } + for (int pg_i = 0; pg_i < REGION2_PG_COUNT; pg_i++) { + pagetable1_t *pagetable1 = alloc_map.meta1[pg_i]; + if (pagetable1 == NULL) { + continue; } + (*p2)++; + gc_stats_pagetable1(pagetable1, p1, p0); } } @@ -809,7 +783,7 @@ void jl_print_gc_stats(JL_STREAM *s) #ifdef _OS_LINUX_ malloc_stats(); #endif - double ptime = jl_clock_now() - process_t0; + double ptime = jl_hrtime() - process_t0; jl_safe_printf("exec time\t%.5f sec\n", ptime); if (gc_num.pause > 0) { jl_safe_printf("gc 
time \t%.5f sec (%2.1f%%) in %d (%d full) collections\n", @@ -1030,7 +1004,7 @@ void jl_gc_debug_init(void) #endif #ifdef GC_FINAL_STATS - process_t0 = jl_clock_now(); + process_t0 = jl_hrtime(); #endif } @@ -1152,7 +1126,7 @@ void gc_stats_big_obj(void) static int64_t poolobj_sizes[4]; static int64_t empty_pages; -static void gc_count_pool_page(jl_gc_pagemeta_t *pg) +static void gc_count_pool_page(jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT { int osize = pg->osize; char *data = pg->data; @@ -1171,44 +1145,16 @@ static void gc_count_pool_page(jl_gc_pagemeta_t *pg) } } -static void gc_count_pool_pagetable0(pagetable0_t *pagetable0) -{ - for (int pg_i = 0; pg_i < REGION0_PG_COUNT / 32; pg_i++) { - uint32_t line = pagetable0->allocmap[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - gc_count_pool_page(pagetable0->meta[pg_i * 32 + j]); - } - } - } - } -} - -static void gc_count_pool_pagetable1(pagetable1_t *pagetable1) -{ - for (int pg_i = 0; pg_i < REGION1_PG_COUNT / 32; pg_i++) { - uint32_t line = pagetable1->allocmap0[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - gc_count_pool_pagetable0(pagetable1->meta0[pg_i * 32 + j]); - } - } - } - } -} - static void gc_count_pool_pagetable(void) { - for (int pg_i = 0; pg_i < (REGION2_PG_COUNT + 31) / 32; pg_i++) { - uint32_t line = memory_map.allocmap1[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - gc_count_pool_pagetable1(memory_map.meta1[pg_i * 32 + j]); - } + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom); + while (pg != NULL) { + if (gc_alloc_map_is_set(pg->data)) { + gc_count_pool_page(pg); } + pg = pg->next; } } } @@ -1264,139 +1210,6 @@ int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT return (slot - start) / elsize; } -// Print a backtrace from the bottom (start) of the mark stack up to `sp` -// `pc_offset` will be added to `sp` for convenience in the debugger. -NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_mark_sp_t sp, int pc_offset) -{ - jl_jmp_buf *old_buf = jl_get_safe_restore(); - jl_jmp_buf buf; - jl_set_safe_restore(&buf); - if (jl_setjmp(buf, 0) != 0) { - jl_safe_printf("\n!!! ERROR when unwinding gc mark loop -- ABORTING !!!\n"); - jl_set_safe_restore(old_buf); - return; - } - void **top = sp.pc + pc_offset; - jl_gc_mark_data_t *data_top = sp.data; - sp.data = ptls->gc_cache.data_stack; - sp.pc = ptls->gc_cache.pc_stack; - int isroot = 1; - while (sp.pc < top) { - void *pc = *sp.pc; - const char *prefix = isroot ? 
"r--" : " `-"; - isroot = 0; - if (pc == gc_mark_label_addrs[GC_MARK_L_marked_obj]) { - gc_mark_marked_obj_t *data = gc_repush_markdata(&sp, gc_mark_marked_obj_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: Root object: %p :: %p (bits: %d)\n of type ", - (void*)data, (void*)data->obj, (void*)data->tag, (int)data->bits); - jl_((void*)data->tag); - isroot = 1; - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_scan_only]) { - gc_mark_marked_obj_t *data = gc_repush_markdata(&sp, gc_mark_marked_obj_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: Queued root: %p :: %p (bits: %d)\n of type ", - (void*)data, (void*)data->obj, (void*)data->tag, (int)data->bits); - jl_((void*)data->tag); - isroot = 1; - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_finlist]) { - gc_mark_finlist_t *data = gc_repush_markdata(&sp, gc_mark_finlist_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: Finalizer list from %p to %p\n", - (void*)data, (void*)data->begin, (void*)data->end); - isroot = 1; - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_objarray]) { - gc_mark_objarray_t *data = gc_repush_markdata(&sp, gc_mark_objarray_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: %s Array in object %p :: %p -- [%p, %p)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (void*)data->begin, (void*)data->end); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_obj8]) { - gc_mark_obj8_t *data = gc_repush_markdata(&sp, gc_mark_obj8_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); - uint8_t *desc = (uint8_t*)jl_dt_layout_ptrs(vt->layout); - jl_safe_printf("%p: %s Object (8bit) %p :: %p -- [%d, %d)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (int)(data->begin - desc), (int)(data->end - desc)); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_obj16]) { - gc_mark_obj16_t *data = gc_repush_markdata(&sp, gc_mark_obj16_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); - uint16_t *desc = (uint16_t*)jl_dt_layout_ptrs(vt->layout); - jl_safe_printf("%p: %s Object (16bit) %p :: %p -- [%d, %d)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (int)(data->begin - desc), (int)(data->end - desc)); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_obj32]) { - gc_mark_obj32_t *data = gc_repush_markdata(&sp, gc_mark_obj32_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); - uint32_t *desc = (uint32_t*)jl_dt_layout_ptrs(vt->layout); - jl_safe_printf("%p: %s Object (32bit) %p :: %p -- [%d, %d)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (int)(data->begin - desc), 
(int)(data->end - desc)); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_stack]) { - gc_mark_stackframe_t *data = gc_repush_markdata(&sp, gc_mark_stackframe_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: %s Stack frame %p -- %d of %d (%s)\n", - (void*)data, prefix, (void*)data->s, (int)data->i, - (int)data->nroots >> 1, - (data->nroots & 1) ? "indirect" : "direct"); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_module_binding]) { - // module_binding - gc_mark_binding_t *data = gc_repush_markdata(&sp, gc_mark_binding_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: %s Module (bindings) %p (bits %d) -- [%p, %p)\n", - (void*)data, prefix, (void*)data->parent, (int)data->bits, - (void*)data->begin, (void*)data->end); - } - else { - jl_safe_printf("Unknown pc %p --- ABORTING !!!\n", pc); - break; - } - } - jl_set_safe_restore(old_buf); -} - static int gc_logging_enabled = 0; JL_DLLEXPORT void jl_enable_gc_logging(int enable) { diff --git a/src/gc-page-profiler.c b/src/gc-page-profiler.c new file mode 100644 index 0000000000000..5af1c3d014770 --- /dev/null +++ b/src/gc-page-profiler.c @@ -0,0 +1,167 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#include "gc-page-profiler.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// whether page profiling is enabled +int page_profile_enabled; +// number of pages written +size_t page_profile_pages_written; +// stream to write page profile to +ios_t *page_profile_stream; +// mutex for page profile +uv_mutex_t page_profile_lock; + +gc_page_profiler_serializer_t gc_page_serializer_create(void) JL_NOTSAFEPOINT +{ + gc_page_profiler_serializer_t serializer; + if (__unlikely(page_profile_enabled)) { + arraylist_new(&serializer.typestrs, GC_PAGE_SZ); + } + else { + serializer.typestrs.len = 0; + } + return serializer; +} + +void gc_page_serializer_init(gc_page_profiler_serializer_t *serializer, + jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT +{ + if (__unlikely(page_profile_enabled)) { + serializer->typestrs.len = 0; + serializer->data = (char *)pg->data; + serializer->osize = pg->osize; + } +} + +void gc_page_serializer_destroy(gc_page_profiler_serializer_t *serializer) JL_NOTSAFEPOINT +{ + if (__unlikely(page_profile_enabled)) { + arraylist_free(&serializer->typestrs); + } +} + +void gc_page_serializer_write(gc_page_profiler_serializer_t *serializer, + const char *str) JL_NOTSAFEPOINT +{ + if (__unlikely(page_profile_enabled)) { + arraylist_push(&serializer->typestrs, (void *)str); + } +} + +void gc_enable_page_profile(void) JL_NOTSAFEPOINT +{ + page_profile_enabled = 1; +} + +void gc_disable_page_profile(void) JL_NOTSAFEPOINT +{ + page_profile_enabled = 0; +} + +int gc_page_profile_is_enabled(void) JL_NOTSAFEPOINT +{ + return page_profile_enabled; +} + +void gc_page_profile_write_preamble(gc_page_profiler_serializer_t *serializer) + JL_NOTSAFEPOINT +{ + if (__unlikely(page_profile_enabled)) { + char str[GC_TYPE_STR_MAXLEN]; + snprintf(str, GC_TYPE_STR_MAXLEN, + "{\"address\": \"%p\",\"object_size\": %d,\"objects\": [", + serializer->data, serializer->osize); + ios_write(page_profile_stream, str, strlen(str)); + } +} + +void gc_page_profile_write_epilogue(gc_page_profiler_serializer_t *serializer) + JL_NOTSAFEPOINT +{ + if (__unlikely(page_profile_enabled)) { + const char *str = "]}"; + 
ios_write(page_profile_stream, str, strlen(str)); + } +} + +void gc_page_profile_write_comma(gc_page_profiler_serializer_t *serializer) JL_NOTSAFEPOINT +{ + if (__unlikely(page_profile_enabled)) { + // write comma if not first page + if (page_profile_pages_written > 0) { + const char *str = ","; + ios_write(page_profile_stream, str, strlen(str)); + } + } +} + +void gc_page_profile_write_to_file(gc_page_profiler_serializer_t *serializer) + JL_NOTSAFEPOINT +{ + if (__unlikely(page_profile_enabled)) { + // write to file + uv_mutex_lock(&page_profile_lock); + gc_page_profile_write_comma(serializer); + gc_page_profile_write_preamble(serializer); + char str[GC_TYPE_STR_MAXLEN]; + for (size_t i = 0; i < serializer->typestrs.len; i++) { + const char *name = (const char *)serializer->typestrs.items[i]; + if (name == GC_SERIALIZER_EMPTY) { + snprintf(str, GC_TYPE_STR_MAXLEN, "\"empty\","); + } + else if (name == GC_SERIALIZER_GARBAGE) { + snprintf(str, GC_TYPE_STR_MAXLEN, "\"garbage\","); + } + else { + snprintf(str, GC_TYPE_STR_MAXLEN, "\"%s\",", name); + } + // remove trailing comma for last element + if (i == serializer->typestrs.len - 1) { + str[strlen(str) - 1] = '\0'; + } + ios_write(page_profile_stream, str, strlen(str)); + } + gc_page_profile_write_epilogue(serializer); + page_profile_pages_written++; + uv_mutex_unlock(&page_profile_lock); + } +} + +void gc_page_profile_write_json_preamble(ios_t *stream) JL_NOTSAFEPOINT +{ + if (__unlikely(page_profile_enabled)) { + uv_mutex_lock(&page_profile_lock); + const char *str = "{\"pages\": ["; + ios_write(stream, str, strlen(str)); + uv_mutex_unlock(&page_profile_lock); + } +} + +void gc_page_profile_write_json_epilogue(ios_t *stream) JL_NOTSAFEPOINT +{ + if (__unlikely(page_profile_enabled)) { + uv_mutex_lock(&page_profile_lock); + const char *str = "]}"; + ios_write(stream, str, strlen(str)); + uv_mutex_unlock(&page_profile_lock); + } +} + +JL_DLLEXPORT void jl_gc_take_page_profile(ios_t *stream) +{ + gc_enable_page_profile(); + page_profile_pages_written = 0; + page_profile_stream = stream; + gc_page_profile_write_json_preamble(stream); + jl_gc_collect(JL_GC_FULL); + gc_page_profile_write_json_epilogue(stream); + gc_disable_page_profile(); +} + +#ifdef __cplusplus +} +#endif diff --git a/src/gc-page-profiler.h b/src/gc-page-profiler.h new file mode 100644 index 0000000000000..b103e23905ba5 --- /dev/null +++ b/src/gc-page-profiler.h @@ -0,0 +1,63 @@ +// This file is a part of Julia. 
License is MIT: https://julialang.org/license + +#ifndef GC_PAGE_PROFILER_H +#define GC_PAGE_PROFILER_H + +#include "gc.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GC_TYPE_STR_MAXLEN (512) + +typedef struct { + arraylist_t typestrs; + char *data; + int osize; +} gc_page_profiler_serializer_t; + +// mutex for page profile +extern uv_mutex_t page_profile_lock; + +// Serializer functions +gc_page_profiler_serializer_t gc_page_serializer_create(void) JL_NOTSAFEPOINT; +void gc_page_serializer_init(gc_page_profiler_serializer_t *serializer, jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT; +void gc_page_serializer_destroy(gc_page_profiler_serializer_t *serializer) JL_NOTSAFEPOINT; +void gc_page_serializer_write(gc_page_profiler_serializer_t *serializer, const char *str) JL_NOTSAFEPOINT; +// Page profile functions +#define GC_SERIALIZER_EMPTY ((const char *)0x1) +#define GC_SERIALIZER_GARBAGE ((const char *)0x2) +STATIC_INLINE void gc_page_profile_write_empty_page(gc_page_profiler_serializer_t *serializer, + int enabled) JL_NOTSAFEPOINT +{ + if (__unlikely(enabled)) { + gc_page_serializer_write(serializer, GC_SERIALIZER_EMPTY); + } +} +STATIC_INLINE void gc_page_profile_write_garbage(gc_page_profiler_serializer_t *serializer, + int enabled) JL_NOTSAFEPOINT +{ + if (__unlikely(enabled)) { + gc_page_serializer_write(serializer, GC_SERIALIZER_GARBAGE); + } +} +STATIC_INLINE void gc_page_profile_write_live_obj(gc_page_profiler_serializer_t *serializer, + jl_taggedvalue_t *v, + int enabled) JL_NOTSAFEPOINT +{ + if (__unlikely(enabled)) { + const char *name = jl_typeof_str(jl_valueof(v)); + gc_page_serializer_write(serializer, name); + } +} +void gc_enable_page_profile(void) JL_NOTSAFEPOINT; +void gc_disable_page_profile(void) JL_NOTSAFEPOINT; +int gc_page_profile_is_enabled(void) JL_NOTSAFEPOINT; +void gc_page_profile_write_to_file(gc_page_profiler_serializer_t *serializer) JL_NOTSAFEPOINT; + +#ifdef __cplusplus +} +#endif + +#endif // GC_PAGE_PROFILER_H diff --git a/src/gc-pages.c b/src/gc-pages.c index d579eb0cd4fbb..f015b5de2295e 100644 --- a/src/gc-pages.c +++ b/src/gc-pages.c @@ -19,7 +19,31 @@ extern "C" { #define MIN_BLOCK_PG_ALLOC (1) // 16 KB static int block_pg_cnt = DEFAULT_BLOCK_PG_ALLOC; -static size_t current_pg_count = 0; +static _Atomic(size_t) current_pg_count = 0; + +// Julia allocates large blocks (64M) with mmap. These are never +// released back but the underlying physical memory may be released +// with calls to madvise(MADV_DONTNEED). +// These large blocks are used to allocated jl_page_size sized +// pages, that are tracked by current_pg_count. +static uint64_t poolmem_bytes_allocated = 0; +static uint64_t poolmem_blocks_allocated_total = 0; + + +JL_DLLEXPORT uint64_t jl_poolmem_blocks_allocated_total(void) +{ + return poolmem_blocks_allocated_total; +} + +JL_DLLEXPORT uint64_t jl_poolmem_bytes_allocated(void) +{ + return poolmem_bytes_allocated; +} + +JL_DLLEXPORT uint64_t jl_current_pg_count(void) +{ + return (uint64_t)jl_atomic_load(¤t_pg_count); +} void jl_gc_init_page(void) { @@ -33,7 +57,7 @@ void jl_gc_init_page(void) // Try to allocate a memory block for multiple pages // Return `NULL` if allocation failed. Result is aligned to `GC_PAGE_SZ`. 
-static char *jl_gc_try_alloc_pages(int pg_cnt) JL_NOTSAFEPOINT +char *jl_gc_try_alloc_pages_(int pg_cnt) JL_NOTSAFEPOINT { size_t pages_sz = GC_PAGE_SZ * pg_cnt; #ifdef _OS_WINDOWS_ @@ -48,6 +72,12 @@ static char *jl_gc_try_alloc_pages(int pg_cnt) JL_NOTSAFEPOINT MAP_NORESERVE | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) return NULL; + poolmem_bytes_allocated += pages_sz; + poolmem_blocks_allocated_total++; + +#ifdef MADV_NOHUGEPAGE + madvise(mem, pages_sz, MADV_NOHUGEPAGE); +#endif #endif if (GC_PAGE_SZ > jl_page_size) // round data pointer up to the nearest gc_page_data-aligned @@ -63,13 +93,12 @@ static char *jl_gc_try_alloc_pages(int pg_cnt) JL_NOTSAFEPOINT // smaller `MIN_BLOCK_PG_ALLOC` a `jl_memory_exception` is thrown. // Assumes `gc_perm_lock` is acquired, the lock is released before the // exception is thrown. -static jl_gc_pagemeta_t *jl_gc_alloc_new_page(void) JL_NOTSAFEPOINT +char *jl_gc_try_alloc_pages(void) JL_NOTSAFEPOINT { - // try to allocate a large block of memory (or a small one) - unsigned pg, pg_cnt = block_pg_cnt; + unsigned pg_cnt = block_pg_cnt; char *mem = NULL; while (1) { - if (__likely((mem = jl_gc_try_alloc_pages(pg_cnt)))) + if (__likely((mem = jl_gc_try_alloc_pages_(pg_cnt)))) break; size_t min_block_pg_alloc = MIN_BLOCK_PG_ALLOC; if (GC_PAGE_SZ * min_block_pg_alloc < jl_page_size) @@ -86,204 +115,71 @@ static jl_gc_pagemeta_t *jl_gc_alloc_new_page(void) JL_NOTSAFEPOINT jl_throw(jl_memory_exception); } } - - // now need to insert these pages into the pagetable metadata - // if any allocation fails, this just stops recording more pages from that point - // and will free (munmap) the remainder - jl_gc_pagemeta_t *page_meta = - (jl_gc_pagemeta_t*)jl_gc_perm_alloc_nolock(pg_cnt * sizeof(jl_gc_pagemeta_t), 1, - sizeof(void*), 0); - pg = 0; - if (page_meta) { - for (; pg < pg_cnt; pg++) { - struct jl_gc_metadata_ext info; - uint32_t msk; - unsigned i; - pagetable1_t **ppagetable1; - pagetable0_t **ppagetable0; - jl_gc_pagemeta_t **pmeta; - - char *ptr = mem + (GC_PAGE_SZ * pg); - page_meta[pg].data = ptr; - - // create & store the level 2 / outermost info - i = REGION_INDEX(ptr); - info.pagetable_i = i % 32; - info.pagetable_i32 = i / 32; - msk = (1u << info.pagetable_i); - if ((memory_map.freemap1[info.pagetable_i32] & msk) == 0) - memory_map.freemap1[info.pagetable_i32] |= msk; // has free - info.pagetable1 = *(ppagetable1 = &memory_map.meta1[i]); - if (!info.pagetable1) { - info.pagetable1 = (pagetable1_t*)jl_gc_perm_alloc_nolock(sizeof(pagetable1_t), 1, - sizeof(void*), 0); - *ppagetable1 = info.pagetable1; - if (!info.pagetable1) - break; - } - - // create & store the level 1 info - i = REGION1_INDEX(ptr); - info.pagetable1_i = i % 32; - info.pagetable1_i32 = i / 32; - msk = (1u << info.pagetable1_i); - if ((info.pagetable1->freemap0[info.pagetable1_i32] & msk) == 0) - info.pagetable1->freemap0[info.pagetable1_i32] |= msk; // has free - info.pagetable0 = *(ppagetable0 = &info.pagetable1->meta0[i]); - if (!info.pagetable0) { - info.pagetable0 = (pagetable0_t*)jl_gc_perm_alloc_nolock(sizeof(pagetable0_t), 1, - sizeof(void*), 0); - *ppagetable0 = info.pagetable0; - if (!info.pagetable0) - break; - } - - // create & store the level 0 / page info - i = REGION0_INDEX(ptr); - info.pagetable0_i = i % 32; - info.pagetable0_i32 = i / 32; - msk = (1u << info.pagetable0_i); - info.pagetable0->freemap[info.pagetable0_i32] |= msk; // is free - pmeta = &info.pagetable0->meta[i]; - info.meta = (*pmeta = &page_meta[pg]); - } - } - - if (pg < pg_cnt) { 
-#ifndef _OS_WINDOWS_ - // Trim the allocation to only cover the region - // that we successfully created the metadata for. - // This is not supported by the Windows kernel, - // so we have to just skip it there and just lose these virtual addresses. - munmap(mem + LLT_ALIGN(GC_PAGE_SZ * pg, jl_page_size), - GC_PAGE_SZ * pg_cnt - LLT_ALIGN(GC_PAGE_SZ * pg, jl_page_size)); -#endif - if (pg == 0) { - uv_mutex_unlock(&gc_perm_lock); - jl_throw(jl_memory_exception); - } - } - return page_meta; + return mem; } // get a new page, either from the freemap // or from the kernel if none are available NOINLINE jl_gc_pagemeta_t *jl_gc_alloc_page(void) JL_NOTSAFEPOINT { - struct jl_gc_metadata_ext info; - uv_mutex_lock(&gc_perm_lock); - int last_errno = errno; #ifdef _OS_WINDOWS_ DWORD last_error = GetLastError(); #endif - // scan over memory_map page-table for existing allocated but unused pages - for (info.pagetable_i32 = memory_map.lb; info.pagetable_i32 < (REGION2_PG_COUNT + 31) / 32; info.pagetable_i32++) { - uint32_t freemap1 = memory_map.freemap1[info.pagetable_i32]; - for (info.pagetable_i = 0; freemap1; info.pagetable_i++, freemap1 >>= 1) { - unsigned next = ffs_u32(freemap1); - info.pagetable_i += next; - freemap1 >>= next; - info.pagetable1 = memory_map.meta1[info.pagetable_i + info.pagetable_i32 * 32]; - // repeat over page-table level 1 - for (info.pagetable1_i32 = info.pagetable1->lb; info.pagetable1_i32 < REGION1_PG_COUNT / 32; info.pagetable1_i32++) { - uint32_t freemap0 = info.pagetable1->freemap0[info.pagetable1_i32]; - for (info.pagetable1_i = 0; freemap0; info.pagetable1_i++, freemap0 >>= 1) { - unsigned next = ffs_u32(freemap0); - info.pagetable1_i += next; - freemap0 >>= next; - info.pagetable0 = info.pagetable1->meta0[info.pagetable1_i + info.pagetable1_i32 * 32]; - // repeat over page-table level 0 - for (info.pagetable0_i32 = info.pagetable0->lb; info.pagetable0_i32 < REGION0_PG_COUNT / 32; info.pagetable0_i32++) { - uint32_t freemap = info.pagetable0->freemap[info.pagetable0_i32]; - if (freemap) { - info.pagetable0_i = ffs_u32(freemap); - info.meta = info.pagetable0->meta[info.pagetable0_i + info.pagetable0_i32 * 32]; - assert(info.meta->data); - // new pages available starting at min of lb and pagetable_i32 - if (memory_map.lb < info.pagetable_i32) - memory_map.lb = info.pagetable_i32; - if (info.pagetable1->lb < info.pagetable1_i32) - info.pagetable1->lb = info.pagetable1_i32; - if (info.pagetable0->lb < info.pagetable0_i32) - info.pagetable0->lb = info.pagetable0_i32; - goto have_free_page; // break out of all of these loops - } - } - info.pagetable1->freemap0[info.pagetable1_i32] &= ~(uint32_t)(1u << info.pagetable1_i); // record that this was full - } - } - memory_map.freemap1[info.pagetable_i32] &= ~(uint32_t)(1u << info.pagetable_i); // record that this was full - } - } + jl_gc_pagemeta_t *meta = NULL; - // no existing pages found, allocate a new one - { - jl_gc_pagemeta_t *meta = jl_gc_alloc_new_page(); - info = page_metadata_ext(meta->data); - assert(meta == info.meta); - // new pages are now available starting at max of lb and pagetable_i32 - if (memory_map.lb > info.pagetable_i32) - memory_map.lb = info.pagetable_i32; - if (info.pagetable1->lb > info.pagetable1_i32) - info.pagetable1->lb = info.pagetable1_i32; - if (info.pagetable0->lb > info.pagetable0_i32) - info.pagetable0->lb = info.pagetable0_i32; + // try to get page from `pool_clean` + meta = pop_lf_back(&global_page_pool_clean); + if (meta != NULL) { + gc_alloc_map_set(meta->data, GC_PAGE_ALLOCATED); + goto 
exit; } -have_free_page: - // in-use pages are now ending at min of ub and pagetable_i32 - if (memory_map.ub < info.pagetable_i32) - memory_map.ub = info.pagetable_i32; - if (info.pagetable1->ub < info.pagetable1_i32) - info.pagetable1->ub = info.pagetable1_i32; - if (info.pagetable0->ub < info.pagetable0_i32) - info.pagetable0->ub = info.pagetable0_i32; - - // mark this entry as in-use and not free - info.pagetable0->freemap[info.pagetable0_i32] &= ~(uint32_t)(1u << info.pagetable0_i); - info.pagetable0->allocmap[info.pagetable0_i32] |= (uint32_t)(1u << info.pagetable0_i); - info.pagetable1->allocmap0[info.pagetable1_i32] |= (uint32_t)(1u << info.pagetable1_i); - memory_map.allocmap1[info.pagetable_i32] |= (uint32_t)(1u << info.pagetable_i); + // try to get page from `pool_freed` + meta = pop_lf_back(&global_page_pool_freed); + if (meta != NULL) { + gc_alloc_map_set(meta->data, GC_PAGE_ALLOCATED); + goto exit; + } + uv_mutex_lock(&gc_perm_lock); + // another thread may have allocated a large block while we were waiting... + meta = pop_lf_back(&global_page_pool_clean); + if (meta != NULL) { + uv_mutex_unlock(&gc_perm_lock); + gc_alloc_map_set(meta->data, GC_PAGE_ALLOCATED); + goto exit; + } + // must map a new set of pages + char *data = jl_gc_try_alloc_pages(); + meta = (jl_gc_pagemeta_t*)malloc_s(block_pg_cnt * sizeof(jl_gc_pagemeta_t)); + for (int i = 0; i < block_pg_cnt; i++) { + jl_gc_pagemeta_t *pg = &meta[i]; + pg->data = data + GC_PAGE_SZ * i; + gc_alloc_map_maybe_create(pg->data); + if (i == 0) { + gc_alloc_map_set(pg->data, GC_PAGE_ALLOCATED); + } + else { + push_lf_back(&global_page_pool_clean, pg); + } + } + uv_mutex_unlock(&gc_perm_lock); +exit: #ifdef _OS_WINDOWS_ - VirtualAlloc(info.meta->data, GC_PAGE_SZ, MEM_COMMIT, PAGE_READWRITE); -#endif -#ifdef _OS_WINDOWS_ + VirtualAlloc(meta->data, GC_PAGE_SZ, MEM_COMMIT, PAGE_READWRITE); SetLastError(last_error); #endif errno = last_errno; - current_pg_count++; - gc_final_count_page(current_pg_count); - uv_mutex_unlock(&gc_perm_lock); - return info.meta; + jl_atomic_fetch_add(&current_pg_count, 1); + return meta; } // return a page to the freemap allocator -void jl_gc_free_page(void *p) JL_NOTSAFEPOINT +void jl_gc_free_page(jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT { - // update the allocmap and freemap to indicate this contains a free entry - struct jl_gc_metadata_ext info = page_metadata_ext(p); - uint32_t msk; - msk = (uint32_t)(1u << info.pagetable0_i); - assert(!(info.pagetable0->freemap[info.pagetable0_i32] & msk)); - assert(info.pagetable0->allocmap[info.pagetable0_i32] & msk); - info.pagetable0->allocmap[info.pagetable0_i32] &= ~msk; - info.pagetable0->freemap[info.pagetable0_i32] |= msk; - - msk = (uint32_t)(1u << info.pagetable1_i); - assert(info.pagetable1->allocmap0[info.pagetable1_i32] & msk); - if ((info.pagetable1->freemap0[info.pagetable1_i32] & msk) == 0) - info.pagetable1->freemap0[info.pagetable1_i32] |= msk; - - msk = (uint32_t)(1u << info.pagetable_i); - assert(memory_map.allocmap1[info.pagetable_i32] & msk); - if ((memory_map.freemap1[info.pagetable_i32] & msk) == 0) - memory_map.freemap1[info.pagetable_i32] |= msk; - - free(info.meta->ages); - info.meta->ages = NULL; - + void *p = pg->data; + gc_alloc_map_set((char*)p, GC_PAGE_FREED); // tell the OS we don't need these pages right now size_t decommit_size = GC_PAGE_SZ; if (GC_PAGE_SZ < jl_page_size) { @@ -293,16 +189,15 @@ void jl_gc_free_page(void *p) JL_NOTSAFEPOINT void *otherp = (void*)((uintptr_t)p & ~(jl_page_size - 1)); // round down to the nearest physical page
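
/*
 * The rewritten jl_gc_alloc_page()/jl_gc_free_page() above route pages through
 * the lock-free stacks `global_page_pool_clean`, `global_page_pool_freed` and
 * `global_page_pool_lazily_freed` via push_lf_back()/pop_lf_back(), whose
 * implementation is not part of this hunk. A minimal sketch of the underlying
 * idea -- a Treiber stack threaded through the page-metadata `next` links --
 * using C11 atomics. The names below are illustrative only, and a production
 * version must also defend against ABA (the real GC only recycles pages at
 * controlled points, which sidesteps the worst of it):
 */
#include <stdatomic.h>
#include <stddef.h>

typedef struct page_meta {
    struct page_meta *next;
} page_meta_t;

typedef struct {
    _Atomic(page_meta_t *) bottom; // top of the stack
} page_stack_t;

static void page_stack_push(page_stack_t *s, page_meta_t *pg)
{
    page_meta_t *old = atomic_load_explicit(&s->bottom, memory_order_relaxed);
    do {
        pg->next = old; // link the new node above the current head
    } while (!atomic_compare_exchange_weak_explicit(
                 &s->bottom, &old, pg,
                 memory_order_release, memory_order_relaxed));
}

static page_meta_t *page_stack_pop(page_stack_t *s)
{
    page_meta_t *old = atomic_load_explicit(&s->bottom, memory_order_acquire);
    do {
        if (old == NULL)
            return NULL; // pool is empty; caller falls back to mapping fresh pages
    } while (!atomic_compare_exchange_weak_explicit(
                 &s->bottom, &old, old->next,
                 memory_order_acquire, memory_order_acquire));
    return old;
}
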
p = otherp; while (n_pages--) { - struct jl_gc_metadata_ext info = page_metadata_ext(otherp); - msk = (uint32_t)(1u << info.pagetable0_i); - if (info.pagetable0->allocmap[info.pagetable0_i32] & msk) - goto no_decommit; + if (gc_alloc_map_is_set((char*)otherp)) { + return; + } otherp = (void*)((char*)otherp + GC_PAGE_SZ); } } #ifdef _OS_WINDOWS_ VirtualFree(p, decommit_size, MEM_DECOMMIT); -#elif defined(MADV_FREE) +#elif 0 static int supports_madv_free = 1; if (supports_madv_free) { if (madvise(p, decommit_size, MADV_FREE) == -1) { @@ -316,20 +211,8 @@ void jl_gc_free_page(void *p) JL_NOTSAFEPOINT #else madvise(p, decommit_size, MADV_DONTNEED); #endif - /* TODO: Should we leave this poisoned and rather allow the GC to read poisoned pointers from - * the page when it sweeps pools? - */ msan_unpoison(p, decommit_size); - -no_decommit: - // new pages are now available starting at max of lb and pagetable_i32 - if (memory_map.lb > info.pagetable_i32) - memory_map.lb = info.pagetable_i32; - if (info.pagetable1->lb > info.pagetable1_i32) - info.pagetable1->lb = info.pagetable1_i32; - if (info.pagetable0->lb > info.pagetable0_i32) - info.pagetable0->lb = info.pagetable0_i32; - current_pg_count--; + jl_atomic_fetch_add(&current_pg_count, -1); } #ifdef __cplusplus diff --git a/src/gc-stacks.c b/src/gc-stacks.c index b35c1722c82ff..693cb8d0eadf0 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -61,6 +61,9 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT munmap(stk, bufsz); return MAP_FAILED; } +#ifdef MADV_NOHUGEPAGE + madvise(stk, bufsz, MADV_NOHUGEPAGE); +#endif #endif jl_atomic_fetch_add(&num_stack_mappings, 1); return stk; @@ -73,6 +76,10 @@ static void free_stack(void *stkbuf, size_t bufsz) } #endif +JL_DLLEXPORT uint32_t jl_get_num_stack_mappings(void) +{ + return jl_atomic_load_relaxed(&num_stack_mappings); +} const unsigned pool_sizes[] = { 128 * 1024, @@ -112,7 +119,7 @@ static void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) if (bufsz <= pool_sizes[JL_N_STACK_POOLS - 1]) { unsigned pool_id = select_pool(bufsz); if (pool_sizes[pool_id] == bufsz) { - arraylist_push(&ptls->heap.free_stacks[pool_id], stkbuf); + small_arraylist_push(&ptls->heap.free_stacks[pool_id], stkbuf); return; } } @@ -141,7 +148,7 @@ void jl_release_task_stack(jl_ptls_t ptls, jl_task_t *task) #ifdef _COMPILER_ASAN_ENABLED_ __asan_unpoison_stack_memory((uintptr_t)stkbuf, bufsz); #endif - arraylist_push(&ptls->heap.free_stacks[pool_id], stkbuf); + small_arraylist_push(&ptls->heap.free_stacks[pool_id], stkbuf); } } } @@ -156,9 +163,9 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO if (ssize <= pool_sizes[JL_N_STACK_POOLS - 1]) { unsigned pool_id = select_pool(ssize); ssize = pool_sizes[pool_id]; - arraylist_t *pool = &ptls->heap.free_stacks[pool_id]; + small_arraylist_t *pool = &ptls->heap.free_stacks[pool_id]; if (pool->len > 0) { - stk = arraylist_pop(pool); + stk = small_arraylist_pop(pool); } } else { @@ -177,8 +184,8 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO } *bufsz = ssize; if (owner) { - arraylist_t *live_tasks = &ptls->heap.live_tasks; - arraylist_push(live_tasks, owner); + small_arraylist_t *live_tasks = &ptls->heap.live_tasks; + mtarraylist_push(live_tasks, owner); } return stk; } @@ -202,7 +209,7 @@ void sweep_stack_pools(void) // free half of stacks that remain unused since last sweep for (int p = 0; p < JL_N_STACK_POOLS; p++) { - arraylist_t *al = &ptls2->heap.free_stacks[p]; + small_arraylist_t *al =
&ptls2->heap.free_stacks[p]; size_t n_to_free; if (al->len > MIN_STACK_MAPPINGS_PER_POOL) { n_to_free = al->len / 2; @@ -213,12 +220,12 @@ void sweep_stack_pools(void) n_to_free = 0; } for (int n = 0; n < n_to_free; n++) { - void *stk = arraylist_pop(al); + void *stk = small_arraylist_pop(al); free_stack(stk, pool_sizes[p]); } } - arraylist_t *live_tasks = &ptls2->heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->heap.live_tasks; size_t n = 0; size_t ndel = 0; size_t l = live_tasks->len; @@ -261,24 +268,52 @@ void sweep_stack_pools(void) JL_DLLEXPORT jl_array_t *jl_live_tasks(void) { - jl_task_t *ct = jl_current_task; - jl_ptls_t ptls = ct->ptls; - arraylist_t *live_tasks = &ptls->heap.live_tasks; - size_t i, j, l; - jl_array_t *a; - do { - l = live_tasks->len; - a = jl_alloc_vec_any(l + 1); // may gc, changing the number of tasks - } while (l + 1 < live_tasks->len); - l = live_tasks->len; - void **lst = live_tasks->items; - j = 0; - ((void**)jl_array_data(a))[j++] = ptls->root_task; - for (i = 0; i < l; i++) { - if (((jl_task_t*)lst[i])->stkbuf != NULL) - ((void**)jl_array_data(a))[j++] = lst[i]; + size_t nthreads = jl_atomic_load_acquire(&jl_n_threads); + jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states); + size_t l = 0; // l is not reset on restart, so we keep getting more aggressive at making a big enough list every time it fails +restart: + for (size_t i = 0; i < nthreads; i++) { + // skip GC threads since they don't have tasks + if (gc_first_tid <= i && i < gc_first_tid + jl_n_gcthreads) { + continue; + } + jl_ptls_t ptls2 = allstates[i]; + if (ptls2 == NULL) + continue; + small_arraylist_t *live_tasks = &ptls2->heap.live_tasks; + size_t n = mtarraylist_length(live_tasks); + l += n + (ptls2->root_task->stkbuf != NULL); + } + l += l / 20; // add 5% for margin of estimation error + jl_array_t *a = jl_alloc_vec_any(l); // may gc, changing the number of tasks and forcing us to reload everything + nthreads = jl_atomic_load_acquire(&jl_n_threads); + allstates = jl_atomic_load_relaxed(&jl_all_tls_states); + size_t j = 0; + for (size_t i = 0; i < nthreads; i++) { + // skip GC threads since they don't have tasks + if (gc_first_tid <= i && i < gc_first_tid + jl_n_gcthreads) { + continue; + } + jl_ptls_t ptls2 = allstates[i]; + if (ptls2 == NULL) + continue; + jl_task_t *t = ptls2->root_task; + if (t->stkbuf != NULL) { + if (j == l) + goto restart; + ((void**)jl_array_data(a))[j++] = t; + } + small_arraylist_t *live_tasks = &ptls2->heap.live_tasks; + size_t n = mtarraylist_length(live_tasks); + for (size_t i = 0; i < n; i++) { + jl_task_t *t = (jl_task_t*)mtarraylist_get(live_tasks, i); + if (t->stkbuf != NULL) { + if (j == l) + goto restart; + ((void**)jl_array_data(a))[j++] = t; + } + } } - l = jl_array_len(a); if (j < l) { JL_GC_PUSH1(&a); jl_array_del_end(a, l - j); diff --git a/src/gc.c b/src/gc.c index 1df6db4986a2f..e8cfae27de0da 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1,6 +1,7 @@ // This file is a part of Julia.
License is MIT: https://julialang.org/license #include "gc.h" +#include "gc-page-profiler.h" #include "julia_gcext.h" #include "julia_assert.h" #ifdef __GLIBC__ @@ -11,6 +12,22 @@ extern "C" { #endif +// Number of threads currently running the GC mark-loop +_Atomic(int) gc_n_threads_marking; +// Number of threads sweeping +_Atomic(int) gc_n_threads_sweeping; +// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping +_Atomic(jl_gc_page_stack_t *) gc_allocd_scratch; +// `tid` of mutator thread that triggered GC +_Atomic(int) gc_master_tid; +// `tid` of first GC thread +int gc_first_tid; +// Mutex/cond used to synchronize sleep/wakeup of GC threads +uv_mutex_t gc_threads_lock; +uv_cond_t gc_threads_cond; +// Mutex used to coordinate entry of GC threads in the mark loop +uv_mutex_t gc_queue_observer_lock; + // Linked list of callback functions typedef void (*jl_gc_cb_func_t)(void); @@ -26,6 +43,7 @@ static jl_gc_callback_list_t *gc_cblist_pre_gc; static jl_gc_callback_list_t *gc_cblist_post_gc; static jl_gc_callback_list_t *gc_cblist_notify_external_alloc; static jl_gc_callback_list_t *gc_cblist_notify_external_free; +static jl_gc_callback_list_t *gc_cblist_notify_gc_pressure; #define gc_invoke_callbacks(ty, list, args) \ do { \ @@ -112,16 +130,13 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_fre jl_gc_deregister_callback(&gc_cblist_notify_external_free, (jl_gc_cb_func_t)cb); } -// Save/restore local mark stack to/from thread-local storage. - -STATIC_INLINE void export_gc_state(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) { - ptls->gc_mark_sp = *sp; -} - -STATIC_INLINE void import_gc_state(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) { - // Has the stack been reallocated in the meantime? - *sp = ptls->gc_mark_sp; -} +JL_DLLEXPORT void jl_gc_set_cb_notify_gc_pressure(jl_gc_cb_notify_gc_pressure_t cb, int enable) + { + if (enable) + jl_gc_register_callback(&gc_cblist_notify_gc_pressure, (jl_gc_cb_func_t)cb); + else + jl_gc_deregister_callback(&gc_cblist_notify_gc_pressure, (jl_gc_cb_func_t)cb); + } // Protect all access to `finalizer_list_marked` and `to_finalize`. // For accessing `ptls->finalizers`, the lock is needed if a thread @@ -179,8 +194,6 @@ JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) return jl_buff_tag; } -pagetable_t memory_map; - // List of marked big objects. Not per-thread. Accessed only by master thread. bigval_t *big_objects_marked = NULL; @@ -327,16 +340,16 @@ void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads) jl_wake_libuv(); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - // This acquire load pairs with the release stores - // in the signal handler of safepoint so we are sure that - // all the stores on those threads are visible. - // We're currently also using atomic store release in mutator threads - // (in jl_gc_state_set), but we may want to use signals to flush the - // memory operations on those threads lazily instead. - while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) - jl_cpu_pause(); // yield? + if (ptls2 != NULL) { + // This acquire load pairs with the release stores + // in the signal handler of safepoint so we are sure that + // all the stores on those threads are visible. + // We're currently also using atomic store release in mutator threads + // (in jl_gc_state_set), but we may want to use signals to flush the + // memory operations on those threads lazily instead. 
+ while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) + jl_cpu_pause(); // yield? + } } } @@ -644,7 +657,7 @@ void jl_gc_run_all_finalizers(jl_task_t *ct) schedule_all_finalizers(&finalizer_list_marked); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) + if (ptls2 != NULL) schedule_all_finalizers(&ptls2->finalizers); } // unlock here because `run_finalizers` locks this @@ -720,7 +733,7 @@ JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o) gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) + if (ptls2 != NULL) finalize_object(&ptls2->finalizers, o, &copied_list, jl_atomic_load_relaxed(&ct->tid) != i); } finalize_object(&finalizer_list_marked, o, &copied_list, 0); @@ -760,12 +773,13 @@ static void gc_sweep_foreign_objs(void) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) + if (ptls2 != NULL) gc_sweep_foreign_objs_in_list(&ptls2->sweep_objs); } } // GC knobs and self-measurement variables +static int under_memory_pressure; static int64_t last_gc_total_bytes = 0; // max_total_memory is a suggestion. We try very hard to stay @@ -802,7 +816,7 @@ static int mark_reset_age = 0; * * <-[(quick)sweep]- * | - * ----> GC_OLD <--[(quick)sweep && age>promotion]-- + * ----> GC_OLD <--[(quick)sweep]------------------- * | | | * | | GC_MARKED (in remset) | * | | ^ | | @@ -819,9 +833,9 @@ static int mark_reset_age = 0; * ========= above this line objects are old ========= | * | * ----[new]------> GC_CLEAN ------[mark]-----------> GC_MARKED - * | ^ | - * <-[(quick)sweep]--- | | - * --[(quick)sweep && age<=promotion]--- + * | + * <-[(quick)sweep]--- + * */ // A quick sweep is a sweep where `!sweep_full` @@ -835,18 +849,10 @@ static int mark_reset_age = 0; // When a write barrier triggers, the offending marked object is both queued, // so as not to trigger the barrier again, and put in the remset. - -#define PROMOTE_AGE 1 -// this cannot be increased as is without changing : -// - sweep_page which is specialized for 1bit age -// - the size of the age storage in jl_gc_pagemeta_t - - static int64_t scanned_bytes; // young bytes scanned while marking static int64_t perm_scanned_bytes; // old bytes scanned while marking int prev_sweep_full = 1; - -#define inc_sat(v,s) v = (v) >= s ? s : (v)+1 +int current_sweep_full = 0; // Full collection heuristics static int64_t live_bytes = 0; @@ -893,7 +899,7 @@ static void gc_sync_all_caches_nolock(jl_ptls_t ptls) assert(gc_n_threads); for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2) + if (ptls2 != NULL) gc_sync_cache_nolock(ptls, &ptls2->gc_cache); } } @@ -912,21 +918,13 @@ STATIC_INLINE void gc_queue_big_marked(jl_ptls_t ptls, bigval_t *hdr, ptls->gc_cache.nbig_obj = nobj + 1; } -// `gc_setmark_tag` can be called concurrently on multiple threads. -// In all cases, the function atomically sets the mark bits and returns -// the GC bits set as well as if the tag was unchanged by this thread. -// All concurrent calls on the same object are guaranteed to be setting the -// bits to the same value. -// For normal objects, this is the bits with only `GC_MARKED` changed to `1` -// For buffers, this is the bits of the owner object. -// For `mark_reset_age`, this is `GC_MARKED` with `GC_OLD` cleared. 
-// The return value is `1` if the object was not marked before. -// Returning `0` can happen if another thread marked it in parallel. -STATIC_INLINE int gc_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode, - uintptr_t tag, uint8_t *bits) JL_NOTSAFEPOINT -{ - assert(!gc_marked(tag)); +// Atomically set the mark bit for object and return whether it was previously unmarked +FORCE_INLINE int gc_try_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode) JL_NOTSAFEPOINT +{ assert(gc_marked(mark_mode)); + uintptr_t tag = o->header; + if (gc_marked(tag)) + return 0; if (mark_reset_age) { // Reset the object as if it was just allocated mark_mode = GC_MARKED; @@ -938,7 +936,6 @@ STATIC_INLINE int gc_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode, tag = tag | mark_mode; assert((tag & 0x3) == mark_mode); } - *bits = mark_mode; tag = jl_atomic_exchange_relaxed((_Atomic(uintptr_t)*)&o->header, tag); verify_val(jl_valueof(o)); return !gc_marked(tag); @@ -949,7 +946,7 @@ STATIC_INLINE int gc_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode, STATIC_INLINE void gc_setmark_big(jl_ptls_t ptls, jl_taggedvalue_t *o, uint8_t mark_mode) JL_NOTSAFEPOINT { - assert(!page_metadata(o)); + assert(!gc_alloc_map_is_set((char*)o)); bigval_t *hdr = bigval_header(o); if (mark_mode == GC_OLD_MARKED) { ptls->gc_cache.perm_scanned_bytes += hdr->sz & ~3; @@ -960,9 +957,8 @@ STATIC_INLINE void gc_setmark_big(jl_ptls_t ptls, jl_taggedvalue_t *o, // We can't easily tell if the object is old or being promoted // from the gc bits but if the `age` is `0` then the object // must be already on a young list. - if (mark_reset_age && hdr->age) { + if (mark_reset_age) { // Reset the object as if it was just allocated - hdr->age = 0; gc_queue_big_marked(ptls, hdr, 1); } } @@ -973,13 +969,11 @@ STATIC_INLINE void gc_setmark_big(jl_ptls_t ptls, jl_taggedvalue_t *o, // This function should be called exactly once during marking for each pool // object being marked to update the page metadata. 
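
/*
 * gc_try_setmark_tag() above claims an object for marking with a relaxed
 * atomic exchange: every racing marker writes the same tagged header, so
 * exactly one of them observes the previous value as unmarked and wins the
 * right to scan the object. A minimal model of that claim protocol, assuming
 * C11 atomics; the real function also folds the old-generation/age bits and
 * mark_reset_age into the tag, which this sketch elides:
 */
#include <stdatomic.h>
#include <stdint.h>

#define MARK_BIT ((uintptr_t)1) // stand-in for GC_MARKED in the low tag bits

static int try_claim(_Atomic(uintptr_t) *header)
{
    uintptr_t tag = atomic_load_explicit(header, memory_order_relaxed);
    if (tag & MARK_BIT)
        return 0; // already marked; cheap early exit
    // The exchange is idempotent: all racing threads store the same value,
    // so only the thread that swapped out an unmarked tag "wins" the object.
    uintptr_t prev = atomic_exchange_explicit(header, tag | MARK_BIT,
                                              memory_order_relaxed);
    return (prev & MARK_BIT) == 0;
}
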
STATIC_INLINE void gc_setmark_pool_(jl_ptls_t ptls, jl_taggedvalue_t *o, - uint8_t mark_mode, - jl_gc_pagemeta_t *page) JL_NOTSAFEPOINT + uint8_t mark_mode, jl_gc_pagemeta_t *page) JL_NOTSAFEPOINT { #ifdef MEMDEBUG gc_setmark_big(ptls, o, mark_mode); #else - jl_assume(page); if (mark_mode == GC_OLD_MARKED) { ptls->gc_cache.perm_scanned_bytes += page->osize; static_assert(sizeof(_Atomic(uint16_t)) == sizeof(page->nold), ""); @@ -989,10 +983,6 @@ STATIC_INLINE void gc_setmark_pool_(jl_ptls_t ptls, jl_taggedvalue_t *o, ptls->gc_cache.scanned_bytes += page->osize; if (mark_reset_age) { page->has_young = 1; - char *page_begin = gc_page_data(o) + GC_PAGE_OFFSET; - int obj_id = (((char*)o) - page_begin) / page->osize; - uint8_t *ages = page->ages + obj_id / 8; - jl_atomic_fetch_and_relaxed((_Atomic(uint8_t)*)ages, ~(1 << (obj_id % 8))); } } objprofile_count(jl_typeof(jl_valueof(o)), @@ -1004,7 +994,7 @@ STATIC_INLINE void gc_setmark_pool_(jl_ptls_t ptls, jl_taggedvalue_t *o, STATIC_INLINE void gc_setmark_pool(jl_ptls_t ptls, jl_taggedvalue_t *o, uint8_t mark_mode) JL_NOTSAFEPOINT { - gc_setmark_pool_(ptls, o, mark_mode, page_metadata(o)); + gc_setmark_pool_(ptls, o, mark_mode, page_metadata((char*)o)); } STATIC_INLINE void gc_setmark(jl_ptls_t ptls, jl_taggedvalue_t *o, @@ -1021,18 +1011,15 @@ STATIC_INLINE void gc_setmark(jl_ptls_t ptls, jl_taggedvalue_t *o, STATIC_INLINE void gc_setmark_buf_(jl_ptls_t ptls, void *o, uint8_t mark_mode, size_t minsz) JL_NOTSAFEPOINT { jl_taggedvalue_t *buf = jl_astaggedvalue(o); - uintptr_t tag = buf->header; - if (gc_marked(tag)) - return; - uint8_t bits; + uint8_t bits = (gc_old(buf->header) && !mark_reset_age) ? GC_OLD_MARKED : GC_MARKED; // If the object is larger than the max pool size it can't be a pool object. // This should be accurate most of the time but there might be corner cases // where the size estimate is a little off so we do a pool lookup to make // sure.
- if (__likely(gc_setmark_tag(buf, mark_mode, tag, &bits)) && !gc_verifying) { + if (__likely(gc_try_setmark_tag(buf, mark_mode)) && !gc_verifying) { if (minsz <= GC_MAX_SZCLASS) { jl_gc_pagemeta_t *page = page_metadata(buf); - if (page) { + if (page != NULL) { gc_setmark_pool_(ptls, buf, bits, page); return; } @@ -1046,38 +1033,7 @@ void gc_setmark_buf(jl_ptls_t ptls, void *o, uint8_t mark_mode, size_t minsz) JL gc_setmark_buf_(ptls, o, mark_mode, minsz); } -void jl_gc_force_mark_old(jl_ptls_t ptls, jl_value_t *v) JL_NOTSAFEPOINT -{ - jl_taggedvalue_t *o = jl_astaggedvalue(v); - jl_datatype_t *dt = (jl_datatype_t*)jl_typeof(v); - size_t dtsz = jl_datatype_size(dt); - if (o->bits.gc == GC_OLD_MARKED) - return; - o->bits.gc = GC_OLD_MARKED; - if (dt == jl_simplevector_type) { - size_t l = jl_svec_len(v); - dtsz = l * sizeof(void*) + sizeof(jl_svec_t); - } - else if (dt->name == jl_array_typename) { - jl_array_t *a = (jl_array_t*)v; - if (!a->flags.pooled) - dtsz = GC_MAX_SZCLASS + 1; - } - else if (dt == jl_module_type) { - dtsz = sizeof(jl_module_t); - } - else if (dt == jl_task_type) { - dtsz = sizeof(jl_task_t); - } - else if (dt == jl_symbol_type) { - return; - } - gc_setmark(ptls, o, GC_OLD_MARKED, dtsz); - if (dt->layout->npointers != 0) - jl_gc_queue_root(v); -} - -static inline void maybe_collect(jl_ptls_t ptls) +STATIC_INLINE void maybe_collect(jl_ptls_t ptls) { if (jl_atomic_load_relaxed(&ptls->gc_num.allocd) >= 0 || jl_gc_debug_check_other()) { jl_gc_collect(JL_GC_AUTO); @@ -1095,7 +1051,7 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type); wr->value = value; // NOTE: wb not needed here - arraylist_push(&ptls->heap.weak_refs, wr); + small_arraylist_push(&ptls->heap.weak_refs, wr); return wr; } @@ -1104,14 +1060,14 @@ static void clear_weak_refs(void) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - size_t n, l = ptls2->heap.weak_refs.len; - void **lst = ptls2->heap.weak_refs.items; - for (n = 0; n < l; n++) { - jl_weakref_t *wr = (jl_weakref_t*)lst[n]; - if (!gc_marked(jl_astaggedvalue(wr->value)->bits.gc)) - wr->value = (jl_value_t*)jl_nothing; + if (ptls2 != NULL) { + size_t n, l = ptls2->heap.weak_refs.len; + void **lst = ptls2->heap.weak_refs.items; + for (n = 0; n < l; n++) { + jl_weakref_t *wr = (jl_weakref_t*)lst[n]; + if (!gc_marked(jl_astaggedvalue(wr->value)->bits.gc)) + wr->value = (jl_value_t*)jl_nothing; + } } } } @@ -1121,27 +1077,27 @@ static void sweep_weak_refs(void) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - size_t n = 0; - size_t ndel = 0; - size_t l = ptls2->heap.weak_refs.len; - void **lst = ptls2->heap.weak_refs.items; - if (l == 0) - continue; - while (1) { - jl_weakref_t *wr = (jl_weakref_t*)lst[n]; - if (gc_marked(jl_astaggedvalue(wr)->bits.gc)) - n++; - else - ndel++; - if (n >= l - ndel) - break; - void *tmp = lst[n]; - lst[n] = lst[n + ndel]; - lst[n + ndel] = tmp; + if (ptls2 != NULL) { + size_t n = 0; + size_t ndel = 0; + size_t l = ptls2->heap.weak_refs.len; + void **lst = ptls2->heap.weak_refs.items; + if (l == 0) + continue; + while (1) { + jl_weakref_t *wr = (jl_weakref_t*)lst[n]; + if (gc_marked(jl_astaggedvalue(wr)->bits.gc)) + n++; + else + ndel++; + if (n >= l - ndel) + break; + void *tmp = lst[n]; + lst[n] = lst[n + ndel]; + lst[n + ndel] = tmp; + } + 
ptls2->heap.weak_refs.len -= ndel; } - ptls2->heap.weak_refs.len -= ndel; } } @@ -1149,7 +1105,7 @@ static void sweep_weak_refs(void) // big value list // Size includes the tag and the tag is not cleared!! -static inline jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) +STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) { maybe_collect(ptls); size_t offs = offsetof(bigval_t, header); @@ -1172,19 +1128,24 @@ static inline jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) memset(v, 0xee, allocsz); #endif v->sz = allocsz; - v->age = 0; gc_big_object_link(v, &ptls->heap.big_objects); return jl_valueof(&v->header); } -// Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code. +// Deprecated version, supported for legacy code. JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz) { jl_value_t *val = jl_gc_big_alloc_inner(ptls, sz); - maybe_record_alloc_to_profile(val, sz, jl_gc_unknown_type_tag); return val; } +// Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code. +JL_DLLEXPORT jl_value_t *jl_gc_big_alloc_instrumented(jl_ptls_t ptls, size_t sz, jl_value_t *type) +{ + jl_value_t *val = jl_gc_big_alloc_inner(ptls, sz); + maybe_record_alloc_to_profile(val, sz, (jl_datatype_t*)type); + return val; +} // This wrapper exists only to prevent `jl_gc_big_alloc_inner` from being inlined into // its callers. We provide an external-facing interface for callers, and inline `jl_gc_big_alloc_inner` @@ -1204,16 +1165,8 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT int old_bits = bits; if (gc_marked(bits)) { pv = &v->next; - int age = v->age; - if (age >= PROMOTE_AGE || bits == GC_OLD_MARKED) { - if (sweep_full || bits == GC_MARKED) { - bits = GC_OLD; - } - } - else { - inc_sat(age, PROMOTE_AGE); - v->age = age; - bits = GC_CLEAN; + if (sweep_full || bits == GC_MARKED) { + bits = GC_OLD; } v->bits.gc = bits; } @@ -1242,9 +1195,8 @@ static void sweep_big(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - sweep_big_list(sweep_full, &ptls2->heap.big_objects); + if (ptls2 != NULL) + sweep_big_list(sweep_full, &ptls2->heap.big_objects); } if (sweep_full) { bigval_t **last_next = sweep_big_list(sweep_full, &big_objects_marked); @@ -1313,9 +1265,15 @@ static void reset_thread_gc_counts(void) JL_NOTSAFEPOINT gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls = gc_all_tls_states[i]; - if (ptls) { - memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); + if (ptls != NULL) { + // don't reset `pool_live_bytes` here jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); + jl_atomic_store_relaxed(&ptls->gc_num.freed, 0); + jl_atomic_store_relaxed(&ptls->gc_num.malloc, 0); + jl_atomic_store_relaxed(&ptls->gc_num.realloc, 0); + jl_atomic_store_relaxed(&ptls->gc_num.poolalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_num.bigalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_num.freecall, 0); } } } @@ -1362,81 +1320,75 @@ static void sweep_malloced_arrays(void) JL_NOTSAFEPOINT assert(gc_n_threads); for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2 == NULL) - continue; - mallocarray_t *ma = ptls2->heap.mallocarrays; - mallocarray_t **pma = &ptls2->heap.mallocarrays; - while (ma != NULL) { - mallocarray_t *nxt = 
ma->next; - int bits = jl_astaggedvalue(ma->a)->bits.gc; - if (gc_marked(bits)) { - pma = &ma->next; - } - else { - *pma = nxt; - assert(ma->a->flags.how == 2); - jl_gc_free_array(ma->a); - ma->next = ptls2->heap.mafreelist; - ptls2->heap.mafreelist = ma; + if (ptls2 != NULL) { + mallocarray_t *ma = ptls2->heap.mallocarrays; + mallocarray_t **pma = &ptls2->heap.mallocarrays; + while (ma != NULL) { + mallocarray_t *nxt = ma->next; + int bits = jl_astaggedvalue(ma->a)->bits.gc; + if (gc_marked(bits)) { + pma = &ma->next; + } + else { + *pma = nxt; + assert(ma->a->flags.how == 2); + jl_gc_free_array(ma->a); + ma->next = ptls2->heap.mafreelist; + ptls2->heap.mafreelist = ma; + } + gc_time_count_mallocd_array(bits); + ma = nxt; } - gc_time_count_mallocd_array(bits); - ma = nxt; } } gc_time_mallocd_array_end(); } // pool allocation -static inline jl_taggedvalue_t *reset_page(jl_ptls_t ptls2, const jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_taggedvalue_t *fl) JL_NOTSAFEPOINT +STATIC_INLINE jl_taggedvalue_t *gc_reset_page(jl_ptls_t ptls2, const jl_gc_pool_t *p, jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT { assert(GC_PAGE_OFFSET >= sizeof(void*)); pg->nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / p->osize; pg->pool_n = p - ptls2->heap.norm_pools; - memset(pg->ages, 0, GC_PAGE_SZ / 8 / p->osize + 1); jl_taggedvalue_t *beg = (jl_taggedvalue_t*)(pg->data + GC_PAGE_OFFSET); - jl_taggedvalue_t *next = (jl_taggedvalue_t*)pg->data; - if (fl == NULL) { - next->next = NULL; - } - else { - // Insert free page after first page. - // This prevents unnecessary fragmentation from multiple pages - // being allocated from at the same time. Instead, objects will - // only ever be allocated from the first object in the list. - // This is specifically being relied on by the implementation - // of jl_gc_internal_obj_base_ptr() so that the function does - // not have to traverse the entire list. - jl_taggedvalue_t *flpage = (jl_taggedvalue_t *)gc_page_data(fl); - next->next = flpage->next; - flpage->next = beg; - beg = fl; - } pg->has_young = 0; pg->has_marked = 0; - pg->fl_begin_offset = -1; - pg->fl_end_offset = -1; pg->prev_nold = 0; pg->nold = 0; + pg->fl_begin_offset = UINT16_MAX; + pg->fl_end_offset = UINT16_MAX; return beg; } +jl_gc_page_stack_t global_page_pool_lazily_freed; +jl_gc_page_stack_t global_page_pool_clean; +jl_gc_page_stack_t global_page_pool_freed; +pagetable_t alloc_map; + // Add a new page to the pool. Discards any pages in `p->newpages` before. -static NOINLINE jl_taggedvalue_t *add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT +static NOINLINE jl_taggedvalue_t *gc_add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT { // Do not pass in `ptls` as argument. This slows down the fast path // in pool_alloc significantly jl_ptls_t ptls = jl_current_task->ptls; - jl_gc_pagemeta_t *pg = jl_gc_alloc_page(); + jl_gc_pagemeta_t *pg = pop_lf_back(&ptls->page_metadata_buffered); + if (pg != NULL) { + gc_alloc_map_set(pg->data, GC_PAGE_ALLOCATED); + } + else { + pg = jl_gc_alloc_page(); + } pg->osize = p->osize; - pg->ages = (uint8_t*)malloc_s(GC_PAGE_SZ / 8 / p->osize + 1); pg->thread_n = ptls->tid; - jl_taggedvalue_t *fl = reset_page(ptls, p, pg, NULL); + set_page_metadata(pg); + push_lf_back(&ptls->page_metadata_allocd, pg); + jl_taggedvalue_t *fl = gc_reset_page(ptls, p, pg); p->newpages = fl; return fl; } // Size includes the tag and the tag is not cleared!! 
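
/*
 * jl_gc_pool_alloc_inner() below keeps two fast paths per size class: pop the
 * head of the pool's freelist, or bump-allocate from the partially used
 * `newpages` page; only when both fail does it fall through to gc_add_page()
 * and, from there, jl_gc_alloc_page(). A simplified sketch of that ordering,
 * reusing names from the diff; page-metadata updates (nfree/has_young) and
 * GC accounting are elided:
 */
static jl_taggedvalue_t *pool_alloc_sketch(jl_gc_pool_t *p, int osize)
{
    // 1. reuse a cell from the freelist left behind by the last sweep
    jl_taggedvalue_t *v = p->freelist;
    if (v != NULL) {
        p->freelist = v->next;
        return v;
    }
    // 2. bump-allocate from the current fresh page, if one is attached
    //    and the next cell still fits inside it
    v = p->newpages;
    if (v == NULL ||
        (char*)v + osize > gc_page_data((char*)v - 1) + GC_PAGE_SZ) {
        // 3. slow path: attach a brand new page and hand out its first cell
        v = gc_add_page(p);
    }
    p->newpages = (jl_taggedvalue_t*)((char*)v + osize);
    return v;
}
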
-static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, +STATIC_INLINE jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, int osize) { // Use the pool offset instead of the pool address as the argument @@ -1450,17 +1402,19 @@ static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset maybe_collect(ptls); jl_atomic_store_relaxed(&ptls->gc_num.allocd, jl_atomic_load_relaxed(&ptls->gc_num.allocd) + osize); + jl_atomic_store_relaxed(&ptls->gc_num.pool_live_bytes, + jl_atomic_load_relaxed(&ptls->gc_num.pool_live_bytes) + osize); jl_atomic_store_relaxed(&ptls->gc_num.poolalloc, jl_atomic_load_relaxed(&ptls->gc_num.poolalloc) + 1); // first try to use the freelist jl_taggedvalue_t *v = p->freelist; - if (v) { + if (v != NULL) { jl_taggedvalue_t *next = v->next; p->freelist = next; if (__unlikely(gc_page_data(v) != gc_page_data(next))) { // we only update pg's fields when the freelist changes page // since pg's metadata is likely not in cache - jl_gc_pagemeta_t *pg = jl_assume(page_metadata(v)); + jl_gc_pagemeta_t *pg = jl_assume(page_metadata_unsafe(v)); assert(pg->osize == p->osize); pg->nfree = 0; pg->has_young = 1; @@ -1474,19 +1428,16 @@ static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset // If there's no pages left or the current page is used up, // we need to use the slow path. char *cur_page = gc_page_data((char*)v - 1); - if (__unlikely(!v || cur_page + GC_PAGE_SZ < (char*)next)) { - if (v) { + if (__unlikely(v == NULL || cur_page + GC_PAGE_SZ < (char*)next)) { + if (v != NULL) { // like the freelist case, // but only update the page metadata when it is full - jl_gc_pagemeta_t *pg = jl_assume(page_metadata((char*)v - 1)); + jl_gc_pagemeta_t *pg = jl_assume(page_metadata_unsafe((char*)v - 1)); assert(pg->osize == p->osize); pg->nfree = 0; pg->has_young = 1; - v = *(jl_taggedvalue_t**)cur_page; } - // Not an else!! - if (!v) - v = add_page(p); + v = gc_add_page(p); next = (jl_taggedvalue_t*)((char*)v + osize); } p->newpages = next; @@ -1494,15 +1445,22 @@ static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset return jl_valueof(v); } -// Instrumented version of jl_gc_pool_alloc_inner, called into by LLVM-generated code. +// Deprecated version, supported for legacy code. JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, int osize) { jl_value_t *val = jl_gc_pool_alloc_inner(ptls, pool_offset, osize); - maybe_record_alloc_to_profile(val, osize, jl_gc_unknown_type_tag); return val; } +// Instrumented version of jl_gc_pool_alloc_inner, called into by LLVM-generated code. +JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc_instrumented(jl_ptls_t ptls, int pool_offset, + int osize, jl_value_t* type) +{ + jl_value_t *val = jl_gc_pool_alloc_inner(ptls, pool_offset, osize); + maybe_record_alloc_to_profile(val, osize, (jl_datatype_t*)type); + return val; +} // This wrapper exists only to prevent `jl_gc_pool_alloc_inner` from being inlined into // its callers. 
We provide an external-facing interface for callers, and inline `jl_gc_pool_alloc_inner` @@ -1523,99 +1481,126 @@ int jl_gc_classify_pools(size_t sz, int *osize) // sweep phase -int64_t lazy_freed_pages = 0; +gc_fragmentation_stat_t gc_page_fragmentation_stats[JL_GC_N_POOLS]; +JL_DLLEXPORT double jl_gc_page_utilization_stats[JL_GC_N_MAX_POOLS]; + +STATIC_INLINE void gc_update_page_fragmentation_data(jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT +{ + gc_fragmentation_stat_t *stats = &gc_page_fragmentation_stats[pg->pool_n]; + jl_atomic_fetch_add(&stats->n_freed_objs, pg->nfree); + jl_atomic_fetch_add(&stats->n_pages_allocd, 1); +} + +STATIC_INLINE void gc_dump_page_utilization_data(void) JL_NOTSAFEPOINT +{ + for (int i = 0; i < JL_GC_N_POOLS; i++) { + gc_fragmentation_stat_t *stats = &gc_page_fragmentation_stats[i]; + double utilization = 1.0; + size_t n_freed_objs = jl_atomic_load_relaxed(&stats->n_freed_objs); + size_t n_pages_allocd = jl_atomic_load_relaxed(&stats->n_pages_allocd); + if (n_pages_allocd != 0) { + utilization -= ((double)n_freed_objs * (double)jl_gc_sizeclasses[i]) / (double)n_pages_allocd / (double)GC_PAGE_SZ; + } + jl_gc_page_utilization_stats[i] = utilization; + jl_atomic_store_relaxed(&stats->n_freed_objs, 0); + jl_atomic_store_relaxed(&stats->n_pages_allocd, 0); + } +} + +int64_t buffered_pages = 0; // Returns pointer to terminal pointer of list rooted at *pfl. -static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_taggedvalue_t **pfl, int sweep_full, int osize) JL_NOTSAFEPOINT +static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_gc_page_stack_t *allocd, jl_gc_page_stack_t *buffered, + jl_gc_pagemeta_t *pg, int osize) JL_NOTSAFEPOINT { char *data = pg->data; - uint8_t *ages = pg->ages; - jl_taggedvalue_t *v = (jl_taggedvalue_t*)(data + GC_PAGE_OFFSET); - char *lim = (char*)v + GC_PAGE_SZ - GC_PAGE_OFFSET - osize; + jl_taggedvalue_t *v0 = (jl_taggedvalue_t*)(data + GC_PAGE_OFFSET); + char *lim = data + GC_PAGE_SZ - osize; + char *lim_newpages = data + GC_PAGE_SZ; + if (gc_page_data((char*)p->newpages - 1) == data) { + lim_newpages = (char*)p->newpages; + } size_t old_nfree = pg->nfree; size_t nfree; + // avoid loading a global variable in the hot path + int page_profile_enabled = gc_page_profile_is_enabled(); + gc_page_serializer_init(s, pg); + int re_use_page = 1; + int keep_as_local_buffer = 0; int freedall = 1; int pg_skpd = 1; if (!pg->has_marked) { + re_use_page = 0; // lazy version: (empty) if the whole page was already unused, free it (return it to the pool) // eager version: (freedall) free page as soon as possible // the eager one uses less memory. // FIXME - need to do accounting on a per-thread basis // on quick sweeps, keep a few pages empty but allocated for performance - if (!sweep_full && lazy_freed_pages <= default_collect_interval / GC_PAGE_SZ) { - jl_ptls_t ptls2 = gc_all_tls_states[pg->thread_n]; - jl_taggedvalue_t *begin = reset_page(ptls2, p, pg, p->newpages); - p->newpages = begin; - begin->next = (jl_taggedvalue_t*)0; - lazy_freed_pages++; - } - else { - jl_gc_free_page(data); + if (!current_sweep_full && buffered_pages <= default_collect_interval / GC_PAGE_SZ) { + buffered_pages++; + keep_as_local_buffer = 1; } nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / osize; + gc_page_profile_write_empty_page(s, page_profile_enabled); goto done; } // For quick sweep, we might be able to skip the page if the page doesn't // have any young live cell before marking. 
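
/*
 * Worked example for gc_dump_page_utilization_data() above, assuming
 * GC_PAGE_SZ is 16 KiB (16384 bytes), as in stock Julia builds of this era:
 * for the 64-byte size class, 10 swept pages hold 10 * 16384 = 163840 bytes
 * of capacity; if the sweep freed 1280 objects, the freed share is
 * 1280 * 64 / 163840 = 0.5, so the reported utilization is 1.0 - 0.5 = 0.5,
 * i.e. half of the pages' capacity holds live objects. Note the formula
 * divides by the full GC_PAGE_SZ and ignores the GC_PAGE_OFFSET header
 * slack, so the figure is a close approximation rather than exact.
 */
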
- if (!sweep_full && !pg->has_young) { + if (!current_sweep_full && !pg->has_young) { assert(!prev_sweep_full || pg->prev_nold >= pg->nold); if (!prev_sweep_full || pg->prev_nold == pg->nold) { - // the position of the freelist begin/end in this page - // is stored in its metadata - if (pg->fl_begin_offset != (uint16_t)-1) { - *pfl = page_pfl_beg(pg); - pfl = (jl_taggedvalue_t**)page_pfl_end(pg); - } freedall = 0; nfree = pg->nfree; + gc_page_profile_write_empty_page(s, page_profile_enabled); goto done; } } pg_skpd = 0; - { // scope to avoid clang goto errors + { // scope to avoid clang goto errors int has_marked = 0; int has_young = 0; int16_t prev_nold = 0; int pg_nfree = 0; + jl_taggedvalue_t *fl = NULL; + jl_taggedvalue_t **pfl = &fl; jl_taggedvalue_t **pfl_begin = NULL; - uint8_t msk = 1; // mask for the age bit in the current age byte + // collect page profile + jl_taggedvalue_t *v = v0; + if (page_profile_enabled) { + while ((char*)v <= lim) { + int bits = v->bits.gc; + if (!gc_marked(bits) || (char*)v >= lim_newpages) { + gc_page_profile_write_garbage(s, page_profile_enabled); + } + else { + gc_page_profile_write_live_obj(s, v, page_profile_enabled); + } + v = (jl_taggedvalue_t*)((char*)v + osize); + } + v = v0; + } + // sweep the page while ((char*)v <= lim) { int bits = v->bits.gc; - if (!gc_marked(bits)) { + // if an object is past `lim_newpages` then we can guarantee it's garbage + if (!gc_marked(bits) || (char*)v >= lim_newpages) { *pfl = v; pfl = &v->next; - pfl_begin = pfl_begin ? pfl_begin : pfl; + pfl_begin = (pfl_begin != NULL) ? pfl_begin : pfl; pg_nfree++; - *ages &= ~msk; } else { // marked young or old - if (*ages & msk || bits == GC_OLD_MARKED) { // old enough - // `!age && bits == GC_OLD_MARKED` is possible for - // non-first-class objects like `jl_binding_t` - if (sweep_full || bits == GC_MARKED) { - bits = v->bits.gc = GC_OLD; // promote - } - prev_nold++; - } - else { - assert(bits == GC_MARKED); - bits = v->bits.gc = GC_CLEAN; // unmark - has_young = 1; + if (current_sweep_full || bits == GC_MARKED) { // old enough + bits = v->bits.gc = GC_OLD; // promote } + prev_nold++; has_marked |= gc_marked(bits); - *ages |= msk; freedall = 0; } v = (jl_taggedvalue_t*)((char*)v + osize); - msk <<= 1; - if (!msk) { - msk = 1; - ages++; - } } - assert(!freedall); pg->has_marked = has_marked; pg->has_young = has_young; @@ -1624,12 +1609,12 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t pg->fl_end_offset = (char*)pfl - data; } else { - pg->fl_begin_offset = -1; - pg->fl_end_offset = -1; + pg->fl_begin_offset = UINT16_MAX; + pg->fl_end_offset = UINT16_MAX; } pg->nfree = pg_nfree; - if (sweep_full) { + if (current_sweep_full) { pg->nold = 0; pg->prev_nold = prev_nold; } @@ -1637,97 +1622,36 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t nfree = pg->nfree; done: + if (re_use_page) { + push_lf_back(allocd, pg); + } + else { + gc_alloc_map_set(pg->data, GC_PAGE_LAZILY_FREED); + if (keep_as_local_buffer) { + push_lf_back(buffered, pg); + } + else { + push_lf_back(&global_page_pool_lazily_freed, pg); + } + } + gc_page_profile_write_to_file(s); + gc_update_page_fragmentation_data(pg); gc_time_count_page(freedall, pg_skpd); - gc_num.freed += (nfree - old_nfree) * osize; - return pfl; + jl_ptls_t ptls = gc_all_tls_states[pg->thread_n]; + jl_atomic_fetch_add(&ptls->gc_num.pool_live_bytes, GC_PAGE_SZ - GC_PAGE_OFFSET - nfree * osize); + jl_atomic_fetch_add((_Atomic(int64_t) *)&gc_num.freed, (nfree - old_nfree) * 
osize); } // the actual sweeping over all allocated pages in a memory pool -static inline void sweep_pool_page(jl_taggedvalue_t ***pfl, jl_gc_pagemeta_t *pg, int sweep_full) JL_NOTSAFEPOINT +STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_page_stack_t *allocd, jl_gc_page_stack_t *lazily_freed, + jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT { int p_n = pg->pool_n; int t_n = pg->thread_n; jl_ptls_t ptls2 = gc_all_tls_states[t_n]; jl_gc_pool_t *p = &ptls2->heap.norm_pools[p_n]; int osize = pg->osize; - pfl[t_n * JL_GC_N_POOLS + p_n] = sweep_page(p, pg, pfl[t_n * JL_GC_N_POOLS + p_n], sweep_full, osize); -} - -// sweep over a pagetable0 for all allocated pages -static inline int sweep_pool_pagetable0(jl_taggedvalue_t ***pfl, pagetable0_t *pagetable0, int sweep_full) JL_NOTSAFEPOINT -{ - unsigned ub = 0; - unsigned alloc = 0; - for (unsigned pg_i = 0; pg_i <= pagetable0->ub; pg_i++) { - uint32_t line = pagetable0->allocmap[pg_i]; - unsigned j; - if (!line) - continue; - ub = pg_i; - alloc = 1; - for (j = 0; line; j++, line >>= 1) { - unsigned next = ffs_u32(line); - j += next; - line >>= next; - jl_gc_pagemeta_t *pg = pagetable0->meta[pg_i * 32 + j]; - sweep_pool_page(pfl, pg, sweep_full); - } - } - pagetable0->ub = ub; - return alloc; -} - -// sweep over pagetable1 for all pagetable0 that may contain allocated pages -static inline int sweep_pool_pagetable1(jl_taggedvalue_t ***pfl, pagetable1_t *pagetable1, int sweep_full) JL_NOTSAFEPOINT -{ - unsigned ub = 0; - unsigned alloc = 0; - for (unsigned pg_i = 0; pg_i <= pagetable1->ub; pg_i++) { - uint32_t line = pagetable1->allocmap0[pg_i]; - unsigned j; - for (j = 0; line; j++, line >>= 1) { - unsigned next = ffs_u32(line); - j += next; - line >>= next; - pagetable0_t *pagetable0 = pagetable1->meta0[pg_i * 32 + j]; - if (pagetable0 && !sweep_pool_pagetable0(pfl, pagetable0, sweep_full)) - pagetable1->allocmap0[pg_i] &= ~(1 << j); // no allocations found, remember that for next time - } - if (pagetable1->allocmap0[pg_i]) { - ub = pg_i; - alloc = 1; - } - } - pagetable1->ub = ub; - return alloc; -} - -// sweep over all memory for all pagetable1 that may contain allocated pages -static void sweep_pool_pagetable(jl_taggedvalue_t ***pfl, int sweep_full) JL_NOTSAFEPOINT -{ - if (REGION2_PG_COUNT == 1) { // compile-time optimization - pagetable1_t *pagetable1 = memory_map.meta1[0]; - if (pagetable1) - sweep_pool_pagetable1(pfl, pagetable1, sweep_full); - return; - } - unsigned ub = 0; - for (unsigned pg_i = 0; pg_i <= memory_map.ub; pg_i++) { - uint32_t line = memory_map.allocmap1[pg_i]; - unsigned j; - for (j = 0; line; j++, line >>= 1) { - unsigned next = ffs_u32(line); - j += next; - line >>= next; - pagetable1_t *pagetable1 = memory_map.meta1[pg_i * 32 + j]; - if (pagetable1 && !sweep_pool_pagetable1(pfl, pagetable1, sweep_full)) - memory_map.allocmap1[pg_i] &= ~(1 << j); // no allocations found, remember that for next time - } - if (memory_map.allocmap1[pg_i]) { - ub = pg_i; - } - } - memory_map.ub = ub; + gc_sweep_page(s, p, allocd, lazily_freed, pg, osize); } // sweep over all memory that is being used and not in a pool @@ -1739,7 +1663,7 @@ static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_NOTSAFEPOINT { - assert(pg->fl_begin_offset != (uint16_t)-1); + assert(pg->fl_begin_offset != UINT16_MAX); char *cur_pg = gc_page_data(last); // Fast path for page that has no allocation jl_taggedvalue_t *fl_beg = 
(jl_taggedvalue_t*)(cur_pg + pg->fl_begin_offset); @@ -1753,11 +1677,72 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_ pg->nfree = nfree; } +void gc_sweep_wake_all(void) +{ + uv_mutex_lock(&gc_threads_lock); + for (int i = gc_first_tid; i < gc_first_tid + jl_n_gcthreads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + jl_atomic_fetch_add(&ptls2->gc_sweeps_requested, 1); + } + uv_cond_broadcast(&gc_threads_cond); + uv_mutex_unlock(&gc_threads_lock); +} + +void gc_sweep_wait_for_all(void) +{ + jl_atomic_store(&gc_allocd_scratch, NULL); + while (jl_atomic_load_relaxed(&gc_n_threads_sweeping) != 0) { + jl_cpu_pause(); + } +} + +void gc_sweep_pool_parallel(void) +{ + jl_atomic_fetch_add(&gc_n_threads_sweeping, 1); + jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch); + if (allocd_scratch != NULL) { + gc_page_profiler_serializer_t serializer = gc_page_serializer_create(); + while (1) { + int found_pg = 0; + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) { + continue; + } + jl_gc_page_stack_t *allocd = &allocd_scratch[t_i]; + jl_gc_pagemeta_t *pg = pop_lf_back(&ptls2->page_metadata_allocd); + if (pg == NULL) { + continue; + } + gc_sweep_pool_page(&serializer, allocd, &ptls2->page_metadata_buffered, pg); + found_pg = 1; + } + if (!found_pg) { + break; + } + } + gc_page_serializer_destroy(&serializer); + } + jl_atomic_fetch_add(&gc_n_threads_sweeping, -1); +} + +void gc_free_pages(void) +{ + while (1) { + jl_gc_pagemeta_t *pg = pop_lf_back(&global_page_pool_lazily_freed); + if (pg == NULL) { + break; + } + jl_gc_free_page(pg); + push_lf_back(&global_page_pool_freed, pg); + } +} + // setup the data-structures for a sweep over all memory pools -static void gc_sweep_pool(int sweep_full) +static void gc_sweep_pool(void) { gc_time_pool_start(); - lazy_freed_pages = 0; + buffered_pages = 0; // For the benefit of the analyzer, which doesn't know that gc_n_threads // doesn't change over the course of this function @@ -1777,11 +1762,12 @@ static void gc_sweep_pool(int sweep_full) } continue; } + jl_atomic_store_relaxed(&ptls2->gc_num.pool_live_bytes, 0); for (int i = 0; i < JL_GC_N_POOLS; i++) { jl_gc_pool_t *p = &ptls2->heap.norm_pools[i]; jl_taggedvalue_t *last = p->freelist; - if (last) { - jl_gc_pagemeta_t *pg = jl_assume(page_metadata(last)); + if (last != NULL) { + jl_gc_pagemeta_t *pg = jl_assume(page_metadata_unsafe(last)); gc_pool_sync_nfree(pg, last); pg->has_young = 1; } @@ -1789,31 +1775,74 @@ static void gc_sweep_pool(int sweep_full) pfl[t_i * JL_GC_N_POOLS + i] = &p->freelist; last = p->newpages; - if (last) { + if (last != NULL) { char *last_p = (char*)last; - jl_gc_pagemeta_t *pg = jl_assume(page_metadata(last_p - 1)); + jl_gc_pagemeta_t *pg = jl_assume(page_metadata_unsafe(last_p - 1)); assert(last_p - gc_page_data(last_p - 1) >= GC_PAGE_OFFSET); pg->nfree = (GC_PAGE_SZ - (last_p - gc_page_data(last_p - 1))) / p->osize; pg->has_young = 1; } - p->newpages = NULL; + } + jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_buffered.bottom); + while (pg != NULL) { + jl_gc_pagemeta_t *pg2 = pg->next; + buffered_pages++; + pg = pg2; } } // the actual sweeping - sweep_pool_pagetable(pfl, sweep_full); + jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)alloca(n_threads * sizeof(jl_gc_page_stack_t)); + memset(tmp, 0, n_threads * sizeof(jl_gc_page_stack_t)); + jl_atomic_store(&gc_allocd_scratch, tmp); + gc_sweep_wake_all(); + gc_sweep_pool_parallel(); + 
gc_sweep_wait_for_all(); - // null out terminal pointers of free lists for (int t_i = 0; t_i < n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2 == NULL) + if (ptls2 != NULL) { + ptls2->page_metadata_allocd = tmp[t_i]; + for (int i = 0; i < JL_GC_N_POOLS; i++) { + jl_gc_pool_t *p = &ptls2->heap.norm_pools[i]; + p->newpages = NULL; + } + } + } + + // merge free lists + for (int t_i = 0; t_i < n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) { continue; - for (int i = 0; i < JL_GC_N_POOLS; i++) { - *pfl[t_i * JL_GC_N_POOLS + i] = NULL; + } + jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom); + while (pg != NULL) { + jl_gc_pagemeta_t *pg2 = pg->next; + if (pg->fl_begin_offset != UINT16_MAX) { + char *cur_pg = pg->data; + jl_taggedvalue_t *fl_beg = (jl_taggedvalue_t*)(cur_pg + pg->fl_begin_offset); + jl_taggedvalue_t *fl_end = (jl_taggedvalue_t*)(cur_pg + pg->fl_end_offset); + *pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = fl_beg; + pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = &fl_end->next; + } + pg = pg2; + } + } + + // null out terminal pointers of free lists + for (int t_i = 0; t_i < n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 != NULL) { + for (int i = 0; i < JL_GC_N_POOLS; i++) { + *pfl[t_i * JL_GC_N_POOLS + i] = NULL; + } } } - gc_time_pool_end(sweep_full); + gc_free_pages(); + gc_dump_page_utilization_data(); + gc_time_pool_end(current_sweep_full); } static void gc_sweep_perm_alloc(void) @@ -1893,7 +1922,7 @@ static void *volatile gc_findval; // for usage from gdb, for finding the gc-root // Handle the case where the stack is only partially copied. STATIC_INLINE uintptr_t gc_get_stack_addr(void *_addr, uintptr_t offset, - uintptr_t lb, uintptr_t ub) + uintptr_t lb, uintptr_t ub) JL_NOTSAFEPOINT { uintptr_t addr = (uintptr_t)_addr; if (addr >= lb && addr < ub) @@ -1902,929 +1931,682 @@ STATIC_INLINE uintptr_t gc_get_stack_addr(void *_addr, uintptr_t offset, } STATIC_INLINE uintptr_t gc_read_stack(void *_addr, uintptr_t offset, - uintptr_t lb, uintptr_t ub) + uintptr_t lb, uintptr_t ub) JL_NOTSAFEPOINT { uintptr_t real_addr = gc_get_stack_addr(_addr, offset, lb, ub); return *(uintptr_t*)real_addr; } JL_NORETURN NOINLINE void gc_assert_datatype_fail(jl_ptls_t ptls, jl_datatype_t *vt, - jl_gc_mark_sp_t sp) + jl_gc_markqueue_t *mq) JL_NOTSAFEPOINT { jl_safe_printf("GC error (probable corruption) :\n"); jl_gc_debug_print_status(); jl_(vt); jl_gc_debug_critical_error(); - gc_mark_loop_unwind(ptls, sp, 0); abort(); } -// This stores the label address in the mark loop function. -// We can't directly store that to a global array so we need some hack to get that. -// See the call to `gc_mark_loop` in init with a `NULL` `ptls`. 
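
/*
 * gc_sweep_wake_all() / gc_sweep_wait_for_all() above pair with a GC-thread
 * worker loop that is not part of this hunk. A plausible sketch of that
 * worker side, assuming each GC thread sleeps on gc_threads_cond and
 * re-checks its gc_sweeps_requested counter under gc_threads_lock
 * (illustrative only; the real thread function also services mark work):
 */
static void gc_threadfun_sketch(jl_ptls_t ptls)
{
    while (1) {
        uv_mutex_lock(&gc_threads_lock);
        // sleep until a collection requests help with sweeping
        while (jl_atomic_load_relaxed(&ptls->gc_sweeps_requested) == 0)
            uv_cond_wait(&gc_threads_cond, &gc_threads_lock);
        jl_atomic_fetch_add(&ptls->gc_sweeps_requested, -1);
        uv_mutex_unlock(&gc_threads_lock);
        // drain pages from every thread's allocd stack; when the last
        // worker leaves, gc_sweep_wait_for_all() stops spinning
        gc_sweep_pool_parallel();
    }
}
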
-void *gc_mark_label_addrs[_GC_MARK_L_MAX]; - -// Double the local mark stack (both pc and data) -static void NOINLINE gc_mark_stack_resize(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) JL_NOTSAFEPOINT -{ - jl_gc_mark_data_t *old_data = gc_cache->data_stack; - void **pc_stack = sp->pc_start; - size_t stack_size = (char*)sp->pc_end - (char*)pc_stack; - ptrdiff_t datadiff = (char*)sp->data - (char*)old_data; - gc_cache->data_stack = (jl_gc_mark_data_t *)realloc_s(old_data, stack_size * 2 * sizeof(jl_gc_mark_data_t)); - sp->data = (jl_gc_mark_data_t *)((char*)gc_cache->data_stack + datadiff); - - sp->pc_start = gc_cache->pc_stack = (void**)realloc_s(pc_stack, stack_size * 2 * sizeof(void*)); - gc_cache->pc_stack_end = sp->pc_end = sp->pc_start + stack_size * 2; - sp->pc = sp->pc_start + (sp->pc - pc_stack); -} - -// Push a work item to the stack. The type of the work item is marked with `pc`. -// The data needed is in `data` and is of size `data_size`. -// If there isn't enough space on the stack, the stack will be resized with the stack -// lock held. The caller should invalidate any local cache of the stack addresses that's not -// in `gc_cache` or `sp` -// The `sp` will be updated on return if `inc` is true. -STATIC_INLINE void gc_mark_stack_push(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - void *pc, void *data, size_t data_size, int inc) JL_NOTSAFEPOINT +// Check if `nptr` is tagged for `old + refyoung`, +// Push the object to the remset and update the `nptr` counter if necessary. +STATIC_INLINE void gc_mark_push_remset(jl_ptls_t ptls, jl_value_t *obj, + uintptr_t nptr) JL_NOTSAFEPOINT { - assert(data_size <= sizeof(jl_gc_mark_data_t)); - if (__unlikely(sp->pc == sp->pc_end)) - gc_mark_stack_resize(gc_cache, sp); - *sp->pc = pc; - memcpy(sp->data, data, data_size); - if (inc) { - sp->data = (jl_gc_mark_data_t *)(((char*)sp->data) + data_size); - sp->pc++; + if (__unlikely((nptr & 0x3) == 0x3)) { + ptls->heap.remset_nptr += nptr >> 2; + arraylist_t *remset = ptls->heap.remset; + size_t len = remset->len; + if (__unlikely(len >= remset->max)) { + arraylist_push(remset, obj); + } + else { + remset->len = len + 1; + remset->items[len] = obj; + } } } -// Check if the reference is non-NULL and atomically set the mark bit. -// Update `*nptr`, which is the `nptr` field of the parent item, if the object is young. -// Return the tag (with GC bits cleared) and the GC bits in `*ptag` and `*pbits`. -// Return whether the object needs to be scanned / have metadata updated. -STATIC_INLINE int gc_try_setmark(jl_value_t *obj, uintptr_t *nptr, - uintptr_t *ptag, uint8_t *pbits) JL_NOTSAFEPOINT +// Push a work item to the queue +STATIC_INLINE void gc_ptr_queue_push(jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT { - if (!obj) - return 0; - jl_taggedvalue_t *o = jl_astaggedvalue(obj); - uintptr_t tag = o->header; - if (!gc_marked(tag)) { - uint8_t bits; - int res = gc_setmark_tag(o, GC_MARKED, tag, &bits); - if (!gc_old(bits)) - *nptr = *nptr | 1; - *ptag = tag & ~(uintptr_t)0xf; - *pbits = bits; - return __likely(res); - } - else if (!gc_old(tag)) { - *nptr = *nptr | 1; - } - return 0; + ws_array_t *old_a = ws_queue_push(&mq->ptr_queue, &obj, sizeof(jl_value_t*)); + // Put `old_a` in `reclaim_set` to be freed after the mark phase + if (__unlikely(old_a != NULL)) + arraylist_push(&mq->reclaim_set, old_a); } -// Queue a finalizer list to be scanned in the mark loop. Start marking from index `start`. 
-void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - arraylist_t *list, size_t start) +// Pop from the mark queue +STATIC_INLINE jl_value_t *gc_ptr_queue_pop(jl_gc_markqueue_t *mq) JL_NOTSAFEPOINT { - size_t len = list->len; - if (len <= start) - return; - jl_value_t **items = (jl_value_t**)list->items; - gc_mark_finlist_t markdata = {items + start, items + len}; - gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[GC_MARK_L_finlist], - &markdata, sizeof(markdata), 1); + jl_value_t *v = NULL; + ws_queue_pop(&mq->ptr_queue, &v, sizeof(jl_value_t*)); + return v; } -// Queue a object to be scanned. The object should already be marked and the GC metadata -// should already be updated for it. Only scanning of the object should be performed. -STATIC_INLINE void gc_mark_queue_scan_obj(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - jl_value_t *obj) +// Steal from `mq2` +STATIC_INLINE jl_value_t *gc_ptr_queue_steal_from(jl_gc_markqueue_t *mq2) JL_NOTSAFEPOINT { - jl_taggedvalue_t *o = jl_astaggedvalue(obj); - uintptr_t tag = o->header; - uint8_t bits = tag & 0xf; - tag = tag & ~(uintptr_t)0xf; - gc_mark_marked_obj_t data = {obj, tag, bits}; - gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[GC_MARK_L_scan_only], - &data, sizeof(data), 1); + jl_value_t *v = NULL; + ws_queue_steal_from(&mq2->ptr_queue, &v, sizeof(jl_value_t*)); + return v; } -// Mark and queue a object to be scanned. -// The object will be marked atomically which can also happen concurrently. -// It will be queued if the object wasn't marked already (or concurrently by another thread) -// Returns whether the object is young. -STATIC_INLINE int gc_mark_queue_obj(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, void *_obj) JL_NOTSAFEPOINT +// Push chunk `*c` into chunk queue +STATIC_INLINE void gc_chunkqueue_push(jl_gc_markqueue_t *mq, jl_gc_chunk_t *c) JL_NOTSAFEPOINT { - jl_value_t *obj = (jl_value_t*)jl_assume(_obj); - uintptr_t nptr = 0; - uintptr_t tag = 0; - uint8_t bits = 0; - if (!gc_try_setmark(obj, &nptr, &tag, &bits)) - return (int)nptr; - gc_mark_marked_obj_t data = {obj, tag, bits}; - gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[GC_MARK_L_marked_obj], - &data, sizeof(data), 1); - return (int)nptr; + ws_array_t *old_a = ws_queue_push(&mq->chunk_queue, c, sizeof(jl_gc_chunk_t)); + // Put `old_a` in `reclaim_set` to be freed after the mark phase + if (__unlikely(old_a != NULL)) + arraylist_push(&mq->reclaim_set, old_a); } -int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_value_t *obj) +// Pop chunk from chunk queue +STATIC_INLINE jl_gc_chunk_t gc_chunkqueue_pop(jl_gc_markqueue_t *mq) JL_NOTSAFEPOINT { - return gc_mark_queue_obj(gc_cache, sp, obj); + jl_gc_chunk_t c = {.cid = GC_empty_chunk}; + ws_queue_pop(&mq->chunk_queue, &c, sizeof(jl_gc_chunk_t)); + return c; } -JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) +// Steal chunk from `mq2` +STATIC_INLINE jl_gc_chunk_t gc_chunkqueue_steal_from(jl_gc_markqueue_t *mq2) JL_NOTSAFEPOINT { - return gc_mark_queue_obj(&ptls->gc_cache, &ptls->gc_mark_sp, obj); + jl_gc_chunk_t c = {.cid = GC_empty_chunk}; + ws_queue_steal_from(&mq2->chunk_queue, &c, sizeof(jl_gc_chunk_t)); + return c; } -JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, - jl_value_t **objs, size_t nobjs) +// Enqueue an unmarked obj. 
last bit of `nptr` is set if `_obj` is young +STATIC_INLINE void gc_try_claim_and_push(jl_gc_markqueue_t *mq, void *_obj, + uintptr_t *nptr) JL_NOTSAFEPOINT { - gc_mark_objarray_t data = { parent, objs, objs + nobjs, 1, - jl_astaggedvalue(parent)->bits.gc & 2 }; - gc_mark_stack_push(&ptls->gc_cache, &ptls->gc_mark_sp, - gc_mark_label_addrs[GC_MARK_L_objarray], - &data, sizeof(data), 1); + if (_obj == NULL) + return; + jl_value_t *obj = (jl_value_t *)jl_assume(_obj); + jl_taggedvalue_t *o = jl_astaggedvalue(obj); + if (!gc_old(o->header) && nptr) + *nptr |= 1; + if (gc_try_setmark_tag(o, GC_MARKED)) + gc_ptr_queue_push(mq, obj); } - -// Check if `nptr` is tagged for `old + refyoung`, -// Push the object to the remset and update the `nptr` counter if necessary. -STATIC_INLINE void gc_mark_push_remset(jl_ptls_t ptls, jl_value_t *obj, uintptr_t nptr) JL_NOTSAFEPOINT +// Mark object with 8bit field descriptors +STATIC_INLINE jl_value_t *gc_mark_obj8(jl_ptls_t ptls, char *obj8_parent, uint8_t *obj8_begin, + uint8_t *obj8_end, uintptr_t nptr) JL_NOTSAFEPOINT { - if (__unlikely((nptr & 0x3) == 0x3)) { - ptls->heap.remset_nptr += nptr >> 2; - arraylist_t *remset = ptls->heap.remset; - size_t len = remset->len; - if (__unlikely(len >= remset->max)) { - arraylist_push(remset, obj); - } - else { - remset->len = len + 1; - remset->items[len] = obj; + (void)jl_assume(obj8_begin < obj8_end); + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t **slot = NULL; + jl_value_t *new_obj = NULL; + for (; obj8_begin < obj8_end; obj8_begin++) { + slot = &((jl_value_t**)obj8_parent)[*obj8_begin]; + new_obj = *slot; + if (new_obj != NULL) { + verify_parent2("object", obj8_parent, slot, "field(%d)", + gc_slot_to_fieldidx(obj8_parent, slot, (jl_datatype_t*)jl_typeof(obj8_parent))); + if (obj8_begin + 1 != obj8_end) { + gc_try_claim_and_push(mq, new_obj, &nptr); + } + else { + // Unroll marking of last item to avoid pushing + // and popping it right away + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + nptr |= !gc_old(o->header); + if (!gc_try_setmark_tag(o, GC_MARKED)) new_obj = NULL; + } + gc_heap_snapshot_record_object_edge((jl_value_t*)obj8_parent, slot); } } + gc_mark_push_remset(ptls, (jl_value_t *)obj8_parent, nptr); + return new_obj; } -// Scan a dense array of object references, see `gc_mark_objarray_t` -STATIC_INLINE int gc_mark_scan_objarray(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, - gc_mark_objarray_t *objary, - jl_value_t **begin, jl_value_t **end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) +// Mark object with 16bit field descriptors +STATIC_INLINE jl_value_t *gc_mark_obj16(jl_ptls_t ptls, char *obj16_parent, uint16_t *obj16_begin, + uint16_t *obj16_end, uintptr_t nptr) JL_NOTSAFEPOINT { - (void)jl_assume(objary == (gc_mark_objarray_t*)sp->data); - for (; begin < end; begin += objary->step) { - *pnew_obj = *begin; - if (*pnew_obj) { - verify_parent2("obj array", objary->parent, begin, "elem(%d)", - gc_slot_to_arrayidx(objary->parent, begin)); - gc_heap_snapshot_record_array_edge(objary->parent, begin); - } - if (!gc_try_setmark(*pnew_obj, &objary->nptr, ptag, pbits)) - continue; - begin += objary->step; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - objary->begin = begin; - gc_repush_markdata(sp, gc_mark_objarray_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. 
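+    // Same traversal pattern as `gc_mark_obj8` above: every field except the
+    // last is pushed onto the mark queue, while the last unmarked field is
+    // returned to the caller, which continues with it directly (see the
+    // `goto mark_obj` loop in `gc_mark_outrefs` below); this saves one queue
+    // push/pop round-trip per object on the common path.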
- gc_mark_push_remset(ptls, objary->parent, objary->nptr); - } - return 1; - } - gc_mark_push_remset(ptls, objary->parent, objary->nptr); - return 0; -} - -// Scan a sparse array of object references, see `gc_mark_objarray_t` -STATIC_INLINE int gc_mark_scan_array8(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, - gc_mark_array8_t *ary8, - jl_value_t **begin, jl_value_t **end, - uint8_t *elem_begin, uint8_t *elem_end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) -{ - (void)jl_assume(ary8 == (gc_mark_array8_t*)sp->data); - size_t elsize = ((jl_array_t*)ary8->elem.parent)->elsize / sizeof(jl_value_t*); - for (; begin < end; begin += elsize) { - for (; elem_begin < elem_end; elem_begin++) { - jl_value_t **slot = &begin[*elem_begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("array", ary8->elem.parent, slot, "elem(%d)", - gc_slot_to_arrayidx(ary8->elem.parent, begin)); - gc_heap_snapshot_record_array_edge(ary8->elem.parent, slot); - } - if (!gc_try_setmark(*pnew_obj, &ary8->elem.nptr, ptag, pbits)) - continue; - elem_begin++; - // Found an object to mark - if (elem_begin < elem_end) { - // Haven't done with this one yet. Update the content and push it back - ary8->elem.begin = elem_begin; - ary8->begin = begin; - gc_repush_markdata(sp, gc_mark_array8_t); + (void)jl_assume(obj16_begin < obj16_end); + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t **slot = NULL; + jl_value_t *new_obj = NULL; + for (; obj16_begin < obj16_end; obj16_begin++) { + slot = &((jl_value_t **)obj16_parent)[*obj16_begin]; + new_obj = *slot; + if (new_obj != NULL) { + verify_parent2("object", obj16_parent, slot, "field(%d)", + gc_slot_to_fieldidx(obj16_parent, slot, (jl_datatype_t*)jl_typeof(obj16_parent))); + if (obj16_begin + 1 != obj16_end) { + gc_try_claim_and_push(mq, new_obj, &nptr); } else { - begin += elsize; - if (begin < end) { - // Haven't done with this array yet. Reset the content and push it back - ary8->elem.begin = ary8->rebegin; - ary8->begin = begin; - gc_repush_markdata(sp, gc_mark_array8_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. 
- gc_mark_push_remset(ptls, ary8->elem.parent, ary8->elem.nptr); - } + // Unroll marking of last item to avoid pushing + // and popping it right away + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + nptr |= !gc_old(o->header); + if (!gc_try_setmark_tag(o, GC_MARKED)) new_obj = NULL; } - return 1; + gc_heap_snapshot_record_object_edge((jl_value_t*)obj16_parent, slot); } - elem_begin = ary8->rebegin; - } - gc_mark_push_remset(ptls, ary8->elem.parent, ary8->elem.nptr); - return 0; -} - -// Scan a sparse array of object references, see `gc_mark_objarray_t` -STATIC_INLINE int gc_mark_scan_array16(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, - gc_mark_array16_t *ary16, - jl_value_t **begin, jl_value_t **end, - uint16_t *elem_begin, uint16_t *elem_end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) -{ - (void)jl_assume(ary16 == (gc_mark_array16_t*)sp->data); - size_t elsize = ((jl_array_t*)ary16->elem.parent)->elsize / sizeof(jl_value_t*); - for (; begin < end; begin += elsize) { - for (; elem_begin < elem_end; elem_begin++) { - jl_value_t **slot = &begin[*elem_begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("array", ary16->elem.parent, slot, "elem(%d)", - gc_slot_to_arrayidx(ary16->elem.parent, begin)); - gc_heap_snapshot_record_array_edge(ary16->elem.parent, slot); - } - if (!gc_try_setmark(*pnew_obj, &ary16->elem.nptr, ptag, pbits)) - continue; - elem_begin++; - // Found an object to mark - if (elem_begin < elem_end) { - // Haven't done with this one yet. Update the content and push it back - ary16->elem.begin = elem_begin; - ary16->begin = begin; - gc_repush_markdata(sp, gc_mark_array16_t); + } + gc_mark_push_remset(ptls, (jl_value_t *)obj16_parent, nptr); + return new_obj; +} + +// Mark object with 32bit field descriptors +STATIC_INLINE jl_value_t *gc_mark_obj32(jl_ptls_t ptls, char *obj32_parent, uint32_t *obj32_begin, + uint32_t *obj32_end, uintptr_t nptr) JL_NOTSAFEPOINT +{ + (void)jl_assume(obj32_begin < obj32_end); + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t **slot = NULL; + jl_value_t *new_obj = NULL; + for (; obj32_begin < obj32_end; obj32_begin++) { + slot = &((jl_value_t **)obj32_parent)[*obj32_begin]; + new_obj = *slot; + if (new_obj != NULL) { + verify_parent2("object", obj32_parent, slot, "field(%d)", + gc_slot_to_fieldidx(obj32_parent, slot, (jl_datatype_t*)jl_typeof(obj32_parent))); + if (obj32_begin + 1 != obj32_end) { + gc_try_claim_and_push(mq, new_obj, &nptr); } else { - begin += elsize; - if (begin < end) { - // Haven't done with this array yet. Reset the content and push it back - ary16->elem.begin = ary16->rebegin; - ary16->begin = begin; - gc_repush_markdata(sp, gc_mark_array16_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. - gc_mark_push_remset(ptls, ary16->elem.parent, ary16->elem.nptr); - } + // Unroll marking of last item to avoid pushing + // and popping it right away + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + nptr |= !gc_old(o->header); + if (!gc_try_setmark_tag(o, GC_MARKED)) new_obj = NULL; } - return 1; + gc_heap_snapshot_record_object_edge((jl_value_t*)obj32_parent, slot); } - elem_begin = ary16->rebegin; } - gc_mark_push_remset(ptls, ary16->elem.parent, ary16->elem.nptr); - return 0; -} - - -// Scan an object with 8bits field descriptors. 
see `gc_mark_obj8_t`
-STATIC_INLINE int gc_mark_scan_obj8(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark_obj8_t *obj8,
-                                    char *parent, uint8_t *begin, uint8_t *end,
-                                    jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits)
-{
-    (void)jl_assume(obj8 == (gc_mark_obj8_t*)sp->data);
-    (void)jl_assume(begin < end);
-    for (; begin < end; begin++) {
-        jl_value_t **slot = &((jl_value_t**)parent)[*begin];
-        *pnew_obj = *slot;
-        if (*pnew_obj) {
-            verify_parent2("object", parent, slot, "field(%d)",
-                           gc_slot_to_fieldidx(parent, slot, (jl_datatype_t*)jl_typeof(parent)));
-            gc_heap_snapshot_record_object_edge((jl_value_t*)parent, slot);
+    gc_mark_push_remset(ptls, (jl_value_t *)obj32_parent, nptr);
+    return new_obj;
+}
+
+// Mark object array
+STATIC_INLINE void gc_mark_objarray(jl_ptls_t ptls, jl_value_t *obj_parent, jl_value_t **obj_begin,
+                                    jl_value_t **obj_end, uint32_t step, uintptr_t nptr) JL_NOTSAFEPOINT
+{
+    jl_gc_markqueue_t *mq = &ptls->mark_queue;
+    jl_value_t *new_obj;
+    // Decide whether we need to chunk the object array
+    (void)jl_assume(step > 0);
+    if ((nptr & 0x2) == 0x2) {
+        // pre-scan this object: most of this object should be old, so look for
+        // the first young object before starting this chunk
+        // (this also would be valid for young objects, but probably less beneficial)
+        for (; obj_begin < obj_end; obj_begin += step) {
+            jl_value_t **slot = obj_begin;
+            new_obj = *slot;
+            if (new_obj != NULL) {
+                verify_parent2("obj array", obj_parent, obj_begin, "elem(%d)",
+                               gc_slot_to_arrayidx(obj_parent, obj_begin));
+                jl_taggedvalue_t *o = jl_astaggedvalue(new_obj);
+                if (!gc_old(o->header))
+                    nptr |= 1;
+                if (!gc_marked(o->header))
+                    break;
+                gc_heap_snapshot_record_array_edge(obj_parent, slot);
+            }
        }
-        if (!gc_try_setmark(*pnew_obj, &obj8->nptr, ptag, pbits))
-            continue;
-        begin++;
-        // Found an object to mark
-        if (begin < end) {
-            // Haven't done with this one yet. Update the content and push it back
-            obj8->begin = begin;
-            gc_repush_markdata(sp, gc_mark_obj8_t);
+    }
+    size_t too_big = (obj_end - obj_begin) / GC_CHUNK_BATCH_SIZE > step; // use this order of operations to avoid idiv
+    jl_value_t **scan_end = obj_end;
+    int pushed_chunk = 0;
+    if (too_big) {
+        scan_end = obj_begin + step * GC_CHUNK_BATCH_SIZE;
+        // case 1: array owner is young, so we won't need to scan through all its elements
+        // to know that we will never need to push it to the remset. it's fine
+        // to create a chunk with "incorrect" `nptr` and push it to the chunk-queue
+        // ASAP in order to expose as much parallelism as possible
+        // case 2: lowest two bits of `nptr` are already set to 0x3, so won't change after
+        // scanning the array elements
+        if ((nptr & 0x2) != 0x2 || (nptr & 0x3) == 0x3) {
+            jl_gc_chunk_t c = {GC_objary_chunk, obj_parent, scan_end, obj_end, NULL, NULL, step, nptr};
+            gc_chunkqueue_push(mq, &c);
+            pushed_chunk = 1;
        }
-        else {
-            // Finished scanning this one, finish up by checking the GC invariance
-            // and let the next item replacing the current one directly.
-            gc_mark_push_remset(ptls, obj8->parent, obj8->nptr);
-        }
-        return 1;
-    }
-    gc_mark_push_remset(ptls, obj8->parent, obj8->nptr);
-    return 0;
-}
-
-// Scan an object with 16bits field descriptors. 
see `gc_mark_obj16_t` -STATIC_INLINE int gc_mark_scan_obj16(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark_obj16_t *obj16, - char *parent, uint16_t *begin, uint16_t *end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) JL_NOTSAFEPOINT -{ - (void)jl_assume(obj16 == (gc_mark_obj16_t*)sp->data); - (void)jl_assume(begin < end); - for (; begin < end; begin++) { - jl_value_t **slot = &((jl_value_t**)parent)[*begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("object", parent, slot, "field(%d)", - gc_slot_to_fieldidx(parent, slot, (jl_datatype_t*)jl_typeof(parent))); - gc_heap_snapshot_record_object_edge((jl_value_t*)parent, slot); - } - if (!gc_try_setmark(*pnew_obj, &obj16->nptr, ptag, pbits)) - continue; - begin++; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - obj16->begin = begin; - gc_repush_markdata(sp, gc_mark_obj16_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. - gc_mark_push_remset(ptls, obj16->parent, obj16->nptr); - } - return 1; - } - gc_mark_push_remset(ptls, obj16->parent, obj16->nptr); - return 0; -} - -// Scan an object with 32bits field descriptors. see `gc_mark_obj32_t` -STATIC_INLINE int gc_mark_scan_obj32(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark_obj32_t *obj32, - char *parent, uint32_t *begin, uint32_t *end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) -{ - (void)jl_assume(obj32 == (gc_mark_obj32_t*)sp->data); - (void)jl_assume(begin < end); - for (; begin < end; begin++) { - jl_value_t **slot = &((jl_value_t**)parent)[*begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("object", parent, slot, "field(%d)", - gc_slot_to_fieldidx(parent, slot, (jl_datatype_t*)jl_typeof(parent))); - gc_heap_snapshot_record_object_edge((jl_value_t*)parent, slot); + } + for (; obj_begin < scan_end; obj_begin += step) { + jl_value_t **slot = obj_begin; + new_obj = *obj_begin; + if (new_obj != NULL) { + verify_parent2("obj array", obj_parent, obj_begin, "elem(%d)", + gc_slot_to_arrayidx(obj_parent, obj_begin)); + gc_try_claim_and_push(mq, new_obj, &nptr); + gc_heap_snapshot_record_array_edge(obj_parent, slot); } - if (!gc_try_setmark(*pnew_obj, &obj32->nptr, ptag, pbits)) - continue; - begin++; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - obj32->begin = begin; - gc_repush_markdata(sp, gc_mark_obj32_t); + } + if (too_big) { + if (!pushed_chunk) { + jl_gc_chunk_t c = {GC_objary_chunk, obj_parent, scan_end, obj_end, NULL, NULL, step, nptr}; + gc_chunkqueue_push(mq, &c); } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. 
-            gc_mark_push_remset(ptls, obj32->parent, obj32->nptr);
+    }
+    else {
+        gc_mark_push_remset(ptls, obj_parent, nptr);
+    }
+}
+
+// Mark array with 8bit field descriptors
+STATIC_INLINE void gc_mark_array8(jl_ptls_t ptls, jl_value_t *ary8_parent, jl_value_t **ary8_begin,
+                                  jl_value_t **ary8_end, uint8_t *elem_begin, uint8_t *elem_end,
+                                  uintptr_t nptr) JL_NOTSAFEPOINT
+{
+    jl_gc_markqueue_t *mq = &ptls->mark_queue;
+    jl_value_t *new_obj;
+    size_t elsize = ((jl_array_t *)ary8_parent)->elsize / sizeof(jl_value_t *);
+    assert(elsize > 0);
+    // Decide whether we need to chunk the array
+    if ((nptr & 0x2) == 0x2) {
+        // pre-scan this object: most of this object should be old, so look for
+        // the first young object before starting this chunk
+        // (this also would be valid for young objects, but probably less beneficial)
+        for (; ary8_begin < ary8_end; ary8_begin += elsize) {
+            int early_end = 0;
+            for (uint8_t *pindex = elem_begin; pindex < elem_end; pindex++) {
+                jl_value_t **slot = &ary8_begin[*pindex];
+                new_obj = *slot;
+                if (new_obj != NULL) {
+                    verify_parent2("array", ary8_parent, &new_obj, "elem(%d)",
+                                   gc_slot_to_arrayidx(ary8_parent, ary8_begin));
+                    jl_taggedvalue_t *o = jl_astaggedvalue(new_obj);
+                    if (!gc_old(o->header))
+                        nptr |= 1;
+                    if (!gc_marked(o->header)) {
+                        early_end = 1;
+                        break;
+                    }
+                    gc_heap_snapshot_record_array_edge(ary8_parent, slot);
+                }
+            }
+            if (early_end)
+                break;
        }
-        return 1;
    }
-    gc_mark_push_remset(ptls, obj32->parent, obj32->nptr);
-    return 0;
-}
-
-#if defined(__GNUC__) && !defined(_OS_EMSCRIPTEN_)
-# define gc_mark_laddr(name) (&&name)
-# define gc_mark_jmp(ptr) goto *(ptr)
-#else
-#define gc_mark_laddr(name) ((void*)(uintptr_t)GC_MARK_L_##name)
-#define gc_mark_jmp(ptr) do {                                   \
-        switch ((int)(uintptr_t)ptr) {                          \
-        case GC_MARK_L_marked_obj:                              \
-            goto marked_obj;                                    \
-        case GC_MARK_L_scan_only:                               \
-            goto scan_only;                                     \
-        case GC_MARK_L_finlist:                                 \
-            goto finlist;                                       \
-        case GC_MARK_L_objarray:                                \
-            goto objarray;                                      \
-        case GC_MARK_L_array8:                                  \
-            goto array8;                                        \
-        case GC_MARK_L_array16:                                 \
-            goto array16;                                       \
-        case GC_MARK_L_obj8:                                    \
-            goto obj8;                                          \
-        case GC_MARK_L_obj16:                                   \
-            goto obj16;                                         \
-        case GC_MARK_L_obj32:                                   \
-            goto obj32;                                         \
-        case GC_MARK_L_stack:                                   \
-            goto stack;                                         \
-        case GC_MARK_L_excstack:                                \
-            goto excstack;                                      \
-        case GC_MARK_L_module_binding:                          \
-            goto module_binding;                                \
-        default:                                                \
-            abort();                                            \
-        }                                                       \
-    } while (0)
-#endif
-
-// This is the main marking loop.
-// It uses an iterative (mostly) Depth-first search (DFS) to mark all the objects.
-// Instead of using the native stack, two stacks are manually maintained,
-// one (fixed-size) pc stack which stores the return address and one (variable-size)
-// data stack which stores the local variables needed by the scanning code.
-// Using a manually maintained stack has a few advantages
-//
-// 1. We can resize the stack as we go and never worry about stack overflow
-//    This is especitally useful when enters the GC in a deep call stack.
-//    It also removes the very deep GC call stack in a profile.
-// 2. We can minimize the number of local variables to save on the stack.
-//    This includes minimizing the sizes of the stack frames and only saving variables
-//    that have been changed before making "function calls" (i.e. `goto mark;`)
-// 3. We can perform end-of-loop tail-call optimization for common cases.
-// 4. The marking can be interrupted more easily since all the states are maintained
-//    in a well-defined format already.
-//    This will be useful if we want to have incremental marking again.
-// 5. 
The frames can be stolen by another thread more easily and it is not necessary -// to copy works to be stolen to another queue. Useful for parallel marking. -// (Will still require synchronization in stack popping of course.) -// 6. A flat function (i.e. no or very few function calls) also give the compiler -// opportunity to keep more states in registers that doesn't have to be spilled as often. -// -// We use two stacks so that the thief on another thread can steal the fixed sized pc stack -// and use that to figure out the size of the struct on the variable size data stack. -// -// The main disadvantages are that we bypass some stack-based CPU optimizations including the -// stack engine and return address prediction. -// Using two stacks also double the number of operations on the stack pointer -// though we still only need to use one of them (the pc stack pointer) for bounds check. -// In general, it seems that the reduction of stack memory ops and instructions count -// have a larger positive effect on the performance. =) - -// As a general guide we do not want to make non-inlined function calls in this function -// if possible since a large number of registers has to be spilled when that happens. -// This is especially true on on X86 which doesn't have many (any?) -// callee saved general purpose registers. -// (OTOH, the spill will likely make use of the stack engine which is otherwise idle so -// the performance impact is minimum as long as it's not in the hottest path) - -// There are three external entry points to the loop, corresponding to label -// `marked_obj`, `scan_only` and `finlist` (see the corresponding functions -// `gc_mark_queue_obj`, `gc_mark_queue_scan_obj` and `gc_mark_queue_finlist` above). -// The scanning of the object starts with `goto mark`, which updates the metadata and scans -// the object whose information is stored in `new_obj`, `tag` and `bits`. -// The branches in `mark` will dispatch the object to one of the scan "loop"s to be scanned -// as either a normal julia object or one of the special objects with specific storage format. -// Each of the scan "loop" will perform a DFS of the object in the following way -// -// 1. When encountering an pointer (julia object reference) slots, load, perform NULL check -// and atomically set the mark bits to determine if the object needs to be scanned. -// 2. If yes, it'll push itself back onto the mark stack (after updating fields that are changed) -// using `gc_repush_markdata` to increment the stack pointers. -// This step can also be replaced by a tail call by finishing up the marking of the current -// object when the end of the current object is reached. -// 3. Jump to `mark`. The marking of the current object will be resumed after the child is -// scanned by popping the stack frame back. -// -// Some of the special object scannings use BFS to simplify the code (Task and Module). - -// The jumps from the dispatch to the scan "loop"s are done by first pushing a frame -// to the stacks while only increment the data stack pointer before jumping to the loop -// This way the scan "loop" gets exactly what it expects after a stack pop. -// Additional optimizations are done for some of the common cases by skipping -// the unnecessary data stack pointer increment and the load from the stack -// (i.e. store to load forwarding). See `objary_loaded`, `obj8_loaded` and `obj16_loaded`. 
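+    // Arithmetic behind the `too_big` test below: `ary8_end - ary8_begin`
+    // counts pointer-sized slots, i.e. nelems * elsize, so dividing by
+    // GC_CHUNK_BATCH_SIZE first and then comparing against `elsize` asks,
+    // up to integer-division rounding, whether nelems exceeds
+    // GC_CHUNK_BATCH_SIZE without ever dividing by `elsize`. For example,
+    // assuming a batch size of 4096 and elsize == 4, an array of 6000
+    // elements (24000 slots) is "too big": the first 4096 elements are
+    // scanned inline and the remaining 1904 are pushed as a GC_ary8_chunk
+    // that other threads can steal.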
-JL_EXTENSION NOINLINE void gc_mark_loop(jl_ptls_t ptls, jl_gc_mark_sp_t sp)
-{
-    if (__unlikely(ptls == NULL)) {
-        gc_mark_label_addrs[GC_MARK_L_marked_obj] = gc_mark_laddr(marked_obj);
-        gc_mark_label_addrs[GC_MARK_L_scan_only] = gc_mark_laddr(scan_only);
-        gc_mark_label_addrs[GC_MARK_L_finlist] = gc_mark_laddr(finlist);
-        gc_mark_label_addrs[GC_MARK_L_objarray] = gc_mark_laddr(objarray);
-        gc_mark_label_addrs[GC_MARK_L_array8] = gc_mark_laddr(array8);
-        gc_mark_label_addrs[GC_MARK_L_array16] = gc_mark_laddr(array16);
-        gc_mark_label_addrs[GC_MARK_L_obj8] = gc_mark_laddr(obj8);
-        gc_mark_label_addrs[GC_MARK_L_obj16] = gc_mark_laddr(obj16);
-        gc_mark_label_addrs[GC_MARK_L_obj32] = gc_mark_laddr(obj32);
-        gc_mark_label_addrs[GC_MARK_L_stack] = gc_mark_laddr(stack);
-        gc_mark_label_addrs[GC_MARK_L_excstack] = gc_mark_laddr(excstack);
-        gc_mark_label_addrs[GC_MARK_L_module_binding] = gc_mark_laddr(module_binding);
-        return;
+    size_t too_big = (ary8_end - ary8_begin) / GC_CHUNK_BATCH_SIZE > elsize; // use this order of operations to avoid idiv
+    jl_value_t **scan_end = ary8_end;
+    int pushed_chunk = 0;
+    if (too_big) {
+        scan_end = ary8_begin + elsize * GC_CHUNK_BATCH_SIZE;
+        // case 1: array owner is young, so we won't need to scan through all its elements
+        // to know that we will never need to push it to the remset. it's fine
+        // to create a chunk with "incorrect" `nptr` and push it to the chunk-queue
+        // ASAP in order to expose as much parallelism as possible
+        // case 2: lowest two bits of `nptr` are already set to 0x3, so won't change after
+        // scanning the array elements
+        if ((nptr & 0x2) != 0x2 || (nptr & 0x3) == 0x3) {
+            jl_gc_chunk_t c = {GC_ary8_chunk, ary8_parent, scan_end, ary8_end, elem_begin, elem_end, 0, nptr};
+            gc_chunkqueue_push(mq, &c);
+            pushed_chunk = 1;
+        }
    }
-
-    jl_value_t *new_obj = NULL;
-    uintptr_t tag = 0;
-    uint8_t bits = 0;
-    int meta_updated = 0;
-
-    gc_mark_objarray_t *objary;
-    jl_value_t **objary_begin;
-    jl_value_t **objary_end;
-
-    gc_mark_array8_t *ary8;
-    gc_mark_array16_t *ary16;
-
-    gc_mark_obj8_t *obj8;
-    char *obj8_parent;
-    uint8_t *obj8_begin;
-    uint8_t *obj8_end;
-
-    gc_mark_obj16_t *obj16;
-    char *obj16_parent;
-    uint16_t *obj16_begin;
-    uint16_t *obj16_end;
-
-pop:
-    if (sp.pc == sp.pc_start) {
-        // TODO: stealing form another thread
-        return;
+    for (; ary8_begin < scan_end; ary8_begin += elsize) {
+        for (uint8_t *pindex = elem_begin; pindex < elem_end; pindex++) {
+            jl_value_t **slot = &ary8_begin[*pindex];
+            new_obj = *slot;
+            if (new_obj != NULL) {
+                verify_parent2("array", ary8_parent, &new_obj, "elem(%d)",
+                               gc_slot_to_arrayidx(ary8_parent, ary8_begin));
+                gc_try_claim_and_push(mq, new_obj, &nptr);
+                gc_heap_snapshot_record_array_edge(ary8_parent, slot);
+            }
+        }
    }
-    sp.pc--;
-    gc_mark_jmp(*sp.pc); // computed goto
-
-marked_obj: {
-        // An object that has been marked and needs have metadata updated and scanned.
-        gc_mark_marked_obj_t *obj = gc_pop_markdata(&sp, gc_mark_marked_obj_t);
-        new_obj = obj->obj;
-        tag = obj->tag;
-        bits = obj->bits;
-        goto mark;
-    }
-
-scan_only: {
-        // An object that has been marked and needs to be scanned. 
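+    // If the chunk was not pushed eagerly above (i.e. an old owner whose
+    // young-reference bit could still change while scanning), push it now
+    // that the scanned prefix has settled the low bits of `nptr`.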
- gc_mark_marked_obj_t *obj = gc_pop_markdata(&sp, gc_mark_marked_obj_t); - new_obj = obj->obj; - tag = obj->tag; - bits = obj->bits; - meta_updated = 1; - goto mark; - } - -objarray: - objary = gc_pop_markdata(&sp, gc_mark_objarray_t); - objary_begin = objary->begin; - objary_end = objary->end; -objarray_loaded: - if (gc_mark_scan_objarray(ptls, &sp, objary, objary_begin, objary_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -array8: - ary8 = gc_pop_markdata(&sp, gc_mark_array8_t); - objary_begin = ary8->begin; - objary_end = ary8->end; - obj8_begin = ary8->elem.begin; - obj8_end = ary8->elem.end; -array8_loaded: - if (gc_mark_scan_array8(ptls, &sp, ary8, objary_begin, objary_end, obj8_begin, obj8_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -array16: - ary16 = gc_pop_markdata(&sp, gc_mark_array16_t); - objary_begin = ary16->begin; - objary_end = ary16->end; - obj16_begin = ary16->elem.begin; - obj16_end = ary16->elem.end; -array16_loaded: - if (gc_mark_scan_array16(ptls, &sp, ary16, objary_begin, objary_end, obj16_begin, obj16_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -obj8: - obj8 = gc_pop_markdata(&sp, gc_mark_obj8_t); - obj8_parent = (char*)obj8->parent; - obj8_begin = obj8->begin; - obj8_end = obj8->end; -obj8_loaded: - if (gc_mark_scan_obj8(ptls, &sp, obj8, obj8_parent, obj8_begin, obj8_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -obj16: - obj16 = gc_pop_markdata(&sp, gc_mark_obj16_t); - obj16_parent = (char*)obj16->parent; - obj16_begin = obj16->begin; - obj16_end = obj16->end; -obj16_loaded: - if (gc_mark_scan_obj16(ptls, &sp, obj16, obj16_parent, obj16_begin, obj16_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -obj32: { - gc_mark_obj32_t *obj32 = gc_pop_markdata(&sp, gc_mark_obj32_t); - char *parent = (char*)obj32->parent; - uint32_t *begin = obj32->begin; - uint32_t *end = obj32->end; - if (gc_mark_scan_obj32(ptls, &sp, obj32, parent, begin, end, &new_obj, &tag, &bits)) - goto mark; - goto pop; + if (too_big) { + if (!pushed_chunk) { + jl_gc_chunk_t c = {GC_ary8_chunk, ary8_parent, scan_end, ary8_end, elem_begin, elem_end, 0, nptr}; + gc_chunkqueue_push(mq, &c); + } } - -stack: { - // Scan the stack. see `gc_mark_stackframe_t` - // The task object this stack belongs to is being scanned separately as a normal - // 8bit field descriptor object. 
-        gc_mark_stackframe_t *stack = gc_pop_markdata(&sp, gc_mark_stackframe_t);
-        jl_gcframe_t *s = stack->s;
-        uint32_t i = stack->i;
-        uint32_t nroots = stack->nroots;
-        uintptr_t offset = stack->offset;
-        uintptr_t lb = stack->lb;
-        uintptr_t ub = stack->ub;
-        uint32_t nr = nroots >> 2;
-        uintptr_t nptr = 0;
-        while (1) {
-            jl_value_t ***rts = (jl_value_t***)(((void**)s) + 2);
-            for (; i < nr; i++) {
-                if (nroots & 1) {
-                    void **slot = (void**)gc_read_stack(&rts[i], offset, lb, ub);
-                    new_obj = (jl_value_t*)gc_read_stack(slot, offset, lb, ub);
-                }
-                else {
-                    new_obj = (jl_value_t*)gc_read_stack(&rts[i], offset, lb, ub);
-                    if (gc_ptr_tag(new_obj, 1)) {
-                        // handle tagged pointers in finalizer list
-                        new_obj = gc_ptr_clear_tag(new_obj, 1);
-                        // skip over the finalizer fptr
-                        i++;
+    else {
+        gc_mark_push_remset(ptls, ary8_parent, nptr);
+    }
+}
+
+// Mark array with 16bit field descriptors
+STATIC_INLINE void gc_mark_array16(jl_ptls_t ptls, jl_value_t *ary16_parent, jl_value_t **ary16_begin,
+                                   jl_value_t **ary16_end, uint16_t *elem_begin, uint16_t *elem_end,
+                                   uintptr_t nptr) JL_NOTSAFEPOINT
+{
+    jl_gc_markqueue_t *mq = &ptls->mark_queue;
+    jl_value_t *new_obj;
+    size_t elsize = ((jl_array_t *)ary16_parent)->elsize / sizeof(jl_value_t *);
+    assert(elsize > 0);
+    // Decide whether we need to chunk the array
+    if ((nptr & 0x2) == 0x2) {
+        // pre-scan this object: most of this object should be old, so look for
+        // the first young object before starting this chunk
+        // (this also would be valid for young objects, but probably less beneficial)
+        for (; ary16_begin < ary16_end; ary16_begin += elsize) {
+            int early_end = 0;
+            for (uint16_t *pindex = elem_begin; pindex < elem_end; pindex++) {
+                jl_value_t **slot = &ary16_begin[*pindex];
+                new_obj = *slot;
+                if (new_obj != NULL) {
+                    verify_parent2("array", ary16_parent, &new_obj, "elem(%d)",
+                                   gc_slot_to_arrayidx(ary16_parent, ary16_begin));
+                    jl_taggedvalue_t *o = jl_astaggedvalue(new_obj);
+                    if (!gc_old(o->header))
+                        nptr |= 1;
+                    if (!gc_marked(o->header)) {
+                        early_end = 1;
+                        break;
                    }
-                    if (gc_ptr_tag(new_obj, 2))
-                        continue;
-                }
-                if (!gc_try_setmark(new_obj, &nptr, &tag, &bits))
-                    continue;
-                gc_heap_snapshot_record_frame_to_object_edge(s, new_obj);
-                i++;
-                if (i < nr) {
-                    // Haven't done with this one yet. 
Update the content and push it back - stack->i = i; - gc_repush_markdata(&sp, gc_mark_stackframe_t); - } - // TODO stack addresses needs copy stack handling - else if ((s = (jl_gcframe_t*)gc_read_stack(&s->prev, offset, lb, ub))) { - gc_heap_snapshot_record_frame_to_frame_edge(stack->s, s); - stack->s = s; - stack->i = 0; - uintptr_t new_nroots = gc_read_stack(&s->nroots, offset, lb, ub); - assert(new_nroots <= UINT32_MAX); - stack->nroots = (uint32_t)new_nroots; - gc_repush_markdata(&sp, gc_mark_stackframe_t); + gc_heap_snapshot_record_array_edge(ary16_parent, slot); } - goto mark; } - s = (jl_gcframe_t*)gc_read_stack(&s->prev, offset, lb, ub); - // walk up one stack frame - if (s != 0) { - gc_heap_snapshot_record_frame_to_frame_edge(stack->s, s); - stack->s = s; - i = 0; - uintptr_t new_nroots = gc_read_stack(&s->nroots, offset, lb, ub); - assert(new_nroots <= UINT32_MAX); - nroots = stack->nroots = (uint32_t)new_nroots; - nr = nroots >> 2; - continue; + if (early_end) + break; + } + } + size_t too_big = (ary16_end - ary16_begin) / GC_CHUNK_BATCH_SIZE > elsize; // use this order of operations to avoid idiv + jl_value_t **scan_end = ary16_end; + int pushed_chunk = 0; + if (too_big) { + scan_end = ary16_begin + elsize * GC_CHUNK_BATCH_SIZE; + // case 1: array owner is young, so we won't need to scan through all its elements + // to know that we will never need to push it to the remset. it's fine + // to create a chunk with "incorrect" `nptr` and push it to the chunk-queue + // ASAP in order to expose as much parallelism as possible + // case 2: lowest two bits of `nptr` are already set to 0x3, so won't change after + // scanning the array elements + if ((nptr & 0x2) != 0x2 || (nptr & 0x3) == 0x3) { + jl_gc_chunk_t c = {GC_ary16_chunk, ary16_parent, scan_end, ary16_end, elem_begin, elem_end, elsize, nptr}; + gc_chunkqueue_push(mq, &c); + pushed_chunk = 1; + } + } + for (; ary16_begin < scan_end; ary16_begin += elsize) { + for (uint16_t *pindex = elem_begin; pindex < elem_end; pindex++) { + jl_value_t **slot = &ary16_begin[*pindex]; + new_obj = *slot; + if (new_obj != NULL) { + verify_parent2("array", ary16_parent, &new_obj, "elem(%d)", + gc_slot_to_arrayidx(ary16_parent, ary16_begin)); + gc_try_claim_and_push(mq, new_obj, &nptr); + gc_heap_snapshot_record_array_edge(ary16_parent, slot); } - goto pop; } } + if (too_big) { + if (!pushed_chunk) { + jl_gc_chunk_t c = {GC_ary16_chunk, ary16_parent, scan_end, ary16_end, elem_begin, elem_end, elsize, nptr}; + gc_chunkqueue_push(mq, &c); + } + } + else { + gc_mark_push_remset(ptls, ary16_parent, nptr); + } +} -excstack: { - // Scan an exception stack - gc_mark_excstack_t *stackitr = gc_pop_markdata(&sp, gc_mark_excstack_t); - jl_excstack_t *excstack = stackitr->s; - size_t itr = stackitr->itr; - size_t bt_index = stackitr->bt_index; - size_t jlval_index = stackitr->jlval_index; - while (itr > 0) { - size_t bt_size = jl_excstack_bt_size(excstack, itr); - jl_bt_element_t *bt_data = jl_excstack_bt_data(excstack, itr); - for (; bt_index < bt_size; bt_index += jl_bt_entry_size(bt_data + bt_index)) { - jl_bt_element_t *bt_entry = bt_data + bt_index; - if (jl_bt_is_native(bt_entry)) - continue; - // Found an extended backtrace entry: iterate over any - // GC-managed values inside. 
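+// Sketch of `jl_gc_chunk_t` as used by the chunk constructors above (see gc.h
+// for the authoritative definition): {cid, parent, begin, end, elem_begin,
+// elem_end, step, nptr}. A chunk is a suspended traversal of a large object;
+// `gc_mark_chunk` below simply re-enters the matching `gc_mark_*` routine
+// with this saved state.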
- size_t njlvals = jl_bt_num_jlvals(bt_entry); - while (jlval_index < njlvals) { - new_obj = jl_bt_entry_jlvalue(bt_entry, jlval_index); - gc_heap_snapshot_record_frame_to_object_edge(bt_entry, new_obj); - uintptr_t nptr = 0; - jlval_index += 1; - if (gc_try_setmark(new_obj, &nptr, &tag, &bits)) { - stackitr->itr = itr; - stackitr->bt_index = bt_index; - stackitr->jlval_index = jlval_index; - gc_repush_markdata(&sp, gc_mark_excstack_t); - goto mark; - } - } - jlval_index = 0; - } - // The exception comes last - mark it - new_obj = jl_excstack_exception(excstack, itr); - gc_heap_snapshot_record_frame_to_object_edge(excstack, new_obj); - itr = jl_excstack_next(excstack, itr); - bt_index = 0; - jlval_index = 0; - uintptr_t nptr = 0; - if (gc_try_setmark(new_obj, &nptr, &tag, &bits)) { - stackitr->itr = itr; - stackitr->bt_index = bt_index; - stackitr->jlval_index = jlval_index; - gc_repush_markdata(&sp, gc_mark_excstack_t); - goto mark; - } +// Mark chunk of large array +STATIC_INLINE void gc_mark_chunk(jl_ptls_t ptls, jl_gc_markqueue_t *mq, jl_gc_chunk_t *c) JL_NOTSAFEPOINT +{ + switch (c->cid) { + case GC_objary_chunk: { + jl_value_t *obj_parent = c->parent; + jl_value_t **obj_begin = c->begin; + jl_value_t **obj_end = c->end; + uint32_t step = c->step; + uintptr_t nptr = c->nptr; + gc_mark_objarray(ptls, obj_parent, obj_begin, obj_end, step, + nptr); + break; + } + case GC_ary8_chunk: { + jl_value_t *ary8_parent = c->parent; + jl_value_t **ary8_begin = c->begin; + jl_value_t **ary8_end = c->end; + uint8_t *elem_begin = (uint8_t *)c->elem_begin; + uint8_t *elem_end = (uint8_t *)c->elem_end; + uintptr_t nptr = c->nptr; + gc_mark_array8(ptls, ary8_parent, ary8_begin, ary8_end, elem_begin, elem_end, + nptr); + break; + } + case GC_ary16_chunk: { + jl_value_t *ary16_parent = c->parent; + jl_value_t **ary16_begin = c->begin; + jl_value_t **ary16_end = c->end; + uint16_t *elem_begin = (uint16_t *)c->elem_begin; + uint16_t *elem_end = (uint16_t *)c->elem_end; + uintptr_t nptr = c->nptr; + gc_mark_array16(ptls, ary16_parent, ary16_begin, ary16_end, elem_begin, elem_end, + nptr); + break; + } + case GC_finlist_chunk: { + jl_value_t **fl_begin = c->begin; + jl_value_t **fl_end = c->end; + gc_mark_finlist_(mq, fl_begin, fl_end); + break; + } + default: { + // `empty-chunk` should be checked by caller + jl_safe_printf("GC internal error: chunk mismatch\n"); + abort(); } - goto pop; } +} -module_binding: { - // Scan a module. 
see `gc_mark_binding_t` - // Other fields of the module will be scanned after the bindings are scanned - gc_mark_binding_t *binding = gc_pop_markdata(&sp, gc_mark_binding_t); - jl_binding_t **begin = binding->begin; - jl_binding_t **end = binding->end; - uint8_t mbits = binding->bits; - for (; begin < end; begin += 2) { - jl_binding_t *b = *begin; - if (b == (jl_binding_t*)HT_NOTFOUND) - continue; - if (jl_object_in_image((jl_value_t*)b)) { - jl_taggedvalue_t *buf = jl_astaggedvalue(b); - uintptr_t tag = buf->header; - uint8_t bits; - if (!gc_marked(tag)) - gc_setmark_tag(buf, GC_OLD_MARKED, tag, &bits); +// Mark gc frame +STATIC_INLINE void gc_mark_stack(jl_ptls_t ptls, jl_gcframe_t *s, uint32_t nroots, uintptr_t offset, + uintptr_t lb, uintptr_t ub) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t *new_obj; + uint32_t nr = nroots >> 2; + while (1) { + jl_value_t ***rts = (jl_value_t ***)(((void **)s) + 2); + for (uint32_t i = 0; i < nr; i++) { + if (nroots & 1) { + void **slot = (void **)gc_read_stack(&rts[i], offset, lb, ub); + new_obj = (jl_value_t *)gc_read_stack(slot, offset, lb, ub); } else { - gc_setmark_buf_(ptls, b, mbits, sizeof(jl_binding_t)); - } - void *vb = jl_astaggedvalue(b); - verify_parent1("module", binding->parent, &vb, "binding_buff"); - // Record the size used for the box for non-const bindings - gc_heap_snapshot_record_module_to_binding(binding->parent, b); - (void)vb; - jl_value_t *ty = jl_atomic_load_relaxed(&b->ty); - if (ty && ty != (jl_value_t*)jl_any_type) { - verify_parent2("module", binding->parent, - &b->ty, "binding(%s)", jl_symbol_name(b->name)); - if (gc_try_setmark(ty, &binding->nptr, &tag, &bits)) { - new_obj = ty; - gc_repush_markdata(&sp, gc_mark_binding_t); - goto mark; - } - } - jl_value_t *value = jl_atomic_load_relaxed(&b->value); - jl_value_t *globalref = jl_atomic_load_relaxed(&b->globalref); - if (value) { - verify_parent2("module", binding->parent, - &b->value, "binding(%s)", jl_symbol_name(b->name)); - if (gc_try_setmark(value, &binding->nptr, &tag, &bits)) { - new_obj = value; - begin += 2; - binding->begin = begin; - gc_repush_markdata(&sp, gc_mark_binding_t); - uintptr_t gr_tag; - uint8_t gr_bits; - if (gc_try_setmark(globalref, &binding->nptr, &gr_tag, &gr_bits)) { - gc_mark_marked_obj_t data = {globalref, gr_tag, gr_bits}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(marked_obj), - &data, sizeof(data), 1); - } - goto mark; + new_obj = (jl_value_t *)gc_read_stack(&rts[i], offset, lb, ub); + if (gc_ptr_tag(new_obj, 1)) { + // handle tagged pointers in finalizer list + new_obj = (jl_value_t *)gc_ptr_clear_tag(new_obj, 1); + // skip over the finalizer fptr + i++; } + if (gc_ptr_tag(new_obj, 2)) + continue; } - if (gc_try_setmark(globalref, &binding->nptr, &tag, &bits)) { - begin += 2; - binding->begin = begin; - gc_repush_markdata(&sp, gc_mark_binding_t); - new_obj = globalref; - goto mark; + if (new_obj != NULL) { + gc_try_claim_and_push(mq, new_obj, NULL); + gc_heap_snapshot_record_frame_to_object_edge(s, new_obj); } } - jl_module_t *m = binding->parent; - int scanparent = gc_try_setmark((jl_value_t*)m->parent, &binding->nptr, &tag, &bits); - size_t nusings = m->usings.len; - if (nusings) { - // this is only necessary because bindings for "using" modules - // are added only when accessed. therefore if a module is replaced - // after "using" it but before accessing it, this array might - // contain the only reference. 
-            objary_begin = (jl_value_t**)m->usings.items;
-            objary_end = objary_begin + nusings;
-            gc_mark_objarray_t data = {(jl_value_t*)m, objary_begin, objary_end, 1, binding->nptr};
-            gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray),
-                               &data, sizeof(data), 0);
-            if (!scanparent) {
-                objary = (gc_mark_objarray_t*)sp.data;
-                goto objarray_loaded;
+        jl_gcframe_t *sprev = (jl_gcframe_t *)gc_read_stack(&s->prev, offset, lb, ub);
+        if (sprev == NULL)
+            break;
+        gc_heap_snapshot_record_frame_to_frame_edge(s, sprev);
+        s = sprev;
+        uintptr_t new_nroots = gc_read_stack(&s->nroots, offset, lb, ub);
+        assert(new_nroots <= UINT32_MAX);
+        nroots = (uint32_t)new_nroots;
+        nr = nroots >> 2;
+    }
+}
+
+// Mark exception stack
+STATIC_INLINE void gc_mark_excstack(jl_ptls_t ptls, jl_excstack_t *excstack, size_t itr) JL_NOTSAFEPOINT
+{
+    jl_gc_markqueue_t *mq = &ptls->mark_queue;
+    jl_value_t *new_obj;
+    while (itr > 0) {
+        size_t bt_size = jl_excstack_bt_size(excstack, itr);
+        jl_bt_element_t *bt_data = jl_excstack_bt_data(excstack, itr);
+        for (size_t bt_index = 0; bt_index < bt_size;
+             bt_index += jl_bt_entry_size(bt_data + bt_index)) {
+            jl_bt_element_t *bt_entry = bt_data + bt_index;
+            if (jl_bt_is_native(bt_entry))
+                continue;
+            // Found an extended backtrace entry: iterate over any
+            // GC-managed values inside.
+            size_t njlvals = jl_bt_num_jlvals(bt_entry);
+            for (size_t jlval_index = 0; jlval_index < njlvals; jlval_index++) {
+                new_obj = jl_bt_entry_jlvalue(bt_entry, jlval_index);
+                gc_try_claim_and_push(mq, new_obj, NULL);
+                gc_heap_snapshot_record_frame_to_object_edge(bt_entry, new_obj);
+            }
+        }
+        // The exception comes last - mark it
+        new_obj = jl_excstack_exception(excstack, itr);
+        itr = jl_excstack_next(excstack, itr);
+        gc_try_claim_and_push(mq, new_obj, NULL);
+        gc_heap_snapshot_record_frame_to_object_edge(excstack, new_obj);
+    }
+}
+
+// Mark module binding
+STATIC_INLINE void gc_mark_module_binding(jl_ptls_t ptls, jl_module_t *mb_parent, jl_binding_t **mb_begin,
+                                          jl_binding_t **mb_end, uintptr_t nptr,
+                                          uint8_t bits) JL_NOTSAFEPOINT
+{
+    jl_gc_markqueue_t *mq = &ptls->mark_queue;
+    for (; mb_begin < mb_end; mb_begin += 2) {
+        jl_binding_t *b = *mb_begin;
+        if (b == (jl_binding_t *)HT_NOTFOUND)
+            continue;
+        if (jl_object_in_image((jl_value_t*)b)) {
+            jl_taggedvalue_t *buf = jl_astaggedvalue(b);
+            gc_try_setmark_tag(buf, GC_OLD_MARKED);
        }
        else {
-            gc_mark_push_remset(ptls, (jl_value_t*)m, binding->nptr);
+            gc_setmark_buf_(ptls, b, bits, sizeof(jl_binding_t));
        }
-        if (scanparent) {
-            new_obj = (jl_value_t*)m->parent;
-            goto mark;
+        void *vb = jl_astaggedvalue(b);
+        verify_parent1("module", mb_parent, &vb, "binding_buff");
+        // Record the size used for the box for non-const bindings
+        gc_heap_snapshot_record_module_to_binding(mb_parent, b);
+        (void)vb;
+        jl_value_t *ty = jl_atomic_load_relaxed(&b->ty);
+        if (ty && ty != (jl_value_t*)jl_any_type) {
+            verify_parent2("module", mb_parent,
+                           &b->ty, "binding(%s)", jl_symbol_name(b->name));
+            gc_try_claim_and_push(mq, ty, &nptr);
        }
+        jl_value_t *value = jl_atomic_load_relaxed(&b->value);
+        if (value) {
+            verify_parent2("module", mb_parent,
+                           &b->value, "binding(%s)", jl_symbol_name(b->name));
+            gc_try_claim_and_push(mq, value, &nptr);
+        }
+        jl_value_t *globalref = jl_atomic_load_relaxed(&b->globalref);
+        gc_try_claim_and_push(mq, globalref, &nptr);
+    }
+    gc_try_claim_and_push(mq, (jl_value_t *)mb_parent->parent, &nptr);
+    size_t nusings = mb_parent->usings.len;
+    if (nusings > 0) {
+        // this is only necessary because bindings for "using" modules
+        // are added only when accessed. therefore if a module is replaced
+        // after "using" it but before accessing it, this array might
+        // contain the only reference.
+        jl_value_t *obj_parent = (jl_value_t *)mb_parent;
+        jl_value_t **objary_begin = (jl_value_t **)mb_parent->usings.items;
+        jl_value_t **objary_end = objary_begin + nusings;
+        gc_mark_objarray(ptls, obj_parent, objary_begin, objary_end, 1, nptr);
    }
+    else {
+        gc_mark_push_remset(ptls, (jl_value_t *)mb_parent, nptr);
+    }
+}

-finlist: {
-        // Scan a finalizer (or format compatible) list. see `gc_mark_finlist_t`
-        gc_mark_finlist_t *finlist = gc_pop_markdata(&sp, gc_mark_finlist_t);
-        jl_value_t **begin = finlist->begin;
-        jl_value_t **end = finlist->end;
-        for (; begin < end; begin++) {
-            new_obj = *begin;
-            if (__unlikely(!new_obj))
-                continue;
-            if (gc_ptr_tag(new_obj, 1)) {
-                new_obj = (jl_value_t*)gc_ptr_clear_tag(new_obj, 1);
-                begin++;
-                assert(begin < end);
-            }
-            if (gc_ptr_tag(new_obj, 2))
-                continue;
-            uintptr_t nptr = 0;
-            if (!gc_try_setmark(new_obj, &nptr, &tag, &bits))
-                continue;
-            begin++;
-            // Found an object to mark
-            if (begin < end) {
-                // Haven't done with this one yet. Update the content and push it back
-                finlist->begin = begin;
-                gc_repush_markdata(&sp, gc_mark_finlist_t);
-            }
-            goto mark;
+void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t **fl_end)
+{
+    jl_value_t *new_obj;
+    // Decide whether we need to chunk the finalizer list
+    size_t nrefs = (fl_end - fl_begin);
+    if (nrefs > GC_CHUNK_BATCH_SIZE) {
+        jl_gc_chunk_t c = {GC_finlist_chunk, NULL, fl_begin + GC_CHUNK_BATCH_SIZE, fl_end, 0, 0, 0, 0};
+        gc_chunkqueue_push(mq, &c);
+        fl_end = fl_begin + GC_CHUNK_BATCH_SIZE;
+    }
+    for (; fl_begin < fl_end; fl_begin++) {
+        new_obj = *fl_begin;
+        if (__unlikely(!new_obj))
+            continue;
+        if (gc_ptr_tag(new_obj, 1)) {
+            new_obj = (jl_value_t *)gc_ptr_clear_tag(new_obj, 1);
+            fl_begin++;
+            assert(fl_begin < fl_end);
        }
+        if (gc_ptr_tag(new_obj, 2))
+            continue;
+        gc_try_claim_and_push(mq, new_obj, NULL);
    }
+}

-mark: {
-        // Generic scanning entry point.
-        // Expects `new_obj`, `tag` and `bits` to be set correctly.
-#ifdef JL_DEBUG_BUILD
+// Mark finalizer list (or list of objects following same format)
+void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start)
+{
+    size_t len = list->len;
+    if (len <= start)
+        return;
+    jl_value_t **fl_begin = (jl_value_t **)list->items + start;
+    jl_value_t **fl_end = (jl_value_t **)list->items + len;
+    gc_mark_finlist_(mq, fl_begin, fl_end);
+}
+
+JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj)
+{
+    int may_claim = gc_try_setmark_tag(jl_astaggedvalue(obj), GC_MARKED);
+    if (may_claim)
+        gc_ptr_queue_push(&ptls->mark_queue, obj);
+    return may_claim;
+}
+
+JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent,
+                                            jl_value_t **objs, size_t nobjs)
+{
+    uintptr_t nptr = (nobjs << 2) | (jl_astaggedvalue(parent)->bits.gc & 2);
+    gc_mark_objarray(ptls, parent, objs, objs + nobjs, 1, nptr);
+}
+
+// Enqueue and mark all outgoing references from `new_obj` which have not been marked
+// yet. 
`meta_updated` is mostly used to make sure we don't update metadata twice for +// objects which have been enqueued into the `remset` +FORCE_INLINE void gc_mark_outrefs(jl_ptls_t ptls, jl_gc_markqueue_t *mq, void *_new_obj, + int meta_updated) +{ + jl_value_t *new_obj = (jl_value_t *)_new_obj; + mark_obj: { + #ifdef JL_DEBUG_BUILD if (new_obj == gc_findval) jl_raise_debugger(); -#endif + #endif jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); - jl_datatype_t *vt = (jl_datatype_t*)tag; - int foreign_alloc = 0; + jl_datatype_t *vt = (jl_datatype_t *)(o->header & ~(uintptr_t)0xf); + uint8_t bits = (gc_old(o->header) && !mark_reset_age) ? GC_OLD_MARKED : GC_MARKED; int update_meta = __likely(!meta_updated && !gc_verifying); - if (update_meta && o->bits.in_image) { + int foreign_alloc = 0; + if (update_meta && jl_object_in_image(new_obj)) { foreign_alloc = 1; update_meta = 0; } - meta_updated = 0; // Symbols are always marked assert(vt != jl_symbol_type); if (vt == jl_simplevector_type) { size_t l = jl_svec_len(new_obj); jl_value_t **data = jl_svec_data(new_obj); - size_t dtsz = l * sizeof(void*) + sizeof(jl_svec_t); + size_t dtsz = l * sizeof(void *) + sizeof(jl_svec_t); if (update_meta) gc_setmark(ptls, o, bits, dtsz); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, dtsz); + jl_value_t *objary_parent = new_obj; + jl_value_t **objary_begin = data; + jl_value_t **objary_end = data + l; + uint32_t step = 1; uintptr_t nptr = (l << 2) | (bits & GC_OLD); - objary_begin = data; - objary_end = data + l; - gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, 1, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &markdata, sizeof(markdata), 0); - objary = (gc_mark_objarray_t*)sp.data; - goto objarray_loaded; + gc_mark_objarray(ptls, objary_parent, objary_begin, objary_end, step, nptr); } else if (vt->name == jl_array_typename) { - jl_array_t *a = (jl_array_t*)new_obj; + jl_array_t *a = (jl_array_t *)new_obj; jl_array_flags_t flags = a->flags; if (update_meta) { if (flags.pooled) @@ -2832,9 +2614,10 @@ mark: { else gc_setmark_big(ptls, o, bits); } - else if (foreign_alloc) + else if (foreign_alloc) { objprofile_count(vt, bits == GC_OLD_MARKED, sizeof(jl_array_t)); - if (flags.how ==0){ + } + if (flags.how == 0) { void *data_ptr = (char*)a + sizeof(jl_array_t) +jl_array_ndimwords(a->flags.ndims) * sizeof(size_t); gc_heap_snapshot_record_hidden_edge(new_obj, data_ptr, jl_array_nbytes(a), 2); } @@ -2862,102 +2645,81 @@ mark: { else if (flags.how == 3) { jl_value_t *owner = jl_array_data_owner(a); uintptr_t nptr = (1 << 2) | (bits & GC_OLD); + gc_try_claim_and_push(mq, owner, &nptr); gc_heap_snapshot_record_internal_array_edge(new_obj, owner); - int markowner = gc_try_setmark(owner, &nptr, &tag, &bits); gc_mark_push_remset(ptls, new_obj, nptr); - if (markowner) { - new_obj = owner; - goto mark; - } - goto pop; + return; } - if (a->data == NULL || jl_array_len(a) == 0) - goto pop; + if (!a->data || jl_array_len(a) == 0) + return; if (flags.ptrarray) { - if ((jl_datatype_t*)jl_tparam0(vt) == jl_symbol_type) - goto pop; + if ((jl_datatype_t *)jl_tparam0(vt) == jl_symbol_type) + return; size_t l = jl_array_len(a); + jl_value_t *objary_parent = new_obj; + jl_value_t **objary_begin = (jl_value_t **)a->data; + jl_value_t **objary_end = objary_begin + l; + uint32_t step = 1; uintptr_t nptr = (l << 2) | (bits & GC_OLD); - objary_begin = (jl_value_t**)a->data; - objary_end = objary_begin + l; - gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, 
1, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &markdata, sizeof(markdata), 0); - objary = (gc_mark_objarray_t*)sp.data; - goto objarray_loaded; + gc_mark_objarray(ptls, objary_parent, objary_begin, objary_end, step, nptr); } else if (flags.hasptr) { - jl_datatype_t *et = (jl_datatype_t*)jl_tparam0(vt); + jl_datatype_t *et = (jl_datatype_t *)jl_tparam0(vt); const jl_datatype_layout_t *layout = et->layout; unsigned npointers = layout->npointers; - unsigned elsize = a->elsize / sizeof(jl_value_t*); + unsigned elsize = a->elsize / sizeof(jl_value_t *); size_t l = jl_array_len(a); + jl_value_t *objary_parent = new_obj; + jl_value_t **objary_begin = (jl_value_t **)a->data; + jl_value_t **objary_end = objary_begin + l * elsize; + uint32_t step = elsize; uintptr_t nptr = ((l * npointers) << 2) | (bits & GC_OLD); - objary_begin = (jl_value_t**)a->data; - objary_end = objary_begin + l * elsize; if (npointers == 1) { // TODO: detect anytime time stride is uniform? objary_begin += layout->first_ptr; - gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, elsize, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &markdata, sizeof(markdata), 0); - objary = (gc_mark_objarray_t*)sp.data; - goto objarray_loaded; + gc_mark_objarray(ptls, objary_parent, objary_begin, objary_end, step, nptr); } else if (layout->fielddesc_type == 0) { - obj8_begin = (uint8_t*)jl_dt_layout_ptrs(layout); - obj8_end = obj8_begin + npointers; - gc_mark_array8_t markdata = {objary_begin, objary_end, obj8_begin, {new_obj, obj8_begin, obj8_end, nptr}}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(array8), - &markdata, sizeof(markdata), 0); - ary8 = (gc_mark_array8_t*)sp.data; - goto array8_loaded; + uint8_t *obj8_begin = (uint8_t *)jl_dt_layout_ptrs(layout); + uint8_t *obj8_end = obj8_begin + npointers; + gc_mark_array8(ptls, objary_parent, objary_begin, objary_end, obj8_begin, + obj8_end, nptr); } else if (layout->fielddesc_type == 1) { - obj16_begin = (uint16_t*)jl_dt_layout_ptrs(layout); - obj16_end = obj16_begin + npointers; - gc_mark_array16_t markdata = {objary_begin, objary_end, obj16_begin, {new_obj, obj16_begin, obj16_end, nptr}}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(array16), - &markdata, sizeof(markdata), 0); - ary16 = (gc_mark_array16_t*)sp.data; - goto array16_loaded; + uint16_t *obj16_begin = (uint16_t *)jl_dt_layout_ptrs(layout); + uint16_t *obj16_end = obj16_begin + npointers; + gc_mark_array16(ptls, objary_parent, objary_begin, objary_end, obj16_begin, + obj16_end, nptr); } else { assert(0 && "unimplemented"); } } - goto pop; } else if (vt == jl_module_type) { if (update_meta) gc_setmark(ptls, o, bits, sizeof(jl_module_t)); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, sizeof(jl_module_t)); - jl_module_t *m = (jl_module_t*)new_obj; - jl_binding_t **table = (jl_binding_t**)m->bindings.table; - size_t bsize = m->bindings.size; - uintptr_t nptr = ((bsize + m->usings.len + 1) << 2) | (bits & GC_OLD); - gc_mark_binding_t markdata = {m, table + 1, table + bsize, nptr, bits}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(module_binding), - &markdata, sizeof(markdata), 0); - sp.data = (jl_gc_mark_data_t *)(((char*)sp.data) + sizeof(markdata)); - goto module_binding; + jl_module_t *mb_parent = (jl_module_t *)new_obj; + jl_binding_t **mb_begin = (jl_binding_t **)mb_parent->bindings.table + 1; + size_t bsize = mb_parent->bindings.size; + jl_binding_t **mb_end = (jl_binding_t 
**)mb_parent->bindings.table + bsize; + uintptr_t nptr = ((bsize + mb_parent->usings.len + 1) << 2) | (bits & GC_OLD); + gc_mark_module_binding(ptls, mb_parent, mb_begin, mb_end, nptr, bits); } else if (vt == jl_task_type) { if (update_meta) gc_setmark(ptls, o, bits, sizeof(jl_task_t)); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, sizeof(jl_task_t)); - jl_task_t *ta = (jl_task_t*)new_obj; + jl_task_t *ta = (jl_task_t *)new_obj; gc_scrub_record_task(ta); if (gc_cblist_task_scanner) { - export_gc_state(ptls, &sp); int16_t tid = jl_atomic_load_relaxed(&ta->tid); - gc_invoke_callbacks(jl_gc_cb_task_scanner_t, - gc_cblist_task_scanner, - (ta, tid != -1 && ta == gc_all_tls_states[tid]->root_task)); - import_gc_state(ptls, &sp); + gc_invoke_callbacks(jl_gc_cb_task_scanner_t, gc_cblist_task_scanner, + (ta, tid != -1 && ta == gc_all_tls_states[tid]->root_task)); } -#ifdef COPY_STACKS + #ifdef COPY_STACKS void *stkbuf = ta->stkbuf; if (stkbuf && ta->copy_stack) { gc_setmark_buf_(ptls, stkbuf, bits, ta->bufsz); @@ -2966,14 +2728,14 @@ mark: { // TODO: edge to stack data // TODO: synthetic node for stack data (how big is it?) } -#endif + #endif jl_gcframe_t *s = ta->gcstack; size_t nroots; uintptr_t offset = 0; uintptr_t lb = 0; uintptr_t ub = (uintptr_t)-1; -#ifdef COPY_STACKS - if (stkbuf && ta->copy_stack && ta->ptls == NULL) { + #ifdef COPY_STACKS + if (stkbuf && ta->copy_stack && !ta->ptls) { int16_t tid = jl_atomic_load_relaxed(&ta->tid); assert(tid >= 0); jl_ptls_t ptls2 = gc_all_tls_states[tid]; @@ -2981,38 +2743,38 @@ mark: { lb = ub - ta->copy_stack; offset = (uintptr_t)stkbuf - lb; } -#endif - if (s) { + #endif + if (s != NULL) { nroots = gc_read_stack(&s->nroots, offset, lb, ub); gc_heap_snapshot_record_task_to_frame_edge(ta, s); - assert(nroots <= UINT32_MAX); - gc_mark_stackframe_t stackdata = {s, 0, (uint32_t)nroots, offset, lb, ub}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(stack), - &stackdata, sizeof(stackdata), 1); + gc_mark_stack(ptls, s, (uint32_t)nroots, offset, lb, ub); } if (ta->excstack) { - gc_heap_snapshot_record_task_to_frame_edge(ta, ta->excstack); - gc_setmark_buf_(ptls, ta->excstack, bits, sizeof(jl_excstack_t) + - sizeof(uintptr_t)*ta->excstack->reserved_size); - gc_mark_excstack_t stackdata = {ta->excstack, ta->excstack->top, 0, 0}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(excstack), - &stackdata, sizeof(stackdata), 1); + jl_excstack_t *excstack = ta->excstack; + gc_heap_snapshot_record_task_to_frame_edge(ta, excstack); + size_t itr = ta->excstack->top; + gc_setmark_buf_(ptls, excstack, bits, + sizeof(jl_excstack_t) + + sizeof(uintptr_t) * excstack->reserved_size); + gc_mark_excstack(ptls, excstack, itr); } const jl_datatype_layout_t *layout = jl_task_type->layout; assert(layout->fielddesc_type == 0); assert(layout->nfields > 0); uint32_t npointers = layout->npointers; - obj8_begin = (uint8_t*)jl_dt_layout_ptrs(layout); - obj8_end = obj8_begin + npointers; + char *obj8_parent = (char *)ta; + uint8_t *obj8_begin = (uint8_t *)jl_dt_layout_ptrs(layout); + uint8_t *obj8_end = obj8_begin + npointers; // assume tasks always reference young objects: set lowest bit uintptr_t nptr = (npointers << 2) | 1 | bits; - gc_mark_obj8_t markdata = {new_obj, obj8_begin, obj8_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj8), - &markdata, sizeof(markdata), 0); - obj8 = (gc_mark_obj8_t*)sp.data; - obj8_parent = (char*)ta; - goto obj8_loaded; + new_obj = gc_mark_obj8(ptls, obj8_parent, obj8_begin, obj8_end, 
nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_ptr_queue_push(mq, new_obj); + } } else if (vt == jl_string_type) { size_t dtsz = jl_string_len(new_obj) + sizeof(size_t) + 1; @@ -3020,140 +2782,457 @@ mark: { gc_setmark(ptls, o, bits, dtsz); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, dtsz); - goto pop; } else { if (__unlikely(!jl_is_datatype(vt))) - gc_assert_datatype_fail(ptls, vt, sp); + gc_assert_datatype_fail(ptls, vt, mq); size_t dtsz = jl_datatype_size(vt); if (update_meta) gc_setmark(ptls, o, bits, dtsz); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, dtsz); if (vt == jl_weakref_type) - goto pop; + return; const jl_datatype_layout_t *layout = vt->layout; uint32_t npointers = layout->npointers; if (npointers == 0) - goto pop; - uintptr_t nptr = npointers << 2 | (bits & GC_OLD); - assert((layout->nfields > 0 || layout->fielddesc_type == 3) && "opaque types should have been handled specially"); + return; + uintptr_t nptr = (npointers << 2 | (bits & GC_OLD)); + assert((layout->nfields > 0 || layout->fielddesc_type == 3) && + "opaque types should have been handled specially"); if (layout->fielddesc_type == 0) { - obj8_parent = (char*)new_obj; - obj8_begin = (uint8_t*)jl_dt_layout_ptrs(layout); - obj8_end = obj8_begin + npointers; + char *obj8_parent = (char *)new_obj; + uint8_t *obj8_begin = (uint8_t *)jl_dt_layout_ptrs(layout); + uint8_t *obj8_end = obj8_begin + npointers; assert(obj8_begin < obj8_end); - gc_mark_obj8_t markdata = {new_obj, obj8_begin, obj8_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj8), - &markdata, sizeof(markdata), 0); - obj8 = (gc_mark_obj8_t*)sp.data; - goto obj8_loaded; + new_obj = gc_mark_obj8(ptls, obj8_parent, obj8_begin, obj8_end, nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_ptr_queue_push(mq, new_obj); + } } else if (layout->fielddesc_type == 1) { - obj16_parent = (char*)new_obj; - obj16_begin = (uint16_t*)jl_dt_layout_ptrs(layout); - obj16_end = obj16_begin + npointers; + char *obj16_parent = (char *)new_obj; + uint16_t *obj16_begin = (uint16_t *)jl_dt_layout_ptrs(layout); + uint16_t *obj16_end = obj16_begin + npointers; assert(obj16_begin < obj16_end); - gc_mark_obj16_t markdata = {new_obj, obj16_begin, obj16_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj16), - &markdata, sizeof(markdata), 0); - obj16 = (gc_mark_obj16_t*)sp.data; - goto obj16_loaded; + new_obj = gc_mark_obj16(ptls, obj16_parent, obj16_begin, obj16_end, nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_ptr_queue_push(mq, new_obj); + } } else if (layout->fielddesc_type == 2) { // This is very uncommon // Do not do store to load forwarding to save some code size - uint32_t *obj32_begin = (uint32_t*)jl_dt_layout_ptrs(layout); + char *obj32_parent = (char *)new_obj; + uint32_t *obj32_begin = (uint32_t *)jl_dt_layout_ptrs(layout); uint32_t *obj32_end = obj32_begin + npointers; - gc_mark_obj32_t markdata = {new_obj, obj32_begin, obj32_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj32), - &markdata, sizeof(markdata), 0); - sp.data = (jl_gc_mark_data_t *)(((char*)sp.data) + sizeof(markdata)); - goto obj32; + assert(obj32_begin < obj32_end); + new_obj = gc_mark_obj32(ptls, obj32_parent, obj32_begin, obj32_end, nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_ptr_queue_push(mq, new_obj); + } } else { assert(layout->fielddesc_type == 3); - 
jl_fielddescdyn_t *desc = (jl_fielddescdyn_t*)jl_dt_layout_fields(layout); + jl_fielddescdyn_t *desc = (jl_fielddescdyn_t *)jl_dt_layout_fields(layout); int old = jl_astaggedvalue(new_obj)->bits.gc & 2; - export_gc_state(ptls, &sp); uintptr_t young = desc->markfunc(ptls, new_obj); - import_gc_state(ptls, &sp); if (old && young) gc_mark_push_remset(ptls, new_obj, young * 4 + 3); + } + } + } +} + +// Used in gc-debug +void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq) +{ + while (1) { + void *new_obj = (void *)gc_ptr_queue_pop(&ptls->mark_queue); + // No more objects to mark + if (__unlikely(new_obj == NULL)) { + return; + } + gc_mark_outrefs(ptls, mq, new_obj, 0); + } +} + +// Drain items from worker's own chunkqueue +void gc_drain_own_chunkqueue(jl_ptls_t ptls, jl_gc_markqueue_t *mq) +{ + jl_gc_chunk_t c = {.cid = GC_empty_chunk}; + do { + c = gc_chunkqueue_pop(mq); + if (c.cid != GC_empty_chunk) { + gc_mark_chunk(ptls, mq, &c); + gc_mark_loop_serial_(ptls, mq); + } + } while (c.cid != GC_empty_chunk); +} + +// Main mark loop. Stack (allocated on the heap) of `jl_value_t *` +// is used to keep track of processed items. Maintaining this stack (instead of +// the native one) avoids stack overflow when marking deep objects and +// makes it easier to implement parallel marking via work-stealing +JL_EXTENSION NOINLINE void gc_mark_loop_serial(jl_ptls_t ptls) +{ + gc_mark_loop_serial_(ptls, &ptls->mark_queue); + gc_drain_own_chunkqueue(ptls, &ptls->mark_queue); +} + +void gc_mark_and_steal(jl_ptls_t ptls) +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_gc_markqueue_t *mq_master = NULL; + int master_tid = jl_atomic_load(&gc_master_tid); + if (master_tid == -1) { + return; + } + mq_master = &gc_all_tls_states[master_tid]->mark_queue; + void *new_obj; + jl_gc_chunk_t c; + pop : { + new_obj = gc_ptr_queue_pop(mq); + if (new_obj != NULL) { + goto mark; + } + c = gc_chunkqueue_pop(mq); + if (c.cid != GC_empty_chunk) { + gc_mark_chunk(ptls, mq, &c); + goto pop; + } + goto steal; + } + mark : { + gc_mark_outrefs(ptls, mq, new_obj, 0); + goto pop; + } + // Note that for the stealing heuristics, we try to + // steal chunks much more aggressively than pointers, + // since we know chunks will likely expand into a lot + // of work for the mark loop + steal : { + // Try to steal chunk from random GC thread + for (int i = 0; i < 4 * jl_n_gcthreads; i++) { + uint32_t v = gc_first_tid + cong(UINT64_MAX, UINT64_MAX, &ptls->rngseed) % jl_n_gcthreads; + jl_gc_markqueue_t *mq2 = &gc_all_tls_states[v]->mark_queue; + c = gc_chunkqueue_steal_from(mq2); + if (c.cid != GC_empty_chunk) { + gc_mark_chunk(ptls, mq, &c); + goto pop; + } + } + // Sequentially walk GC threads to try to steal chunk + for (int i = gc_first_tid; i < gc_first_tid + jl_n_gcthreads; i++) { + jl_gc_markqueue_t *mq2 = &gc_all_tls_states[i]->mark_queue; + c = gc_chunkqueue_steal_from(mq2); + if (c.cid != GC_empty_chunk) { + gc_mark_chunk(ptls, mq, &c); + goto pop; + } + } + // Try to steal chunk from master thread + if (mq_master != NULL) { + c = gc_chunkqueue_steal_from(mq_master); + if (c.cid != GC_empty_chunk) { + gc_mark_chunk(ptls, mq, &c); + goto pop; + } + } + // Try to steal pointer from random GC thread + for (int i = 0; i < 4 * jl_n_gcthreads; i++) { + uint32_t v = gc_first_tid + cong(UINT64_MAX, UINT64_MAX, &ptls->rngseed) % jl_n_gcthreads; + jl_gc_markqueue_t *mq2 = &gc_all_tls_states[v]->mark_queue; + new_obj = gc_ptr_queue_steal_from(mq2); + if (new_obj != NULL) + goto mark; + } + // Sequentially walk GC threads to try to steal
pointer + for (int i = gc_first_tid; i < gc_first_tid + jl_n_gcthreads; i++) { + jl_gc_markqueue_t *mq2 = &gc_all_tls_states[i]->mark_queue; + new_obj = gc_ptr_queue_steal_from(mq2); + if (new_obj != NULL) + goto mark; + } + // Try to steal pointer from master thread + if (mq_master != NULL) { + new_obj = gc_ptr_queue_steal_from(mq_master); + if (new_obj != NULL) + goto mark; + } + } +} + +size_t gc_count_work_in_queue(jl_ptls_t ptls) JL_NOTSAFEPOINT +{ + // assume each chunk is worth 256 units of work and each pointer + // is worth 1 unit of work + size_t work = 256 * (jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.bottom) - + jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.top)); + work += (jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.bottom) - + jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.top)); + return work; +} + +/** + * Correctness argument for the mark-loop termination protocol. + * + * Safety properties: + * - No work items shall be in any thread's queues when `gc_mark_loop_barrier` observes + * that `gc_n_threads_marking` is zero. + * + * - No work item shall be stolen from the master thread (i.e. mutator thread which started + * GC and which helped the `jl_n_gcthreads` - 1 threads to mark) after + * `gc_mark_loop_barrier` observes that `gc_n_threads_marking` is zero. This property is + * necessary because we call `gc_mark_loop_serial` after marking the finalizer list in + * `_jl_gc_collect`, and want to ensure that we have the serial mark-loop semantics there, + * and that no work is stolen from us at that point. + * + * Proof: + * - Suppose the master thread observes that `gc_n_threads_marking` is zero in + * `gc_mark_loop_barrier` and there is a work item left in one thread's queue at that point. + * Since threads try to steal from all threads' queues, this implies that all threads must + * have tried to steal from the queue which still has a work item left, but failed to do so, + * which violates the semantics of Chase-Lev's work-stealing queue. + * + * - Let E1 be the event "master thread writes -1 to gc_master_tid" and E2 be the event + * "master thread observes that `gc_n_threads_marking` is zero". Since we're using + * sequentially consistent atomics, E1 => E2. Now suppose one thread which is spinning in + * `gc_should_mark` tries to enter the mark-loop after E2. In order to do so, it must + * increment `gc_n_threads_marking` to 1 in an event E3, and then read `gc_master_tid` in an + * event E4. Since we're using sequentially consistent atomics, E3 => E4. Since we observed + * `gc_n_threads_marking` as zero in E2, then E2 => E3, and we conclude E1 => E4, so that + * the thread which is spinning in `gc_should_mark` must observe that `gc_master_tid` is -1 + * and therefore won't enter the mark-loop. 
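+ *
+ * To make the interleaving concrete, the same argument can be laid out as a
+ * timeline (an illustrative restatement of the proof above, nothing more;
+ * program order runs top to bottom within each column):
+ *
+ *   master, in gc_mark_loop_barrier:        worker, in gc_should_mark:
+ *   E1: store gc_master_tid = -1
+ *   E2: load gc_n_threads_marking == 0
+ *                                           E3: fetch_add gc_n_threads_marking, +1
+ *                                           E4: load gc_master_tid == -1
+ *
+ * E1 => E2 and E3 => E4 hold by program order; E2 => E3 holds because an
+ * earlier E3 would have made E2's load observe a nonzero count. Chaining
+ * these gives E1 => E4, so the worker reads -1 in E4 and backs out without
+ * entering the mark-loop.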
+ */ + +int gc_should_mark(jl_ptls_t ptls) +{ + int should_mark = 0; + int n_threads_marking = jl_atomic_load(&gc_n_threads_marking); + // fast path + if (n_threads_marking == 0) { + return 0; + } + uv_mutex_lock(&gc_queue_observer_lock); + while (1) { + int tid = jl_atomic_load(&gc_master_tid); + // fast path + if (tid == -1) { + break; + } + n_threads_marking = jl_atomic_load(&gc_n_threads_marking); + // fast path + if (n_threads_marking == 0) { + break; + } + size_t work = gc_count_work_in_queue(gc_all_tls_states[tid]); + for (tid = gc_first_tid; tid < gc_first_tid + jl_n_gcthreads; tid++) { + work += gc_count_work_in_queue(gc_all_tls_states[tid]); + } + // if there is a lot of work left, enter the mark loop + if (work >= 16 * n_threads_marking) { + jl_atomic_fetch_add(&gc_n_threads_marking, 1); + should_mark = 1; + break; + } + jl_cpu_pause(); + } + uv_mutex_unlock(&gc_queue_observer_lock); + return should_mark; +} + +void gc_wake_all_for_marking(jl_ptls_t ptls) +{ + jl_atomic_store(&gc_master_tid, ptls->tid); + uv_mutex_lock(&gc_threads_lock); + jl_atomic_fetch_add(&gc_n_threads_marking, 1); + uv_cond_broadcast(&gc_threads_cond); + uv_mutex_unlock(&gc_threads_lock); +} + +void gc_mark_loop_parallel(jl_ptls_t ptls, int master) +{ + if (master) { + gc_wake_all_for_marking(ptls); + gc_mark_and_steal(ptls); + jl_atomic_fetch_add(&gc_n_threads_marking, -1); + } + while (1) { + int should_mark = gc_should_mark(ptls); + if (!should_mark) { + break; + } + gc_mark_and_steal(ptls); + jl_atomic_fetch_add(&gc_n_threads_marking, -1); + } +} + +void gc_mark_loop(jl_ptls_t ptls) +{ + if (jl_n_gcthreads == 0 || gc_heap_snapshot_enabled) { + gc_mark_loop_serial(ptls); + } + else { + gc_mark_loop_parallel(ptls, 1); + } +} + +void gc_mark_loop_barrier(void) +{ + jl_atomic_store(&gc_master_tid, -1); + while (jl_atomic_load(&gc_n_threads_marking) != 0) { + jl_cpu_pause(); + } +} + +void gc_mark_clean_reclaim_sets(void) +{ + // Clean up `reclaim-sets` + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + arraylist_t *reclaim_set2 = &ptls2->mark_queue.reclaim_set; + ws_array_t *a = NULL; + while ((a = (ws_array_t *)arraylist_pop(reclaim_set2)) != NULL) { + free(a->buffer); + free(a); + } } } -static void jl_gc_queue_thread_local(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - jl_ptls_t ptls2) +static void gc_premark(jl_ptls_t ptls2) +{ + arraylist_t *remset = ptls2->heap.remset; + ptls2->heap.remset = ptls2->heap.last_remset; + ptls2->heap.last_remset = remset; + ptls2->heap.remset->len = 0; + ptls2->heap.remset_nptr = 0; + // avoid counting remembered objects & bindings twice + // in `perm_scanned_bytes` + size_t len = remset->len; + void **items = remset->items; + for (size_t i = 0; i < len; i++) { + jl_value_t *item = (jl_value_t*)items[i]; + objprofile_count(jl_typeof(item), 2, 0); + jl_astaggedvalue(item)->bits.gc = GC_OLD_MARKED; + } + len = ptls2->heap.rem_bindings.len; + items = ptls2->heap.rem_bindings.items; + for (size_t i = 0; i < len; i++) { + void *ptr = items[i]; + jl_astaggedvalue(ptr)->bits.gc = GC_OLD_MARKED; + } +} + +static void gc_queue_thread_local(jl_gc_markqueue_t *mq, jl_ptls_t ptls2) { jl_task_t *task; task = ptls2->root_task; - if (task) { - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "root task"); } task = jl_atomic_load_relaxed(&ptls2->current_task); - if (task) { - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + 
gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "current task"); } task = ptls2->next_task; - if (task) { - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "next task"); } task = ptls2->previous_task; - if (task) { // shouldn't be necessary, but no reason not to - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "previous task"); } if (ptls2->previous_exception) { - gc_mark_queue_obj(gc_cache, sp, ptls2->previous_exception); + gc_try_claim_and_push(mq, ptls2->previous_exception, NULL); gc_heap_snapshot_record_root((jl_value_t*)ptls2->previous_exception, "previous exception"); } } +static void gc_queue_bt_buf(jl_gc_markqueue_t *mq, jl_ptls_t ptls2) +{ + jl_bt_element_t *bt_data = ptls2->bt_data; + size_t bt_size = ptls2->bt_size; + for (size_t i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) { + jl_bt_element_t *bt_entry = bt_data + i; + if (jl_bt_is_native(bt_entry)) + continue; + size_t njlvals = jl_bt_num_jlvals(bt_entry); + for (size_t j = 0; j < njlvals; j++) + gc_try_claim_and_push(mq, jl_bt_entry_jlvalue(bt_entry, j), NULL); + } +} + +static void gc_queue_remset(jl_ptls_t ptls, jl_ptls_t ptls2) +{ + size_t len = ptls2->heap.last_remset->len; + void **items = ptls2->heap.last_remset->items; + for (size_t i = 0; i < len; i++) { + // Objects in the `remset` are already marked, + // so a `gc_try_claim_and_push` wouldn't work here + gc_mark_outrefs(ptls, &ptls->mark_queue, (jl_value_t *)items[i], 1); + } + int n_bnd_refyoung = 0; + len = ptls2->heap.rem_bindings.len; + items = ptls2->heap.rem_bindings.items; + for (size_t i = 0; i < len; i++) { + jl_binding_t *ptr = (jl_binding_t*)items[i]; + uintptr_t bnd_refyoung = 0; + jl_value_t *v = jl_atomic_load_relaxed(&ptr->value); + gc_try_claim_and_push(&ptls->mark_queue, v, &bnd_refyoung); + jl_value_t *ty = jl_atomic_load_relaxed(&ptr->ty); + gc_try_claim_and_push(&ptls->mark_queue, ty, &bnd_refyoung); + jl_value_t *globalref = jl_atomic_load_relaxed(&ptr->globalref); + gc_try_claim_and_push(&ptls->mark_queue, globalref, &bnd_refyoung); + if (bnd_refyoung) { + items[n_bnd_refyoung] = ptr; + n_bnd_refyoung++; + } + } + ptls2->heap.rem_bindings.len = n_bnd_refyoung; +} + extern jl_value_t *cmpswap_names JL_GLOBALLY_ROOTED; // mark the initial root set -static void mark_roots(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) +static void gc_mark_roots(jl_gc_markqueue_t *mq) { // modules - gc_mark_queue_obj(gc_cache, sp, jl_main_module); + gc_try_claim_and_push(mq, jl_main_module, NULL); gc_heap_snapshot_record_root((jl_value_t*)jl_main_module, "main_module"); - // invisible builtin values - if (jl_an_empty_vec_any != NULL) - gc_mark_queue_obj(gc_cache, sp, jl_an_empty_vec_any); - if (jl_module_init_order != NULL) - gc_mark_queue_obj(gc_cache, sp, jl_module_init_order); + gc_try_claim_and_push(mq, jl_an_empty_vec_any, NULL); + gc_try_claim_and_push(mq, jl_module_init_order, NULL); for (size_t i = 0; i < jl_current_modules.size; i += 2) { if (jl_current_modules.table[i + 1] != HT_NOTFOUND) { - gc_mark_queue_obj(gc_cache, sp, jl_current_modules.table[i]); + gc_try_claim_and_push(mq, jl_current_modules.table[i], NULL); gc_heap_snapshot_record_root((jl_value_t*)jl_current_modules.table[i], "top level module"); } } - gc_mark_queue_obj(gc_cache, sp, jl_anytuple_type_type); + gc_try_claim_and_push(mq, 
jl_anytuple_type_type, NULL); for (size_t i = 0; i < N_CALL_CACHE; i++) { jl_typemap_entry_t *v = jl_atomic_load_relaxed(&call_cache[i]); - if (v != NULL) { - gc_mark_queue_obj(gc_cache, sp, v); - } - } - if (jl_all_methods != NULL) { - gc_mark_queue_obj(gc_cache, sp, jl_all_methods); + gc_try_claim_and_push(mq, v, NULL); } - if (_jl_debug_method_invalidation != NULL) - gc_mark_queue_obj(gc_cache, sp, _jl_debug_method_invalidation); - + gc_try_claim_and_push(mq, jl_all_methods, NULL); + gc_try_claim_and_push(mq, _jl_debug_method_invalidation, NULL); // constants - gc_mark_queue_obj(gc_cache, sp, jl_emptytuple_type); - if (cmpswap_names != NULL) - gc_mark_queue_obj(gc_cache, sp, cmpswap_names); - gc_mark_queue_obj(gc_cache, sp, jl_global_roots_table); + gc_try_claim_and_push(mq, jl_emptytuple_type, NULL); + gc_try_claim_and_push(mq, cmpswap_names, NULL); + gc_try_claim_and_push(mq, jl_global_roots_table, NULL); } // find unmarked objects that need to be finalized from the finalizer list "list". @@ -3283,79 +3362,23 @@ JL_DLLEXPORT int64_t jl_gc_sync_total_bytes(int64_t offset) JL_NOTSAFEPOINT return newtb - oldtb; } -JL_DLLEXPORT int64_t jl_gc_live_bytes(void) -{ - return live_bytes; -} - -static void jl_gc_premark(jl_ptls_t ptls2) -{ - arraylist_t *remset = ptls2->heap.remset; - ptls2->heap.remset = ptls2->heap.last_remset; - ptls2->heap.last_remset = remset; - ptls2->heap.remset->len = 0; - ptls2->heap.remset_nptr = 0; - - // avoid counting remembered objects & bindings twice - // in `perm_scanned_bytes` - size_t len = remset->len; - void **items = remset->items; - for (size_t i = 0; i < len; i++) { - jl_value_t *item = (jl_value_t*)items[i]; - objprofile_count(jl_typeof(item), 2, 0); - jl_astaggedvalue(item)->bits.gc = GC_OLD_MARKED; - } - len = ptls2->heap.rem_bindings.len; - items = ptls2->heap.rem_bindings.items; - for (size_t i = 0; i < len; i++) { - void *ptr = items[i]; - jl_astaggedvalue(ptr)->bits.gc = GC_OLD_MARKED; - } -} - -static void jl_gc_queue_remset(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_ptls_t ptls2) +JL_DLLEXPORT int64_t jl_gc_pool_live_bytes(void) { - size_t len = ptls2->heap.last_remset->len; - void **items = ptls2->heap.last_remset->items; - for (size_t i = 0; i < len; i++) - gc_mark_queue_scan_obj(gc_cache, sp, (jl_value_t*)items[i]); - int n_bnd_refyoung = 0; - len = ptls2->heap.rem_bindings.len; - items = ptls2->heap.rem_bindings.items; - for (size_t i = 0; i < len; i++) { - jl_binding_t *ptr = (jl_binding_t*)items[i]; - // A null pointer can happen here when the binding is cleaned up - // as an exception is thrown after it was already queued (#10221) - int bnd_refyoung = 0; - jl_value_t *v = jl_atomic_load_relaxed(&ptr->value); - if (v != NULL && gc_mark_queue_obj(gc_cache, sp, v)) - bnd_refyoung = 1; - jl_value_t *ty = jl_atomic_load_relaxed(&ptr->ty); - if (ty != NULL && gc_mark_queue_obj(gc_cache, sp, ty)) - bnd_refyoung = 1; - jl_value_t *globalref = jl_atomic_load_relaxed(&ptr->globalref); - if (globalref != NULL && gc_mark_queue_obj(gc_cache, sp, globalref)) - bnd_refyoung = 1; - if (bnd_refyoung) { - items[n_bnd_refyoung] = ptr; - n_bnd_refyoung++; + int n_threads = jl_atomic_load_acquire(&jl_n_threads); + jl_ptls_t *all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + int64_t pool_live_bytes = 0; + for (int i = 0; i < n_threads; i++) { + jl_ptls_t ptls2 = all_tls_states[i]; + if (ptls2 != NULL) { + pool_live_bytes += jl_atomic_load_relaxed(&ptls2->gc_num.pool_live_bytes); } } - ptls2->heap.rem_bindings.len = n_bnd_refyoung; + 
return pool_live_bytes; } -static void jl_gc_queue_bt_buf(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_ptls_t ptls2) +JL_DLLEXPORT int64_t jl_gc_live_bytes(void) { - jl_bt_element_t *bt_data = ptls2->bt_data; - size_t bt_size = ptls2->bt_size; - for (size_t i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) { - jl_bt_element_t *bt_entry = bt_data + i; - if (jl_bt_is_native(bt_entry)) - continue; - size_t njlvals = jl_bt_num_jlvals(bt_entry); - for (size_t j = 0; j < njlvals; j++) - gc_mark_queue_obj(gc_cache, sp, jl_bt_entry_jlvalue(bt_entry, j)); - } + return live_bytes; } size_t jl_maxrss(void); @@ -3365,9 +3388,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) { combine_thread_gc_counts(&gc_num); - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; - jl_gc_mark_sp_t sp; - gc_mark_sp_init(gc_cache, &sp); + jl_gc_markqueue_t *mq = &ptls->mark_queue; uint64_t gc_start_time = jl_hrtime(); int64_t last_perm_scanned_bytes = perm_scanned_bytes; @@ -3379,33 +3400,39 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) - jl_gc_premark(ptls2); + gc_premark(ptls2); } assert(gc_n_threads); + int single_threaded = (jl_n_gcthreads == 0 || gc_heap_snapshot_enabled); for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2 == NULL) - continue; - // 2.1. mark every object in the `last_remsets` and `rem_binding` - jl_gc_queue_remset(gc_cache, &sp, ptls2); - // 2.2. mark every thread local root - jl_gc_queue_thread_local(gc_cache, &sp, ptls2); - // 2.3. mark any managed objects in the backtrace buffer - // TODO: treat these as roots for gc_heap_snapshot_record - jl_gc_queue_bt_buf(gc_cache, &sp, ptls2); + jl_gc_markqueue_t *mq2 = mq; + jl_ptls_t ptls_gc_thread = NULL; + if (!single_threaded) { + ptls_gc_thread = gc_all_tls_states[gc_first_tid + t_i % jl_n_gcthreads]; + mq2 = &ptls_gc_thread->mark_queue; + } + if (ptls2 != NULL) { + // 2.1. mark every thread local root + gc_queue_thread_local(mq2, ptls2); + // 2.2. mark any managed objects in the backtrace buffer + // TODO: treat these as roots for gc_heap_snapshot_record + gc_queue_bt_buf(mq2, ptls2); + // 2.3. mark every object in the `last_remsets` and `rem_binding` + gc_queue_remset(single_threaded ? ptls : ptls_gc_thread, ptls2); + } } // 3. 
walk roots - mark_roots(gc_cache, &sp); + gc_mark_roots(mq); if (gc_cblist_root_scanner) { - export_gc_state(ptls, &sp); gc_invoke_callbacks(jl_gc_cb_root_scanner_t, gc_cblist_root_scanner, (collection)); - import_gc_state(ptls, &sp); } - gc_mark_loop(ptls, sp); - gc_mark_sp_init(gc_cache, &sp); + gc_mark_loop(ptls); + gc_mark_loop_barrier(); + gc_mark_clean_reclaim_sets(); gc_num.since_sweep += gc_num.allocd; JL_PROBE_GC_MARK_END(scanned_bytes, perm_scanned_bytes); gc_settime_premark_end(); @@ -3426,9 +3453,8 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - sweep_finalizer_list(&ptls2->finalizers); + if (ptls2 != NULL) + sweep_finalizer_list(&ptls2->finalizers); } if (prev_sweep_full) { sweep_finalizer_list(&finalizer_list_marked); @@ -3437,15 +3463,13 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); + if (ptls2 != NULL) + gc_mark_finlist(mq, &ptls2->finalizers, 0); } - gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, orig_marked_len); + gc_mark_finlist(mq, &finalizer_list_marked, orig_marked_len); // "Flush" the mark stack before flipping the reset_age bit // so that the objects are not incorrectly reset. - gc_mark_loop(ptls, sp); - gc_mark_sp_init(gc_cache, &sp); + gc_mark_loop_serial(ptls); // Conservative marking relies on age to tell allocated objects // and freelist entries apart. mark_reset_age = !jl_gc_conservative_gc_support_enabled(); @@ -3453,9 +3477,10 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // `to_finalize` list. These objects are only reachable from this list // and should not be referenced by any old objects so this won't break // the GC invariant. - gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - gc_mark_loop(ptls, sp); + gc_mark_finlist(mq, &to_finalize, 0); + gc_mark_loop_serial(ptls); mark_reset_age = 0; + gc_settime_postmark_end(); // Flush everything in mark cache @@ -3480,9 +3505,8 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - nptr += ptls2->heap.remset_nptr; + if (ptls2 != NULL) + nptr += ptls2->heap.remset_nptr; } // many pointers in the intergen frontier => "quick" mark is not quick @@ -3503,7 +3527,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) size_t maxmem = 0; #ifdef _P64 // on a big memory machine, increase max_collect_interval to totalmem / nthreads / 2 - maxmem = total_mem / gc_n_threads / 2; + maxmem = total_mem / (gc_n_threads - jl_n_gcthreads) / 2; #endif if (maxmem < max_collect_interval) maxmem = max_collect_interval; @@ -3535,16 +3559,17 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) last_long_collect_interval = gc_num.interval; } scanned_bytes = 0; - // 5. start sweeping + // 6. 
start sweeping uint64_t start_sweep_time = jl_hrtime(); JL_PROBE_GC_SWEEP_BEGIN(sweep_full); + current_sweep_full = sweep_full; sweep_weak_refs(); sweep_stack_pools(); gc_sweep_foreign_objs(); gc_sweep_other(ptls, sweep_full); gc_scrub(); gc_verify_tags(); - gc_sweep_pool(sweep_full); + gc_sweep_pool(); if (sweep_full) gc_sweep_perm_alloc(); JL_PROBE_GC_SWEEP_END(); @@ -3554,9 +3579,15 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) uint64_t sweep_time = gc_end_time - start_sweep_time; gc_num.total_sweep_time += sweep_time; gc_num.sweep_time = sweep_time; + if (sweep_full) { + gc_num.last_full_sweep = gc_end_time; + } + else { + gc_num.last_incremental_sweep = gc_end_time; + } // sweeping is over - // 6. if it is a quick sweep, put back the remembered objects in queued state + // 7. if it is a quick sweep, put back the remembered objects in queued state // so that we don't trigger the barrier again on them. assert(gc_n_threads); for (int t_i = 0; t_i < gc_n_threads; t_i++) { @@ -3590,7 +3621,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) } #endif - _report_gc_finished(pause, gc_num.freed, sweep_full, recollect); gc_final_pause_end(gc_start_time, gc_end_time); @@ -3608,19 +3638,19 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (collection == JL_GC_AUTO) { //If we aren't freeing enough or are seeing lots and lots of pointers let it increase faster - if(!not_freed_enough || large_frontier) { + if (!not_freed_enough || large_frontier) { int64_t tot = 2 * (live_bytes + gc_num.since_sweep) / 3; if (gc_num.interval > tot) { gc_num.interval = tot; last_long_collect_interval = tot; } + } // If the current interval is larger than half the live data decrease the interval - } else { + else { int64_t half = (live_bytes / 2); if (gc_num.interval > half) gc_num.interval = half; } - // But never go below default if (gc_num.interval < default_collect_interval) gc_num.interval = default_collect_interval; } @@ -3633,6 +3663,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) else { // We can't stay under our goal so let's go back to // the minimum interval and hope things get better + under_memory_pressure = 1; gc_num.interval = default_collect_interval; } } @@ -3704,6 +3735,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) if (duration > gc_num.max_time_to_safepoint) gc_num.max_time_to_safepoint = duration; gc_num.time_to_safepoint = duration; + gc_num.total_time_to_safepoint += duration; gc_invoke_callbacks(jl_gc_cb_pre_gc_t, gc_cblist_pre_gc, (collection)); @@ -3735,22 +3767,27 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) gc_invoke_callbacks(jl_gc_cb_post_gc_t, gc_cblist_post_gc, (collection)); + + if (under_memory_pressure) { + gc_invoke_callbacks(jl_gc_cb_notify_gc_pressure_t, + gc_cblist_notify_gc_pressure, ()); + } + under_memory_pressure = 0; #ifdef _OS_WINDOWS_ SetLastError(last_error); #endif errno = last_errno; } -void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) +void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq) { - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; assert(gc_n_threads); for (size_t i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) - jl_gc_queue_thread_local(gc_cache, sp, ptls2); + if (ptls2 != NULL) + gc_queue_thread_local(mq, ptls2); } - mark_roots(gc_cache, sp); + gc_mark_roots(mq); } // allocator entry points @@ -3770,8 +3807,10 @@ void 
jl_init_thread_heap(jl_ptls_t ptls) p[i].freelist = NULL; p[i].newpages = NULL; } - arraylist_new(&heap->weak_refs, 0); - arraylist_new(&heap->live_tasks, 0); + small_arraylist_new(&heap->weak_refs, 0); + small_arraylist_new(&heap->live_tasks, 0); + for (int i = 0; i < JL_N_STACK_POOLS; i++) + small_arraylist_new(&heap->free_stacks[i], 0); heap->mallocarrays = NULL; heap->mafreelist = NULL; heap->big_objects = NULL; @@ -3787,10 +3826,20 @@ void jl_init_thread_heap(jl_ptls_t ptls) gc_cache->perm_scanned_bytes = 0; gc_cache->scanned_bytes = 0; gc_cache->nbig_obj = 0; - size_t init_size = 1024; - gc_cache->pc_stack = (void**)malloc_s(init_size * sizeof(void*)); - gc_cache->pc_stack_end = gc_cache->pc_stack + init_size; - gc_cache->data_stack = (jl_gc_mark_data_t *)malloc_s(init_size * sizeof(jl_gc_mark_data_t)); + + // Initialize GC mark-queue + jl_gc_markqueue_t *mq = &ptls->mark_queue; + ws_queue_t *cq = &mq->chunk_queue; + ws_array_t *wsa = create_ws_array(GC_CHUNK_QUEUE_INIT_SIZE, sizeof(jl_gc_chunk_t)); + jl_atomic_store_relaxed(&cq->top, 0); + jl_atomic_store_relaxed(&cq->bottom, 0); + jl_atomic_store_relaxed(&cq->array, wsa); + ws_queue_t *q = &mq->ptr_queue; + ws_array_t *wsa2 = create_ws_array(GC_PTR_QUEUE_INIT_SIZE, sizeof(jl_value_t *)); + jl_atomic_store_relaxed(&q->top, 0); + jl_atomic_store_relaxed(&q->bottom, 0); + jl_atomic_store_relaxed(&q->array, wsa2); + arraylist_new(&mq->reclaim_set, 32); memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); @@ -3801,8 +3850,12 @@ void jl_gc_init(void) { JL_MUTEX_INIT(&heapsnapshot_lock); JL_MUTEX_INIT(&finalizers_lock); + uv_mutex_init(&page_profile_lock); uv_mutex_init(&gc_cache_lock); uv_mutex_init(&gc_perm_lock); + uv_mutex_init(&gc_threads_lock); + uv_cond_init(&gc_threads_cond); + uv_mutex_init(&gc_queue_observer_lock); jl_gc_init_page(); jl_gc_debug_init(); @@ -3834,13 +3887,11 @@ void jl_gc_init(void) #endif if (jl_options.heap_size_hint) jl_gc_set_max_memory(jl_options.heap_size_hint); - - jl_gc_mark_sp_t sp = {NULL, NULL, NULL, NULL}; - gc_mark_loop(NULL, sp); t_start = jl_hrtime(); } -void jl_gc_set_max_memory(uint64_t max_mem) { +JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem) +{ if (max_mem > 0 && max_mem < (uint64_t)1 << (sizeof(memsize_t) * 8 - 1)) { max_total_memory = max_mem; @@ -3864,7 +3915,7 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) { jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); jl_atomic_store_relaxed(&ptls->gc_num.allocd, @@ -3879,7 +3930,7 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) { jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); jl_atomic_store_relaxed(&ptls->gc_num.allocd, @@ -3895,7 +3946,7 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; free(p); - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; jl_atomic_store_relaxed(&ptls->gc_num.freed, jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz); @@ -3908,7 +3959,7 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size { jl_gcframe_t **pgcstack = jl_get_pgcstack(); 
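/* The counted allocator entry points that follow (jl_gc_counted_malloc and
   friends) share one pattern: GC bookkeeping happens only when the caller is
   managed code, i.e. a GC frame stack exists (pgcstack != NULL) and the task
   has a nonzero world_age. In that case the allocating variants first give the
   collector a chance to run via maybe_collect() and then charge the size to
   this thread's gc_num counters; the free variant only adjusts the counters.
   Outside managed code they reduce to the plain C allocator. */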
jl_task_t *ct = jl_current_task; - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); if (sz < old) @@ -4047,7 +4098,10 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds SetLastError(last_error); #endif errno = last_errno; - maybe_record_alloc_to_profile((jl_value_t*)b, sz, jl_gc_unknown_type_tag); + // gc_managed_realloc_ is currently used exclusively for resizing array buffers. + if (allocsz > oldsz) { + maybe_record_alloc_to_profile((jl_value_t*)b, allocsz - oldsz, (jl_datatype_t*)jl_buff_tag); + } return b; } @@ -4088,7 +4142,6 @@ jl_value_t *jl_gc_realloc_string(jl_value_t *s, size_t sz) // old pointer. bigval_t *newbig = (bigval_t*)gc_managed_realloc_(ptls, hdr, allocsz, oldsz, 1, s, 0); newbig->sz = allocsz; - newbig->age = 0; gc_big_object_link(newbig, &ptls->heap.big_objects); jl_value_t *snew = jl_valueof(&newbig->header); *(size_t*)snew = sz; @@ -4164,6 +4217,9 @@ void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offs errno = last_errno; if (__unlikely(pool == MAP_FAILED)) return NULL; +#ifdef MADV_NOHUGEPAGE + madvise(pool, GC_PERM_POOL_SIZE, MADV_NOHUGEPAGE); +#endif #endif gc_perm_pool = (uintptr_t)pool; gc_perm_end = gc_perm_pool + GC_PERM_POOL_SIZE; @@ -4259,7 +4315,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) { p = (char *) p - 1; jl_gc_pagemeta_t *meta = page_metadata(p); - if (meta && meta->ages) { + if (meta) { char *page = gc_page_data(p); // offset within page. size_t off = (char *)p - page; @@ -4285,7 +4341,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) jl_gc_pool_t *pool = gc_all_tls_states[meta->thread_n]->heap.norm_pools + meta->pool_n; - if (meta->fl_begin_offset == (uint16_t) -1) { + if (meta->fl_begin_offset == UINT16_MAX) { // case 2: this is a page on the newpages list jl_taggedvalue_t *newpages = pool->newpages; // Check if the page is being allocated from via newpages @@ -4294,7 +4350,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) char *data = gc_page_data(newpages); if (data != meta->data) { // Pages on newpages form a linked list where only the - // first one is allocated from (see reset_page()). + // first one is allocated from (see gc_reset_page()). // All other pages are empty. return NULL; } @@ -4322,7 +4378,6 @@ JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) // entries and 1 for live objects. The above subcases arise // because allocating a cell will not update the age bit, so we // need extra logic for pages that have been allocated from. - unsigned obj_id = (off - off2) / osize; // We now distinguish between the second and third subcase. // Freelist entries are consumed in ascending order. Anything // before the freelist pointer was either live during the last @@ -4330,11 +4385,9 @@ JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) if (gc_page_data(cell) == gc_page_data(pool->freelist) && (char *)cell < (char *)pool->freelist) goto valid_object; - // We know now that the age bit reflects liveness status during - // the last sweep and that the cell has not been reused since. - if (!(meta->ages[obj_id / 8] & (1 << (obj_id % 8)))) { - return NULL; - } + // already skipped marked or old objects above, so here + // the age bits are 0, thus the object is on the freelist + return NULL; // Not a freelist entry, therefore a valid object. 
valid_object: // We have to treat objects with type `jl_buff_tag` differently, diff --git a/src/gc.h b/src/gc.h index 7b02df69abbc1..ff52269b73af9 100644 --- a/src/gc.h +++ b/src/gc.h @@ -31,8 +31,12 @@ extern "C" { #endif +#ifdef GC_SMALL_PAGE +#define GC_PAGE_LG2 12 // log2(size of a page) +#else #define GC_PAGE_LG2 14 // log2(size of a page) -#define GC_PAGE_SZ (1 << GC_PAGE_LG2) // 16k +#endif +#define GC_PAGE_SZ (1 << GC_PAGE_LG2) #define GC_PAGE_OFFSET (JL_HEAP_ALIGNMENT - (sizeof(jl_taggedvalue_t) % JL_HEAP_ALIGNMENT)) #define jl_malloc_tag ((void*)0xdeadaa01) @@ -42,7 +46,6 @@ extern "C" { typedef struct { uint64_t num; uint64_t next; - uint64_t min; uint64_t interv; uint64_t max; @@ -77,179 +80,49 @@ typedef struct { uint64_t max_memory; uint64_t time_to_safepoint; uint64_t max_time_to_safepoint; + uint64_t total_time_to_safepoint; uint64_t sweep_time; uint64_t mark_time; uint64_t total_sweep_time; uint64_t total_mark_time; + uint64_t last_full_sweep; + uint64_t last_incremental_sweep; } jl_gc_num_t; -enum { - GC_MARK_L_marked_obj, - GC_MARK_L_scan_only, - GC_MARK_L_finlist, - GC_MARK_L_objarray, - GC_MARK_L_array8, - GC_MARK_L_array16, - GC_MARK_L_obj8, - GC_MARK_L_obj16, - GC_MARK_L_obj32, - GC_MARK_L_stack, - GC_MARK_L_excstack, - GC_MARK_L_module_binding, - _GC_MARK_L_MAX -}; - -// The following structs (`gc_mark_*_t`) contain iterator state used for the -// scanning of various object types. -// -// The `nptr` member records the number of pointers slots referenced by -// an object to be used in the full collection heuristics as well as whether the object -// references young objects. -// `nptr >> 2` is the number of pointers fields referenced by the object. -// The lowest bit of `nptr` is set if the object references young object. -// The 2nd lowest bit of `nptr` is the GC old bits of the object after marking. -// A `0x3` in the low bits means that the object needs to be in the remset. - -// An generic object that's marked and needs to be scanned -// The metadata might need update too (depend on the PC) -typedef struct { - jl_value_t *obj; // The object - uintptr_t tag; // The tag with the GC bits masked out - uint8_t bits; // The GC bits after tagging (`bits & 1 == 1`) -} gc_mark_marked_obj_t; - -// An object array. This can come from an array, svec, or the using array or a module -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - jl_value_t **begin; // The first slot to be scanned. - jl_value_t **end; // The end address (after the last slot to be scanned) - uint32_t step; // Number of pointers to jump between marks - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_objarray_t; - -// A normal object with 8bits field descriptors -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - uint8_t *begin; // Current field descriptor. - uint8_t *end; // End of field descriptor. - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_obj8_t; - -// A normal object with 16bits field descriptors -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - uint16_t *begin; // Current field descriptor. - uint16_t *end; // End of field descriptor. - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_obj16_t; - -// A normal object with 32bits field descriptors -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - uint32_t *begin; // Current field descriptor. - uint32_t *end; // End of field descriptor. 
- uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_obj32_t; - -typedef struct { - jl_value_t **begin; // The first slot to be scanned. - jl_value_t **end; // The end address (after the last slot to be scanned) - uint8_t *rebegin; - gc_mark_obj8_t elem; -} gc_mark_array8_t; - -typedef struct { - jl_value_t **begin; // The first slot to be scanned. - jl_value_t **end; // The end address (after the last slot to be scanned) - uint16_t *rebegin; - gc_mark_obj16_t elem; -} gc_mark_array16_t; - -// Stack frame -typedef struct { - jl_gcframe_t *s; // The current stack frame - uint32_t i; // The current slot index in the frame - uint32_t nroots; // `nroots` fields in the frame - // Parameters to mark the copy_stack range. - uintptr_t offset; - uintptr_t lb; - uintptr_t ub; -} gc_mark_stackframe_t; - -// Exception stack data -typedef struct { - jl_excstack_t *s; // Stack of exceptions - size_t itr; // Iterator into exception stack - size_t bt_index; // Current backtrace buffer entry index - size_t jlval_index; // Index into GC managed values for current bt entry -} gc_mark_excstack_t; - -// Module bindings. This is also the beginning of module scanning. -// The loop will start marking other references in a module after the bindings are marked -typedef struct { - jl_module_t *parent; // The parent module to trigger write barrier on. - jl_binding_t **begin; // The first slot to be scanned. - jl_binding_t **end; // The end address (after the last slot to be scanned) - uintptr_t nptr; // See notes about `nptr` above. - uint8_t bits; // GC bits of the module (the bits to mark the binding buffer with) -} gc_mark_binding_t; - -// Finalizer (or object) list -typedef struct { - jl_value_t **begin; - jl_value_t **end; -} gc_mark_finlist_t; - -// This is used to determine the max size of the data objects on the data stack. -// We'll use this size to determine the size of the data stack corresponding to a -// PC stack size. Since the data objects are not all of the same size, we'll waste -// some memory on the data stack this way but that size is unlikely going to be significant. -union _jl_gc_mark_data { - gc_mark_marked_obj_t marked; - gc_mark_objarray_t objarray; - gc_mark_array8_t array8; - gc_mark_array16_t array16; - gc_mark_obj8_t obj8; - gc_mark_obj16_t obj16; - gc_mark_obj32_t obj32; - gc_mark_stackframe_t stackframe; - gc_mark_excstack_t excstackframe; - gc_mark_binding_t binding; - gc_mark_finlist_t finlist; -}; - -// Pop a data struct from the mark data stack (i.e. decrease the stack pointer) -// This should be used after dispatch and therefore the pc stack pointer is already popped from -// the stack. -STATIC_INLINE void *gc_pop_markdata_(jl_gc_mark_sp_t *sp, size_t size) -{ - jl_gc_mark_data_t *data = (jl_gc_mark_data_t *)(((char*)sp->data) - size); - sp->data = data; - return data; -} -#define gc_pop_markdata(sp, type) ((type*)gc_pop_markdata_(sp, sizeof(type))) - -// Re-push a frame to the mark stack (both data and pc) -// The data and pc are expected to be on the stack (or updated in place) already. -// Mainly useful to pause the current scanning in order to scan an new object. 
-STATIC_INLINE void *gc_repush_markdata_(jl_gc_mark_sp_t *sp, size_t size) JL_NOTSAFEPOINT -{ - jl_gc_mark_data_t *data = sp->data; - sp->pc++; - sp->data = (jl_gc_mark_data_t *)(((char*)sp->data) + size); - return data; -} -#define gc_repush_markdata(sp, type) ((type*)gc_repush_markdata_(sp, sizeof(type))) +// Array chunks (work items representing suffixes of +// large arrays of pointers left to be marked) + +typedef enum { + GC_empty_chunk = 0, // for sentinel representing no items left in chunk queue + GC_objary_chunk, // for chunk of object array + GC_ary8_chunk, // for chunk of array with 8 bit field descriptors + GC_ary16_chunk, // for chunk of array with 16 bit field descriptors + GC_finlist_chunk, // for chunk of finalizer list +} gc_chunk_id_t; + +typedef struct _jl_gc_chunk_t { + gc_chunk_id_t cid; + struct _jl_value_t *parent; // array owner + struct _jl_value_t **begin; // pointer to first element that needs scanning + struct _jl_value_t **end; // pointer to last element that needs scanning + void *elem_begin; // used to scan pointers within objects when marking `ary8` or `ary16` + void *elem_end; // used to scan pointers within objects when marking `ary8` or `ary16` + uint32_t step; // step-size used when marking objarray + uintptr_t nptr; // (`nptr` & 0x1) if array has young element and (`nptr` & 0x2) if array owner is old +} jl_gc_chunk_t; + +#define GC_CHUNK_BATCH_SIZE (1 << 16) // maximum number of references that can be processed + // without creating a chunk + +#define GC_PTR_QUEUE_INIT_SIZE (1 << 18) // initial size of queue of `jl_value_t *` +#define GC_CHUNK_QUEUE_INIT_SIZE (1 << 14) // initial size of chunk-queue // layout for big (>2k) objects JL_EXTENSION typedef struct _bigval_t { struct _bigval_t *next; struct _bigval_t **prev; // pointer to the next field of the prev entry - union { - size_t sz; - uintptr_t age : 2; - }; + size_t sz; #ifdef _P64 // Add padding so that the value is 64-byte aligned // (8 pointers of 8 bytes each) - (4 other pointers in struct) void *_padding[8 - 4]; @@ -275,7 +148,10 @@ typedef struct _mallocarray_t { } mallocarray_t; // pool page metadata -typedef struct { +typedef struct _jl_gc_pagemeta_t { + // next metadata structure in per-thread list + // or in one of the `jl_gc_global_page_pool_t` + struct _jl_gc_pagemeta_t *next; // index of pool that owns this page uint8_t pool_n; // Whether any cell in the page is marked @@ -302,36 +178,74 @@ typedef struct { // number of free objects in this page. // invalid if pool that owns this page is allocating objects from this page. uint16_t nfree; - uint16_t osize; // size of each object in this page + uint16_t osize; // size of each object in this page uint16_t fl_begin_offset; // offset of first free object in this page uint16_t fl_end_offset; // offset of last free object in this page uint16_t thread_n; // thread id of the heap that owns this page char *data; - uint8_t *ages; } jl_gc_pagemeta_t; -// Page layout: -// Newpage freelist: sizeof(void*) -// Padding: GC_PAGE_OFFSET - sizeof(void*) -// Blocks: osize * n -// Tag: sizeof(jl_taggedvalue_t) -// Data: <= osize - sizeof(jl_taggedvalue_t) +extern jl_gc_page_stack_t global_page_pool_lazily_freed; +extern jl_gc_page_stack_t global_page_pool_clean; +extern jl_gc_page_stack_t global_page_pool_freed; + +// Lock-free stack implementation taken +// from Herlihy's "The Art of Multiprocessor Programming" +// XXX: this is not a general-purpose lock-free stack. 
We can +// get away with just using a CAS and not implementing some ABA +// prevention mechanism since once a node is popped from the +// `jl_gc_page_stack_t`, it may only be pushed back to them +// in the sweeping phase, which also doesn't push a node into the +// same stack after it's popped + +STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT +{ + while (1) { + jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom); + elt->next = old_back; + if (jl_atomic_cmpswap(&pool->bottom, &old_back, elt)) { + break; + } + jl_cpu_pause(); + } +} + +STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT +{ + while (1) { + jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom); + if (old_back == NULL) { + return NULL; + } + if (jl_atomic_cmpswap(&pool->bottom, &old_back, old_back->next)) { + return old_back; + } + jl_cpu_pause(); + } +} + +typedef struct { + _Atomic(size_t) n_freed_objs; + _Atomic(size_t) n_pages_allocd; +} gc_fragmentation_stat_t; -// Memory map: -// The complete address space is divided up into a multi-level page table. -// The three levels have similar but slightly different structures: -// - pagetable0_t: the bottom/leaf level (covers the contiguous addresses) -// - pagetable1_t: the middle level -// - pagetable2_t: the top/leaf level (covers the entire virtual address space) -// Corresponding to these similar structures is a large amount of repetitive -// code that is nearly the same but not identical. It could be made less -// repetitive with C macros, but only at the cost of debuggability. The specialized -// structure of this representation allows us to partially unroll and optimize -// various conditions at each level. - -// The following constants define the branching factors at each level. -// The constants and GC_PAGE_LG2 must therefore sum to sizeof(void*). -// They should all be multiples of 32 (sizeof(uint32_t)) except that REGION2_PG_COUNT may also be 1. 
+#ifdef GC_SMALL_PAGE +#ifdef _P64 +#define REGION0_PG_COUNT (1 << 16) +#define REGION1_PG_COUNT (1 << 18) +#define REGION2_PG_COUNT (1 << 18) +#define REGION0_INDEX(p) (((uintptr_t)(p) >> 12) & 0xFFFF) // shift by GC_PAGE_LG2 +#define REGION1_INDEX(p) (((uintptr_t)(p) >> 28) & 0x3FFFF) +#define REGION_INDEX(p) (((uintptr_t)(p) >> 46) & 0x3FFFF) +#else +#define REGION0_PG_COUNT (1 << 10) +#define REGION1_PG_COUNT (1 << 10) +#define REGION2_PG_COUNT (1 << 0) +#define REGION0_INDEX(p) (((uintptr_t)(p) >> 12) & 0x3FF) // shift by GC_PAGE_LG2 +#define REGION1_INDEX(p) (((uintptr_t)(p) >> 22) & 0x3FF) +#define REGION_INDEX(p) (0) +#endif +#else #ifdef _P64 #define REGION0_PG_COUNT (1 << 16) #define REGION1_PG_COUNT (1 << 16) @@ -347,39 +261,122 @@ typedef struct { #define REGION1_INDEX(p) (((uintptr_t)(p) >> 22) & 0x3FF) #define REGION_INDEX(p) (0) #endif +#endif // define the representation of the levels of the page-table (0 to 2) typedef struct { - jl_gc_pagemeta_t *meta[REGION0_PG_COUNT]; - uint32_t allocmap[REGION0_PG_COUNT / 32]; - uint32_t freemap[REGION0_PG_COUNT / 32]; - // store a lower bound of the first free page in each region - int lb; - // an upper bound of the last non-free page - int ub; + uint8_t meta[REGION0_PG_COUNT]; } pagetable0_t; typedef struct { pagetable0_t *meta0[REGION1_PG_COUNT]; - uint32_t allocmap0[REGION1_PG_COUNT / 32]; - uint32_t freemap0[REGION1_PG_COUNT / 32]; - // store a lower bound of the first free page in each region - int lb; - // an upper bound of the last non-free page - int ub; } pagetable1_t; typedef struct { pagetable1_t *meta1[REGION2_PG_COUNT]; - uint32_t allocmap1[(REGION2_PG_COUNT + 31) / 32]; - uint32_t freemap1[(REGION2_PG_COUNT + 31) / 32]; - // store a lower bound of the first free page in each region - int lb; - // an upper bound of the last non-free page - int ub; } pagetable_t; -#ifdef __clang_gcanalyzer__ +#define GC_PAGE_UNMAPPED 0 +#define GC_PAGE_ALLOCATED 1 +#define GC_PAGE_LAZILY_FREED 2 +#define GC_PAGE_FREED 3 + +extern pagetable_t alloc_map; + +STATIC_INLINE uint8_t gc_alloc_map_is_set(char *_data) JL_NOTSAFEPOINT +{ + uintptr_t data = ((uintptr_t)_data); + unsigned i; + i = REGION_INDEX(data); + pagetable1_t *r1 = alloc_map.meta1[i]; + if (r1 == NULL) + return 0; + i = REGION1_INDEX(data); + pagetable0_t *r0 = r1->meta0[i]; + if (r0 == NULL) + return 0; + i = REGION0_INDEX(data); + return (r0->meta[i] == GC_PAGE_ALLOCATED); +} + +STATIC_INLINE void gc_alloc_map_set(char *_data, uint8_t v) JL_NOTSAFEPOINT +{ + uintptr_t data = ((uintptr_t)_data); + unsigned i; + i = REGION_INDEX(data); + pagetable1_t *r1 = alloc_map.meta1[i]; + assert(r1 != NULL); + i = REGION1_INDEX(data); + pagetable0_t *r0 = r1->meta0[i]; + assert(r0 != NULL); + i = REGION0_INDEX(data); + r0->meta[i] = v; +} + +STATIC_INLINE void gc_alloc_map_maybe_create(char *_data) JL_NOTSAFEPOINT +{ + uintptr_t data = ((uintptr_t)_data); + unsigned i; + i = REGION_INDEX(data); + pagetable1_t *r1 = alloc_map.meta1[i]; + if (r1 == NULL) { + r1 = (pagetable1_t*)calloc_s(sizeof(pagetable1_t)); + alloc_map.meta1[i] = r1; + } + i = REGION1_INDEX(data); + pagetable0_t *r0 = r1->meta0[i]; + if (r0 == NULL) { + r0 = (pagetable0_t*)calloc_s(sizeof(pagetable0_t)); + r1->meta0[i] = r0; + } +} + +// Page layout: +// Metadata pointer: sizeof(jl_gc_pagemeta_t*) +// Padding: GC_PAGE_OFFSET - sizeof(jl_gc_pagemeta_t*) +// Blocks: osize * n +// Tag: sizeof(jl_taggedvalue_t) +// Data: <= osize - sizeof(jl_taggedvalue_t) + +STATIC_INLINE char *gc_page_data(void *x) JL_NOTSAFEPOINT +{ + return 
(char*)(((uintptr_t)x >> GC_PAGE_LG2) << GC_PAGE_LG2); +} + +STATIC_INLINE jl_gc_pagemeta_t *page_metadata_unsafe(void *_data) JL_NOTSAFEPOINT +{ + return *(jl_gc_pagemeta_t**)(gc_page_data(_data)); +} + +STATIC_INLINE jl_gc_pagemeta_t *page_metadata(void *_data) JL_NOTSAFEPOINT +{ + if (!gc_alloc_map_is_set((char*)_data)) { + return NULL; + } + return page_metadata_unsafe(_data); +} + +STATIC_INLINE void set_page_metadata(jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT +{ + *(jl_gc_pagemeta_t**)(pg->data) = pg; +} + +STATIC_INLINE void push_page_metadata_back(jl_gc_pagemeta_t **ppg, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT +{ + elt->next = *ppg; + *ppg = elt; +} + +STATIC_INLINE jl_gc_pagemeta_t *pop_page_metadata_back(jl_gc_pagemeta_t **ppg) JL_NOTSAFEPOINT +{ + jl_gc_pagemeta_t *v = *ppg; + if (*ppg != NULL) { + *ppg = (*ppg)->next; + } + return v; +} + +#ifdef __clang_gcanalyzer__ /* clang may not have __builtin_ffs */ unsigned ffs_u32(uint32_t bitvec) JL_NOTSAFEPOINT; #else STATIC_INLINE unsigned ffs_u32(uint32_t bitvec) @@ -389,11 +386,11 @@ STATIC_INLINE unsigned ffs_u32(uint32_t bitvec) #endif extern jl_gc_num_t gc_num; -extern pagetable_t memory_map; extern bigval_t *big_objects_marked; extern arraylist_t finalizer_list_marked; extern arraylist_t to_finalize; -extern int64_t lazy_freed_pages; +extern int64_t buffered_pages; +extern int gc_first_tid; extern int gc_n_threads; extern jl_ptls_t* gc_all_tls_states; @@ -402,12 +399,6 @@ STATIC_INLINE bigval_t *bigval_header(jl_taggedvalue_t *o) JL_NOTSAFEPOINT return container_of(o, bigval_t, header); } -// round an address inside a gcpage's data to its beginning -STATIC_INLINE char *gc_page_data(void *x) JL_NOTSAFEPOINT -{ - return (char*)(((uintptr_t)x >> GC_PAGE_LG2) << GC_PAGE_LG2); -} - STATIC_INLINE jl_taggedvalue_t *page_pfl_beg(jl_gc_pagemeta_t *p) JL_NOTSAFEPOINT { return (jl_taggedvalue_t*)(p->data + p->fl_begin_offset); @@ -445,52 +436,6 @@ STATIC_INLINE void *gc_ptr_clear_tag(void *v, uintptr_t mask) JL_NOTSAFEPOINT NOINLINE uintptr_t gc_get_stack_ptr(void); -STATIC_INLINE jl_gc_pagemeta_t *page_metadata(void *_data) JL_NOTSAFEPOINT -{ - uintptr_t data = ((uintptr_t)_data); - unsigned i; - i = REGION_INDEX(data); - pagetable1_t *r1 = memory_map.meta1[i]; - if (!r1) - return NULL; - i = REGION1_INDEX(data); - pagetable0_t *r0 = r1->meta0[i]; - if (!r0) - return NULL; - i = REGION0_INDEX(data); - return r0->meta[i]; -} - -struct jl_gc_metadata_ext { - pagetable1_t *pagetable1; - pagetable0_t *pagetable0; - jl_gc_pagemeta_t *meta; - unsigned pagetable_i32, pagetable_i; - unsigned pagetable1_i32, pagetable1_i; - unsigned pagetable0_i32, pagetable0_i; -}; - -STATIC_INLINE struct jl_gc_metadata_ext page_metadata_ext(void *_data) JL_NOTSAFEPOINT -{ - uintptr_t data = (uintptr_t)_data; - struct jl_gc_metadata_ext info; - unsigned i; - i = REGION_INDEX(data); - info.pagetable_i = i % 32; - info.pagetable_i32 = i / 32; - info.pagetable1 = memory_map.meta1[i]; - i = REGION1_INDEX(data); - info.pagetable1_i = i % 32; - info.pagetable1_i32 = i / 32; - info.pagetable0 = info.pagetable1->meta0[i]; - i = REGION0_INDEX(data); - info.pagetable0_i = i % 32; - info.pagetable0_i32 = i / 32; - info.meta = info.pagetable0->meta[i]; - assert(info.meta); - return info; -} - STATIC_INLINE void gc_big_object_unlink(const bigval_t *hdr) JL_NOTSAFEPOINT { *hdr->prev = hdr->next; @@ -508,28 +453,26 @@ STATIC_INLINE void gc_big_object_link(bigval_t *hdr, bigval_t **list) JL_NOTSAFE *list = hdr; } -STATIC_INLINE void gc_mark_sp_init(jl_gc_mark_cache_t *gc_cache, 
jl_gc_mark_sp_t *sp) -{ - sp->pc = gc_cache->pc_stack; - sp->data = gc_cache->data_stack; - sp->pc_start = gc_cache->pc_stack; - sp->pc_end = gc_cache->pc_stack_end; -} - -void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_mark_sp_t *sp); -void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - arraylist_t *list, size_t start); -void gc_mark_loop(jl_ptls_t ptls, jl_gc_mark_sp_t sp); +extern uv_mutex_t gc_threads_lock; +extern uv_cond_t gc_threads_cond; +extern _Atomic(int) gc_n_threads_marking; +extern _Atomic(int) gc_n_threads_sweeping; +void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq); +void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t **fl_end) JL_NOTSAFEPOINT; +void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) JL_NOTSAFEPOINT; +void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq); +void gc_mark_loop_serial(jl_ptls_t ptls); +void gc_mark_loop_parallel(jl_ptls_t ptls, int master); +void gc_sweep_pool_parallel(void); +void gc_free_pages(void); void sweep_stack_pools(void); void jl_gc_debug_init(void); -extern void *gc_mark_label_addrs[_GC_MARK_L_MAX]; - // GC pages -void jl_gc_init_page(void); +void jl_gc_init_page(void) JL_NOTSAFEPOINT; NOINLINE jl_gc_pagemeta_t *jl_gc_alloc_page(void) JL_NOTSAFEPOINT; -void jl_gc_free_page(void *p) JL_NOTSAFEPOINT; +void jl_gc_free_page(jl_gc_pagemeta_t *p) JL_NOTSAFEPOINT; // GC debug @@ -609,7 +552,6 @@ static inline void gc_verify_tags(void) } #endif - #ifdef GC_VERIFY extern jl_value_t *lostval; void gc_verify(jl_ptls_t ptls); @@ -650,10 +592,9 @@ extern int gc_verifying; #define gc_verifying (0) #endif - int gc_slot_to_fieldidx(void *_obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT; int gc_slot_to_arrayidx(void *_obj, void *begin) JL_NOTSAFEPOINT; -NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_mark_sp_t sp, int pc_offset); +NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_markqueue_t *mq, int offset) JL_NOTSAFEPOINT; #ifdef GC_DEBUG_ENV JL_DLLEXPORT extern jl_gc_debug_env_t jl_gc_debug_env; @@ -717,6 +658,7 @@ void gc_count_pool(void); size_t jl_array_nbytes(jl_array_t *a) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_enable_gc_logging(int enable); +JL_DLLEXPORT uint32_t jl_get_num_stack_mappings(void); void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect) JL_NOTSAFEPOINT; #ifdef __cplusplus diff --git a/src/init.c b/src/init.c index 522f16041d566..ea344fb9e1e62 100644 --- a/src/init.c +++ b/src/init.c @@ -804,6 +804,8 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel) _finish_julia_init(rel, ptls, ct); } +void jl_init_heartbeat(void); + static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_task_t *ct) { jl_resolve_sysimg_location(rel); @@ -840,9 +842,12 @@ static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_ if (jl_base_module == NULL) { // nthreads > 1 requires code in Base jl_atomic_store_relaxed(&jl_n_threads, 1); + jl_n_gcthreads = 0; } jl_start_threads(); + jl_init_heartbeat(); + jl_gc_enable(1); if (jl_options.image_file && (!jl_generating_output() || jl_options.incremental) && jl_module_init_order) { diff --git a/src/interpreter.c b/src/interpreter.c index 1f9c416d99b1b..6f546db9bbbb1 100644 --- a/src/interpreter.c +++ b/src/interpreter.c @@ -65,7 +65,8 @@ extern void JL_GC_ENABLEFRAME(interpreter_state*) JL_NOTSAFEPOINT; // we define this separately so that we can populate the frame before we add it to the backtrace // it's recommended 
to mark the containing function with NOINLINE, though not essential #define JL_GC_ENABLEFRAME(frame) \ - ((void**)&frame[1])[0] = __builtin_frame_address(0); + jl_signal_fence(); \ + ((void**)&frame[1])[0] = __builtin_frame_address(0); #endif diff --git a/src/jl_exported_data.inc b/src/jl_exported_data.inc index fee57ed60dd7a..f28ecefbded4a 100644 --- a/src/jl_exported_data.inc +++ b/src/jl_exported_data.inc @@ -132,6 +132,7 @@ #define JL_EXPORTED_DATA_SYMBOLS(XX) \ XX(jl_n_threadpools, int) \ XX(jl_n_threads, _Atomic(int)) \ + XX(jl_n_gcthreads, int) \ XX(jl_options, jl_options_t) \ XX(jl_task_gcstack_offset, int) \ XX(jl_task_ptls_offset, int) \ diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index 7f29176e67755..7699dc8cdb904 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -160,6 +160,7 @@ XX(jl_gc_alloc_3w) \ XX(jl_gc_alloc_typed) \ XX(jl_gc_big_alloc) \ + XX(jl_gc_big_alloc_instrumented) \ XX(jl_gc_collect) \ XX(jl_gc_conservative_gc_support_enabled) \ XX(jl_gc_counted_calloc) \ @@ -176,6 +177,7 @@ XX(jl_gc_get_max_memory) \ XX(jl_gc_internal_obj_base_ptr) \ XX(jl_gc_is_enabled) \ + XX(jl_gc_pool_live_bytes) \ XX(jl_gc_live_bytes) \ XX(jl_gc_managed_malloc) \ XX(jl_gc_managed_realloc) \ @@ -186,6 +188,7 @@ XX(jl_gc_new_weakref_th) \ XX(jl_gc_num) \ XX(jl_gc_pool_alloc) \ + XX(jl_gc_pool_alloc_instrumented) \ XX(jl_gc_queue_multiroot) \ XX(jl_gc_queue_root) \ XX(jl_gc_safepoint) \ @@ -196,6 +199,7 @@ XX(jl_gc_set_cb_pre_gc) \ XX(jl_gc_set_cb_root_scanner) \ XX(jl_gc_set_cb_task_scanner) \ + XX(jl_gc_set_max_memory) \ XX(jl_gc_sync_total_bytes) \ XX(jl_gc_total_hrtime) \ XX(jl_gdblookup) \ diff --git a/src/jloptions.c b/src/jloptions.c index 988078c7c58d9..812543a09399e 100644 --- a/src/jloptions.c +++ b/src/jloptions.c @@ -40,6 +40,7 @@ JL_DLLEXPORT void jl_init_options(void) NULL, // cpu_target ("native", "core2", etc...) 
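    /* nthreadpools, nthreads, and the new ngcthreads below start at 0 and are
       filled in during option parsing; ngcthreads == 0 means --gcthreads was not
       passed, in which case (per the --gcthreads help text further down) the
       runtime falls back to half the number of compute threads. */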
0, // nthreadpools 0, // nthreads + 0, // ngcthreads NULL, // nthreads_per_pool 0, // nprocs NULL, // machine_file @@ -129,6 +130,7 @@ static const char opts[] = " interface if supported (Linux and Windows) or to the number of CPU\n" " threads if not supported (MacOS) or if process affinity is not\n" " configured, and sets M to 1.\n" + " --gcthreads=N Use N threads for GC, set to half of the number of compute threads if unspecified.\n" " -p, --procs {N|auto} Integer value N launches N additional local worker processes\n" " \"auto\" launches as many workers as the number of local CPU threads (logical cores)\n" " --machine-file <file> Run processes on hosts listed in <file>\n\n" @@ -253,7 +255,8 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) opt_strip_metadata, opt_strip_ir, opt_heap_size_hint, - opt_permalloc_pkgimg + opt_permalloc_pkgimg, + opt_gc_threads, }; static const char* const shortopts = "+vhqH:e:E:L:J:C:it:p:O:g:"; static const struct option longopts[] = { @@ -278,6 +281,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) { "cpu-target", required_argument, 0, 'C' }, { "procs", required_argument, 0, 'p' }, { "threads", required_argument, 0, 't' }, + { "gcthreads", required_argument, 0, opt_gc_threads }, { "machine-file", required_argument, 0, opt_machine_file }, { "project", optional_argument, 0, opt_project }, { "color", required_argument, 0, opt_color }, @@ -811,8 +815,6 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) break; } jl_options.heap_size_hint = (uint64_t)(value * multiplier); - - jl_gc_set_max_memory(jl_options.heap_size_hint); } } if (jl_options.heap_size_hint == 0) @@ -827,6 +829,13 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) else jl_errorf("julia: invalid argument to --permalloc-pkgimg={yes|no} (%s)", optarg); break; + case opt_gc_threads: + errno = 0; + long ngcthreads = strtol(optarg, &endptr, 10); + if (errno != 0 || optarg == endptr || *endptr != 0 || ngcthreads < 1 || ngcthreads >= INT16_MAX) + jl_errorf("julia: --gcthreads=<n>; n must be an integer >= 1"); + jl_options.ngcthreads = (int16_t)ngcthreads; + break; default: jl_errorf("julia: unhandled option -- %c\n" "This is a bug, please report it.", c); diff --git a/src/jloptions.h b/src/jloptions.h index 0b72b4c3be062..93f6d321f38d6 100644 --- a/src/jloptions.h +++ b/src/jloptions.h @@ -15,6 +15,7 @@ typedef struct { const char *cpu_target; int8_t nthreadpools; int16_t nthreads; + int16_t ngcthreads; const int16_t *nthreads_per_pool; int32_t nprocs; const char *machine_file; diff --git a/src/julia.h b/src/julia.h index 0a88633905324..a5d3259ad0d3f 100644 --- a/src/julia.h +++ b/src/julia.h @@ -927,6 +927,8 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, struct _jl_task_t *owner) JL_N JL_DLLEXPORT void jl_free_stack(void *stkbuf, size_t bufsz); JL_DLLEXPORT void jl_gc_use(jl_value_t *a); JL_DLLEXPORT uint64_t jl_gc_get_max_memory(void); +// Set GC memory trigger in bytes for greedy memory collecting +JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem); JL_DLLEXPORT void jl_clear_malloc_data(void); @@ -968,6 +970,11 @@ JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz, int isaligned, jl_value_t *owner); JL_DLLEXPORT void jl_gc_safepoint(void); +void *mtarraylist_get(small_arraylist_t *_a, size_t idx) JL_NOTSAFEPOINT; +size_t mtarraylist_length(small_arraylist_t *_a) JL_NOTSAFEPOINT; +void mtarraylist_add(small_arraylist_t *_a, void *elt, size_t idx) JL_NOTSAFEPOINT; +void mtarraylist_push(small_arraylist_t *_a, void *elt) 
JL_NOTSAFEPOINT; + // object accessors ----------------------------------------------------------- #define jl_svec_len(t) (((jl_svec_t*)(t))->length) @@ -1684,6 +1691,7 @@ JL_DLLEXPORT jl_sym_t *jl_get_ARCH(void) JL_NOTSAFEPOINT; JL_DLLEXPORT jl_value_t *jl_get_libllvm(void) JL_NOTSAFEPOINT; extern JL_DLLIMPORT int jl_n_threadpools; extern JL_DLLIMPORT _Atomic(int) jl_n_threads; +extern JL_DLLIMPORT int jl_n_gcthreads; extern JL_DLLIMPORT int *jl_n_threads_per_pool; // environment entries @@ -2241,9 +2249,11 @@ typedef struct { // controls the emission of debug-info. mirrors the clang options int gnu_pubnames; // can we emit the gnu pubnames debuginfo - int debug_info_kind; // Enum for line-table-only, line-directives-only, + int debug_info_kind; // Enum for line-table-only, line-directives-only, // limited, standalone + int safepoint_on_entry; // Emit a safepoint on entry to each function + // Cache access. Default: jl_rettype_inferred. jl_codeinstance_lookup_t lookup; diff --git a/src/julia_gcext.h b/src/julia_gcext.h index 27f0a6b5ec11c..e7cb57d622a78 100644 --- a/src/julia_gcext.h +++ b/src/julia_gcext.h @@ -34,6 +34,10 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_external_alloc(jl_gc_cb_notify_external_al JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_free_t cb, int enable); +// Memory pressure callback +typedef void (*jl_gc_cb_notify_gc_pressure_t)(void); +JL_DLLEXPORT void jl_gc_set_cb_notify_gc_pressure(jl_gc_cb_notify_gc_pressure_t cb, int enable); + // Types for custom mark and sweep functions. typedef uintptr_t (*jl_markfunc_t)(jl_ptls_t, jl_value_t *obj); typedef void (*jl_sweepfunc_t)(jl_value_t *obj); diff --git a/src/julia_internal.h b/src/julia_internal.h index e5419efd2a2ac..b932cc82be6ea 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -199,6 +199,8 @@ JL_DLLEXPORT void jl_lock_profile(void) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_unlock_profile(void) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_lock_profile_wr(void) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_unlock_profile_wr(void) JL_NOTSAFEPOINT; +int jl_lock_stackwalk(void) JL_NOTSAFEPOINT; +void jl_unlock_stackwalk(int lockret) JL_NOTSAFEPOINT; // number of cycles since power-on static inline uint64_t cycleclock(void) JL_NOTSAFEPOINT @@ -335,7 +337,6 @@ void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset) JL_NOTSAFEPOINT; void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, unsigned offset) JL_NOTSAFEPOINT; -void jl_gc_force_mark_old(jl_ptls_t ptls, jl_value_t *v); void gc_sweep_sysimg(void); @@ -358,24 +359,48 @@ static const int jl_gc_sizeclasses[] = { 144, 160, 176, 192, 208, 224, 240, 256, // the following tables are computed for maximum packing efficiency via the formula: - // pg = 2^14 + // pg = GC_SMALL_PAGE ? 
2^12 : 2^14 // sz = (div.(pg-8, rng).÷16)*16; hcat(sz, (pg-8).÷sz, pg .- (pg-8).÷sz.*sz)' +#ifdef GC_SMALL_PAGE + // rng = 15:-1:2 (14 pools) + 272, 288, 304, 336, 368, 400, 448, 496, 576, 672, 816, 1008, 1360, 2032 +// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, /pool +// 16, 64, 144, 64, 48, 96, 64, 128, 64, 64, 16, 64, 16, 32, bytes lost +#else // rng = 60:-4:32 (8 pools) 272, 288, 304, 336, 368, 400, 448, 496, -// 60, 56, 53, 48, 44, 40, 36, 33, /pool -// 64, 256, 272, 256, 192, 384, 256, 16, bytes lost +// 60, 56, 53, 48, 44, 40, 36, 33, /pool +// 64, 256, 272, 256, 192, 384, 256, 16, bytes lost // rng = 30:-2:16 (8 pools) 544, 576, 624, 672, 736, 816, 896, 1008, -// 30, 28, 26, 24, 22, 20, 18, 16, /pool -// 64, 256, 160, 256, 192, 64, 256, 256, bytes lost +// 30, 28, 26, 24, 22, 20, 18, 16, /pool +// 64, 256, 160, 256, 192, 64, 256, 256, bytes lost // rng = 15:-1:8 (8 pools) 1088, 1168, 1248, 1360, 1488, 1632, 1808, 2032 -// 15, 14, 13, 12, 11, 10, 9, 8, /pool -// 64, 32, 160, 64, 16, 64, 112, 128, bytes lost +// 15, 14, 13, 12, 11, 10, 9, 8, /pool +// 64, 32, 160, 64, 16, 64, 112, 128, bytes lost +#endif }; +#ifdef GC_SMALL_PAGE +#ifdef _P64 +# define JL_GC_N_POOLS 39 +#elif MAX_ALIGN == 8 +# define JL_GC_N_POOLS 40 +#else +# define JL_GC_N_POOLS 41 +#endif +#else +#ifdef _P64 +# define JL_GC_N_POOLS 49 +#elif MAX_ALIGN == 8 +# define JL_GC_N_POOLS 50 +#else +# define JL_GC_N_POOLS 51 +#endif +#endif static_assert(sizeof(jl_gc_sizeclasses) / sizeof(jl_gc_sizeclasses[0]) == JL_GC_N_POOLS, ""); STATIC_INLINE int jl_gc_alignment(size_t sz) @@ -402,7 +427,12 @@ JL_DLLEXPORT int jl_alignment(size_t sz); // the following table is computed as: // [searchsortedfirst(jl_gc_sizeclasses, i) - 1 for i = 0:16:jl_gc_sizeclasses[end]] -static const uint8_t szclass_table[] = {0, 1, 3, 5, 7, 9, 11, 13, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 43, 43, 43, 43, 43, 44, 44, 44, 44, 44, 44, 44, 45, 45, 45, 45, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48}; +static const uint8_t szclass_table[] = +#ifdef GC_SMALL_PAGE + {0,1,3,5,7,9,11,13,15,17,18,19,20,21,22,23,24,25,26,27,28,28,29,29,30,30,31,31,31,32,32,32,33,33,33,33,33,34,34,34,34,34,34,35,35,35,35,35,35,35,35,35,36,36,36,36,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38}; +#else + {0,1,3,5,7,9,11,13,15,17,18,19,20,21,22,23,24,25,26,27,28,28,29,29,30,30,31,31,31,32,32,32,33,33,33,34,34,35,35,35,36,36,36,37,37,37,37,38,38,38,38,38,39,39,39,39,39,40,40,40,40,40,40,40,41,41,41,41,41,42,42,42,42,42,43,43,43,43,43,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,46,46,46,46,46,46,46,46,46,47,47,47,47,47,47,47,47,47,47,47,48,48,48,48,48,48,48,48,48,48,48,48,48,48}; +#endif static_assert(sizeof(szclass_table) == 128, ""); STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass(unsigned sz) @@ -560,9 +590,6 @@ void jl_gc_run_all_finalizers(jl_task_t *ct); void jl_release_task_stack(jl_ptls_t ptls, jl_task_t *task); void jl_gc_add_finalizer_(jl_ptls_t ptls, void *v, void *f) JL_NOTSAFEPOINT; -// Set GC memory trigger in bytes for greedy memory collecting -void 
jl_gc_set_max_memory(uint64_t max_mem); - JL_DLLEXPORT void jl_gc_queue_binding(jl_binding_t *bnd) JL_NOTSAFEPOINT; void gc_setmark_buf(jl_ptls_t ptls, void *buf, uint8_t, size_t) JL_NOTSAFEPOINT; @@ -582,8 +609,8 @@ STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOT } } -void jl_gc_debug_print_status(void); -JL_DLLEXPORT void jl_gc_debug_critical_error(void); +void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT; void jl_print_gc_stats(JL_STREAM *s); void jl_gc_reset_alloc_count(void); uint32_t jl_get_gs_ctr(void); @@ -1173,14 +1200,17 @@ size_t rec_backtrace_ctx_dwarf(jl_bt_element_t *bt_data, size_t maxsize, bt_cont #endif JL_DLLEXPORT jl_value_t *jl_get_backtrace(void); void jl_critical_error(int sig, int si_code, bt_context_t *context, jl_task_t *ct); -JL_DLLEXPORT void jl_raise_debugger(void); +JL_DLLEXPORT void jl_raise_debugger(void) JL_NOTSAFEPOINT; int jl_getFunctionInfo(jl_frame_t **frames, uintptr_t pointer, int skipC, int noInline) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_gdblookup(void* ip) JL_NOTSAFEPOINT; -void jl_print_native_codeloc(uintptr_t ip) JL_NOTSAFEPOINT; -void jl_print_bt_entry_codeloc(jl_bt_element_t *bt_data) JL_NOTSAFEPOINT; +void jl_print_native_codeloc(char *pre_str, uintptr_t ip) JL_NOTSAFEPOINT; +void jl_print_bt_entry_codeloc(int sig, jl_bt_element_t *bt_data) JL_NOTSAFEPOINT; #ifdef _OS_WINDOWS_ JL_DLLEXPORT void jl_refresh_dbg_module_list(void); #endif +int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx) JL_NOTSAFEPOINT; +void jl_thread_resume(int tid) JL_NOTSAFEPOINT; + // *to is NULL or malloc'd pointer, from is allowed to be NULL STATIC_INLINE char *jl_copy_str(char **to, const char *from) JL_NOTSAFEPOINT { diff --git a/src/julia_threads.h b/src/julia_threads.h index 847465b363a2e..f69f9dd4baacf 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -5,6 +5,7 @@ #define JL_THREADS_H #include "julia_atomics.h" +#include "work-stealing-queue.h" #ifndef _OS_WINDOWS_ #include "pthread.h" #endif @@ -106,7 +107,7 @@ typedef struct { // handle to reference an OS thread #ifdef _OS_WINDOWS_ -typedef DWORD jl_thread_t; +typedef HANDLE jl_thread_t; #else typedef pthread_t jl_thread_t; #endif @@ -127,6 +128,7 @@ typedef struct { typedef struct { _Atomic(int64_t) allocd; + _Atomic(int64_t) pool_live_bytes; _Atomic(int64_t) freed; _Atomic(uint64_t) malloc; _Atomic(uint64_t) realloc; @@ -137,10 +139,10 @@ typedef struct { typedef struct { // variable for tracking weak references - arraylist_t weak_refs; + small_arraylist_t weak_refs; // live tasks started on this thread // that are holding onto a stack from the pool - arraylist_t live_tasks; + small_arraylist_t live_tasks; // variables for tracking malloc'd arrays struct _mallocarray_t *mallocarrays; @@ -158,29 +160,18 @@ typedef struct { arraylist_t *last_remset; // variables for allocating objects from pools -#ifdef _P64 -# define JL_GC_N_POOLS 49 -#elif MAX_ALIGN == 8 -# define JL_GC_N_POOLS 50 -#else -# define JL_GC_N_POOLS 51 -#endif - jl_gc_pool_t norm_pools[JL_GC_N_POOLS]; +#define JL_GC_N_MAX_POOLS 51 // conservative. must be kept in sync with `src/julia_internal.h` + jl_gc_pool_t norm_pools[JL_GC_N_MAX_POOLS]; #define JL_N_STACK_POOLS 16 - arraylist_t free_stacks[JL_N_STACK_POOLS]; + small_arraylist_t free_stacks[JL_N_STACK_POOLS]; } jl_thread_heap_t; -// Cache of thread local change to global metadata during GC -// This is sync'd after marking. 
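The deleted lines above and the markqueue hunk that follows swap the old explicit mark stack (single-threaded `pc`/`data` stacks) for `jl_gc_markqueue_t`, whose `chunk_queue` and `ptr_queue` are `ws_queue_t` work-stealing queues from the newly included `work-stealing-queue.h`: each GC thread drains its own queue and steals from a peer when it runs dry. A rough, mutex-based C sketch of that owner-pop/thief-steal contract follows (the real `ws_queue_t` is a lock-free Chase-Lev-style deque; all `demo_*` names are illustrative, not the runtime's API):

#include <pthread.h>
#include <stddef.h>

#define DEMO_WS_CAP 1024

// Stand-in for ws_queue_t: one owner pushes/pops at `bottom`,
// idle mark threads steal from `top`.
typedef struct {
    void *buf[DEMO_WS_CAP];
    size_t top, bottom;
    pthread_mutex_t lock;
} demo_ws_queue_t; // init: demo_ws_queue_t q = { .lock = PTHREAD_MUTEX_INITIALIZER };

static void demo_ws_push(demo_ws_queue_t *q, void *obj)
{
    pthread_mutex_lock(&q->lock);
    q->buf[q->bottom++ % DEMO_WS_CAP] = obj; // sketch: no overflow handling
    pthread_mutex_unlock(&q->lock);
}

static void *demo_ws_pop(demo_ws_queue_t *q) // owner side: LIFO
{
    void *obj = NULL;
    pthread_mutex_lock(&q->lock);
    if (q->bottom > q->top)
        obj = q->buf[--q->bottom % DEMO_WS_CAP];
    pthread_mutex_unlock(&q->lock);
    return obj;
}

static void *demo_ws_steal(demo_ws_queue_t *q) // thief side: FIFO
{
    void *obj = NULL;
    pthread_mutex_lock(&q->lock);
    if (q->bottom > q->top)
        obj = q->buf[q->top++ % DEMO_WS_CAP];
    pthread_mutex_unlock(&q->lock);
    return obj;
}

Owner pops are LIFO, which keeps recently pushed objects hot in cache while tracing; steals are FIFO from the opposite end, which reduces contention and tends to hand thieves the oldest, largest units of remaining work.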
-typedef union _jl_gc_mark_data jl_gc_mark_data_t; - typedef struct { - void **pc; // Current stack address for the pc (up growing) - jl_gc_mark_data_t *data; // Current stack address for the data (up growing) - void **pc_start; // Cached value of `gc_cache->pc_stack` - void **pc_end; // Cached value of `gc_cache->pc_stack_end` -} jl_gc_mark_sp_t; + ws_queue_t chunk_queue; + ws_queue_t ptr_queue; + arraylist_t reclaim_set; +} jl_gc_markqueue_t; typedef struct { // thread local increment of `perm_scanned_bytes` @@ -198,12 +189,14 @@ typedef struct { // this makes sure that a single objects can only appear once in // the lists (the mark bit cannot be flipped to `0` without sweeping) void *big_obj[1024]; - void **pc_stack; - void **pc_stack_end; - jl_gc_mark_data_t *data_stack; } jl_gc_mark_cache_t; struct _jl_bt_element_t; +struct _jl_gc_pagemeta_t; + +typedef struct { + _Atomic(struct _jl_gc_pagemeta_t *) bottom; +} jl_gc_page_stack_t; // This includes all the thread local states we care about for a thread. // Changes to TLS field types must be reflected in codegen. @@ -266,9 +259,12 @@ typedef struct _jl_tls_states_t { #endif jl_thread_t system_id; arraylist_t finalizers; + jl_gc_page_stack_t page_metadata_allocd; + jl_gc_page_stack_t page_metadata_buffered; + jl_gc_markqueue_t mark_queue; jl_gc_mark_cache_t gc_cache; arraylist_t sweep_objs; - jl_gc_mark_sp_t gc_mark_sp; + _Atomic(int64_t) gc_sweeps_requested; // Saved exception for previous *external* API call or NULL if cleared. // Access via jl_exception_occurred(). struct _jl_value_t *previous_exception; diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index 1da37a249fbd2..30a5d9a59f676 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -27,6 +27,7 @@ STATISTIC(GetGCFrameSlotCount, "Number of lowered getGCFrameSlotFunc intrinsics" STATISTIC(GCAllocBytesCount, "Number of lowered GCAllocBytesFunc intrinsics"); STATISTIC(QueueGCRootCount, "Number of lowered queueGCRootFunc intrinsics"); STATISTIC(QueueGCBindingCount, "Number of lowered queueGCBindingFunc intrinsics"); +STATISTIC(SafepointCount, "Number of lowered safepoint intrinsics"); using namespace llvm; @@ -72,6 +73,9 @@ struct FinalLowerGC: private JuliaPassContext { // Lowers a `julia.queue_gc_binding` intrinsic. Value *lowerQueueGCBinding(CallInst *target, Function &F); + + // Lowers a `julia.safepoint` intrinsic. 
+ Value *lowerSafepoint(CallInst *target, Function &F); }; Value *FinalLowerGC::lowerNewGCFrame(CallInst *target, Function &F) @@ -202,15 +206,28 @@ Value *FinalLowerGC::lowerQueueGCBinding(CallInst *target, Function &F) return target; } +Value *FinalLowerGC::lowerSafepoint(CallInst *target, Function &F) +{ + ++SafepointCount; + assert(target->arg_size() == 1); + IRBuilder<> builder(target->getContext()); + builder.SetInsertPoint(target); + auto T_size = getSizeTy(builder.getContext()); + Value* signal_page = target->getOperand(0); + Value* load = builder.CreateLoad(T_size, signal_page, true); + return load; +} + Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) { ++GCAllocBytesCount; - assert(target->arg_size() == 2); + assert(target->arg_size() == 3); CallInst *newI; IRBuilder<> builder(target); builder.SetCurrentDebugLocation(target->getDebugLoc()); auto ptls = target->getArgOperand(0); + auto type = target->getArgOperand(2); Attribute derefAttr; if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) { @@ -221,19 +238,19 @@ Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) if (offset < 0) { newI = builder.CreateCall( bigAllocFunc, - { ptls, ConstantInt::get(getSizeTy(F.getContext()), sz + sizeof(void*)) }); + { ptls, ConstantInt::get(getSizeTy(F.getContext()), sz + sizeof(void*)), type }); derefAttr = Attribute::getWithDereferenceableBytes(F.getContext(), sz + sizeof(void*)); } else { auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), offset); auto pool_osize = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize); - newI = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize }); + newI = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize, type }); derefAttr = Attribute::getWithDereferenceableBytes(F.getContext(), osize); } } else { auto size = builder.CreateZExtOrTrunc(target->getArgOperand(1), getSizeTy(F.getContext())); size = builder.CreateAdd(size, ConstantInt::get(getSizeTy(F.getContext()), sizeof(void*))); - newI = builder.CreateCall(allocTypedFunc, { ptls, size, ConstantPointerNull::get(Type::getInt8PtrTy(F.getContext())) }); + newI = builder.CreateCall(allocTypedFunc, { ptls, size, type }); derefAttr = Attribute::getWithDereferenceableBytes(F.getContext(), sizeof(void*)); } newI->setAttributes(newI->getCalledFunction()->getAttributes()); @@ -316,16 +333,20 @@ static void replaceInstruction( bool FinalLowerGC::runOnFunction(Function &F) { - LLVM_DEBUG(dbgs() << "FINAL GC LOWERING: Processing function " << F.getName() << "\n"); // Check availability of functions again since they might have been deleted. initFunctions(*F.getParent()); - if (!pgcstack_getter && !adoptthread_func) + if (!pgcstack_getter && !adoptthread_func) { + LLVM_DEBUG(dbgs() << "FINAL GC LOWERING: Skipping function " << F.getName() << "\n"); return false; + } // Look for a call to 'julia.get_pgcstack'. pgcstack = getPGCstack(F); - if (!pgcstack) + if (!pgcstack) { + LLVM_DEBUG(dbgs() << "FINAL GC LOWERING: Skipping function " << F.getName() << " no pgcstack\n"); return false; + } + LLVM_DEBUG(dbgs() << "FINAL GC LOWERING: Processing function " << F.getName() << "\n"); // Acquire intrinsic functions. 
auto newGCFrameFunc = getOrNull(jl_intrinsics::newGCFrame); @@ -335,6 +356,7 @@ bool FinalLowerGC::runOnFunction(Function &F) auto GCAllocBytesFunc = getOrNull(jl_intrinsics::GCAllocBytes); auto queueGCRootFunc = getOrNull(jl_intrinsics::queueGCRoot); auto queueGCBindingFunc = getOrNull(jl_intrinsics::queueGCBinding); + auto safepointFunc = getOrNull(jl_intrinsics::safepoint); // Lower all calls to supported intrinsics. for (BasicBlock &BB : F) { @@ -346,6 +368,7 @@ bool FinalLowerGC::runOnFunction(Function &F) } Value *callee = CI->getCalledOperand(); + assert(callee); if (callee == newGCFrameFunc) { replaceInstruction(CI, lowerNewGCFrame(CI, F), it); @@ -370,6 +393,10 @@ bool FinalLowerGC::runOnFunction(Function &F) else if (callee == queueGCBindingFunc) { replaceInstruction(CI, lowerQueueGCBinding(CI, F), it); } + else if (callee == safepointFunc) { + lowerSafepoint(CI, F); + it = CI->eraseFromParent(); + } else { ++it; } diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index eaba9c7b10d98..9c8959ae7874a 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2324,22 +2324,6 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { IRBuilder<> builder(CI); builder.SetCurrentDebugLocation(CI->getDebugLoc()); - // Create a call to the `julia.gc_alloc_bytes` intrinsic, which is like - // `julia.gc_alloc_obj` except it doesn't set the tag. - auto allocBytesIntrinsic = getOrDeclare(jl_intrinsics::GCAllocBytes); - auto ptlsLoad = get_current_ptls_from_task(builder, CI->getArgOperand(0), tbaa_gcframe); - auto ptls = builder.CreateBitCast(ptlsLoad, Type::getInt8PtrTy(builder.getContext())); - auto newI = builder.CreateCall( - allocBytesIntrinsic, - { - ptls, - builder.CreateIntCast( - CI->getArgOperand(1), - allocBytesIntrinsic->getFunctionType()->getParamType(1), - false) - }); - newI->takeName(CI); - // LLVM alignment/bit check is not happy about addrspacecast and refuse // to remove write barrier because of it. // We pretty much only load using `T_size` so try our best to strip @@ -2378,7 +2362,36 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { builder.CreateAlignmentAssumption(DL, tag, 16); } } - // Set the tag. + + // Create a call to the `julia.gc_alloc_bytes` intrinsic, which is like + // `julia.gc_alloc_obj` except it specializes the call based on the constant + // size of the object to allocate, to save one indirection, and doesn't set + // the type tag. (Note that if the size is not a constant, it will call + // gc_alloc_obj, and will redundantly set the tag.) + auto allocBytesIntrinsic = getOrDeclare(jl_intrinsics::GCAllocBytes); + auto ptlsLoad = get_current_ptls_from_task(builder, CI->getArgOperand(0), tbaa_gcframe); + auto ptls = builder.CreateBitCast(ptlsLoad, Type::getInt8PtrTy(builder.getContext())); + auto newI = builder.CreateCall( + allocBytesIntrinsic, + { + ptls, + builder.CreateIntCast( + CI->getArgOperand(1), + allocBytesIntrinsic->getFunctionType()->getParamType(1), + false), + builder.CreatePtrToInt(tag, T_size), + }); + newI->takeName(CI); + + // Now, finally, set the tag. We do this in IR instead of in the C alloc + // function, to provide possible optimization opportunities. (I think? TBH + // the most recent editor of this code is not entirely clear on why we + // prefer to set the tag in the generated code. 
Providing optimization + opportunities is the most likely reason; the tradeoff is slightly + larger code size and increased compilation time, compiling this + instruction at every allocation site, rather than once in the C alloc + function.) + auto &M = *builder.GetInsertBlock()->getModule(); StoreInst *store = builder.CreateAlignedStore( tag, EmitTagPtr(builder, tag_type, newI), Align(sizeof(size_t))); store->setOrdering(AtomicOrdering::Unordered); diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index fa3437ffdce48..91850ebe8df07 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -119,6 +119,13 @@ namespace jl_intrinsics { static const char *POP_GC_FRAME_NAME = "julia.pop_gc_frame"; static const char *QUEUE_GC_ROOT_NAME = "julia.queue_gc_root"; static const char *QUEUE_GC_BINDING_NAME = "julia.queue_gc_binding"; + static const char *SAFEPOINT_NAME = "julia.safepoint"; + + static auto T_size_t(const JuliaPassContext &context) { + return sizeof(size_t) == sizeof(uint32_t) ? + Type::getInt32Ty(context.getLLVMContext()) : + Type::getInt64Ty(context.getLLVMContext()); + } // Annotates a function with attributes suitable for GC allocation // functions. Specifically, the return value is marked noalias and nonnull. @@ -150,9 +157,8 @@ namespace jl_intrinsics { FunctionType::get( context.T_prjlvalue, { Type::getInt8PtrTy(context.getLLVMContext()), - sizeof(size_t) == sizeof(uint32_t) ? - Type::getInt32Ty(context.getLLVMContext()) : - Type::getInt64Ty(context.getLLVMContext()) }, + T_size_t(context), + T_size_t(context) }, // type false), Function::ExternalLinkage, GC_ALLOC_BYTES_NAME); @@ -224,15 +230,37 @@ namespace jl_intrinsics { intrinsic->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); return intrinsic; }); + + const IntrinsicDescription safepoint( + SAFEPOINT_NAME, + [](const JuliaPassContext &context) { + auto T_size = getSizeTy(context.getLLVMContext()); + auto T_psize = T_size->getPointerTo(); + auto intrinsic = Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + {T_psize}, + false), + Function::ExternalLinkage, + SAFEPOINT_NAME); + intrinsic->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + return intrinsic; + }); } namespace jl_well_known { - static const char *GC_BIG_ALLOC_NAME = XSTR(jl_gc_big_alloc); - static const char *GC_POOL_ALLOC_NAME = XSTR(jl_gc_pool_alloc); + static const char *GC_BIG_ALLOC_NAME = XSTR(jl_gc_big_alloc_instrumented); + static const char *GC_POOL_ALLOC_NAME = XSTR(jl_gc_pool_alloc_instrumented); static const char *GC_QUEUE_ROOT_NAME = XSTR(jl_gc_queue_root); static const char *GC_QUEUE_BINDING_NAME = XSTR(jl_gc_queue_binding); static const char *GC_ALLOC_TYPED_NAME = XSTR(jl_gc_alloc_typed); + static auto T_size_t(const JuliaPassContext &context) { + return sizeof(size_t) == sizeof(uint32_t) ? + Type::getInt32Ty(context.getLLVMContext()) : + Type::getInt64Ty(context.getLLVMContext()); + } + using jl_intrinsics::addGCAllocAttributes; const WellKnownFunctionDescription GCBigAlloc( @@ -242,9 +270,8 @@ namespace jl_well_known { FunctionType::get( context.T_prjlvalue, { Type::getInt8PtrTy(context.getLLVMContext()), - sizeof(size_t) == sizeof(uint32_t) ? 
- Type::getInt32Ty(context.getLLVMContext()) : - Type::getInt64Ty(context.getLLVMContext()) }, + T_size_t(context), + T_size_t(context) }, false), Function::ExternalLinkage, GC_BIG_ALLOC_NAME); @@ -258,7 +285,7 @@ namespace jl_well_known { auto poolAllocFunc = Function::Create( FunctionType::get( context.T_prjlvalue, - { Type::getInt8PtrTy(context.getLLVMContext()), Type::getInt32Ty(context.getLLVMContext()), Type::getInt32Ty(context.getLLVMContext()) }, + { Type::getInt8PtrTy(context.getLLVMContext()), Type::getInt32Ty(context.getLLVMContext()), Type::getInt32Ty(context.getLLVMContext()), T_size_t(context) }, false), Function::ExternalLinkage, GC_POOL_ALLOC_NAME); @@ -301,10 +328,8 @@ namespace jl_well_known { FunctionType::get( context.T_prjlvalue, { Type::getInt8PtrTy(context.getLLVMContext()), - sizeof(size_t) == sizeof(uint32_t) ? - Type::getInt32Ty(context.getLLVMContext()) : - Type::getInt64Ty(context.getLLVMContext()), - Type::getInt8PtrTy(context.getLLVMContext()) }, + T_size_t(context), + T_size_t(context) }, // type false), Function::ExternalLinkage, GC_ALLOC_TYPED_NAME); diff --git a/src/llvm-pass-helpers.h b/src/llvm-pass-helpers.h index f25f9181ddb18..e54f39c05ba59 100644 --- a/src/llvm-pass-helpers.h +++ b/src/llvm-pass-helpers.h @@ -129,6 +129,9 @@ namespace jl_intrinsics { // `julia.queue_gc_binding`: an intrinsic that queues a binding for GC. extern const IntrinsicDescription queueGCBinding; + + // `julia.safepoint`: an intrinsic that triggers a GC safepoint. + extern const IntrinsicDescription safepoint; } // A namespace for well-known Julia runtime function descriptions. diff --git a/src/llvm-ptls.cpp b/src/llvm-ptls.cpp index a39a73c5393a2..be4cc3a1edf2a 100644 --- a/src/llvm-ptls.cpp +++ b/src/llvm-ptls.cpp @@ -207,7 +207,7 @@ void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter, IRBuilder<> builder(fastTerm->getParent()); fastTerm->removeFromParent(); MDNode *tbaa = tbaa_gcframe; - Value *prior = emit_gc_unsafe_enter(builder, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, pgcstack), tbaa)); + Value *prior = emit_gc_unsafe_enter(builder, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, pgcstack), tbaa), true); builder.Insert(fastTerm); phi->addIncoming(pgcstack, fastTerm->getParent()); // emit pre-return cleanup @@ -219,7 +219,7 @@ void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter, for (auto &BB : *pgcstack->getParent()->getParent()) { if (isa<ReturnInst>(BB.getTerminator())) { IRBuilder<> builder(BB.getTerminator()); - emit_gc_unsafe_leave(builder, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, phi), tbaa), last_gc_state); + emit_gc_unsafe_leave(builder, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, phi), tbaa), last_gc_state, true); } } } diff --git a/src/mach_dyld_atfork.tbd b/src/mach_dyld_atfork.tbd index 9a5d18099dbcf..c2cda4417ec38 100644 --- a/src/mach_dyld_atfork.tbd +++ b/src/mach_dyld_atfork.tbd @@ -21,5 +21,6 @@ install-name: '/usr/lib/libSystem.B.dylib' exports: - targets: [ arm64-macos, arm64e-macos, x86_64-macos, x86_64-maccatalyst, arm64-maccatalyst, arm64e-maccatalyst ] - symbols: [ __dyld_atfork_parent, __dyld_atfork_prepare ] + symbols: [ __dyld_atfork_parent, __dyld_atfork_prepare, + __dyld_dlopen_atfork_parent, __dyld_dlopen_atfork_prepare ] ... 
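Taken together, the pass-helper declarations above and `FinalLowerGC::lowerSafepoint` earlier make a GC poll nothing more than a volatile load from the safepoint page, which the runtime can arm with page protection to stop the world. A freestanding sketch of that mechanism under stated assumptions (POSIX `mmap`/`mprotect` only; `demo_*` names are hypothetical, and the real runtime parks faulting threads in its segfault handler rather than crashing):

#include <stddef.h>
#include <sys/mman.h>

static size_t *demo_signal_page; // stand-in for the runtime's safepoint page

static void demo_safepoint_init(void)
{
    demo_signal_page = mmap(NULL, 4096, PROT_READ,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
}

// What `julia.safepoint` lowers to: a volatile load the optimizer must keep,
// mirroring builder.CreateLoad(T_size, signal_page, /*isVolatile=*/true).
static inline void demo_safepoint(void)
{
    *(volatile size_t *)demo_signal_page;
}

// Collector side: arming the page makes every poll fault, so each mutator
// thread traps into the (not shown) signal handler and waits there.
static void demo_stop_the_world(void)
{
    mprotect(demo_signal_page, 4096, PROT_NONE);
}

static void demo_resume_the_world(void)
{
    mprotect(demo_signal_page, 4096, PROT_READ);
}

This is also why the new `safepoint_on_entry` codegen parameter (see the `src/julia.h` hunk earlier) is cheap to enable: the poll emitted at each function entry is a single load from an always-mapped page until a collection is actually requested.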
diff --git a/src/mtarraylist.c b/src/mtarraylist.c new file mode 100644 index 0000000000000..8bad44797dab4 --- /dev/null +++ b/src/mtarraylist.c @@ -0,0 +1,81 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#include "julia.h" +#include "julia_internal.h" +#include "julia_assert.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// this file provides some alternate API functions for small_arraylist (push and add) +// which can be safely observed from other threads concurrently +// only a single writer thread is permitted (or writers must hold a mutex) +// but there can be any number of observers + +typedef struct { + _Atomic(uint32_t) len; + uint32_t max; + _Atomic(_Atomic(void*)*) items; + _Atomic(void*) _space[SMALL_AL_N_INLINE]; +} small_mtarraylist_t; + +// change capacity to at least newlen +static void mtarraylist_resizeto(small_mtarraylist_t *a, size_t len, size_t newlen) JL_NOTSAFEPOINT +{ + size_t max = a->max; + if (newlen > max) { + size_t nm = max * 2; + if (nm == 0) + nm = 1; + while (newlen > nm) + nm *= 2; + void *olditems = (void*)jl_atomic_load_relaxed(&a->items); + void *p = calloc_s(nm * sizeof(void*)); + memcpy(p, olditems, len * sizeof(void*)); + jl_atomic_store_release(&a->items, (_Atomic(void*)*)p); + a->max = nm; + if (olditems != (void*)&a->_space[0]) { + jl_task_t *ct = jl_current_task; + jl_gc_add_quiescent(ct->ptls, (void**)olditems, free); + } + } +} + +// single-threaded +void mtarraylist_push(small_arraylist_t *_a, void *elt) +{ + small_mtarraylist_t *a = (small_mtarraylist_t*)_a; + size_t len = jl_atomic_load_relaxed(&a->len); + mtarraylist_resizeto(a, len, len + 1); + jl_atomic_store_release(&jl_atomic_load_relaxed(&a->items)[len], elt); + jl_atomic_store_release(&a->len, len + 1); +} + +// single-threaded +void mtarraylist_add(small_arraylist_t *_a, void *elt, size_t idx) +{ + small_mtarraylist_t *a = (small_mtarraylist_t*)_a; + size_t len = jl_atomic_load_relaxed(&a->len); + mtarraylist_resizeto(a, len, idx + 1); + jl_atomic_store_release(&jl_atomic_load_relaxed(&a->items)[idx], elt); + if (jl_atomic_load_relaxed(&a->len) < idx + 1) + jl_atomic_store_release(&a->len, idx + 1); +} + +// concurrent-safe +size_t mtarraylist_length(small_arraylist_t *_a) +{ + small_mtarraylist_t *a = (small_mtarraylist_t*)_a; + return jl_atomic_load_relaxed(&a->len); +} + +// concurrent-safe +void *mtarraylist_get(small_arraylist_t *_a, size_t idx) +{ + small_mtarraylist_t *a = (small_mtarraylist_t*)_a; + size_t len = jl_atomic_load_acquire(&a->len); + if (idx >= len) + return NULL; + return jl_atomic_load_relaxed(&jl_atomic_load_relaxed(&a->items)[idx]); +} diff --git a/src/options.h b/src/options.h index 82b71431ecea0..1ff0f0ce545bd 100644 --- a/src/options.h +++ b/src/options.h @@ -81,6 +81,11 @@ // Automatic Instrumenting Profiler //#define ENABLE_TIMINGS +// pool allocator configuration options + +// GC_SMALL_PAGE allocates objects in 4k pages +// #define GC_SMALL_PAGE + // method dispatch profiling -------------------------------------------------- @@ -134,10 +139,16 @@ // threadpools specification #define THREADPOOLS_NAME "JULIA_THREADPOOLS" +// GC threads +#define NUM_GC_THREADS_NAME "JULIA_NUM_GC_THREADS" + // affinitization behavior #define MACHINE_EXCLUSIVE_NAME "JULIA_EXCLUSIVE" #define DEFAULT_MACHINE_EXCLUSIVE 0 +// heartbeats +#define JL_HEARTBEAT_THREAD + // partr -- parallel tasks runtime options ------------------------------------ // multiq diff --git a/src/partr.c b/src/partr.c index bbcf5048cace2..c86dae7239097 --- 
a/src/partr.c +++ b/src/partr.c @@ -78,7 +78,7 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA // GC functions used extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, - jl_gc_mark_sp_t *sp, jl_value_t *obj) JL_NOTSAFEPOINT; + jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; // parallel task runtime // --- @@ -108,7 +108,48 @@ void jl_init_threadinginfra(void) void JL_NORETURN jl_finish_task(jl_task_t *t); -// thread function: used by all except the main thread +static inline int may_mark(void) JL_NOTSAFEPOINT +{ + return (jl_atomic_load(&gc_n_threads_marking) > 0); +} + +static inline int may_sweep(jl_ptls_t ptls) JL_NOTSAFEPOINT +{ + return (jl_atomic_load(&ptls->gc_sweeps_requested) > 0); +} + +// gc thread function +void jl_gc_threadfun(void *arg) +{ + jl_threadarg_t *targ = (jl_threadarg_t*)arg; + + // initialize this thread (set tid and create heap) + jl_ptls_t ptls = jl_init_threadtls(targ->tid); + + // wait for all threads + jl_gc_state_set(ptls, JL_GC_STATE_WAITING, 0); + uv_barrier_wait(targ->barrier); + + // free the thread argument here + free(targ); + + while (1) { + uv_mutex_lock(&gc_threads_lock); + while (!may_mark() && !may_sweep(ptls)) { + uv_cond_wait(&gc_threads_cond, &gc_threads_lock); + } + uv_mutex_unlock(&gc_threads_lock); + if (may_mark()) { + gc_mark_loop_parallel(ptls, 0); + } + if (may_sweep(ptls)) { // not an else! + gc_sweep_pool_parallel(); + jl_atomic_fetch_add(&ptls->gc_sweeps_requested, -1); + } + } +} + +// thread function: used by all mutator threads except the main thread void jl_threadfun(void *arg) { jl_threadarg_t *targ = (jl_threadarg_t*)arg; @@ -419,7 +460,6 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, uv_mutex_lock(&ptls->sleep_lock); while (may_sleep(ptls)) { uv_cond_wait(&ptls->wake_signal, &ptls->sleep_lock); - // TODO: help with gc work here, if applicable } assert(jl_atomic_load_relaxed(&ptls->sleep_check_state) == not_sleeping); uv_mutex_unlock(&ptls->sleep_lock); diff --git a/src/signal-handling.c b/src/signal-handling.c index 391a97055af84..7137cafdef9c4 100644 --- a/src/signal-handling.c +++ b/src/signal-handling.c @@ -474,7 +474,7 @@ void jl_critical_error(int sig, int si_code, bt_context_t *context, jl_task_t *c *bt_size = n = rec_backtrace_ctx(bt_data, JL_MAX_BT_SIZE, context, NULL); } for (i = 0; i < n; i += jl_bt_entry_size(bt_data + i)) { - jl_print_bt_entry_codeloc(bt_data + i); + jl_print_bt_entry_codeloc(sig, bt_data + i); } jl_gc_debug_print_status(); jl_gc_debug_critical_error(); diff --git a/src/signals-mach.c b/src/signals-mach.c index 8ac7e5301d7ad..a772ecbd0a901 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -36,6 +36,9 @@ extern int _keymgr_set_lockmode_processwide_ptr(unsigned int key, unsigned int m extern void _dyld_atfork_prepare(void) __attribute__((weak_import)); extern void _dyld_atfork_parent(void) __attribute__((weak_import)); //extern void _dyld_fork_child(void) __attribute__((weak_import)); +extern void _dyld_dlopen_atfork_prepare(void) __attribute__((weak_import)); +extern void _dyld_dlopen_atfork_parent(void) __attribute__((weak_import)); +//extern void _dyld_dlopen_atfork_child(void) __attribute__((weak_import)); static void attach_exception_port(thread_port_t thread, int segv_only); @@ -381,12 +384,12 @@ static void attach_exception_port(thread_port_t thread, int segv_only) HANDLE_MACH_ERROR("thread_set_exception_ports", ret); } -static int jl_thread_suspend_and_get_state2(int tid, 
host_thread_state_t *ctx) +static int jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx) JL_NOTSAFEPOINT { jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; if (ptls2 == NULL) // this thread is not alive return 0; - jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL; + jl_task_t *ct2 = jl_atomic_load_relaxed(&ptls2->current_task); if (ct2 == NULL) // this thread is already dead return 0; @@ -404,18 +407,18 @@ static int jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx) return 1; } -static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t **ctx) +int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx) { (void)timeout; - static host_thread_state_t state; + host_thread_state_t state; if (!jl_thread_suspend_and_get_state2(tid, &state)) { - *ctx = NULL; - return; + return 0; } - *ctx = (unw_context_t*)&state; + *ctx = *(unw_context_t*)&state; + return 1; } -static void jl_thread_resume(int tid, int sig) +void jl_thread_resume(int tid) { jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; mach_port_t thread = pthread_mach_thread_np(ptls2->system_id); @@ -568,7 +571,12 @@ static int jl_lock_profile_mach(int dlsymlock) // workaround for old keymgr bugs void *unused = NULL; int keymgr_locked = _keymgr_get_and_lock_processwide_ptr_2(KEYMGR_GCC3_DW2_OBJ_LIST, &unused) == 0; - // workaround for new dlsym4 bugs (API and bugs introduced in macOS 12.1) + // workaround for new dlsym4 bugs in the workaround for dlsym bugs: _dyld_atfork_prepare + // acquires its locks in the wrong order, but fortunately we happen to be able to guard it + // with this call to force it to prevent that TSAN violation from causing a deadlock + if (dlsymlock && _dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) + _dyld_dlopen_atfork_prepare(); + // workaround for new dlsym4 bugs (API and bugs introduced circa macOS 12.1) if (dlsymlock && _dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) _dyld_atfork_prepare(); return keymgr_locked; @@ -576,15 +584,24 @@ static int jl_lock_profile_mach(int dlsymlock) static void jl_unlock_profile_mach(int dlsymlock, int keymgr_locked) { - if (dlsymlock && _dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) \ - _dyld_atfork_parent(); \ + if (dlsymlock && _dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) + _dyld_atfork_parent(); + if (dlsymlock && _dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) + _dyld_dlopen_atfork_parent(); if (keymgr_locked) _keymgr_unlock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST); jl_unlock_profile(); } -#define jl_lock_profile() int keymgr_locked = jl_lock_profile_mach(1) -#define jl_unlock_profile() jl_unlock_profile_mach(1, keymgr_locked) +int jl_lock_stackwalk(void) +{ + return jl_lock_profile_mach(1); +} + +void jl_unlock_stackwalk(int lockret) +{ + jl_unlock_profile_mach(1, lockret); +} void *mach_profile_listener(void *arg) { @@ -615,15 +632,19 @@ void *mach_profile_listener(void *arg) break; } + if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) + _dyld_dlopen_atfork_prepare(); if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) _dyld_atfork_prepare(); // briefly acquire the dlsym lock host_thread_state_t state; - if (!jl_thread_suspend_and_get_state2(i, &state)) - continue; + int valid_thread = jl_thread_suspend_and_get_state2(i, &state); unw_context_t *uc = (unw_context_t*)&state; if (_dyld_atfork_prepare != NULL && 
_dyld_atfork_parent != NULL) _dyld_atfork_parent(); // quickly release the dlsym lock - + if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) + _dyld_dlopen_atfork_parent(); + if (!valid_thread) + continue; if (running) { #ifdef LLVMLIBUNWIND /* @@ -677,7 +698,7 @@ void *mach_profile_listener(void *arg) bt_data_prof[bt_size_cur++].uintptr = 0; } // We're done! Resume the thread. - jl_thread_resume(i, 0); + jl_thread_resume(i); } jl_unlock_profile_mach(0, keymgr_locked); if (running) { diff --git a/src/signals-unix.c b/src/signals-unix.c index 8d02aa96a8586..f38389913aa59 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -293,6 +293,18 @@ int exc_reg_is_write_fault(uintptr_t esr) { #include "signals-mach.c" #else +int jl_lock_stackwalk(void) +{ + jl_lock_profile(); + return 0; +} + +void jl_unlock_stackwalk(int lockret) +{ + (void)lockret; + jl_unlock_profile(); +} + #if defined(_OS_LINUX_) && (defined(_CPU_X86_64_) || defined(_CPU_X86_)) int is_write_fault(void *context) { @@ -386,12 +398,12 @@ JL_NO_ASAN static void segv_handler(int sig, siginfo_t *info, void *context) } #if !defined(JL_DISABLE_LIBUNWIND) -static unw_context_t *signal_context; +static bt_context_t *signal_context; pthread_mutex_t in_signal_lock; static pthread_cond_t exit_signal_cond; static pthread_cond_t signal_caught_cond; -static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t **ctx) +int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx) { struct timespec ts; clock_gettime(CLOCK_REALTIME, &ts); @@ -401,9 +413,8 @@ static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL; if (ct2 == NULL) { // this thread is not alive or already dead - *ctx = NULL; pthread_mutex_unlock(&in_signal_lock); - return; + return 0; } jl_atomic_store_release(&ptls2->signal_request, 1); pthread_kill(ptls2->system_id, SIGUSR2); @@ -412,9 +423,8 @@ static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t if (err == ETIMEDOUT) { sig_atomic_t request = 1; if (jl_atomic_cmpswap(&ptls2->signal_request, &request, 0)) { - *ctx = NULL; pthread_mutex_unlock(&in_signal_lock); - return; + return 0; } // Request is either now 0 (meaning the other thread is waiting for // exit_signal_cond already), @@ -431,15 +441,16 @@ static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t // checking it is 0, and add an acquire barrier for good measure) int request = jl_atomic_load_acquire(&ptls2->signal_request); assert(request == 0); (void) request; - *ctx = signal_context; + jl_atomic_store_release(&ptls2->signal_request, 1); // prepare to resume normally + *ctx = *signal_context; + return 1; } -static void jl_thread_resume(int tid, int sig) +void jl_thread_resume(int tid) { jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; - jl_atomic_store_release(&ptls2->signal_request, sig == -1 ? 
3 : 1); pthread_cond_broadcast(&exit_signal_cond); - pthread_cond_wait(&signal_caught_cond, &in_signal_lock); // wait for thread to acknowledge + pthread_cond_wait(&signal_caught_cond, &in_signal_lock); // wait for thread to acknowledge (so that signal_request doesn't get mixed up) // The other thread is waiting to leave exit_signal_cond (verify that here by // checking it is 0, and add an acquire barrier for good measure) int request = jl_atomic_load_acquire(&ptls2->signal_request); @@ -474,14 +485,14 @@ CFI_NORETURN static void jl_exit_thread0(int signo, jl_bt_element_t *bt_data, size_t bt_size) { jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; - unw_context_t *signal_context; + bt_context_t signal_context; // This also makes sure `sleep` is aborted. - jl_thread_suspend_and_get_state(0, 30, &signal_context); - if (signal_context != NULL) { + if (jl_thread_suspend_and_get_state(0, 30, &signal_context)) { thread0_exit_signo = signo; ptls2->bt_size = bt_size; // <= JL_MAX_BT_SIZE memcpy(ptls2->bt_data, bt_data, ptls2->bt_size * sizeof(bt_data[0])); - jl_thread_resume(0, -1); // resume with message 3 (call jl_exit_thread0_cb) + jl_atomic_store_release(&ptls2->signal_request, 3); + jl_thread_resume(0); // resume with message 3 (call jl_exit_thread0_cb) } else { // thread 0 is gone? just do the exit ourself @@ -877,11 +888,11 @@ static void *signal_listener(void *arg) int nthreads = jl_atomic_load_acquire(&jl_n_threads); bt_size = 0; #if !defined(JL_DISABLE_LIBUNWIND) - unw_context_t *signal_context; + bt_context_t signal_context; // sample each thread, round-robin style in reverse order // (so that thread zero gets notified last) if (critical || profile) { - jl_lock_profile(); + int lockret = jl_lock_stackwalk(); int *randperm; if (profile) randperm = profile_get_randperm(nthreads); @@ -889,8 +900,7 @@ static void *signal_listener(void *arg) // Stop the threads in the random or reverse round-robin order. int i = profile ? 
randperm[idx] : idx; // notify thread to stop - jl_thread_suspend_and_get_state(i, 1, &signal_context); - if (signal_context == NULL) + if (!jl_thread_suspend_and_get_state(i, 1, &signal_context)) continue; // do backtrace on thread contexts for critical signals @@ -898,7 +908,7 @@ static void *signal_listener(void *arg) if (critical) { bt_size += rec_backtrace_ctx(bt_data + bt_size, JL_MAX_BT_SIZE / nthreads - 1, - signal_context, NULL); + &signal_context, NULL); bt_data[bt_size++].uintptr = 0; } @@ -920,7 +930,7 @@ static void *signal_listener(void *arg) } else { // Get backtrace data bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, - bt_size_max - bt_size_cur - 1, signal_context, NULL); + bt_size_max - bt_size_cur - 1, &signal_context, NULL); } jl_set_safe_restore(old_buf); @@ -945,9 +955,9 @@ static void *signal_listener(void *arg) } // notify thread to resume - jl_thread_resume(i, sig); + jl_thread_resume(i); } - jl_unlock_profile(); + jl_unlock_stackwalk(lockret); } #ifndef HAVE_MACH if (profile && running) { @@ -985,7 +995,7 @@ static void *signal_listener(void *arg) jl_safe_printf("\nsignal (%d): %s\n", sig, strsignal(sig)); size_t i; for (i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) { - jl_print_bt_entry_codeloc(bt_data + i); + jl_print_bt_entry_codeloc(-1, bt_data + i); } } } diff --git a/src/signals-win.c b/src/signals-win.c index 5dd6b34558ca6..d1f83d6bfdcc4 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -327,7 +327,7 @@ LONG WINAPI jl_exception_handler(struct _EXCEPTION_POINTERS *ExceptionInfo) jl_safe_printf("UNKNOWN"); break; } jl_safe_printf(" at 0x%Ix -- ", (size_t)ExceptionInfo->ExceptionRecord->ExceptionAddress); - jl_print_native_codeloc((uintptr_t)ExceptionInfo->ExceptionRecord->ExceptionAddress); + jl_print_native_codeloc("", (uintptr_t)ExceptionInfo->ExceptionRecord->ExceptionAddress); jl_critical_error(0, 0, ExceptionInfo->ContextRecord, ct); static int recursion = 0; @@ -344,6 +344,54 @@ JL_DLLEXPORT void jl_install_sigint_handler(void) static volatile HANDLE hBtThread = 0; +int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx) +{ + (void)timeout; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + if (ptls2 == NULL) // this thread is not alive + return 0; + jl_task_t *ct2 = jl_atomic_load_relaxed(&ptls2->current_task); + if (ct2 == NULL) // this thread is already dead + return 0; + HANDLE hThread = ptls2->system_id; + if ((DWORD)-1 == SuspendThread(hThread)) + return 0; + assert(sizeof(*ctx) == sizeof(CONTEXT)); + memset(ctx, 0, sizeof(CONTEXT)); + ctx->ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; + if (!GetThreadContext(hThread, ctx)) { + if ((DWORD)-1 == ResumeThread(hThread)) + abort(); + return 0; + } + return 1; +} + +void jl_thread_resume(int tid) +{ + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + HANDLE hThread = ptls2->system_id; + if ((DWORD)-1 == ResumeThread(hThread)) { + fputs("failed to resume main thread! 
aborting.", stderr); + abort(); + } +} + +int jl_lock_stackwalk(void) +{ + uv_mutex_lock(&jl_in_stackwalk); + jl_lock_profile(); + return 0; +} + +void jl_unlock_stackwalk(int lockret) +{ + (void)lockret; + jl_unlock_profile(); + uv_mutex_unlock(&jl_in_stackwalk); +} + + static DWORD WINAPI profile_bt( LPVOID lparam ) { // Note: illegal to use jl_* functions from this thread except for profiling-specific functions @@ -357,58 +405,45 @@ static DWORD WINAPI profile_bt( LPVOID lparam ) continue; } else { - uv_mutex_lock(&jl_in_stackwalk); - jl_lock_profile(); - if ((DWORD)-1 == SuspendThread(hMainThread)) { - fputs("failed to suspend main thread. aborting profiling.", stderr); - break; - } + // TODO: bring this up to parity with other OS by adding loop over tid here + int lockret = jl_lock_stackwalk(); CONTEXT ctxThread; - memset(&ctxThread, 0, sizeof(CONTEXT)); - ctxThread.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; - if (!GetThreadContext(hMainThread, &ctxThread)) { - fputs("failed to get context from main thread. aborting profiling.", stderr); + if (!jl_thread_suspend_and_get_state(0, 0, &ctxThread)) { + jl_unlock_stackwalk(lockret); + fputs("failed to suspend main thread. aborting profiling.", stderr); jl_profile_stop_timer(); + break; } - else { - // Get backtrace data - bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, - bt_size_max - bt_size_cur - 1, &ctxThread, NULL); + // Get backtrace data + bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, + bt_size_max - bt_size_cur - 1, &ctxThread, NULL); - jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; // given only profiling hMainThread + jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; // given only profiling hMainThread - // store threadid but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; + // store threadid but add 1 as 0 is preserved to indicate end of block + bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; - // store task id (never null) - bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); + // store task id (never null) + bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); - // store cpu cycle clock - bt_data_prof[bt_size_cur++].uintptr = cycleclock(); + // store cpu cycle clock + bt_data_prof[bt_size_cur++].uintptr = cycleclock(); - // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1; + // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block + bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1; - // Mark the end of this block with two 0's - bt_data_prof[bt_size_cur++].uintptr = 0; - bt_data_prof[bt_size_cur++].uintptr = 0; - } - jl_unlock_profile(); - uv_mutex_unlock(&jl_in_stackwalk); - if ((DWORD)-1 == ResumeThread(hMainThread)) { - jl_profile_stop_timer(); - fputs("failed to resume main thread! 
aborting.", stderr); - jl_gc_debug_critical_error(); - abort(); - } + // Mark the end of this block with two 0's + bt_data_prof[bt_size_cur++].uintptr = 0; + bt_data_prof[bt_size_cur++].uintptr = 0; + jl_unlock_stackwalk(lockret); + jl_thread_resume(0); jl_check_profile_autostop(); } } } - jl_unlock_profile(); uv_mutex_unlock(&jl_in_stackwalk); jl_profile_stop_timer(); - hBtThread = 0; + hBtThread = NULL; return 0; } diff --git a/src/stackwalk.c b/src/stackwalk.c index 6c5eb6f4537cc..d6c71a2f1b01b 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -612,22 +612,25 @@ JL_DLLEXPORT jl_value_t *jl_lookup_code_address(void *ip, int skipC) return rs; } -static void jl_safe_print_codeloc(const char* func_name, const char* file_name, +static void jl_safe_print_codeloc(const char *pre_str, + const char* func_name, const char* file_name, int line, int inlined) JL_NOTSAFEPOINT { const char *inlined_str = inlined ? " [inlined]" : ""; if (line != -1) { - jl_safe_printf("%s at %s:%d%s\n", func_name, file_name, line, inlined_str); + jl_safe_printf("%s%s at %s:%d%s\n", + pre_str, func_name, file_name, line, inlined_str); } else { - jl_safe_printf("%s at %s (unknown line)%s\n", func_name, file_name, inlined_str); + jl_safe_printf("%s%s at %s (unknown line)%s\n", + pre_str, func_name, file_name, inlined_str); } } // Print function, file and line containing native instruction pointer `ip` by // looking up debug info. Prints multiple such frames when `ip` points to // inlined code. -void jl_print_native_codeloc(uintptr_t ip) JL_NOTSAFEPOINT +void jl_print_native_codeloc(char *pre_str, uintptr_t ip) JL_NOTSAFEPOINT { // This function is not allowed to reference any TLS variables since // it can be called from an unmanaged thread on OSX. @@ -639,10 +642,11 @@ void jl_print_native_codeloc(uintptr_t ip) JL_NOTSAFEPOINT for (i = 0; i < n; i++) { jl_frame_t frame = frames[i]; if (!frame.func_name) { - jl_safe_printf("unknown function (ip: %p)\n", (void*)ip); + jl_safe_printf("%sunknown function (ip: %p)\n", pre_str, (void*)ip); } else { - jl_safe_print_codeloc(frame.func_name, frame.file_name, frame.line, frame.inlined); + jl_safe_print_codeloc(pre_str, frame.func_name, + frame.file_name, frame.line, frame.inlined); free(frame.func_name); free(frame.file_name); } @@ -651,10 +655,17 @@ void jl_print_native_codeloc(uintptr_t ip) JL_NOTSAFEPOINT } // Print code location for backtrace buffer entry at *bt_entry -void jl_print_bt_entry_codeloc(jl_bt_element_t *bt_entry) JL_NOTSAFEPOINT +void jl_print_bt_entry_codeloc(int sig, jl_bt_element_t *bt_entry) JL_NOTSAFEPOINT { + char sig_str[32], pre_str[64]; + sig_str[0] = '\0'; + if (sig != -1) { + snprintf(sig_str, 32, "signal (%d) ", sig); + } + snprintf(pre_str, 64, "%sthread (%d) ", sig_str, jl_threadid() + 1); + if (jl_bt_is_native(bt_entry)) { - jl_print_native_codeloc(bt_entry[0].uintptr); + jl_print_native_codeloc(pre_str, bt_entry[0].uintptr); } else if (jl_bt_entry_tag(bt_entry) == JL_BT_INTERP_FRAME_TAG) { size_t ip = jl_bt_entry_header(bt_entry); @@ -680,7 +691,7 @@ void jl_print_bt_entry_codeloc(jl_bt_element_t *bt_entry) JL_NOTSAFEPOINT method = (jl_value_t*)((jl_method_t*)method)->name; if (jl_is_symbol(method)) func_name = jl_symbol_name((jl_sym_t*)method); - jl_safe_print_codeloc(func_name, jl_symbol_name(locinfo->file), + jl_safe_print_codeloc(pre_str, func_name, jl_symbol_name(locinfo->file), locinfo->line, locinfo->inlined_at); debuginfoloc = locinfo->inlined_at; } @@ -854,7 +865,7 @@ _os_ptr_munge(uintptr_t ptr) extern bt_context_t 
*jl_to_bt_context(void *sigctx); -void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT +static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT { jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; @@ -863,222 +874,242 @@ void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0); return; } - if (t->copy_stack || !t->started || t->stkbuf == NULL) - return; - int16_t old = -1; - if (!jl_atomic_cmpswap(&t->tid, &old, ptls->tid) && old != ptls->tid) - return; bt_context_t *context = NULL; -#if defined(_OS_WINDOWS_) bt_context_t c; - memset(&c, 0, sizeof(c)); - _JUMP_BUFFER *mctx = (_JUMP_BUFFER*)&t->ctx.ctx.uc_mcontext; + int16_t old = -1; + while (!jl_atomic_cmpswap(&t->tid, &old, ptls->tid) && old != ptls->tid) { + int lockret = jl_lock_stackwalk(); + // if this task is already running somewhere, we need to stop the thread it is running on and query its state + if (!jl_thread_suspend_and_get_state(old, 0, &c)) { + jl_unlock_stackwalk(lockret); + return; + } + jl_unlock_stackwalk(lockret); + if (jl_atomic_load_relaxed(&t->tid) == old) { + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[old]; + if (ptls2->previous_task == t || // we might print the wrong stack here, since we can't know whether we executed the swapcontext yet or not, but it at least avoids trying to access the state inside uc_mcontext which might not be set yet + (ptls2->previous_task == NULL && jl_atomic_load_relaxed(&ptls2->current_task) == t)) { // this case should be always accurate + // use the thread context for the unwind state + context = &c; + } + break; + } + // got the wrong thread stopped, try again + jl_thread_resume(old); + } + if (context == NULL && (!t->copy_stack && t->started && t->stkbuf != NULL)) { + // need to read the context from the task stored state +#if defined(_OS_WINDOWS_) + memset(&c, 0, sizeof(c)); + _JUMP_BUFFER *mctx = (_JUMP_BUFFER*)&t->ctx.ctx.uc_mcontext; #if defined(_CPU_X86_64_) - c.Rbx = mctx->Rbx; - c.Rsp = mctx->Rsp; - c.Rbp = mctx->Rbp; - c.Rsi = mctx->Rsi; - c.Rdi = mctx->Rdi; - c.R12 = mctx->R12; - c.R13 = mctx->R13; - c.R14 = mctx->R14; - c.R15 = mctx->R15; - c.Rip = mctx->Rip; - memcpy(&c.Xmm6, &mctx->Xmm6, 10 * sizeof(mctx->Xmm6)); // Xmm6-Xmm15 + c.Rbx = mctx->Rbx; + c.Rsp = mctx->Rsp; + c.Rbp = mctx->Rbp; + c.Rsi = mctx->Rsi; + c.Rdi = mctx->Rdi; + c.R12 = mctx->R12; + c.R13 = mctx->R13; + c.R14 = mctx->R14; + c.R15 = mctx->R15; + c.Rip = mctx->Rip; + memcpy(&c.Xmm6, &mctx->Xmm6, 10 * sizeof(mctx->Xmm6)); // Xmm6-Xmm15 #else - c.Eip = mctx->Eip; - c.Esp = mctx->Esp; - c.Ebp = mctx->Ebp; + c.Eip = mctx->Eip; + c.Esp = mctx->Esp; + c.Ebp = mctx->Ebp; #endif - context = &c; + context = &c; #elif defined(JL_HAVE_UNW_CONTEXT) - context = &t->ctx.ctx; + context = &t->ctx.ctx; #elif defined(JL_HAVE_UCONTEXT) - context = jl_to_bt_context(&t->ctx.ctx); + context = jl_to_bt_context(&t->ctx.ctx); #elif defined(JL_HAVE_ASM) - bt_context_t c; - memset(&c, 0, sizeof(c)); - #if defined(_OS_LINUX_) && defined(__GLIBC__) - __jmp_buf *mctx = &t->ctx.ctx.uc_mcontext->__jmpbuf; - mcontext_t *mc = &c.uc_mcontext; - #if defined(_CPU_X86_) - // https://github.com/bminor/glibc/blame/master/sysdeps/i386/__longjmp.S - // https://github.com/bminor/glibc/blame/master/sysdeps/i386/jmpbuf-offsets.h - // https://github.com/bminor/musl/blame/master/src/setjmp/i386/longjmp.s - mc->gregs[REG_EBX] = (*mctx)[0]; - mc->gregs[REG_ESI] = (*mctx)[1]; - mc->gregs[REG_EDI] = (*mctx)[2]; - mc->gregs[REG_EBP] = 
(*mctx)[3]; - mc->gregs[REG_ESP] = (*mctx)[4]; - mc->gregs[REG_EIP] = (*mctx)[5]; - // ifdef PTR_DEMANGLE ? - mc->gregs[REG_ESP] = ptr_demangle(mc->gregs[REG_ESP]); - mc->gregs[REG_EIP] = ptr_demangle(mc->gregs[REG_EIP]); - context = &c; - #elif defined(_CPU_X86_64_) - // https://github.com/bminor/glibc/blame/master/sysdeps/x86_64/__longjmp.S - // https://github.com/bminor/glibc/blame/master/sysdeps/x86_64/jmpbuf-offsets.h - // https://github.com/bminor/musl/blame/master/src/setjmp/x86_64/setjmp.s - mc->gregs[REG_RBX] = (*mctx)[0]; - mc->gregs[REG_RBP] = (*mctx)[1]; - mc->gregs[REG_R12] = (*mctx)[2]; - mc->gregs[REG_R13] = (*mctx)[3]; - mc->gregs[REG_R14] = (*mctx)[4]; - mc->gregs[REG_R15] = (*mctx)[5]; - mc->gregs[REG_RSP] = (*mctx)[6]; - mc->gregs[REG_RIP] = (*mctx)[7]; - // ifdef PTR_DEMANGLE ? - mc->gregs[REG_RBP] = ptr_demangle(mc->gregs[REG_RBP]); - mc->gregs[REG_RSP] = ptr_demangle(mc->gregs[REG_RSP]); - mc->gregs[REG_RIP] = ptr_demangle(mc->gregs[REG_RIP]); - context = &c; - #elif defined(_CPU_ARM_) - // https://github.com/bminor/glibc/blame/master/sysdeps/arm/__longjmp.S - // https://github.com/bminor/glibc/blame/master/sysdeps/arm/include/bits/setjmp.h - // https://github.com/bminor/musl/blame/master/src/setjmp/arm/longjmp.S - mc->arm_sp = (*mctx)[0]; - mc->arm_lr = (*mctx)[1]; - mc->arm_r4 = (*mctx)[2]; // aka v1 - mc->arm_r5 = (*mctx)[3]; // aka v2 - mc->arm_r6 = (*mctx)[4]; // aka v3 - mc->arm_r7 = (*mctx)[5]; // aka v4 - mc->arm_r8 = (*mctx)[6]; // aka v5 - mc->arm_r9 = (*mctx)[7]; // aka v6 aka sb - mc->arm_r10 = (*mctx)[8]; // aka v7 aka sl - mc->arm_fp = (*mctx)[10]; // aka v8 aka r11 - // ifdef PTR_DEMANGLE ? - mc->arm_sp = ptr_demangle(mc->arm_sp); - mc->arm_lr = ptr_demangle(mc->arm_lr); - mc->arm_pc = mc->arm_lr; - context = &c; - #elif defined(_CPU_AARCH64_) - // https://github.com/bminor/glibc/blame/master/sysdeps/aarch64/__longjmp.S - // https://github.com/bminor/glibc/blame/master/sysdeps/aarch64/jmpbuf-offsets.h - // https://github.com/bminor/musl/blame/master/src/setjmp/aarch64/longjmp.s - // https://github.com/libunwind/libunwind/blob/ec171c9ba7ea3abb2a1383cee2988a7abd483a1f/src/aarch64/unwind_i.h#L62 - unw_fpsimd_context_t *mcfp = (unw_fpsimd_context_t*)&mc->__reserved; - mc->regs[19] = (*mctx)[0]; - mc->regs[20] = (*mctx)[1]; - mc->regs[21] = (*mctx)[2]; - mc->regs[22] = (*mctx)[3]; - mc->regs[23] = (*mctx)[4]; - mc->regs[24] = (*mctx)[5]; - mc->regs[25] = (*mctx)[6]; - mc->regs[26] = (*mctx)[7]; - mc->regs[27] = (*mctx)[8]; - mc->regs[28] = (*mctx)[9]; - mc->regs[29] = (*mctx)[10]; // aka fp - mc->regs[30] = (*mctx)[11]; // aka lr - // Yes, they did skip 12 why writing the code originally; and, no, I do not know why. - mc->sp = (*mctx)[13]; - mcfp->vregs[7] = (*mctx)[14]; // aka d8 - mcfp->vregs[8] = (*mctx)[15]; // aka d9 - mcfp->vregs[9] = (*mctx)[16]; // aka d10 - mcfp->vregs[10] = (*mctx)[17]; // aka d11 - mcfp->vregs[11] = (*mctx)[18]; // aka d12 - mcfp->vregs[12] = (*mctx)[19]; // aka d13 - mcfp->vregs[13] = (*mctx)[20]; // aka d14 - mcfp->vregs[14] = (*mctx)[21]; // aka d15 - // ifdef PTR_DEMANGLE ? 
- mc->sp = ptr_demangle(mc->sp); - mc->regs[30] = ptr_demangle(mc->regs[30]); - mc->pc = mc->regs[30]; - context = &c; - #else - #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown linux") - (void)mc; - (void)c; - (void)mctx; - #endif - #elif defined(_OS_DARWIN_) - sigjmp_buf *mctx = &t->ctx.ctx.uc_mcontext; - #if defined(_CPU_X86_64_) - // from https://github.com/apple/darwin-libplatform/blob/main/src/setjmp/x86_64/_setjmp.s - x86_thread_state64_t *mc = (x86_thread_state64_t*)&c; - mc->__rbx = ((uint64_t*)mctx)[0]; - mc->__rbp = ((uint64_t*)mctx)[1]; - mc->__rsp = ((uint64_t*)mctx)[2]; - mc->__r12 = ((uint64_t*)mctx)[3]; - mc->__r13 = ((uint64_t*)mctx)[4]; - mc->__r14 = ((uint64_t*)mctx)[5]; - mc->__r15 = ((uint64_t*)mctx)[6]; - mc->__rip = ((uint64_t*)mctx)[7]; - // added in libsystem_plaform 177.200.16 (macOS Mojave 10.14.3) - // prior to that _os_ptr_munge_token was (hopefully) typically 0, - // so x ^ 0 == x and this is a no-op - mc->__rbp = _OS_PTR_UNMUNGE(mc->__rbp); - mc->__rsp = _OS_PTR_UNMUNGE(mc->__rsp); - mc->__rip = _OS_PTR_UNMUNGE(mc->__rip); - context = &c; - #elif defined(_CPU_AARCH64_) - // from https://github.com/apple/darwin-libplatform/blob/main/src/setjmp/arm64/setjmp.s - // https://github.com/apple/darwin-xnu/blob/main/osfmk/mach/arm/_structs.h - // https://github.com/llvm/llvm-project/blob/7714e0317520207572168388f22012dd9e152e9e/libunwind/src/Registers.hpp -> Registers_arm64 - arm_thread_state64_t *mc = (arm_thread_state64_t*)&c; - mc->__x[19] = ((uint64_t*)mctx)[0]; - mc->__x[20] = ((uint64_t*)mctx)[1]; - mc->__x[21] = ((uint64_t*)mctx)[2]; - mc->__x[22] = ((uint64_t*)mctx)[3]; - mc->__x[23] = ((uint64_t*)mctx)[4]; - mc->__x[24] = ((uint64_t*)mctx)[5]; - mc->__x[25] = ((uint64_t*)mctx)[6]; - mc->__x[26] = ((uint64_t*)mctx)[7]; - mc->__x[27] = ((uint64_t*)mctx)[8]; - mc->__x[28] = ((uint64_t*)mctx)[9]; - mc->__x[10] = ((uint64_t*)mctx)[10]; - mc->__x[11] = ((uint64_t*)mctx)[11]; - mc->__x[12] = ((uint64_t*)mctx)[12]; - // 13 is reserved/unused - double *mcfp = (double*)&mc[1]; - mcfp[7] = ((uint64_t*)mctx)[14]; // aka d8 - mcfp[8] = ((uint64_t*)mctx)[15]; // aka d9 - mcfp[9] = ((uint64_t*)mctx)[16]; // aka d10 - mcfp[10] = ((uint64_t*)mctx)[17]; // aka d11 - mcfp[11] = ((uint64_t*)mctx)[18]; // aka d12 - mcfp[12] = ((uint64_t*)mctx)[19]; // aka d13 - mcfp[13] = ((uint64_t*)mctx)[20]; // aka d14 - mcfp[14] = ((uint64_t*)mctx)[21]; // aka d15 - mc->__fp = _OS_PTR_UNMUNGE(mc->__x[10]); - mc->__lr = _OS_PTR_UNMUNGE(mc->__x[11]); - mc->__x[12] = _OS_PTR_UNMUNGE(mc->__x[12]); - mc->__sp = mc->__x[12]; - // libunwind is broken for signed-pointers, but perhaps best not to leave the signed pointer lying around either - mc->__pc = ptrauth_strip(mc->__lr, 0); - mc->__pad = 0; // aka __ra_sign_state = not signed - context = &c; - #else - #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown darwin") - (void)mctx; - (void)c; - #endif - #elif defined(_OS_FREEBSD_) && defined(_CPU_X86_64_) - sigjmp_buf *mctx = &t->ctx.ctx.uc_mcontext; - mcontext_t *mc = &c.uc_mcontext; - // https://github.com/freebsd/freebsd-src/blob/releng/13.1/lib/libc/amd64/gen/_setjmp.S - mc->mc_rip = ((long*)mctx)[0]; - mc->mc_rbx = ((long*)mctx)[1]; - mc->mc_rsp = ((long*)mctx)[2]; - mc->mc_rbp = ((long*)mctx)[3]; - mc->mc_r12 = ((long*)mctx)[4]; - mc->mc_r13 = ((long*)mctx)[5]; - mc->mc_r14 = ((long*)mctx)[6]; - mc->mc_r15 = ((long*)mctx)[7]; - context = &c; - #else - #pragma message("jl_rec_backtrace not defined for 
ASM/SETJMP on unknown system")
-    (void)c;
-    #endif
+        memset(&c, 0, sizeof(c));
+    #if defined(_OS_LINUX_) && defined(__GLIBC__)
+        __jmp_buf *mctx = &t->ctx.ctx.uc_mcontext->__jmpbuf;
+        mcontext_t *mc = &c.uc_mcontext;
+    #if defined(_CPU_X86_)
+        // https://github.com/bminor/glibc/blame/master/sysdeps/i386/__longjmp.S
+        // https://github.com/bminor/glibc/blame/master/sysdeps/i386/jmpbuf-offsets.h
+        // https://github.com/bminor/musl/blame/master/src/setjmp/i386/longjmp.s
+        mc->gregs[REG_EBX] = (*mctx)[0];
+        mc->gregs[REG_ESI] = (*mctx)[1];
+        mc->gregs[REG_EDI] = (*mctx)[2];
+        mc->gregs[REG_EBP] = (*mctx)[3];
+        mc->gregs[REG_ESP] = (*mctx)[4];
+        mc->gregs[REG_EIP] = (*mctx)[5];
+        // ifdef PTR_DEMANGLE ?
+        mc->gregs[REG_ESP] = ptr_demangle(mc->gregs[REG_ESP]);
+        mc->gregs[REG_EIP] = ptr_demangle(mc->gregs[REG_EIP]);
+        context = &c;
+    #elif defined(_CPU_X86_64_)
+        // https://github.com/bminor/glibc/blame/master/sysdeps/x86_64/__longjmp.S
+        // https://github.com/bminor/glibc/blame/master/sysdeps/x86_64/jmpbuf-offsets.h
+        // https://github.com/bminor/musl/blame/master/src/setjmp/x86_64/setjmp.s
+        mc->gregs[REG_RBX] = (*mctx)[0];
+        mc->gregs[REG_RBP] = (*mctx)[1];
+        mc->gregs[REG_R12] = (*mctx)[2];
+        mc->gregs[REG_R13] = (*mctx)[3];
+        mc->gregs[REG_R14] = (*mctx)[4];
+        mc->gregs[REG_R15] = (*mctx)[5];
+        mc->gregs[REG_RSP] = (*mctx)[6];
+        mc->gregs[REG_RIP] = (*mctx)[7];
+        // ifdef PTR_DEMANGLE ?
+        mc->gregs[REG_RBP] = ptr_demangle(mc->gregs[REG_RBP]);
+        mc->gregs[REG_RSP] = ptr_demangle(mc->gregs[REG_RSP]);
+        mc->gregs[REG_RIP] = ptr_demangle(mc->gregs[REG_RIP]);
+        context = &c;
+    #elif defined(_CPU_ARM_)
+        // https://github.com/bminor/glibc/blame/master/sysdeps/arm/__longjmp.S
+        // https://github.com/bminor/glibc/blame/master/sysdeps/arm/include/bits/setjmp.h
+        // https://github.com/bminor/musl/blame/master/src/setjmp/arm/longjmp.S
+        mc->arm_sp = (*mctx)[0];
+        mc->arm_lr = (*mctx)[1];
+        mc->arm_r4 = (*mctx)[2]; // aka v1
+        mc->arm_r5 = (*mctx)[3]; // aka v2
+        mc->arm_r6 = (*mctx)[4]; // aka v3
+        mc->arm_r7 = (*mctx)[5]; // aka v4
+        mc->arm_r8 = (*mctx)[6]; // aka v5
+        mc->arm_r9 = (*mctx)[7]; // aka v6 aka sb
+        mc->arm_r10 = (*mctx)[8]; // aka v7 aka sl
+        mc->arm_fp = (*mctx)[10]; // aka v8 aka r11
+        // ifdef PTR_DEMANGLE ?
+        mc->arm_sp = ptr_demangle(mc->arm_sp);
+        mc->arm_lr = ptr_demangle(mc->arm_lr);
+        mc->arm_pc = mc->arm_lr;
+        context = &c;
+    #elif defined(_CPU_AARCH64_)
+        // https://github.com/bminor/glibc/blame/master/sysdeps/aarch64/__longjmp.S
+        // https://github.com/bminor/glibc/blame/master/sysdeps/aarch64/jmpbuf-offsets.h
+        // https://github.com/bminor/musl/blame/master/src/setjmp/aarch64/longjmp.s
+        // https://github.com/libunwind/libunwind/blob/ec171c9ba7ea3abb2a1383cee2988a7abd483a1f/src/aarch64/unwind_i.h#L62
+        unw_fpsimd_context_t *mcfp = (unw_fpsimd_context_t*)&mc->__reserved;
+        mc->regs[19] = (*mctx)[0];
+        mc->regs[20] = (*mctx)[1];
+        mc->regs[21] = (*mctx)[2];
+        mc->regs[22] = (*mctx)[3];
+        mc->regs[23] = (*mctx)[4];
+        mc->regs[24] = (*mctx)[5];
+        mc->regs[25] = (*mctx)[6];
+        mc->regs[26] = (*mctx)[7];
+        mc->regs[27] = (*mctx)[8];
+        mc->regs[28] = (*mctx)[9];
+        mc->regs[29] = (*mctx)[10]; // aka fp
+        mc->regs[30] = (*mctx)[11]; // aka lr
+        // Yes, they did skip 12 when writing the code originally; and, no, I do not know why.
+        mc->sp = (*mctx)[13];
+        mcfp->vregs[7] = (*mctx)[14]; // aka d8
+        mcfp->vregs[8] = (*mctx)[15]; // aka d9
+        mcfp->vregs[9] = (*mctx)[16]; // aka d10
+        mcfp->vregs[10] = (*mctx)[17]; // aka d11
+        mcfp->vregs[11] = (*mctx)[18]; // aka d12
+        mcfp->vregs[12] = (*mctx)[19]; // aka d13
+        mcfp->vregs[13] = (*mctx)[20]; // aka d14
+        mcfp->vregs[14] = (*mctx)[21]; // aka d15
+        // ifdef PTR_DEMANGLE ?
+        mc->sp = ptr_demangle(mc->sp);
+        mc->regs[30] = ptr_demangle(mc->regs[30]);
+        mc->pc = mc->regs[30];
+        context = &c;
+    #else
+    #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown linux")
+        (void)mc;
+        (void)c;
+        (void)mctx;
+    #endif
+    #elif defined(_OS_DARWIN_)
+        sigjmp_buf *mctx = &t->ctx.ctx.uc_mcontext;
+    #if defined(_CPU_X86_64_)
+        // from https://github.com/apple/darwin-libplatform/blob/main/src/setjmp/x86_64/_setjmp.s
+        x86_thread_state64_t *mc = (x86_thread_state64_t*)&c;
+        mc->__rbx = ((uint64_t*)mctx)[0];
+        mc->__rbp = ((uint64_t*)mctx)[1];
+        mc->__rsp = ((uint64_t*)mctx)[2];
+        mc->__r12 = ((uint64_t*)mctx)[3];
+        mc->__r13 = ((uint64_t*)mctx)[4];
+        mc->__r14 = ((uint64_t*)mctx)[5];
+        mc->__r15 = ((uint64_t*)mctx)[6];
+        mc->__rip = ((uint64_t*)mctx)[7];
+        // added in libsystem_platform 177.200.16 (macOS Mojave 10.14.3)
+        // prior to that _os_ptr_munge_token was (hopefully) typically 0,
+        // so x ^ 0 == x and this is a no-op
+        mc->__rbp = _OS_PTR_UNMUNGE(mc->__rbp);
+        mc->__rsp = _OS_PTR_UNMUNGE(mc->__rsp);
+        mc->__rip = _OS_PTR_UNMUNGE(mc->__rip);
+        context = &c;
+    #elif defined(_CPU_AARCH64_)
+        // from https://github.com/apple/darwin-libplatform/blob/main/src/setjmp/arm64/setjmp.s
+        // https://github.com/apple/darwin-xnu/blob/main/osfmk/mach/arm/_structs.h
+        // https://github.com/llvm/llvm-project/blob/7714e0317520207572168388f22012dd9e152e9e/libunwind/src/Registers.hpp -> Registers_arm64
+        arm_thread_state64_t *mc = (arm_thread_state64_t*)&c;
+        mc->__x[19] = ((uint64_t*)mctx)[0];
+        mc->__x[20] = ((uint64_t*)mctx)[1];
+        mc->__x[21] = ((uint64_t*)mctx)[2];
+        mc->__x[22] = ((uint64_t*)mctx)[3];
+        mc->__x[23] = ((uint64_t*)mctx)[4];
+        mc->__x[24] = ((uint64_t*)mctx)[5];
+        mc->__x[25] = ((uint64_t*)mctx)[6];
+        mc->__x[26] = ((uint64_t*)mctx)[7];
+        mc->__x[27] = ((uint64_t*)mctx)[8];
+        mc->__x[28] = ((uint64_t*)mctx)[9];
+        mc->__x[10] = ((uint64_t*)mctx)[10];
+        mc->__x[11] = ((uint64_t*)mctx)[11];
+        mc->__x[12] = ((uint64_t*)mctx)[12];
+        // 13 is reserved/unused
+        double *mcfp = (double*)&mc[1];
+        mcfp[7] = ((uint64_t*)mctx)[14]; // aka d8
+        mcfp[8] = ((uint64_t*)mctx)[15]; // aka d9
+        mcfp[9] = ((uint64_t*)mctx)[16]; // aka d10
+        mcfp[10] = ((uint64_t*)mctx)[17]; // aka d11
+        mcfp[11] = ((uint64_t*)mctx)[18]; // aka d12
+        mcfp[12] = ((uint64_t*)mctx)[19]; // aka d13
+        mcfp[13] = ((uint64_t*)mctx)[20]; // aka d14
+        mcfp[14] = ((uint64_t*)mctx)[21]; // aka d15
+        mc->__fp = _OS_PTR_UNMUNGE(mc->__x[10]);
+        mc->__lr = _OS_PTR_UNMUNGE(mc->__x[11]);
+        mc->__x[12] = _OS_PTR_UNMUNGE(mc->__x[12]);
+        mc->__sp = mc->__x[12];
+        // libunwind is broken for signed-pointers, but perhaps best not to leave the signed pointer lying around either
+        mc->__pc = ptrauth_strip(mc->__lr, 0);
+        mc->__pad = 0; // aka __ra_sign_state = not signed
+        context = &c;
+    #else
+    #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown darwin")
+        (void)mctx;
+        (void)c;
+    #endif
+    #elif defined(_OS_FREEBSD_) && defined(_CPU_X86_64_)
+        sigjmp_buf *mctx = &t->ctx.ctx.uc_mcontext;
+        mcontext_t *mc = &c.uc_mcontext;
+        // https://github.com/freebsd/freebsd-src/blob/releng/13.1/lib/libc/amd64/gen/_setjmp.S
+        mc->mc_rip = ((long*)mctx)[0];
+        mc->mc_rbx = ((long*)mctx)[1];
+        mc->mc_rsp = ((long*)mctx)[2];
+        mc->mc_rbp = ((long*)mctx)[3];
+        mc->mc_r12 = ((long*)mctx)[4];
+        mc->mc_r13 = ((long*)mctx)[5];
+        mc->mc_r14 = ((long*)mctx)[6];
+        mc->mc_r15 = ((long*)mctx)[7];
+        context = &c;
+    #else
+    #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown system")
+        (void)c;
+    #endif
 #elif defined(JL_HAVE_ASYNCIFY)
-    #pragma message("jl_rec_backtrace not defined for ASYNCIFY")
+    #pragma message("jl_rec_backtrace not defined for ASYNCIFY")
 #elif defined(JL_HAVE_SIGALTSTACK)
-    #pragma message("jl_rec_backtrace not defined for SIGALTSTACK")
+    #pragma message("jl_rec_backtrace not defined for SIGALTSTACK")
 #else
-    #pragma message("jl_rec_backtrace not defined for unknown task system")
+    #pragma message("jl_rec_backtrace not defined for unknown task system")
 #endif
+    }
     if (context)
-        ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, context, t->gcstack);
+        ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, context, t->gcstack);
     if (old == -1)
         jl_atomic_store_relaxed(&t->tid, old);
+    else if (old != ptls->tid)
+        jl_thread_resume(old);
 }
 
 //--------------------------------------------------
@@ -1086,7 +1117,9 @@ void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT
 JL_DLLEXPORT void jl_gdblookup(void* ip)
 {
-    jl_print_native_codeloc((uintptr_t)ip);
+    char pre_str[64];
+    snprintf(pre_str, 64, "thread (%d) ", jl_threadid() + 1);
+    jl_print_native_codeloc(pre_str, (uintptr_t)ip);
 }
 
 // Print backtrace for current exception in catch block
@@ -1101,11 +1134,11 @@ JL_DLLEXPORT void jlbacktrace(void) JL_NOTSAFEPOINT
     size_t i, bt_size = jl_excstack_bt_size(s, s->top);
     jl_bt_element_t *bt_data = jl_excstack_bt_data(s, s->top);
     for (i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) {
-        jl_print_bt_entry_codeloc(bt_data + i);
+        jl_print_bt_entry_codeloc(-1, bt_data + i);
     }
 }
 
-// Print backtrace for specified task
+// Print backtrace for specified task via jl_safe_printf to stderr
 JL_DLLEXPORT void jlbacktracet(jl_task_t *t) JL_NOTSAFEPOINT
 {
     jl_task_t *ct = jl_current_task;
@@ -1114,7 +1147,7 @@ JL_DLLEXPORT void jlbacktracet(jl_task_t *t) JL_NOTSAFEPOINT
     size_t i, bt_size = ptls->bt_size;
     jl_bt_element_t *bt_data = ptls->bt_data;
     for (i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) {
-        jl_print_bt_entry_codeloc(bt_data + i);
+        jl_print_bt_entry_codeloc(-1, bt_data + i);
     }
 }
 
@@ -1123,46 +1156,69 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT
     jlbacktrace();
 }
 
-// Print backtraces for all live tasks, for all threads.
-// WARNING: this is dangerous and can crash if used outside of gdb, if
-// all of Julia's threads are not stopped!
+extern int gc_first_tid;
+
+// Print backtraces for all live tasks, for all threads, via jl_safe_printf to stderr
 JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT
 {
     size_t nthreads = jl_atomic_load_acquire(&jl_n_threads);
     jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states);
+    int ctid = jl_threadid() + 1;
+    jl_safe_printf("thread (%d) ++++ Task backtraces\n", ctid);
     for (size_t i = 0; i < nthreads; i++) {
+        // skip GC threads since they don't have tasks
+        if (gc_first_tid <= i && i < gc_first_tid + jl_n_gcthreads) {
+            continue;
+        }
         jl_ptls_t ptls2 = allstates[i];
-        arraylist_t *live_tasks = &ptls2->heap.live_tasks;
-        size_t n = live_tasks->len;
-        jl_safe_printf("==== Thread %d created %zu live tasks\n",
-                       ptls2->tid + 1, n + 1);
-        jl_safe_printf(" ---- Root task (%p)\n", ptls2->root_task);
-        jl_safe_printf(" (sticky: %d, started: %d, state: %d, tid: %d)\n",
-                       ptls2->root_task->sticky, ptls2->root_task->started,
-                       jl_atomic_load_relaxed(&ptls2->root_task->_state),
-                       jl_atomic_load_relaxed(&ptls2->root_task->tid) + 1);
-        jlbacktracet(ptls2->root_task);
-
-        void **lst = live_tasks->items;
-        for (size_t j = 0; j < live_tasks->len; j++) {
-            jl_task_t *t = (jl_task_t *)lst[j];
+        if (ptls2 == NULL) {
+            continue;
+        }
+        small_arraylist_t *live_tasks = &ptls2->heap.live_tasks;
+        size_t n = mtarraylist_length(live_tasks);
+        int t_state = JL_TASK_STATE_DONE;
+        jl_task_t *t = ptls2->root_task;
+        if (t != NULL)
+            t_state = jl_atomic_load_relaxed(&t->_state);
+        jl_safe_printf("thread (%d) ==== Thread %d created %zu live tasks\n",
+                       ctid, ptls2->tid + 1, n + (t_state != JL_TASK_STATE_DONE));
+        if (show_done || t_state != JL_TASK_STATE_DONE) {
+            jl_safe_printf("thread (%d) ---- Root task (%p)\n", ctid, ptls2->root_task);
+            if (t != NULL) {
+                jl_safe_printf("thread (%d) (sticky: %d, started: %d, state: %d, tid: %d)\n",
+                               ctid, t->sticky, t->started, t_state,
+                               jl_atomic_load_relaxed(&t->tid) + 1);
+                if (t->stkbuf != NULL) {
+                    jlbacktracet(t);
+                }
+                else {
+                    jl_safe_printf("thread (%d) no stack\n", ctid);
+                }
+            }
+            jl_safe_printf("thread (%d) ---- End root task\n", ctid);
+        }
+
+        for (size_t j = 0; j < n; j++) {
+            jl_task_t *t = (jl_task_t*)mtarraylist_get(live_tasks, j);
+            if (t == NULL)
+                continue;
             int t_state = jl_atomic_load_relaxed(&t->_state);
-            if (!show_done && t_state == JL_TASK_STATE_DONE) {
+            if (!show_done && t_state == JL_TASK_STATE_DONE)
                 continue;
-            }
-            jl_safe_printf(" ---- Task %zu (%p)\n", j + 1, t);
-            jl_safe_printf(" (sticky: %d, started: %d, state: %d, tid: %d)\n",
-                           t->sticky, t->started, t_state,
+            jl_safe_printf("thread (%d) ---- Task %zu (%p)\n", ctid, j + 1, t);
+            // n.b. this information might not be consistent with the stack printing after it, since it could start running or change tid, etc.
+ jl_safe_printf("thread (%d) (sticky: %d, started: %d, state: %d, tid: %d)\n", + ctid, t->sticky, t->started, t_state, jl_atomic_load_relaxed(&t->tid) + 1); if (t->stkbuf != NULL) jlbacktracet(t); else - jl_safe_printf(" no stack\n"); - jl_safe_printf(" ---- End task %zu\n", j + 1); + jl_safe_printf("thread (%d) no stack\n", ctid); + jl_safe_printf("thread (%d) ---- End task %zu\n", ctid, j + 1); } - jl_safe_printf("==== End thread %d\n", ptls2->tid + 1); + jl_safe_printf("thread (%d) ==== End thread %d\n", ctid, ptls2->tid + 1); } - jl_safe_printf("==== Done\n"); + jl_safe_printf("thread (%d) ++++ Done\n", ctid); } #ifdef __cplusplus diff --git a/src/support/MurmurHash3.c b/src/support/MurmurHash3.c index 43bd015ddd69f..1f7056d0e7e56 100644 --- a/src/support/MurmurHash3.c +++ b/src/support/MurmurHash3.c @@ -12,8 +12,6 @@ //----------------------------------------------------------------------------- // Platform-specific functions and macros -#define FORCE_INLINE inline __attribute__((always_inline)) - static inline uint32_t rotl32 ( uint32_t x, int8_t r ) { return (x << r) | (x >> (32 - r)); diff --git a/src/support/dtypes.h b/src/support/dtypes.h index d49ae0b22b5f9..0e528b5cc9b56 100644 --- a/src/support/dtypes.h +++ b/src/support/dtypes.h @@ -117,6 +117,7 @@ typedef intptr_t ssize_t; #define LLT_FREE(x) free(x) #define STATIC_INLINE static inline +#define FORCE_INLINE static inline __attribute__((always_inline)) #if defined(_OS_WINDOWS_) && !defined(_COMPILER_GCC_) # define NOINLINE __declspec(noinline) @@ -331,6 +332,23 @@ STATIC_INLINE void jl_store_unaligned_i16(void *ptr, uint16_t val) JL_NOTSAFEPOI memcpy(ptr, &val, 2); } +STATIC_INLINE void *calloc_s(size_t sz) JL_NOTSAFEPOINT { + int last_errno = errno; +#ifdef _OS_WINDOWS_ + DWORD last_error = GetLastError(); +#endif + void *p = calloc(sz == 0 ? 
1 : sz, 1);
+    if (p == NULL) {
+        perror("(julia) calloc");
+        abort();
+    }
+#ifdef _OS_WINDOWS_
+    SetLastError(last_error);
+#endif
+    errno = last_errno;
+    return p;
+}
+
 STATIC_INLINE void *malloc_s(size_t sz) JL_NOTSAFEPOINT {
     int last_errno = errno;
 #ifdef _OS_WINDOWS_
diff --git a/src/threading.c b/src/threading.c
index 8500279220825..6c17abf2c36be 100644
--- a/src/threading.c
+++ b/src/threading.c
@@ -305,6 +305,8 @@ static uv_mutex_t tls_lock; // controls write-access to these variables:
 _Atomic(jl_ptls_t*) jl_all_tls_states JL_GLOBALLY_ROOTED;
 int jl_all_tls_states_size;
 static uv_cond_t cond;
+// concurrent reads are permitted, using the same pattern as mtsmall_arraylist;
+// it is implemented separately because direct use of jl_all_tls_states is already widely prevalent as an API
 
 // return calling thread's ID
 JL_DLLEXPORT int16_t jl_threadid(void)
@@ -338,7 +340,7 @@ jl_ptls_t jl_init_threadtls(int16_t tid)
 #ifndef _OS_WINDOWS_
     pthread_setspecific(jl_task_exit_key, (void*)ptls);
 #endif
-    ptls->system_id = (jl_thread_t)(uintptr_t)uv_thread_self();
+    ptls->system_id = uv_thread_self();
     ptls->rngseed = jl_rand();
     if (tid == 0)
         ptls->disable_gc = 1;
@@ -373,10 +375,10 @@ jl_ptls_t jl_init_threadtls(int16_t tid)
     uv_cond_init(&ptls->wake_signal);
 
     uv_mutex_lock(&tls_lock);
-    jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states);
     if (tid == -1)
         tid = jl_atomic_load_relaxed(&jl_n_threads);
     ptls->tid = tid;
+    jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states);
     if (jl_all_tls_states_size <= tid) {
         int i, newsize = jl_all_tls_states_size + tid + 2;
         jl_ptls_t *newpptls = (jl_ptls_t*)calloc(newsize, sizeof(jl_ptls_t));
@@ -562,6 +564,8 @@ static void jl_check_tls(void)
 JL_DLLEXPORT const int jl_tls_elf_support = 0;
 #endif
 
+extern int gc_first_tid;
+
 // interface to Julia; sets up to make the runtime thread-safe
 void jl_init_threading(void)
 {
@@ -614,13 +618,32 @@ void jl_init_threading(void)
         }
     }
 
-    jl_all_tls_states_size = nthreads + nthreadsi;
+    int16_t ngcthreads = jl_options.ngcthreads - 1;
+    if (ngcthreads == -1 &&
+        (cp = getenv(NUM_GC_THREADS_NAME))) { // ENV[NUM_GC_THREADS_NAME] specified
+
+        ngcthreads = (uint64_t)strtol(cp, NULL, 10) - 1;
+    }
+    if (ngcthreads == -1) {
+        // if `--gcthreads` was not specified, set the number of GC threads
+        // to the number of compute threads
+        if (nthreads <= 1) {
+            ngcthreads = 0;
+        }
+        else {
+            ngcthreads = nthreads - 1;
+        }
+    }
+
+    jl_all_tls_states_size = nthreads + nthreadsi + ngcthreads;
     jl_n_threads_per_pool = (int*)malloc_s(2 * sizeof(int));
     jl_n_threads_per_pool[0] = nthreadsi;
     jl_n_threads_per_pool[1] = nthreads;
 
     jl_atomic_store_release(&jl_all_tls_states, (jl_ptls_t*)calloc(jl_all_tls_states_size, sizeof(jl_ptls_t)));
     jl_atomic_store_release(&jl_n_threads, jl_all_tls_states_size);
+    jl_n_gcthreads = ngcthreads;
+    gc_first_tid = nthreads;
 }
 
 static uv_barrier_t thread_init_done;
@@ -628,6 +651,7 @@ static uv_barrier_t thread_init_done;
 void jl_start_threads(void)
 {
     int nthreads = jl_atomic_load_relaxed(&jl_n_threads);
+    int ngcthreads = jl_n_gcthreads;
     int cpumasksize = uv_cpumask_size();
     char *cp;
     int i, exclusive;
@@ -660,15 +684,23 @@ void jl_start_threads(void)
 
     // create threads
     uv_barrier_init(&thread_init_done, nthreads);
 
+    // GC/System threads need to be after the worker threads.
+    int nworker_threads = nthreads - ngcthreads;
+
     for (i = 1; i < nthreads; ++i) {
         jl_threadarg_t *t = (jl_threadarg_t *)malloc_s(sizeof(jl_threadarg_t)); // ownership will be passed to the thread
         t->tid = i;
         t->barrier = &thread_init_done;
-        uv_thread_create(&uvtid, jl_threadfun, t);
-        if (exclusive) {
-            mask[i] = 1;
-            uv_thread_setaffinity(&uvtid, mask, NULL, cpumasksize);
-            mask[i] = 0;
+        if (i < nworker_threads) {
+            uv_thread_create(&uvtid, jl_threadfun, t);
+            if (exclusive) {
+                mask[i] = 1;
+                uv_thread_setaffinity(&uvtid, mask, NULL, cpumasksize);
+                mask[i] = 0;
+            }
+        }
+        else {
+            uv_thread_create(&uvtid, jl_gc_threadfun, t);
         }
         uv_thread_detach(&uvtid);
     }
@@ -816,6 +848,224 @@ JL_DLLEXPORT int jl_alignment(size_t sz)
     return jl_gc_alignment(sz);
 }
 
+// Heartbeat mechanism for Julia's task scheduler
+// ---
+// Start a thread that does not participate in running Julia's tasks. This
+// thread simply sleeps until the heartbeat mechanism is enabled. When
+// enabled, the heartbeat thread enters a loop in which it blocks waiting
+// for the specified heartbeat interval. If, within that interval,
+// `jl_heartbeat()` is *not* called at least once, then the thread calls
+// `jl_print_task_backtraces(0)`.
+
+#ifdef JL_HEARTBEAT_THREAD
+
+#include <time.h>
+
+volatile int heartbeat_enabled;
+uv_sem_t heartbeat_on_sem,  // jl_heartbeat_enable -> thread
+         heartbeat_off_sem; // thread -> jl_heartbeat_enable
+int heartbeat_interval_s,
+    tasks_after_n,
+    reset_tasks_after_n;
+int tasks_showed, n_hbs_missed, n_hbs_recvd;
+_Atomic(int) heartbeats;
+
+JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT;
+void jl_heartbeat_threadfun(void *arg);
+
+// start the heartbeat thread with heartbeats disabled
+void jl_init_heartbeat(void)
+{
+    uv_thread_t uvtid;
+    heartbeat_enabled = 0;
+    uv_sem_init(&heartbeat_on_sem, 0);
+    uv_sem_init(&heartbeat_off_sem, 0);
+    uv_thread_create(&uvtid, jl_heartbeat_threadfun, NULL);
+    uv_thread_detach(&uvtid);
+}
+
+// enable/disable heartbeats
+// heartbeat_s: interval within which jl_heartbeat() must be called
+// show_tasks_after_n: number of heartbeats missed before printing task backtraces
+// reset_after_n: number of heartbeats after which to reset
+//
+// When disabling heartbeats, the heartbeat thread must wake up,
+// find out that heartbeats are now disabled, and reset. For now, we
+// handle this by preventing re-enabling of heartbeats until this
+// completes.
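Before the definition below, a sketch of how an embedder might drive this heartbeat API. This is illustrative only: it assumes a build with JL_HEARTBEAT_THREAD defined, and do_some_work() is a hypothetical stand-in for the application's workload.

// Sketch: drive the heartbeat API from a host application.
extern int jl_heartbeat_enable(int heartbeat_s, int show_tasks_after_n,
                               int reset_after_n);
extern void jl_heartbeat(void);
extern void do_some_work(void); // hypothetical application workload

void monitored_loop(void)
{
    // 5s interval; print task backtraces after 3 missed beats;
    // clear the "already reported" state once 2 beats arrive again
    if (jl_heartbeat_enable(5, 3, 2) != 0)
        return; // already enabled, or the heartbeat thread is not ready
    for (;;) {
        do_some_work();
        jl_heartbeat(); // must run at least once per 5s window
    }
}

If the loop stalls (deadlock, runaway task) so that no beat arrives within the interval, the heartbeat thread reports the loss and, after enough misses, prints backtraces for all live tasks.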
+JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int show_tasks_after_n, + int reset_after_n) +{ + if (heartbeat_s <= 0) { + heartbeat_enabled = 0; + heartbeat_interval_s = tasks_after_n = reset_tasks_after_n = 0; + } + else { + // must disable before enabling + if (heartbeat_enabled) { + return -1; + } + // heartbeat thread must be ready + if (uv_sem_trywait(&heartbeat_off_sem) != 0) { + return -1; + } + + jl_atomic_store_relaxed(&heartbeats, 0); + heartbeat_interval_s = heartbeat_s; + tasks_after_n = show_tasks_after_n; + reset_tasks_after_n = reset_after_n; + tasks_showed = 0; + n_hbs_missed = 0; + n_hbs_recvd = 0; + heartbeat_enabled = 1; + uv_sem_post(&heartbeat_on_sem); // wake the heartbeat thread + } + return 0; +} + +// heartbeat +JL_DLLEXPORT void jl_heartbeat(void) +{ + jl_atomic_fetch_add(&heartbeats, 1); +} + +// sleep the thread for the specified interval +void sleep_for(int secs, int nsecs) +{ + struct timespec rqtp, rmtp; + rqtp.tv_sec = secs; + rqtp.tv_nsec = nsecs; + rmtp.tv_sec = 0; + rmtp.tv_nsec = 0; + for (; ;) { + // this suspends the thread so we aren't using CPU + if (nanosleep(&rqtp, &rmtp) == 0) { + return; + } + // TODO: else if (errno == EINTR) + // this could be SIGTERM and we should shutdown but how to find out? + rqtp = rmtp; + } +} + +// check for heartbeats and maybe report loss +uint8_t check_heartbeats(uint8_t gc_state) +{ + int hb = jl_atomic_exchange(&heartbeats, 0); + + if (hb <= 0) { + // we didn't get a heartbeat + n_hbs_recvd = 0; + n_hbs_missed++; + + // if we've printed task backtraces already, do nothing + if (!tasks_showed) { + // otherwise, at least show this message + jl_safe_printf("==== heartbeat loss (%ds) ====\n", + n_hbs_missed * heartbeat_interval_s); + // if we've missed enough heartbeats, print task backtraces + if (n_hbs_missed >= tasks_after_n) { + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + + // exit GC-safe region to report then re-enter + jl_gc_safe_leave(ptls, gc_state); + jl_print_task_backtraces(0); + gc_state = jl_gc_safe_enter(ptls); + + // we printed task backtraces + tasks_showed = 1; + } + } + } + else { + // got a heartbeat + n_hbs_recvd++; + // if we'd printed task backtraces, check for reset + if (tasks_showed && n_hbs_recvd >= reset_tasks_after_n) { + tasks_showed = 0; + jl_safe_printf("==== heartbeats recovered (lost for %ds) ====\n", + n_hbs_missed * heartbeat_interval_s); + } + n_hbs_missed = 0; + } + + return gc_state; +} + +// heartbeat thread function +void jl_heartbeat_threadfun(void *arg) +{ + int s = 59, ns = 1e9 - 1, rs; + uint64_t t0, tchb; + + // We need a TLS because backtraces are accumulated into ptls->bt_size + // and ptls->bt_data, so we need to call jl_adopt_thread(). + jl_adopt_thread(); + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + + // Don't hold up GC, this thread doesn't participate. + uint8_t gc_state = jl_gc_safe_enter(ptls); + + for (;;) { + if (!heartbeat_enabled) { + // post the off semaphore to indicate we're ready to enable + uv_sem_post(&heartbeat_off_sem); + + // sleep the thread here; this semaphore is posted in + // jl_heartbeat_enable() + uv_sem_wait(&heartbeat_on_sem); + + // Set the sleep duration. 
+ s = heartbeat_interval_s - 1; + ns = 1e9 - 1; + continue; + } + + // heartbeat is enabled; sleep, waiting for the desired interval + sleep_for(s, ns); + + // if heartbeats were turned off while we were sleeping, reset + if (!heartbeat_enabled) { + continue; + } + + // check if any heartbeats have happened, report as appropriate + t0 = jl_hrtime(); + gc_state = check_heartbeats(gc_state); + tchb = jl_hrtime() - t0; + + // adjust the next sleep duration based on how long the heartbeat + // check took + rs = 1; + while (tchb > 1e9) { + rs++; + tchb -= 1e9; + } + s = heartbeat_interval_s - rs; + ns = 1e9 - tchb; + } +} + +#else // !JL_HEARTBEAT_THREAD + +void jl_init_heartbeat(void) +{ +} + +JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int show_tasks_after_n, + int reset_after_n) +{ + return -1; +} + +JL_DLLEXPORT void jl_heartbeat(void) +{ +} + +#endif // JL_HEARTBEAT_THREAD + #ifdef __cplusplus } #endif diff --git a/src/threading.h b/src/threading.h index 9fd63f0fd188d..1cf2d4a9d3711 100644 --- a/src/threading.h +++ b/src/threading.h @@ -25,6 +25,7 @@ jl_ptls_t jl_init_threadtls(int16_t tid); // provided by a threading infrastructure void jl_init_threadinginfra(void); +void jl_gc_threadfun(void *arg); void jl_threadfun(void *arg); #ifdef __cplusplus diff --git a/src/work-stealing-queue.h b/src/work-stealing-queue.h new file mode 100644 index 0000000000000..38429e02886e9 --- /dev/null +++ b/src/work-stealing-queue.h @@ -0,0 +1,102 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#ifndef WORK_STEALING_QUEUE_H +#define WORK_STEALING_QUEUE_H + +#include "julia_atomics.h" +#include "assert.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// ======= +// Chase and Lev's work-stealing queue, optimized for +// weak memory models by Le et al. +// +// * Chase D., Lev Y. Dynamic Circular Work-Stealing queue +// * Le N. M. et al. 
Correct and Efficient Work-Stealing for +// Weak Memory Models +// ======= + +typedef struct { + char *buffer; + int32_t capacity; + int32_t mask; +} ws_array_t; + +static inline ws_array_t *create_ws_array(size_t capacity, int32_t eltsz) JL_NOTSAFEPOINT +{ + ws_array_t *a = (ws_array_t *)malloc_s(sizeof(ws_array_t)); + a->buffer = (char *)malloc_s(capacity * eltsz); + a->capacity = capacity; + a->mask = capacity - 1; + return a; +} + +typedef struct { + _Atomic(int64_t) top; + _Atomic(int64_t) bottom; + _Atomic(ws_array_t *) array; +} ws_queue_t; + +static inline ws_array_t *ws_queue_push(ws_queue_t *q, void *elt, int32_t eltsz) JL_NOTSAFEPOINT +{ + int64_t b = jl_atomic_load_relaxed(&q->bottom); + int64_t t = jl_atomic_load_acquire(&q->top); + ws_array_t *ary = jl_atomic_load_relaxed(&q->array); + ws_array_t *old_ary = NULL; + if (__unlikely(b - t > ary->capacity - 1)) { + ws_array_t *new_ary = create_ws_array(2 * ary->capacity, eltsz); + for (int i = 0; i < ary->capacity; i++) { + memcpy(new_ary->buffer + ((t + i) & new_ary->mask) * eltsz, ary->buffer + ((t + i) & ary->mask) * eltsz, eltsz); + } + jl_atomic_store_release(&q->array, new_ary); + old_ary = ary; + ary = new_ary; + } + memcpy(ary->buffer + (b & ary->mask) * eltsz, elt, eltsz); + jl_fence_release(); + jl_atomic_store_relaxed(&q->bottom, b + 1); + return old_ary; +} + +static inline void ws_queue_pop(ws_queue_t *q, void *dest, int32_t eltsz) JL_NOTSAFEPOINT +{ + int64_t b = jl_atomic_load_relaxed(&q->bottom) - 1; + ws_array_t *ary = jl_atomic_load_relaxed(&q->array); + jl_atomic_store_relaxed(&q->bottom, b); + jl_fence(); + int64_t t = jl_atomic_load_relaxed(&q->top); + if (__likely(t <= b)) { + memcpy(dest, ary->buffer + (b & ary->mask) * eltsz, eltsz); + if (t == b) { + if (!jl_atomic_cmpswap(&q->top, &t, t + 1)) + memset(dest, 0, eltsz); + jl_atomic_store_relaxed(&q->bottom, b + 1); + } + } + else { + memset(dest, 0, eltsz); + jl_atomic_store_relaxed(&q->bottom, b + 1); + } +} + +static inline void ws_queue_steal_from(ws_queue_t *q, void *dest, int32_t eltsz) JL_NOTSAFEPOINT +{ + int64_t t = jl_atomic_load_acquire(&q->top); + jl_fence(); + int64_t b = jl_atomic_load_acquire(&q->bottom); + if (t < b) { + ws_array_t *ary = jl_atomic_load_relaxed(&q->array); + memcpy(dest, ary->buffer + (t & ary->mask) * eltsz, eltsz); + if (!jl_atomic_cmpswap(&q->top, &t, t + 1)) + memset(dest, 0, eltsz); + } +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/stdlib/Distributed/src/cluster.jl b/stdlib/Distributed/src/cluster.jl index 554a3d9185080..d8039ba160d66 100644 --- a/stdlib/Distributed/src/cluster.jl +++ b/stdlib/Distributed/src/cluster.jl @@ -1345,9 +1345,10 @@ function process_opts(opts) end # Propagate --threads to workers - threads = get_threads_spec(opts) + threads = opts.nthreads > 0 ? `--threads=$(opts.nthreads)` : `` + gcthreads = opts.ngcthreads > 0 ? 
`--gcthreads=$(opts.ngcthreads)` : ``
 
-    exeflags = `$threads`
+    exeflags = `$threads $gcthreads`
 
     # add processors
     if opts.nprocs > 0
diff --git a/stdlib/Pkg.version b/stdlib/Pkg.version
index 3279234710531..c7cc917268a90 100644
--- a/stdlib/Pkg.version
+++ b/stdlib/Pkg.version
@@ -1,4 +1,4 @@
 PKG_BRANCH = release-1.9
-PKG_SHA1 = ffe4615b1e4e39b818a49bb1a06467932d5eaf51
+PKG_SHA1 = 397c8bf20ef70a78247d4cbd3d59f28a3116c884
 PKG_GIT_URL := https://github.com/JuliaLang/Pkg.jl.git
 PKG_TAR_URL = https://api.github.com/repos/JuliaLang/Pkg.jl/tarball/$1
diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index 6dd2a12205b66..482c2c0a97ad3 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -1275,6 +1275,21 @@ function take_heap_snapshot(all_one::Bool=false)
     return take_heap_snapshot(f, all_one)
 end
 
+"""
+    Profile.take_page_profile(io::IOStream)
+    Profile.take_page_profile(filepath::String)
+
+Write a JSON snapshot of the pages from Julia's pool allocator, recording, for every pool-allocated object, whether it is garbage or else its type.
+"""
+function take_page_profile(io::IOStream)
+    Base.@_lock_ios(io, ccall(:jl_gc_take_page_profile, Cvoid, (Ptr{Cvoid},), io.handle))
+end
+function take_page_profile(filepath::String)
+    open(filepath, "w") do io
+        take_page_profile(io)
+    end
+    return filepath
+end
 
 include("Allocs.jl")
diff --git a/stdlib/Profile/test/allocs.jl b/stdlib/Profile/test/allocs.jl
index c2ec7d2f6cb54..ae0cbab945f01 100644
--- a/stdlib/Profile/test/allocs.jl
+++ b/stdlib/Profile/test/allocs.jl
@@ -121,3 +121,34 @@ end
     @test length(prof.allocs) >= 1
     @test length([a for a in prof.allocs if a.type == String]) >= 1
 end
+
+@testset "alloc profiler catches allocs from codegen" begin
+    @eval begin
+        struct MyType x::Int; y::Int end
+        Base.:(+)(n::Number, x::MyType) = n + x.x + x.y
+        foo(a, x) = a[1] + x
+        wrapper(a) = foo(a, MyType(0,1))
+    end
+    a = Any[1,2,3]
+    # warmup
+    wrapper(a)
+
+    @eval Allocs.@profile sample_rate=1 wrapper($a)
+
+    prof = Allocs.fetch()
+    Allocs.clear()
+
+    @test length(prof.allocs) >= 1
+    @test length([a for a in prof.allocs if a.type == MyType]) >= 1
+end
+
+@testset "alloc profiler catches allocs from buffer resize" begin
+    a = Int[]
+    Allocs.@profile sample_rate=1 for _ in 1:100; push!(a, 1); end
+
+    prof = Allocs.fetch()
+    Allocs.clear()
+
+    @test length(prof.allocs) >= 1
+    @test length([a for a in prof.allocs if a.type == Profile.Allocs.BufferType]) >= 1
+end
diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl
index 6ff4890004f62..01a596278d74f 100644
--- a/stdlib/Profile/test/runtests.jl
+++ b/stdlib/Profile/test/runtests.jl
@@ -287,4 +287,14 @@ end
     rm(tmpdir, force = true, recursive = true)
 end
 
+@testset "PageProfile" begin
+    fname = "$(getpid())_$(time_ns())"
+    fpath = joinpath(tempdir(), fname)
+    Profile.take_page_profile(fpath)
+    open(fpath) do fs
+        @test readline(fs) != ""
+    end
+    rm(fpath)
+end
+
 include("allocs.jl")
diff --git a/test/choosetests.jl b/test/choosetests.jl
index 334ef051a0fe6..7d426221180f5 100644
--- a/test/choosetests.jl
+++ b/test/choosetests.jl
@@ -21,7 +21,7 @@ const TESTNAMES = [
         "combinatorics", "sysinfo", "env", "rounding", "ranges", "mod2pi",
         "euler", "show", "client",
         "errorshow", "sets", "goto", "llvmcall", "llvmcall2", "ryu",
-        "some", "meta", "stacktraces", "docs",
+        "some", "meta", "stacktraces", "docs", "gc",
        "misc", "threads", "stress", "binaryplatforms", "atexit",
        "enums", "cmdlineargs", "int", "interpreter",
        "checked", "bitset", "floatfuncs",
"precompile", diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl index 7f86534f923b3..f73a7854fd2f1 100644 --- a/test/cmdlineargs.jl +++ b/test/cmdlineargs.jl @@ -269,6 +269,24 @@ let exename = `$(Base.julia_cmd()) --startup-file=no --color=no` @test p.exitcode == 1 && p.termsignal == 0 end + # --gcthreads + code = "print(Threads.ngcthreads())" + cpu_threads = ccall(:jl_effective_threads, Int32, ()) + @test (cpu_threads == 1 ? "1" : string(div(cpu_threads, 2))) == + read(`$exename --threads auto -e $code`, String) == + read(`$exename --threads=auto -e $code`, String) == + read(`$exename -tauto -e $code`, String) == + read(`$exename -t auto -e $code`, String) + for nt in (nothing, "1") + withenv("JULIA_NUM_GC_THREADS" => nt) do + @test read(`$exename --gcthreads=2 -e $code`, String) == "2" + end + end + + withenv("JULIA_NUM_GC_THREADS" => 2) do + @test read(`$exename -e $code`, String) == "2" + end + # --machine-file # this does not check that machine file works, # only that the filename gets correctly passed to the option struct diff --git a/test/compiler/codegen.jl b/test/compiler/codegen.jl index e4e107351c57f..4c9c7e97a710b 100644 --- a/test/compiler/codegen.jl +++ b/test/compiler/codegen.jl @@ -15,9 +15,12 @@ function libjulia_codegen_name() is_debug_build ? "libjulia-codegen-debug" : "libjulia-codegen" end -# `_dump_function` might be more efficient but it doesn't really matter here... -get_llvm(@nospecialize(f), @nospecialize(t), raw=true, dump_module=false, optimize=true) = - sprint(code_llvm, f, t, raw, dump_module, optimize) +# The tests below assume a certain format and safepoint_on_entry=true breaks that. +function get_llvm(@nospecialize(f), @nospecialize(t), raw=true, dump_module=false, optimize=true) + params = Base.CodegenParams(safepoint_on_entry=false) + d = InteractiveUtils._dump_function(f, t, false, false, !raw, dump_module, :att, optimize, :none, false, params) + sprint(print, d) +end if !is_debug_build && opt_level > 0 # Make sure getptls call is removed at IR level with optimization on diff --git a/test/gc.jl b/test/gc.jl new file mode 100644 index 0000000000000..9cc9d753dfc09 --- /dev/null +++ b/test/gc.jl @@ -0,0 +1,18 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +using Test + +function run_gctest(file) + let cmd = `$(Base.julia_cmd()) --depwarn=error --rr-detach --startup-file=no $file` + for test_nthreads in (1, 2, 4) + new_env = copy(ENV) + new_env["JULIA_NUM_THREADS"] = string(test_nthreads) + new_env["JULIA_NUM_GC_THREADS"] = string(test_nthreads) + @time run(pipeline(setenv(cmd, new_env), stdout = stdout, stderr = stderr)) + end + end +end + +@time run_gctest("gc/binarytree.jl") +@time run_gctest("gc/linkedlist.jl") +@time run_gctest("gc/objarray.jl") diff --git a/test/gc/binarytree.jl b/test/gc/binarytree.jl new file mode 100644 index 0000000000000..3089e2d2ce869 --- /dev/null +++ b/test/gc/binarytree.jl @@ -0,0 +1,53 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +module BinaryTreeMutable + +# Adopted from +# https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/binarytrees.html#binarytrees + +using Base.Threads +using Printf + +mutable struct Node + l::Union{Nothing, Node} + r::Union{Nothing, Node} +end + +function make(n::Int) + return n === 0 ? Node(nothing, nothing) : Node(make(n-1), make(n-1)) +end + +function check(node::Node) + return 1 + (node.l === nothing ? 
0 : check(node.l) + check(node.r)) +end + +function binary_trees(io, n::Int) + @printf io "stretch tree of depth %jd\t check: %jd\n" n+1 check(make(n+1)) + + long_tree = make(n) + minDepth = 4 + resultSize = div((n - minDepth), 2) + 1 + results = Vector{String}(undef, resultSize) + Threads.@threads for depth in minDepth:2:n + c = 0 + niter = 1 << (n - depth + minDepth) + for _ in 1:niter + c += check(make(depth)) + end + index = div((depth - minDepth),2) + 1 + results[index] = @sprintf "%jd\t trees of depth %jd\t check: %jd\n" niter depth c + end + + for i in results + write(io, i) + end + + @printf io "long lived tree of depth %jd\t check: %jd\n" n check(long_tree) +end + +end #module + +using .BinaryTreeMutable + +BinaryTreeMutable.binary_trees(devnull, 20) +GC.gc() diff --git a/test/gc/linkedlist.jl b/test/gc/linkedlist.jl new file mode 100644 index 0000000000000..c447a9680326d --- /dev/null +++ b/test/gc/linkedlist.jl @@ -0,0 +1,21 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +mutable struct ListNode + key::Int64 + next::ListNode + ListNode() = new() + ListNode(x)= new(x) + ListNode(x,y) = new(x,y); +end + +function list(n=128) + start::ListNode = ListNode(1) + current::ListNode = start + for i = 2:(n*1024^2) + current = ListNode(i,current) + end + return current.key +end + +_ = list() +GC.gc() diff --git a/test/gc/objarray.jl b/test/gc/objarray.jl new file mode 100644 index 0000000000000..4b4cb67c42eac --- /dev/null +++ b/test/gc/objarray.jl @@ -0,0 +1,35 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +using Random: seed! +seed!(1) + +abstract type Cell end + +struct CellA<:Cell + a::Ref{Int} +end + +struct CellB<:Cell + b::String +end + +function fillcells!(mc::Array{Cell}) + for ind in eachindex(mc) + mc[ind] = ifelse(rand() > 0.5, CellA(ind), CellB(string(ind))) + end + return mc +end + +function work(size) + mcells = Array{Cell}(undef, size, size) + fillcells!(mcells) +end + +function run(maxsize) + Threads.@threads for i in 1:maxsize + work(i*500) + end +end + +run(4) +GC.gc() diff --git a/test/llvmpasses/alloc-opt-gcframe.jl b/test/llvmpasses/alloc-opt-gcframe.jl index 3b5fc3a51a606..ad4be12be0840 100644 --- a/test/llvmpasses/alloc-opt-gcframe.jl +++ b/test/llvmpasses/alloc-opt-gcframe.jl @@ -14,11 +14,11 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" # CHECK-LABEL: @return_obj # CHECK-NOT: @julia.gc_alloc_obj # CHECK: %current_task = getelementptr inbounds {}*, {}** %gcstack, i64 -12 -# CHECK-NEXT: [[ptls_field:%.*]] = getelementptr inbounds {}*, {}** %current_task, i64 15 +# CHECK: [[ptls_field:%.*]] = getelementptr inbounds {}*, {}** %current_task, i64 15 # CHECK-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 # CHECK-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** # CHECK-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* -# CHECK-NEXT: %v = call noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc(i8* [[ptls_i8]], i32 [[SIZE_T:[0-9]+]], i32 16) +# CHECK-NEXT: %v = call noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc_instrumented(i8* [[ptls_i8]], i32 [[SIZE_T:[0-9]+]], i32 16, i64 {{.*}} @tag {{.*}}) # CHECK: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* {{.*}} unordered, align 8, !tbaa !4 println(""" define {} addrspace(10)* @return_obj() { @@ -260,8 +260,8 @@ L3: """) # CHECK-LABEL: }{{$}} -# CHECK: declare noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc(i8*, -# CHECK: declare noalias nonnull {} 
addrspace(10)* @ijl_gc_big_alloc(i8*, +# CHECK: declare noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc_instrumented(i8*, +# CHECK: declare noalias nonnull {} addrspace(10)* @ijl_gc_big_alloc_instrumented(i8*, println(""" declare void @external_function() declare {}*** @julia.get_pgcstack() diff --git a/test/llvmpasses/final-lower-gc.ll b/test/llvmpasses/final-lower-gc.ll index 4af43f748020b..840c911d0874f 100644 --- a/test/llvmpasses/final-lower-gc.ll +++ b/test/llvmpasses/final-lower-gc.ll @@ -13,7 +13,7 @@ declare noalias nonnull {} addrspace(10)** @julia.new_gc_frame(i32) declare void @julia.push_gc_frame({} addrspace(10)**, i32) declare {} addrspace(10)** @julia.get_gc_frame_slot({} addrspace(10)**, i32) declare void @julia.pop_gc_frame({} addrspace(10)**) -declare noalias nonnull {} addrspace(10)* @julia.gc_alloc_bytes(i8*, i64) #0 +declare noalias nonnull {} addrspace(10)* @julia.gc_alloc_bytes(i8*, i64, i64) #0 attributes #0 = { allocsize(1) } @@ -59,8 +59,8 @@ top: %pgcstack = call {}*** @julia.get_pgcstack() %ptls = call {}*** @julia.ptls_states() %ptls_i8 = bitcast {}*** %ptls to i8* -; CHECK: %v = call noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc - %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* %ptls_i8, i64 8) +; CHECK: %v = call noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc_instrumented + %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* %ptls_i8, i64 8, i64 12341234) %0 = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* %1 = getelementptr {} addrspace(10)*, {} addrspace(10)* addrspace(10)* %0, i64 -1 store {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* %1, align 8, !tbaa !0 @@ -74,8 +74,8 @@ top: %ptls = call {}*** @julia.ptls_states() %ptls_i8 = bitcast {}*** %ptls to i8* ; CHECK: %0 = add i64 %size, 8 -; CHECK: %v = call noalias nonnull {} addrspace(10)* @ijl_gc_alloc_typed(i8* %ptls_i8, i64 %0, i8* null) - %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* %ptls_i8, i64 %size) +; CHECK: %v = call noalias nonnull {} addrspace(10)* @ijl_gc_alloc_typed(i8* %ptls_i8, i64 %0, i64 12341234) + %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* %ptls_i8, i64 %size, i64 12341234) %0 = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* %1 = getelementptr {} addrspace(10)*, {} addrspace(10)* addrspace(10)* %0, i64 -1 store {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* %1, align 8, !tbaa !0 diff --git a/test/llvmpasses/late-lower-gc-addrspaces.ll b/test/llvmpasses/late-lower-gc-addrspaces.ll index 7497febf1e846..7bb8c76b07e63 100644 --- a/test/llvmpasses/late-lower-gc-addrspaces.ll +++ b/test/llvmpasses/late-lower-gc-addrspaces.ll @@ -49,7 +49,7 @@ top: ; CHECK-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 ; CHECK-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; CHECK-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* -; CHECK-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8) +; CHECK-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) ; CHECK-NEXT: [[V2:%.*]] = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* ; CHECK-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(10)* [[V2]], i64 -1 ; CHECK-NEXT: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* [[V_HEADROOM]] unordered, align 8, !tbaa !4 @@ -74,7 +74,7 @@ top: ; CHECK-NEXT: [[ptls_load:%.*]] = load 
{}*, {}** [[ptls_field]], align 8, !tbaa !0 ; CHECK-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; CHECK-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* -; CHECK-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8) +; CHECK-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) ; CHECK-NEXT: [[V2:%.*]] = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* ; CHECK-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(10)* [[V2]], i64 -1 ; CHECK-NEXT: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* [[V_HEADROOM]] unordered, align 8, !tbaa !4 diff --git a/test/llvmpasses/late-lower-gc.ll b/test/llvmpasses/late-lower-gc.ll index 65a67c78d7810..77599290f8ef7 100644 --- a/test/llvmpasses/late-lower-gc.ll +++ b/test/llvmpasses/late-lower-gc.ll @@ -46,7 +46,7 @@ top: ; CHECK-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 ; CHECK-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; CHECK-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* -; CHECK-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8) +; CHECK-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) ; CHECK-NEXT: [[V2:%.*]] = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* ; CHECK-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(10)* [[V2]], i64 -1 ; CHECK-NEXT: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* [[V_HEADROOM]] unordered, align 8, !tbaa !4 @@ -71,7 +71,7 @@ top: ; CHECK-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 ; CHECK-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; CHECK-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* -; CHECK-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8) +; CHECK-NEXT: %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* [[ptls_i8]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) ; CHECK-NEXT: [[V2:%.*]] = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* ; CHECK-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds {} addrspace(10)*, {} addrspace(10)* addrspace(10)* [[V2]], i64 -1 ; CHECK-NEXT: store atomic {} addrspace(10)* @tag, {} addrspace(10)* addrspace(10)* [[V_HEADROOM]] unordered, align 8, !tbaa !4 @@ -154,7 +154,7 @@ define void @decayar([2 x {} addrspace(10)* addrspace(11)*] %ar) { %l0 = load {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %e0 %e1 = extractvalue [2 x {} addrspace(10)* addrspace(11)*] %ar, 1 %l1 = load {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %e1 - %r = call i32 @callee_root({} addrspace(10)* %l0, {} addrspace(10)* %l1) + %r = call i32 @callee_root({} addrspace(10)* %l0, {} addrspace(10)* %l1) ret void } diff --git a/test/llvmpasses/pipeline-o0.jl b/test/llvmpasses/pipeline-o0.jl index ff9cd0aace704..3cbd5a9174cc2 100644 --- a/test/llvmpasses/pipeline-o0.jl +++ b/test/llvmpasses/pipeline-o0.jl @@ -9,7 +9,7 @@ include(joinpath("..", "testhelpers", "llvmpasses.jl")) # CHECK-NOT: julia.get_pgcstack # CHECK: asm # CHECK-NOT: julia.gc_alloc_obj -# CHECK: ijl_gc_pool_alloc +# CHECK: ijl_gc_pool_alloc_instrumented # COM: we want something vaguely along the lines of asm load from the fs register -> allocate bytes 
function simple() Ref(0) diff --git a/test/threads.jl b/test/threads.jl index fb684b275e864..1a0dbeb2d4dbf 100644 --- a/test/threads.jl +++ b/test/threads.jl @@ -327,3 +327,9 @@ end @test_throws ArgumentError @macroexpand(@threads 1) # arg isn't an Expr @test_throws ArgumentError @macroexpand(@threads if true 1 end) # arg doesn't start with for end + +@testset "num_stack_mappings metric" begin + @test @ccall(jl_get_num_stack_mappings()::Cint) >= 1 + # There must be at least two: one for the root test task and one for the async task: + @test fetch(@async(@ccall(jl_get_num_stack_mappings()::Cint))) >= 2 +end
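As a closing illustration, here is a usage sketch for the Chase-Lev deque added in src/work-stealing-queue.h earlier in this patch. It is a sketch under stated assumptions: ws_queue_init and ws_queue_example are illustrative helpers, not part of the patch, and the file is meant to be compiled inside the Julia source tree, where malloc_s and the jl_atomic_* helpers are available.

#include <string.h>
#include "work-stealing-queue.h"

// Illustrative-only init helper; the runtime initializes its queues
// elsewhere. Capacity must be a power of two for the index mask to work.
static void ws_queue_init(ws_queue_t *q, size_t capacity, int32_t eltsz)
{
    jl_atomic_store_relaxed(&q->top, 0);
    jl_atomic_store_relaxed(&q->bottom, 0);
    jl_atomic_store_relaxed(&q->array, create_ws_array(capacity, eltsz));
}

void ws_queue_example(void)
{
    ws_queue_t q;
    ws_queue_init(&q, 16, sizeof(void*));

    // Owner thread: push on the bottom end. A non-NULL return is the
    // old, outgrown array; it cannot be freed immediately because
    // concurrent thieves may still be reading from it, so reclamation
    // must be deferred (the GC keeps such arrays alive elsewhere).
    void *elt = (void*)0x1;
    ws_array_t *old = ws_queue_push(&q, &elt, sizeof(void*));
    (void)old;

    // Owner thread: pop from the same end (LIFO); `dest` is zeroed
    // when the queue is empty or the race for the last element is lost.
    void *out = NULL;
    ws_queue_pop(&q, &out, sizeof(void*));

    // Any other thread: steal from the top end (FIFO); `dest` is
    // zeroed if the CAS on `top` fails against a competing consumer.
    ws_queue_steal_from(&q, &out, sizeof(void*));
}

The design choice is the usual one for work stealing: the owner works the bottom of its own deque (LIFO, cache-friendly, uncontended in the common case) while GC threads steal from the top (FIFO), so synchronization is only needed when the two ends meet on the last element.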