From d386c6fc02bbfaeeb307057d182176ea3aa394c4 Mon Sep 17 00:00:00 2001
From: Ian <i.r.butterworth@gmail.com>
Date: Thu, 29 Jul 2021 19:35:46 -0400
Subject: [PATCH 01/13] Thread and task specific profiling

- Adds thread and task ids to profile samples
- Implements thread and task selection for Profile.print()
- Implements thread and task groupby options for Profile.print()
- Add include_meta to Profile.fetch() which defaults to false to ensure backwards compat with external profiling tooling
---
 contrib/generate_precompile.jl  |   6 +-
 src/signals-mach.c              |   7 +
 src/signals-unix.c              |   8 ++
 src/signals-win.c               |   9 ++
 stdlib/Profile/src/Profile.jl   | 230 ++++++++++++++++++++++++++------
 stdlib/Profile/test/runtests.jl |  20 +++
 6 files changed, 238 insertions(+), 42 deletions(-)

diff --git a/contrib/generate_precompile.jl b/contrib/generate_precompile.jl
index a9bf83a9f9674..cb9bc8fb019f3 100644
--- a/contrib/generate_precompile.jl
+++ b/contrib/generate_precompile.jl
@@ -222,7 +222,11 @@ Profile = get(Base.loaded_modules,
           nothing)
 if Profile !== nothing
     hardcoded_precompile_statements *= """
-    precompile(Tuple{typeof(Profile.tree!), Profile.StackFrameTree{UInt64}, Vector{UInt64}, Dict{UInt64, Vector{Base.StackTraces.StackFrame}}, Bool, Symbol})
+    precompile(Tuple{typeof(Profile.tree!), Profile.StackFrameTree{UInt64}, Vector{UInt64}, Dict{UInt64, Vector{Base.StackTraces.StackFrame}}, Bool, Symbol, Int, UInt})
+    precompile(Tuple{typeof(Profile.tree!), Profile.StackFrameTree{UInt64}, Vector{UInt64}, Dict{UInt64, Vector{Base.StackTraces.StackFrame}}, Bool, Symbol, Int, UnitRange{UInt}})
+    precompile(Tuple{typeof(Profile.tree!), Profile.StackFrameTree{UInt64}, Vector{UInt64}, Dict{UInt64, Vector{Base.StackTraces.StackFrame}}, Bool, Symbol, UnitRange{Int}, UInt})
+    precompile(Tuple{typeof(Profile.tree!), Profile.StackFrameTree{UInt64}, Vector{UInt64}, Dict{UInt64, Vector{Base.StackTraces.StackFrame}}, Bool, Symbol, UnitRange{Int}, UnitRange{UInt}})
+    precompile(Tuple{typeof(Profile.tree!), Profile.StackFrameTree{UInt64}, Vector{UInt64}, Dict{UInt64, Vector{Base.StackTraces.StackFrame}}, Bool, Symbol, Vector{Int}, Vector{UInt}})
     """
 end
 
diff --git a/src/signals-mach.c b/src/signals-mach.c
index 7a678af4ec65f..8ec6e8d7bb1df 100644
--- a/src/signals-mach.c
+++ b/src/signals-mach.c
@@ -588,6 +588,13 @@ void *mach_profile_listener(void *arg)
 #else
                 bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
 #endif
+                jl_ptls_t ptls = jl_all_tls_states[i];
+
+                // store threadid but add 1 as 0 is preserved to indicate end of block
+                bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1;
+
+                // store task id
+                bt_data_prof[bt_size_cur++].uintptr = ptls->current_task;
 
                 // Mark the end of this block with 0
                 bt_data_prof[bt_size_cur++].uintptr = 0;
diff --git a/src/signals-unix.c b/src/signals-unix.c
index daec048912325..639dba0b990d8 100644
--- a/src/signals-unix.c
+++ b/src/signals-unix.c
@@ -786,6 +786,14 @@ static void *signal_listener(void *arg)
                     }
                     jl_set_safe_restore(old_buf);
 
+                    jl_ptls_t ptls = jl_all_tls_states[i];
+
+                    // store threadid but add 1 as 0 is preserved to indicate end of block
+                    bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1;
+
+                    // store task id
+                    bt_data_prof[bt_size_cur++].uintptr = ptls->current_task;
+
                     // Mark the end of this block with 0
                     bt_data_prof[bt_size_cur++].uintptr = 0;
                 }
diff --git a/src/signals-win.c b/src/signals-win.c
index 5d1a963078df4..400e5a469a4fd 100644
--- a/src/signals-win.c
+++ b/src/signals-win.c
@@ -360,6 +360,15 @@ static DWORD WINAPI profile_bt( LPVOID lparam )
                     // Get backtrace data
                     bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur,
                             bt_size_max - bt_size_cur - 1, &ctxThread, NULL);
+
+                    jl_ptls_t ptls = jl_all_tls_states[0]; // given only profiling hMainThread
+
+                    // store threadid but add 1 as 0 is preserved to indicate end of block
+                    bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1;
+
+                    // store task id
+                    bt_data_prof[bt_size_cur++].uintptr = ptls->current_task;
+
                     // Mark the end of this block with 0
                     bt_data_prof[bt_size_cur++].uintptr = 0;
                 }
diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index 49f5020d277c4..1266b79eec90d 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -39,11 +39,11 @@ end
 """
     init(; n::Integer, delay::Real))
 
-Configure the `delay` between backtraces (measured in seconds), and the number `n` of
-instruction pointers that may be stored per thread. Each instruction pointer corresponds to a single
-line of code; backtraces generally consist of a long list of instruction pointers. Current
-settings can be obtained by calling this function with no arguments, and each can be set
-independently using keywords or in the order `(n, delay)`.
+Configure the `delay` between backtraces (measured in seconds), and the number `n` of instruction pointers that may be
+stored per thread. Each instruction pointer corresponds to a single line of code; backtraces generally consist of a long
+list of instruction pointers. Note that 3 instruction pointers per backtrace are used to store metadata. Current settings can be
+obtained by calling this function with no arguments, and each can be set independently using keywords or in the order
+`(n, delay)`.
 
 !!! compat "Julia 1.8"
     As of Julia 1.8, this function allocates space for `n` instruction pointers per thread being profiled.
@@ -141,6 +141,9 @@ The keyword arguments can be any combination of:
     line, `:count` sorts in order of number of collected samples, and `:overhead` sorts by the number of samples
     incurred by each function by itself.
 
+ - `groupby` -- Controls grouping over tasks and threads, or no grouping. Options are `:none` (default), `:threads`, `:tasks`,
+    `[:threads, :tasks]`, or `[:tasks, :threads]` where the last two provide nested grouping.
+
  - `noisefloor` -- Limits frames that exceed the heuristic noise floor of the sample (only applies to format `:tree`).
     A suggested value to try for this is 2.0 (the default is 0). This parameter hides samples for which `n <= noisefloor * √N`,
     where `n` is the number of samples on this line, and `N` is the number of samples for the callee.
@@ -150,9 +153,15 @@ The keyword arguments can be any combination of:
  - `recur` -- Controls the recursion handling in `:tree` format. `:off` (default) prints the tree as normal. `:flat` instead
     compresses any recursion (by ip), showing the approximate effect of converting any self-recursion into an iterator.
     `:flatc` does the same but also includes collapsing of C frames (may do odd things around `jl_apply`).
+
+ - `threads::Union{Int,AbstractVector{Int}}` -- Specify which threads to include snapshots from in the report. Note that
+    this does not control which threads samples are collected on.
+
+ - `tasks::Union{Int,AbstractVector{Int}}` -- Specify which tasks to include snapshots from in the report. Note that this
+    does not control which tasks samples are collected within.
 """
 function print(io::IO,
-        data::Vector{<:Unsigned} = fetch(),
+        data::Vector{<:Unsigned} = fetch(include_meta = true),
         lidict::Union{LineInfoDict, LineInfoFlatDict} = getdict(data)
         ;
         format = :tree,
@@ -162,24 +171,86 @@ function print(io::IO,
         mincount::Int = 0,
         noisefloor = 0,
         sortedby::Symbol = :filefuncline,
-        recur::Symbol = :off)
-    print(io, data, lidict, ProfileFormat(
-            C = C,
-            combine = combine,
-            maxdepth = maxdepth,
-            mincount = mincount,
-            noisefloor = noisefloor,
-            sortedby = sortedby,
-            recur = recur),
-        format)
+        groupby::Union{Symbol,AbstractVector{Symbol}} = :none,
+        recur::Symbol = :off,
+        threads::Union{Int,AbstractVector{Int}} = 1:Threads.nthreads(),
+        tasks::Union{UInt,AbstractVector{UInt}} = typemin(UInt):typemax(UInt))
+
+    pf = ProfileFormat(;C, combine, maxdepth, mincount, noisefloor, sortedby, recur)
+    if groupby == :none
+        print(io, data, lidict, pf, format, threads, tasks, false)
+    else
+        if !in(groupby, [:thread, :task, [:task, :thread], [:thread, :task]])
+            error(ArgumentError("Unrecognized groupby option: $groupby. Options are :none (default), :task, :thread, [:task, :thread], or [:thread, :task]"))
+        elseif Sys.iswindows() && in(groupby, [:thread, [:task, :thread], [:thread, :task]])
+            @warn "Profiling on windows is limited to the main thread. Other threads have not been sampled and will not show in the report"
+        end
+        any_nosamples = false
+        println(io, "Overhead ╎ [+additional indent] Count File:Line; Function")
+        println(io, "=========================================================")
+        if groupby == [:task, :thread]
+            for taskid in _intersect(get_task_ids(data), tasks)
+                threadids = _intersect(get_thread_ids(data, taskid), threads)
+                if length(threadids) == 0
+                    any_nosamples = true
+                else
+                    nl = length(threadids) > 1 ? "\n" : ""
+                    printstyled(io, "Task $(Base.repr(taskid))$nl"; bold=true, color=Base.debug_color())
+                    for threadid in threadids
+                        printstyled(io, " Thread $threadid\n"; bold=true, color=Base.info_color())
+                        nosamples = print(io, data, lidict, pf, format, threadid, taskid, true)
+                        nosamples && (any_nosamples = true)
+                        println(io)
+                    end
+                end
+            end
+        elseif groupby == [:thread, :task]
+            for threadid in _intersect(get_thread_ids(data), threads)
+                taskids = _intersect(get_task_ids(data, threadid), tasks)
+                if length(taskids) == 0
+                    any_nosamples = true
+                else
+                    nl = length(taskids) > 1 ? "\n" : ""
+                    printstyled(io, "Thread $threadid$nl"; bold=true, color=Base.info_color())
+                    for taskid in taskids
+                        printstyled(io, " Task $(Base.repr(taskid))\n"; bold=true, color=Base.debug_color())
+                        nosamples = print(io, data, lidict, pf, format, threadid, taskid, true)
+                        nosamples && (any_nosamples = true)
+                        println(io)
+                    end
+                end
+            end
+        elseif groupby == :task
+            threads = 1:typemax(Int)
+            for taskid in _intersect(get_task_ids(data), tasks)
+                printstyled(io, "Task $(Base.repr(taskid))\n"; bold=true, color=Base.debug_color())
+                nosamples = print(io, data, lidict, pf, format, threads, taskid, true)
+                nosamples && (any_nosamples = true)
+                println(io)
+            end
+        elseif groupby == :thread
+            tasks = 1:typemax(UInt)
+            for threadid in _intersect(get_thread_ids(data), threads)
+                printstyled(io, "Thread $threadid\n"; bold=true, color=Base.info_color())
+                nosamples = print(io, data, lidict, pf, format, threadid, tasks, true)
+                nosamples && (any_nosamples = true)
+                println(io)
+            end
+        end
+        any_nosamples && warning_empty(summary = true)
+    end
+    return
 end
 
-function print(io::IO, data::Vector{<:Unsigned}, lidict::Union{LineInfoDict, LineInfoFlatDict}, fmt::ProfileFormat, format::Symbol)
+function print(io::IO, data::Vector{<:Unsigned}, lidict::Union{LineInfoDict, LineInfoFlatDict}, fmt::ProfileFormat,
+                format::Symbol, threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}},
+                is_subsection::Bool = false)
     cols::Int = Base.displaysize(io)[2]
     data = convert(Vector{UInt64}, data)
     fmt.recur ∈ (:off, :flat, :flatc) || throw(ArgumentError("recur value not recognized"))
     if format === :tree
-        tree(io, data, lidict, cols, fmt)
+        nosamples = tree(io, data, lidict, cols, fmt, threads, tasks, is_subsection)
+        return nosamples
     elseif format === :flat
         fmt.recur === :off || throw(ArgumentError("format flat only implements recur=:off"))
         flat(io, data, lidict, cols, fmt)
@@ -188,6 +259,32 @@ function print(io::IO, data::Vector{<:Unsigned}, lidict::Union{LineInfoDict, Lin
     end
 end
 
+function get_task_ids(data::Vector{<:Unsigned}, threadid = nothing)
+    taskids = UInt[]
+    for i in length(data):-1:1
+        if data[i] == 0 # find start of block
+            if isnothing(threadid) || data[i - 3] == threadid
+                taskid = data[i - 2]
+                !in(taskid, taskids) && push!(taskids, taskid)
+            end
+        end
+    end
+    return taskids
+end
+
+function get_thread_ids(data::Vector{<:Unsigned}, taskid = nothing)
+    threadids = Int[]
+    for i in length(data):-1:1
+        if data[i] == 0 # find start of block
+            if isnothing(taskid) || data[i - 2] == taskid
+                threadid = data[i - 3]
+                !in(threadid, threadids) && push!(threadids, threadid)
+            end
+        end
+    end
+    return sort(threadids)
+end
+
 """
     print([io::IO = stdout,] data::Vector, lidict::LineInfoDict; kwargs...)
 
@@ -197,19 +294,19 @@ a dictionary `lidict` of line information.
 
 See `Profile.print([io], data)` for an explanation of the valid keyword arguments.
 """
-print(data::Vector{<:Unsigned} = fetch(), lidict::Union{LineInfoDict, LineInfoFlatDict} = getdict(data); kwargs...) =
+print(data::Vector{<:Unsigned} = fetch(include_meta = true), lidict::Union{LineInfoDict, LineInfoFlatDict} = getdict(data); kwargs...) =
     print(stdout, data, lidict; kwargs...)
 
 """
-    retrieve() -> data, lidict
+    retrieve(; kwargs...) -> data, lidict
 
 "Exports" profiling results in a portable format, returning the set of all backtraces
 (`data`) and a dictionary that maps the (session-specific) instruction pointers in `data` to
 `LineInfo` values that store the file name, function name, and line number. This function
 allows you to save profiling results for future analysis.
 """
-function retrieve()
-    data = fetch()
+function retrieve(; kwargs...)
+    data = fetch(; kwargs...)
     return (data, getdict(data))
 end
 
@@ -384,14 +481,15 @@ error_codes = Dict(
 
 
 """
-    fetch() -> data
+    fetch(;include_meta = false) -> data
 
 Returns a copy of the buffer of profile backtraces. Note that the
 values in `data` have meaning only on this machine in the current session, because it
 depends on the exact memory addresses used in JIT-compiling. This function is primarily for
 internal use; [`retrieve`](@ref) may be a better choice for most users.
+By default metadata such as threadid and taskid will be stripped. Set `include_meta` to `true` to include metadata.
 """
-function fetch()
+function fetch(;include_meta = false)
     maxlen = maxlen_data()
     len = len_data()
     if is_buffer_full()
@@ -401,7 +499,24 @@ function fetch()
     end
     data = Vector{UInt}(undef, len)
     GC.@preserve data unsafe_copyto!(pointer(data), get_data_pointer(), len)
-    return data
+    if include_meta
+        return data
+    else
+        nblocks = count(iszero, data)
+        nmeta = 2 # number of metadata fields (threadid, taskid)
+        data_stripped = Vector{UInt}(undef, length(data) - (nblocks * nmeta))
+        j = length(data_stripped)
+        i = length(data)
+        while i > 0
+            data_stripped[j] = data[i]
+            if data[i] == 0
+                i -= nmeta
+            end
+            i -= 1
+            j -= 1
+        end
+        return data_stripped
+    end
 end
 
 
@@ -627,14 +742,25 @@ function tree_format(frames::Vector{<:StackFrameTree}, level::Int, cols::Int, ma
 end
 
 # turn a list of backtraces into a tree (implicitly separated by NULL markers)
-function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, C::Bool, recur::Symbol) where {T}
+function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, C::Bool, recur::Symbol,
+                threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}) where {T}
     parent = root
     tops = Vector{StackFrameTree{T}}()
     build = Vector{StackFrameTree{T}}()
     startframe = length(all)
+    skip = false
     for i in startframe:-1:1
+        startframe - 1 <= i <= startframe - 2 && continue # skip metadata (its read ahead below)
         ip = all[i]
         if ip == 0
+            # read metadata
+            taskid = all[i - 1]
+            threadid = all[i - 2]
+            if !in(threadid, threads) || !in(taskid, tasks)
+                skip = true
+                continue
+            end
+            skip = false
             # sentinel value indicates the start of a new backtrace
             empty!(build)
             root.recur = 0
@@ -661,7 +787,7 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI
             parent = root
             root.count += 1
             startframe = i
-        else
+        elseif !skip
             pushfirst!(build, parent)
             if recur === :flat || recur === :flatc
                 # Rewind the `parent` tree back, if this exact ip was already present *higher* in the current tree
@@ -702,6 +828,7 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI
                 parent = this
                 continue
             end
+
             frames = lidict[ip]
             nframes = (frames isa Vector ? length(frames) : 1)
             this = parent
@@ -758,12 +885,14 @@ end
 
 # Print the stack frame tree starting at a particular root. Uses a worklist to
 # avoid stack overflows.
-function print_tree(io::IO, bt::StackFrameTree{T}, cols::Int, fmt::ProfileFormat) where T
+function print_tree(io::IO, bt::StackFrameTree{T}, cols::Int, fmt::ProfileFormat, is_subsection::Bool) where T
     maxes = maxstats(bt)
     filenamemap = Dict{Symbol,String}()
     worklist = [(bt, 0, 0, "")]
-    println(io, "Overhead ╎ [+additional indent] Count File:Line; Function")
-    println(io, "=========================================================")
+    if !is_subsection
+        println(io, "Overhead ╎ [+additional indent] Count File:Line; Function")
+        println(io, "=========================================================")
+    end
     while !isempty(worklist)
         (bt, level, noisefloor, str) = popfirst!(worklist)
         isempty(str) || println(io, str)
@@ -797,21 +926,28 @@ function print_tree(io::IO, bt::StackFrameTree{T}, cols::Int, fmt::ProfileFormat
             pushfirst!(worklist, (down, level + 1, noisefloor_down, str))
         end
     end
+    return
 end
 
-function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, cols::Int, fmt::ProfileFormat)
+function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, cols::Int, fmt::ProfileFormat,
+                threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}, is_subsection::Bool)
     if fmt.combine
-        root = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur)
+        root = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur, threads, tasks)
     else
-        root = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur)
+        root = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur, threads, tasks)
     end
     if isempty(root.down)
-        warning_empty()
-        return
+        if is_subsection
+            Base.print(io, "Total snapshots: ")
+            printstyled(io, "$(root.count)\n", color=Base.warn_color())
+        else
+            warning_empty()
+        end
+        return true
     end
-    print_tree(io, root, cols, fmt)
+    print_tree(io, root, cols, fmt, is_subsection)
     Base.println(io, "Total snapshots: ", root.count)
-    nothing
+    return false
 end
 
 function callersf(matchfunc::Function, bt::Vector, lidict::LineInfoFlatDict)
@@ -875,9 +1011,21 @@ function liperm(lilist::Vector{StackFrame})
     return sortperm(lilist, lt = lt)
 end
 
-warning_empty() = @warn """
-            There were no samples collected. Run your program longer (perhaps by
-            running it multiple times), or adjust the delay between samples with
-            `Profile.init()`."""
+warning_empty(;summary = false) = @warn """
+        There were no samples collected$(summary ? " in one or more groups" : "").
+        Run your program longer (perhaps by running it multiple times),
+        or adjust the delay between samples with `Profile.init()`."""
+
+
+# Given Base.intersect isn't efficient for mixtures of UnitRange and Vectors
+# TODO: Replace once https://github.com/JuliaLang/julia/pull/41769 has merged
+function _intersect(v::AbstractVector, r::AbstractRange)
+    common = Iterators.filter(x -> x ∈ r, v)
+    seen = Set{eltype(v)}(common)
+    return Base.vectorfilter(Base._shrink_filter!(seen), common)
+end
+_intersect(r::AbstractRange, v::AbstractVector) = _intersect(v, r)
+_intersect(a, b) = Base.intersect(a, b)
+
 
 end # module
diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl
index 76f8a3a1b8ca8..cf0790e606615 100644
--- a/stdlib/Profile/test/runtests.jl
+++ b/stdlib/Profile/test/runtests.jl
@@ -59,6 +59,26 @@ let iobuf = IOBuffer()
     truncate(iobuf, 0)
 end
 
+@testset "Profile.print() groupby options" begin
+    for tasks in [typemin(UInt):typemax(UInt), UInt(0), UInt(0):UInt(1), [UInt(0), UInt(3)]]
+        for threads in [1:Threads.nthreads(), 1, 1:1, 1:2, [1,3]]
+            for groupby in [:none, :thread, :task, [:thread, :task], [:task, :thread]]
+                @testset "tasks = $tasks threads = $threads groupby = $groupby" begin
+                    Profile.print(devnull; groupby, threads, tasks)
+                end
+            end
+        end
+    end
+end
+
+@testset "Profile.fetch() with and without meta" begin
+    data_without = Profile.fetch()
+    data_with = Profile.fetch(include_meta = true)
+    @test data_without[1] == data_with[1]
+    @test data_without[end] == data_with[end]
+    @test length(data_without) < length(data_with)
+end
+
 Profile.clear()
 @test isempty(Profile.fetch())
 

From 9a16479485a536bfbd828b83123a52d952d5abee Mon Sep 17 00:00:00 2001
From: Ian Butterworth <i.r.butterworth@gmail.com>
Date: Sun, 8 Aug 2021 03:31:49 -0400
Subject: [PATCH 02/13] store time of each profile sample

---
 src/signals-mach.c            | 3 +++
 src/signals-unix.c            | 3 +++
 src/signals-win.c             | 3 +++
 stdlib/Profile/src/Profile.jl | 9 +++++----
 4 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/signals-mach.c b/src/signals-mach.c
index 8ec6e8d7bb1df..848e602ade188 100644
--- a/src/signals-mach.c
+++ b/src/signals-mach.c
@@ -596,6 +596,9 @@ void *mach_profile_listener(void *arg)
                 // store task id
                 bt_data_prof[bt_size_cur++].uintptr = ptls->current_task;
 
+                // store time
+                bt_data_prof[bt_size_cur++].uintptr = jl_hrtime();
+
                 // Mark the end of this block with 0
                 bt_data_prof[bt_size_cur++].uintptr = 0;
             }
diff --git a/src/signals-unix.c b/src/signals-unix.c
index 639dba0b990d8..d6a26859547f5 100644
--- a/src/signals-unix.c
+++ b/src/signals-unix.c
@@ -794,6 +794,9 @@ static void *signal_listener(void *arg)
                     // store task id
                     bt_data_prof[bt_size_cur++].uintptr = ptls->current_task;
 
+                    // store time
+                    bt_data_prof[bt_size_cur++].uintptr = jl_hrtime();
+
                     // Mark the end of this block with 0
                     bt_data_prof[bt_size_cur++].uintptr = 0;
                 }
diff --git a/src/signals-win.c b/src/signals-win.c
index 400e5a469a4fd..ffe7b0a38fa42 100644
--- a/src/signals-win.c
+++ b/src/signals-win.c
@@ -369,6 +369,9 @@ static DWORD WINAPI profile_bt( LPVOID lparam )
                     // store task id
                     bt_data_prof[bt_size_cur++].uintptr = ptls->current_task;
 
+                    // store time
+                    bt_data_prof[bt_size_cur++].uintptr = jl_hrtime();
+
                     // Mark the end of this block with 0
                     bt_data_prof[bt_size_cur++].uintptr = 0;
                 }
diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index 1266b79eec90d..acdb20161fc32 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -503,7 +503,7 @@ function fetch(;include_meta = false)
         return data
     else
         nblocks = count(iszero, data)
-        nmeta = 2 # number of metadata fields (threadid, taskid)
+        nmeta = 3 # number of metadata fields (threadid, taskid, time)
         data_stripped = Vector{UInt}(undef, length(data) - (nblocks * nmeta))
         j = length(data_stripped)
         i = length(data)
@@ -750,12 +750,13 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI
     startframe = length(all)
     skip = false
     for i in startframe:-1:1
-        startframe - 1 <= i <= startframe - 2 && continue # skip metadata (its read ahead below)
+        startframe - 1 <= i <= startframe - 3 && continue # skip metadata (its read ahead below)
         ip = all[i]
         if ip == 0
             # read metadata
-            taskid = all[i - 1]
-            threadid = all[i - 2]
+            # time = all[i - 1]
+            taskid = all[i - 2]
+            threadid = all[i - 3]
             if !in(threadid, threads) || !in(taskid, tasks)
                 skip = true
                 continue

From c339d5f9c49b17f384859f3422b5ad8898e474bc Mon Sep 17 00:00:00 2001
From: Ian Butterworth <i.r.butterworth@gmail.com>
Date: Sun, 8 Aug 2021 04:04:03 -0400
Subject: [PATCH 03/13] fix flat printing

---
 stdlib/Profile/src/Profile.jl | 80 ++++++++++++++++++++++-------------
 1 file changed, 51 insertions(+), 29 deletions(-)

diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index acdb20161fc32..0b14d54e56528 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -253,7 +253,8 @@ function print(io::IO, data::Vector{<:Unsigned}, lidict::Union{LineInfoDict, Lin
         return nosamples
     elseif format === :flat
         fmt.recur === :off || throw(ArgumentError("format flat only implements recur=:off"))
-        flat(io, data, lidict, cols, fmt)
+        nosamples = flat(io, data, lidict, cols, fmt, threads, tasks, is_subsection)
+        return nosamples
     else
         throw(ArgumentError("output format $(repr(format)) not recognized"))
     end
@@ -523,7 +524,8 @@ end
 ## Print as a flat list
 # Counts the number of times each line appears, at any nesting level and at the topmost level
 # Merging multiple equivalent entries and recursive calls
-function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, C::Bool) where {T}
+function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, C::Bool,
+                    threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}) where {T}
     lilist = StackFrame[]
     n = Int[]
     m = Int[]
@@ -531,32 +533,46 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict,
     recursive = Set{T}()
     first = true
     totalshots = 0
-    for ip in data
+    startframe = length(data)
+    skip = false
+    for i in startframe:-1:1
+        startframe - 1 <= i <= startframe - 3 && continue # skip metadata (it's read ahead below)
+        ip = data[i]
         if ip == 0
+            # read metadata
+            # time = data[i - 1]
+            taskid = data[i - 2]
+            threadid = data[i - 3]
+            if !in(threadid, threads) || !in(taskid, tasks)
+                skip = true
+                continue
+            end
+            skip = false
             totalshots += 1
             empty!(recursive)
             first = true
-            continue
-        end
-        frames = lidict[ip]
-        nframes = (frames isa Vector ? length(frames) : 1)
-        for i = 1:nframes
-            frame = (frames isa Vector ? frames[i] : frames)
-            !C && frame.from_c && continue
-            key = (T === UInt64 ? ip : frame)
-            idx = get!(lilist_idx, key, length(lilist) + 1)
-            if idx > length(lilist)
-                push!(recursive, key)
-                push!(lilist, frame)
-                push!(n, 1)
-                push!(m, 0)
-            elseif !(key in recursive)
-                push!(recursive, key)
-                n[idx] += 1
-            end
-            if first
-                m[idx] += 1
-                first = false
+            startframe = i
+        elseif !skip
+            frames = lidict[ip]
+            nframes = (frames isa Vector ? length(frames) : 1)
+            for j = 1:nframes
+                frame = (frames isa Vector ? frames[j] : frames)
+                !C && frame.from_c && continue
+                key = (T === UInt64 ? ip : frame)
+                idx = get!(lilist_idx, key, length(lilist) + 1)
+                if idx > length(lilist)
+                    push!(recursive, key)
+                    push!(lilist, frame)
+                    push!(n, 1)
+                    push!(m, 0)
+                elseif !(key in recursive)
+                    push!(recursive, key)
+                    n[idx] += 1
+                end
+                if first
+                    m[idx] += 1
+                    first = false
+                end
             end
         end
     end
@@ -564,11 +580,17 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict,
     return (lilist, n, m, totalshots)
 end
 
-function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, cols::Int, fmt::ProfileFormat)
-    lilist, n, m, totalshots = parse_flat(fmt.combine ? StackFrame : UInt64, data, lidict, fmt.C)
+function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, cols::Int, fmt::ProfileFormat,
+                threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}, is_subsection::Bool)
+    lilist, n, m, totalshots = parse_flat(fmt.combine ? StackFrame : UInt64, data, lidict, fmt.C, threads, tasks)
     if isempty(lilist)
-        warning_empty()
-        return
+        if is_subsection
+            Base.print(io, "Total snapshots: ")
+            printstyled(io, "$(totalshots)\n", color=Base.warn_color())
+        else
+            warning_empty()
+        end
+        return true
     end
     if false # optional: drop the "non-interpretable" ones
         keep = map(frame -> frame != UNKNOWN && frame.line != 0, lilist)
@@ -579,7 +601,7 @@ function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfo
     filenamemap = Dict{Symbol,String}()
     print_flat(io, lilist, n, m, cols, filenamemap, fmt)
     Base.println(io, "Total snapshots: ", totalshots)
-    nothing
+    return false
 end
 
 function print_flat(io::IO, lilist::Vector{StackFrame},

From 6c4649bbc97b66f622ea297e6cd4909dc2b47ab4 Mon Sep 17 00:00:00 2001
From: Ian Butterworth <i.r.butterworth@gmail.com>
Date: Sun, 8 Aug 2021 04:22:36 -0400
Subject: [PATCH 04/13] fix tests

---
 stdlib/Profile/Project.toml     |  5 +++--
 stdlib/Profile/test/runtests.jl | 14 ++++++++------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/stdlib/Profile/Project.toml b/stdlib/Profile/Project.toml
index 6aca0601439e3..1d13dad22233a 100644
--- a/stdlib/Profile/Project.toml
+++ b/stdlib/Profile/Project.toml
@@ -5,8 +5,9 @@ uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 
 [extras]
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "Serialization"]
+test = ["Logging", "Serialization", "Test"]
diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl
index cf0790e606615..777122e571aed 100644
--- a/stdlib/Profile/test/runtests.jl
+++ b/stdlib/Profile/test/runtests.jl
@@ -1,6 +1,6 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
-using Test, Profile, Serialization
+using Test, Profile, Serialization, Logging
 
 Profile.clear()
 Profile.init()
@@ -60,11 +60,13 @@ let iobuf = IOBuffer()
 end
 
 @testset "Profile.print() groupby options" begin
-    for tasks in [typemin(UInt):typemax(UInt), UInt(0), UInt(0):UInt(1), [UInt(0), UInt(3)]]
-        for threads in [1:Threads.nthreads(), 1, 1:1, 1:2, [1,3]]
-            for groupby in [:none, :thread, :task, [:thread, :task], [:task, :thread]]
-                @testset "tasks = $tasks threads = $threads groupby = $groupby" begin
-                    Profile.print(devnull; groupby, threads, tasks)
+    iobuf = IOBuffer()
+    with_logger(NullLogger()) do
+        @testset for format in [:flat, :tree]
+            @testset for threads in [1:Threads.nthreads(), 1, 1:1, 1:2, [1,2]]
+                @testset for groupby in [:none, :thread, :task, [:thread, :task], [:task, :thread]]
+                    Profile.print(iobuf; groupby, threads, format)
+                    @test !isempty(String(take!(iobuf)))
                 end
             end
         end

From 0201ef20d8a62934912dcdc39ba4b8f914040686 Mon Sep 17 00:00:00 2001
From: Ian <i.r.butterworth@gmail.com>
Date: Sun, 8 Aug 2021 21:17:32 -0400
Subject: [PATCH 05/13] store cycleclock rather than time_ns

---
 src/signals-mach.c            |  4 ++--
 src/signals-unix.c            |  4 ++--
 src/signals-win.c             |  4 ++--
 stdlib/Profile/src/Profile.jl | 12 ++++++------
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/signals-mach.c b/src/signals-mach.c
index 848e602ade188..ab21cf4824fb7 100644
--- a/src/signals-mach.c
+++ b/src/signals-mach.c
@@ -596,8 +596,8 @@ void *mach_profile_listener(void *arg)
                 // store task id
                 bt_data_prof[bt_size_cur++].uintptr = ptls->current_task;
 
-                // store time
-                bt_data_prof[bt_size_cur++].uintptr = jl_hrtime();
+                // store cpu cycle clock
+                bt_data_prof[bt_size_cur++].uintptr = cycleclock();
 
                 // Mark the end of this block with 0
                 bt_data_prof[bt_size_cur++].uintptr = 0;
diff --git a/src/signals-unix.c b/src/signals-unix.c
index d6a26859547f5..ed2dd8239ed10 100644
--- a/src/signals-unix.c
+++ b/src/signals-unix.c
@@ -794,8 +794,8 @@ static void *signal_listener(void *arg)
                     // store task id
                     bt_data_prof[bt_size_cur++].uintptr = ptls->current_task;
 
-                    // store time
-                    bt_data_prof[bt_size_cur++].uintptr = jl_hrtime();
+                    // store cpu cycle clock
+                    bt_data_prof[bt_size_cur++].uintptr = cycleclock();
 
                     // Mark the end of this block with 0
                     bt_data_prof[bt_size_cur++].uintptr = 0;
diff --git a/src/signals-win.c b/src/signals-win.c
index ffe7b0a38fa42..bf190abad5f1a 100644
--- a/src/signals-win.c
+++ b/src/signals-win.c
@@ -369,8 +369,8 @@ static DWORD WINAPI profile_bt( LPVOID lparam )
                     // store task id
                     bt_data_prof[bt_size_cur++].uintptr = ptls->current_task;
 
-                    // store time
-                    bt_data_prof[bt_size_cur++].uintptr = jl_hrtime();
+                    // store cpu cycle clock
+                    bt_data_prof[bt_size_cur++].uintptr = cycleclock();
 
                     // Mark the end of this block with 0
                     bt_data_prof[bt_size_cur++].uintptr = 0;
diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index 0b14d54e56528..2a3401885b8c9 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -41,9 +41,9 @@ end
 
 Configure the `delay` between backtraces (measured in seconds), and the number `n` of instruction pointers that may be
 stored per thread. Each instruction pointer corresponds to a single line of code; backtraces generally consist of a long
-list of instruction pointers. Note that 3 instruction pointers per backtrace are used to store metadata. Current settings can be
-obtained by calling this function with no arguments, and each can be set independently using keywords or in the order
-`(n, delay)`.
+list of instruction pointers. Note that 4 instruction pointers per backtrace are used to store metadata and markers. Current
+settings can be obtained by calling this function with no arguments, and each can be set independently using keywords or
+in the order `(n, delay)`.
 
 !!! compat "Julia 1.8"
     As of Julia 1.8, this function allocates space for `n` instruction pointers per thread being profiled.
@@ -504,7 +504,7 @@ function fetch(;include_meta = false)
         return data
     else
         nblocks = count(iszero, data)
-        nmeta = 3 # number of metadata fields (threadid, taskid, time)
+        nmeta = 3 # number of metadata fields (threadid, taskid, cpu_cycle_clock)
         data_stripped = Vector{UInt}(undef, length(data) - (nblocks * nmeta))
         j = length(data_stripped)
         i = length(data)
@@ -540,7 +540,7 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict,
         ip = data[i]
         if ip == 0
             # read metadata
-            # time = data[i - 1]
+            # cpu_cycle_clock = data[i - 1]
             taskid = data[i - 2]
             threadid = data[i - 3]
             if !in(threadid, threads) || !in(taskid, tasks)
@@ -776,7 +776,7 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI
         ip = all[i]
         if ip == 0
             # read metadata
-            # time = all[i - 1]
+            # cpu_cycle_clock = all[i - 1]
             taskid = all[i - 2]
             threadid = all[i - 3]
             if !in(threadid, threads) || !in(taskid, tasks)

From e97b3696e7e8e6de17a477530acd65466f6790f2 Mon Sep 17 00:00:00 2001
From: Ian <i.r.butterworth@gmail.com>
Date: Fri, 13 Aug 2021 00:35:20 -0400
Subject: [PATCH 06/13] add sleep_check_state to metadata and show %
 utilization

---
 src/signal-handling.c         |  4 +-
 src/signals-mach.c            |  3 ++
 src/signals-unix.c            |  3 ++
 src/signals-win.c             |  3 ++
 stdlib/Profile/src/Profile.jl | 89 ++++++++++++++++++++++++-----------
 5 files changed, 73 insertions(+), 29 deletions(-)

diff --git a/src/signal-handling.c b/src/signal-handling.c
index 23fa69f6e4475..2939d73b25e81 100644
--- a/src/signal-handling.c
+++ b/src/signal-handling.c
@@ -37,8 +37,8 @@ void jl_shuffle_int_array_inplace(volatile uint64_t *carray, size_t size, uint64
 
 JL_DLLEXPORT int jl_profile_is_buffer_full(void)
 {
-    // the latter `+ 1` is for the block terminator `0`.
-    return bt_size_cur + (JL_BT_MAX_ENTRY_SIZE + 1) + 1 > bt_size_max;
+    // the `+ 5` is for the block terminator `0` plus 4 metadata entries
+    return bt_size_cur + (JL_BT_MAX_ENTRY_SIZE + 1) + 5 > bt_size_max;
 }
 
 static uint64_t jl_last_sigint_trigger = 0;
diff --git a/src/signals-mach.c b/src/signals-mach.c
index ab21cf4824fb7..cfcbf64e6a55e 100644
--- a/src/signals-mach.c
+++ b/src/signals-mach.c
@@ -599,6 +599,9 @@ void *mach_profile_listener(void *arg)
                 // store cpu cycle clock
                 bt_data_prof[bt_size_cur++].uintptr = cycleclock();
 
+                // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
+                bt_data_prof[bt_size_cur++].uintptr = ptls->sleep_check_state + 1;
+
                 // Mark the end of this block with 0
                 bt_data_prof[bt_size_cur++].uintptr = 0;
             }
diff --git a/src/signals-unix.c b/src/signals-unix.c
index ed2dd8239ed10..414504d8f742f 100644
--- a/src/signals-unix.c
+++ b/src/signals-unix.c
@@ -797,6 +797,9 @@ static void *signal_listener(void *arg)
                     // store cpu cycle clock
                     bt_data_prof[bt_size_cur++].uintptr = cycleclock();
 
+                    // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
+                    bt_data_prof[bt_size_cur++].uintptr = ptls->sleep_check_state + 1;
+
                     // Mark the end of this block with 0
                     bt_data_prof[bt_size_cur++].uintptr = 0;
                 }
diff --git a/src/signals-win.c b/src/signals-win.c
index bf190abad5f1a..ba12f683cc713 100644
--- a/src/signals-win.c
+++ b/src/signals-win.c
@@ -372,6 +372,9 @@ static DWORD WINAPI profile_bt( LPVOID lparam )
                     // store cpu cycle clock
                     bt_data_prof[bt_size_cur++].uintptr = cycleclock();
 
+                    // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
+                    bt_data_prof[bt_size_cur++].uintptr = ptls->sleep_check_state + 1;
+
                     // Mark the end of this block with 0
                     bt_data_prof[bt_size_cur++].uintptr = 0;
                 }
diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index 2a3401885b8c9..cc6ac288904c8 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -41,9 +41,9 @@ end
 
 Configure the `delay` between backtraces (measured in seconds), and the number `n` of instruction pointers that may be
 stored per thread. Each instruction pointer corresponds to a single line of code; backtraces generally consist of a long
-list of instruction pointers. Note that 4 instruction pointers per backtrace are used to store metadata and markers. Current
-settings can be obtained by calling this function with no arguments, and each can be set independently using keywords or
-in the order `(n, delay)`.
+list of instruction pointers. Note that 5 spaces for instruction pointers per backtrace are used to store metadata and a marker.
+Current settings can be obtained by calling this function with no arguments, and each can be set independently using keywords
+or in the order `(n, delay)`.
 
 !!! compat "Julia 1.8"
     As of Julia 1.8, this function allocates space for `n` instruction pointers per thread being profiled.
@@ -264,8 +264,8 @@ function get_task_ids(data::Vector{<:Unsigned}, threadid = nothing)
     taskids = UInt[]
     for i in length(data):-1:1
         if data[i] == 0 # find start of block
-            if isnothing(threadid) || data[i - 3] == threadid
-                taskid = data[i - 2]
+            if isnothing(threadid) || data[i - 4] == threadid
+                taskid = data[i - 3]
                 !in(taskid, taskids) && push!(taskids, taskid)
             end
         end
@@ -277,8 +277,8 @@ function get_thread_ids(data::Vector{<:Unsigned}, taskid = nothing)
     threadids = Int[]
     for i in length(data):-1:1
         if data[i] == 0 # find start of block
-            if isnothing(taskid) || data[i - 2] == taskid
-                threadid = data[i - 3]
+            if isnothing(taskid) || data[i - 3] == taskid
+                threadid = data[i - 4]
                 !in(threadid, threadids) && push!(threadids, threadid)
             end
         end
@@ -504,7 +504,7 @@ function fetch(;include_meta = false)
         return data
     else
         nblocks = count(iszero, data)
-        nmeta = 3 # number of metadata fields (threadid, taskid, cpu_cycle_clock)
+        nmeta = 4 # number of metadata fields (threadid, taskid, cpu_cycle_clock, thread_sleeping)
         data_stripped = Vector{UInt}(undef, length(data) - (nblocks * nmeta))
         j = length(data_stripped)
         i = length(data)
@@ -535,18 +535,23 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict,
     totalshots = 0
     startframe = length(data)
     skip = false
+    nsleeping = 0
     for i in startframe:-1:1
-        startframe - 1 <= i <= startframe - 3 && continue # skip metadata (it's read ahead below)
+        startframe - 1 <= i <= startframe - 4 && continue # skip metadata (it's read ahead below)
         ip = data[i]
         if ip == 0
             # read metadata
-            # cpu_cycle_clock = data[i - 1]
-            taskid = data[i - 2]
-            threadid = data[i - 3]
+            thread_sleeping = data[i - 1] - 1 # subtract 1 as state is incremented to avoid being equal to 0
+            # cpu_cycle_clock = data[i - 2]
+            taskid = data[i - 3]
+            threadid = data[i - 4]
             if !in(threadid, threads) || !in(taskid, tasks)
                 skip = true
                 continue
             end
+            if thread_sleeping == 1
+                nsleeping += 1
+            end
             skip = false
             totalshots += 1
             empty!(recursive)
@@ -577,16 +582,18 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict,
         end
     end
     @assert length(lilist) == length(n) == length(m) == length(lilist_idx)
-    return (lilist, n, m, totalshots)
+    return (lilist, n, m, totalshots, nsleeping)
 end
 
 function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, cols::Int, fmt::ProfileFormat,
                 threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}, is_subsection::Bool)
-    lilist, n, m, totalshots = parse_flat(fmt.combine ? StackFrame : UInt64, data, lidict, fmt.C, threads, tasks)
+    lilist, n, m, totalshots, nsleeping = parse_flat(fmt.combine ? StackFrame : UInt64, data, lidict, fmt.C, threads, tasks)
+    util_perc = (1 - (nsleeping / totalshots)) * 100
     if isempty(lilist)
         if is_subsection
             Base.print(io, "Total snapshots: ")
-            printstyled(io, "$(totalshots)\n", color=Base.warn_color())
+            printstyled(io, "$(totalshots)", color=Base.warn_color())
+            Base.println(io, " (", round(Int, util_perc), "% utilization)")
         else
             warning_empty()
         end
@@ -600,7 +607,12 @@ function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfo
     end
     filenamemap = Dict{Symbol,String}()
     print_flat(io, lilist, n, m, cols, filenamemap, fmt)
-    Base.println(io, "Total snapshots: ", totalshots)
+    Base.print(io, "Total snapshots: ", totalshots, " (", round(Int, util_perc), "% utilization")
+    if is_subsection
+        println(io, ")")
+    else
+        println(io, " across all threads and tasks. Use the `groupby` kwarg to break down by thread and/or task)")
+    end
     return false
 end
 
@@ -771,18 +783,23 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI
     build = Vector{StackFrameTree{T}}()
     startframe = length(all)
     skip = false
+    nsleeping = 0
     for i in startframe:-1:1
-        startframe - 1 <= i <= startframe - 3 && continue # skip metadata (its read ahead below)
+        startframe - 1 <= i <= startframe - 4 && continue # skip metadata (its read ahead below)
         ip = all[i]
         if ip == 0
             # read metadata
-            # cpu_cycle_clock = all[i - 1]
-            taskid = all[i - 2]
-            threadid = all[i - 3]
+            thread_sleeping = all[i - 1] - 1 # subtract 1 as state is incremented to avoid being equal to 0
+            # cpu_cycle_clock = all[i - 2]
+            taskid = all[i - 3]
+            threadid = all[i - 4]
             if !in(threadid, threads) || !in(taskid, tasks)
                 skip = true
                 continue
             end
+            if thread_sleeping == 1
+                nsleeping += 1
+            end
             skip = false
             # sentinel value indicates the start of a new backtrace
             empty!(build)
@@ -886,7 +903,7 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI
         nothing
     end
     cleanup!(root)
-    return root
+    return root, nsleeping
 end
 
 function maxstats(root::StackFrameTree)
@@ -955,21 +972,28 @@ end
 function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, cols::Int, fmt::ProfileFormat,
                 threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}, is_subsection::Bool)
     if fmt.combine
-        root = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur, threads, tasks)
+        root, nsleeping = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur, threads, tasks)
     else
-        root = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur, threads, tasks)
+        root, nsleeping = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur, threads, tasks)
     end
+    util_perc = (1 - (nsleeping / root.count)) * 100
     if isempty(root.down)
         if is_subsection
             Base.print(io, "Total snapshots: ")
-            printstyled(io, "$(root.count)\n", color=Base.warn_color())
+            printstyled(io, "$(root.count)", color=Base.warn_color())
+            Base.println(io, " (", round(Int, util_perc), "% utilization)")
         else
             warning_empty()
         end
         return true
     end
     print_tree(io, root, cols, fmt, is_subsection)
-    Base.println(io, "Total snapshots: ", root.count)
+    Base.print(io, "Total snapshots: ", root.count, " (", round(Int, util_perc), "% utilization")
+    if is_subsection
+        println(io, ")")
+    else
+        println(io, " across all threads and tasks. Use the `groupby` kwarg to break down by thread and/or task)")
+    end
     return false
 end
 
@@ -1034,10 +1058,21 @@ function liperm(lilist::Vector{StackFrame})
     return sortperm(lilist, lt = lt)
 end
 
-warning_empty(;summary = false) = @warn """
-        There were no samples collected$(summary ? " in one or more groups" : "").
+function warning_empty(;summary = false)
+    if summary
+        @warn """
+        There were no samples collected in one or more groups.
+        This may be due to idle threads, or you may need to run your
+        program longer (perhaps by running it multiple times),
+        or adjust the delay between samples with `Profile.init()`."""
+    else
+        @warn """
+        There were no samples collected in one or more groups.
         Run your program longer (perhaps by running it multiple times),
         or adjust the delay between samples with `Profile.init()`."""
+    end
+end
+
 
 
 # Given Base.intersect isn't efficient for mixtures of UnitRange and Vectors

From 3cb3630750d3edffef68b170a4b141f9fef668eb Mon Sep 17 00:00:00 2001
From: Ian <i.r.butterworth@gmail.com>
Date: Sun, 15 Aug 2021 22:25:57 -0400
Subject: [PATCH 07/13] fix ambiguity test failure

---
 stdlib/Profile/src/Profile.jl | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index cc6ac288904c8..d569817465408 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -1083,6 +1083,15 @@ function _intersect(v::AbstractVector, r::AbstractRange)
     return Base.vectorfilter(Base._shrink_filter!(seen), common)
 end
 _intersect(r::AbstractRange, v::AbstractVector) = _intersect(v, r)
+function _intersect(r1::AbstractRange, r2::AbstractRange)
+    # To iterate over the shorter range
+    length(r1) > length(r2) && return _intersect(r2, r1)
+
+    r1 = Base.unique(r1)
+    T = Base.promote_eltype(r1, r2)
+
+    return T[x for x in r1 if x ∈ r2]
+end
 _intersect(a, b) = Base.intersect(a, b)
 
 

From 1a1507c4966e3b1c8a8e17301b82be74cab82d11 Mon Sep 17 00:00:00 2001
From: Ian <i.r.butterworth@gmail.com>
Date: Sun, 15 Aug 2021 22:26:18 -0400
Subject: [PATCH 08/13] fix Profile test failure

---
 stdlib/Profile/src/Profile.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index d569817465408..da8a21f2b7e95 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -1067,7 +1067,7 @@ function warning_empty(;summary = false)
         or adjust the delay between samples with `Profile.init()`."""
     else
         @warn """
-        There were no samples collected in one or more groups.
+        There were no samples collected.
         Run your program longer (perhaps by running it multiple times),
         or adjust the delay between samples with `Profile.init()`."""
     end

From 1a7ce703a9bb71facbde908f55588c3b2a011393 Mon Sep 17 00:00:00 2001
From: Ian <i.r.butterworth@gmail.com>
Date: Sun, 15 Aug 2021 22:35:25 -0400
Subject: [PATCH 09/13] add to NEWS

---
 NEWS.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index f9fb0835afffc..64d249e460001 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -64,6 +64,13 @@ Standard library changes
 #### Printf
 * Now uses `textwidth` for formatting `%s` and `%c` widths ([#41085]).
 
+#### Profile
+* Profiling now records sample metadata including thread and task. `Profile.print()` has a new `groupby` kwarg that allows
+  grouping by thread, task, or nested thread/task, task/thread, and `threads` and `tasks` kwargs to allow filtering.
+  Further, percent utilization is now reported as a total or per-thread, based on whether the thread is idle or not at
+  each sample. `Profile.fetch()` by default strips out the new metadata to ensure backwards compatibility with external
+  profiling data consumers, but can be included with the `include_meta` kwarg. ([#41742])
+
 #### Random
 
 #### REPL

From 1ab855ab46fedc2bba2bfa61b1c7cfe7cef659e4 Mon Sep 17 00:00:00 2001
From: Ian <i.r.butterworth@gmail.com>
Date: Mon, 30 Aug 2021 20:00:12 -0400
Subject: [PATCH 10/13] use new Base.intersect methods

---
 stdlib/Profile/src/Profile.jl | 34 ++++++----------------------------
 1 file changed, 6 insertions(+), 28 deletions(-)

diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index da8a21f2b7e95..34d8a2052753d 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -189,8 +189,8 @@ function print(io::IO,
         println(io, "Overhead ╎ [+additional indent] Count File:Line; Function")
         println(io, "=========================================================")
         if groupby == [:task, :thread]
-            for taskid in _intersect(get_task_ids(data), tasks)
-                threadids = _intersect(get_thread_ids(data, taskid), threads)
+            for taskid in intersect(get_task_ids(data), tasks)
+                threadids = intersect(get_thread_ids(data, taskid), threads)
                 if length(threadids) == 0
                     any_nosamples = true
                 else
@@ -205,8 +205,8 @@ function print(io::IO,
                 end
             end
         elseif groupby == [:thread, :task]
-            for threadid in _intersect(get_thread_ids(data), threads)
-                taskids = _intersect(get_task_ids(data, threadid), tasks)
+            for threadid in intersect(get_thread_ids(data), threads)
+                taskids = intersect(get_task_ids(data, threadid), tasks)
                 if length(taskids) == 0
                     any_nosamples = true
                 else
@@ -222,7 +222,7 @@ function print(io::IO,
             end
         elseif groupby == :task
             threads = 1:typemax(Int)
-            for taskid in _intersect(get_task_ids(data), tasks)
+            for taskid in intersect(get_task_ids(data), tasks)
                 printstyled(io, "Task $(Base.repr(taskid))\n"; bold=true, color=Base.debug_color())
                 nosamples = print(io, data, lidict, pf, format, threads, taskid, true)
                 nosamples && (any_nosamples = true)
@@ -230,7 +230,7 @@ function print(io::IO,
             end
         elseif groupby == :thread
             tasks = 1:typemax(UInt)
-            for threadid in _intersect(get_thread_ids(data), threads)
+            for threadid in intersect(get_thread_ids(data), threads)
                 printstyled(io, "Thread $threadid\n"; bold=true, color=Base.info_color())
                 nosamples = print(io, data, lidict, pf, format, threadid, tasks, true)
                 nosamples && (any_nosamples = true)
@@ -1073,26 +1073,4 @@ function warning_empty(;summary = false)
     end
 end
 
-
-
-# Given Base.intersect isn't efficient for mixtures of UnitRange and Vectors
-# TODO: Replace once https://github.com/JuliaLang/julia/pull/41769 has merged
-function _intersect(v::AbstractVector, r::AbstractRange)
-    common = Iterators.filter(x -> x ∈ r, v)
-    seen = Set{eltype(v)}(common)
-    return Base.vectorfilter(Base._shrink_filter!(seen), common)
-end
-_intersect(r::AbstractRange, v::AbstractVector) = _intersect(v, r)
-function _intersect(r1::AbstractRange, r2::AbstractRange)
-    # To iterate over the shorter range
-    length(r1) > length(r2) && return _intersect(r2, r1)
-
-    r1 = Base.unique(r1)
-    T = Base.promote_eltype(r1, r2)
-
-    return T[x for x in r1 if x ∈ r2]
-end
-_intersect(a, b) = Base.intersect(a, b)
-
-
 end # module

From 47c94f1a9aa47feb6d8e4faa2bbfea903f48fbe8 Mon Sep 17 00:00:00 2001
From: Ian <i.r.butterworth@gmail.com>
Date: Mon, 30 Aug 2021 22:27:53 -0400
Subject: [PATCH 11/13] add assert for checking stripping metadata

---
 stdlib/Profile/src/Profile.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index 34d8a2052753d..c201d2061e720 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -508,7 +508,7 @@ function fetch(;include_meta = false)
         data_stripped = Vector{UInt}(undef, length(data) - (nblocks * nmeta))
         j = length(data_stripped)
         i = length(data)
-        while i > 0
+        while i > 0 && j > 0
             data_stripped[j] = data[i]
             if data[i] == 0
                 i -= nmeta
@@ -516,6 +516,7 @@ function fetch(;include_meta = false)
             i -= 1
             j -= 1
         end
+        @assert i == j == 0 "metadata stripping failed i=$i j=$j"
         return data_stripped
     end
 end

From 973a3f68bfccc91ad167d11c069be42f51b94330 Mon Sep 17 00:00:00 2001
From: Ian <i.r.butterworth@gmail.com>
Date: Tue, 31 Aug 2021 20:25:02 -0400
Subject: [PATCH 12/13] more debug

---
 stdlib/Profile/src/Profile.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index c201d2061e720..4c491b17b4d43 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -516,7 +516,7 @@ function fetch(;include_meta = false)
             i -= 1
             j -= 1
         end
-        @assert i == j == 0 "metadata stripping failed i=$i j=$j"
+        @assert i == j == 0 "metadata stripping failed i=$i j=$j data[1:i]=$(data[1:i])"
         return data_stripped
     end
 end

From 5c7da45ca63165a731b18ef3d32c3bf29d885796 Mon Sep 17 00:00:00 2001
From: Ian Butterworth <i.r.butterworth@gmail.com>
Date: Wed, 1 Sep 2021 23:33:30 -0400
Subject: [PATCH 13/13] check that zero entry actually looks like block end
 (fix linux32)

---
 stdlib/Profile/src/Profile.jl | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index 4c491b17b4d43..f2a39da2b7aa5 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -500,17 +500,26 @@ function fetch(;include_meta = false)
     end
     data = Vector{UInt}(undef, len)
     GC.@preserve data unsafe_copyto!(pointer(data), get_data_pointer(), len)
-    if include_meta
+    if include_meta || isempty(data)
         return data
     else
-        nblocks = count(iszero, data)
+        nblocks = 0
+        for i = 2:length(data)
+            if data[i] == 0 && in(data[i - 1], [1,2])
+                # detect block ends and count them
+                # linux 32 has been seen to have rogue ips equal to 0 so also check for the previous entry looking like an idle
+                # state metadata entry which can only be 1 or 2
+                nblocks += 1
+            end
+        end
         nmeta = 4 # number of metadata fields (threadid, taskid, cpu_cycle_clock, thread_sleeping)
         data_stripped = Vector{UInt}(undef, length(data) - (nblocks * nmeta))
         j = length(data_stripped)
         i = length(data)
         while i > 0 && j > 0
             data_stripped[j] = data[i]
-            if data[i] == 0
+            if i > 1 && data[i] == 0 && in(data[i - 1], [1,2])
+                # detect block end (same approach as above)
                 i -= nmeta
             end
             i -= 1
@@ -540,7 +549,7 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict,
     for i in startframe:-1:1
         startframe - 1 <= i <= startframe - 4 && continue # skip metadata (it's read ahead below)
         ip = data[i]
-        if ip == 0
+        if i > 1 && ip == 0 && in(data[i - 1], [1,2]) # check that the field next to the zero is the idle metadata entry
             # read metadata
             thread_sleeping = data[i - 1] - 1 # subtract 1 as state is incremented to avoid being equal to 0
             # cpu_cycle_clock = data[i - 2]
@@ -788,7 +797,7 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI
     for i in startframe:-1:1
         startframe - 1 <= i <= startframe - 4 && continue # skip metadata (its read ahead below)
         ip = all[i]
-        if ip == 0
+        if i > 1 && ip == 0 && in(all[i - 1], [1,2]) # check that the field next to the zero is the idle metadata entry
             # read metadata
             thread_sleeping = all[i - 1] - 1 # subtract 1 as state is incremented to avoid being equal to 0
             # cpu_cycle_clock = all[i - 2]