From d386c6fc02bbfaeeb307057d182176ea3aa394c4 Mon Sep 17 00:00:00 2001 From: Ian Date: Thu, 29 Jul 2021 19:35:46 -0400 Subject: [PATCH 01/13] Thread and task specific profiling - Adds thread and task ids to profile samples - Implements thread and task selection for Profile.print() - Implements thread and task groupby options for Profile.print() - Add include_meta to Profile.fetch() which defaults to false to ensure backwards compat with external profiling tooling --- contrib/generate_precompile.jl | 6 +- src/signals-mach.c | 7 + src/signals-unix.c | 8 ++ src/signals-win.c | 9 ++ stdlib/Profile/src/Profile.jl | 230 ++++++++++++++++++++++++++------ stdlib/Profile/test/runtests.jl | 20 +++ 6 files changed, 238 insertions(+), 42 deletions(-) diff --git a/contrib/generate_precompile.jl b/contrib/generate_precompile.jl index a9bf83a9f9674..cb9bc8fb019f3 100644 --- a/contrib/generate_precompile.jl +++ b/contrib/generate_precompile.jl @@ -222,7 +222,11 @@ Profile = get(Base.loaded_modules, nothing) if Profile !== nothing hardcoded_precompile_statements *= """ - precompile(Tuple{typeof(Profile.tree!), Profile.StackFrameTree{UInt64}, Vector{UInt64}, Dict{UInt64, Vector{Base.StackTraces.StackFrame}}, Bool, Symbol}) + precompile(Tuple{typeof(Profile.tree!), Profile.StackFrameTree{UInt64}, Vector{UInt64}, Dict{UInt64, Vector{Base.StackTraces.StackFrame}}, Bool, Symbol, Int, UInt}) + precompile(Tuple{typeof(Profile.tree!), Profile.StackFrameTree{UInt64}, Vector{UInt64}, Dict{UInt64, Vector{Base.StackTraces.StackFrame}}, Bool, Symbol, Int, UnitRange{UInt}}) + precompile(Tuple{typeof(Profile.tree!), Profile.StackFrameTree{UInt64}, Vector{UInt64}, Dict{UInt64, Vector{Base.StackTraces.StackFrame}}, Bool, Symbol, UnitRange{Int}, UInt}) + precompile(Tuple{typeof(Profile.tree!), Profile.StackFrameTree{UInt64}, Vector{UInt64}, Dict{UInt64, Vector{Base.StackTraces.StackFrame}}, Bool, Symbol, UnitRange{Int}, UnitRange{UInt}}) + precompile(Tuple{typeof(Profile.tree!), Profile.StackFrameTree{UInt64}, Vector{UInt64}, Dict{UInt64, Vector{Base.StackTraces.StackFrame}}, Bool, Symbol, Vector{Int}, Vector{UInt}}) """ end diff --git a/src/signals-mach.c b/src/signals-mach.c index 7a678af4ec65f..8ec6e8d7bb1df 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -588,6 +588,13 @@ void *mach_profile_listener(void *arg) #else bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL); #endif + jl_ptls_t ptls = jl_all_tls_states[i]; + + // store threadid but add 1 as 0 is preserved to indicate end of block + bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; + + // store task id + bt_data_prof[bt_size_cur++].uintptr = ptls->current_task; // Mark the end of this block with 0 bt_data_prof[bt_size_cur++].uintptr = 0; diff --git a/src/signals-unix.c b/src/signals-unix.c index daec048912325..639dba0b990d8 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -786,6 +786,14 @@ static void *signal_listener(void *arg) } jl_set_safe_restore(old_buf); + jl_ptls_t ptls = jl_all_tls_states[i]; + + // store threadid but add 1 as 0 is preserved to indicate end of block + bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; + + // store task id + bt_data_prof[bt_size_cur++].uintptr = ptls->current_task; + // Mark the end of this block with 0 bt_data_prof[bt_size_cur++].uintptr = 0; } diff --git a/src/signals-win.c b/src/signals-win.c index 5d1a963078df4..400e5a469a4fd 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -360,6 +360,15 @@ static DWORD WINAPI profile_bt( LPVOID lparam ) // Get backtrace data bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, &ctxThread, NULL); + + jl_ptls_t ptls = jl_all_tls_states[0]; // given only profiling hMainThread + + // store threadid but add 1 as 0 is preserved to indicate end of block + bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; + + // store task id + bt_data_prof[bt_size_cur++].uintptr = ptls->current_task; + // Mark the end of this block with 0 bt_data_prof[bt_size_cur++].uintptr = 0; } diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index 49f5020d277c4..1266b79eec90d 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -39,11 +39,11 @@ end """ init(; n::Integer, delay::Real)) -Configure the `delay` between backtraces (measured in seconds), and the number `n` of -instruction pointers that may be stored per thread. Each instruction pointer corresponds to a single -line of code; backtraces generally consist of a long list of instruction pointers. Current -settings can be obtained by calling this function with no arguments, and each can be set -independently using keywords or in the order `(n, delay)`. +Configure the `delay` between backtraces (measured in seconds), and the number `n` of instruction pointers that may be +stored per thread. Each instruction pointer corresponds to a single line of code; backtraces generally consist of a long +list of instruction pointers. Note that 3 instruction pointers per backtrace are used to store metadata. Current settings can be +obtained by calling this function with no arguments, and each can be set independently using keywords or in the order +`(n, delay)`. !!! compat "Julia 1.8" As of Julia 1.8, this function allocates space for `n` instruction pointers per thread being profiled. @@ -141,6 +141,9 @@ The keyword arguments can be any combination of: line, `:count` sorts in order of number of collected samples, and `:overhead` sorts by the number of samples incurred by each function by itself. + - `groupby` -- Controls grouping over tasks and threads, or no grouping. Options are `:none` (default), `:threads`, `:tasks`, + `[:threads, :tasks]`, or `[:tasks, :threads]` where the last two provide nested grouping. + - `noisefloor` -- Limits frames that exceed the heuristic noise floor of the sample (only applies to format `:tree`). A suggested value to try for this is 2.0 (the default is 0). This parameter hides samples for which `n <= noisefloor * √N`, where `n` is the number of samples on this line, and `N` is the number of samples for the callee. @@ -150,9 +153,15 @@ The keyword arguments can be any combination of: - `recur` -- Controls the recursion handling in `:tree` format. `:off` (default) prints the tree as normal. `:flat` instead compresses any recursion (by ip), showing the approximate effect of converting any self-recursion into an iterator. `:flatc` does the same but also includes collapsing of C frames (may do odd things around `jl_apply`). + + - `threads::Union{Int,AbstractVector{Int}}` -- Specify which threads to include snapshots from in the report. Note that + this does not control which threads samples are collected on. + + - `tasks::Union{Int,AbstractVector{Int}}` -- Specify which tasks to include snapshots from in the report. Note that this + does not control which tasks samples are collected within. """ function print(io::IO, - data::Vector{<:Unsigned} = fetch(), + data::Vector{<:Unsigned} = fetch(include_meta = true), lidict::Union{LineInfoDict, LineInfoFlatDict} = getdict(data) ; format = :tree, @@ -162,24 +171,86 @@ function print(io::IO, mincount::Int = 0, noisefloor = 0, sortedby::Symbol = :filefuncline, - recur::Symbol = :off) - print(io, data, lidict, ProfileFormat( - C = C, - combine = combine, - maxdepth = maxdepth, - mincount = mincount, - noisefloor = noisefloor, - sortedby = sortedby, - recur = recur), - format) + groupby::Union{Symbol,AbstractVector{Symbol}} = :none, + recur::Symbol = :off, + threads::Union{Int,AbstractVector{Int}} = 1:Threads.nthreads(), + tasks::Union{UInt,AbstractVector{UInt}} = typemin(UInt):typemax(UInt)) + + pf = ProfileFormat(;C, combine, maxdepth, mincount, noisefloor, sortedby, recur) + if groupby == :none + print(io, data, lidict, pf, format, threads, tasks, false) + else + if !in(groupby, [:thread, :task, [:task, :thread], [:thread, :task]]) + error(ArgumentError("Unrecognized groupby option: $groupby. Options are :none (default), :task, :thread, [:task, :thread], or [:thread, :task]")) + elseif Sys.iswindows() && in(groupby, [:thread, [:task, :thread], [:thread, :task]]) + @warn "Profiling on windows is limited to the main thread. Other threads have not been sampled and will not show in the report" + end + any_nosamples = false + println(io, "Overhead ╎ [+additional indent] Count File:Line; Function") + println(io, "=========================================================") + if groupby == [:task, :thread] + for taskid in _intersect(get_task_ids(data), tasks) + threadids = _intersect(get_thread_ids(data, taskid), threads) + if length(threadids) == 0 + any_nosamples = true + else + nl = length(threadids) > 1 ? "\n" : "" + printstyled(io, "Task $(Base.repr(taskid))$nl"; bold=true, color=Base.debug_color()) + for threadid in threadids + printstyled(io, " Thread $threadid\n"; bold=true, color=Base.info_color()) + nosamples = print(io, data, lidict, pf, format, threadid, taskid, true) + nosamples && (any_nosamples = true) + println(io) + end + end + end + elseif groupby == [:thread, :task] + for threadid in _intersect(get_thread_ids(data), threads) + taskids = _intersect(get_task_ids(data, threadid), tasks) + if length(taskids) == 0 + any_nosamples = true + else + nl = length(taskids) > 1 ? "\n" : "" + printstyled(io, "Thread $threadid$nl"; bold=true, color=Base.info_color()) + for taskid in taskids + printstyled(io, " Task $(Base.repr(taskid))\n"; bold=true, color=Base.debug_color()) + nosamples = print(io, data, lidict, pf, format, threadid, taskid, true) + nosamples && (any_nosamples = true) + println(io) + end + end + end + elseif groupby == :task + threads = 1:typemax(Int) + for taskid in _intersect(get_task_ids(data), tasks) + printstyled(io, "Task $(Base.repr(taskid))\n"; bold=true, color=Base.debug_color()) + nosamples = print(io, data, lidict, pf, format, threads, taskid, true) + nosamples && (any_nosamples = true) + println(io) + end + elseif groupby == :thread + tasks = 1:typemax(UInt) + for threadid in _intersect(get_thread_ids(data), threads) + printstyled(io, "Thread $threadid\n"; bold=true, color=Base.info_color()) + nosamples = print(io, data, lidict, pf, format, threadid, tasks, true) + nosamples && (any_nosamples = true) + println(io) + end + end + any_nosamples && warning_empty(summary = true) + end + return end -function print(io::IO, data::Vector{<:Unsigned}, lidict::Union{LineInfoDict, LineInfoFlatDict}, fmt::ProfileFormat, format::Symbol) +function print(io::IO, data::Vector{<:Unsigned}, lidict::Union{LineInfoDict, LineInfoFlatDict}, fmt::ProfileFormat, + format::Symbol, threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}, + is_subsection::Bool = false) cols::Int = Base.displaysize(io)[2] data = convert(Vector{UInt64}, data) fmt.recur ∈ (:off, :flat, :flatc) || throw(ArgumentError("recur value not recognized")) if format === :tree - tree(io, data, lidict, cols, fmt) + nosamples = tree(io, data, lidict, cols, fmt, threads, tasks, is_subsection) + return nosamples elseif format === :flat fmt.recur === :off || throw(ArgumentError("format flat only implements recur=:off")) flat(io, data, lidict, cols, fmt) @@ -188,6 +259,32 @@ function print(io::IO, data::Vector{<:Unsigned}, lidict::Union{LineInfoDict, Lin end end +function get_task_ids(data::Vector{<:Unsigned}, threadid = nothing) + taskids = UInt[] + for i in length(data):-1:1 + if data[i] == 0 # find start of block + if isnothing(threadid) || data[i - 3] == threadid + taskid = data[i - 2] + !in(taskid, taskids) && push!(taskids, taskid) + end + end + end + return taskids +end + +function get_thread_ids(data::Vector{<:Unsigned}, taskid = nothing) + threadids = Int[] + for i in length(data):-1:1 + if data[i] == 0 # find start of block + if isnothing(taskid) || data[i - 2] == taskid + threadid = data[i - 3] + !in(threadid, threadids) && push!(threadids, threadid) + end + end + end + return sort(threadids) +end + """ print([io::IO = stdout,] data::Vector, lidict::LineInfoDict; kwargs...) @@ -197,19 +294,19 @@ a dictionary `lidict` of line information. See `Profile.print([io], data)` for an explanation of the valid keyword arguments. """ -print(data::Vector{<:Unsigned} = fetch(), lidict::Union{LineInfoDict, LineInfoFlatDict} = getdict(data); kwargs...) = +print(data::Vector{<:Unsigned} = fetch(include_meta = true), lidict::Union{LineInfoDict, LineInfoFlatDict} = getdict(data); kwargs...) = print(stdout, data, lidict; kwargs...) """ - retrieve() -> data, lidict + retrieve(; kwargs...) -> data, lidict "Exports" profiling results in a portable format, returning the set of all backtraces (`data`) and a dictionary that maps the (session-specific) instruction pointers in `data` to `LineInfo` values that store the file name, function name, and line number. This function allows you to save profiling results for future analysis. """ -function retrieve() - data = fetch() +function retrieve(; kwargs...) + data = fetch(; kwargs...) return (data, getdict(data)) end @@ -384,14 +481,15 @@ error_codes = Dict( """ - fetch() -> data + fetch(;include_meta = false) -> data Returns a copy of the buffer of profile backtraces. Note that the values in `data` have meaning only on this machine in the current session, because it depends on the exact memory addresses used in JIT-compiling. This function is primarily for internal use; [`retrieve`](@ref) may be a better choice for most users. +By default metadata such as threadid and taskid will be stripped. Set `include_meta` to `true` to include metadata. """ -function fetch() +function fetch(;include_meta = false) maxlen = maxlen_data() len = len_data() if is_buffer_full() @@ -401,7 +499,24 @@ function fetch() end data = Vector{UInt}(undef, len) GC.@preserve data unsafe_copyto!(pointer(data), get_data_pointer(), len) - return data + if include_meta + return data + else + nblocks = count(iszero, data) + nmeta = 2 # number of metadata fields (threadid, taskid) + data_stripped = Vector{UInt}(undef, length(data) - (nblocks * nmeta)) + j = length(data_stripped) + i = length(data) + while i > 0 + data_stripped[j] = data[i] + if data[i] == 0 + i -= nmeta + end + i -= 1 + j -= 1 + end + return data_stripped + end end @@ -627,14 +742,25 @@ function tree_format(frames::Vector{<:StackFrameTree}, level::Int, cols::Int, ma end # turn a list of backtraces into a tree (implicitly separated by NULL markers) -function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, C::Bool, recur::Symbol) where {T} +function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, C::Bool, recur::Symbol, + threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}) where {T} parent = root tops = Vector{StackFrameTree{T}}() build = Vector{StackFrameTree{T}}() startframe = length(all) + skip = false for i in startframe:-1:1 + startframe - 1 <= i <= startframe - 2 && continue # skip metadata (its read ahead below) ip = all[i] if ip == 0 + # read metadata + taskid = all[i - 1] + threadid = all[i - 2] + if !in(threadid, threads) || !in(taskid, tasks) + skip = true + continue + end + skip = false # sentinel value indicates the start of a new backtrace empty!(build) root.recur = 0 @@ -661,7 +787,7 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI parent = root root.count += 1 startframe = i - else + elseif !skip pushfirst!(build, parent) if recur === :flat || recur === :flatc # Rewind the `parent` tree back, if this exact ip was already present *higher* in the current tree @@ -702,6 +828,7 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI parent = this continue end + frames = lidict[ip] nframes = (frames isa Vector ? length(frames) : 1) this = parent @@ -758,12 +885,14 @@ end # Print the stack frame tree starting at a particular root. Uses a worklist to # avoid stack overflows. -function print_tree(io::IO, bt::StackFrameTree{T}, cols::Int, fmt::ProfileFormat) where T +function print_tree(io::IO, bt::StackFrameTree{T}, cols::Int, fmt::ProfileFormat, is_subsection::Bool) where T maxes = maxstats(bt) filenamemap = Dict{Symbol,String}() worklist = [(bt, 0, 0, "")] - println(io, "Overhead ╎ [+additional indent] Count File:Line; Function") - println(io, "=========================================================") + if !is_subsection + println(io, "Overhead ╎ [+additional indent] Count File:Line; Function") + println(io, "=========================================================") + end while !isempty(worklist) (bt, level, noisefloor, str) = popfirst!(worklist) isempty(str) || println(io, str) @@ -797,21 +926,28 @@ function print_tree(io::IO, bt::StackFrameTree{T}, cols::Int, fmt::ProfileFormat pushfirst!(worklist, (down, level + 1, noisefloor_down, str)) end end + return end -function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, cols::Int, fmt::ProfileFormat) +function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, cols::Int, fmt::ProfileFormat, + threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}, is_subsection::Bool) if fmt.combine - root = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur) + root = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur, threads, tasks) else - root = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur) + root = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur, threads, tasks) end if isempty(root.down) - warning_empty() - return + if is_subsection + Base.print(io, "Total snapshots: ") + printstyled(io, "$(root.count)\n", color=Base.warn_color()) + else + warning_empty() + end + return true end - print_tree(io, root, cols, fmt) + print_tree(io, root, cols, fmt, is_subsection) Base.println(io, "Total snapshots: ", root.count) - nothing + return false end function callersf(matchfunc::Function, bt::Vector, lidict::LineInfoFlatDict) @@ -875,9 +1011,21 @@ function liperm(lilist::Vector{StackFrame}) return sortperm(lilist, lt = lt) end -warning_empty() = @warn """ - There were no samples collected. Run your program longer (perhaps by - running it multiple times), or adjust the delay between samples with - `Profile.init()`.""" +warning_empty(;summary = false) = @warn """ + There were no samples collected$(summary ? " in one or more groups" : ""). + Run your program longer (perhaps by running it multiple times), + or adjust the delay between samples with `Profile.init()`.""" + + +# Given Base.intersect isn't efficient for mixtures of UnitRange and Vectors +# TODO: Replace once https://github.com/JuliaLang/julia/pull/41769 has merged +function _intersect(v::AbstractVector, r::AbstractRange) + common = Iterators.filter(x -> x ∈ r, v) + seen = Set{eltype(v)}(common) + return Base.vectorfilter(Base._shrink_filter!(seen), common) +end +_intersect(r::AbstractRange, v::AbstractVector) = _intersect(v, r) +_intersect(a, b) = Base.intersect(a, b) + end # module diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl index 76f8a3a1b8ca8..cf0790e606615 100644 --- a/stdlib/Profile/test/runtests.jl +++ b/stdlib/Profile/test/runtests.jl @@ -59,6 +59,26 @@ let iobuf = IOBuffer() truncate(iobuf, 0) end +@testset "Profile.print() groupby options" begin + for tasks in [typemin(UInt):typemax(UInt), UInt(0), UInt(0):UInt(1), [UInt(0), UInt(3)]] + for threads in [1:Threads.nthreads(), 1, 1:1, 1:2, [1,3]] + for groupby in [:none, :thread, :task, [:thread, :task], [:task, :thread]] + @testset "tasks = $tasks threads = $threads groupby = $groupby" begin + Profile.print(devnull; groupby, threads, tasks) + end + end + end + end +end + +@testset "Profile.fetch() with and without meta" begin + data_without = Profile.fetch() + data_with = Profile.fetch(include_meta = true) + @test data_without[1] == data_with[1] + @test data_without[end] == data_with[end] + @test length(data_without) < length(data_with) +end + Profile.clear() @test isempty(Profile.fetch()) From 9a16479485a536bfbd828b83123a52d952d5abee Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Sun, 8 Aug 2021 03:31:49 -0400 Subject: [PATCH 02/13] store time of each profile sample --- src/signals-mach.c | 3 +++ src/signals-unix.c | 3 +++ src/signals-win.c | 3 +++ stdlib/Profile/src/Profile.jl | 9 +++++---- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/signals-mach.c b/src/signals-mach.c index 8ec6e8d7bb1df..848e602ade188 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -596,6 +596,9 @@ void *mach_profile_listener(void *arg) // store task id bt_data_prof[bt_size_cur++].uintptr = ptls->current_task; + // store time + bt_data_prof[bt_size_cur++].uintptr = jl_hrtime(); + // Mark the end of this block with 0 bt_data_prof[bt_size_cur++].uintptr = 0; } diff --git a/src/signals-unix.c b/src/signals-unix.c index 639dba0b990d8..d6a26859547f5 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -794,6 +794,9 @@ static void *signal_listener(void *arg) // store task id bt_data_prof[bt_size_cur++].uintptr = ptls->current_task; + // store time + bt_data_prof[bt_size_cur++].uintptr = jl_hrtime(); + // Mark the end of this block with 0 bt_data_prof[bt_size_cur++].uintptr = 0; } diff --git a/src/signals-win.c b/src/signals-win.c index 400e5a469a4fd..ffe7b0a38fa42 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -369,6 +369,9 @@ static DWORD WINAPI profile_bt( LPVOID lparam ) // store task id bt_data_prof[bt_size_cur++].uintptr = ptls->current_task; + // store time + bt_data_prof[bt_size_cur++].uintptr = jl_hrtime(); + // Mark the end of this block with 0 bt_data_prof[bt_size_cur++].uintptr = 0; } diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index 1266b79eec90d..acdb20161fc32 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -503,7 +503,7 @@ function fetch(;include_meta = false) return data else nblocks = count(iszero, data) - nmeta = 2 # number of metadata fields (threadid, taskid) + nmeta = 3 # number of metadata fields (threadid, taskid, time) data_stripped = Vector{UInt}(undef, length(data) - (nblocks * nmeta)) j = length(data_stripped) i = length(data) @@ -750,12 +750,13 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI startframe = length(all) skip = false for i in startframe:-1:1 - startframe - 1 <= i <= startframe - 2 && continue # skip metadata (its read ahead below) + startframe - 1 <= i <= startframe - 3 && continue # skip metadata (its read ahead below) ip = all[i] if ip == 0 # read metadata - taskid = all[i - 1] - threadid = all[i - 2] + # time = all[i - 1] + taskid = all[i - 2] + threadid = all[i - 3] if !in(threadid, threads) || !in(taskid, tasks) skip = true continue From c339d5f9c49b17f384859f3422b5ad8898e474bc Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Sun, 8 Aug 2021 04:04:03 -0400 Subject: [PATCH 03/13] fix flat printing --- stdlib/Profile/src/Profile.jl | 80 ++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 29 deletions(-) diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index acdb20161fc32..0b14d54e56528 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -253,7 +253,8 @@ function print(io::IO, data::Vector{<:Unsigned}, lidict::Union{LineInfoDict, Lin return nosamples elseif format === :flat fmt.recur === :off || throw(ArgumentError("format flat only implements recur=:off")) - flat(io, data, lidict, cols, fmt) + nosamples = flat(io, data, lidict, cols, fmt, threads, tasks, is_subsection) + return nosamples else throw(ArgumentError("output format $(repr(format)) not recognized")) end @@ -523,7 +524,8 @@ end ## Print as a flat list # Counts the number of times each line appears, at any nesting level and at the topmost level # Merging multiple equivalent entries and recursive calls -function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, C::Bool) where {T} +function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, C::Bool, + threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}) where {T} lilist = StackFrame[] n = Int[] m = Int[] @@ -531,32 +533,46 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, recursive = Set{T}() first = true totalshots = 0 - for ip in data + startframe = length(data) + skip = false + for i in startframe:-1:1 + startframe - 1 <= i <= startframe - 3 && continue # skip metadata (it's read ahead below) + ip = data[i] if ip == 0 + # read metadata + # time = data[i - 1] + taskid = data[i - 2] + threadid = data[i - 3] + if !in(threadid, threads) || !in(taskid, tasks) + skip = true + continue + end + skip = false totalshots += 1 empty!(recursive) first = true - continue - end - frames = lidict[ip] - nframes = (frames isa Vector ? length(frames) : 1) - for i = 1:nframes - frame = (frames isa Vector ? frames[i] : frames) - !C && frame.from_c && continue - key = (T === UInt64 ? ip : frame) - idx = get!(lilist_idx, key, length(lilist) + 1) - if idx > length(lilist) - push!(recursive, key) - push!(lilist, frame) - push!(n, 1) - push!(m, 0) - elseif !(key in recursive) - push!(recursive, key) - n[idx] += 1 - end - if first - m[idx] += 1 - first = false + startframe = i + elseif !skip + frames = lidict[ip] + nframes = (frames isa Vector ? length(frames) : 1) + for j = 1:nframes + frame = (frames isa Vector ? frames[j] : frames) + !C && frame.from_c && continue + key = (T === UInt64 ? ip : frame) + idx = get!(lilist_idx, key, length(lilist) + 1) + if idx > length(lilist) + push!(recursive, key) + push!(lilist, frame) + push!(n, 1) + push!(m, 0) + elseif !(key in recursive) + push!(recursive, key) + n[idx] += 1 + end + if first + m[idx] += 1 + first = false + end end end end @@ -564,11 +580,17 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, return (lilist, n, m, totalshots) end -function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, cols::Int, fmt::ProfileFormat) - lilist, n, m, totalshots = parse_flat(fmt.combine ? StackFrame : UInt64, data, lidict, fmt.C) +function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, cols::Int, fmt::ProfileFormat, + threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}, is_subsection::Bool) + lilist, n, m, totalshots = parse_flat(fmt.combine ? StackFrame : UInt64, data, lidict, fmt.C, threads, tasks) if isempty(lilist) - warning_empty() - return + if is_subsection + Base.print(io, "Total snapshots: ") + printstyled(io, "$(totalshots)\n", color=Base.warn_color()) + else + warning_empty() + end + return true end if false # optional: drop the "non-interpretable" ones keep = map(frame -> frame != UNKNOWN && frame.line != 0, lilist) @@ -579,7 +601,7 @@ function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfo filenamemap = Dict{Symbol,String}() print_flat(io, lilist, n, m, cols, filenamemap, fmt) Base.println(io, "Total snapshots: ", totalshots) - nothing + return false end function print_flat(io::IO, lilist::Vector{StackFrame}, From 6c4649bbc97b66f622ea297e6cd4909dc2b47ab4 Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Sun, 8 Aug 2021 04:22:36 -0400 Subject: [PATCH 04/13] fix tests --- stdlib/Profile/Project.toml | 5 +++-- stdlib/Profile/test/runtests.jl | 14 ++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/stdlib/Profile/Project.toml b/stdlib/Profile/Project.toml index 6aca0601439e3..1d13dad22233a 100644 --- a/stdlib/Profile/Project.toml +++ b/stdlib/Profile/Project.toml @@ -5,8 +5,9 @@ uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" [extras] -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "Serialization"] +test = ["Logging", "Serialization", "Test"] diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl index cf0790e606615..777122e571aed 100644 --- a/stdlib/Profile/test/runtests.jl +++ b/stdlib/Profile/test/runtests.jl @@ -1,6 +1,6 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -using Test, Profile, Serialization +using Test, Profile, Serialization, Logging Profile.clear() Profile.init() @@ -60,11 +60,13 @@ let iobuf = IOBuffer() end @testset "Profile.print() groupby options" begin - for tasks in [typemin(UInt):typemax(UInt), UInt(0), UInt(0):UInt(1), [UInt(0), UInt(3)]] - for threads in [1:Threads.nthreads(), 1, 1:1, 1:2, [1,3]] - for groupby in [:none, :thread, :task, [:thread, :task], [:task, :thread]] - @testset "tasks = $tasks threads = $threads groupby = $groupby" begin - Profile.print(devnull; groupby, threads, tasks) + iobuf = IOBuffer() + with_logger(NullLogger()) do + @testset for format in [:flat, :tree] + @testset for threads in [1:Threads.nthreads(), 1, 1:1, 1:2, [1,2]] + @testset for groupby in [:none, :thread, :task, [:thread, :task], [:task, :thread]] + Profile.print(iobuf; groupby, threads, format) + @test !isempty(String(take!(iobuf))) end end end From 0201ef20d8a62934912dcdc39ba4b8f914040686 Mon Sep 17 00:00:00 2001 From: Ian Date: Sun, 8 Aug 2021 21:17:32 -0400 Subject: [PATCH 05/13] store cycleclock rather than time_ns --- src/signals-mach.c | 4 ++-- src/signals-unix.c | 4 ++-- src/signals-win.c | 4 ++-- stdlib/Profile/src/Profile.jl | 12 ++++++------ 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/signals-mach.c b/src/signals-mach.c index 848e602ade188..ab21cf4824fb7 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -596,8 +596,8 @@ void *mach_profile_listener(void *arg) // store task id bt_data_prof[bt_size_cur++].uintptr = ptls->current_task; - // store time - bt_data_prof[bt_size_cur++].uintptr = jl_hrtime(); + // store cpu cycle clock + bt_data_prof[bt_size_cur++].uintptr = cycleclock(); // Mark the end of this block with 0 bt_data_prof[bt_size_cur++].uintptr = 0; diff --git a/src/signals-unix.c b/src/signals-unix.c index d6a26859547f5..ed2dd8239ed10 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -794,8 +794,8 @@ static void *signal_listener(void *arg) // store task id bt_data_prof[bt_size_cur++].uintptr = ptls->current_task; - // store time - bt_data_prof[bt_size_cur++].uintptr = jl_hrtime(); + // store cpu cycle clock + bt_data_prof[bt_size_cur++].uintptr = cycleclock(); // Mark the end of this block with 0 bt_data_prof[bt_size_cur++].uintptr = 0; diff --git a/src/signals-win.c b/src/signals-win.c index ffe7b0a38fa42..bf190abad5f1a 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -369,8 +369,8 @@ static DWORD WINAPI profile_bt( LPVOID lparam ) // store task id bt_data_prof[bt_size_cur++].uintptr = ptls->current_task; - // store time - bt_data_prof[bt_size_cur++].uintptr = jl_hrtime(); + // store cpu cycle clock + bt_data_prof[bt_size_cur++].uintptr = cycleclock(); // Mark the end of this block with 0 bt_data_prof[bt_size_cur++].uintptr = 0; diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index 0b14d54e56528..2a3401885b8c9 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -41,9 +41,9 @@ end Configure the `delay` between backtraces (measured in seconds), and the number `n` of instruction pointers that may be stored per thread. Each instruction pointer corresponds to a single line of code; backtraces generally consist of a long -list of instruction pointers. Note that 3 instruction pointers per backtrace are used to store metadata. Current settings can be -obtained by calling this function with no arguments, and each can be set independently using keywords or in the order -`(n, delay)`. +list of instruction pointers. Note that 4 instruction pointers per backtrace are used to store metadata and markers. Current +settings can be obtained by calling this function with no arguments, and each can be set independently using keywords or +in the order `(n, delay)`. !!! compat "Julia 1.8" As of Julia 1.8, this function allocates space for `n` instruction pointers per thread being profiled. @@ -504,7 +504,7 @@ function fetch(;include_meta = false) return data else nblocks = count(iszero, data) - nmeta = 3 # number of metadata fields (threadid, taskid, time) + nmeta = 3 # number of metadata fields (threadid, taskid, cpu_cycle_clock) data_stripped = Vector{UInt}(undef, length(data) - (nblocks * nmeta)) j = length(data_stripped) i = length(data) @@ -540,7 +540,7 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, ip = data[i] if ip == 0 # read metadata - # time = data[i - 1] + # cpu_cycle_clock = data[i - 1] taskid = data[i - 2] threadid = data[i - 3] if !in(threadid, threads) || !in(taskid, tasks) @@ -776,7 +776,7 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI ip = all[i] if ip == 0 # read metadata - # time = all[i - 1] + # cpu_cycle_clock = all[i - 1] taskid = all[i - 2] threadid = all[i - 3] if !in(threadid, threads) || !in(taskid, tasks) From e97b3696e7e8e6de17a477530acd65466f6790f2 Mon Sep 17 00:00:00 2001 From: Ian Date: Fri, 13 Aug 2021 00:35:20 -0400 Subject: [PATCH 06/13] add sleep_check_state to metadata and show % utilization --- src/signal-handling.c | 4 +- src/signals-mach.c | 3 ++ src/signals-unix.c | 3 ++ src/signals-win.c | 3 ++ stdlib/Profile/src/Profile.jl | 89 ++++++++++++++++++++++++----------- 5 files changed, 73 insertions(+), 29 deletions(-) diff --git a/src/signal-handling.c b/src/signal-handling.c index 23fa69f6e4475..2939d73b25e81 100644 --- a/src/signal-handling.c +++ b/src/signal-handling.c @@ -37,8 +37,8 @@ void jl_shuffle_int_array_inplace(volatile uint64_t *carray, size_t size, uint64 JL_DLLEXPORT int jl_profile_is_buffer_full(void) { - // the latter `+ 1` is for the block terminator `0`. - return bt_size_cur + (JL_BT_MAX_ENTRY_SIZE + 1) + 1 > bt_size_max; + // the `+ 5` is for the block terminator `0` plus 4 metadata entries + return bt_size_cur + (JL_BT_MAX_ENTRY_SIZE + 1) + 5 > bt_size_max; } static uint64_t jl_last_sigint_trigger = 0; diff --git a/src/signals-mach.c b/src/signals-mach.c index ab21cf4824fb7..cfcbf64e6a55e 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -599,6 +599,9 @@ void *mach_profile_listener(void *arg) // store cpu cycle clock bt_data_prof[bt_size_cur++].uintptr = cycleclock(); + // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block + bt_data_prof[bt_size_cur++].uintptr = ptls->sleep_check_state + 1; + // Mark the end of this block with 0 bt_data_prof[bt_size_cur++].uintptr = 0; } diff --git a/src/signals-unix.c b/src/signals-unix.c index ed2dd8239ed10..414504d8f742f 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -797,6 +797,9 @@ static void *signal_listener(void *arg) // store cpu cycle clock bt_data_prof[bt_size_cur++].uintptr = cycleclock(); + // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block + bt_data_prof[bt_size_cur++].uintptr = ptls->sleep_check_state + 1; + // Mark the end of this block with 0 bt_data_prof[bt_size_cur++].uintptr = 0; } diff --git a/src/signals-win.c b/src/signals-win.c index bf190abad5f1a..ba12f683cc713 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -372,6 +372,9 @@ static DWORD WINAPI profile_bt( LPVOID lparam ) // store cpu cycle clock bt_data_prof[bt_size_cur++].uintptr = cycleclock(); + // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block + bt_data_prof[bt_size_cur++].uintptr = ptls->sleep_check_state + 1; + // Mark the end of this block with 0 bt_data_prof[bt_size_cur++].uintptr = 0; } diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index 2a3401885b8c9..cc6ac288904c8 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -41,9 +41,9 @@ end Configure the `delay` between backtraces (measured in seconds), and the number `n` of instruction pointers that may be stored per thread. Each instruction pointer corresponds to a single line of code; backtraces generally consist of a long -list of instruction pointers. Note that 4 instruction pointers per backtrace are used to store metadata and markers. Current -settings can be obtained by calling this function with no arguments, and each can be set independently using keywords or -in the order `(n, delay)`. +list of instruction pointers. Note that 5 spaces for instruction pointers per backtrace are used to store metadata and a marker. +Current settings can be obtained by calling this function with no arguments, and each can be set independently using keywords +or in the order `(n, delay)`. !!! compat "Julia 1.8" As of Julia 1.8, this function allocates space for `n` instruction pointers per thread being profiled. @@ -264,8 +264,8 @@ function get_task_ids(data::Vector{<:Unsigned}, threadid = nothing) taskids = UInt[] for i in length(data):-1:1 if data[i] == 0 # find start of block - if isnothing(threadid) || data[i - 3] == threadid - taskid = data[i - 2] + if isnothing(threadid) || data[i - 4] == threadid + taskid = data[i - 3] !in(taskid, taskids) && push!(taskids, taskid) end end @@ -277,8 +277,8 @@ function get_thread_ids(data::Vector{<:Unsigned}, taskid = nothing) threadids = Int[] for i in length(data):-1:1 if data[i] == 0 # find start of block - if isnothing(taskid) || data[i - 2] == taskid - threadid = data[i - 3] + if isnothing(taskid) || data[i - 3] == taskid + threadid = data[i - 4] !in(threadid, threadids) && push!(threadids, threadid) end end @@ -504,7 +504,7 @@ function fetch(;include_meta = false) return data else nblocks = count(iszero, data) - nmeta = 3 # number of metadata fields (threadid, taskid, cpu_cycle_clock) + nmeta = 4 # number of metadata fields (threadid, taskid, cpu_cycle_clock, thread_sleeping) data_stripped = Vector{UInt}(undef, length(data) - (nblocks * nmeta)) j = length(data_stripped) i = length(data) @@ -535,18 +535,23 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, totalshots = 0 startframe = length(data) skip = false + nsleeping = 0 for i in startframe:-1:1 - startframe - 1 <= i <= startframe - 3 && continue # skip metadata (it's read ahead below) + startframe - 1 <= i <= startframe - 4 && continue # skip metadata (it's read ahead below) ip = data[i] if ip == 0 # read metadata - # cpu_cycle_clock = data[i - 1] - taskid = data[i - 2] - threadid = data[i - 3] + thread_sleeping = data[i - 1] - 1 # subtract 1 as state is incremented to avoid being equal to 0 + # cpu_cycle_clock = data[i - 2] + taskid = data[i - 3] + threadid = data[i - 4] if !in(threadid, threads) || !in(taskid, tasks) skip = true continue end + if thread_sleeping == 1 + nsleeping += 1 + end skip = false totalshots += 1 empty!(recursive) @@ -577,16 +582,18 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, end end @assert length(lilist) == length(n) == length(m) == length(lilist_idx) - return (lilist, n, m, totalshots) + return (lilist, n, m, totalshots, nsleeping) end function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, cols::Int, fmt::ProfileFormat, threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}, is_subsection::Bool) - lilist, n, m, totalshots = parse_flat(fmt.combine ? StackFrame : UInt64, data, lidict, fmt.C, threads, tasks) + lilist, n, m, totalshots, nsleeping = parse_flat(fmt.combine ? StackFrame : UInt64, data, lidict, fmt.C, threads, tasks) + util_perc = (1 - (nsleeping / totalshots)) * 100 if isempty(lilist) if is_subsection Base.print(io, "Total snapshots: ") - printstyled(io, "$(totalshots)\n", color=Base.warn_color()) + printstyled(io, "$(totalshots)", color=Base.warn_color()) + Base.println(io, " (", round(Int, util_perc), "% utilization)") else warning_empty() end @@ -600,7 +607,12 @@ function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfo end filenamemap = Dict{Symbol,String}() print_flat(io, lilist, n, m, cols, filenamemap, fmt) - Base.println(io, "Total snapshots: ", totalshots) + Base.print(io, "Total snapshots: ", totalshots, " (", round(Int, util_perc), "% utilization") + if is_subsection + println(io, ")") + else + println(io, " across all threads and tasks. Use the `groupby` kwarg to break down by thread and/or task)") + end return false end @@ -771,18 +783,23 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI build = Vector{StackFrameTree{T}}() startframe = length(all) skip = false + nsleeping = 0 for i in startframe:-1:1 - startframe - 1 <= i <= startframe - 3 && continue # skip metadata (its read ahead below) + startframe - 1 <= i <= startframe - 4 && continue # skip metadata (its read ahead below) ip = all[i] if ip == 0 # read metadata - # cpu_cycle_clock = all[i - 1] - taskid = all[i - 2] - threadid = all[i - 3] + thread_sleeping = all[i - 1] - 1 # subtract 1 as state is incremented to avoid being equal to 0 + # cpu_cycle_clock = all[i - 2] + taskid = all[i - 3] + threadid = all[i - 4] if !in(threadid, threads) || !in(taskid, tasks) skip = true continue end + if thread_sleeping == 1 + nsleeping += 1 + end skip = false # sentinel value indicates the start of a new backtrace empty!(build) @@ -886,7 +903,7 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI nothing end cleanup!(root) - return root + return root, nsleeping end function maxstats(root::StackFrameTree) @@ -955,21 +972,28 @@ end function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, cols::Int, fmt::ProfileFormat, threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}, is_subsection::Bool) if fmt.combine - root = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur, threads, tasks) + root, nsleeping = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur, threads, tasks) else - root = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur, threads, tasks) + root, nsleeping = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur, threads, tasks) end + util_perc = (1 - (nsleeping / root.count)) * 100 if isempty(root.down) if is_subsection Base.print(io, "Total snapshots: ") - printstyled(io, "$(root.count)\n", color=Base.warn_color()) + printstyled(io, "$(root.count)", color=Base.warn_color()) + Base.println(io, " (", round(Int, util_perc), "% utilization)") else warning_empty() end return true end print_tree(io, root, cols, fmt, is_subsection) - Base.println(io, "Total snapshots: ", root.count) + Base.print(io, "Total snapshots: ", root.count, " (", round(Int, util_perc), "% utilization") + if is_subsection + println(io, ")") + else + println(io, " across all threads and tasks. Use the `groupby` kwarg to break down by thread and/or task)") + end return false end @@ -1034,10 +1058,21 @@ function liperm(lilist::Vector{StackFrame}) return sortperm(lilist, lt = lt) end -warning_empty(;summary = false) = @warn """ - There were no samples collected$(summary ? " in one or more groups" : ""). +function warning_empty(;summary = false) + if summary + @warn """ + There were no samples collected in one or more groups. + This may be due to idle threads, or you may need to run your + program longer (perhaps by running it multiple times), + or adjust the delay between samples with `Profile.init()`.""" + else + @warn """ + There were no samples collected in one or more groups. Run your program longer (perhaps by running it multiple times), or adjust the delay between samples with `Profile.init()`.""" + end +end + # Given Base.intersect isn't efficient for mixtures of UnitRange and Vectors From 3cb3630750d3edffef68b170a4b141f9fef668eb Mon Sep 17 00:00:00 2001 From: Ian Date: Sun, 15 Aug 2021 22:25:57 -0400 Subject: [PATCH 07/13] fix ambiguity test failure --- stdlib/Profile/src/Profile.jl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index cc6ac288904c8..d569817465408 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -1083,6 +1083,15 @@ function _intersect(v::AbstractVector, r::AbstractRange) return Base.vectorfilter(Base._shrink_filter!(seen), common) end _intersect(r::AbstractRange, v::AbstractVector) = _intersect(v, r) +function _intersect(r1::AbstractRange, r2::AbstractRange) + # To iterate over the shorter range + length(r1) > length(r2) && return _intersect(r2, r1) + + r1 = Base.unique(r1) + T = Base.promote_eltype(r1, r2) + + return T[x for x in r1 if x ∈ r2] +end _intersect(a, b) = Base.intersect(a, b) From 1a1507c4966e3b1c8a8e17301b82be74cab82d11 Mon Sep 17 00:00:00 2001 From: Ian Date: Sun, 15 Aug 2021 22:26:18 -0400 Subject: [PATCH 08/13] fix Profile test failure --- stdlib/Profile/src/Profile.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index d569817465408..da8a21f2b7e95 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -1067,7 +1067,7 @@ function warning_empty(;summary = false) or adjust the delay between samples with `Profile.init()`.""" else @warn """ - There were no samples collected in one or more groups. + There were no samples collected. Run your program longer (perhaps by running it multiple times), or adjust the delay between samples with `Profile.init()`.""" end From 1a7ce703a9bb71facbde908f55588c3b2a011393 Mon Sep 17 00:00:00 2001 From: Ian Date: Sun, 15 Aug 2021 22:35:25 -0400 Subject: [PATCH 09/13] add to NEWS --- NEWS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/NEWS.md b/NEWS.md index f9fb0835afffc..64d249e460001 100644 --- a/NEWS.md +++ b/NEWS.md @@ -64,6 +64,13 @@ Standard library changes #### Printf * Now uses `textwidth` for formatting `%s` and `%c` widths ([#41085]). +#### Profile +* Profiling now records sample metadata including thread and task. `Profile.print()` has a new `groupby` kwarg that allows + grouping by thread, task, or nested thread/task, task/thread, and `threads` and `tasks` kwargs to allow filtering. + Further, percent utilization is now reported as a total or per-thread, based on whether the thread is idle or not at + each sample. `Profile.fetch()` by default strips out the new metadata to ensure backwards compatibility with external + profiling data consumers, but can be included with the `include_meta` kwarg. ([#41742]) + #### Random #### REPL From 1ab855ab46fedc2bba2bfa61b1c7cfe7cef659e4 Mon Sep 17 00:00:00 2001 From: Ian Date: Mon, 30 Aug 2021 20:00:12 -0400 Subject: [PATCH 10/13] use new Base.intersect methods --- stdlib/Profile/src/Profile.jl | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index da8a21f2b7e95..34d8a2052753d 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -189,8 +189,8 @@ function print(io::IO, println(io, "Overhead ╎ [+additional indent] Count File:Line; Function") println(io, "=========================================================") if groupby == [:task, :thread] - for taskid in _intersect(get_task_ids(data), tasks) - threadids = _intersect(get_thread_ids(data, taskid), threads) + for taskid in intersect(get_task_ids(data), tasks) + threadids = intersect(get_thread_ids(data, taskid), threads) if length(threadids) == 0 any_nosamples = true else @@ -205,8 +205,8 @@ function print(io::IO, end end elseif groupby == [:thread, :task] - for threadid in _intersect(get_thread_ids(data), threads) - taskids = _intersect(get_task_ids(data, threadid), tasks) + for threadid in intersect(get_thread_ids(data), threads) + taskids = intersect(get_task_ids(data, threadid), tasks) if length(taskids) == 0 any_nosamples = true else @@ -222,7 +222,7 @@ function print(io::IO, end elseif groupby == :task threads = 1:typemax(Int) - for taskid in _intersect(get_task_ids(data), tasks) + for taskid in intersect(get_task_ids(data), tasks) printstyled(io, "Task $(Base.repr(taskid))\n"; bold=true, color=Base.debug_color()) nosamples = print(io, data, lidict, pf, format, threads, taskid, true) nosamples && (any_nosamples = true) @@ -230,7 +230,7 @@ function print(io::IO, end elseif groupby == :thread tasks = 1:typemax(UInt) - for threadid in _intersect(get_thread_ids(data), threads) + for threadid in intersect(get_thread_ids(data), threads) printstyled(io, "Thread $threadid\n"; bold=true, color=Base.info_color()) nosamples = print(io, data, lidict, pf, format, threadid, tasks, true) nosamples && (any_nosamples = true) @@ -1073,26 +1073,4 @@ function warning_empty(;summary = false) end end - - -# Given Base.intersect isn't efficient for mixtures of UnitRange and Vectors -# TODO: Replace once https://github.com/JuliaLang/julia/pull/41769 has merged -function _intersect(v::AbstractVector, r::AbstractRange) - common = Iterators.filter(x -> x ∈ r, v) - seen = Set{eltype(v)}(common) - return Base.vectorfilter(Base._shrink_filter!(seen), common) -end -_intersect(r::AbstractRange, v::AbstractVector) = _intersect(v, r) -function _intersect(r1::AbstractRange, r2::AbstractRange) - # To iterate over the shorter range - length(r1) > length(r2) && return _intersect(r2, r1) - - r1 = Base.unique(r1) - T = Base.promote_eltype(r1, r2) - - return T[x for x in r1 if x ∈ r2] -end -_intersect(a, b) = Base.intersect(a, b) - - end # module From 47c94f1a9aa47feb6d8e4faa2bbfea903f48fbe8 Mon Sep 17 00:00:00 2001 From: Ian Date: Mon, 30 Aug 2021 22:27:53 -0400 Subject: [PATCH 11/13] add assert for checking stripping metadata --- stdlib/Profile/src/Profile.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index 34d8a2052753d..c201d2061e720 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -508,7 +508,7 @@ function fetch(;include_meta = false) data_stripped = Vector{UInt}(undef, length(data) - (nblocks * nmeta)) j = length(data_stripped) i = length(data) - while i > 0 + while i > 0 && j > 0 data_stripped[j] = data[i] if data[i] == 0 i -= nmeta @@ -516,6 +516,7 @@ function fetch(;include_meta = false) i -= 1 j -= 1 end + @assert i == j == 0 "metadata stripping failed i=$i j=$j" return data_stripped end end From 973a3f68bfccc91ad167d11c069be42f51b94330 Mon Sep 17 00:00:00 2001 From: Ian Date: Tue, 31 Aug 2021 20:25:02 -0400 Subject: [PATCH 12/13] more debug --- stdlib/Profile/src/Profile.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index c201d2061e720..4c491b17b4d43 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -516,7 +516,7 @@ function fetch(;include_meta = false) i -= 1 j -= 1 end - @assert i == j == 0 "metadata stripping failed i=$i j=$j" + @assert i == j == 0 "metadata stripping failed i=$i j=$j data[1:i]=$(data[1:i])" return data_stripped end end From 5c7da45ca63165a731b18ef3d32c3bf29d885796 Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Wed, 1 Sep 2021 23:33:30 -0400 Subject: [PATCH 13/13] check that zero entry actually looks like block end (fix linux32) --- stdlib/Profile/src/Profile.jl | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index 4c491b17b4d43..f2a39da2b7aa5 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -500,17 +500,26 @@ function fetch(;include_meta = false) end data = Vector{UInt}(undef, len) GC.@preserve data unsafe_copyto!(pointer(data), get_data_pointer(), len) - if include_meta + if include_meta || isempty(data) return data else - nblocks = count(iszero, data) + nblocks = 0 + for i = 2:length(data) + if data[i] == 0 && in(data[i - 1], [1,2]) + # detect block ends and count them + # linux 32 has been seen to have rogue ips equal to 0 so also check for the previous entry looking like an idle + # state metadata entry which can only be 1 or 2 + nblocks += 1 + end + end nmeta = 4 # number of metadata fields (threadid, taskid, cpu_cycle_clock, thread_sleeping) data_stripped = Vector{UInt}(undef, length(data) - (nblocks * nmeta)) j = length(data_stripped) i = length(data) while i > 0 && j > 0 data_stripped[j] = data[i] - if data[i] == 0 + if i > 1 && data[i] == 0 && in(data[i - 1], [1,2]) + # detect block end (same approach as above) i -= nmeta end i -= 1 @@ -540,7 +549,7 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, for i in startframe:-1:1 startframe - 1 <= i <= startframe - 4 && continue # skip metadata (it's read ahead below) ip = data[i] - if ip == 0 + if i > 1 && ip == 0 && in(data[i - 1], [1,2]) # check that the field next to the zero is the idle metadata entry # read metadata thread_sleeping = data[i - 1] - 1 # subtract 1 as state is incremented to avoid being equal to 0 # cpu_cycle_clock = data[i - 2] @@ -788,7 +797,7 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI for i in startframe:-1:1 startframe - 1 <= i <= startframe - 4 && continue # skip metadata (its read ahead below) ip = all[i] - if ip == 0 + if i > 1 && ip == 0 && in(all[i - 1], [1,2]) # check that the field next to the zero is the idle metadata entry # read metadata thread_sleeping = all[i - 1] - 1 # subtract 1 as state is incremented to avoid being equal to 0 # cpu_cycle_clock = all[i - 2]