-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Description
The frustrating state of tail calls in .NET
I listened to the Microsoft Language & Runtime Community standup on twitch
where they said they like "aggressive" well-informed feedback from the community,
so I figured I'd bring up something that has been rubbing me the wrong way for a long
time in .NET: tail call performance.
I am sure that has been discussed before but I am going to add more fuel to the debate.
Tail calls in .NET
In .NET it's possible to annotate calls with a .tail attribute, allowing the
jitter to eliminate stack frames and thus avoid running out of stack space.
If you are a C# developer you probably never encountered tail calls as the C#
compiler as of now (AFAIK?) doesn't emit .tail attributes (might change with tail return?).
It's different for F# developers though where the compiler does emit .tail
attributes.
The problem
Avoiding running out of stack space is a good thing, so what's the problem?
The problem is the abysmal performance of tail calls in the general case.
Let me demonstrate by implementing a simple data pipeline based on push rather than pull (IEnumerable<_> is pull based).
// Minimalistic PushStream
// A PushStream accepts a receiver function that will be called
// with each value in the PushStream
type 'T PushStream = ('T -> unit) -> unit
module PushStream =
let inline zero () = LanguagePrimitives.GenericZero
let inline push r v = r v
// Creates a PushStream with all integers from b to e (inclusive)
let inline fromRange b e r = for i = b to e do push r i
// Maps all values in ps using mapping function f
let inline map f ps r = ps (fun v -> push r (f v))
// Filters all values in ps using filter function f
let inline filter f ps r = ps (fun v -> if f v then push r v)
// Sums all values in ps
let inline sum ps = let mutable s = zero () in ps (fun v -> s <- s + v); s
For a simple data pipeline designed to detect the overhead of the pipeline, the push stream shows some promising performance when comparing it to LINQ.
// Uses BenchmarkDotNet
type Benchmarks () =
[<Params (10000, 1000, 100)>]
member val public Count = 100 with get, set
[<Benchmark>]
member x.SimpleImperativeTest () =
let mutable i = x.Count
let mutable sum = 0L
while i >= 0 do
let v = int64 i
i <- i - 1
if (v &&& 1L) = 0L then
sum <- sum + (v + 1L)
sum
[<Benchmark>]
member x.SimpleLinqTest () =
Enumerable.Range(0, x.Count)
.Select(int64)
.Where(fun v -> (v &&& 1L) = 0L)
.Select((+) 1L)
.Sum()
[<Benchmark>]
member x.SimplePushStreamTest () =
PushStream.fromRange 0 x.Count
|> PushStream.map int64
|> PushStream.filter (fun v -> (v &&& 1L) = 0L)
|> PushStream.map ((+) 1L)
|> PushStream.sum
BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18363
Intel Core i5-3570K CPU 3.40GHz (Ivy Bridge), 1 CPU, 4 logical and 4 physical cores
.NET Core SDK=3.1.100
[Host] : .NET Core 3.1.0 (CoreCLR 4.700.19.56402, CoreFX 4.700.19.56404), X64 RyuJIT DEBUG
DefaultJob : .NET Core 3.1.0 (CoreCLR 4.700.19.56402, CoreFX 4.700.19.56404), X64 RyuJIT
| Method | Count | Mean | Error | StdDev |
|--------------------- |------ |--------------:|-----------:|-----------:|
| SimpleImperativeTest | 100 | 73.36 ns | 0.259 ns | 0.243 ns |
| SimpleLinqTest | 100 | 1,550.18 ns | 7.837 ns | 7.331 ns |
| SimplePushStreamTest | 100 | 419.71 ns | 1.768 ns | 1.654 ns |
| SimpleImperativeTest | 1000 | 611.51 ns | 2.081 ns | 1.946 ns |
| SimpleLinqTest | 1000 | 13,677.24 ns | 47.074 ns | 44.033 ns |
| SimplePushStreamTest | 1000 | 3,576.24 ns | 11.202 ns | 10.478 ns |
| SimpleImperativeTest | 10000 | 5,996.41 ns | 27.344 ns | 25.578 ns |
| SimpleLinqTest | 10000 | 134,129.03 ns | 443.042 ns | 414.422 ns |
| SimplePushStreamTest | 10000 | 34,988.54 ns | 126.441 ns | 118.273 ns |
The imperative "pipeline" does the best as expected as there's no pipeline
overhead but the push stream is doing a lot better than LINQ. Great right?
Not so fast! If we add a second test which does the same thing in a slightly
different way then what happens?
[<Benchmark>]
member x.StructLinqStreamTest () =
Enumerable.Range(0, x.Count)
.Select(int64)
.Select(fun v -> struct ((v &&& 1L) = 0L, v))
.Select(fun struct (b, v) -> struct (b, v + 1L))
.Select(fun struct (b, v) -> if b then v else 0L)
.Sum()
[<Benchmark>]
member x.StructPushStreamTest () =
PushStream.fromRange 0 x.Count
|> PushStream.map int64
|> PushStream.map (fun v -> struct ((v &&& 1L) = 0L, v))
|> PushStream.map (fun struct (b, v) -> struct (b, v + 1L))
|> PushStream.map (fun struct (b, v) -> if b then v else 0L)
|> PushStream.sum
Suddenly the push stream performance is abysmal.
| Method | Count | Mean | Error | StdDev |
|--------------------- |------ |----------------:|-------------:|-------------:|
| StructLinqStreamTest | 100 | 3,060.52 ns | 15.855 ns | 14.831 ns |
| StructPushStreamTest | 100 | 60,241.31 ns | 256.031 ns | 239.492 ns |
| StructLinqStreamTest | 1000 | 27,813.86 ns | 99.601 ns | 93.167 ns |
| StructPushStreamTest | 1000 | 591,909.97 ns | 1,710.549 ns | 1,600.048 ns |
| StructLinqStreamTest | 10000 | 274,592.50 ns | 591.566 ns | 553.351 ns |
| StructPushStreamTest | 10000 | 5,908,867.08 ns | 9,791.398 ns | 9,158.880 ns |
We see that LINQ now performs 20x better? What's going on?
Let's fix the problem using magic! Rewrite the push function from this:
// Original push function, just invokes the function r with v
let inline push r v = r v
Replace it with this nonsense:
// New invoke. Seems just to add redundant code that does nothing
let inline push r v = match r v with () -> ()
Now the push stream compares favorably in all cases:
| Method | Count | Mean | Error | StdDev |
|--------------------- |------ |--------------:|-------------:|-------------:|
| SimpleImperativeTest | 100 | 73.52 ns | 0.290 ns | 0.271 ns |
| SimpleLinqTest | 100 | 1,552.31 ns | 4.984 ns | 4.418 ns |
| SimplePushStreamTest | 100 | 503.42 ns | 3.451 ns | 3.228 ns |
| StructLinqStreamTest | 100 | 3,079.51 ns | 18.761 ns | 17.549 ns |
| StructPushStreamTest | 100 | 2,531.61 ns | 9.913 ns | 9.273 ns |
| SimpleImperativeTest | 1000 | 612.66 ns | 1.832 ns | 1.713 ns |
| SimpleLinqTest | 1000 | 13,667.12 ns | 41.531 ns | 38.848 ns |
| SimplePushStreamTest | 1000 | 4,346.26 ns | 16.795 ns | 15.710 ns |
| StructLinqStreamTest | 1000 | 28,029.09 ns | 70.510 ns | 58.879 ns |
| StructPushStreamTest | 1000 | 22,131.33 ns | 63.323 ns | 59.232 ns |
| SimpleImperativeTest | 10000 | 6,007.44 ns | 19.996 ns | 18.704 ns |
| SimpleLinqTest | 10000 | 133,865.57 ns | 355.534 ns | 332.567 ns |
| SimplePushStreamTest | 10000 | 42,625.87 ns | 75.884 ns | 67.270 ns |
| StructLinqStreamTest | 10000 | 321,961.25 ns | 1,074.170 ns | 1,004.780 ns |
| StructPushStreamTest | 10000 | 242,181.19 ns | 419.269 ns | 392.185 ns |
What's going on?
The difference is the tail calls.
With the original push function the IL code to invoke the receiver function looks like this:
// let inline push r v = r v
// Tell the jitter that the call is a tail call
IL_000D: tail.
// Call invoke virtually
IL_000F: callvirt instance !1 class <>::Invoke
// For a tail call the call function has to be followed by ret
IL_0014: ret
For the modified push function, which doesn't change the meaning of the program at all, the IL code looks like this:
// let inline push r v = match r v with () -> ()
// Call Invoke (no tail call)
callvirt instance !1 class [FSharp.Core]Microsoft.FSharp.Core.FSharpFunc`2<int64, class [FSharp.Core]Microsoft.FSharp.Core.Unit>::Invoke(!0)
// Throw away the result (which is the unit value anyway)
IL_0012: pop
// Push the result (the unit value is null)
IL_0013: ldnull
// Done
IL_0014: ret
As the modified push "looks" at the value (match) and then loads a new result, this doesn't fit the pattern of a tail call. Thus the compiler doesn't inject the .tail attribute.
Why do .tail calls sometimes go fast and sometimes go really really slow?
Let's look at the assembly code when .tail call executes quickly.
Fast .tail calls (Yay!)
// This is the implementation of: PushStream.map (fun v -> struct ((v &&& 1L) = 0L, v))
// What's this? Tiered compilation left behinds?
00007fff`90362010 0f1f440000 nop dword ptr [rax+rax]
00007fff`90362015 8bc2 mov eax,edx
// Check is number odd
00007fff`90362017 a801 test al,1
00007fff`90362019 7512 jne 00007fff`9036202d
// Number is even, pass value down the pipeline
// This is a virtual call so we find the jump address by looking
// up the value in the vtables
00007fff`9036201b 488b4908 mov rcx,qword ptr [rcx+8]
00007fff`9036201f 488b01 mov rax,qword ptr [rcx]
00007fff`90362022 488b4040 mov rax,qword ptr [rax+40h]
00007fff`90362026 488b4020 mov rax,qword ptr [rax+20h]
// rax now contains the address of the next step in the pipeline
// thanks to .tail call we do a jmp here not call
// this means that when the pipeline finally returns it will return
// directly to the top loop
00007fff`9036202a 48ffe0 jmp rax
// Number was odd, clear the result and return
00007fff`9036202d 33c0 xor eax,eax
00007fff`9036202f 33c0 ret
So apart from the odd nop in the beginning and the usual vtable dance over
virtual functions (what are the conditions to make the jitter succeed with
devirtualizations?) it's quite ok. Actually surprisingly just 5x slower than the
fast imperative solution considering how much more junk happens in each step.
Let's look at the slow .tail call
Slow .tail calls (Boo!)
// This is the implementation of: PushStream.map (fun v -> struct ((v &&& 1L) = 0L, v))
// Function prelude
00007ff7`d77725e0 56 push rsi
00007ff7`d77725e1 4883ec40 sub rsp,40h
00007ff7`d77725e5 c5f877 vzeroupper
00007ff7`d77725e8 4c8d442430 lea r8,[rsp+30h]
00007ff7`d77725ed c5f857c0 vxorps xmm0,xmm0,xmm0
00007ff7`d77725f1 c4c17a7f00 vmovdqu xmmword ptr [r8],xmm0
00007ff7`d77725f6 488b7108 mov rsi,qword ptr [rcx+8]
00007ff7`d77725fa 448bc2 mov r8d,edx
// Is number odd?
00007ff7`d77725fd 41f6c001 test r8b,1
00007ff7`d7772601 410f94c0 sete r8b
00007ff7`d7772605 450fb6c0 movzx r8d,r8b
// Save results
00007ff7`d7772609 4488442438 mov byte ptr [rsp+38h],r8b
00007ff7`d777260e 4889542430 mov qword ptr [rsp+30h],rdx
00007ff7`d7772613 49b8784b6637f87f0000 mov r8,offset
// Checks: Volatile<LONG> g_TrapReturningThreads;
coreclr!g_TrapReturningThreads (00007ff8`37664b78)
// If true we need to suspend thread (by calling coreclr!JIT_PollGC ())
00007ff7`d777261d 41833800 cmp dword ptr [r8],0
00007ff7`d7772621 752e jne 00007ff7`d7772651
00007ff7`d7772623 4c8bc6 mov r8,rsi
// Juggling with struct tuple values
00007ff7`d7772626 c5fa6f442430 vmovdqu xmm0,xmmword ptr [rsp+30h]
00007ff7`d777262c c5fa7f442420 vmovdqu xmmword ptr [rsp+20h],xmm0
00007ff7`d7772632 4c8d4c2420 lea r9,[rsp+20h]
00007ff7`d7772637 48b990d162d7f77f0000 mov rcx,7FF7D762D190h
// Loading vtable to find the address to call to
00007ff7`d7772641 498b10 mov rdx,qword ptr [r8]
00007ff7`d7772644 488b5240 mov rdx,qword ptr [rdx+40h]
00007ff7`d7772648 488b5220 mov rdx,qword ptr [rdx+20h]
// Do the call to next step through: coreclr!JIT_TailCall
00007ff7`d777264c e89ff6c35f call coreclr!JIT_TailCall (00007ff8`373b1cf0)
00007ff7`d7772651 e86ad6c35f call coreclr!JIT_PollGC (00007ff8`373afcc0)
00007ff7`d7772656 ebcb jmp 00007ff7`d7772623
So, there's a lot more setup here, but this is because this function actually needs
a stack frame to store intermediate results. Also, I suspect that because the stack frame
is needed it has to call coreclr!JIT_TailCall at the end, which is the CPU hog.
I don't know exactly what coreclr!JIT_TailCall does but it does a lot when
stepping through the assembly code. However, my suspicion is that its purpose is to
eliminate the stack frame and call the next function. While it eliminates the stack frame it adds about 60x overhead to the PushStream pipeline.
Finally let's look at the code for the modified push function to get back
predictable performance
Predictable calls
// Function prelude
00007ff7`d77725e1 4883ec40 sub rsp,58h
00007ff7`d78625f4 c5f877 vzeroupper
00007ff7`d78625f7 33c0 xor eax,eax
00007ff7`d78625f9 4889442448 mov qword ptr [rsp+48h],rax
00007ff7`d78625fe 4889442450 mov qword ptr [rsp+50h],rax
00007ff7`d7862603 488d442438 lea rax,[rsp+38h]
00007ff7`d7862608 c5f857c0 vxorps xmm0,xmm0,xmm0
00007ff7`d786260c c5fa7f00 vmovdqu xmmword ptr [rax],xmm0
00007ff7`d7862610 8bc2 mov eax,edx
// Is number odd?
00007ff7`d7862612 a801 test al,1
00007ff7`d7862614 0f94c0 sete al
00007ff7`d7862617 0fb6c0 movzx eax,al
// Juggling with struct tuple values
00007ff7`d786261a 88442440 mov byte ptr [rsp+40h],al
00007ff7`d786261e 4889542438 mov qword ptr [rsp+38h],rdx
00007ff7`d7862623 c5fa6f442438 vmovdqu xmm0,xmmword ptr [rsp+38h]
00007ff7`d7862629 c5fa7f442448 vmovdqu xmmword ptr [rsp+48h],xmm0
00007ff7`d786262f 488b4908 mov rcx,qword ptr [rcx+8]
00007ff7`d7862633 c5fa6f442448 vmovdqu xmm0,xmmword ptr [rsp+48h]
00007ff7`d7862639 c5fa7f442428 vmovdqu xmmword ptr [rsp+28h],xmm0
00007ff7`d786263f 488d542428 lea rdx,[rsp+28h]
// Loading vtable to find the address to call to
00007ff7`d7862644 488b01 mov rax,qword ptr [rcx]
00007ff7`d7862647 488b4040 mov rax,qword ptr [rax+40h]
// Call the next step in the pipeline
00007ff7`d786264b ff5020 call qword ptr [rax+20h]
// The function returns, clear eax
00007ff7`d786264e 33c0 xor eax,eax
// Deallocate stackframe
00007ff7`d7862650 4883c458 add rsp,58h
// Return to previous chain
00007ff7`d7862654 c3 ret
Because juggling with struct tuples is more complex than in the first example this is
more complex code, but at least the next step in the pipeline is invoked without
the need of coreclr!JIT_TailCall, which means the performance is reasonable and
predictable.
It turns out that the only overhead of match r v with () -> () ends up being a
xor eax,eax which is essentially free compared to everything else.
Frustration sets in...
I believe that most of us just have to avoid writing
code that has TRULY TERRIBLE PERFORMANCE. We can get away with bad performance
in 99% of all code we write.
However, if I need to write performant code in F# (and maybe future versions of
C#), because tail calls are really really slow I have to be very careful with the
code I write to ensure that the F# compiler doesn't emit the .tail attribute.
Sure, I have a pattern that allows me to do it in this case but what if F# compiler improves in future releases and eliminate the nonsense code I wrote to avoid tail calls? The F# compiler has compiler options that allows me to suppress tail calls but F# also supports true inlining which means even if my library is compiled without tail calls when the functions are inlined into the calling assembly it might well inject tail calls.
Further, sometimes tail calls do go faster when no stack frame is needed.
This puts me in a frustrating spot; I want tail calls but I don't want to pay the
price of the worst case tail call performance.
Obviously the best solution would be that tail calls are always faster than normal
calls but I am sure that is tricky to implement (otherwise it would have been done already).
The second best solution for my particular scenario would be, if the tail call
can be faster than normal calls let's do it otherwise fallback to normal calls.
That is probably confusing as then it doesn't always have the correct semantics
of a tail call but for this particular scenario that is what I want.
I realize F# is a small language so I am hoping that the C# compiler will start
emitting .tail attributes so that the big C# community will notice the awkward
performance of tail calls.
Be sure to complain to Microsoft Language & Runtime Community standup on twitch
if you were bored by this rant and want to see less of it.
Regards.
Full sample code
// Turn off tiered compilation
// $env:COMPlus_TieredCompilation="0"
// dotnet core : 3.1.100
// FSharp.Core : 4.7.0
// BenchMarkDotNet: 0.12.0
module TailCall =
    open System
    open System.Linq
    open System.Diagnostics

    // The two push variants discussed in the write-up:
    // let inline push r v = match r v with () -> ()
    // let inline push r v = r v

    // Minimalistic PushStream.
    // A PushStream accepts a receiver function that will be called
    // with each value in the PushStream.
    type 'T PushStream = ('T -> unit) -> unit

    module PushStream =
        // Generic zero so sum works for any numeric element type.
        let inline zero () = LanguagePrimitives.GenericZero
        // Invokes the receiver r with the value v.
        let inline push r v = r v
        // Creates a PushStream with all integers from b to e (inclusive).
        let inline fromRange b e r = for i = b to e do push r i
        // Maps all values in ps using mapping function f.
        let inline map f ps r = ps (fun v -> push r (f v))
        // Keeps only the values in ps for which f returns true.
        let inline filter f ps r = ps (fun v -> if f v then push r v)
        // Sums all values in ps.
        let inline sum ps = let mutable s = zero () in ps (fun v -> s <- s + v); s

    module Tests =
        open BenchmarkDotNet.Attributes
        open BenchmarkDotNet.Running

        type Benchmarks () =
            [<Params (10000, 1000, 100)>]
            member val public Count = 100 with get, set

            // Baseline: a hand-written loop with no pipeline overhead.
            [<Benchmark>]
            member x.SimpleImperativeTest () =
                let mutable i   = x.Count
                let mutable acc = 0L
                while i >= 0 do
                    let v = int64 i
                    i <- i - 1
                    if (v &&& 1L) = 0L then
                        acc <- acc + (v + 1L)
                acc

            // Same pipeline expressed with LINQ (pull based).
            [<Benchmark>]
            member x.SimpleLinqTest () =
                Enumerable.Range(0, x.Count)
                    .Select(int64)
                    .Where(fun v -> (v &&& 1L) = 0L)
                    .Select((+) 1L)
                    .Sum()

            // Same pipeline expressed with the push stream.
            [<Benchmark>]
            member x.SimplePushStreamTest () =
                PushStream.fromRange 0 x.Count
                |> PushStream.map int64
                |> PushStream.filter (fun v -> (v &&& 1L) = 0L)
                |> PushStream.map ((+) 1L)
                |> PushStream.sum

            // LINQ variant that threads a struct tuple through the pipeline.
            [<Benchmark>]
            member x.StructLinqStreamTest () =
                Enumerable.Range(0, x.Count)
                    .Select(int64)
                    .Select(fun v -> struct ((v &&& 1L) = 0L, v))
                    .Select(fun struct (b, v) -> struct (b, v + 1L))
                    .Select(fun struct (b, v) -> if b then v else 0L)
                    .Sum()

            // Push-stream variant that threads a struct tuple through the pipeline.
            [<Benchmark>]
            member x.StructPushStreamTest () =
                PushStream.fromRange 0 x.Count
                |> PushStream.map int64
                |> PushStream.map (fun v -> struct ((v &&& 1L) = 0L, v))
                |> PushStream.map (fun struct (b, v) -> struct (b, v + 1L))
                |> PushStream.map (fun struct (b, v) -> if b then v else 0L)
                |> PushStream.sum

        // Milliseconds elapsed since module initialization, backed by one Stopwatch.
        let now =
            let watch = Stopwatch ()
            watch.Start ()
            fun () -> watch.ElapsedMilliseconds

        // Runs action `a` `o` times after a forced GC; returns the action's value,
        // the elapsed milliseconds and the per-generation GC collection deltas.
        let time o a =
            let inline cc n = GC.CollectionCount n
            let v = a ()
            GC.Collect (2, GCCollectionMode.Forced)
            GC.WaitForFullGCComplete () |> ignore
            let bcc0, bcc1, bcc2 = cc 0, cc 1, cc 2
            let before = now ()
            for _ = 1 to o do
                a () |> ignore
            let after = now ()
            let acc0, acc1, acc2 = cc 0, cc 1, cc 2
            v, (after - before), (acc0 - bcc0, acc1 - bcc1, acc2 - bcc2)

        // Runs the full BenchmarkDotNet suite for Benchmarks.
        let run argv =
            let switcher = BenchmarkSwitcher [|typeof<Benchmarks>|]
            let summary  = switcher.Run argv
            printfn "%A" summary

        // BenchMarkDotNet is good but the runs takes too long for me for experimentation
        // Then I rely on quickRun
        let quickRun () =
            let inline testCase n a = n, fun c -> string (a c)
            let benchmarks = Benchmarks ()
            let testCases =
                [|
                    testCase "simple, imperative" benchmarks.SimpleImperativeTest
                    testCase "simple, linq" benchmarks.SimpleLinqTest
                    testCase "simple, pushstream" benchmarks.SimplePushStreamTest
                    // testCase "struct, linq" benchmarks.StructLinqStreamTest
                    // testCase "struct, pushstream" benchmarks.StructPushStreamTest
                |]
            let total  = 100000000
            let inners = [|1000000; 10000; 100|]
            for inner in inners do
                let outer = total / inner
                benchmarks.Count <- inner
                printfn "Performance test, total: %d, outer: %d, inner: %d" total outer inner
                for n, a in testCases do
                    printfn " Running '%s'..." n
                    let v, r, cc = time outer a
                    printfn " result is %A, it took %d ms to produce with (%A) CC" v r cc

[<EntryPoint>]
let main argv =
    // TailCall.Tests.quickRun ()
    TailCall.Tests.run argv
    0
category:cq
theme:tail-call
skill-level:expert
cost:medium