-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Description
Description
Hi, I've noticed that the JIT currently doesn't seem to track what assertions are done on the state of registers/locals between different jump labels in the generated code, resulting in completely redundant conditional branches being repeated even though by looking at the assembly one can actually see the variables being tested couldn't possibly have changed. I've discovered this issue while working on the Span2D<T> type in the Microsoft.Toolkit.HighPerformance package (here), which in some cases it's actually hurt quite a bit by the lack of this optimization (unless users just manually switch to using unsafe code to work around it).
As suggested by @tannergooding - first here's a minimal repro that illustrates the issue:
C# repro code (click to expand)
public readonly struct Wrapper
{
private readonly int length;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int Test(int i)
{
static void Throw() => throw new ArgumentException();
if ((uint)i >= (uint)this.length)
{
Throw();
}
return 1;
}
}
public static int Repro(Wrapper wrapper)
{
int result = 0;
for (int i = 0; i < 100; i++)
{
// We're calling Test twice here, but the same issue happens for any number
// of times we repeat the call: each generates one extra redundant check
result += wrapper.Test(i);
result += wrapper.Test(i);
}
return result;
}This produces the following assembly code:
asm x86-64 codegen (click to expand)
; Assembly listing for method Span2DBenchmark.Program:Foo(Span2DBenchmark.Wrapper):int
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; Tier-1 compilation
; optimized code
; rsp based frame
; fully interruptible
; Final local variable assignments
;
;* V00 arg0 [V00 ] ( 0, 0 ) struct ( 8) zero-ref ld-addr-op
; V01 loc0 [V01,T01] ( 6, 18 ) int -> rax
; V02 loc1 [V02,T00] ( 6, 21 ) int -> rdx
; V03 OutArgs [V03 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
; V04 tmp1 [V04,T02] ( 3, 9 ) int -> rcx V00.length(offs=0x00) P-INDEP "field V00.length (fldOffset=0x0)"
;
; Lcl frame size = 40
G_M50921_IG01:
sub rsp, 40
;; bbWeight=1 PerfScore 0.25
G_M50921_IG02:
xor eax, eax
xor edx, edx
;; bbWeight=1 PerfScore 0.50
G_M50921_IG03:
cmp edx, ecx
jae SHORT G_M50921_IG07
;; bbWeight=4 PerfScore 5.00
G_M50921_IG04:
inc eax
cmp edx, ecx
jae SHORT G_M50921_IG07
;; bbWeight=4 PerfScore 6.00
G_M50921_IG05:
inc eax
inc edx
cmp edx, 100
jl SHORT G_M50921_IG03
;; bbWeight=4 PerfScore 7.00
G_M50921_IG06:
add rsp, 40
ret
;; bbWeight=1 PerfScore 1.25
G_M50921_IG07:
call Span2DBenchmark.Wrapper:<Test>g__Throw|1_0()
int3
;; bbWeight=0 PerfScore 0.00
; Total bytes of code 38, prolog size 4, PerfScore 23.80, (MethodHash=665a3916) for method Span2DBenchmark.Program:Foo(Span2DBenchmark.Wrapper):int
; ============================================================You can see that each call to Wrapper.Test is inline and produces that inc eax as expected. But, each call also adds one extra conditional jump that checks the current index in edx. After the first one (G_M50921_IG03), the others are just not needed, as the test is only working on edx and ecx, which are not modified by any of the following instructions until the end of the loop. The JIT doesn't seem to currently be aware of the state of these registers, so it just doesn't see that no instructions are writing to those registers to be able to know that those repeated conditional branches are just not necessary.
My original example
C# repro code (click to expand)
private static int Sum(Span2D<int> span)
{
int height = span.Height;
int width = span.Width;
int sum = 0;
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
sum += span[i, j];
}
}
return sum;
}This produces the following assembly code:
asm x86-64 codegen (click to expand)
; Assembly listing for method Span2DBenchmark.Benchmark:Anujfrewf(Microsoft.Toolkit.HighPerformance.Memory.Span2D`1[Int32]):int
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; fully interruptible
; Final local variable assignments
;
; V00 arg0 [V00,T01] ( 6, 72 ) byref -> rcx ld-addr-op
; V01 loc0 [V01,T09] ( 3, 6 ) int -> r8
; V02 loc1 [V02,T06] ( 3, 21 ) int -> r10
; V03 loc2 [V03,T03] ( 4, 34 ) int -> rax
; V04 loc3 [V04,T02] ( 6, 45 ) int -> r11
; V05 loc4 [V05,T00] ( 6, 76 ) int -> rsi
; V06 OutArgs [V06 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
; V07 tmp1 [V07,T05] ( 2, 32 ) long -> rbx "Inline stloc first use temp"
;* V08 tmp2 [V08 ] ( 0, 0 ) byref -> zero-ref "impAppendStmt"
;* V09 tmp3 [V09 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op "Inlining Arg"
; V10 tmp4 [V10,T04] ( 2, 32 ) byref -> rdi V09._pointer(offs=0x00) P-INDEP "field V09._pointer (fldOffset=0x0)"
;* V11 tmp5 [V11 ] ( 0, 0 ) int -> zero-ref V09._length(offs=0x08) P-INDEP "field V09._length (fldOffset=0x8)"
; V12 cse0 [V12,T07] ( 3, 18 ) int -> rdx "CSE - aggressive"
; V13 cse1 [V13,T08] ( 3, 10 ) int -> r9 "CSE - aggressive"
;
; Lcl frame size = 40
G_M41662_IG01:
push rdi
push rsi
push rbp
push rbx
sub rsp, 40
;; bbWeight=1 PerfScore 4.25
G_M41662_IG02:
mov edx, dword ptr [rcx+8]
mov r8d, edx
mov r9d, dword ptr [rcx+16]
mov r10d, r9d
xor eax, eax
xor r11d, r11d
test r8d, r8d
jle SHORT G_M41662_IG08
;; bbWeight=1 PerfScore 6.25
G_M41662_IG03:
xor esi, esi
test r10d, r10d
jle SHORT G_M41662_IG07
;; bbWeight=4 PerfScore 6.00
G_M41662_IG04:
cmp r11d, edx
jae SHORT G_M41662_IG09
;; bbWeight=16 PerfScore 20.00
G_M41662_IG05:
cmp esi, r9d
jae SHORT G_M41662_IG09
;; bbWeight=8 PerfScore 10.00
G_M41662_IG06:
mov rdi, bword ptr [rcx]
mov ebx, r11d
mov ebp, dword ptr [rcx+20]
imul rbx, rbp
mov ebp, esi
add rbx, rbp
add eax, dword ptr [rdi+4*rbx]
inc esi
cmp esi, r10d
jl SHORT G_M41662_IG04
;; bbWeight=16 PerfScore 164.00
G_M41662_IG07:
inc r11d
cmp r11d, r8d
jl SHORT G_M41662_IG03
;; bbWeight=4 PerfScore 6.00
G_M41662_IG08:
add rsp, 40
pop rbx
pop rbp
pop rsi
pop rdi
ret
;; bbWeight=1 PerfScore 3.25
G_M41662_IG09:
call Microsoft.Toolkit.HighPerformance.Memory.Internals.ThrowHelper:ThrowIndexOutOfRangeException()
int3
;; bbWeight=0 PerfScore 0.00
; Total bytes of code 99, prolog size 8, PerfScore 229.65, (MethodHash=8b1c5d41) for method Span2DBenchmark.Benchmark:Anujfrewf(Microsoft.Toolkit.HighPerformance.Memory.Span2D`1[Int32]):int
; ============================================================Here the codegen is repeating the bounds check for i every single time the indexer is used, even though none of the registers being checked is every updated by the following code. Ideally the check should only be done once, and then that jl SHORT G_M41662_IG04 should instead jump to G_M41662_IG05 and only repeat the check for j, which is updated in that block of code. Like, that jump label to directly go back to the check for j is in fact generated, it's just not being used.
Benchmarks
I run a quick benchmark for this method against a version where I manually skipped the bounds check for i after the first iteration of the loop, and got about a 12% speed improvement due to this change alone. I think it might be worth it for the JIT to be able to track situations such as this one automatically, though admittedly I'm not sure how difficult it would be to enable this.
Configuration
- .NET Core 5.0.0 (CoreCLR 5.0.20.47505, CoreFX 5.0.20.47505), X64 RyuJIT
- Codegen done with disasmo on a local checked build of .NET 5 from
master, cloned a couple weeks ago
Metadata
Metadata
Assignees
Labels
Type
Projects
Status
