- 
                Notifications
    You must be signed in to change notification settings 
- Fork 5.2k
Closed
Labels
Priority:2 - Work that is important, but not critical for the release
area-CodeGen-coreclr - CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Milestone
Description
Now that we have an SSA-based induction variable (IV) analysis (added in #97865), we should implement strength reduction based on it. Example loop:
[MethodImpl(MethodImplOptions.NoInlining)]
private static int StrengthReduce(Span<int> s)
{
    int sum = 0;
    foreach (int val in s)
        sum += val;
    return sum;
}
Codegen x64:
       xor      r8d, r8d
       test     ecx, ecx
       jle      SHORT G_M11380_IG04
       align    [0 bytes for IG03]
						;; size=15 bbWeight=1 PerfScore 5.75
G_M11380_IG03:  ;; offset=0x0013
       add      eax, dword ptr [rdx+4*r8]
       inc      r8d
       cmp      r8d, ecx
       jl       SHORT G_M11380_IG03
						;; size=12 bbWeight=4 PerfScore 18.00
Codegen arm64:
            mov     w3, wzr
            cmp     w2, #0
            ble     G_M1017_IG04
            align   [0 bytes for IG03]
						;; size=24 bbWeight=1 PerfScore 6.50
G_M1017_IG03:  ;; offset=0x0024
            ldr     w4, [x1, w3, UXTW #2]
            add     w0, w4, w0
            add     w3, w3, #1
            cmp     w3, w2
            blt     G_M1017_IG03
						;; size=20 bbWeight=4 PerfScore 22.00
The point of strength reduction is to optimize the loop codegen as if it had been written as follows:
[MethodImpl(MethodImplOptions.NoInlining)]
private static int StrengthReduce(Span<int> s)
{
    int sum = 0;
    ref int p = ref MemoryMarshal.GetReference(s);
    ref int end = ref Unsafe.Add(ref p, s.Length);
    while (Unsafe.IsAddressLessThan(ref p, ref end))
    {
        sum += p;
        p = ref Unsafe.Add(ref p, 1);
    }
    return sum;
}
The codegen would look like:
x64:
       xor      eax, eax
       mov      rdx, bword ptr [rcx]
       mov      ecx, dword ptr [rcx+0x08]
       lea      rcx, bword ptr [rdx+4*rcx]
       cmp      rdx, rcx
       jae      SHORT G_M11380_IG04
       align    [0 bytes for IG03]
						;; size=17 bbWeight=1 PerfScore 6.00
G_M11380_IG03:  ;; offset=0x0011
       add      eax, dword ptr [rdx]
       add      rdx, 4
       cmp      rdx, rcx
       jb       SHORT G_M11380_IG03
						;; size=11 bbWeight=4 PerfScore 18.00
arm64:
            mov     w0, wzr
            ldr     x1, [fp, #0x10]	// [V00 arg0]
            ldr     w2, [fp, #0x18]	// [V00 arg0+0x08]
            ubfiz   x2, x2, #2, #32
            add     x2, x1, x2
            cmp     x1, x2
            bhs     G_M11380_IG04
            align   [0 bytes for IG03]
						;; size=28 bbWeight=1 PerfScore 7.50
G_M11380_IG03:  ;; offset=0x0028
            ldr     w3, [x1]
            add     w0, w0, w3
            add     x1, x1, #4
            cmp     x1, x2
            blo     G_M11380_IG03
						;; size=20 bbWeight=4 PerfScore 22.00
For arm64 there is the additional possibility of using a post-increment addressing mode by optimizing the placement of the IV increment once the strength reduction has happened. The loop body is then reducible to:
G_M11380_IG03:  ;; offset=0x0028
            ldr     w3, [x1], #4
            add     w0, w0, w3
            cmp     x1, x2
            blo     G_M11380_IG03
EgorBo, PaulusParssinen and neon-sunset
Sergio0694 and omariom
Metadata
Metadata
Assignees
Labels
Priority:2 - Work that is important, but not critical for the release
area-CodeGen-coreclr - CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI