aws · nebeid · Aug 23, 2023 · Aug 11, 2023 · Aug 18, 2023 · Aug 18, 2023
@@ -243,6 +243,12 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX) OR
                 generic/bignum_mul.S
                 generic/bignum_optsub.S
                 generic/bignum_sqr.S
+
+                fastmul/bignum_kmul_16_32_neon.S
+                fastmul/bignum_kmul_32_64_neon.S
+                fastmul/bignum_ksqr_16_32_neon.S
+                fastmul/bignum_ksqr_32_64_neon.S
+                fastmul/bignum_emontredc_8n_neon.S
                 )
   endif()
 endif()

@@ -477,15 +477,29 @@ static void montgomery_s2n_bignum_mul_mont(BN_ULONG *rp, const BN_ULONG *ap,
   uint64_t w = n0[0];
 
   if (num == 32) {
-    if (ap == bp)
-      bignum_ksqr_32_64(mulres, ap, t);
-    else
-      bignum_kmul_32_64(mulres, ap, bp, t);
+    if (CRYPTO_is_NEON_capable()) {
+      if (ap == bp)
+        bignum_ksqr_32_64_neon(mulres, ap, t);
+      else
+        bignum_kmul_32_64_neon(mulres, ap, bp, t);
+    } else {
+      if (ap == bp)
+        bignum_ksqr_32_64(mulres, ap, t);
+      else
+        bignum_kmul_32_64(mulres, ap, bp, t);
+    }
   } else if (num == 16) {
-    if (ap == bp)
-      bignum_ksqr_16_32(mulres, ap, t);
-    else
-      bignum_kmul_16_32(mulres, ap, bp, t);
+    if (CRYPTO_is_NEON_capable()) {
+      if (ap == bp)
+        bignum_ksqr_16_32_neon(mulres, ap, t);
+      else
+        bignum_kmul_16_32_neon(mulres, ap, bp, t);
+    } else {
+      if (ap == bp)
+        bignum_ksqr_16_32(mulres, ap, t);
+      else
+        bignum_kmul_16_32(mulres, ap, bp, t);
+    }
   } else {
     if (ap == bp)
       bignum_sqr(num * 2, mulres, num, ap);
@@ -504,8 +518,9 @@ static void montgomery_s2n_bignum_mul_mont(BN_ULONG *rp, const BN_ULONG *ap,
   //    A. The result of step 1 >= 2^(64*num), meaning that bignum_emontredc_8n
   //       returned 1. Since m is less than 2^(64*num), (result of step 1) >= m holds.
   //    B. The result of step 1 fits in 2^(64*num), and the result >= m.
-  uint64_t c;
-  c = bignum_emontredc_8n(num, mulres, np, w); // c: case A
+  uint64_t c = CRYPTO_is_NEON_capable() ? 
+               bignum_emontredc_8n_neon(num, mulres, np, w) :
+               bignum_emontredc_8n(num, mulres, np, w); // c: case A
   c |= bignum_ge(num, mulres + num, num, np);  // c: case B
   // Optionally subtract and store the result at rp
   bignum_optsub(num, rp, mulres + num, c, np);

@@ -137,13 +137,19 @@ extern void curve25519_x25519base_byte_alt(uint8_t res[static 32], const uint8_t
 extern void
 bignum_ksqr_32_64(uint64_t z[static 64], const uint64_t x[static 32],
                   uint64_t t[static S2NBIGNUM_KSQR_32_64_TEMP_NWORDS]);
+extern void
+bignum_ksqr_32_64_neon(uint64_t z[static 64], const uint64_t x[static 32],
+                       uint64_t t[static S2NBIGNUM_KSQR_32_64_TEMP_NWORDS]);
 
 // Evaluate z := x^2 where x is a 1024-bit integer.
 // Input: x[16]; output: z[32]; temporary buffer: t[>=24]
 #define S2NBIGNUM_KSQR_16_32_TEMP_NWORDS 24
 extern void
 bignum_ksqr_16_32(uint64_t z[static 32], const uint64_t x[static 16],
                   uint64_t t[static S2NBIGNUM_KSQR_16_32_TEMP_NWORDS]);
+extern void
+bignum_ksqr_16_32_neon(uint64_t z[static 32], const uint64_t x[static 16],
+                       uint64_t t[static S2NBIGNUM_KSQR_16_32_TEMP_NWORDS]);
 
 // Evaluate z := x * y where x and y are 2048-bit integers.
 // Inputs: x[32], y[32]; output: z[64]; temporary buffer t[>=96]
@@ -152,6 +158,10 @@ extern void
 bignum_kmul_32_64(uint64_t z[static 64], const uint64_t x[static 32],
                   const uint64_t y[static 32],
                   uint64_t t[static S2NBIGNUM_KMUL_32_64_TEMP_NWORDS]);
+extern void
+bignum_kmul_32_64_neon(uint64_t z[static 64], const uint64_t x[static 32],
+                       const uint64_t y[static 32],
+                       uint64_t t[static S2NBIGNUM_KMUL_32_64_TEMP_NWORDS]);
 
 // Evaluate z := x * y where x and y are 1024-bit integers.
 // Inputs: x[16], y[16]; output: z[32]; temporary buffer t[>=32]
@@ -160,6 +170,10 @@ extern void
 bignum_kmul_16_32(uint64_t z[static 32], const uint64_t x[static 16],
                   const uint64_t y[static 16],
                   uint64_t t[static S2NBIGNUM_KMUL_16_32_TEMP_NWORDS]);
+extern void
+bignum_kmul_16_32_neon(uint64_t z[static 32], const uint64_t x[static 16],
+                       const uint64_t y[static 16],
+                       uint64_t t[static S2NBIGNUM_KMUL_16_32_TEMP_NWORDS]);
 
 // Extended Montgomery reduce in 8-digit blocks.
 // Assumes that z initially holds a 2k-digit bignum z_0, m is a k-digit odd
@@ -178,6 +192,8 @@ bignum_kmul_16_32(uint64_t z[static 32], const uint64_t x[static 16],
 // Inputs: z[2*k], m[k], w; outputs: function return (extra result bit) and z[2*k]
 extern uint64_t bignum_emontredc_8n(uint64_t k, uint64_t *z, const uint64_t *m,
                                     uint64_t w);
+extern uint64_t bignum_emontredc_8n_neon(uint64_t k, uint64_t *z, const uint64_t *m,
+                                         uint64_t w);
 
 // Optionally subtract, z := x - y (if p nonzero) or z := x (if p zero)
 // Inputs: x[k], p, y[k]; outputs: function return (carry-out) and z[k]