crypto: arm64/aes-modes - get rid of literal load of addend vector
author Ard Biesheuvel <ard.biesheuvel@linaro.org>
Thu, 23 Aug 2018 16:48:45 +0000 (17:48 +0100)
committer Herbert Xu <herbert@gondor.apana.org.au>
Tue, 4 Sep 2018 03:37:04 +0000 (11:37 +0800)
Replace the literal load of the addend vector with a sequence that
performs each add individually. This sequence is only 2 instructions
longer than the original, and 2% faster on Cortex-A53.

This is an improvement by itself, but it also works around an issue in
Clang, whose integrated assembler does not implement the GNU ARM asm
syntax completely and does not support the =literal notation for FP
registers (more info at https://bugs.llvm.org/show_bug.cgi?id=38642).

Cc: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/arm64/crypto/aes-modes.S

index 483a7130cf0e118de591837a067c4a489ca12a5e..496c243de4ac3ccabc089c64e0e959721520d769 100644 (file)
@@ -232,17 +232,19 @@ AES_ENTRY(aes_ctr_encrypt)
        bmi             .Lctr1x
        cmn             w6, #4                  /* 32 bit overflow? */
        bcs             .Lctr1x
-       ldr             q8, =0x30000000200000001        /* addends 1,2,3[,0] */
-       dup             v7.4s, w6
+       add             w7, w6, #1
        mov             v0.16b, v4.16b
-       add             v7.4s, v7.4s, v8.4s
+       add             w8, w6, #2
        mov             v1.16b, v4.16b
-       rev32           v8.16b, v7.16b
+       add             w9, w6, #3
        mov             v2.16b, v4.16b
+       rev             w7, w7
        mov             v3.16b, v4.16b
-       mov             v1.s[3], v8.s[0]
-       mov             v2.s[3], v8.s[1]
-       mov             v3.s[3], v8.s[2]
+       rev             w8, w8
+       mov             v1.s[3], w7
+       rev             w9, w9
+       mov             v2.s[3], w8
+       mov             v3.s[3], w9
        ld1             {v5.16b-v7.16b}, [x20], #48     /* get 3 input blocks */
        bl              aes_encrypt_block4x
        eor             v0.16b, v5.16b, v0.16b