crypto: arm64/aes-blk - add 4 way interleave to CBC encrypt path
author		Ard Biesheuvel <ard.biesheuvel@linaro.org>
Sat, 10 Mar 2018 15:21:52 +0000 (15:21 +0000)
committer	Herbert Xu <herbert@gondor.apana.org.au>
Fri, 16 Mar 2018 15:35:57 +0000 (23:35 +0800)
CBC encryption is strictly sequential, and so the current AES code
simply processes the input one block at a time. However, we are
about to add yield support, which adds a bit of overhead, and which
we prefer to align with other modes in terms of granularity (i.e.,
it is better to have all routines yield every 64 bytes and not have
an exception for CBC encrypt, which yields every 16 bytes).
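
To make the sequential dependency concrete, here is a minimal C sketch of
plain CBC encryption (an illustration only, not the kernel code;
block_encrypt() is a placeholder standing in for one in-place AES block
encryption, much like encrypt_block in the assembly below):

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  #define AES_BLOCK_SIZE 16

  /* Stand-in for one in-place AES block encryption; a real
   * implementation would run the AES rounds with the expanded key. */
  static void block_encrypt(uint8_t blk[AES_BLOCK_SIZE])
  {
      (void)blk;                           /* placeholder only */
  }

  static void cbc_encrypt_1x(uint8_t *dst, const uint8_t *src,
                             size_t blocks, uint8_t iv[AES_BLOCK_SIZE])
  {
      while (blocks--) {
          for (int j = 0; j < AES_BLOCK_SIZE; j++)
              iv[j] ^= src[j];             /* xor next pt block with iv */
          block_encrypt(iv);               /* depends on the previous ct block */
          memcpy(dst, iv, AES_BLOCK_SIZE); /* the ct block becomes the next iv */
          src += AES_BLOCK_SIZE;
          dst += AES_BLOCK_SIZE;
      }
  }

Each iteration feeds its ciphertext into the next one, which is why the
cipher invocations cannot be interleaved the way modes without this
dependency (e.g. ECB or CTR) allow.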

So unroll the loop by 4. We still cannot perform the AES algorithm in
parallel, but we can at least merge the loads and stores.
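
As a rough C analogue of the unrolled loop added by the patch below
(again only a sketch, reusing the helpers from the snippet above, not the
kernel implementation), the four loads and four stores are merged while
the four cipher calls remain chained:

  static void xor_block(uint8_t *d, const uint8_t *s)
  {
      for (int j = 0; j < AES_BLOCK_SIZE; j++)
          d[j] ^= s[j];
  }

  static void cbc_encrypt_4x(uint8_t *dst, const uint8_t *src,
                             size_t blocks, uint8_t iv[AES_BLOCK_SIZE])
  {
      uint8_t blk[4][AES_BLOCK_SIZE];

      while (blocks >= 4) {
          memcpy(blk, src, sizeof(blk));       /* one merged 64-byte load  */

          xor_block(blk[0], iv);               /* pt0 ^ iv  */
          block_encrypt(blk[0]);
          xor_block(blk[1], blk[0]);           /* pt1 ^ ct0 */
          block_encrypt(blk[1]);
          xor_block(blk[2], blk[1]);           /* pt2 ^ ct1 */
          block_encrypt(blk[2]);
          xor_block(blk[3], blk[2]);           /* pt3 ^ ct2 */
          block_encrypt(blk[3]);

          memcpy(dst, blk, sizeof(blk));       /* one merged 64-byte store */
          memcpy(iv, blk[3], AES_BLOCK_SIZE);  /* carry the iv forward     */

          src += sizeof(blk);
          dst += sizeof(blk);
          blocks -= 4;
      }

      /* any 1-3 leftover blocks take the single-block loop */
      cbc_encrypt_1x(dst, src, blocks, iv);
  }

The AES invocations are still serialized through the XOR chain; the gain
is limited to the wider 64-byte loads and stores (the ld1/st1 of four
registers in the assembly) plus a uniform 64-byte granularity for the
upcoming yield support.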

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/arm64/crypto/aes-modes.S

index 27a235b2ddee83341d33c030b7a64a873e66c400..e86535a1329d24e6ee48c87b5d58bca8b6f5814a 100644
@@ -94,17 +94,36 @@ AES_ENDPROC(aes_ecb_decrypt)
         */
 
 AES_ENTRY(aes_cbc_encrypt)
-       ld1             {v0.16b}, [x5]                  /* get iv */
+       ld1             {v4.16b}, [x5]                  /* get iv */
        enc_prepare     w3, x2, x6
 
-.Lcbcencloop:
-       ld1             {v1.16b}, [x1], #16             /* get next pt block */
-       eor             v0.16b, v0.16b, v1.16b          /* ..and xor with iv */
+.Lcbcencloop4x:
+       subs            w4, w4, #4
+       bmi             .Lcbcenc1x
+       ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
+       eor             v0.16b, v0.16b, v4.16b          /* ..and xor with iv */
        encrypt_block   v0, w3, x2, x6, w7
-       st1             {v0.16b}, [x0], #16
+       eor             v1.16b, v1.16b, v0.16b
+       encrypt_block   v1, w3, x2, x6, w7
+       eor             v2.16b, v2.16b, v1.16b
+       encrypt_block   v2, w3, x2, x6, w7
+       eor             v3.16b, v3.16b, v2.16b
+       encrypt_block   v3, w3, x2, x6, w7
+       st1             {v0.16b-v3.16b}, [x0], #64
+       mov             v4.16b, v3.16b
+       b               .Lcbcencloop4x
+.Lcbcenc1x:
+       adds            w4, w4, #4
+       beq             .Lcbcencout
+.Lcbcencloop:
+       ld1             {v0.16b}, [x1], #16             /* get next pt block */
+       eor             v4.16b, v4.16b, v0.16b          /* ..and xor with iv */
+       encrypt_block   v4, w3, x2, x6, w7
+       st1             {v4.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcencloop
-       st1             {v0.16b}, [x5]                  /* return iv */
+.Lcbcencout:
+       st1             {v4.16b}, [x5]                  /* return iv */
        ret
 AES_ENDPROC(aes_cbc_encrypt)