crypto: arm64/aes-blk - improve XTS mask handling
author Ard Biesheuvel <ard.biesheuvel@linaro.org>
Mon, 10 Sep 2018 14:41:15 +0000 (16:41 +0200)
committer Herbert Xu <herbert@gondor.apana.org.au>
Fri, 21 Sep 2018 05:24:50 +0000 (13:24 +0800)
The Crypto Extension instantiation of the aes-modes.S collection of
skciphers uses only 15 NEON registers for the round key array, whereas
the pure NEON flavor uses 16 NEON registers for the AES S-box.

This means we have a spare register available that we can use to hold
the XTS mask vector, removing the need to reload it at every iteration
of the inner loop.

Since the pure NEON version does not permit this optimization, tweak
the macros so we can factor out this functionality. Also, replace the
literal load with a short sequence to compose the mask vector.

On Cortex-A53, this results in a ~4% speedup.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/arm64/crypto/aes-ce.S
arch/arm64/crypto/aes-modes.S
arch/arm64/crypto/aes-neon.S

index 623e74ed1c67f1d634e5762d35ddeaf39b73a498..143070510809ac1df415f785c315c110b8df7ada 100644 (file)
 
        .arch           armv8-a+crypto
 
+       xtsmask         .req    v16                     /* register that holds the XTS mask vector in this flavor */
+
+       .macro          xts_reload_mask, tmp            /* intentionally empty: no reload is emitted in this flavor */
+       .endm
+
        /* preload all round keys */
        .macro          load_round_keys, rounds, rk
        cmp             \rounds, #12
index 9697eda3b4d1bad51df070df6c877d2766671eb5..039738ae23f662f7ad656815e0a922485bc54428 100644 (file)
@@ -340,17 +340,19 @@ AES_ENDPROC(aes_ctr_encrypt)
         *                 int blocks, u8 const rk2[], u8 iv[], int first)
         */
 
-       .macro          next_tweak, out, in, const, tmp
+       .macro          next_tweak, out, in, tmp        /* out = tweak following in; reads xtsmask, clobbers tmp */
        sshr            \tmp\().2d,  \in\().2d,   #63
-       and             \tmp\().16b, \tmp\().16b, \const\().16b
+       and             \tmp\().16b, \tmp\().16b, xtsmask.16b   /* keep the mask value only in 64-bit lanes whose top bit was set */
        add             \out\().2d,  \in\().2d,   \in\().2d
        ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        eor             \out\().16b, \out\().16b, \tmp\().16b
        .endm
 
-.Lxts_mul_x:
-CPU_LE(        .quad           1, 0x87         )
-CPU_BE(        .quad           0x87, 1         )
+       .macro          xts_load_mask, tmp              /* compose the XTS mask in xtsmask without a literal load; clobbers tmp */
+       movi            xtsmask.2s, #0x1                /* 32-bit lanes: { 1, 1, 0, 0 } (upper half zeroed) */
+       movi            \tmp\().2s, #0x87               /* 32-bit lanes: { 0x87, 0x87, 0, 0 } */
+       uzp1            xtsmask.4s, xtsmask.4s, \tmp\().4s      /* even lanes of both: { 1, 0, 0x87, 0 }, i.e. 64-bit lanes { 1, 0x87 } */
+       .endm
 
 AES_ENTRY(aes_xts_encrypt)
        stp             x29, x30, [sp, #-16]!
@@ -362,24 +364,25 @@ AES_ENTRY(aes_xts_encrypt)
        enc_prepare     w3, x5, x8
        encrypt_block   v4, w3, x5, x8, w7              /* first tweak */
        enc_switch_key  w3, x2, x8
-       ldr             q7, .Lxts_mul_x
+       xts_load_mask   v8
        b               .LxtsencNx
 
 .Lxtsencnotfirst:
        enc_prepare     w3, x2, x8
+       xts_load_mask   v8                              /* this path skips the load above, and xts_reload_mask may be empty */
 .LxtsencloopNx:
-       ldr             q7, .Lxts_mul_x
-       next_tweak      v4, v4, v7, v8
+       xts_reload_mask v8
+       next_tweak      v4, v4, v8
 .LxtsencNx:
        subs            w4, w4, #4
        bmi             .Lxtsenc1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
-       next_tweak      v5, v4, v7, v8
+       next_tweak      v5, v4, v8
        eor             v0.16b, v0.16b, v4.16b
-       next_tweak      v6, v5, v7, v8
+       next_tweak      v6, v5, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
-       next_tweak      v7, v6, v7, v8
+       next_tweak      v7, v6, v8
        eor             v3.16b, v3.16b, v7.16b
        bl              aes_encrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
@@ -401,7 +403,7 @@ AES_ENTRY(aes_xts_encrypt)
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        beq             .Lxtsencout
-       next_tweak      v4, v4, v7, v8
+       next_tweak      v4, v4, v8
        b               .Lxtsencloop
 .Lxtsencout:
        st1             {v4.16b}, [x6]
@@ -420,24 +422,25 @@ AES_ENTRY(aes_xts_decrypt)
        enc_prepare     w3, x5, x8
        encrypt_block   v4, w3, x5, x8, w7              /* first tweak */
        dec_prepare     w3, x2, x8
-       ldr             q7, .Lxts_mul_x
+       xts_load_mask   v8
        b               .LxtsdecNx
 
 .Lxtsdecnotfirst:
        dec_prepare     w3, x2, x8
+       xts_load_mask   v8                              /* this path skips the load above, and xts_reload_mask may be empty */
 .LxtsdecloopNx:
-       ldr             q7, .Lxts_mul_x
-       next_tweak      v4, v4, v7, v8
+       xts_reload_mask v8
+       next_tweak      v4, v4, v8
 .LxtsdecNx:
        subs            w4, w4, #4
        bmi             .Lxtsdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
-       next_tweak      v5, v4, v7, v8
+       next_tweak      v5, v4, v8
        eor             v0.16b, v0.16b, v4.16b
-       next_tweak      v6, v5, v7, v8
+       next_tweak      v6, v5, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
-       next_tweak      v7, v6, v7, v8
+       next_tweak      v7, v6, v8
        eor             v3.16b, v3.16b, v7.16b
        bl              aes_decrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
@@ -459,7 +461,7 @@ AES_ENTRY(aes_xts_decrypt)
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        beq             .Lxtsdecout
-       next_tweak      v4, v4, v7, v8
+       next_tweak      v4, v4, v8
        b               .Lxtsdecloop
 .Lxtsdecout:
        st1             {v4.16b}, [x6]
index 1c7b45b7268e4677fe589830830b08732d6cf138..29100f692e8a03034f100eae9fc6b2bba64370c8 100644 (file)
 #define AES_ENTRY(func)                ENTRY(neon_ ## func)
 #define AES_ENDPROC(func)      ENDPROC(neon_ ## func)
 
+       xtsmask         .req    v7                      /* v7 is also the destination of the 4th tweak in the 4x XTS path */
+
+       .macro          xts_reload_mask, tmp            /* rebuild the mask at the head of the 4x loop; clobbers tmp */
+       xts_load_mask   \tmp
+       .endm
+
        /* multiply by polynomial 'x' in GF(2^8) */
        .macro          mul_by_x, out, in, temp, const
        sshr            \temp, \in, #7