1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: Ard Biesheuvel <ardb@kernel.org>
3 Date: Fri, 6 Nov 2020 17:39:38 +0100
4 Subject: [PATCH] crypto: arm64/chacha - simplify tail block handling
6 commit c4fc6328d6c67690a7e6e03f43a5a976a13120ef upstream.
8 Based on lessons learnt from optimizing the 32-bit version of this driver,
9 we can simplify the arm64 version considerably, by reordering the final
10 two stores when the last block is not a multiple of 64 bytes. This removes
11 the need to use permutation instructions to calculate the elements that are
12 clobbered by the final overlapping store, given that the store of the
13 penultimate block now follows it, and that one carries the correct values
14 for those elements already.
16 While at it, simplify the overlapping loads as well, by calculating the
17 address of the final overlapping load upfront, and switching to this
18 address for every load that would otherwise extend past the end of the
19 source buffer.
21 There is no impact on performance, but the resulting code is substantially
22 smaller and easier to follow.
24 Cc: Eric Biggers <ebiggers@google.com>
25 Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
26 Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
27 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
28 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
30 arch/arm64/crypto/chacha-neon-core.S | 193 ++++++++++-----------------
31 1 file changed, 69 insertions(+), 124 deletions(-)
33 --- a/arch/arm64/crypto/chacha-neon-core.S
34 +++ b/arch/arm64/crypto/chacha-neon-core.S
35 @@ -195,7 +195,6 @@ ENTRY(chacha_4block_xor_neon)
42 // This function encrypts four consecutive ChaCha blocks by loading
43 @@ -645,11 +644,11 @@ CPU_BE( rev a15, a15 )
44 zip2 v31.4s, v14.4s, v15.4s
49 + sub x3, x3, #128 // start of last block
53 - csel x3, x3, xzr, ge
57 // interleave 64-bit words in state n, n+2
58 zip1 v0.2d, v16.2d, v18.2d
59 @@ -658,13 +657,10 @@ CPU_BE( rev a15, a15 )
60 zip1 v8.2d, v17.2d, v19.2d
61 zip2 v12.2d, v17.2d, v19.2d
62 stp a2, a3, [x1, #-56]
63 - ld1 {v16.16b-v19.16b}, [x2], x3
66 - ccmp x3, xzr, #4, lt
68 - csel x3, x3, xzr, eq
70 + ld1 {v16.16b-v19.16b}, [x2], #64
73 zip1 v1.2d, v20.2d, v22.2d
74 zip2 v5.2d, v20.2d, v22.2d
75 @@ -672,13 +668,10 @@ CPU_BE( rev a15, a15 )
76 zip1 v9.2d, v21.2d, v23.2d
77 zip2 v13.2d, v21.2d, v23.2d
78 stp a6, a7, [x1, #-40]
79 - ld1 {v20.16b-v23.16b}, [x2], x3
82 - ccmp x3, xzr, #4, lt
84 - csel x3, x3, xzr, eq
86 + ld1 {v20.16b-v23.16b}, [x2], #64
89 zip1 v2.2d, v24.2d, v26.2d
90 zip2 v6.2d, v24.2d, v26.2d
91 @@ -686,12 +679,10 @@ CPU_BE( rev a15, a15 )
92 zip1 v10.2d, v25.2d, v27.2d
93 zip2 v14.2d, v25.2d, v27.2d
94 stp a10, a11, [x1, #-24]
95 - ld1 {v24.16b-v27.16b}, [x2], x3
98 - ccmp x3, xzr, #4, lt
100 - csel x2, x2, x9, eq
101 + ld1 {v24.16b-v27.16b}, [x2], #64
102 + csel x2, x2, x3, ge
104 zip1 v3.2d, v28.2d, v30.2d
105 zip2 v7.2d, v28.2d, v30.2d
106 @@ -699,151 +690,105 @@ CPU_BE( rev a15, a15 )
107 zip1 v11.2d, v29.2d, v31.2d
108 zip2 v15.2d, v29.2d, v31.2d
109 stp a14, a15, [x1, #-8]
111 + tbnz x5, #63, .Lt128
112 ld1 {v28.16b-v31.16b}, [x2]
114 // xor with corresponding input, write to output
116 eor v16.16b, v16.16b, v0.16b
117 eor v17.16b, v17.16b, v1.16b
118 eor v18.16b, v18.16b, v2.16b
119 eor v19.16b, v19.16b, v3.16b
120 - st1 {v16.16b-v19.16b}, [x1], #64
124 + tbnz x6, #63, .Lt192
126 eor v20.16b, v20.16b, v4.16b
127 eor v21.16b, v21.16b, v5.16b
128 eor v22.16b, v22.16b, v6.16b
129 eor v23.16b, v23.16b, v7.16b
130 - st1 {v20.16b-v23.16b}, [x1], #64
134 + st1 {v16.16b-v19.16b}, [x1], #64
135 + tbnz x7, #63, .Lt256
137 eor v24.16b, v24.16b, v8.16b
138 eor v25.16b, v25.16b, v9.16b
139 eor v26.16b, v26.16b, v10.16b
140 eor v27.16b, v27.16b, v11.16b
141 - st1 {v24.16b-v27.16b}, [x1], #64
145 + st1 {v20.16b-v23.16b}, [x1], #64
146 + tbnz x8, #63, .Lt320
148 eor v28.16b, v28.16b, v12.16b
149 eor v29.16b, v29.16b, v13.16b
150 eor v30.16b, v30.16b, v14.16b
151 eor v31.16b, v31.16b, v15.16b
153 + st1 {v24.16b-v27.16b}, [x1], #64
154 st1 {v28.16b-v31.16b}, [x1]
159 - // fewer than 128 bytes of in/output
160 -0: ld1 {v8.16b}, [x10]
161 - ld1 {v9.16b}, [x11]
165 - ld1 {v16.16b-v19.16b}, [x2]
166 - tbl v4.16b, {v0.16b-v3.16b}, v8.16b
167 - tbx v20.16b, {v16.16b-v19.16b}, v9.16b
168 - add v8.16b, v8.16b, v10.16b
169 - add v9.16b, v9.16b, v10.16b
170 - tbl v5.16b, {v0.16b-v3.16b}, v8.16b
171 - tbx v21.16b, {v16.16b-v19.16b}, v9.16b
172 - add v8.16b, v8.16b, v10.16b
173 - add v9.16b, v9.16b, v10.16b
174 - tbl v6.16b, {v0.16b-v3.16b}, v8.16b
175 - tbx v22.16b, {v16.16b-v19.16b}, v9.16b
176 - add v8.16b, v8.16b, v10.16b
177 - add v9.16b, v9.16b, v10.16b
178 - tbl v7.16b, {v0.16b-v3.16b}, v8.16b
179 - tbx v23.16b, {v16.16b-v19.16b}, v9.16b
181 - eor v20.16b, v20.16b, v4.16b
182 - eor v21.16b, v21.16b, v5.16b
183 - eor v22.16b, v22.16b, v6.16b
184 - eor v23.16b, v23.16b, v7.16b
185 - st1 {v20.16b-v23.16b}, [x1]
188 // fewer than 192 bytes of in/output
189 -1: ld1 {v8.16b}, [x10]
190 - ld1 {v9.16b}, [x11]
193 - tbl v0.16b, {v4.16b-v7.16b}, v8.16b
194 - tbx v20.16b, {v16.16b-v19.16b}, v9.16b
195 - add v8.16b, v8.16b, v10.16b
196 - add v9.16b, v9.16b, v10.16b
197 - tbl v1.16b, {v4.16b-v7.16b}, v8.16b
198 - tbx v21.16b, {v16.16b-v19.16b}, v9.16b
199 - add v8.16b, v8.16b, v10.16b
200 - add v9.16b, v9.16b, v10.16b
201 - tbl v2.16b, {v4.16b-v7.16b}, v8.16b
202 - tbx v22.16b, {v16.16b-v19.16b}, v9.16b
203 - add v8.16b, v8.16b, v10.16b
204 - add v9.16b, v9.16b, v10.16b
205 - tbl v3.16b, {v4.16b-v7.16b}, v8.16b
206 - tbx v23.16b, {v16.16b-v19.16b}, v9.16b
208 - eor v20.16b, v20.16b, v0.16b
209 - eor v21.16b, v21.16b, v1.16b
210 - eor v22.16b, v22.16b, v2.16b
211 - eor v23.16b, v23.16b, v3.16b
212 - st1 {v20.16b-v23.16b}, [x1]
213 +.Lt192: cbz x5, 1f // exactly 128 bytes?
214 + ld1 {v28.16b-v31.16b}, [x10]
216 + tbl v28.16b, {v4.16b-v7.16b}, v28.16b
217 + tbl v29.16b, {v4.16b-v7.16b}, v29.16b
218 + tbl v30.16b, {v4.16b-v7.16b}, v30.16b
219 + tbl v31.16b, {v4.16b-v7.16b}, v31.16b
221 +0: eor v20.16b, v20.16b, v28.16b
222 + eor v21.16b, v21.16b, v29.16b
223 + eor v22.16b, v22.16b, v30.16b
224 + eor v23.16b, v23.16b, v31.16b
225 + st1 {v20.16b-v23.16b}, [x5] // overlapping stores
226 +1: st1 {v16.16b-v19.16b}, [x1]
229 + // fewer than 128 bytes of in/output
230 +.Lt128: ld1 {v28.16b-v31.16b}, [x10]
233 + tbl v28.16b, {v0.16b-v3.16b}, v28.16b
234 + tbl v29.16b, {v0.16b-v3.16b}, v29.16b
235 + tbl v30.16b, {v0.16b-v3.16b}, v30.16b
236 + tbl v31.16b, {v0.16b-v3.16b}, v31.16b
237 + ld1 {v16.16b-v19.16b}, [x1] // reload first output block
240 // fewer than 256 bytes of in/output
241 -2: ld1 {v4.16b}, [x10]
242 - ld1 {v5.16b}, [x11]
245 +.Lt256: cbz x6, 2f // exactly 192 bytes?
246 + ld1 {v4.16b-v7.16b}, [x10]
248 tbl v0.16b, {v8.16b-v11.16b}, v4.16b
249 - tbx v24.16b, {v20.16b-v23.16b}, v5.16b
250 - add v4.16b, v4.16b, v6.16b
251 - add v5.16b, v5.16b, v6.16b
252 - tbl v1.16b, {v8.16b-v11.16b}, v4.16b
253 - tbx v25.16b, {v20.16b-v23.16b}, v5.16b
254 - add v4.16b, v4.16b, v6.16b
255 - add v5.16b, v5.16b, v6.16b
256 - tbl v2.16b, {v8.16b-v11.16b}, v4.16b
257 - tbx v26.16b, {v20.16b-v23.16b}, v5.16b
258 - add v4.16b, v4.16b, v6.16b
259 - add v5.16b, v5.16b, v6.16b
260 - tbl v3.16b, {v8.16b-v11.16b}, v4.16b
261 - tbx v27.16b, {v20.16b-v23.16b}, v5.16b
263 - eor v24.16b, v24.16b, v0.16b
264 - eor v25.16b, v25.16b, v1.16b
265 - eor v26.16b, v26.16b, v2.16b
266 - eor v27.16b, v27.16b, v3.16b
267 - st1 {v24.16b-v27.16b}, [x1]
268 + tbl v1.16b, {v8.16b-v11.16b}, v5.16b
269 + tbl v2.16b, {v8.16b-v11.16b}, v6.16b
270 + tbl v3.16b, {v8.16b-v11.16b}, v7.16b
272 + eor v28.16b, v28.16b, v0.16b
273 + eor v29.16b, v29.16b, v1.16b
274 + eor v30.16b, v30.16b, v2.16b
275 + eor v31.16b, v31.16b, v3.16b
276 + st1 {v28.16b-v31.16b}, [x6] // overlapping stores
277 +2: st1 {v20.16b-v23.16b}, [x1]
280 // fewer than 320 bytes of in/output
281 -3: ld1 {v4.16b}, [x10]
282 - ld1 {v5.16b}, [x11]
285 +.Lt320: cbz x7, 3f // exactly 256 bytes?
286 + ld1 {v4.16b-v7.16b}, [x10]
288 tbl v0.16b, {v12.16b-v15.16b}, v4.16b
289 - tbx v28.16b, {v24.16b-v27.16b}, v5.16b
290 - add v4.16b, v4.16b, v6.16b
291 - add v5.16b, v5.16b, v6.16b
292 - tbl v1.16b, {v12.16b-v15.16b}, v4.16b
293 - tbx v29.16b, {v24.16b-v27.16b}, v5.16b
294 - add v4.16b, v4.16b, v6.16b
295 - add v5.16b, v5.16b, v6.16b
296 - tbl v2.16b, {v12.16b-v15.16b}, v4.16b
297 - tbx v30.16b, {v24.16b-v27.16b}, v5.16b
298 - add v4.16b, v4.16b, v6.16b
299 - add v5.16b, v5.16b, v6.16b
300 - tbl v3.16b, {v12.16b-v15.16b}, v4.16b
301 - tbx v31.16b, {v24.16b-v27.16b}, v5.16b
302 + tbl v1.16b, {v12.16b-v15.16b}, v5.16b
303 + tbl v2.16b, {v12.16b-v15.16b}, v6.16b
304 + tbl v3.16b, {v12.16b-v15.16b}, v7.16b
306 eor v28.16b, v28.16b, v0.16b
307 eor v29.16b, v29.16b, v1.16b
308 eor v30.16b, v30.16b, v2.16b
309 eor v31.16b, v31.16b, v3.16b
310 - st1 {v28.16b-v31.16b}, [x1]
311 + st1 {v28.16b-v31.16b}, [x7] // overlapping stores
312 +3: st1 {v24.16b-v27.16b}, [x1]
314 ENDPROC(chacha_4block_xor_neon)
316 @@ -851,7 +796,7 @@ ENDPROC(chacha_4block_xor_neon)
317 .align L1_CACHE_SHIFT