From ec96c25c1ce09c78e44bd4627bc0a3e610b7f5d8 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 8 Nov 2019 13:22:38 +0100
Subject: [PATCH 031/124] crypto: arm/curve25519 - wire up NEON implementation

commit d8f1308a025fc7e00414194ed742d5f05a21e13c upstream.

This ports the SUPERCOP implementation for usage in kernel space. In
addition to the usual header, macro, and style changes required for
kernel space, it makes a few small changes to the code:

  - The stack alignment is relaxed to 16 bytes.
  - Superfluous mov statements have been removed.
  - ldr for constants has been replaced with movw.
  - ldreq has been replaced with moveq.
  - The str epilogue has been made more idiomatic.
  - SIMD registers are not pushed and popped at the beginning and end.
  - The prologue and epilogue have been made idiomatic.
  - A hole has been removed from the stack, saving 32 bytes.
  - We write-back the base register whenever possible for vld1.8.
  - Some multiplications have been reordered for better A7 performance.

There are more opportunities for cleanup, since this code is from qhasm,
which doesn't always do the most opportune thing. But even prior to
extensive hand optimizations, this code delivers significant performance
improvements (given in get_cycles() per call):

              ----------- -------------
             | generic C | this commit |
------------ ----------- -------------
| Cortex-A7  |   49136   |    22395    |
------------ ----------- -------------
| Cortex-A17 |   17326   |     4983    |
------------ ----------- -------------

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
[ardb: - move to arch/arm/crypto
       - wire into lib/crypto framework
       - implement crypto API KPP hooks ]
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
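Notes:

With the lib/crypto wiring, existing users of the curve25519() library
interface pick up the NEON path automatically on capable hardware. As a
purely illustrative sketch (not part of this patch; example_shared_secret()
is an invented name), a consumer calls through <crypto/curve25519.h>, which
dispatches to the curve25519_arch() routine exported below:

  #include <crypto/curve25519.h>

  static int example_shared_secret(u8 ss[CURVE25519_KEY_SIZE],
                                   const u8 my_secret[CURVE25519_KEY_SIZE],
                                   const u8 their_public[CURVE25519_KEY_SIZE])
  {
          /* curve25519() uses the arch implementation when one is wired
           * in; it returns false if the result is the all-zero point. */
          if (!curve25519(ss, my_secret, their_public))
                  return -EINVAL;
          return 0;
  }
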
 arch/arm/crypto/Kconfig           |   6 +
 arch/arm/crypto/Makefile          |   2 +
 arch/arm/crypto/curve25519-core.S | 347 +++++++++++++-----------------
 arch/arm/crypto/curve25519-glue.c | 127 +++++++++++
 4 files changed, 287 insertions(+), 195 deletions(-)
 create mode 100644 arch/arm/crypto/curve25519-glue.c

--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -141,4 +141,10 @@ config CRYPTO_NHPOLY1305_NEON
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_NHPOLY1305

+config CRYPTO_CURVE25519_NEON
+	tristate "NEON accelerated Curve25519 scalar multiplication library"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_LIB_CURVE25519_GENERIC
+	select CRYPTO_ARCH_HAVE_LIB_CURVE25519
+
 endif
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha51
 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
 obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
+obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o

 ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -58,6 +59,7 @@ chacha-neon-y := chacha-scalar-core.o ch
 chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
 poly1305-arm-y := poly1305-core.o poly1305-glue.o
 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
+curve25519-neon-y := curve25519-core.o curve25519-glue.o

 ifdef REGENERATE_ARM_CRYPTO
 quiet_cmd_perl = PERL $@
--- a/arch/arm/crypto/curve25519-core.S
+++ b/arch/arm/crypto/curve25519-core.S
@@ -1,43 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
 /*
- * Public domain code from Daniel J. Bernstein and Peter Schwabe, from
- * SUPERCOP's curve25519/neon2/scalarmult.s.
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
+ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
+ * manually reworked for use in kernel space.
  */

-.fpu neon
+#include <linux/linkage.h>
+
 .text
+.fpu neon
+.arch armv7-a
 .align 4
-.global _crypto_scalarmult_curve25519_neon2
-.global crypto_scalarmult_curve25519_neon2
-.type _crypto_scalarmult_curve25519_neon2 STT_FUNC
-.type crypto_scalarmult_curve25519_neon2 STT_FUNC
- _crypto_scalarmult_curve25519_neon2:
- crypto_scalarmult_curve25519_neon2:
-	vpush {q4, q5, q6, q7}
-	mov r12, sp
-	sub sp, sp, #736
-	and sp, sp, #0xffffffe0
-	strd r4, [sp, #0]
-	strd r6, [sp, #8]
-	strd r8, [sp, #16]
-	strd r10, [sp, #24]
-	str r12, [sp, #480]
-	str r14, [sp, #484]
-	mov r0, r0
-	mov r1, r1
-	mov r2, r2
-	add r3, sp, #32
-	ldr r4, =0
-	ldr r5, =254
+
+ENTRY(curve25519_neon)
+	push {r4-r11, lr}
+	mov ip, sp
+	sub r3, sp, #704
+	and r3, r3, #0xfffffff0
+	mov sp, r3
+	movw r4, #0
+	movw r5, #254
 	vmov.i32 q0, #1
 	vshr.u64 q1, q0, #7
 	vshr.u64 q0, q0, #8
 	vmov.i32 d4, #19
 	vmov.i32 d5, #38
-	add r6, sp, #512
-	vst1.8 {d2-d3}, [r6, : 128]
-	add r6, sp, #528
-	vst1.8 {d0-d1}, [r6, : 128]
-	add r6, sp, #544
+	add r6, sp, #480
+	vst1.8 {d2-d3}, [r6, : 128]!
+	vst1.8 {d0-d1}, [r6, : 128]!
 	vst1.8 {d4-d5}, [r6, : 128]
 	add r6, r3, #0
 	vmov.i32 q2, #0
@@ -45,12 +37,12 @@
 	vst1.8 {d4-d5}, [r6, : 128]!
 	vst1.8 d4, [r6, : 64]
 	add r6, r3, #0
-	ldr r7, =960
+	movw r7, #960
 	sub r7, r7, #2
 	neg r7, r7
 	sub r7, r7, r7, LSL #7
 	str r7, [r6]
-	add r6, sp, #704
+	add r6, sp, #672
 	vld1.8 {d4-d5}, [r1]!
 	vld1.8 {d6-d7}, [r1]
 	vst1.8 {d4-d5}, [r6, : 128]!
@@ -212,15 +204,15 @@
 	vst1.8 {d0-d1}, [r6, : 128]!
 	vst1.8 {d2-d3}, [r6, : 128]!
 	vst1.8 d4, [r6, : 64]
-._mainloop:
+.Lmainloop:
 	mov r2, r5, LSR #3
 	and r6, r5, #7
 	ldrb r2, [r1, r2]
 	mov r2, r2, LSR r6
 	and r2, r2, #1
-	str r5, [sp, #488]
+	str r5, [sp, #456]
 	eor r4, r4, r2
-	str r2, [sp, #492]
+	str r2, [sp, #460]
 	neg r2, r4
 	add r4, r3, #96
 	add r5, r3, #192
@@ -291,7 +283,7 @@
 	vsub.i32 q0, q1, q3
 	vst1.8 d4, [r4, : 64]
 	vst1.8 d0, [r6, : 64]
-	add r2, sp, #544
+	add r2, sp, #512
 	add r4, r3, #96
 	add r5, r3, #144
 	vld1.8 {d0-d1}, [r2, : 128]
@@ -361,14 +353,13 @@
 	vmlal.s32 q0, d12, d8
 	vmlal.s32 q0, d13, d17
 	vmlal.s32 q0, d6, d6
-	add r2, sp, #512
-	vld1.8 {d18-d19}, [r2, : 128]
+	add r2, sp, #480
+	vld1.8 {d18-d19}, [r2, : 128]!
 	vmull.s32 q3, d16, d7
 	vmlal.s32 q3, d10, d15
 	vmlal.s32 q3, d11, d14
 	vmlal.s32 q3, d12, d9
 	vmlal.s32 q3, d13, d8
-	add r2, sp, #528
 	vld1.8 {d8-d9}, [r2, : 128]
 	vadd.i64 q5, q12, q9
 	vadd.i64 q6, q15, q9
@@ -502,22 +493,19 @@
 	vadd.i32 q5, q5, q0
 	vtrn.32 q11, q14
 	vadd.i32 q6, q6, q3
-	add r2, sp, #560
+	add r2, sp, #528
 	vadd.i32 q10, q10, q2
 	vtrn.32 d24, d25
-	vst1.8 {d12-d13}, [r2, : 128]
+	vst1.8 {d12-d13}, [r2, : 128]!
 	vshl.i32 q6, q13, #1
-	add r2, sp, #576
-	vst1.8 {d20-d21}, [r2, : 128]
+	vst1.8 {d20-d21}, [r2, : 128]!
 	vshl.i32 q10, q14, #1
-	add r2, sp, #592
-	vst1.8 {d12-d13}, [r2, : 128]
+	vst1.8 {d12-d13}, [r2, : 128]!
 	vshl.i32 q15, q12, #1
 	vadd.i32 q8, q8, q4
 	vext.32 d10, d31, d30, #0
 	vadd.i32 q7, q7, q1
-	add r2, sp, #608
-	vst1.8 {d16-d17}, [r2, : 128]
+	vst1.8 {d16-d17}, [r2, : 128]!
 	vmull.s32 q8, d18, d5
 	vmlal.s32 q8, d26, d4
 	vmlal.s32 q8, d19, d9
@@ -528,8 +516,7 @@
 	vmlal.s32 q8, d29, d1
 	vmlal.s32 q8, d24, d6
 	vmlal.s32 q8, d25, d0
-	add r2, sp, #624
-	vst1.8 {d14-d15}, [r2, : 128]
+	vst1.8 {d14-d15}, [r2, : 128]!
 	vmull.s32 q2, d18, d4
 	vmlal.s32 q2, d12, d9
 	vmlal.s32 q2, d13, d8
@@ -537,8 +524,7 @@
 	vmlal.s32 q2, d22, d2
 	vmlal.s32 q2, d23, d1
 	vmlal.s32 q2, d24, d0
-	add r2, sp, #640
-	vst1.8 {d20-d21}, [r2, : 128]
+	vst1.8 {d20-d21}, [r2, : 128]!
 	vmull.s32 q7, d18, d9
 	vmlal.s32 q7, d26, d3
 	vmlal.s32 q7, d19, d8
@@ -547,14 +533,12 @@
 	vmlal.s32 q7, d28, d1
 	vmlal.s32 q7, d23, d6
 	vmlal.s32 q7, d29, d0
-	add r2, sp, #656
-	vst1.8 {d10-d11}, [r2, : 128]
+	vst1.8 {d10-d11}, [r2, : 128]!
 	vmull.s32 q5, d18, d3
 	vmlal.s32 q5, d19, d2
 	vmlal.s32 q5, d22, d1
 	vmlal.s32 q5, d23, d0
 	vmlal.s32 q5, d12, d8
-	add r2, sp, #672
 	vst1.8 {d16-d17}, [r2, : 128]
 	vmull.s32 q4, d18, d8
 	vmlal.s32 q4, d26, d2
@@ -566,7 +550,7 @@
 	vmlal.s32 q8, d26, d1
 	vmlal.s32 q8, d19, d6
 	vmlal.s32 q8, d27, d0
-	add r2, sp, #576
+	add r2, sp, #544
 	vld1.8 {d20-d21}, [r2, : 128]
 	vmlal.s32 q7, d24, d21
 	vmlal.s32 q7, d25, d20
@@ -575,32 +559,30 @@
 	vmlal.s32 q8, d22, d21
 	vmlal.s32 q8, d28, d20
 	vmlal.s32 q5, d24, d20
-	add r2, sp, #576
 	vst1.8 {d14-d15}, [r2, : 128]
 	vmull.s32 q7, d18, d6
 	vmlal.s32 q7, d26, d0
-	add r2, sp, #656
+	add r2, sp, #624
 	vld1.8 {d30-d31}, [r2, : 128]
 	vmlal.s32 q2, d30, d21
 	vmlal.s32 q7, d19, d21
 	vmlal.s32 q7, d27, d20
-	add r2, sp, #624
+	add r2, sp, #592
 	vld1.8 {d26-d27}, [r2, : 128]
 	vmlal.s32 q4, d25, d27
 	vmlal.s32 q8, d29, d27
 	vmlal.s32 q8, d25, d26
 	vmlal.s32 q7, d28, d27
 	vmlal.s32 q7, d29, d26
-	add r2, sp, #608
+	add r2, sp, #576
 	vld1.8 {d28-d29}, [r2, : 128]
 	vmlal.s32 q4, d24, d29
 	vmlal.s32 q8, d23, d29
 	vmlal.s32 q8, d24, d28
 	vmlal.s32 q7, d22, d29
 	vmlal.s32 q7, d23, d28
-	add r2, sp, #608
 	vst1.8 {d8-d9}, [r2, : 128]
-	add r2, sp, #560
+	add r2, sp, #528
 	vld1.8 {d8-d9}, [r2, : 128]
 	vmlal.s32 q7, d24, d9
 	vmlal.s32 q7, d25, d31
@@ -621,36 +603,36 @@
 	vmlal.s32 q0, d23, d26
 	vmlal.s32 q0, d24, d31
 	vmlal.s32 q0, d19, d20
-	add r2, sp, #640
+	add r2, sp, #608
 	vld1.8 {d18-d19}, [r2, : 128]
 	vmlal.s32 q2, d18, d7
-	vmlal.s32 q2, d19, d6
 	vmlal.s32 q5, d18, d6
-	vmlal.s32 q5, d19, d21
 	vmlal.s32 q1, d18, d21
-	vmlal.s32 q1, d19, d29
 	vmlal.s32 q0, d18, d28
-	vmlal.s32 q0, d19, d9
 	vmlal.s32 q6, d18, d29
+	vmlal.s32 q2, d19, d6
+	vmlal.s32 q5, d19, d21
+	vmlal.s32 q1, d19, d29
+	vmlal.s32 q0, d19, d9
 	vmlal.s32 q6, d19, d28
-	add r2, sp, #592
+	add r2, sp, #560
 	vld1.8 {d18-d19}, [r2, : 128]
-	add r2, sp, #512
+	add r2, sp, #480
 	vld1.8 {d22-d23}, [r2, : 128]
 	vmlal.s32 q5, d19, d7
 	vmlal.s32 q0, d18, d21
 	vmlal.s32 q0, d19, d29
 	vmlal.s32 q6, d18, d6
-	add r2, sp, #528
+	add r2, sp, #496
 	vld1.8 {d6-d7}, [r2, : 128]
 	vmlal.s32 q6, d19, d21
-	add r2, sp, #576
+	add r2, sp, #544
 	vld1.8 {d18-d19}, [r2, : 128]
 	vmlal.s32 q0, d30, d8
-	add r2, sp, #672
+	add r2, sp, #640
 	vld1.8 {d20-d21}, [r2, : 128]
 	vmlal.s32 q5, d30, d29
-	add r2, sp, #608
+	add r2, sp, #576
 	vld1.8 {d24-d25}, [r2, : 128]
 	vmlal.s32 q1, d30, d28
 	vadd.i64 q13, q0, q11
@@ -823,22 +805,19 @@
 	vadd.i32 q5, q5, q0
 	vtrn.32 q11, q14
 	vadd.i32 q6, q6, q3
-	add r2, sp, #560
+	add r2, sp, #528
 	vadd.i32 q10, q10, q2
 	vtrn.32 d24, d25
-	vst1.8 {d12-d13}, [r2, : 128]
+	vst1.8 {d12-d13}, [r2, : 128]!
 	vshl.i32 q6, q13, #1
-	add r2, sp, #576
-	vst1.8 {d20-d21}, [r2, : 128]
+	vst1.8 {d20-d21}, [r2, : 128]!
 	vshl.i32 q10, q14, #1
-	add r2, sp, #592
-	vst1.8 {d12-d13}, [r2, : 128]
+	vst1.8 {d12-d13}, [r2, : 128]!
 	vshl.i32 q15, q12, #1
 	vadd.i32 q8, q8, q4
 	vext.32 d10, d31, d30, #0
 	vadd.i32 q7, q7, q1
-	add r2, sp, #608
-	vst1.8 {d16-d17}, [r2, : 128]
+	vst1.8 {d16-d17}, [r2, : 128]!
 	vmull.s32 q8, d18, d5
 	vmlal.s32 q8, d26, d4
 	vmlal.s32 q8, d19, d9
@@ -849,8 +828,7 @@
 	vmlal.s32 q8, d29, d1
 	vmlal.s32 q8, d24, d6
 	vmlal.s32 q8, d25, d0
-	add r2, sp, #624
-	vst1.8 {d14-d15}, [r2, : 128]
+	vst1.8 {d14-d15}, [r2, : 128]!
 	vmull.s32 q2, d18, d4
 	vmlal.s32 q2, d12, d9
 	vmlal.s32 q2, d13, d8
@@ -858,8 +836,7 @@
 	vmlal.s32 q2, d22, d2
 	vmlal.s32 q2, d23, d1
 	vmlal.s32 q2, d24, d0
-	add r2, sp, #640
-	vst1.8 {d20-d21}, [r2, : 128]
+	vst1.8 {d20-d21}, [r2, : 128]!
 	vmull.s32 q7, d18, d9
 	vmlal.s32 q7, d26, d3
 	vmlal.s32 q7, d19, d8
@@ -868,15 +845,13 @@
 	vmlal.s32 q7, d28, d1
 	vmlal.s32 q7, d23, d6
 	vmlal.s32 q7, d29, d0
-	add r2, sp, #656
-	vst1.8 {d10-d11}, [r2, : 128]
+	vst1.8 {d10-d11}, [r2, : 128]!
 	vmull.s32 q5, d18, d3
 	vmlal.s32 q5, d19, d2
 	vmlal.s32 q5, d22, d1
 	vmlal.s32 q5, d23, d0
 	vmlal.s32 q5, d12, d8
-	add r2, sp, #672
-	vst1.8 {d16-d17}, [r2, : 128]
+	vst1.8 {d16-d17}, [r2, : 128]!
 	vmull.s32 q4, d18, d8
 	vmlal.s32 q4, d26, d2
 	vmlal.s32 q4, d19, d7
@@ -887,7 +862,7 @@
 	vmlal.s32 q8, d26, d1
 	vmlal.s32 q8, d19, d6
 	vmlal.s32 q8, d27, d0
-	add r2, sp, #576
+	add r2, sp, #544
 	vld1.8 {d20-d21}, [r2, : 128]
 	vmlal.s32 q7, d24, d21
 	vmlal.s32 q7, d25, d20
@@ -896,32 +871,30 @@
 	vmlal.s32 q8, d22, d21
 	vmlal.s32 q8, d28, d20
 	vmlal.s32 q5, d24, d20
-	add r2, sp, #576
 	vst1.8 {d14-d15}, [r2, : 128]
 	vmull.s32 q7, d18, d6
 	vmlal.s32 q7, d26, d0
-	add r2, sp, #656
+	add r2, sp, #624
 	vld1.8 {d30-d31}, [r2, : 128]
 	vmlal.s32 q2, d30, d21
 	vmlal.s32 q7, d19, d21
 	vmlal.s32 q7, d27, d20
-	add r2, sp, #624
+	add r2, sp, #592
 	vld1.8 {d26-d27}, [r2, : 128]
 	vmlal.s32 q4, d25, d27
 	vmlal.s32 q8, d29, d27
 	vmlal.s32 q8, d25, d26
 	vmlal.s32 q7, d28, d27
 	vmlal.s32 q7, d29, d26
-	add r2, sp, #608
+	add r2, sp, #576
 	vld1.8 {d28-d29}, [r2, : 128]
 	vmlal.s32 q4, d24, d29
 	vmlal.s32 q8, d23, d29
 	vmlal.s32 q8, d24, d28
 	vmlal.s32 q7, d22, d29
 	vmlal.s32 q7, d23, d28
-	add r2, sp, #608
 	vst1.8 {d8-d9}, [r2, : 128]
-	add r2, sp, #560
+	add r2, sp, #528
 	vld1.8 {d8-d9}, [r2, : 128]
 	vmlal.s32 q7, d24, d9
 	vmlal.s32 q7, d25, d31
@@ -942,36 +915,36 @@
 	vmlal.s32 q0, d23, d26
 	vmlal.s32 q0, d24, d31
 	vmlal.s32 q0, d19, d20
-	add r2, sp, #640
+	add r2, sp, #608
 	vld1.8 {d18-d19}, [r2, : 128]
 	vmlal.s32 q2, d18, d7
-	vmlal.s32 q2, d19, d6
 	vmlal.s32 q5, d18, d6
-	vmlal.s32 q5, d19, d21
 	vmlal.s32 q1, d18, d21
-	vmlal.s32 q1, d19, d29
 	vmlal.s32 q0, d18, d28
-	vmlal.s32 q0, d19, d9
 	vmlal.s32 q6, d18, d29
+	vmlal.s32 q2, d19, d6
+	vmlal.s32 q5, d19, d21
+	vmlal.s32 q1, d19, d29
+	vmlal.s32 q0, d19, d9
 	vmlal.s32 q6, d19, d28
-	add r2, sp, #592
+	add r2, sp, #560
 	vld1.8 {d18-d19}, [r2, : 128]
-	add r2, sp, #512
+	add r2, sp, #480
 	vld1.8 {d22-d23}, [r2, : 128]
 	vmlal.s32 q5, d19, d7
 	vmlal.s32 q0, d18, d21
 	vmlal.s32 q0, d19, d29
 	vmlal.s32 q6, d18, d6
-	add r2, sp, #528
+	add r2, sp, #496
 	vld1.8 {d6-d7}, [r2, : 128]
 	vmlal.s32 q6, d19, d21
-	add r2, sp, #576
+	add r2, sp, #544
 	vld1.8 {d18-d19}, [r2, : 128]
 	vmlal.s32 q0, d30, d8
-	add r2, sp, #672
+	add r2, sp, #640
 	vld1.8 {d20-d21}, [r2, : 128]
 	vmlal.s32 q5, d30, d29
-	add r2, sp, #608
+	add r2, sp, #576
 	vld1.8 {d24-d25}, [r2, : 128]
 	vmlal.s32 q1, d30, d28
 	vadd.i64 q13, q0, q11
@@ -1069,7 +1042,7 @@
 	sub r4, r4, #24
 	vst1.8 d0, [r2, : 64]
 	vst1.8 d1, [r4, : 64]
-	add r2, sp, #544
+	add r2, sp, #512
 	add r4, r3, #144
 	add r5, r3, #192
 	vld1.8 {d0-d1}, [r2, : 128]
@@ -1139,14 +1112,13 @@
 	vmlal.s32 q0, d12, d8
 	vmlal.s32 q0, d13, d17
 	vmlal.s32 q0, d6, d6
-	add r2, sp, #512
-	vld1.8 {d18-d19}, [r2, : 128]
+	add r2, sp, #480
+	vld1.8 {d18-d19}, [r2, : 128]!
 	vmull.s32 q3, d16, d7
 	vmlal.s32 q3, d10, d15
 	vmlal.s32 q3, d11, d14
 	vmlal.s32 q3, d12, d9
 	vmlal.s32 q3, d13, d8
-	add r2, sp, #528
 	vld1.8 {d8-d9}, [r2, : 128]
 	vadd.i64 q5, q12, q9
 	vadd.i64 q6, q15, q9
@@ -1295,22 +1267,19 @@
 	vadd.i32 q5, q5, q0
 	vtrn.32 q11, q14
 	vadd.i32 q6, q6, q3
-	add r2, sp, #560
+	add r2, sp, #528
 	vadd.i32 q10, q10, q2
 	vtrn.32 d24, d25
-	vst1.8 {d12-d13}, [r2, : 128]
+	vst1.8 {d12-d13}, [r2, : 128]!
 	vshl.i32 q6, q13, #1
-	add r2, sp, #576
-	vst1.8 {d20-d21}, [r2, : 128]
+	vst1.8 {d20-d21}, [r2, : 128]!
 	vshl.i32 q10, q14, #1
-	add r2, sp, #592
-	vst1.8 {d12-d13}, [r2, : 128]
+	vst1.8 {d12-d13}, [r2, : 128]!
 	vshl.i32 q15, q12, #1
 	vadd.i32 q8, q8, q4
 	vext.32 d10, d31, d30, #0
 	vadd.i32 q7, q7, q1
-	add r2, sp, #608
-	vst1.8 {d16-d17}, [r2, : 128]
+	vst1.8 {d16-d17}, [r2, : 128]!
 	vmull.s32 q8, d18, d5
 	vmlal.s32 q8, d26, d4
 	vmlal.s32 q8, d19, d9
@@ -1321,8 +1290,7 @@
 	vmlal.s32 q8, d29, d1
 	vmlal.s32 q8, d24, d6
 	vmlal.s32 q8, d25, d0
-	add r2, sp, #624
-	vst1.8 {d14-d15}, [r2, : 128]
+	vst1.8 {d14-d15}, [r2, : 128]!
 	vmull.s32 q2, d18, d4
 	vmlal.s32 q2, d12, d9
 	vmlal.s32 q2, d13, d8
@@ -1330,8 +1298,7 @@
 	vmlal.s32 q2, d22, d2
 	vmlal.s32 q2, d23, d1
 	vmlal.s32 q2, d24, d0
-	add r2, sp, #640
-	vst1.8 {d20-d21}, [r2, : 128]
+	vst1.8 {d20-d21}, [r2, : 128]!
 	vmull.s32 q7, d18, d9
 	vmlal.s32 q7, d26, d3
 	vmlal.s32 q7, d19, d8
@@ -1340,15 +1307,13 @@
 	vmlal.s32 q7, d28, d1
 	vmlal.s32 q7, d23, d6
 	vmlal.s32 q7, d29, d0
-	add r2, sp, #656
-	vst1.8 {d10-d11}, [r2, : 128]
+	vst1.8 {d10-d11}, [r2, : 128]!
 	vmull.s32 q5, d18, d3
 	vmlal.s32 q5, d19, d2
 	vmlal.s32 q5, d22, d1
 	vmlal.s32 q5, d23, d0
 	vmlal.s32 q5, d12, d8
-	add r2, sp, #672
-	vst1.8 {d16-d17}, [r2, : 128]
+	vst1.8 {d16-d17}, [r2, : 128]!
 	vmull.s32 q4, d18, d8
 	vmlal.s32 q4, d26, d2
 	vmlal.s32 q4, d19, d7
@@ -1359,7 +1324,7 @@
 	vmlal.s32 q8, d26, d1
 	vmlal.s32 q8, d19, d6
 	vmlal.s32 q8, d27, d0
-	add r2, sp, #576
+	add r2, sp, #544
 	vld1.8 {d20-d21}, [r2, : 128]
 	vmlal.s32 q7, d24, d21
 	vmlal.s32 q7, d25, d20
@@ -1368,32 +1333,30 @@
 	vmlal.s32 q8, d22, d21
 	vmlal.s32 q8, d28, d20
 	vmlal.s32 q5, d24, d20
-	add r2, sp, #576
 	vst1.8 {d14-d15}, [r2, : 128]
 	vmull.s32 q7, d18, d6
 	vmlal.s32 q7, d26, d0
-	add r2, sp, #656
+	add r2, sp, #624
 	vld1.8 {d30-d31}, [r2, : 128]
 	vmlal.s32 q2, d30, d21
 	vmlal.s32 q7, d19, d21
 	vmlal.s32 q7, d27, d20
-	add r2, sp, #624
+	add r2, sp, #592
 	vld1.8 {d26-d27}, [r2, : 128]
 	vmlal.s32 q4, d25, d27
 	vmlal.s32 q8, d29, d27
 	vmlal.s32 q8, d25, d26
 	vmlal.s32 q7, d28, d27
 	vmlal.s32 q7, d29, d26
-	add r2, sp, #608
+	add r2, sp, #576
 	vld1.8 {d28-d29}, [r2, : 128]
 	vmlal.s32 q4, d24, d29
 	vmlal.s32 q8, d23, d29
 	vmlal.s32 q8, d24, d28
 	vmlal.s32 q7, d22, d29
 	vmlal.s32 q7, d23, d28
-	add r2, sp, #608
 	vst1.8 {d8-d9}, [r2, : 128]
-	add r2, sp, #560
+	add r2, sp, #528
 	vld1.8 {d8-d9}, [r2, : 128]
 	vmlal.s32 q7, d24, d9
 	vmlal.s32 q7, d25, d31
@@ -1414,36 +1377,36 @@
 	vmlal.s32 q0, d23, d26
 	vmlal.s32 q0, d24, d31
 	vmlal.s32 q0, d19, d20
-	add r2, sp, #640
+	add r2, sp, #608
 	vld1.8 {d18-d19}, [r2, : 128]
 	vmlal.s32 q2, d18, d7
-	vmlal.s32 q2, d19, d6
 	vmlal.s32 q5, d18, d6
-	vmlal.s32 q5, d19, d21
 	vmlal.s32 q1, d18, d21
-	vmlal.s32 q1, d19, d29
 	vmlal.s32 q0, d18, d28
-	vmlal.s32 q0, d19, d9
 	vmlal.s32 q6, d18, d29
+	vmlal.s32 q2, d19, d6
+	vmlal.s32 q5, d19, d21
+	vmlal.s32 q1, d19, d29
+	vmlal.s32 q0, d19, d9
 	vmlal.s32 q6, d19, d28
-	add r2, sp, #592
+	add r2, sp, #560
 	vld1.8 {d18-d19}, [r2, : 128]
-	add r2, sp, #512
+	add r2, sp, #480
 	vld1.8 {d22-d23}, [r2, : 128]
 	vmlal.s32 q5, d19, d7
 	vmlal.s32 q0, d18, d21
 	vmlal.s32 q0, d19, d29
 	vmlal.s32 q6, d18, d6
-	add r2, sp, #528
+	add r2, sp, #496
 	vld1.8 {d6-d7}, [r2, : 128]
 	vmlal.s32 q6, d19, d21
-	add r2, sp, #576
+	add r2, sp, #544
 	vld1.8 {d18-d19}, [r2, : 128]
 	vmlal.s32 q0, d30, d8
-	add r2, sp, #672
+	add r2, sp, #640
 	vld1.8 {d20-d21}, [r2, : 128]
 	vmlal.s32 q5, d30, d29
-	add r2, sp, #608
+	add r2, sp, #576
 	vld1.8 {d24-d25}, [r2, : 128]
 	vmlal.s32 q1, d30, d28
 	vadd.i64 q13, q0, q11
@@ -1541,10 +1504,10 @@
 	sub r4, r4, #24
 	vst1.8 d0, [r2, : 64]
 	vst1.8 d1, [r4, : 64]
-	ldr r2, [sp, #488]
-	ldr r4, [sp, #492]
+	ldr r2, [sp, #456]
+	ldr r4, [sp, #460]
 	subs r5, r2, #1
-	bge ._mainloop
+	bge .Lmainloop
 	add r1, r3, #144
 	add r2, r3, #336
 	vld1.8 {d0-d1}, [r1, : 128]!
@@ -1553,41 +1516,41 @@
 	vst1.8 {d0-d1}, [r2, : 128]!
 	vst1.8 {d2-d3}, [r2, : 128]!
 	vst1.8 d4, [r2, : 64]
-	ldr r1, =0
-._invertloop:
+	movw r1, #0
+.Linvertloop:
 	add r2, r3, #144
-	ldr r4, =0
-	ldr r5, =2
+	movw r4, #0
+	movw r5, #2
 	cmp r1, #1
-	ldreq r5, =1
+	moveq r5, #1
 	addeq r2, r3, #336
 	addeq r4, r3, #48
 	cmp r1, #2
-	ldreq r5, =1
+	moveq r5, #1
 	addeq r2, r3, #48
 	cmp r1, #3
-	ldreq r5, =5
+	moveq r5, #5
 	addeq r4, r3, #336
 	cmp r1, #4
-	ldreq r5, =10
+	moveq r5, #10
 	cmp r1, #5
-	ldreq r5, =20
+	moveq r5, #20
 	cmp r1, #6
-	ldreq r5, =10
+	moveq r5, #10
 	addeq r2, r3, #336
 	addeq r4, r3, #336
 	cmp r1, #7
-	ldreq r5, =50
+	moveq r5, #50
 	cmp r1, #8
-	ldreq r5, =100
+	moveq r5, #100
 	cmp r1, #9
-	ldreq r5, =50
+	moveq r5, #50
 	addeq r2, r3, #336
 	cmp r1, #10
-	ldreq r5, =5
+	moveq r5, #5
 	addeq r2, r3, #48
 	cmp r1, #11
-	ldreq r5, =0
+	moveq r5, #0
 	addeq r2, r3, #96
 	add r6, r3, #144
 	add r7, r3, #288
@@ -1598,8 +1561,8 @@
 	vst1.8 {d2-d3}, [r7, : 128]!
 	vst1.8 d4, [r7, : 64]
 	cmp r5, #0
-	beq ._skipsquaringloop
-._squaringloop:
+	beq .Lskipsquaringloop
+.Lsquaringloop:
 	add r6, r3, #288
 	add r7, r3, #288
 	add r8, r3, #288
@@ -1611,7 +1574,7 @@
 	vld1.8 {d6-d7}, [r7, : 128]!
 	vld1.8 {d9}, [r7, : 64]
 	vld1.8 {d10-d11}, [r6, : 128]!
-	add r7, sp, #416
+	add r7, sp, #384
 	vld1.8 {d12-d13}, [r6, : 128]!
 	vmul.i32 q7, q2, q0
 	vld1.8 {d8}, [r6, : 64]
@@ -1726,7 +1689,7 @@
 	vext.32 d10, d6, d6, #0
 	vmov.i32 q1, #0xffffffff
 	vshl.i64 q4, q1, #25
-	add r7, sp, #512
+	add r7, sp, #480
 	vld1.8 {d14-d15}, [r7, : 128]
 	vadd.i64 q9, q2, q7
 	vshl.i64 q1, q1, #26
@@ -1735,7 +1698,7 @@
 	vadd.i64 q5, q5, q10
 	vand q9, q9, q1
 	vld1.8 {d16}, [r6, : 64]!
-	add r6, sp, #528
+	add r6, sp, #496
 	vld1.8 {d20-d21}, [r6, : 128]
 	vadd.i64 q11, q5, q10
 	vsub.i64 q2, q2, q9
@@ -1789,8 +1752,8 @@
 	sub r6, r6, #32
 	vst1.8 d4, [r6, : 64]
 	subs r5, r5, #1
-	bhi ._squaringloop
-._skipsquaringloop:
+	bhi .Lsquaringloop
+.Lskipsquaringloop:
 	mov r2, r2
 	add r5, r3, #288
 	add r6, r3, #144
@@ -1802,7 +1765,7 @@
 	vld1.8 {d6-d7}, [r5, : 128]!
 	vld1.8 {d9}, [r5, : 64]
 	vld1.8 {d10-d11}, [r2, : 128]!
-	add r5, sp, #416
+	add r5, sp, #384
 	vld1.8 {d12-d13}, [r2, : 128]!
 	vmul.i32 q7, q2, q0
 	vld1.8 {d8}, [r2, : 64]
@@ -1917,7 +1880,7 @@
 	vext.32 d10, d6, d6, #0
 	vmov.i32 q1, #0xffffffff
 	vshl.i64 q4, q1, #25
-	add r5, sp, #512
+	add r5, sp, #480
 	vld1.8 {d14-d15}, [r5, : 128]
 	vadd.i64 q9, q2, q7
 	vshl.i64 q1, q1, #26
@@ -1926,7 +1889,7 @@
 	vadd.i64 q5, q5, q10
 	vand q9, q9, q1
 	vld1.8 {d16}, [r2, : 64]!
-	add r2, sp, #528
+	add r2, sp, #496
 	vld1.8 {d20-d21}, [r2, : 128]
 	vadd.i64 q11, q5, q10
 	vsub.i64 q2, q2, q9
@@ -1980,7 +1943,7 @@
 	sub r2, r2, #32
 	vst1.8 d4, [r2, : 64]
 	cmp r4, #0
-	beq ._skippostcopy
+	beq .Lskippostcopy
 	add r2, r3, #144
 	mov r4, r4
 	vld1.8 {d0-d1}, [r2, : 128]!
@@ -1989,9 +1952,9 @@
 	vst1.8 {d0-d1}, [r4, : 128]!
 	vst1.8 {d2-d3}, [r4, : 128]!
 	vst1.8 d4, [r4, : 64]
-._skippostcopy:
+.Lskippostcopy:
 	cmp r1, #1
-	bne ._skipfinalcopy
+	bne .Lskipfinalcopy
 	add r2, r3, #288
 	add r4, r3, #144
 	vld1.8 {d0-d1}, [r2, : 128]!
@@ -2000,10 +1963,10 @@
 	vst1.8 {d0-d1}, [r4, : 128]!
 	vst1.8 {d2-d3}, [r4, : 128]!
 	vst1.8 d4, [r4, : 64]
-._skipfinalcopy:
+.Lskipfinalcopy:
 	add r1, r1, #1
 	cmp r1, #12
-	blo ._invertloop
+	blo .Linvertloop
 	add r1, r3, #144
 	ldr r2, [r1], #4
 	ldr r3, [r1], #4
@@ -2085,21 +2048,15 @@
 	add r8, r8, r10, LSL #12
 	mov r9, r10, LSR #20
 	add r1, r9, r1, LSL #6
-	str r2, [r0], #4
-	str r3, [r0], #4
-	str r4, [r0], #4
-	str r5, [r0], #4
-	str r6, [r0], #4
-	str r7, [r0], #4
-	str r8, [r0], #4
-	str r1, [r0]
-	ldrd r4, [sp, #0]
-	ldrd r6, [sp, #8]
-	ldrd r8, [sp, #16]
-	ldrd r10, [sp, #24]
-	ldr r12, [sp, #480]
-	ldr r14, [sp, #484]
-	ldr r0, =0
-	mov sp, r12
-	vpop {q4, q5, q6, q7}
-	bx lr
+	str r2, [r0]
+	str r3, [r0, #4]
+	str r4, [r0, #8]
+	str r5, [r0, #12]
+	str r6, [r0, #16]
+	str r7, [r0, #20]
+	str r8, [r0, #24]
+	str r1, [r0, #28]
+	movw r0, #0
+	mov sp, ip
+	pop {r4-r11, pc}
+ENDPROC(curve25519_neon)
--- /dev/null
+++ b/arch/arm/crypto/curve25519-glue.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
+ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
+ * manually reworked for use in kernel space.
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/kpp.h>
+#include <crypto/internal/simd.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/jump_label.h>
+#include <crypto/curve25519.h>
+
+asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
+				const u8 secret[CURVE25519_KEY_SIZE],
+				const u8 basepoint[CURVE25519_KEY_SIZE]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
+		     const u8 scalar[CURVE25519_KEY_SIZE],
+		     const u8 point[CURVE25519_KEY_SIZE])
+{
+	if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
+		kernel_neon_begin();
+		curve25519_neon(out, scalar, point);
+		kernel_neon_end();
+	} else {
+		curve25519_generic(out, scalar, point);
+	}
+}
+EXPORT_SYMBOL(curve25519_arch);
+
+static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
+				 unsigned int len)
+{
+	u8 *secret = kpp_tfm_ctx(tfm);
+
+	if (!len)
+		curve25519_generate_secret(secret);
+	else if (len == CURVE25519_KEY_SIZE &&
+		 crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
+		memcpy(secret, buf, CURVE25519_KEY_SIZE);
+	else
+		return -EINVAL;
+	return 0;
+}
+
+static int curve25519_compute_value(struct kpp_request *req)
+{
+	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
+	const u8 *secret = kpp_tfm_ctx(tfm);
+	u8 public_key[CURVE25519_KEY_SIZE];
+	u8 buf[CURVE25519_KEY_SIZE];
+	int copied, nbytes;
+	u8 const *bp;
+
+	if (req->src) {
+		copied = sg_copy_to_buffer(req->src,
+					   sg_nents_for_len(req->src,
+							    CURVE25519_KEY_SIZE),
+					   public_key, CURVE25519_KEY_SIZE);
+		if (copied != CURVE25519_KEY_SIZE)
+			return -EINVAL;
+		bp = public_key;
+	} else {
+		bp = curve25519_base_point;
+	}
+
+	curve25519_arch(buf, secret, bp);
+
+	/* might want less than we've got */
+	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
+	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
+								nbytes),
+				     buf, nbytes);
+	if (copied != nbytes)
+		return -EINVAL;
+	return 0;
+}
+
+static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
+{
+	return CURVE25519_KEY_SIZE;
+}
+
+static struct kpp_alg curve25519_alg = {
+	.base.cra_name		= "curve25519",
+	.base.cra_driver_name	= "curve25519-neon",
+	.base.cra_priority	= 200,
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_ctxsize	= CURVE25519_KEY_SIZE,
+
+	.set_secret		= curve25519_set_secret,
+	.generate_public_key	= curve25519_compute_value,
+	.compute_shared_secret	= curve25519_compute_value,
+	.max_size		= curve25519_max_size,
+};
+
+static int __init mod_init(void)
+{
+	if (elf_hwcap & HWCAP_NEON) {
+		static_branch_enable(&have_neon);
+		return crypto_register_kpp(&curve25519_alg);
+	}
+	return 0;
+}
+
+static void __exit mod_exit(void)
+{
+	if (elf_hwcap & HWCAP_NEON)
+		crypto_unregister_kpp(&curve25519_alg);
+}
+
+module_init(mod_init);
+module_exit(mod_exit);
+
+MODULE_ALIAS_CRYPTO("curve25519");
+MODULE_ALIAS_CRYPTO("curve25519-neon");
+MODULE_LICENSE("GPL v2");
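
For reference (not part of the commit), the KPP hooks registered above are
reachable through the generic kpp API. Below is a minimal, illustrative
sketch only: example_kpp_public_key() is an invented name and error handling
is abbreviated. It derives a public key from a raw 32-byte secret by
omitting the src scatterlist, which makes curve25519_compute_value() fall
back to the base point:

  #include <crypto/curve25519.h>
  #include <crypto/kpp.h>
  #include <linux/err.h>
  #include <linux/scatterlist.h>

  /* public_key must not live on the stack: it is mapped through a
   * scatterlist, so allocate it with kmalloc() or similar. */
  static int example_kpp_public_key(const u8 secret[CURVE25519_KEY_SIZE],
                                    u8 *public_key)
  {
          struct crypto_kpp *tfm;
          struct kpp_request *req;
          struct scatterlist dst;
          int err;

          tfm = crypto_alloc_kpp("curve25519", 0, 0);
          if (IS_ERR(tfm))
                  return PTR_ERR(tfm);

          err = crypto_kpp_set_secret(tfm, secret, CURVE25519_KEY_SIZE);
          if (err)
                  goto out_tfm;

          req = kpp_request_alloc(tfm, GFP_KERNEL);
          if (!req) {
                  err = -ENOMEM;
                  goto out_tfm;
          }

          /* No src scatterlist: the driver multiplies by the base point. */
          sg_init_one(&dst, public_key, CURVE25519_KEY_SIZE);
          kpp_request_set_input(req, NULL, 0);
          kpp_request_set_output(req, &dst, CURVE25519_KEY_SIZE);
          err = crypto_kpp_generate_public_key(req);

          kpp_request_free(req);
  out_tfm:
          crypto_free_kpp(tfm);
          return err;
  }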