1 From f9b4c68865fdb7f3327f7d82fbc82c76c8773d53 Mon Sep 17 00:00:00 2001
2 From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3 Date: Fri, 8 Nov 2019 13:22:16 +0100
4 Subject: [PATCH 010/124] crypto: mips/chacha - import 32r2 ChaCha code from
7 Content-Type: text/plain; charset=UTF-8
8 Content-Transfer-Encoding: 8bit
10 commit 49aa7c00eddf8d8f462b0256bd82e81762d7b0c6 upstream.
12 This imports the accelerated MIPS 32r2 ChaCha20 implementation from the
15 Co-developed-by: René van Dorst <opensource@vdorst.com>
16 Signed-off-by: René van Dorst <opensource@vdorst.com>
17 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
18 Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
19 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
20 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
22 arch/mips/crypto/chacha-core.S | 424 +++++++++++++++++++++++++++++++++
23 1 file changed, 424 insertions(+)
24 create mode 100644 arch/mips/crypto/chacha-core.S
27 +++ b/arch/mips/crypto/chacha-core.S
29 +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
31 + * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
32 + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
35 +#define MASK_U32 0x3c
36 +#define CHACHA20_BLOCK_SIZE 64
37 +#define STACK_SIZE 32
55 +/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
61 +/* Input arguments */
67 +/* Output argument */
68 +/* NONCE[0] is kept in a register and not in memory.
69 + * We don't want to touch original value in memory.
70 + * Must be incremented every loop iteration.
74 +/* SAVED_X and SAVED_CA are set in the jump table.
75 + * Use regs which are overwritten on exit so we don't leak clear data.
76 + * They are used to handle the last bytes, which are not a multiple of 4.
81 +#define IS_UNALIGNED $s7
83 +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
87 +#define ROTR(n) rotr n, 24
88 +#define CPU_TO_LE32(n) \
95 +#define CPU_TO_LE32(n)
99 +#define FOR_EACH_WORD(x) \
117 +#define FOR_EACH_WORD_REV(x) \
135 +#define PLUS_ONE_0 1
136 +#define PLUS_ONE_1 2
137 +#define PLUS_ONE_2 3
138 +#define PLUS_ONE_3 4
139 +#define PLUS_ONE_4 5
140 +#define PLUS_ONE_5 6
141 +#define PLUS_ONE_6 7
142 +#define PLUS_ONE_7 8
143 +#define PLUS_ONE_8 9
144 +#define PLUS_ONE_9 10
145 +#define PLUS_ONE_10 11
146 +#define PLUS_ONE_11 12
147 +#define PLUS_ONE_12 13
148 +#define PLUS_ONE_13 14
149 +#define PLUS_ONE_14 15
150 +#define PLUS_ONE_15 16
151 +#define PLUS_ONE(x) PLUS_ONE_ ## x
152 +#define _CONCAT3(a,b,c) a ## b ## c
153 +#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
155 +#define STORE_UNALIGNED(x) \
156 +CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
158 + lw T0, (x*4)(STATE); \
160 + lwl T1, (x*4)+MSB ## (IN); \
161 + lwr T1, (x*4)+LSB ## (IN); \
163 + addu X ## x, NONCE_0; \
167 + CPU_TO_LE32(X ## x); \
169 + swl X ## x, (x*4)+MSB ## (OUT); \
170 + swr X ## x, (x*4)+LSB ## (OUT);
172 +#define STORE_ALIGNED(x) \
173 +CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
175 + lw T0, (x*4)(STATE); \
177 + lw T1, (x*4) ## (IN); \
179 + addu X ## x, NONCE_0; \
183 + CPU_TO_LE32(X ## x); \
185 + sw X ## x, (x*4) ## (OUT);
187 +/* Jump table macro.
188 + * Used to set up and handle the last bytes, which are not a multiple of 4.
189 + * X15 is free to store Xn
190 + * Every jumptable entry must be equal in size.
192 +#define JMPTBL_ALIGNED(x) \
193 +.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
195 + b .Lchacha20_mips_xor_aligned_ ## x ## _b; \
197 + addu SAVED_X, X ## x, NONCE_0; \
199 + addu SAVED_X, X ## x, SAVED_CA; \
203 +#define JMPTBL_UNALIGNED(x) \
204 +.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
206 + b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \
208 + addu SAVED_X, X ## x, NONCE_0; \
210 + addu SAVED_X, X ## x, SAVED_CA; \
214 +#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
231 +.globl chacha20_mips
234 + .frame $sp, STACK_SIZE, $ra
236 + addiu $sp, -STACK_SIZE
238 + /* BYTES == 0: nothing to do, skip to the end. */
239 + beqz BYTES, .Lchacha20_mips_end
241 + lw NONCE_0, 48(STATE)
253 + /* Test whether IN or OUT is unaligned.
254 + * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
256 + or IS_UNALIGNED, IN, OUT
257 + andi IS_UNALIGNED, 0x3
259 + /* Set number of rounds */
262 + b .Lchacha20_rounds_start
265 +.Loop_chacha20_rounds:
266 + addiu IN, CHACHA20_BLOCK_SIZE
267 + addiu OUT, CHACHA20_BLOCK_SIZE
270 +.Lchacha20_rounds_start:
290 +.Loop_chacha20_xor_rounds:
292 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
293 + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
294 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
295 + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
296 + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
297 + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
298 + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
299 + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
300 + bnez $at, .Loop_chacha20_xor_rounds
302 + addiu BYTES, -(CHACHA20_BLOCK_SIZE)
304 + /* Is data src/dst unaligned? Jump */
305 + bnez IS_UNALIGNED, .Loop_chacha20_unaligned
307 + /* Set number of rounds here to fill the delay slot. */
310 + /* BYTES < 0, it has no full block. */
311 + bltz BYTES, .Lchacha20_mips_no_full_block_aligned
313 + FOR_EACH_WORD_REV(STORE_ALIGNED)
315 + /* BYTES > 0? Loop again. */
316 + bgtz BYTES, .Loop_chacha20_rounds
318 + /* Place this here to fill delay slot */
321 + /* BYTES < 0? Handle last bytes */
322 + bltz BYTES, .Lchacha20_mips_xor_bytes
324 +.Lchacha20_mips_xor_done:
325 + /* Restore used registers */
335 + /* Write NONCE_0 back to right location in state */
336 + sw NONCE_0, 48(STATE)
338 +.Lchacha20_mips_end:
339 + addiu $sp, STACK_SIZE
342 +.Lchacha20_mips_no_full_block_aligned:
343 + /* Restore the offset on BYTES */
344 + addiu BYTES, CHACHA20_BLOCK_SIZE
346 + /* Get number of full WORDS */
347 + andi $at, BYTES, MASK_U32
349 + /* Load upper half of jump table addr */
350 + lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
352 + /* Calculate lower half jump table offset */
355 + /* Add offset to STATE */
356 + addu T1, STATE, $at
358 + /* Add lower half jump table addr */
359 + addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
361 + /* Read value from STATE */
364 + /* Store remaining bytecounter as negative value */
365 + subu BYTES, $at, BYTES
370 + FOR_EACH_WORD(JMPTBL_ALIGNED)
373 +.Loop_chacha20_unaligned:
374 + /* Set number of rounds here to fill the delay slot. */
377 + /* BYTES < 0, it has no full block. */
378 + bltz BYTES, .Lchacha20_mips_no_full_block_unaligned
380 + FOR_EACH_WORD_REV(STORE_UNALIGNED)
382 + /* BYTES > 0? Loop again. */
383 + bgtz BYTES, .Loop_chacha20_rounds
385 + /* Write NONCE_0 back to right location in state */
386 + sw NONCE_0, 48(STATE)
389 + /* Fall through to byte handling */
390 + bgez BYTES, .Lchacha20_mips_xor_done
391 +.Lchacha20_mips_xor_unaligned_0_b:
392 +.Lchacha20_mips_xor_aligned_0_b:
393 + /* Place this here to fill delay slot */
397 +.Lchacha20_mips_xor_bytes:
402 + addiu $at, BYTES, 1
403 + CPU_TO_LE32(SAVED_X)
407 + beqz $at, .Lchacha20_mips_xor_done
410 + addiu $at, BYTES, 2
414 + beqz $at, .Lchacha20_mips_xor_done
420 + b .Lchacha20_mips_xor_done
422 +.Lchacha20_mips_no_full_block_unaligned:
423 + /* Restore the offset on BYTES */
424 + addiu BYTES, CHACHA20_BLOCK_SIZE
426 + /* Get number of full WORDS */
427 + andi $at, BYTES, MASK_U32
429 + /* Load upper half of jump table addr */
430 + lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
432 + /* Calculate lower half jump table offset */
435 + /* Add offset to STATE */
436 + addu T1, STATE, $at
438 + /* Add lower half jump table addr */
439 + addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
441 + /* Read value from STATE */
444 + /* Store remaining bytecounter as negative value */
445 + subu BYTES, $at, BYTES
450 + FOR_EACH_WORD(JMPTBL_UNALIGNED)