1 From a338793df36990e97ab0b824fad6fbf6ef171f94 Mon Sep 17 00:00:00 2001
2 From: Ard Biesheuvel <ardb@kernel.org>
3 Date: Fri, 8 Nov 2019 13:22:26 +0100
4 Subject: [PATCH 020/124] crypto: mips/poly1305 - incorporate
5 OpenSSL/CRYPTOGAMS optimized implementation
7 Content-Type: text/plain; charset=UTF-8
8 Content-Transfer-Encoding: 8bit
10 commit a11d055e7a64ac34a5e99b6fe731299449cbcd58 upstream.
12 This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation for
13 MIPS authored by Andy Polyakov, a prior 64-bit only version of which has been
14 contributed by him to the OpenSSL project. The file 'poly1305-mips.pl' is taken
15 straight from this upstream GitHub repository [0] at commit
16 d22ade312a7af958ec955620b0d241cf42c37feb, and already contains all the changes
17 required to build it as part of a Linux kernel module.
19 [0] https://github.com/dot-asm/cryptogams
21 Co-developed-by: Andy Polyakov <appro@cryptogams.org>
22 Signed-off-by: Andy Polyakov <appro@cryptogams.org>
23 Co-developed-by: René van Dorst <opensource@vdorst.com>
24 Signed-off-by: René van Dorst <opensource@vdorst.com>
25 Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
26 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
27 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
29 arch/mips/crypto/Makefile | 14 +
30 arch/mips/crypto/poly1305-glue.c | 203 +++++
31 arch/mips/crypto/poly1305-mips.pl | 1273 +++++++++++++++++++++++++++++
33 lib/crypto/Kconfig | 1 +
34 5 files changed, 1496 insertions(+)
35 create mode 100644 arch/mips/crypto/poly1305-glue.c
36 create mode 100644 arch/mips/crypto/poly1305-mips.pl
38 --- a/arch/mips/crypto/Makefile
39 +++ b/arch/mips/crypto/Makefile
40 @@ -8,3 +8,17 @@ obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32
41 obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
42 chacha-mips-y := chacha-core.o chacha-glue.o
43 AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
45 +obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
46 +poly1305-mips-y := poly1305-core.o poly1305-glue.o
48 +perlasm-flavour-$(CONFIG_CPU_MIPS32) := o32
49 +perlasm-flavour-$(CONFIG_CPU_MIPS64) := 64
51 +quiet_cmd_perlasm = PERLASM $@
52 + cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
54 +$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
55 + $(call if_changed,perlasm)
57 +targets += poly1305-core.S
59 +++ b/arch/mips/crypto/poly1305-glue.c
61 +// SPDX-License-Identifier: GPL-2.0
63 + * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
65 + * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
68 +#include <asm/unaligned.h>
69 +#include <crypto/algapi.h>
70 +#include <crypto/internal/hash.h>
71 +#include <crypto/internal/poly1305.h>
72 +#include <linux/cpufeature.h>
73 +#include <linux/crypto.h>
74 +#include <linux/module.h>
76 +asmlinkage void poly1305_init_mips(void *state, const u8 *key);
77 +asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
78 +asmlinkage void poly1305_emit_mips(void *state, __le32 *digest, const u32 *nonce);
80 +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
82 + poly1305_init_mips(&dctx->h, key);
83 + dctx->s[0] = get_unaligned_le32(key + 16);
84 + dctx->s[1] = get_unaligned_le32(key + 20);
85 + dctx->s[2] = get_unaligned_le32(key + 24);
86 + dctx->s[3] = get_unaligned_le32(key + 28);
89 +EXPORT_SYMBOL(poly1305_init_arch);
91 +static int mips_poly1305_init(struct shash_desc *desc)
93 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
102 +static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
103 + u32 len, u32 hibit)
105 + if (unlikely(!dctx->sset)) {
107 + poly1305_init_mips(&dctx->h, src);
108 + src += POLY1305_BLOCK_SIZE;
109 + len -= POLY1305_BLOCK_SIZE;
112 + if (len >= POLY1305_BLOCK_SIZE) {
113 + dctx->s[0] = get_unaligned_le32(src + 0);
114 + dctx->s[1] = get_unaligned_le32(src + 4);
115 + dctx->s[2] = get_unaligned_le32(src + 8);
116 + dctx->s[3] = get_unaligned_le32(src + 12);
117 + src += POLY1305_BLOCK_SIZE;
118 + len -= POLY1305_BLOCK_SIZE;
121 + if (len < POLY1305_BLOCK_SIZE)
125 + len &= ~(POLY1305_BLOCK_SIZE - 1);
127 + poly1305_blocks_mips(&dctx->h, src, len, hibit);
130 +static int mips_poly1305_update(struct shash_desc *desc, const u8 *src,
133 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
135 + if (unlikely(dctx->buflen)) {
136 + u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
138 + memcpy(dctx->buf + dctx->buflen, src, bytes);
141 + dctx->buflen += bytes;
143 + if (dctx->buflen == POLY1305_BLOCK_SIZE) {
144 + mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1);
149 + if (likely(len >= POLY1305_BLOCK_SIZE)) {
150 + mips_poly1305_blocks(dctx, src, len, 1);
151 + src += round_down(len, POLY1305_BLOCK_SIZE);
152 + len %= POLY1305_BLOCK_SIZE;
155 + if (unlikely(len)) {
156 + dctx->buflen = len;
157 + memcpy(dctx->buf, src, len);
162 +void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
163 + unsigned int nbytes)
165 + if (unlikely(dctx->buflen)) {
166 + u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
168 + memcpy(dctx->buf + dctx->buflen, src, bytes);
171 + dctx->buflen += bytes;
173 + if (dctx->buflen == POLY1305_BLOCK_SIZE) {
174 + poly1305_blocks_mips(&dctx->h, dctx->buf,
175 + POLY1305_BLOCK_SIZE, 1);
180 + if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
181 + unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
183 + poly1305_blocks_mips(&dctx->h, src, len, 1);
185 + nbytes %= POLY1305_BLOCK_SIZE;
188 + if (unlikely(nbytes)) {
189 + dctx->buflen = nbytes;
190 + memcpy(dctx->buf, src, nbytes);
193 +EXPORT_SYMBOL(poly1305_update_arch);
195 +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
200 + if (unlikely(dctx->buflen)) {
201 + dctx->buf[dctx->buflen++] = 1;
202 + memset(dctx->buf + dctx->buflen, 0,
203 + POLY1305_BLOCK_SIZE - dctx->buflen);
204 + poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
207 + poly1305_emit_mips(&dctx->h, digest, dctx->s);
209 + /* mac = (h + s) % (2^128) */
210 + f = (f >> 32) + le32_to_cpu(digest[0]);
211 + put_unaligned_le32(f, dst);
212 + f = (f >> 32) + le32_to_cpu(digest[1]);
213 + put_unaligned_le32(f, dst + 4);
214 + f = (f >> 32) + le32_to_cpu(digest[2]);
215 + put_unaligned_le32(f, dst + 8);
216 + f = (f >> 32) + le32_to_cpu(digest[3]);
217 + put_unaligned_le32(f, dst + 12);
219 + *dctx = (struct poly1305_desc_ctx){};
221 +EXPORT_SYMBOL(poly1305_final_arch);
223 +static int mips_poly1305_final(struct shash_desc *desc, u8 *dst)
225 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
227 + if (unlikely(!dctx->sset))
230 + poly1305_final_arch(dctx, dst);
234 +static struct shash_alg mips_poly1305_alg = {
235 + .init = mips_poly1305_init,
236 + .update = mips_poly1305_update,
237 + .final = mips_poly1305_final,
238 + .digestsize = POLY1305_DIGEST_SIZE,
239 + .descsize = sizeof(struct poly1305_desc_ctx),
241 + .base.cra_name = "poly1305",
242 + .base.cra_driver_name = "poly1305-mips",
243 + .base.cra_priority = 200,
244 + .base.cra_blocksize = POLY1305_BLOCK_SIZE,
245 + .base.cra_module = THIS_MODULE,
248 +static int __init mips_poly1305_mod_init(void)
250 + return crypto_register_shash(&mips_poly1305_alg);
253 +static void __exit mips_poly1305_mod_exit(void)
255 + crypto_unregister_shash(&mips_poly1305_alg);
258 +module_init(mips_poly1305_mod_init);
259 +module_exit(mips_poly1305_mod_exit);
261 +MODULE_LICENSE("GPL v2");
262 +MODULE_ALIAS_CRYPTO("poly1305");
263 +MODULE_ALIAS_CRYPTO("poly1305-mips");
265 +++ b/arch/mips/crypto/poly1305-mips.pl
268 +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
270 +# ====================================================================
271 +# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
273 +# ====================================================================
275 +# Poly1305 hash for MIPS.
279 +# Numbers are cycles per processed byte with poly1305_blocks alone.
282 +# R1x000 ~5.5/+130% (big-endian)
283 +# Octeon II 2.50/+70% (little-endian)
287 +# Add 32-bit code path.
291 +# Modulo-scheduling the reduction makes it possible to omit the dependency
292 +# chain at the end of the inner loop and improve performance. Also optimize
293 +# MIPS32R2 code path for MIPS 1004K core. Per René van Dorst's suggestions.
296 +# R1x000 ~9.8/? (big-endian)
297 +# Octeon II 3.65/+140% (little-endian)
298 +# MT7621/1004K 4.75/? (little-endian)
300 +######################################################################
301 +# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
302 +# widely used. Then there is a new contender: NUBI. It appears that if
303 +# one picks the latter, it's possible to arrange code in an ABI-neutral
304 +# manner. Therefore let's stick to the NUBI register layout:
306 +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
307 +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
308 +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
309 +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
311 +# The return value is placed in $a0. The following coding rules facilitate
314 +# - never ever touch $tp, "thread pointer", former $gp [o32 can be
315 +# excluded from the rule, because it's specified volatile];
316 +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
318 +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
320 +# For reference here is register layout for N32/64 MIPS ABIs:
322 +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
323 +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
324 +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
325 +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
326 +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
328 +# <appro@openssl.org>
330 +######################################################################
332 +$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
334 +$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
336 +if ($flavour =~ /64|n32/i) {{{
337 +######################################################################
341 +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
342 +my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
345 +#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
346 + defined(_MIPS_ARCH_MIPS64R6)) \\
347 + && !defined(_MIPS_ARCH_MIPS64R2)
348 +# define _MIPS_ARCH_MIPS64R2
351 +#if defined(_MIPS_ARCH_MIPS64R6)
352 +# define dmultu(rs,rt)
353 +# define mflo(rd,rs,rt) dmulu rd,rs,rt
354 +# define mfhi(rd,rs,rt) dmuhu rd,rs,rt
356 +# define dmultu(rs,rt) dmultu rs,rt
357 +# define mflo(rd,rs,rt) mflo rd
358 +# define mfhi(rd,rs,rt) mfhi rd
362 +# define poly1305_init poly1305_init_mips
363 +# define poly1305_blocks poly1305_blocks_mips
364 +# define poly1305_emit poly1305_emit_mips
367 +#if defined(__MIPSEB__) && !defined(MIPSEB)
384 +.globl poly1305_init
396 +#if defined(_MIPS_ARCH_MIPS64R6)
397 + andi $tmp0,$inp,7 # $inp % 8
398 + dsubu $inp,$inp,$tmp0 # align $inp
399 + sll $tmp0,$tmp0,3 # byte to bit offset
402 + beqz $tmp0,.Laligned_key
405 + subu $tmp1,$zero,$tmp0
407 + dsllv $in0,$in0,$tmp0
408 + dsrlv $tmp3,$in1,$tmp1
409 + dsllv $in1,$in1,$tmp0
410 + dsrlv $tmp2,$tmp2,$tmp1
412 + dsrlv $in0,$in0,$tmp0
413 + dsllv $tmp3,$in1,$tmp1
414 + dsrlv $in1,$in1,$tmp0
415 + dsllv $tmp2,$tmp2,$tmp1
421 + ldl $in0,0+MSB($inp)
422 + ldl $in1,8+MSB($inp)
423 + ldr $in0,0+LSB($inp)
424 + ldr $in1,8+LSB($inp)
427 +# if defined(_MIPS_ARCH_MIPS64R2)
428 + dsbh $in0,$in0 # byte swap
433 + ori $tmp0,$zero,0xFF
434 + dsll $tmp2,$tmp0,32
435 + or $tmp0,$tmp2 # 0x000000FF000000FF
437 + and $tmp1,$in0,$tmp0 # byte swap
438 + and $tmp3,$in1,$tmp0
445 + dsll $tmp0,8 # 0x0000FF000000FF00
448 + and $tmp2,$in0,$tmp0
449 + and $tmp4,$in1,$tmp0
469 + dsll $tmp0,32 # 0x0000000100000000
470 + daddiu $tmp0,-63 # 0x00000000ffffffc1
471 + dsll $tmp0,28 # 0x0ffffffc10000000
472 + daddiu $tmp0,-1 # 0x0ffffffc0fffffff
475 + daddiu $tmp0,-3 # 0x0ffffffc0ffffffc
481 + daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
485 + li $v0,0 # return 0
490 +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
492 +my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
493 + ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
494 +my ($shr,$shl) = ($s6,$s7); # used on R6
498 +.globl poly1305_blocks
499 +.ent poly1305_blocks
502 + dsrl $len,4 # number of complete blocks
503 + bnez $len,poly1305_blocks_internal
507 +.end poly1305_blocks
510 +.ent poly1305_blocks_internal
511 +poly1305_blocks_internal:
513 +#if defined(_MIPS_ARCH_MIPS64R6)
515 + .mask $SAVED_REGS_MASK|0x000c0000,-8
521 + .mask $SAVED_REGS_MASK,-8
527 +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
536 +#if defined(_MIPS_ARCH_MIPS64R6)
538 + dsubu $inp,$inp,$shr # align $inp
539 + sll $shr,$shr,3 # byte to bit offset
540 + subu $shl,$zero,$shr
543 + ld $h0,0($ctx) # load hash value
547 + ld $r0,24($ctx) # load key
552 + daddu $len,$inp # end of buffer
557 +#if defined(_MIPS_ARCH_MIPS64R6)
558 + ld $in0,0($inp) # load input
560 + beqz $shr,.Laligned_inp
564 + dsllv $in0,$in0,$shr
565 + dsrlv $tmp3,$in1,$shl
566 + dsllv $in1,$in1,$shr
567 + dsrlv $tmp2,$tmp2,$shl
569 + dsrlv $in0,$in0,$shr
570 + dsllv $tmp3,$in1,$shl
571 + dsrlv $in1,$in1,$shr
572 + dsllv $tmp2,$tmp2,$shl
578 + ldl $in0,0+MSB($inp) # load input
579 + ldl $in1,8+MSB($inp)
580 + ldr $in0,0+LSB($inp)
581 + ldr $in1,8+LSB($inp)
585 +# if defined(_MIPS_ARCH_MIPS64R2)
586 + dsbh $in0,$in0 # byte swap
591 + ori $tmp0,$zero,0xFF
592 + dsll $tmp2,$tmp0,32
593 + or $tmp0,$tmp2 # 0x000000FF000000FF
595 + and $tmp1,$in0,$tmp0 # byte swap
596 + and $tmp3,$in1,$tmp0
603 + dsll $tmp0,8 # 0x0000FF000000FF00
606 + and $tmp2,$in0,$tmp0
607 + and $tmp4,$in1,$tmp0
626 + dsrl $tmp1,$h2,2 # modulo-scheduled reduction
630 + daddu $d0,$h0,$in0 # accumulate input
633 + daddu $d0,$d0,$tmp1 # ... and residue
634 + sltu $tmp1,$d0,$tmp1
640 + dmultu ($r0,$d0) # h0*r0
641 + daddu $d2,$h2,$padbit
642 + sltu $tmp0,$d1,$tmp0
646 + dmultu ($rs1,$d1) # h1*5*r1
649 + mflo ($tmp0,$rs1,$d1)
650 + mfhi ($tmp1,$rs1,$d1)
652 + dmultu ($r1,$d0) # h0*r1
653 + mflo ($tmp2,$r1,$d0)
657 + sltu $tmp0,$h0,$tmp0
659 + dmultu ($r0,$d1) # h1*r0
662 + mflo ($tmp0,$r0,$d1)
663 + mfhi ($tmp1,$r0,$d1)
665 + dmultu ($rs1,$d2) # h2*5*r1
666 + sltu $tmp2,$h1,$tmp2
668 + mflo ($tmp2,$rs1,$d2)
670 + dmultu ($r0,$d2) # h2*r0
673 + mflo ($tmp3,$r0,$d2)
674 + sltu $tmp0,$h1,$tmp0
678 + sltu $tmp2,$h1,$tmp2
682 + bne $inp,$len,.Loop
684 + sd $h0,0($ctx) # store hash value
689 +#if defined(_MIPS_ARCH_MIPS64R6)
693 + ld $s5,40($sp) # epilogue
696 +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
704 +#if defined(_MIPS_ARCH_MIPS64R6)
709 +.end poly1305_blocks_internal
713 +my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
717 +.globl poly1305_emit
727 + li $in0,-4 # final reduction
733 + daddu $tmp0,$tmp0,$in0
734 + sltu $in1,$tmp0,$in0
735 + daddiu $in0,$tmp0,5 # compare to modulus
736 + daddu $tmp1,$tmp1,$in1
738 + sltu $tmp4,$tmp1,$in1
739 + daddu $in1,$tmp1,$tmp3
740 + daddu $tmp2,$tmp2,$tmp4
741 + sltu $tmp3,$in1,$tmp3
742 + daddu $tmp2,$tmp2,$tmp3
744 + dsrl $tmp2,2 # see if it carried/borrowed
745 + dsubu $tmp2,$zero,$tmp2
754 + lwu $tmp0,0($nonce) # load nonce
755 + lwu $tmp1,4($nonce)
756 + lwu $tmp2,8($nonce)
757 + lwu $tmp3,12($nonce)
763 + daddu $in0,$tmp0 # accumulate nonce
765 + sltu $tmp0,$in0,$tmp0
768 + dsrl $tmp0,$in0,8 # write mac value
803 +.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
808 +######################################################################
812 +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
813 +my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
814 + ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
817 +#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
818 + defined(_MIPS_ARCH_MIPS32R6)) \\
819 + && !defined(_MIPS_ARCH_MIPS32R2)
820 +# define _MIPS_ARCH_MIPS32R2
823 +#if defined(_MIPS_ARCH_MIPS32R6)
824 +# define multu(rs,rt)
825 +# define mflo(rd,rs,rt) mulu rd,rs,rt
826 +# define mfhi(rd,rs,rt) muhu rd,rs,rt
828 +# define multu(rs,rt) multu rs,rt
829 +# define mflo(rd,rs,rt) mflo rd
830 +# define mfhi(rd,rs,rt) mfhi rd
834 +# define poly1305_init poly1305_init_mips
835 +# define poly1305_blocks poly1305_blocks_mips
836 +# define poly1305_emit poly1305_emit_mips
839 +#if defined(__MIPSEB__) && !defined(MIPSEB)
856 +.globl poly1305_init
870 +#if defined(_MIPS_ARCH_MIPS32R6)
871 + andi $tmp0,$inp,3 # $inp % 4
872 + subu $inp,$inp,$tmp0 # align $inp
873 + sll $tmp0,$tmp0,3 # byte to bit offset
878 + beqz $tmp0,.Laligned_key
881 + subu $tmp1,$zero,$tmp0
883 + sllv $in0,$in0,$tmp0
884 + srlv $tmp3,$in1,$tmp1
885 + sllv $in1,$in1,$tmp0
887 + srlv $tmp3,$in2,$tmp1
888 + sllv $in2,$in2,$tmp0
890 + srlv $tmp3,$in3,$tmp1
891 + sllv $in3,$in3,$tmp0
893 + srlv $tmp2,$tmp2,$tmp1
896 + srlv $in0,$in0,$tmp0
897 + sllv $tmp3,$in1,$tmp1
898 + srlv $in1,$in1,$tmp0
900 + sllv $tmp3,$in2,$tmp1
901 + srlv $in2,$in2,$tmp0
903 + sllv $tmp3,$in3,$tmp1
904 + srlv $in3,$in3,$tmp0
906 + sllv $tmp2,$tmp2,$tmp1
911 + lwl $in0,0+MSB($inp)
912 + lwl $in1,4+MSB($inp)
913 + lwl $in2,8+MSB($inp)
914 + lwl $in3,12+MSB($inp)
915 + lwr $in0,0+LSB($inp)
916 + lwr $in1,4+LSB($inp)
917 + lwr $in2,8+LSB($inp)
918 + lwr $in3,12+LSB($inp)
921 +# if defined(_MIPS_ARCH_MIPS32R2)
922 + wsbh $in0,$in0 # byte swap
931 + srl $tmp0,$in0,24 # byte swap
933 + andi $tmp2,$in0,0xFF00
942 + andi $tmp1,$in1,0xFF00
951 + andi $tmp2,$in2,0xFF00
960 + andi $tmp1,$in3,0xFF00
970 + ori $tmp0,0xffff # 0x0fffffff
971 + and $in0,$in0,$tmp0
972 + subu $tmp0,3 # 0x0ffffffc
973 + and $in1,$in1,$tmp0
974 + and $in2,$in2,$tmp0
975 + and $in3,$in3,$tmp0
985 + addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
986 + addu $in2,$in2,$tmp2
987 + addu $in3,$in3,$tmp3
997 +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
999 +my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
1000 + ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
1001 +my ($d0,$d1,$d2,$d3) =
1002 + ($a4,$a5,$a6,$a7);
1003 +my $shr = $t2; # used on R6
1004 +my $one = $t2; # used on R2
1007 +.globl poly1305_blocks
1009 +.ent poly1305_blocks
1011 + .frame $sp,16*4,$ra
1012 + .mask $SAVED_REGS_MASK,-4
1014 + subu $sp, $sp,4*12
1024 +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
1033 + srl $len,4 # number of complete blocks
1037 +#if defined(_MIPS_ARCH_MIPS32R6)
1039 + subu $inp,$inp,$shr # align $inp
1040 + sll $shr,$shr,3 # byte to bit offset
1043 + lw $h0,0($ctx) # load hash value
1049 + lw $r0,20($ctx) # load key
1058 + addu $len,$len,$inp # end of buffer
1063 +#if defined(_MIPS_ARCH_MIPS32R6)
1064 + lw $d0,0($inp) # load input
1068 + beqz $shr,.Laligned_inp
1071 + subu $t1,$zero,$shr
1101 + lwl $d0,0+MSB($inp) # load input
1102 + lwl $d1,4+MSB($inp)
1103 + lwl $d2,8+MSB($inp)
1104 + lwl $d3,12+MSB($inp)
1105 + lwr $d0,0+LSB($inp)
1106 + lwr $d1,4+LSB($inp)
1107 + lwr $d2,8+LSB($inp)
1108 + lwr $d3,12+LSB($inp)
1111 +# if defined(_MIPS_ARCH_MIPS32R2)
1112 + wsbh $d0,$d0 # byte swap
1121 + srl $at,$d0,24 # byte swap
1123 + andi $t1,$d0,0xFF00
1132 + andi $t0,$d1,0xFF00
1141 + andi $t1,$d2,0xFF00
1150 + andi $t0,$d3,0xFF00
1159 + srl $t0,$h4,2 # modulo-scheduled reduction
1163 + addu $d0,$d0,$h0 # accumulate input
1166 + addu $d0,$d0,$t0 # ... and residue
1170 + addu $h0,$h0,$at # carry
1176 + addu $h1,$h1,$h0 # carry
1182 + addu $h2,$h2,$h1 # carry
1186 +#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
1187 + multu $r0,$d0 # d0*r0
1189 + maddu $rs3,$d1 # d1*s3
1190 + addu $h3,$h3,$h2 # carry
1191 + maddu $rs2,$d2 # d2*s2
1192 + addu $h4,$h4,$padbit
1193 + maddu $rs1,$d3 # d3*s1
1198 + multu $r1,$d0 # d0*r1
1199 + maddu $r0,$d1 # d1*r0
1200 + maddu $rs3,$d2 # d2*s3
1201 + maddu $rs2,$d3 # d3*s2
1202 + maddu $rs1,$h4 # h4*s1
1203 + maddu $at,$one # hi*1
1207 + multu $r2,$d0 # d0*r2
1208 + maddu $r1,$d1 # d1*r1
1209 + maddu $r0,$d2 # d2*r0
1210 + maddu $rs3,$d3 # d3*s3
1211 + maddu $rs2,$h4 # h4*s2
1212 + maddu $at,$one # hi*1
1216 + mul $t0,$r0,$h4 # h4*r0
1218 + multu $r3,$d0 # d0*r3
1219 + maddu $r2,$d1 # d1*r2
1220 + maddu $r1,$d2 # d2*r1
1221 + maddu $r0,$d3 # d3*r0
1222 + maddu $rs3,$h4 # h4*s3
1223 + maddu $at,$one # hi*1
1227 + addiu $inp,$inp,16
1231 + multu ($r0,$d0) # d0*r0
1232 + mflo ($h0,$r0,$d0)
1233 + mfhi ($h1,$r0,$d0)
1236 + addu $h3,$h3,$h2 # carry
1238 + multu ($rs3,$d1) # d1*s3
1239 + mflo ($at,$rs3,$d1)
1240 + mfhi ($t0,$rs3,$d1)
1242 + addu $h4,$h4,$padbit
1243 + addiu $inp,$inp,16
1246 + multu ($rs2,$d2) # d2*s2
1247 + mflo ($a3,$rs2,$d2)
1248 + mfhi ($t1,$rs2,$d2)
1251 + multu ($rs1,$d3) # d3*s1
1255 + mflo ($at,$rs1,$d3)
1256 + mfhi ($t0,$rs1,$d3)
1259 + multu ($r1,$d0) # d0*r1
1264 + mflo ($a3,$r1,$d0)
1265 + mfhi ($h2,$r1,$d0)
1268 + multu ($r0,$d1) # d1*r0
1272 + mflo ($at,$r0,$d1)
1273 + mfhi ($t0,$r0,$d1)
1276 + multu ($rs3,$d2) # d2*s3
1279 + mflo ($a3,$rs3,$d2)
1280 + mfhi ($t1,$rs3,$d2)
1283 + multu ($rs2,$d3) # d3*s2
1287 + mflo ($at,$rs2,$d3)
1288 + mfhi ($t0,$rs2,$d3)
1291 + multu ($rs1,$h4) # h4*s1
1295 + mflo ($a3,$rs1,$h4)
1298 + multu ($r2,$d0) # d0*r2
1303 + mflo ($at,$r2,$d0)
1304 + mfhi ($h3,$r2,$d0)
1307 + multu ($r1,$d1) # d1*r1
1310 + mflo ($a3,$r1,$d1)
1311 + mfhi ($t1,$r1,$d1)
1314 + multu ($r0,$d2) # d2*r0
1317 + mflo ($at,$r0,$d2)
1318 + mfhi ($t0,$r0,$d2)
1321 + multu ($rs3,$d3) # d3*s3
1325 + mflo ($a3,$rs3,$d3)
1326 + mfhi ($t1,$rs3,$d3)
1329 + multu ($rs2,$h4) # h4*s2
1333 + mflo ($at,$rs2,$h4)
1336 + multu ($r3,$d0) # d0*r3
1341 + mflo ($a3,$r3,$d0)
1342 + mfhi ($t1,$r3,$d0)
1345 + multu ($r2,$d1) # d1*r2
1348 + mflo ($at,$r2,$d1)
1349 + mfhi ($t0,$r2,$d1)
1352 + multu ($r0,$d3) # d3*r0
1355 + mflo ($a3,$r0,$d3)
1356 + mfhi ($d3,$r0,$d3)
1359 + multu ($r1,$d2) # d2*r1
1363 + mflo ($at,$r1,$d2)
1364 + mfhi ($t0,$r1,$d2)
1367 + multu ($rs3,$h4) # h4*s3
1371 + mflo ($a3,$rs3,$h4)
1374 + multu ($r0,$h4) # h4*r0
1379 + mflo ($h4,$r0,$h4)
1385 + li $padbit,1 # if we loop, padbit is 1
1387 + bne $inp,$len,.Loop
1389 + sw $h0,0($ctx) # store hash value
1406 +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
1415 +.end poly1305_blocks
1419 +my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
1423 +.globl poly1305_emit
1435 + li $in0,-4 # final reduction
1437 + and $in0,$in0,$tmp4
1438 + andi $tmp4,$tmp4,3
1439 + addu $ctx,$ctx,$in0
1441 + addu $tmp0,$tmp0,$ctx
1442 + sltu $ctx,$tmp0,$ctx
1443 + addiu $in0,$tmp0,5 # compare to modulus
1444 + addu $tmp1,$tmp1,$ctx
1446 + sltu $ctx,$tmp1,$ctx
1447 + addu $in1,$in1,$tmp1
1448 + addu $tmp2,$tmp2,$ctx
1449 + sltu $in2,$in1,$tmp1
1450 + sltu $ctx,$tmp2,$ctx
1451 + addu $in2,$in2,$tmp2
1452 + addu $tmp3,$tmp3,$ctx
1453 + sltu $in3,$in2,$tmp2
1454 + sltu $ctx,$tmp3,$ctx
1455 + addu $in3,$in3,$tmp3
1456 + addu $tmp4,$tmp4,$ctx
1457 + sltu $ctx,$in3,$tmp3
1460 + srl $ctx,2 # see if it carried/borrowed
1461 + subu $ctx,$zero,$ctx
1476 + lw $tmp0,0($nonce) # load nonce
1477 + lw $tmp1,4($nonce)
1478 + lw $tmp2,8($nonce)
1479 + lw $tmp3,12($nonce)
1481 + addu $in0,$tmp0 # accumulate nonce
1482 + sltu $ctx,$in0,$tmp0
1485 + sltu $tmp1,$in1,$tmp1
1487 + sltu $ctx,$in1,$ctx
1491 + sltu $tmp2,$in2,$tmp2
1493 + sltu $ctx,$in2,$ctx
1499 + srl $tmp0,$in0,8 # write mac value
1531 +.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
1537 +$output=pop and open STDOUT,">$output";
1540 --- a/crypto/Kconfig
1541 +++ b/crypto/Kconfig
1542 @@ -707,6 +707,11 @@ config CRYPTO_POLY1305_X86_64
1543 in IETF protocols. This is the x86_64 assembler implementation using SIMD
1546 +config CRYPTO_POLY1305_MIPS
1547 + tristate "Poly1305 authenticator algorithm (MIPS optimized)"
1548 + depends on CPU_MIPS32 || (CPU_MIPS64 && 64BIT)
1549 + select CRYPTO_ARCH_HAVE_LIB_POLY1305
1552 tristate "MD4 digest algorithm"
1554 --- a/lib/crypto/Kconfig
1555 +++ b/lib/crypto/Kconfig
1556 @@ -39,6 +39,7 @@ config CRYPTO_LIB_DES
1558 config CRYPTO_LIB_POLY1305_RSIZE
1562 default 9 if ARM || ARM64