From: Felix Fietkau Date: Thu, 10 Jul 2025 21:46:26 +0000 (+0200) Subject: lib: add ML-DSA 44 code X-Git-Url: http://git.cdn.openwrt.org/?a=commitdiff_plain;h=a9e90114b38c6712ac571b10a9357a003f3fc3d5;p=project%2Funetd.git lib: add ML-DSA 44 code Used for post-quantum signatures Signed-off-by: Felix Fietkau --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 3d9dd37..6b2e915 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,7 +47,7 @@ ELSE() ENDIF() ADD_DEFINITIONS(-DMLK_CONFIG_USE_NATIVE_BACKEND_ARITH) -ADD_LIBRARY(unet SHARED curve25519.c siphash.c sha512.c fprime.c f25519.c ed25519.c edsign.c auth-data.c chacha20.c pex-msg.c utils.c stun.c random.c sntrup761.c shake.c) +ADD_LIBRARY(unet SHARED curve25519.c siphash.c sha512.c fprime.c f25519.c ed25519.c edsign.c auth-data.c chacha20.c pex-msg.c utils.c stun.c random.c sntrup761.c shake.c mldsa.c) TARGET_LINK_LIBRARIES(unet ubox) ADD_EXECUTABLE(unetd ${SOURCES}) diff --git a/mldsa.c b/mldsa.c new file mode 100644 index 0000000..b302712 --- /dev/null +++ b/mldsa.c @@ -0,0 +1,2211 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#include +#include + +#include "shake.h" +#include "random.h" +#include "utils.h" +#include "mldsa.h" + +#define MLD_DEFAULT_ALIGN 32 +#define MLD_ALIGN_UP(N) \ + ((((N) + (MLD_DEFAULT_ALIGN - 1)) / MLD_DEFAULT_ALIGN) * MLD_DEFAULT_ALIGN) +#if defined(__GNUC__) +#define MLD_ALIGN __attribute__((aligned(MLD_DEFAULT_ALIGN))) +#elif defined(_MSC_VER) +#define MLD_ALIGN __declspec(align(MLD_DEFAULT_ALIGN)) +#else +#define MLD_ALIGN /* No known support for alignment constraints */ +#endif + +#define MLDSA_SEEDBYTES 32 +#define MLDSA_CRHBYTES 64 +#define MLDSA_TRBYTES 64 +#define MLDSA_RNDBYTES 32 +#define MLDSA_N 256 +#define MLDSA_Q 8380417 +#define MLDSA_Q_HALF ((MLDSA_Q + 1) / 2) /* 4190209 */ +#define MLDSA_D 13 + +#define MLDSA_K 4 +#define MLDSA_L 4 +#define MLDSA_ETA 2 +#define MLDSA_TAU 39 +#define MLDSA_BETA 78 +#define MLDSA_GAMMA1 (1 << 17) +#define MLDSA_GAMMA2 ((MLDSA_Q - 1) / 88) +#define MLDSA_OMEGA 80 +#define MLDSA_CTILDEBYTES 32 +#define MLDSA_POLYZ_PACKEDBYTES 576 +#define MLDSA_POLYW1_PACKEDBYTES 192 +#define MLDSA_POLYETA_PACKEDBYTES 96 + +#define MLDSA_POLYT1_PACKEDBYTES 320 +#define MLDSA_POLYT0_PACKEDBYTES 416 +#define MLDSA_POLYVECH_PACKEDBYTES (MLDSA_OMEGA + MLDSA_K) + +#define CRYPTO_PUBLICKEYBYTES \ + (MLDSA_SEEDBYTES + MLDSA_K * MLDSA_POLYT1_PACKEDBYTES) +#define CRYPTO_SECRETKEYBYTES \ + (2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES + MLDSA_L * MLDSA_POLYETA_PACKEDBYTES + \ + MLDSA_K * MLDSA_POLYETA_PACKEDBYTES + MLDSA_K * MLDSA_POLYT0_PACKEDBYTES) +#define CRYPTO_BYTES \ + (MLDSA_CTILDEBYTES + MLDSA_L * MLDSA_POLYZ_PACKEDBYTES + \ + MLDSA_POLYVECH_PACKEDBYTES) + +#define STREAM128_BLOCKBYTES SHAKE128_RATE +#define STREAM256_BLOCKBYTES SHAKE256_RATE + +#define mld_xof256_ctx keccak_state +#define mld_xof256_init(CTX) shake256_init(CTX) +#define mld_xof256_absorb(CTX, IN, INBYTES) \ + do \ + { \ + shake256_absorb(CTX, IN, INBYTES); \ + shake256_finalize(CTX); \ + } while (0) +#define mld_xof256_release(CTX) shake256_release(CTX) + +#define mld_xof256_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake256_squeezeblocks(OUT, OUTBLOCKS, STATE) + +#define mld_xof128_ctx keccak_state +#define mld_xof128_init(CTX) shake128_init(CTX) +#define mld_xof128_absorb(CTX, IN, INBYTES) \ + do \ + { \ + shake128_absorb(CTX, IN, INBYTES); \ + shake128_finalize(CTX); \ + } while (0) +#define mld_xof128_release(CTX) shake128_release(CTX) + + +#define mld_xof128_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) + +#define mld_xof256_x4_ctx mld_shake256x4ctx +#define mld_xof256_x4_init(CTX) mld_shake256x4_init((CTX)) +#define mld_xof256_x4_absorb(CTX, IN, INBYTES) \ + mld_shake256x4_absorb_once((CTX), (IN)[0], (IN)[1], (IN)[2], (IN)[3], \ + (INBYTES)) +#define mld_xof256_x4_squeezeblocks(BUF, NBLOCKS, CTX) \ + mld_shake256x4_squeezeblocks((BUF)[0], (BUF)[1], (BUF)[2], (BUF)[3], \ + (NBLOCKS), (CTX)) +#define mld_xof256_x4_release(CTX) mld_shake256x4_release((CTX)) + +#define mld_xof128_x4_ctx mld_shake128x4ctx +#define mld_xof128_x4_init(CTX) mld_shake128x4_init((CTX)) +#define mld_xof128_x4_absorb(CTX, IN, INBYTES) \ + mld_shake128x4_absorb_once((CTX), (IN)[0], (IN)[1], (IN)[2], (IN)[3], \ + (INBYTES)) +#define mld_xof128_x4_squeezeblocks(BUF, NBLOCKS, CTX) \ + mld_shake128x4_squeezeblocks((BUF)[0], (BUF)[1], (BUF)[2], (BUF)[3], \ + (NBLOCKS), (CTX)) +#define mld_xof128_x4_release(CTX) mld_shake128x4_release((CTX)) + +static int32_t mld_cast_uint32_to_int32(uint32_t x) +{ + /* + * PORTABILITY: This relies on uint32_t -> int32_t + * being implemented as the inverse of int32_t -> uint32_t, + * which is implementation-defined (C99 6.3.1.3 (3)) + * CBMC (correctly) fails to prove this conversion is OK, + * so we have to suppress that check here + */ + return (int32_t)x; +} + +static int32_t montgomery_reduce(int64_t a) +{ + /* check-magic: 58728449 == unsigned_mod(pow(MLDSA_Q, -1, 2^32), 2^32) */ + const uint64_t QINV = 58728449; + + /* Compute a*q^{-1} mod 2^32 in unsigned representatives */ + const uint32_t a_reduced = a & UINT32_MAX; + const uint32_t a_inverted = (a_reduced * QINV) & UINT32_MAX; + + /* Lift to signed canonical representative mod 2^16. */ + const int32_t t = mld_cast_uint32_to_int32(a_inverted); + + int64_t r; + + r = a - ((int64_t)t * MLDSA_Q); + + /* + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + r = r >> 32; + return (int32_t)r; +} + +static int32_t reduce32(int32_t a) +{ + int32_t t; + + t = (a + (1 << 22)) >> 23; + t = a - t * MLDSA_Q; + return t; +} + +static int32_t caddq(int32_t a) +{ + a += (a >> 31) & MLDSA_Q; + return a; +} + +static void power2round(int32_t *a0, int32_t *a1, const int32_t a) +{ + *a1 = (a + (1 << (MLDSA_D - 1)) - 1) >> MLDSA_D; + *a0 = a - (*a1 << MLDSA_D); +} + +static void decompose(int32_t *a0, int32_t *a1, int32_t a) +{ + *a1 = (a + 127) >> 7; + /* We know a >= 0 and a < MLDSA_Q, so... */ + *a1 = (*a1 * 11275 + (1 << 23)) >> 24; + *a1 ^= ((43 - *a1) >> 31) & *a1; + *a0 = a - *a1 * 2 * MLDSA_GAMMA2; + *a0 -= (((MLDSA_Q - 1) / 2 - *a0) >> 31) & MLDSA_Q; +} + +static unsigned int make_hint(int32_t a0, int32_t a1) +{ + if (a0 > MLDSA_GAMMA2 || a0 < -MLDSA_GAMMA2 || + (a0 == -MLDSA_GAMMA2 && a1 != 0)) + { + return 1; + } + + return 0; +} + +static int32_t use_hint(int32_t a, unsigned int hint) +{ + int32_t a0, a1; + + decompose(&a0, &a1, a); + if (hint == 0) + { + return a1; + } + + if (a0 > 0) + { + return (a1 == 43) ? 0 : a1 + 1; + } + else + { + return (a1 == 0) ? 43 : a1 - 1; + } +} + +/* + * Table of zeta values used in the reference NTT and inverse NTT. + * See autogen for details. + */ +static const int32_t zetas[MLDSA_N] = { + 0, 25847, -2608894, -518909, 237124, -777960, -876248, + 466468, 1826347, 2353451, -359251, -2091905, 3119733, -2884855, + 3111497, 2680103, 2725464, 1024112, -1079900, 3585928, -549488, + -1119584, 2619752, -2108549, -2118186, -3859737, -1399561, -3277672, + 1757237, -19422, 4010497, 280005, 2706023, 95776, 3077325, + 3530437, -1661693, -3592148, -2537516, 3915439, -3861115, -3043716, + 3574422, -2867647, 3539968, -300467, 2348700, -539299, -1699267, + -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596, + 811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, + -2797779, -3930395, -1528703, -3677745, -3041255, -1452451, 3475950, + 2176455, -1585221, -1257611, 1939314, -4083598, -1000202, -3190144, + -3157330, -3632928, 126922, 3412210, -983419, 2147896, 2715295, + -2967645, -3693493, -411027, -2477047, -671102, -1228525, -22981, + -1308169, -381987, 1349076, 1852771, -1430430, -3343383, 264944, + 508951, 3097992, 44288, -1100098, 904516, 3958618, -3724342, + -8578, 1653064, -3249728, 2389356, -210977, 759969, -1316856, + 189548, -3553272, 3159746, -1851402, -2409325, -177440, 1315589, + 1341330, 1285669, -1584928, -812732, -1439742, -3019102, -3881060, + -3628969, 3839961, 2091667, 3407706, 2316500, 3817976, -3342478, + 2244091, -2446433, -3562462, 266997, 2434439, -1235728, 3513181, + -3520352, -3759364, -1197226, -3193378, 900702, 1859098, 909542, + 819034, 495491, -1613174, -43260, -522500, -655327, -3122442, + 2031748, 3207046, -3556995, -525098, -768622, -3595838, 342297, + 286988, -2437823, 4108315, 3437287, -3342277, 1735879, 203044, + 2842341, 2691481, -2590150, 1265009, 4055324, 1247620, 2486353, + 1595974, -3767016, 1250494, 2635921, -3548272, -2994039, 1869119, + 1903435, -1050970, -1333058, 1237275, -3318210, -1430225, -451100, + 1312455, 3306115, -1962642, -1279661, 1917081, -2546312, -1374803, + 1500165, 777191, 2235880, 3406031, -542412, -2831860, -1671176, + -1846953, -2584293, -3724270, 594136, -3776993, -2013608, 2432395, + 2454455, -164721, 1957272, 3369112, 185531, -1207385, -3183426, + 162844, 1616392, 3014001, 810149, 1652634, -3694233, -1799107, + -3038916, 3523897, 3866901, 269760, 2213111, -975884, 1717735, + 472078, -426683, 1723600, -1803090, 1910376, -1667432, -1104333, + -260646, -3833893, -2939036, -2235985, -420899, -2286327, 183443, + -976891, 1612842, -3545687, -554416, 3919660, -48306, -1362209, + 3937738, 1400424, -846154, 1976782, +}; + +static int32_t mld_fqmul(int32_t a, int32_t b) +{ + return montgomery_reduce((int64_t)a * (int64_t)b); + /* TODO: reason about bounds */ +} + +/************************************************* + * Name: mld_fqsacle + * + * Description: Scales a field element by mont/256 , i.e., performs Montgomery + * multiplication by mont^2/256. + * Input is expected to have absolute value smaller than + * 256 * MLDSA_Q. + * Output has absolute value smaller than MLD_INTT_BOUND (4211139). + * + * Arguments: - int32_t a: Field element to be scaled. + **************************************************/ +static int32_t mld_fqscale(int32_t a) +{ + const int32_t f = 41978; /* mont^2/256 */ + return montgomery_reduce((int64_t)a * f); + /* TODO: reason about bounds */ +} + + +/* mld_ntt_butterfly_block() + * + * Computes a block CT butterflies with a fixed twiddle factor, + * using Montgomery multiplication. + * + * Parameters: + * - r: Pointer to base of polynomial (_not_ the base of butterfly block) + * - zeta: Twiddle factor to use for the butterfly. This must be in + * Montgomery form and signed canonical. + * - start: Offset to the beginning of the butterfly block + * - len: Index difference between coefficients subject to a butterfly + * - bound: Ghost variable describing coefficient bound: Prior to `start`, + * coefficients must be bound by `bound + MLDSA_Q`. Post `start`, + * they must be bound by `bound`. + * When this function returns, output coefficients in the index range + * [start, start+2*len) have bound bumped to `bound + MLDSA_Q`. + * Example: + * - start=8, len=4 + * This would compute the following four butterflies + * 8 -- 12 + * 9 -- 13 + * 10 -- 14 + * 11 -- 15 + * - start=4, len=2 + * This would compute the following two butterflies + * 4 -- 6 + * 5 -- 7 + */ + +/* Reference: Embedded in `ntt()` in the reference implementation. */ +static void mld_ntt_butterfly_block(int32_t r[MLDSA_N], const int32_t zeta, + const unsigned start, const unsigned len, + const int32_t bound) +{ + /* `bound` is a ghost variable only needed in the CBMC specification */ + unsigned j; + ((void)bound); + for (j = start; j < start + len; j++) + { + int32_t t; + t = mld_fqmul(r[j + len], zeta); + r[j + len] = r[j] - t; + r[j] = r[j] + t; + } +} + +/* mld_ntt_layer() + * + * Compute one layer of forward NTT + * + * Parameters: + * - r: Pointer to base of polynomial + * - layer: Indicates which layer is being applied. + */ + +/* Reference: Embedded in `ntt()` in the reference implementation. */ +static void mld_ntt_layer(int32_t r[MLDSA_N], const unsigned layer) +{ + unsigned start, k, len; + /* Twiddle factors for layer n are at indices 2^(n-1)..2^n-1. */ + k = 1u << (layer - 1); + len = MLDSA_N >> layer; + for (start = 0; start < MLDSA_N; start += 2 * len) + { + int32_t zeta = zetas[k++]; + mld_ntt_butterfly_block(r, zeta, start, len, layer * MLDSA_Q); + } +} + + +static void ntt(int32_t a[MLDSA_N]) +{ + unsigned int layer; + + for (layer = 1; layer < 9; layer++) + { + mld_ntt_layer(a, layer); + } + + /* When the loop exits, layer == 9, so the loop invariant */ + /* directly implies the postcondition in that coefficients */ + /* are bounded in magnitude by 9 * MLDSA_Q */ +} + +/* Reference: Embedded into `invntt_tomont()` in the reference implementation + * [@REF] */ +static void mld_invntt_layer(int32_t r[MLDSA_N], unsigned layer) +{ + unsigned start, k, len; + len = (MLDSA_N >> layer); + k = (1u << layer) - 1; + for (start = 0; start < MLDSA_N; start += 2 * len) + { + unsigned j; + int32_t zeta = -zetas[k--]; + + for (j = start; j < start + len; j++) + { + int32_t t = r[j]; + r[j] = t + r[j + len]; + r[j + len] = t - r[j + len]; + r[j + len] = mld_fqmul(r[j + len], zeta); + } + } +} + +/************************************************* + * Name: invntt_tomont + * + * Description: Inverse NTT and multiplication by Montgomery factor mont^2 /256. + * In-place. No modular reductions after additions or subtractions; + * Input coefficients need to be smaller than MLDSA_Q + * in absolute value. + * Output coefficient are smaller than MLD_INTT_BOUND + * in absolute value. + * + * Arguments: - int32_t a[MLDSA_N]: input/output coefficient array + **************************************************/ +static void invntt_tomont(int32_t a[MLDSA_N]) + +{ + unsigned int layer, j; + + for (layer = 8; layer >= 1; layer--) + { + mld_invntt_layer(a, layer); + } + + /* Coefficient bounds are now at 256Q. We now scale by mont / 256, + * i.e., compute the Montgomery multiplication by mont^2 / 256. + * mont corrects the mont^-1 factor introduced in the basemul. + * 1/256 performs that scaling of the inverse NTT. + * The reduced value is bounded by MLD_INTT_BOUND (4211139) in absolute + * value.*/ + for (j = 0; j < MLDSA_N; ++j) + { + a[j] = mld_fqscale(a[j]); + } +} + +typedef struct +{ + int32_t coeffs[MLDSA_N]; +} MLD_ALIGN poly; + +typedef struct +{ + poly vec[MLDSA_L]; +} polyvecl; + +typedef struct +{ + poly vec[MLDSA_K]; +} polyveck; + +static void poly_reduce(poly *a) +{ + unsigned int i; + + for (i = 0; i < MLDSA_N; ++i) + { + a->coeffs[i] = reduce32(a->coeffs[i]); + } +} + +static void poly_caddq(poly *a) +{ + unsigned int i; + + for (i = 0; i < MLDSA_N; ++i) + { + a->coeffs[i] = caddq(a->coeffs[i]); + } +} +/* Reference: We use destructive version (output=first input) to avoid + * reasoning about aliasing in the CBMC specification */ +static void poly_add(poly *r, const poly *b) +{ + unsigned int i; + for (i = 0; i < MLDSA_N; ++i) + { + r->coeffs[i] = r->coeffs[i] + b->coeffs[i]; + } +} + +/* Reference: We use destructive version (output=first input) to avoid + * reasoning about aliasing in the CBMC specification */ +static void poly_sub(poly *r, const poly *b) +{ + unsigned int i; + + for (i = 0; i < MLDSA_N; ++i) + { + r->coeffs[i] = r->coeffs[i] - b->coeffs[i]; + } +} + +static void poly_shiftl(poly *a) +{ + unsigned int i; + + for (i = 0; i < MLDSA_N; i++) + { + /* Reference: uses a left shift by MLDSA_D which is undefined behaviour in + * C90/C99 + */ + a->coeffs[i] *= (1 << MLDSA_D); + } +} + +static void poly_ntt(poly *a) +{ + ntt(a->coeffs); +} + +static void poly_invntt_tomont(poly *a) +{ + invntt_tomont(a->coeffs); +} + +static void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) +{ + unsigned int i; + + for (i = 0; i < MLDSA_N; ++i) + { + c->coeffs[i] = montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]); + } +} + +static void poly_power2round(poly *a1, poly *a0, const poly *a) +{ + unsigned int i; + + for (i = 0; i < MLDSA_N; ++i) + { + power2round(&a0->coeffs[i], &a1->coeffs[i], a->coeffs[i]); + } +} + +static void poly_decompose(poly *a1, poly *a0, const poly *a) +{ + unsigned int i; + + for (i = 0; i < MLDSA_N; ++i) + { + decompose(&a0->coeffs[i], &a1->coeffs[i], a->coeffs[i]); + } +} + +static unsigned int poly_make_hint(poly *h, const poly *a0, const poly *a1) +{ + unsigned int i, s = 0; + + for (i = 0; i < MLDSA_N; ++i) + { + const unsigned int hint_bit = make_hint(a0->coeffs[i], a1->coeffs[i]); + h->coeffs[i] = hint_bit; + s += hint_bit; + } + + return s; +} + +static void poly_use_hint(poly *b, const poly *a, const poly *h) +{ + unsigned int i; + + for (i = 0; i < MLDSA_N; ++i) + { + b->coeffs[i] = use_hint(a->coeffs[i], h->coeffs[i]); + } +} + +/* Reference: explicitly checks the bound B to be <= (MLDSA_Q - 1) / 8). + * This is unnecessary as it's always a compile-time constant. + * We instead model it as a precondition. + */ +static int poly_chknorm(const poly *a, int32_t B) +{ + unsigned int i; + int rc = 0; + int32_t t; + + /* It is ok to leak which coefficient violates the bound since + the probability for each coefficient is independent of secret + data but we must not leak the sign of the centralized representative. */ + + for (i = 0; i < MLDSA_N; ++i) + { + /* Absolute value */ + t = a->coeffs[i] >> 31; + t = a->coeffs[i] - (t & 2 * a->coeffs[i]); + + if (t >= B) + { + rc = 1; + } + } + + return rc; +} + +/************************************************* + * Name: rej_uniform + * + * Description: Sample uniformly random coefficients in [0, MLDSA_Q-1] by + * performing rejection sampling on array of random bytes. + * + * Arguments: - int32_t *a: pointer to output array (allocated) + * - unsigned int target: requested number of coefficients to + *sample + * - unsigned int offset: number of coefficients already sampled + * - const uint8_t *buf: array of random bytes to sample from + * - unsigned int buflen: length of array of random bytes (must be + * multiple of 3) + * + * Returns number of sampled coefficients. Can be smaller than len if not enough + * random bytes were given. + **************************************************/ + +/* Reference: `rej_uniform()` in the reference implementation [@REF]. + * - Our signature differs from the reference implementation + * in that it adds the offset and always expects the base of the + * target buffer. This avoids shifting the buffer base in the + * caller, which appears tricky to reason about. */ +#define POLY_UNIFORM_NBLOCKS \ + ((768 + STREAM128_BLOCKBYTES - 1) / STREAM128_BLOCKBYTES) +static unsigned int rej_uniform(int32_t *a, unsigned int target, + unsigned int offset, const uint8_t *buf, + unsigned int buflen) +{ + unsigned int ctr, pos; + uint32_t t; + + ctr = offset; + pos = 0; + /* pos + 3 cannot overflow due to the assumption + buflen <= (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES) */ + while (ctr < target && pos + 3 <= buflen) + { + t = buf[pos++]; + t |= (uint32_t)buf[pos++] << 8; + t |= (uint32_t)buf[pos++] << 16; + t &= 0x7FFFFF; + + if (t < MLDSA_Q) + { + a[ctr++] = t; + } + } + + return ctr; +} + +/* Reference: poly_uniform() in the reference implementation [@REF]. + * - Simplified from reference by removing buffer tail handling + * since buflen % 3 = 0 always holds true (STREAM128_BLOCKBYTES = + * 168). + * - Modified rej_uniform interface to track offset directly. + * - Pass nonce packed in the extended seed array instead of a third + * argument. + * */ +static void poly_uniform(poly *a, const uint8_t seed[MLDSA_SEEDBYTES + 2]) +{ + unsigned int ctr; + unsigned int buflen = POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES; + MLD_ALIGN uint8_t buf[POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES]; + mld_xof128_ctx state; + + mld_xof128_init(&state); + mld_xof128_absorb(&state, seed, MLDSA_SEEDBYTES + 2); + mld_xof128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state); + + ctr = rej_uniform(a->coeffs, MLDSA_N, 0, buf, buflen); + buflen = STREAM128_BLOCKBYTES; + while (ctr < MLDSA_N) + { + mld_xof128_squeezeblocks(buf, 1, &state); + ctr = rej_uniform(a->coeffs, MLDSA_N, ctr, buf, buflen); + } + mld_xof128_release(&state); +} + +static void +poly_uniform_4x(poly *vec0, poly *vec1, poly *vec2, poly *vec3, + uint8_t seed[4][MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)]) +{ + /* Temporary buffers for XOF output before rejection sampling */ + MLD_ALIGN uint8_t + buf[4][MLD_ALIGN_UP(POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES)]; + + /* Tracks the number of coefficients we have already sampled */ + unsigned ctr[4]; + mld_xof128_x4_ctx state; + unsigned buflen; + + mld_xof128_x4_init(&state); + mld_xof128_x4_absorb(&state, seed, MLDSA_SEEDBYTES + 2); + + /* + * Initially, squeeze heuristic number of POLY_UNIFORM_NBLOCKS. + * This should generate the matrix entries with high probability. + */ + + mld_xof128_x4_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state); + buflen = POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES; + ctr[0] = rej_uniform(vec0->coeffs, MLDSA_N, 0, buf[0], buflen); + ctr[1] = rej_uniform(vec1->coeffs, MLDSA_N, 0, buf[1], buflen); + ctr[2] = rej_uniform(vec2->coeffs, MLDSA_N, 0, buf[2], buflen); + ctr[3] = rej_uniform(vec3->coeffs, MLDSA_N, 0, buf[3], buflen); + + /* + * So long as not all matrix entries have been generated, squeeze + * one more block a time until we're done. + */ + buflen = STREAM128_BLOCKBYTES; + while (ctr[0] < MLDSA_N || ctr[1] < MLDSA_N || ctr[2] < MLDSA_N || + ctr[3] < MLDSA_N) + { + mld_xof128_x4_squeezeblocks(buf, 1, &state); + ctr[0] = rej_uniform(vec0->coeffs, MLDSA_N, ctr[0], buf[0], buflen); + ctr[1] = rej_uniform(vec1->coeffs, MLDSA_N, ctr[1], buf[1], buflen); + ctr[2] = rej_uniform(vec2->coeffs, MLDSA_N, ctr[2], buf[2], buflen); + ctr[3] = rej_uniform(vec3->coeffs, MLDSA_N, ctr[3], buf[3], buflen); + } + mld_xof128_x4_release(&state); +} + +/************************************************* + * Name: rej_eta + * + * Description: Sample uniformly random coefficients in [-MLDSA_ETA, MLDSA_ETA] + *by performing rejection sampling on array of random bytes. + * + * Arguments: - int32_t *a: pointer to output array (allocated) + * - unsigned int target: requested number of coefficients to + *sample + * - unsigned int offset: number of coefficients already sampled + * - const uint8_t *buf: array of random bytes to sample from + * - unsigned int buflen: length of array of random bytes + * + * Returns number of sampled coefficients. Can be smaller than target if not + *enough random bytes were given. + **************************************************/ + +/* Reference: `rej_eta()` in the reference implementation [@REF]. + * - Our signature differs from the reference implementation + * in that it adds the offset and always expects the base of the + * target buffer. This avoids shifting the buffer base in the + * caller, which appears tricky to reason about. */ +#define POLY_UNIFORM_ETA_NBLOCKS \ + ((136 + STREAM256_BLOCKBYTES - 1) / STREAM256_BLOCKBYTES) +static unsigned int rej_eta(int32_t *a, unsigned int target, + unsigned int offset, const uint8_t *buf, + unsigned int buflen) +{ + unsigned int ctr, pos; + uint32_t t0, t1; + + ctr = offset; + pos = 0; + while (ctr < target && pos < buflen) + { + t0 = buf[pos] & 0x0F; + t1 = buf[pos++] >> 4; + + if (t0 < 15) + { + t0 = t0 - (205 * t0 >> 10) * 5; + a[ctr++] = 2 - (int32_t)t0; + } + if (t1 < 15 && ctr < target) + { + t1 = t1 - (205 * t1 >> 10) * 5; + a[ctr++] = 2 - (int32_t)t1; + } + } + + return ctr; +} + +static void +poly_uniform_eta_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLDSA_CRHBYTES], uint8_t nonce0, + uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) +{ + /* Temporary buffers for XOF output before rejection sampling */ + MLD_ALIGN uint8_t + buf[4][MLD_ALIGN_UP(POLY_UNIFORM_ETA_NBLOCKS * STREAM256_BLOCKBYTES)]; + + MLD_ALIGN uint8_t extseed[4][MLD_ALIGN_UP(MLDSA_CRHBYTES + 2)]; + + /* Tracks the number of coefficients we have already sampled */ + unsigned ctr[4]; + mld_xof256_x4_ctx state; + unsigned buflen; + + memcpy(extseed[0], seed, MLDSA_CRHBYTES); + memcpy(extseed[1], seed, MLDSA_CRHBYTES); + memcpy(extseed[2], seed, MLDSA_CRHBYTES); + memcpy(extseed[3], seed, MLDSA_CRHBYTES); + extseed[0][MLDSA_CRHBYTES] = nonce0; + extseed[1][MLDSA_CRHBYTES] = nonce1; + extseed[2][MLDSA_CRHBYTES] = nonce2; + extseed[3][MLDSA_CRHBYTES] = nonce3; + extseed[0][MLDSA_CRHBYTES + 1] = 0; + extseed[1][MLDSA_CRHBYTES + 1] = 0; + extseed[2][MLDSA_CRHBYTES + 1] = 0; + extseed[3][MLDSA_CRHBYTES + 1] = 0; + + mld_xof256_x4_init(&state); + mld_xof256_x4_absorb(&state, extseed, MLDSA_CRHBYTES + 2); + + /* + * Initially, squeeze heuristic number of POLY_UNIFORM_ETA_NBLOCKS. + * This should generate the coefficients with high probability. + */ + mld_xof256_x4_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); + buflen = POLY_UNIFORM_ETA_NBLOCKS * STREAM256_BLOCKBYTES; + + ctr[0] = rej_eta(r0->coeffs, MLDSA_N, 0, buf[0], buflen); + ctr[1] = rej_eta(r1->coeffs, MLDSA_N, 0, buf[1], buflen); + ctr[2] = rej_eta(r2->coeffs, MLDSA_N, 0, buf[2], buflen); + ctr[3] = rej_eta(r3->coeffs, MLDSA_N, 0, buf[3], buflen); + + /* + * So long as not all entries have been generated, squeeze + * one more block a time until we're done. + */ + buflen = STREAM256_BLOCKBYTES; + while (ctr[0] < MLDSA_N || ctr[1] < MLDSA_N || ctr[2] < MLDSA_N || + ctr[3] < MLDSA_N) + { + mld_xof256_x4_squeezeblocks(buf, 1, &state); + ctr[0] = rej_eta(r0->coeffs, MLDSA_N, ctr[0], buf[0], buflen); + ctr[1] = rej_eta(r1->coeffs, MLDSA_N, ctr[1], buf[1], buflen); + ctr[2] = rej_eta(r2->coeffs, MLDSA_N, ctr[2], buf[2], buflen); + ctr[3] = rej_eta(r3->coeffs, MLDSA_N, ctr[3], buf[3], buflen); + } + + mld_xof256_x4_release(&state); +} + +static void polyz_unpack(poly *r, const uint8_t *a) +{ + unsigned int i; + + for (i = 0; i < MLDSA_N / 4; ++i) + { + r->coeffs[4 * i + 0] = a[9 * i + 0]; + r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 1] << 8; + r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 2] << 16; + r->coeffs[4 * i + 0] &= 0x3FFFF; + + r->coeffs[4 * i + 1] = a[9 * i + 2] >> 2; + r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 3] << 6; + r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 4] << 14; + r->coeffs[4 * i + 1] &= 0x3FFFF; + + r->coeffs[4 * i + 2] = a[9 * i + 4] >> 4; + r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 5] << 4; + r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 6] << 12; + r->coeffs[4 * i + 2] &= 0x3FFFF; + + r->coeffs[4 * i + 3] = a[9 * i + 6] >> 6; + r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 7] << 2; + r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 8] << 10; + r->coeffs[4 * i + 3] &= 0x3FFFF; + + r->coeffs[4 * i + 0] = MLDSA_GAMMA1 - r->coeffs[4 * i + 0]; + r->coeffs[4 * i + 1] = MLDSA_GAMMA1 - r->coeffs[4 * i + 1]; + r->coeffs[4 * i + 2] = MLDSA_GAMMA1 - r->coeffs[4 * i + 2]; + r->coeffs[4 * i + 3] = MLDSA_GAMMA1 - r->coeffs[4 * i + 3]; + } +} + + +#define POLY_UNIFORM_GAMMA1_NBLOCKS \ + ((MLDSA_POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1) / STREAM256_BLOCKBYTES) + +static void +poly_uniform_gamma1_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLDSA_CRHBYTES], uint16_t nonce0, + uint16_t nonce1, uint16_t nonce2, uint16_t nonce3) +{ + /* Temporary buffers for XOF output before rejection sampling */ + MLD_ALIGN uint8_t + buf[4][MLD_ALIGN_UP(POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES)]; + + MLD_ALIGN uint8_t extseed[4][MLD_ALIGN_UP(MLDSA_CRHBYTES + 2)]; + + /* Tracks the number of coefficients we have already sampled */ + mld_xof256_x4_ctx state; + + memcpy(extseed[0], seed, MLDSA_CRHBYTES); + memcpy(extseed[1], seed, MLDSA_CRHBYTES); + memcpy(extseed[2], seed, MLDSA_CRHBYTES); + memcpy(extseed[3], seed, MLDSA_CRHBYTES); + extseed[0][MLDSA_CRHBYTES] = nonce0 & 0xFF; + extseed[1][MLDSA_CRHBYTES] = nonce1 & 0xFF; + extseed[2][MLDSA_CRHBYTES] = nonce2 & 0xFF; + extseed[3][MLDSA_CRHBYTES] = nonce3 & 0xFF; + extseed[0][MLDSA_CRHBYTES + 1] = nonce0 >> 8; + extseed[1][MLDSA_CRHBYTES + 1] = nonce1 >> 8; + extseed[2][MLDSA_CRHBYTES + 1] = nonce2 >> 8; + extseed[3][MLDSA_CRHBYTES + 1] = nonce3 >> 8; + + mld_xof256_x4_init(&state); + mld_xof256_x4_absorb(&state, extseed, MLDSA_CRHBYTES + 2); + mld_xof256_x4_squeezeblocks(buf, POLY_UNIFORM_GAMMA1_NBLOCKS, &state); + + polyz_unpack(r0, buf[0]); + polyz_unpack(r1, buf[1]); + polyz_unpack(r2, buf[2]); + polyz_unpack(r3, buf[3]); + mld_xof256_x4_release(&state); +} + + +static void poly_challenge(poly *c, const uint8_t seed[MLDSA_CTILDEBYTES]) +{ + unsigned int i, j, pos; + uint64_t signs; + uint64_t offset; + MLD_ALIGN uint8_t buf[SHAKE256_RATE]; + keccak_state state; + + shake256_init(&state); + shake256_absorb(&state, seed, MLDSA_CTILDEBYTES); + shake256_finalize(&state); + shake256_squeezeblocks(buf, 1, &state); + + /* Convert the first 8 bytes of buf[] into an unsigned 64-bit value. */ + /* Each bit of that dictates the sign of the resulting challenge value */ + signs = 0; + for (i = 0; i < 8; ++i) + { + signs |= (uint64_t)buf[i] << 8 * i; + } + pos = 8; + + memset(c, 0, sizeof(poly)); + + for (i = MLDSA_N - MLDSA_TAU; i < MLDSA_N; ++i) + { + do + { + if (pos >= SHAKE256_RATE) + { + shake256_squeezeblocks(buf, 1, &state); + pos = 0; + } + j = buf[pos++]; + } while (j > i); + + c->coeffs[i] = c->coeffs[j]; + + /* Reference: Compute coefficent value here in two steps to */ + /* mixinf unsigned and signed arithmetic with implicit */ + /* conversions, and so that CBMC can keep track of ranges */ + /* to complete type-safety proof here. */ + + /* The least-significant bit of signs tells us if we want -1 or +1 */ + offset = 2 * (signs & 1); + + /* offset has value 0 or 2 here, so (1 - (int32_t) offset) has + * value -1 or +1 */ + c->coeffs[j] = 1 - (int32_t)offset; + + /* Move to the next bit of signs for next time */ + signs >>= 1; + } +} + +static void polyeta_pack(uint8_t *r, const poly *a) +{ + unsigned int i; + uint8_t t[8]; + + for (i = 0; i < MLDSA_N / 8; ++i) + { + t[0] = MLDSA_ETA - a->coeffs[8 * i + 0]; + t[1] = MLDSA_ETA - a->coeffs[8 * i + 1]; + t[2] = MLDSA_ETA - a->coeffs[8 * i + 2]; + t[3] = MLDSA_ETA - a->coeffs[8 * i + 3]; + t[4] = MLDSA_ETA - a->coeffs[8 * i + 4]; + t[5] = MLDSA_ETA - a->coeffs[8 * i + 5]; + t[6] = MLDSA_ETA - a->coeffs[8 * i + 6]; + t[7] = MLDSA_ETA - a->coeffs[8 * i + 7]; + + r[3 * i + 0] = ((t[0] >> 0) | (t[1] << 3) | (t[2] << 6)) & 0xFF; + r[3 * i + 1] = + ((t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7)) & 0xFF; + r[3 * i + 2] = ((t[5] >> 1) | (t[6] << 2) | (t[7] << 5)) & 0xFF; + } +} + +static void polyeta_unpack(poly *r, const uint8_t *a) +{ + unsigned int i; + + for (i = 0; i < MLDSA_N / 8; ++i) + { + r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; + r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7; + r->coeffs[8 * i + 2] = ((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 7; + r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 7; + r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 7; + r->coeffs[8 * i + 5] = ((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 7; + r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 7; + r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5) & 7; + + r->coeffs[8 * i + 0] = MLDSA_ETA - r->coeffs[8 * i + 0]; + r->coeffs[8 * i + 1] = MLDSA_ETA - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = MLDSA_ETA - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = MLDSA_ETA - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = MLDSA_ETA - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = MLDSA_ETA - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = MLDSA_ETA - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = MLDSA_ETA - r->coeffs[8 * i + 7]; + } +} + +static void polyt1_pack(uint8_t *r, const poly *a) +{ + unsigned int i; + + for (i = 0; i < MLDSA_N / 4; ++i) + { + r[5 * i + 0] = (a->coeffs[4 * i + 0] >> 0) & 0xFF; + r[5 * i + 1] = + ((a->coeffs[4 * i + 0] >> 8) | (a->coeffs[4 * i + 1] << 2)) & 0xFF; + r[5 * i + 2] = + ((a->coeffs[4 * i + 1] >> 6) | (a->coeffs[4 * i + 2] << 4)) & 0xFF; + r[5 * i + 3] = + ((a->coeffs[4 * i + 2] >> 4) | (a->coeffs[4 * i + 3] << 6)) & 0xFF; + r[5 * i + 4] = (a->coeffs[4 * i + 3] >> 2) & 0xFF; + } +} + +static void polyt1_unpack(poly *r, const uint8_t *a) +{ + unsigned int i; + + for (i = 0; i < MLDSA_N / 4; ++i) + { + r->coeffs[4 * i + 0] = + ((a[5 * i + 0] >> 0) | ((uint32_t)a[5 * i + 1] << 8)) & 0x3FF; + r->coeffs[4 * i + 1] = + ((a[5 * i + 1] >> 2) | ((uint32_t)a[5 * i + 2] << 6)) & 0x3FF; + r->coeffs[4 * i + 2] = + ((a[5 * i + 2] >> 4) | ((uint32_t)a[5 * i + 3] << 4)) & 0x3FF; + r->coeffs[4 * i + 3] = + ((a[5 * i + 3] >> 6) | ((uint32_t)a[5 * i + 4] << 2)) & 0x3FF; + } +} + +static void polyt0_pack(uint8_t *r, const poly *a) +{ + unsigned int i; + uint32_t t[8]; + + for (i = 0; i < MLDSA_N / 8; ++i) + { + t[0] = (1 << (MLDSA_D - 1)) - a->coeffs[8 * i + 0]; + t[1] = (1 << (MLDSA_D - 1)) - a->coeffs[8 * i + 1]; + t[2] = (1 << (MLDSA_D - 1)) - a->coeffs[8 * i + 2]; + t[3] = (1 << (MLDSA_D - 1)) - a->coeffs[8 * i + 3]; + t[4] = (1 << (MLDSA_D - 1)) - a->coeffs[8 * i + 4]; + t[5] = (1 << (MLDSA_D - 1)) - a->coeffs[8 * i + 5]; + t[6] = (1 << (MLDSA_D - 1)) - a->coeffs[8 * i + 6]; + t[7] = (1 << (MLDSA_D - 1)) - a->coeffs[8 * i + 7]; + + r[13 * i + 0] = (t[0]) & 0xFF; + r[13 * i + 1] = (t[0] >> 8) & 0xFF; + r[13 * i + 1] |= (t[1] << 5) & 0xFF; + r[13 * i + 2] = (t[1] >> 3) & 0xFF; + r[13 * i + 3] = (t[1] >> 11) & 0xFF; + r[13 * i + 3] |= (t[2] << 2) & 0xFF; + r[13 * i + 4] = (t[2] >> 6) & 0xFF; + r[13 * i + 4] |= (t[3] << 7) & 0xFF; + r[13 * i + 5] = (t[3] >> 1) & 0xFF; + r[13 * i + 6] = (t[3] >> 9) & 0xFF; + r[13 * i + 6] |= (t[4] << 4) & 0xFF; + r[13 * i + 7] = (t[4] >> 4) & 0xFF; + r[13 * i + 8] = (t[4] >> 12) & 0xFF; + r[13 * i + 8] |= (t[5] << 1) & 0xFF; + r[13 * i + 9] = (t[5] >> 7) & 0xFF; + r[13 * i + 9] |= (t[6] << 6) & 0xFF; + r[13 * i + 10] = (t[6] >> 2) & 0xFF; + r[13 * i + 11] = (t[6] >> 10) & 0xFF; + r[13 * i + 11] |= (t[7] << 3) & 0xFF; + r[13 * i + 12] = (t[7] >> 5) & 0xFF; + } +} + +static void polyt0_unpack(poly *r, const uint8_t *a) +{ + unsigned int i; + + for (i = 0; i < MLDSA_N / 8; ++i) + { + r->coeffs[8 * i + 0] = a[13 * i + 0]; + r->coeffs[8 * i + 0] |= (uint32_t)a[13 * i + 1] << 8; + r->coeffs[8 * i + 0] &= 0x1FFF; + + r->coeffs[8 * i + 1] = a[13 * i + 1] >> 5; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 2] << 3; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 3] << 11; + r->coeffs[8 * i + 1] &= 0x1FFF; + + r->coeffs[8 * i + 2] = a[13 * i + 3] >> 2; + r->coeffs[8 * i + 2] |= (uint32_t)a[13 * i + 4] << 6; + r->coeffs[8 * i + 2] &= 0x1FFF; + + r->coeffs[8 * i + 3] = a[13 * i + 4] >> 7; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 5] << 1; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 6] << 9; + r->coeffs[8 * i + 3] &= 0x1FFF; + + r->coeffs[8 * i + 4] = a[13 * i + 6] >> 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 7] << 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 8] << 12; + r->coeffs[8 * i + 4] &= 0x1FFF; + + r->coeffs[8 * i + 5] = a[13 * i + 8] >> 1; + r->coeffs[8 * i + 5] |= (uint32_t)a[13 * i + 9] << 7; + r->coeffs[8 * i + 5] &= 0x1FFF; + + r->coeffs[8 * i + 6] = a[13 * i + 9] >> 6; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 10] << 2; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 11] << 10; + r->coeffs[8 * i + 6] &= 0x1FFF; + + r->coeffs[8 * i + 7] = a[13 * i + 11] >> 3; + r->coeffs[8 * i + 7] |= (uint32_t)a[13 * i + 12] << 5; + r->coeffs[8 * i + 7] &= 0x1FFF; + + r->coeffs[8 * i + 0] = (1 << (MLDSA_D - 1)) - r->coeffs[8 * i + 0]; + r->coeffs[8 * i + 1] = (1 << (MLDSA_D - 1)) - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = (1 << (MLDSA_D - 1)) - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = (1 << (MLDSA_D - 1)) - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = (1 << (MLDSA_D - 1)) - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = (1 << (MLDSA_D - 1)) - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = (1 << (MLDSA_D - 1)) - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = (1 << (MLDSA_D - 1)) - r->coeffs[8 * i + 7]; + } +} + +static void polyz_pack(uint8_t *r, const poly *a) +{ + unsigned int i; + uint32_t t[4]; + + for (i = 0; i < MLDSA_N / 4; ++i) + { + t[0] = MLDSA_GAMMA1 - a->coeffs[4 * i + 0]; + t[1] = MLDSA_GAMMA1 - a->coeffs[4 * i + 1]; + t[2] = MLDSA_GAMMA1 - a->coeffs[4 * i + 2]; + t[3] = MLDSA_GAMMA1 - a->coeffs[4 * i + 3]; + + r[9 * i + 0] = (t[0]) & 0xFF; + r[9 * i + 1] = (t[0] >> 8) & 0xFF; + r[9 * i + 2] = (t[0] >> 16) & 0xFF; + r[9 * i + 2] |= (t[1] << 2) & 0xFF; + r[9 * i + 3] = (t[1] >> 6) & 0xFF; + r[9 * i + 4] = (t[1] >> 14) & 0xFF; + r[9 * i + 4] |= (t[2] << 4) & 0xFF; + r[9 * i + 5] = (t[2] >> 4) & 0xFF; + r[9 * i + 6] = (t[2] >> 12) & 0xFF; + r[9 * i + 6] |= (t[3] << 6) & 0xFF; + r[9 * i + 7] = (t[3] >> 2) & 0xFF; + r[9 * i + 8] = (t[3] >> 10) & 0xFF; + } +} + +static void polyw1_pack(uint8_t *r, const poly *a) +{ + unsigned int i; + + for (i = 0; i < MLDSA_N / 4; ++i) + { + r[3 * i + 0] = (a->coeffs[4 * i + 0]) & 0xFF; + r[3 * i + 0] |= (a->coeffs[4 * i + 1] << 6) & 0xFF; + r[3 * i + 1] = (a->coeffs[4 * i + 1] >> 2) & 0xFF; + r[3 * i + 1] |= (a->coeffs[4 * i + 2] << 4) & 0xFF; + r[3 * i + 2] = (a->coeffs[4 * i + 2] >> 4) & 0xFF; + r[3 * i + 2] |= (a->coeffs[4 * i + 3] << 2) & 0xFF; + } +} + +static void +polyvec_matrix_expand(polyvecl mat[MLDSA_K], + const uint8_t rho[MLDSA_SEEDBYTES]) +{ + unsigned int i, j; + /* + * We generate four separate seed arrays rather than a single one to work + * around limitations in CBMC function contracts dealing with disjoint slices + * of the same parent object. + */ + + MLD_ALIGN uint8_t seed_ext[4][MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)]; + + for (j = 0; j < 4; j++) + { + memcpy(seed_ext[j], rho, MLDSA_SEEDBYTES); + } + /* Sample 4 matrix entries a time. */ + for (i = 0; i < (MLDSA_K * MLDSA_L / 4) * 4; i += 4) + { + for (j = 0; j < 4; j++) + { + uint8_t x = (i + j) / MLDSA_L; + uint8_t y = (i + j) % MLDSA_L; + + seed_ext[j][MLDSA_SEEDBYTES + 0] = y; + seed_ext[j][MLDSA_SEEDBYTES + 1] = x; + } + + poly_uniform_4x(&mat[i / MLDSA_L].vec[i % MLDSA_L], + &mat[(i + 1) / MLDSA_L].vec[(i + 1) % MLDSA_L], + &mat[(i + 2) / MLDSA_L].vec[(i + 2) % MLDSA_L], + &mat[(i + 3) / MLDSA_L].vec[(i + 3) % MLDSA_L], seed_ext); + } + + /* For MLDSA_K=6, MLDSA_L=5, process the last two entries individually */ + while (i < MLDSA_K * MLDSA_L) + { + uint8_t x = i / MLDSA_L; + uint8_t y = i % MLDSA_L; + poly *this_poly = &mat[i / MLDSA_L].vec[i % MLDSA_L]; + + seed_ext[0][MLDSA_SEEDBYTES + 0] = y; + seed_ext[0][MLDSA_SEEDBYTES + 1] = x; + + poly_uniform(this_poly, seed_ext[0]); + i++; + } +} + +/**************************************************************/ +/************ Vectors of polynomials of length MLDSA_L **************/ +/**************************************************************/ +static void +polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[MLDSA_CRHBYTES], + uint16_t nonce) +{ + nonce = MLDSA_L * nonce; + poly_uniform_gamma1_4x(&v->vec[0], &v->vec[1], &v->vec[2], &v->vec[3], seed, + nonce, nonce + 1, nonce + 2, nonce + 3); +} + +static void polyvecl_reduce(polyvecl *v) +{ + unsigned int i; + + for (i = 0; i < MLDSA_L; ++i) + { + poly_reduce(&v->vec[i]); + } +} + +/* Reference: We use destructive version (output=first input) to avoid + * reasoning about aliasing in the CBMC specification */ +static void polyvecl_add(polyvecl *u, const polyvecl *v) +{ + unsigned int i; + + for (i = 0; i < MLDSA_L; ++i) + { + poly_add(&u->vec[i], &v->vec[i]); + } +} + +static void polyvecl_ntt(polyvecl *v) +{ + unsigned int i; + + for (i = 0; i < MLDSA_L; ++i) + { + poly_ntt(&v->vec[i]); + } +} + +static void polyvecl_invntt_tomont(polyvecl *v) +{ + unsigned int i; + + for (i = 0; i < MLDSA_L; ++i) + { + poly_invntt_tomont(&v->vec[i]); + } +} + +static void +polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, + const polyvecl *v) +{ + unsigned int i; + + for (i = 0; i < MLDSA_L; ++i) + { + poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + +static void +polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, + const polyvecl *v) +{ + unsigned int i, j; + /* The first input is bounded by [0, Q-1] inclusive + * The second input is bounded by [-9Q+1, 9Q-1] inclusive . Hence, we can + * safely accumulate in 64-bits without intermediate reductions as + * MLDSA_L * (MLD_NTT_BOUND-1) * (Q-1) < INT64_MAX + * + * The worst case is ML-DSA-87: 7 * (9Q-1) * (Q-1) < 2**52 + * (and likewise for negative values) + */ + + for (i = 0; i < MLDSA_N; i++) + { + int64_t t = 0; + int32_t r; + for (j = 0; j < MLDSA_L; j++) + { + t += (int64_t)u->vec[j].coeffs[i] * v->vec[j].coeffs[i]; + } + + r = montgomery_reduce(t); + + w->coeffs[i] = r; + } +} + + +static int polyvecl_chknorm(const polyvecl *v, int32_t bound) +{ + unsigned int i; + + for (i = 0; i < MLDSA_L; ++i) + { + if (poly_chknorm(&v->vec[i], bound)) + { + return 1; + } + } + + return 0; +} + +/**************************************************************/ +/************ Vectors of polynomials of length MLDSA_K **************/ +/**************************************************************/ + +static void polyveck_reduce(polyveck *v) +{ + unsigned int i; + + for (i = 0; i < MLDSA_K; ++i) + { + poly_reduce(&v->vec[i]); + } +} + +static void polyveck_caddq(polyveck *v) +{ + unsigned int i; + + for (i = 0; i < MLDSA_K; ++i) + { + poly_caddq(&v->vec[i]); + } +} + +/* Reference: We use destructive version (output=first input) to avoid + * reasoning about aliasing in the CBMC specification */ +static void polyveck_add(polyveck *u, const polyveck *v) +{ + unsigned int i; + + for (i = 0; i < MLDSA_K; ++i) + { + poly_add(&u->vec[i], &v->vec[i]); + } +} + +static void polyveck_sub(polyveck *u, const polyveck *v) +{ + unsigned int i; + + for (i = 0; i < MLDSA_K; ++i) + { + poly_sub(&u->vec[i], &v->vec[i]); + } +} + +static void polyveck_shiftl(polyveck *v) +{ + unsigned int i; + + for (i = 0; i < MLDSA_K; ++i) + { + poly_shiftl(&v->vec[i]); + } +} + +static void polyveck_ntt(polyveck *v) +{ + unsigned int i; + + for (i = 0; i < MLDSA_K; ++i) + { + poly_ntt(&v->vec[i]); + } +} + +static void polyveck_invntt_tomont(polyveck *v) +{ + unsigned int i; + + for (i = 0; i < MLDSA_K; ++i) + { + poly_invntt_tomont(&v->vec[i]); + } +} + +static void +polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, + const polyveck *v) +{ + unsigned int i; + + for (i = 0; i < MLDSA_K; ++i) + { + poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + + +static int polyveck_chknorm(const polyveck *v, int32_t bound) +{ + unsigned int i; + + for (i = 0; i < MLDSA_K; ++i) + { + if (poly_chknorm(&v->vec[i], bound)) + { + return 1; + } + } + + return 0; +} + +static void polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) +{ + unsigned int i; + + for (i = 0; i < MLDSA_K; ++i) + { + poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); + } +} + +static void polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) +{ + unsigned int i; + + for (i = 0; i < MLDSA_K; ++i) + { + poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); + } +} + +static unsigned int +polyveck_make_hint(polyveck *h, const polyveck *v0, + const polyveck *v1) +{ + unsigned int i, s = 0; + + for (i = 0; i < MLDSA_K; ++i) + { + s += poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); + } + + return s; +} + +static void polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) +{ + unsigned int i; + + for (i = 0; i < MLDSA_K; ++i) + { + poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); + } +} + +static void +polyveck_pack_w1(uint8_t r[MLDSA_K * MLDSA_POLYW1_PACKEDBYTES], + const polyveck *w1) +{ + unsigned int i; + + for (i = 0; i < MLDSA_K; ++i) + { + polyw1_pack(&r[i * MLDSA_POLYW1_PACKEDBYTES], &w1->vec[i]); + } +} + +static void +polyveck_pack_eta(uint8_t r[MLDSA_K * MLDSA_POLYETA_PACKEDBYTES], + const polyveck *p) +{ + unsigned int i; + for (i = 0; i < MLDSA_K; ++i) + { + polyeta_pack(r + i * MLDSA_POLYETA_PACKEDBYTES, &p->vec[i]); + } +} + +static void +polyvecl_pack_eta(uint8_t r[MLDSA_L * MLDSA_POLYETA_PACKEDBYTES], + const polyvecl *p) +{ + unsigned int i; + for (i = 0; i < MLDSA_L; ++i) + { + polyeta_pack(r + i * MLDSA_POLYETA_PACKEDBYTES, &p->vec[i]); + } +} + +static void +polyvecl_pack_z(uint8_t r[MLDSA_L * MLDSA_POLYZ_PACKEDBYTES], + const polyvecl *p) +{ + unsigned int i; + for (i = 0; i < MLDSA_L; ++i) + { + polyz_pack(r + i * MLDSA_POLYZ_PACKEDBYTES, &p->vec[i]); + } +} + + +static void +polyveck_pack_t0(uint8_t r[MLDSA_K * MLDSA_POLYT0_PACKEDBYTES], + const polyveck *p) +{ + unsigned int i; + for (i = 0; i < MLDSA_K; ++i) + { + polyt0_pack(r + i * MLDSA_POLYT0_PACKEDBYTES, &p->vec[i]); + } +} + +static void +polyvecl_unpack_eta(polyvecl *p, + const uint8_t r[MLDSA_L * MLDSA_POLYETA_PACKEDBYTES]) +{ + unsigned int i; + for (i = 0; i < MLDSA_L; ++i) + { + polyeta_unpack(&p->vec[i], r + i * MLDSA_POLYETA_PACKEDBYTES); + } +} + +static void +polyvecl_unpack_z(polyvecl *z, + const uint8_t r[MLDSA_L * MLDSA_POLYZ_PACKEDBYTES]) +{ + unsigned int i; + for (i = 0; i < MLDSA_L; ++i) + { + polyz_unpack(&z->vec[i], r + i * MLDSA_POLYZ_PACKEDBYTES); + } +} + +static void +polyveck_unpack_eta(polyveck *p, + const uint8_t r[MLDSA_K * MLDSA_POLYETA_PACKEDBYTES]) +{ + unsigned int i; + for (i = 0; i < MLDSA_K; ++i) + { + polyeta_unpack(&p->vec[i], r + i * MLDSA_POLYETA_PACKEDBYTES); + } +} + +static void +polyveck_unpack_t0(polyveck *p, + const uint8_t r[MLDSA_K * MLDSA_POLYT0_PACKEDBYTES]) +{ + unsigned int i; + for (i = 0; i < MLDSA_K; ++i) + { + polyt0_unpack(&p->vec[i], r + i * MLDSA_POLYT0_PACKEDBYTES); + } +} + +static void +polyvec_matrix_pointwise_montgomery(polyveck *t, + const polyvecl mat[MLDSA_K], + const polyvecl *v) +{ + unsigned int i; + + for (i = 0; i < MLDSA_K; ++i) + { + polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v); + } +} + +static void mld_sample_s1_s2(polyvecl *s1, polyveck *s2, + const uint8_t seed[MLDSA_CRHBYTES]) +{ +/* Sample short vectors s1 and s2 */ + poly_uniform_eta_4x(&s1->vec[0], &s1->vec[1], &s1->vec[2], &s1->vec[3], seed, + 0, 1, 2, 3); + poly_uniform_eta_4x(&s2->vec[0], &s2->vec[1], &s2->vec[2], &s2->vec[3], seed, + 4, 5, 6, 7); +} + +static void +pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], + const uint8_t rho[MLDSA_SEEDBYTES], const polyveck *t1) +{ + unsigned int i; + + memcpy(pk, rho, MLDSA_SEEDBYTES); + pk += MLDSA_SEEDBYTES; + + for (i = 0; i < MLDSA_K; ++i) + { + polyt1_pack(pk + i * MLDSA_POLYT1_PACKEDBYTES, &t1->vec[i]); + } +} + +static void +unpack_pk(uint8_t rho[MLDSA_SEEDBYTES], polyveck *t1, + const uint8_t pk[CRYPTO_PUBLICKEYBYTES]) +{ + unsigned int i; + + memcpy(rho, pk, MLDSA_SEEDBYTES); + pk += MLDSA_SEEDBYTES; + + for (i = 0; i < MLDSA_K; ++i) + { + polyt1_unpack(&t1->vec[i], pk + i * MLDSA_POLYT1_PACKEDBYTES); + } +} + +static void +pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], + const uint8_t rho[MLDSA_SEEDBYTES], + const uint8_t tr[MLDSA_TRBYTES], + const uint8_t key[MLDSA_SEEDBYTES], const polyveck *t0, + const polyvecl *s1, const polyveck *s2) +{ + memcpy(sk, rho, MLDSA_SEEDBYTES); + sk += MLDSA_SEEDBYTES; + + memcpy(sk, key, MLDSA_SEEDBYTES); + sk += MLDSA_SEEDBYTES; + + memcpy(sk, tr, MLDSA_TRBYTES); + sk += MLDSA_TRBYTES; + + polyvecl_pack_eta(sk, s1); + sk += MLDSA_L * MLDSA_POLYETA_PACKEDBYTES; + + polyveck_pack_eta(sk, s2); + sk += MLDSA_K * MLDSA_POLYETA_PACKEDBYTES; + + polyveck_pack_t0(sk, t0); +} + +static void +unpack_sk(uint8_t rho[MLDSA_SEEDBYTES], uint8_t tr[MLDSA_TRBYTES], + uint8_t key[MLDSA_SEEDBYTES], polyveck *t0, polyvecl *s1, + polyveck *s2, const uint8_t sk[CRYPTO_SECRETKEYBYTES]) +{ + memcpy(rho, sk, MLDSA_SEEDBYTES); + sk += MLDSA_SEEDBYTES; + + memcpy(key, sk, MLDSA_SEEDBYTES); + sk += MLDSA_SEEDBYTES; + + memcpy(tr, sk, MLDSA_TRBYTES); + sk += MLDSA_TRBYTES; + + polyvecl_unpack_eta(s1, sk); + sk += MLDSA_L * MLDSA_POLYETA_PACKEDBYTES; + + polyveck_unpack_eta(s2, sk); + sk += MLDSA_K * MLDSA_POLYETA_PACKEDBYTES; + + polyveck_unpack_t0(t0, sk); +} + +static void +pack_sig(uint8_t sig[CRYPTO_BYTES], const uint8_t c[MLDSA_CTILDEBYTES], + const polyvecl *z, const polyveck *h, + const unsigned int number_of_hints) +{ + unsigned int i, j, k; + + memcpy(sig, c, MLDSA_CTILDEBYTES); + sig += MLDSA_CTILDEBYTES; + + polyvecl_pack_z(sig, z); + sig += MLDSA_L * MLDSA_POLYZ_PACKEDBYTES; + + /* Encode hints h */ + + /* The final section of sig[] is MLDSA_POLYVECH_PACKEDBYTES long, where + * MLDSA_POLYVECH_PACKEDBYTES = MLDSA_OMEGA + MLDSA_K + * + * The first OMEGA bytes record the index numbers of the coefficients + * that are not equal to 0 + * + * The final K bytes record a running tally of the number of hints + * coming from each of the K polynomials in h. + * + * The pre-condition tells us that number_of_hints <= OMEGA, so some + * bytes may not be written, so we initialize all of them to zero + * to start. + */ + memset(sig, 0, MLDSA_POLYVECH_PACKEDBYTES); + + k = 0; + /* For each polynomial in h... */ + for (i = 0; i < MLDSA_K; ++i) + { + /* For each coefficient in that polynomial, record it as as hint */ + /* if its value is not zero */ + for (j = 0; j < MLDSA_N; ++j) + { + /* The reference implementation implicitly relies on the total */ + /* number of hints being less than OMEGA, assuming h is valid. */ + /* In mldsa-native, we check this explicitly to ease proof of */ + /* type safety. */ + if (h->vec[i].coeffs[j] != 0 && k < number_of_hints) + { + /* The enclosing if condition AND the loop invariant infer */ + /* that k < MLDSA_OMEGA, so writing to sig[k] is safe and k */ + /* can be incremented. */ + sig[k++] = j; + } + } + /* Having recorded all the hints for this polynomial, also */ + /* record the running tally into the correct "slot" for that */ + /* coefficient in the final K bytes */ + sig[MLDSA_OMEGA + i] = k; + } +} + +/************************************************* + * Name: unpack_hints + * + * Description: Unpack raw hint bytes into a polyveck + * struct + * + * Arguments: - polyveck *h: pointer to output hint vector h + * - const uint8_t packed_hints[MLDSA_POLYVECH_PACKEDBYTES]: + * raw hint bytes + * + * Returns 1 in case of malformed hints; otherwise 0. + **************************************************/ +static int unpack_hints(polyveck *h, + const uint8_t packed_hints[MLDSA_POLYVECH_PACKEDBYTES]) +{ + unsigned int i, j; + unsigned int old_hint_count; + + /* Set all coefficients of all polynomials to 0. */ + /* Only those that are actually non-zero hints will */ + /* be overwritten below. */ + memset(h, 0, sizeof(polyveck)); + + old_hint_count = 0; + for (i = 0; i < MLDSA_K; ++i) + { + /* Grab the hint count for the i'th polynomial */ + const unsigned int new_hint_count = packed_hints[MLDSA_OMEGA + i]; + + /* new_hint_count must increase or stay the same, but also remain */ + /* less than or equal to MLDSA_OMEGA */ + if (new_hint_count < old_hint_count || new_hint_count > MLDSA_OMEGA) + { + /* Error - new_hint_count is invalid */ + return 1; + } + + /* If new_hint_count == old_hint_count, then this polynomial has */ + /* zero hints, so this loop executes zero times and we move */ + /* straight on to the next polynomial. */ + for (j = old_hint_count; j < new_hint_count; ++j) + { + const uint8_t this_hint_index = packed_hints[j]; + + /* Coefficients must be ordered for strong unforgeability */ + if (j > old_hint_count && this_hint_index <= packed_hints[j - 1]) + { + return 1; + } + h->vec[i].coeffs[this_hint_index] = 1; + } + + old_hint_count = new_hint_count; + } + + /* Extra indices must be zero for strong unforgeability */ + for (j = old_hint_count; j < MLDSA_OMEGA; ++j) + { + if (packed_hints[j] != 0) + { + return 1; + } + } + + return 0; +} + +static int +unpack_sig(uint8_t c[MLDSA_CTILDEBYTES], polyvecl *z, polyveck *h, + const uint8_t sig[CRYPTO_BYTES]) +{ + memcpy(c, sig, MLDSA_CTILDEBYTES); + sig += MLDSA_CTILDEBYTES; + + polyvecl_unpack_z(z, sig); + sig += MLDSA_L * MLDSA_POLYZ_PACKEDBYTES; + + return unpack_hints(h, sig); +} + +static int +crypto_sign_keypair_internal(uint8_t *pk, uint8_t *sk, + const uint8_t seed[MLDSA_SEEDBYTES]) +{ + uint8_t seedbuf[2 * MLDSA_SEEDBYTES + MLDSA_CRHBYTES]; + uint8_t inbuf[MLDSA_SEEDBYTES + 2]; + uint8_t tr[MLDSA_TRBYTES]; + const uint8_t *rho, *rhoprime, *key; + polyvecl mat[MLDSA_K]; + polyvecl s1, s1hat; + polyveck s2, t2, t1, t0; + + /* Get randomness for rho, rhoprime and key */ + memcpy(inbuf, seed, MLDSA_SEEDBYTES); + inbuf[MLDSA_SEEDBYTES + 0] = MLDSA_K; + inbuf[MLDSA_SEEDBYTES + 1] = MLDSA_L; + shake256(seedbuf, 2 * MLDSA_SEEDBYTES + MLDSA_CRHBYTES, inbuf, + MLDSA_SEEDBYTES + 2); + rho = seedbuf; + rhoprime = rho + MLDSA_SEEDBYTES; + key = rhoprime + MLDSA_CRHBYTES; + + /* Expand matrix */ + polyvec_matrix_expand(mat, rho); + + mld_sample_s1_s2(&s1, &s2, rhoprime); + + /* Matrix-vector multiplication */ + s1hat = s1; + polyvecl_ntt(&s1hat); + polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat); + polyveck_reduce(&t1); + polyveck_invntt_tomont(&t1); + + /* Add error vector s2 */ + polyveck_add(&t1, &s2); + + /* Extract t1 and write public key */ + polyveck_caddq(&t1); + polyveck_power2round(&t2, &t0, &t1); + pack_pk(pk, rho, &t2); + + /* Compute H(rho, t1) and write secret key */ + shake256(tr, MLDSA_TRBYTES, pk, CRYPTO_PUBLICKEYBYTES); + pack_sk(sk, rho, tr, key, &t0, &s1, &s2); + return 0; +} + +int MLD_44_ref_keypair(uint8_t *pk, uint8_t *sk) +{ + uint8_t seed[MLDSA_SEEDBYTES]; + randombytes(seed, MLDSA_SEEDBYTES); + return crypto_sign_keypair_internal(pk, sk, seed); +} + +/************************************************* + * Name: mld_H + * + * Description: Abstracts application of SHAKE256 to + * one, two or three blocks of data, + * yielding a user-requested size of + * output. + * + * Arguments: - uint8_t *out: pointer to output + * - size_t outlen: requested output length in bytes + * - const uint8_t *in1: pointer to input block 1 + * Must NOT be NULL + * - size_t in1len: length of input in1 bytes + * - const uint8_t *in2: pointer to input block 2 + * May be NULL, in which case + * this block is ignored + * - size_t in2len: length of input in2 bytes + * - const uint8_t *in3: pointer to input block 3 + * May be NULL, in which case + * this block is ignored + * - size_t in3len: length of input in3 bytes + **************************************************/ +static void mld_H(uint8_t *out, size_t outlen, const uint8_t *in1, + size_t in1len, const uint8_t *in2, size_t in2len, + const uint8_t *in3, size_t in3len) +{ + keccak_state state; + shake256_init(&state); + shake256_absorb(&state, in1, in1len); + if (in2 != NULL) + { + shake256_absorb(&state, in2, in2len); + } + if (in3 != NULL) + { + shake256_absorb(&state, in3, in3len); + } + shake256_finalize(&state); + shake256_squeeze(out, outlen, &state); +} + +static int +crypto_sign_signature_internal(uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pre, size_t prelen, + const uint8_t rnd[MLDSA_RNDBYTES], + const uint8_t *sk, int externalmu) +{ + unsigned int n; + uint8_t seedbuf[2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES + 2 * MLDSA_CRHBYTES]; + uint8_t *rho, *tr, *key, *mu, *rhoprime; + uint16_t nonce = 0; + polyvecl mat[MLDSA_K], s1, y, z; + polyveck t0, s2, w1, w0, h; + poly cp; + + rho = seedbuf; + tr = rho + MLDSA_SEEDBYTES; + key = tr + MLDSA_TRBYTES; + mu = key + MLDSA_SEEDBYTES; + rhoprime = mu + MLDSA_CRHBYTES; + unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); + + if (!externalmu) + { + /* Compute mu = CRH(tr, pre, msg) */ + mld_H(mu, MLDSA_CRHBYTES, tr, MLDSA_TRBYTES, pre, prelen, m, mlen); + } + else + { + /* mu has been provided directly */ + memcpy(mu, m, MLDSA_CRHBYTES); + } + + /* Compute rhoprime = CRH(key, rnd, mu) */ + mld_H(rhoprime, MLDSA_CRHBYTES, key, MLDSA_SEEDBYTES, rnd, MLDSA_RNDBYTES, mu, + MLDSA_CRHBYTES); + + /* Expand matrix and transform vectors */ + polyvec_matrix_expand(mat, rho); + polyvecl_ntt(&s1); + polyveck_ntt(&s2); + polyveck_ntt(&t0); + + /* Reference: This code is re-structured using a while(1), */ + /* with explicit "continue" statements (rather than "goto") */ + /* to implement rejection of invalid signatures. */ + /* The loop statement also supplies a syntactic location to */ + /* place loop invariants for CBMC. */ + while (1) + { + /* Sample intermediate vector y */ + polyvecl_uniform_gamma1(&y, rhoprime, nonce++); + + /* Matrix-vector multiplication */ + z = y; + polyvecl_ntt(&z); + polyvec_matrix_pointwise_montgomery(&w1, mat, &z); + polyveck_reduce(&w1); + polyveck_invntt_tomont(&w1); + + /* Decompose w and call the random oracle */ + polyveck_caddq(&w1); + polyveck_decompose(&w1, &w0, &w1); + polyveck_pack_w1(sig, &w1); + + mld_H(sig, MLDSA_CTILDEBYTES, mu, MLDSA_CRHBYTES, sig, + MLDSA_K * MLDSA_POLYW1_PACKEDBYTES, NULL, 0); + poly_challenge(&cp, sig); + poly_ntt(&cp); + + /* Compute z, reject if it reveals secret */ + polyvecl_pointwise_poly_montgomery(&z, &cp, &s1); + polyvecl_invntt_tomont(&z); + polyvecl_add(&z, &y); + polyvecl_reduce(&z); + if (polyvecl_chknorm(&z, MLDSA_GAMMA1 - MLDSA_BETA)) + { + /* reject */ + continue; + } + + /* Check that subtracting cs2 does not change high bits of w and low bits + * do not reveal secret information */ + polyveck_pointwise_poly_montgomery(&h, &cp, &s2); + polyveck_invntt_tomont(&h); + polyveck_sub(&w0, &h); + polyveck_reduce(&w0); + if (polyveck_chknorm(&w0, MLDSA_GAMMA2 - MLDSA_BETA)) + { + /* reject */ + continue; + } + + /* Compute hints for w1 */ + polyveck_pointwise_poly_montgomery(&h, &cp, &t0); + polyveck_invntt_tomont(&h); + polyveck_reduce(&h); + if (polyveck_chknorm(&h, MLDSA_GAMMA2)) + { + /* reject */ + continue; + } + + polyveck_add(&w0, &h); + n = polyveck_make_hint(&h, &w0, &w1); + if (n > MLDSA_OMEGA) + { + /* reject */ + continue; + } + + /* Write signature */ + pack_sig(sig, sig, &z, &h, n); + *siglen = CRYPTO_BYTES; + return 0; + } +} + +int MLD_44_ref_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, + size_t mlen, const uint8_t *ctx, size_t ctxlen, + const uint8_t *sk) +{ + size_t i; + uint8_t pre[257]; + uint8_t rnd[MLDSA_RNDBYTES]; + + if (ctxlen > 255) + { + return -1; + } + + /* Prepare pre = (0, ctxlen, ctx) */ + pre[0] = 0; + pre[1] = ctxlen; + for (i = 0; i < ctxlen; i++) + { + pre[2 + i] = ctx[i]; + } + + randombytes(rnd, MLDSA_RNDBYTES); + + crypto_sign_signature_internal(sig, siglen, m, mlen, pre, 2 + ctxlen, rnd, sk, + 0); + return 0; +} + +int MLD_44_ref(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, + const uint8_t *ctx, size_t ctxlen, const uint8_t *sk) +{ + int ret; + size_t i; + + for (i = 0; i < mlen; ++i) + { + sm[CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + } + ret = MLD_44_ref_signature(sm, smlen, sm + CRYPTO_BYTES, mlen, ctx, ctxlen, + sk); + *smlen += mlen; + return ret; +} + +static int +MLD_44_ref_verify_internal(const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pre, size_t prelen, + const uint8_t *pk, int externalmu) +{ + unsigned int i; + uint8_t buf[MLDSA_K * MLDSA_POLYW1_PACKEDBYTES]; + uint8_t rho[MLDSA_SEEDBYTES]; + uint8_t mu[MLDSA_CRHBYTES]; + uint8_t c[MLDSA_CTILDEBYTES]; + uint8_t c2[MLDSA_CTILDEBYTES]; + poly cp; + polyvecl mat[MLDSA_K], z; + polyveck t1, w1, tmp, h; + + if (siglen != CRYPTO_BYTES) + { + return -1; + } + + unpack_pk(rho, &t1, pk); + if (unpack_sig(c, &z, &h, sig)) + { + return -1; + } + if (polyvecl_chknorm(&z, MLDSA_GAMMA1 - MLDSA_BETA)) + { + return -1; + } + + if (!externalmu) + { + /* Compute CRH(H(rho, t1), pre, msg) */ + uint8_t hpk[MLDSA_CRHBYTES]; + mld_H(hpk, MLDSA_TRBYTES, pk, CRYPTO_PUBLICKEYBYTES, NULL, 0, NULL, 0); + mld_H(mu, MLDSA_CRHBYTES, hpk, MLDSA_TRBYTES, pre, prelen, m, mlen); + } + else + { + /* mu has been provided directly */ + memcpy(mu, m, MLDSA_CRHBYTES); + } + + /* Matrix-vector multiplication; compute Az - c2^dt1 */ + poly_challenge(&cp, c); + polyvec_matrix_expand(mat, rho); + + polyvecl_ntt(&z); + polyvec_matrix_pointwise_montgomery(&w1, mat, &z); + + poly_ntt(&cp); + polyveck_shiftl(&t1); + polyveck_ntt(&t1); + + polyveck_pointwise_poly_montgomery(&tmp, &cp, &t1); + + polyveck_sub(&w1, &tmp); + polyveck_reduce(&w1); + polyveck_invntt_tomont(&w1); + + /* Reconstruct w1 */ + polyveck_caddq(&w1); + polyveck_use_hint(&tmp, &w1, &h); + polyveck_pack_w1(buf, &tmp); + /* Call random oracle and verify challenge */ + mld_H(c2, MLDSA_CTILDEBYTES, mu, MLDSA_CRHBYTES, buf, + MLDSA_K * MLDSA_POLYW1_PACKEDBYTES, NULL, 0); + for (i = 0; i < MLDSA_CTILDEBYTES; ++i) + { + if (c[i] != c2[i]) + { + return -1; + } + } + return 0; +} + +int MLD_44_ref_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, + size_t mlen, const uint8_t *ctx, size_t ctxlen, + const uint8_t *pk) +{ + size_t i; + uint8_t pre[257]; + + if (ctxlen > 255) + { + return -1; + } + + pre[0] = 0; + pre[1] = ctxlen; + for (i = 0; i < ctxlen; i++) + { + pre[2 + i] = ctx[i]; + } + + return MLD_44_ref_verify_internal(sig, siglen, m, mlen, pre, 2 + ctxlen, pk, + 0); +} + +int MLD_44_ref_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, + const uint8_t *ctx, size_t ctxlen, const uint8_t *pk) +{ + size_t i; + + if (smlen < CRYPTO_BYTES) + { + goto badsig; + } + + *mlen = smlen - CRYPTO_BYTES; + if (MLD_44_ref_verify(sm, CRYPTO_BYTES, sm + CRYPTO_BYTES, *mlen, ctx, + ctxlen, pk)) + { + goto badsig; + } + else + { + /* All good, copy msg, return 0 */ + for (i = 0; i < *mlen; ++i) + { + m[i] = sm[CRYPTO_BYTES + i]; + } + return 0; + } + +badsig: + /* Signature verification failed */ + *mlen = 0; + for (i = 0; i < smlen; ++i) + { + m[i] = 0; + } + + return -1; +} diff --git a/mldsa.h b/mldsa.h new file mode 100644 index 0000000..7c41d2b --- /dev/null +++ b/mldsa.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#ifndef MLD_API_H +#define MLD_API_H + +#include +#include + +#define MLD_44_PUBLICKEYBYTES 1312 +#define MLD_44_SECRETKEYBYTES 2560 +#define MLD_44_BYTES 2420 + +#define MLD_44_ref_PUBLICKEYBYTES MLD_44_PUBLICKEYBYTES +#define MLD_44_ref_SECRETKEYBYTES MLD_44_SECRETKEYBYTES +#define MLD_44_ref_BYTES MLD_44_BYTES + +int MLD_44_ref_keypair(uint8_t *pk, uint8_t *sk); + +int MLD_44_ref_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, + size_t mlen, const uint8_t *ctx, size_t ctxlen, + const uint8_t *sk); + +int MLD_44_ref(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, + const uint8_t *ctx, size_t ctxlen, const uint8_t *sk); + +int MLD_44_ref_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, + size_t mlen, const uint8_t *ctx, size_t ctxlen, + const uint8_t *pk); + +int MLD_44_ref_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, + const uint8_t *ctx, size_t ctxlen, const uint8_t *pk); + +#endif /* !MLD_API_H */