--- sys/src/libsec/port/ecc.c
+++ sys/src/libsec/port/ecc.c
@@ -13,6 +13,7 @@ extern void ecmul_p256(ECdomain *dom, ECpoint *a, mpin
mpint *X2, mpint *Y2, mpint *Z2,
mpint *X3, mpint *Y3, mpint *Z3);
extern void ecmul_p256(ECdomain *dom, ECpoint *a, mpint *k, ECpoint *s);
+extern void ecmul_p384(ECdomain *dom, ECpoint *a, mpint *k, ECpoint *s);
void
ecassign(ECdomain *dom, ECpoint *a, ECpoint *b)
@@ -90,6 +91,22 @@ p256dom_prime(void)
return p;
}
+/*
+ * P-384 prime, parsed once on first use; ecmul detects P-384 by
+ * mpcmp against dom->p. Written as twelve 32-bit words, MSB first,
+ * so each group can be checked against FIPS 186-4 D.1.2.4 by eye. */
+static mpint*
+p384dom_prime(void)
+{
+	static mpint *p;
+
+	if(p == nil)
+		p = strtomp("FFFFFFFF" "FFFFFFFF" "FFFFFFFF" "FFFFFFFF" "FFFFFFFF" "FFFFFFFF"
+			"FFFFFFFF" "FFFFFFFE" "FFFFFFFF" "00000000" "00000000" "FFFFFFFF",
+			nil, 16, nil);
+	return p;
+}
+
void
ecmul(ECdomain *dom, ECpoint *a, mpint *k, ECpoint *s)
{
@@ -102,6 +119,10 @@ ecmul(ECdomain *dom, ECpoint *a, mpint *k, ECpoint *s)
}
if(mpcmp(dom->p, p256dom_prime()) == 0){
ecmul_p256(dom, a, k, s);
+ return;
+ }
+ if(mpcmp(dom->p, p384dom_prime()) == 0){
+ ecmul_p384(dom, a, k, s);
return;
}
ns.inf = 1;
--- sys/src/libsec/port/mkfile
+++ sys/src/libsec/port/mkfile
@@ -7,6 +7,7 @@ CFILES = des.c desmodes.c desECB.c desCBC.c des3ECB.c
curve25519.c curve25519_dh.c\
ecc.c jacobian.c secp256r1.c secp384r1.c\
p256.c\
+ p384.c\
hmac.c md5.c md5block.c md4.c sha1.c sha1block.c\
sha2_64.c sha2_128.c sha2block64.c sha2block128.c\
sha1pickle.c md5pickle.c\
@@ -60,4 +61,7 @@ $O.p256timetest: p256timetest.$O
$LD -o $target $prereq
$O.p256timetest: p256timetest.$O
+ $LD -o $target $prereq
+
+$O.p384test: p384test.$O
$LD -o $target $prereq
--- sys/src/libsec/port/p384.c
+++ sys/src/libsec/port/p384.c
@@ -0,0 +1,883 @@
+#include <u.h>
+#include <libc.h>
+#include <mp.h>
+#include <libsec.h>
+
+/*
+ * P-384 (secp384r1, FIPS 186-4 D.1.2.4) constant-time field
+ * arithmetic and scalar multiplication. Mirrors p256.c structure
+ * for the larger 384-bit prime; same constant-time discipline,
+ * same homogeneous projective coordinates, same Montgomery ladder.
+ *
+ * Limb layout: 12 little-endian uint limbs (v[0] = LSB), 48 bytes
+ * per field element. Same width on 386 and amd64; an aliased
+ * v64[6] view is exposed for an ASM accelerator that may layer
+ * atop this code. All field/point ops use v[].
+ *
+ * P-384 prime from FIPS 186-4 Section D.1.2.4 / RFC 5480
+ * Section 2.1.1.1: p = 2^384 - 2^128 - 2^96 + 2^32 - 1. Solinas
+ * reduction recipe: NIST SP 800-186 Section F.6.7.
+ *
+ * The single non-static entry point is ecmul_p384, called from
+ * ecc.c's ecmul dispatch when dom->p matches the P-384 prime.
+ * The file is laid out bottom-up: boundary conversion, field ops
+ * (additive plus Solinas-based multiplicative), point ops, the
+ * ladder-based scalar multiplier, and finally the entry point.
+ */
+
+typedef struct P384field P384field;
+typedef struct P384point P384point;
+
+struct P384field {
+ union { /* anonymous: members are reached directly as f.v / f.v64 */
+ uint v[12]; /* 32-bit limbs, v[0] least significant; the view every op in this file uses */
+ uvlong v64[6]; /* same 48 bytes as 64-bit words; not referenced in this file */
+ };
+};
+
+struct P384point {
+ P384field x; /* projective X; affine x = X/Z */
+ P384field y; /* projective Y; affine y = Y/Z */
+ P384field z; /* homogeneous projective; z=0 is identity */
+};
+
+/* p = 2^384 - 2^128 - 2^96 + 2^32 - 1 as 48 big-endian bytes; handed to betomp by mpToBE48 */
+static uchar p384_p_be[48] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe,
+ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+};
+
+/* same prime as 12 little-endian 32-bit limbs (index 0 = LSB); the modulus used by p384FieldAdd/p384FieldSub */
+static const uint p384_p[12] = {
+ 0xffffffff, 0x00000000, 0x00000000, 0xffffffff,
+ 0xfffffffe, 0xffffffff, 0xffffffff, 0xffffffff,
+ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+};
+
+/*
+ * Curve coefficient b, multiplied in by p384PointAdd/p384PointDouble:
+ * b = b3312fa7 e23ee7e4 988e056b e3f82d19 181d9c6e fe814112
+ * 0314088f 5013875a c656398d 8a2ed19d 2a85c8ed d3ec2aef
+ * Stored as 12 little-endian uint limbs (v[0] = LSB), i.e. the
+ * 32-bit words above in reverse order, matching what mpToP384Field
+ * yields for the same value. Cross-check against secp384r1.c.
+ */
+static P384field p384_b = { { {
+ 0xd3ec2aef, 0x2a85c8ed, 0x8a2ed19d, 0xc656398d,
+ 0x5013875a, 0x0314088f, 0xfe814112, 0x181d9c6e,
+ 0xe3f82d19, 0x988e056b, 0xe23ee7e4, 0xb3312fa7,
+} } };
+
+/*
+ * mp <-> P384field/P384point boundary conversion. Big-endian byte
+ * order at the mp interface; little-endian-limb (v[0] holds the
+ * least-significant 32 bits) inside.
+ */
+
+/*
+ * Reduce in modulo the P-384 prime (rebuilt from p384_p_be on
+ * each call) and store the residue in buf as 48 big-endian bytes.
+ */
+static void
+mpToBE48(uchar *buf, mpint *in)
+{
+	mpint *modulus, *residue;
+
+	residue = mpnew(0);
+	modulus = betomp(p384_p_be, sizeof p384_p_be, nil);
+	mpmod(in, modulus, residue);
+	mptober(residue, buf, 48);
+	mpfree(modulus);
+	mpfree(residue);
+}
+
+static void
+mpToP384Field(P384field *out, mpint *in)
+{
+	uchar be[48], *w;
+	int i;
+
+	mpToBE48(be, in);
+	/* walk the bytes MSB-first while filling limbs from v[11] down */
+	for(i = 11, w = be; i >= 0; i--, w += 4)
+		out->v[i] = ((uint)w[0]<<24) |
+			((uint)w[1]<<16) |
+			((uint)w[2]<<8) | (uint)w[3];
+}
+
+static void
+p384FieldToMp(mpint *out, P384field *in)
+{
+	uchar be[48], *w;
+	int i;
+
+	for(i = 11, w = be; i >= 0; i--, w += 4){	/* top limb first */
+		w[0] = in->v[i] >> 24;
+		w[1] = in->v[i] >> 16;
+		w[2] = in->v[i] >> 8;
+		w[3] = in->v[i];
+	}
+	betomp(be, sizeof be, out);
+}
+
+static void
+mpToP384Point(P384point *out, mpint *x, mpint *y)
+{
+	memset(&out->z, 0, sizeof out->z);
+	out->z.v[0] = 1;	/* affine input: Z = 1, only the low limb set */
+	mpToP384Field(&out->x, x);
+	mpToP384Field(&out->y, y);
+}
+
+/*
+ * Export an affine point to mp form. Precondition: p->z holds the
+ * canonical encoding of 1. Anything else sets errstr and zeroes
+ * both outputs instead of exporting unnormalised coordinates.
+ */
+static void
+p384PointToMp(mpint *x, mpint *y, P384point *p)
+{
+	static P384field unit = { { { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } };
+
+	if(memcmp(&p->z, &unit, sizeof unit) == 0){
+		p384FieldToMp(x, &p->x);
+		p384FieldToMp(y, &p->y);
+		return;
+	}
+	werrstr("p384PointToMp: input not affine");
+	mpassign(mpzero, x);
+	mpassign(mpzero, y);
+}
+
+/*
+ * P-384 field operations. Additive ops (Add/Sub/Neg) use
+ * full-length carry-chain loops over 12 limbs; multiplicative ops
+ * (Mul/Sqr/Inv) follow further down, built on Solinas reduction. No
+ * branches or table lookups depend on input limbs; final reductions
+ * and Cmov select via computed masks.
+ *
+ * Carry and borrow bits are pulled from the high half of a 64-bit
+ * uvlong accumulator: a 32-bit add or subtract cannot overflow 64
+ * bits, so the carry-out is exactly the low bit of (acc >> 32).
+ */
+
+static void
+p384FieldAdd(P384field *r, P384field *a, P384field *b)
+{
+	uvlong w;
+	uint sum[12], red[12], cy, bw, keep, sel;
+	int i;
+
+	/* sum = a + b mod 2^384; cy holds the 385th bit. */
+	cy = 0;
+	for(i = 0; i < 12; i++){
+		w = (uvlong)a->v[i] + b->v[i] + cy;
+		sum[i] = (uint)w;
+		cy = (uint)(w >> 32);
+	}
+
+	/* red = sum - p mod 2^384; bw=1 iff sum < p as a 384-bit int. */
+	bw = 0;
+	for(i = 0; i < 12; i++){
+		w = (uvlong)sum[i] - p384_p[i] - bw;
+		red[i] = (uint)w;
+		bw = (uint)(w >> 32) & 1;
+	}
+
+	/* Keep sum only when it did not overflow 384 bits (cy=0) and
+	 * is below p (bw=1); in every other case sum >= p as a 385-bit
+	 * value and the wrapped difference red is the residue. */
+	sel = bw & (cy ^ 1);
+	keep = -sel;
+	for(i = 0; i < 12; i++)
+		r->v[i] = (red[i] & ~keep) | (sum[i] & keep);
+}
+
+static void
+p384FieldSub(P384field *r, P384field *a, P384field *b)
+{
+	uvlong w;
+	uint diff[12], fix[12], cy, bw, wrapped;
+	int i;
+
+	/* diff = a - b mod 2^384; bw=1 iff a < b. */
+	bw = 0;
+	for(i = 0; i < 12; i++){
+		w = (uvlong)a->v[i] - b->v[i] - bw;
+		diff[i] = (uint)w;
+		bw = (uint)(w >> 32) & 1;
+	}
+
+	/* fix = diff + p mod 2^384. After an underflow diff stands
+	 * for a-b+2^384, so the wrapped sum is a-b+p: the residue in
+	 * [0, p). The carry out of the top limb is simply dropped. */
+	cy = 0;
+	for(i = 0; i < 12; i++){
+		w = (uvlong)diff[i] + p384_p[i] + cy;
+		fix[i] = (uint)w;
+		cy = (uint)(w >> 32);
+	}
+
+	/* Underflow selects fix; otherwise diff is already in [0, p). */
+	wrapped = -bw;
+	for(i = 0; i < 12; i++)
+		r->v[i] = (diff[i] & ~wrapped) | (fix[i] & wrapped);
+}
+
+static void
+p384FieldNeg(P384field *r, P384field *a)
+{
+	P384field nought;
+
+	memset(nought.v, 0, sizeof nought.v);
+	p384FieldSub(r, &nought, a);	/* r = 0 - a mod p */
+}
+
+static void
+p384FieldCmov(P384field *r, P384field *a, P384field *b, int cond)
+{
+	uint u, pickb, picka;
+	int i;
+
+	/* Branch-free normalisation of cond: for a 32-bit unsigned u,
+	 * bit 31 of (u | -u) is set exactly when u != 0. Moving that
+	 * bit to position 0 and negating yields 0 or all-ones, so the
+	 * loop below computes cond ? b : a limb by limb. */
+	u = (uint)cond;
+	pickb = -((u | (0u - u)) >> 31);
+	picka = ~pickb;
+	for(i = 0; i < 12; i++)
+		r->v[i] = (b->v[i] & pickb) | (a->v[i] & picka);
+}
+
+/*
+ * 12x12 schoolbook multiply: T[0..23] = a[0..11] * b[0..11] as a
+ * 24-limb 32-bit-LE bignum. T limbs are uvlong-typed but each
+ * stays in [0, 2^32) at the boundaries; the inner accumulator
+ * carries into the next limb. Inputs are canonical (each limb
+ * < 2^32; full value < 2^384), so the partial-product sum stays
+ * below 2^768 and every accumulator step fits in 64 bits.
+ */
+static void
+p384_mul_12x12(uvlong T[24], uint *a, uint *b)
+{
+	uvlong carry, *row;
+	int i, j;
+
+	memset(T, 0, 24 * sizeof T[0]);
+	for(i = 0; i < 12; i++){
+		row = T + i;	/* a[i]*b is accumulated into T[i..i+12] */
+		carry = 0;
+		for(j = 0; j < 12; j++){
+			carry += row[j] + (uvlong)a[i] * b[j];
+			row[j] = carry & 0xffffffff;
+			carry >>= 32;
+		}
+		row[12] = carry;
+	}
+}
+
+/*
+ * NIST P-384 Solinas reduction. Folds the 12 high limbs of T
+ * (T[12..23]) into the lower 12 using the prime structure
+ * 2^384 = 2^128 + 2^96 - 2^32 + 1 (mod p).
+ *
+ * Recipe (FIPS 186-4 Appendix D.2.4 / NIST SP 800-186 Section
+ * F.6.7). Each Si/Di is a 12-limb word with v[0] LSB; the NIST
+ * text lists each as a high-to-low concatenation of selected
+ * input limbs (a0 = T[0] = LSB).
+ *
+ * T = (a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0)
+ * S1 = (0 0 0 0 0 a23 a22 a21 0 0 0 0 )
+ * S2 = (a23 a22 a21 a20 a19 a18 a17 a16 a15 a14 a13 a12)
+ * S3 = (a20 a19 a18 a17 a16 a15 a14 a13 a12 a23 a22 a21)
+ * S4 = (a19 a18 a17 a16 a15 a14 a13 a12 a20 0 a23 0 )
+ * S5 = (0 0 0 0 a23 a22 a21 a20 0 0 0 0 )
+ * S6 = (0 0 0 0 0 0 a23 a22 a21 0 0 a20)
+ * D1 = (a22 a21 a20 a19 a18 a17 a16 a15 a14 a13 a12 a23)
+ * D2 = (0 0 0 0 0 0 0 a23 a22 a21 a20 0 )
+ * D3 = (0 0 0 0 0 0 0 a23 a23 0 0 0 )
+ *
+ * R = T + 2*S1 + S2 + S3 + S4 + S5 + S6 - D1 - D2 - D3 (mod p).
+ *
+ * The recipe was verified symbolically against full-precision
+ * reduction for 200 random 768-bit inputs and 200 random 12x12
+ * products (200/200 + 200/200 matches) before this code was
+ * written.
+ *
+ * pos = T + 2*S1 + S2 + S3 + S4 + S5 + S6 fits below 8 * 2^384
+ * (max coefficient sum 1+2+1+1+1+1+1 = 8 against a 12-limb max);
+ * neg = D1 + D2 + D3 fits below 2^384 + 2^161 < 3p, because D2 and
+ * D3 reach no higher than limb 4. So pos + 3p - neg is non-negative
+ * and below ~11*p; a fixed sequence of conditional subtractions
+ * of p brings the residue into [0, p). Working accumulators are
+ * 13 32-bit limbs wide to capture the carry above 2^384.
+ */
+
+/* Add a 12-limb little-endian value t into a 13-limb accumulator
+ * buf, weighted by `weight` (1 or 2). */
+static void
+p384_add_term(uvlong buf[13], uint t[12], int weight)
+{
+	uvlong run, w;
+	int limb;
+
+	w = weight;
+	for(run = 0, limb = 0; limb < 12; limb++){
+		run += buf[limb] + w * t[limb];
+		buf[limb] = run & 0xffffffff;
+		run >>= 32;
+	}
+	buf[12] += run;	/* overflow above 2^384 collects in the 13th limb */
+}
+
+/* Add 3*p into a 13-limb accumulator (used to bias the signed
+ * difference non-negative). 3*p fits in 386 bits; the top limb
+ * holds 2 bits. Limb-encoded form of 3 * (2^384 - 2^128 - 2^96
+ * + 2^32 - 1). */
+static void
+p384_add_3p(uvlong buf[13])
+{
+	static const uvlong bias[13] = {
+		0xfffffffdULL, 0x00000002ULL, 0x00000000ULL, 0xfffffffdULL,
+		0xfffffffcULL, 0xffffffffULL, 0xffffffffULL, 0xffffffffULL,
+		0xffffffffULL, 0xffffffffULL, 0xffffffffULL, 0xffffffffULL,
+		0x00000002ULL,
+	};
+	uvlong run;
+	int limb;
+
+	/* buf += 3p over all 13 limbs; a carry out of limb 12 is discarded */
+	for(run = 0, limb = 0; limb < 13; limb++){
+		run += buf[limb] + bias[limb];
+		buf[limb] = run & 0xffffffff;
+		run >>= 32;
+	}
+}
+
+/* sub_buf13: out = a - b in 13-limb arithmetic. Returns the
+ * borrow-out (always 0 for our pre-biased values, but kept for
+ * defensive use). */
+static uint
+p384_sub_buf13(uvlong out[13], uvlong a[13], uvlong b[13])
+{
+	uvlong d;
+	uint bw;
+	int i;
+
+	bw = 0;
+	for(i = 0; i < 13; i++){
+		/* bit 32 of the wrapped 64-bit difference is the borrow */
+		d = a[i] - b[i] - bw;
+		out[i] = d & 0xffffffff;
+		bw = (uint)(d >> 32) & 1;
+	}
+	return bw;
+}
+
+/* Conditionally subtract p from buf in constant time: always
+ * perform the subtraction into a scratch, then mask-select between
+ * buf and scratch based on whether buf >= p. Repeats `rounds`
+ * times to handle large multiples of p left over from the bias +
+ * Solinas sums. */
+static void
+p384_final_reduce(uint r[12], uvlong buf[13], int rounds)
+{
+	static const uvlong p13[13] = {
+		0xffffffffULL, 0x00000000ULL, 0x00000000ULL, 0xffffffffULL,
+		0xfffffffeULL, 0xffffffffULL, 0xffffffffULL, 0xffffffffULL,
+		0xffffffffULL, 0xffffffffULL, 0xffffffffULL, 0xffffffffULL,
+		0x00000000ULL,
+	};
+	uvlong trial[13], d;
+	uint bw, take;
+	int i, pass;
+
+	for(pass = 0; pass < rounds; pass++){
+		/* trial = buf - p over 13 limbs; bw=1 iff buf < p. */
+		bw = 0;
+		for(i = 0; i < 13; i++){
+			d = buf[i] - p13[i] - bw;
+			trial[i] = d & 0xffffffff;
+			bw = (uint)(d >> 32) & 1;
+		}
+		/* No borrow means buf >= p: adopt trial. With a borrow
+		 * buf stays as it is. take is only 32 bits wide, which
+		 * is enough only while every buf limb is below 2^32. */
+		take = -(bw ^ 1);
+		for(i = 0; i < 13; i++)
+			buf[i] = (buf[i] & ~take) | (trial[i] & take);
+	}
+	/* limb 12 is not copied out; the caller sizes rounds so it is 0 */
+	for(i = 0; i < 12; i++)
+		r[i] = (uint)buf[i];
+}
+
+static void
+p384FieldMul(P384field *r, P384field *a, P384field *b)
+{
+ uvlong T[24]; /* full product a*b, one 32-bit limb per element */
+ uvlong pos[13], neg[13], diff[13]; /* 13-limb sums: added terms, subtracted terms, pos - neg */
+ uint S1[12], S2[12], S3[12], S4[12], S5[12], S6[12];
+ uint D1[12], D2[12], D3[12];
+ uint t[24]; /* T narrowed to uint so terms can be built by plain copies */
+ int i;
+
+ /* 12x12 schoolbook product into 24 limbs. */
+ p384_mul_12x12(T, a->v, b->v);
+ for(i = 0; i < 24; i++)
+ t[i] = (uint)T[i];
+
+ /* Build the six S vectors and three D vectors per the recipe.
+ * Indexing: position i in the limb array carries 2^(32i); the
+ * NIST table reads MSB-first (position 11 is leftmost). Empty
+ * slots are zero. */
+
+ /* S1: a21 a22 a23 at limbs 4..6. */
+ for(i = 0; i < 12; i++) S1[i] = 0;
+ S1[4] = t[21]; S1[5] = t[22]; S1[6] = t[23];
+
+ /* S2 = high half of T (limbs 12..23). */
+ for(i = 0; i < 12; i++) S2[i] = t[12+i];
+
+ /* S3: rotation of high half. */
+ S3[0]=t[21]; S3[1]=t[22]; S3[2]=t[23]; S3[3]=t[12];
+ S3[4]=t[13]; S3[5]=t[14]; S3[6]=t[15]; S3[7]=t[16];
+ S3[8]=t[17]; S3[9]=t[18]; S3[10]=t[19]; S3[11]=t[20];
+
+ /* S4. */
+ S4[0]=0; S4[1]=t[23]; S4[2]=0; S4[3]=t[20];
+ S4[4]=t[12]; S4[5]=t[13]; S4[6]=t[14]; S4[7]=t[15];
+ S4[8]=t[16]; S4[9]=t[17]; S4[10]=t[18]; S4[11]=t[19];
+
+ /* S5: a20..a23 at limbs 4..7. */
+ for(i = 0; i < 12; i++) S5[i] = 0;
+ S5[4]=t[20]; S5[5]=t[21]; S5[6]=t[22]; S5[7]=t[23];
+
+ /* S6. */
+ for(i = 0; i < 12; i++) S6[i] = 0;
+ S6[0]=t[20]; S6[3]=t[21]; S6[4]=t[22]; S6[5]=t[23];
+
+ /* D1: rotated high half with a23 at the low limb. */
+ D1[0]=t[23]; D1[1]=t[12]; D1[2]=t[13]; D1[3]=t[14];
+ D1[4]=t[15]; D1[5]=t[16]; D1[6]=t[17]; D1[7]=t[18];
+ D1[8]=t[19]; D1[9]=t[20]; D1[10]=t[21]; D1[11]=t[22];
+
+ /* D2: a20..a23 at limbs 1..4. */
+ for(i = 0; i < 12; i++) D2[i] = 0;
+ D2[1]=t[20]; D2[2]=t[21]; D2[3]=t[22]; D2[4]=t[23];
+
+ /* D3: a23 twice at limbs 3..4. */
+ for(i = 0; i < 12; i++) D3[i] = 0;
+ D3[3]=t[23]; D3[4]=t[23];
+
+ /* pos = T + 2*S1 + S2 + S3 + S4 + S5 + S6. */
+ for(i = 0; i < 13; i++)
+ pos[i] = 0;
+ for(i = 0; i < 12; i++)
+ pos[i] = t[i];
+ p384_add_term(pos, S1, 2);
+ p384_add_term(pos, S2, 1);
+ p384_add_term(pos, S3, 1);
+ p384_add_term(pos, S4, 1);
+ p384_add_term(pos, S5, 1);
+ p384_add_term(pos, S6, 1);
+
+ /* neg = D1 + D2 + D3. */
+ for(i = 0; i < 13; i++)
+ neg[i] = 0;
+ p384_add_term(neg, D1, 1);
+ p384_add_term(neg, D2, 1);
+ p384_add_term(neg, D3, 1);
+
+ /* Bias non-negative: D2 and D3 only reach limb 4, so neg is
+ * below 2^384 + 2^161 < 3p and pos + 3p - neg cannot underflow. */
+ p384_add_3p(pos);
+
+ /* diff = pos - neg. Result lies in [0, ~11p). */
+ (void)p384_sub_buf13(diff, pos, neg);
+
+ /* 14 rounds of CT subtract-or-keep brings diff into [0, p);
+ * 11 would suffice for the bound above, slack guards reasoning
+ * gaps in the worst-case analysis. */
+ p384_final_reduce(r->v, diff, 14); /* r is written only here, so it may alias a or b */
+}
+
+static void
+p384FieldSqr(P384field *r, P384field *a)
+{
+ /* r = a*a mod p, computed by the general multiplier; r may
+ * alias a. BUGlet: a dedicated squaring kernel could reuse the
+ * mirrored partial products a[i]*a[j] == a[j]*a[i]; left out
+ * of the initial constant-time landing. */
+ p384FieldMul(r, a, a);
+}
+
+/*
+ * Inversion via Fermat's little theorem: a^(p-2) = a^-1 mod p.
+ * Exponent
+ * p - 2 = 2^384 - 2^128 - 2^96 + 2^32 - 3
+ * has the bit pattern (MSB to LSB):
+ * 255 ones | 1 zero | 32 ones | 64 zeros | 30 ones | 1 zero | 1 one
+ *
+ * Decomposition:
+ * p - 2 = (2^255 - 1) << 129
+ * + (2^32 - 1) << 96
+ * + (2^32 - 3)
+ * (verified symbolically; sums to p - 2).
+ *
+ * Plan, working with running exponent of `acc`:
+ * 1. Build the helper f_n = a^(2^n - 1) for n in
+ * {2,3,6,12,15,16,30,32,48,64,128,192,240,252,255}.
+ * 2. acc = f255; square 129 times -> acc = a^(2^384 - 2^129).
+ * 3. Multiply by (f32 squared 96 times) = a^(2^128 - 2^96).
+ * 4. Multiply by (f30 squared 2 times then * a) = a^(2^32 - 3).
+ *
+ * Cost: ~498 squarings + 18 multiplications. All operations run
+ * in time independent of the operand value; the exponent is the
+ * public prime constant, so unrolling on its bit pattern leaks
+ * nothing.
+ */
+static void
+p384FieldInv(P384field *r, P384field *a)
+{
+ P384field f1, f2, f3, f6, f12, f15, f16, f30, f32, f48; /* fN = a^(2^N - 1) */
+ P384field f64, f128, f192, f240, f252, f255;
+ P384field acc, t; /* acc: running result; t: scratch for squaring runs */
+ int i;
+
+ f1 = *a; /* a^(2^1 - 1) */
+
+ p384FieldSqr(&t, &f1); /* a^2 */
+ p384FieldMul(&f2, &t, &f1); /* a^(2^2 - 1) */
+
+ p384FieldSqr(&t, &f2); /* a^6 */
+ p384FieldMul(&f3, &t, &f1); /* a^(2^3 - 1) */
+
+ t = f3;
+ for(i = 0; i < 3; i++)
+ p384FieldSqr(&t, &t);
+ p384FieldMul(&f6, &t, &f3); /* a^(2^6 - 1) */
+
+ t = f6;
+ for(i = 0; i < 6; i++)
+ p384FieldSqr(&t, &t);
+ p384FieldMul(&f12, &t, &f6); /* a^(2^12 - 1) */
+
+ t = f12;
+ for(i = 0; i < 3; i++)
+ p384FieldSqr(&t, &t);
+ p384FieldMul(&f15, &t, &f3); /* a^(2^15 - 1) */
+
+ p384FieldSqr(&t, &f15); /* a^(2^16 - 2) */
+ p384FieldMul(&f16, &t, &f1); /* a^(2^16 - 1) */
+
+ t = f15;
+ for(i = 0; i < 15; i++)
+ p384FieldSqr(&t, &t);
+ p384FieldMul(&f30, &t, &f15); /* a^(2^30 - 1) */
+
+ t = f30;
+ for(i = 0; i < 2; i++)
+ p384FieldSqr(&t, &t);
+ p384FieldMul(&f32, &t, &f2); /* a^(2^32 - 1) */
+
+ t = f32;
+ for(i = 0; i < 16; i++)
+ p384FieldSqr(&t, &t);
+ p384FieldMul(&f48, &t, &f16); /* a^(2^48 - 1) */
+
+ t = f32;
+ for(i = 0; i < 32; i++)
+ p384FieldSqr(&t, &t);
+ p384FieldMul(&f64, &t, &f32); /* a^(2^64 - 1) */
+
+ t = f64;
+ for(i = 0; i < 64; i++)
+ p384FieldSqr(&t, &t);
+ p384FieldMul(&f128, &t, &f64); /* a^(2^128 - 1) */
+
+ t = f128;
+ for(i = 0; i < 64; i++)
+ p384FieldSqr(&t, &t);
+ p384FieldMul(&f192, &t, &f64); /* a^(2^192 - 1) */
+
+ t = f192;
+ for(i = 0; i < 48; i++)
+ p384FieldSqr(&t, &t);
+ p384FieldMul(&f240, &t, &f48); /* a^(2^240 - 1) */
+
+ t = f240;
+ for(i = 0; i < 12; i++)
+ p384FieldSqr(&t, &t);
+ p384FieldMul(&f252, &t, &f12); /* a^(2^252 - 1) */
+
+ t = f252;
+ for(i = 0; i < 3; i++)
+ p384FieldSqr(&t, &t);
+ p384FieldMul(&f255, &t, &f3); /* a^(2^255 - 1) */
+
+ /* acc = a^((2^255 - 1) * 2^129) = a^(2^384 - 2^129). */
+ acc = f255;
+ for(i = 0; i < 129; i++)
+ p384FieldSqr(&acc, &acc);
+
+ /* t = a^((2^32 - 1) * 2^96) = a^(2^128 - 2^96). */
+ t = f32;
+ for(i = 0; i < 96; i++)
+ p384FieldSqr(&t, &t);
+ p384FieldMul(&acc, &acc, &t);
+ /* acc = a^(2^384 - 2^128 - 2^96). */
+
+ /* t = a^((2^30 - 1) * 4 + 1) = a^(2^32 - 3). */
+ t = f30;
+ p384FieldSqr(&t, &t); /* a^(2^31 - 2) */
+ p384FieldSqr(&t, &t); /* a^(2^32 - 4) */
+ p384FieldMul(&t, &t, &f1); /* a^(2^32 - 3) */
+ p384FieldMul(r, &acc, &t); /* r is written only here, so it may alias a */
+ /* r = a^(p - 2). */
+}
+
+/*
+ * P-384 point arithmetic in homogeneous projective coordinates
+ * (X:Y:Z) with affine map x = X/Z, y = Y/Z; identity is Z = 0.
+ * All routines run in time independent of operand value: every
+ * field op executes regardless of input, and identity / doubling
+ * / inverse-pair cases emerge from the algebra without a branch.
+ */
+
+static void
+p384PointAdd(P384point *r, P384point *a, P384point *b)
+{
+ P384field X1, Y1, Z1, X2, Y2, Z2, X3, Y3, Z3;
+ P384field t0, t1, t2, t3, t4;
+
+ X1 = a->x; Y1 = a->y; Z1 = a->z; /* inputs are copied first ... */
+ X2 = b->x; Y2 = b->y; Z2 = b->z; /* ... so r may alias a and/or b */
+
+ /* RCB 2016 Algorithm 4, a = -3: the 43 field operations below,
+ * in order. Same sequence as p256PointAdd; only the width differs. */
+ p384FieldMul(&t0, &X1, &X2); /* t0 = X1*X2 */
+ p384FieldMul(&t1, &Y1, &Y2); /* t1 = Y1*Y2 */
+ p384FieldMul(&t2, &Z1, &Z2); /* t2 = Z1*Z2 */
+ p384FieldAdd(&t3, &X1, &Y1); /* t3 = X1+Y1 */
+ p384FieldAdd(&t4, &X2, &Y2); /* t4 = X2+Y2 */
+ p384FieldMul(&t3, &t3, &t4); /* t3 = t3*t4 */
+ p384FieldAdd(&t4, &t0, &t1); /* t4 = t0+t1 */
+ p384FieldSub(&t3, &t3, &t4); /* t3 = t3-t4 */
+ p384FieldAdd(&t4, &Y1, &Z1); /* t4 = Y1+Z1 */
+ p384FieldAdd(&X3, &Y2, &Z2); /* X3 = Y2+Z2 */
+ p384FieldMul(&t4, &t4, &X3); /* t4 = t4*X3 */
+ p384FieldAdd(&X3, &t1, &t2); /* X3 = t1+t2 */
+ p384FieldSub(&t4, &t4, &X3); /* t4 = t4-X3 */
+ p384FieldAdd(&X3, &X1, &Z1); /* X3 = X1+Z1 */
+ p384FieldAdd(&Y3, &X2, &Z2); /* Y3 = X2+Z2 */
+ p384FieldMul(&X3, &X3, &Y3); /* X3 = X3*Y3 */
+ p384FieldAdd(&Y3, &t0, &t2); /* Y3 = t0+t2 */
+ p384FieldSub(&Y3, &X3, &Y3); /* Y3 = X3-Y3 */
+ p384FieldMul(&Z3, &p384_b, &t2); /* Z3 = b*t2 */
+ p384FieldSub(&X3, &Y3, &Z3); /* X3 = Y3-Z3 */
+ p384FieldAdd(&Z3, &X3, &X3); /* Z3 = X3+X3 */
+ p384FieldAdd(&X3, &X3, &Z3); /* X3 = X3+Z3 */
+ p384FieldSub(&Z3, &t1, &X3); /* Z3 = t1-X3 */
+ p384FieldAdd(&X3, &t1, &X3); /* X3 = t1+X3 */
+ p384FieldMul(&Y3, &p384_b, &Y3); /* Y3 = b*Y3 */
+ p384FieldAdd(&t1, &t2, &t2); /* t1 = t2+t2 */
+ p384FieldAdd(&t2, &t1, &t2); /* t2 = t1+t2 */
+ p384FieldSub(&Y3, &Y3, &t2); /* Y3 = Y3-t2 */
+ p384FieldSub(&Y3, &Y3, &t0); /* Y3 = Y3-t0 */
+ p384FieldAdd(&t1, &Y3, &Y3); /* t1 = Y3+Y3 */
+ p384FieldAdd(&Y3, &t1, &Y3); /* Y3 = t1+Y3 */
+ p384FieldAdd(&t1, &t0, &t0); /* t1 = t0+t0 */
+ p384FieldAdd(&t0, &t1, &t0); /* t0 = t1+t0 */
+ p384FieldSub(&t0, &t0, &t2); /* t0 = t0-t2 */
+ p384FieldMul(&t1, &t4, &Y3); /* t1 = t4*Y3 */
+ p384FieldMul(&t2, &t0, &Y3); /* t2 = t0*Y3 */
+ p384FieldMul(&Y3, &X3, &Z3); /* Y3 = X3*Z3 */
+ p384FieldAdd(&Y3, &Y3, &t2); /* Y3 = Y3+t2 */
+ p384FieldMul(&X3, &t3, &X3); /* X3 = t3*X3 */
+ p384FieldSub(&X3, &X3, &t1); /* X3 = X3-t1 */
+ p384FieldMul(&Z3, &t4, &Z3); /* Z3 = t4*Z3 */
+ p384FieldMul(&t1, &t3, &t0); /* t1 = t3*t0 */
+ p384FieldAdd(&Z3, &Z3, &t1); /* Z3 = Z3+t1 */
+
+ r->x = X3; r->y = Y3; r->z = Z3; /* result stored only after all reads of a and b */
+}
+
+static void
+p384PointDouble(P384point *r, P384point *a)
+{
+ P384field X1, Y1, Z1, X3, Y3, Z3;
+ P384field t0, t1, t2, t3;
+
+ X1 = a->x; Y1 = a->y; Z1 = a->z; /* copied first, so r may alias a */
+
+ /* RCB 2016 Algorithm 6, a = -3. Exception-free doubling; the 34 field operations below, in order. */
+ p384FieldSqr(&t0, &X1); /* t0 = X^2 */
+ p384FieldSqr(&t1, &Y1); /* t1 = Y^2 */
+ p384FieldSqr(&t2, &Z1); /* t2 = Z^2 */
+ p384FieldMul(&t3, &X1, &Y1); /* t3 = X*Y */
+ p384FieldAdd(&t3, &t3, &t3); /* t3 = t3+t3 */
+ p384FieldMul(&Z3, &X1, &Z1); /* Z3 = X*Z */
+ p384FieldAdd(&Z3, &Z3, &Z3); /* Z3 = Z3+Z3 */
+ p384FieldMul(&Y3, &p384_b, &t2); /* Y3 = b*t2 */
+ p384FieldSub(&Y3, &Y3, &Z3); /* Y3 = Y3-Z3 */
+ p384FieldAdd(&X3, &Y3, &Y3); /* X3 = Y3+Y3 */
+ p384FieldAdd(&Y3, &X3, &Y3); /* Y3 = X3+Y3 */
+ p384FieldSub(&X3, &t1, &Y3); /* X3 = t1-Y3 */
+ p384FieldAdd(&Y3, &t1, &Y3); /* Y3 = t1+Y3 */
+ p384FieldMul(&Y3, &X3, &Y3); /* Y3 = X3*Y3 */
+ p384FieldMul(&X3, &X3, &t3); /* X3 = X3*t3 */
+ p384FieldAdd(&t3, &t2, &t2); /* t3 = t2+t2 */
+ p384FieldAdd(&t2, &t2, &t3); /* t2 = t2+t3 */
+ p384FieldMul(&Z3, &p384_b, &Z3); /* Z3 = b*Z3 */
+ p384FieldSub(&Z3, &Z3, &t2); /* Z3 = Z3-t2 */
+ p384FieldSub(&Z3, &Z3, &t0); /* Z3 = Z3-t0 */
+ p384FieldAdd(&t3, &Z3, &Z3); /* t3 = Z3+Z3 */
+ p384FieldAdd(&Z3, &Z3, &t3); /* Z3 = Z3+t3 */
+ p384FieldAdd(&t3, &t0, &t0); /* t3 = t0+t0 */
+ p384FieldAdd(&t0, &t3, &t0); /* t0 = t3+t0 */
+ p384FieldSub(&t0, &t0, &t2); /* t0 = t0-t2 */
+ p384FieldMul(&t0, &t0, &Z3); /* t0 = t0*Z3 */
+ p384FieldAdd(&Y3, &Y3, &t0); /* Y3 = Y3+t0 */
+ p384FieldMul(&t0, &Y1, &Z1); /* t0 = Y*Z */
+ p384FieldAdd(&t0, &t0, &t0); /* t0 = t0+t0 */
+ p384FieldMul(&Z3, &t0, &Z3); /* Z3 = t0*Z3 */
+ p384FieldSub(&X3, &X3, &Z3); /* X3 = X3-Z3 */
+ p384FieldMul(&Z3, &t0, &t1); /* Z3 = t0*t1 */
+ p384FieldAdd(&Z3, &Z3, &Z3); /* Z3 = Z3+Z3 */
+ p384FieldAdd(&Z3, &Z3, &Z3); /* Z3 = Z3+Z3 */
+
+ r->x = X3; r->y = Y3; r->z = Z3; /* result stored only after all reads of a */
+}
+
+/*
+ * Project (X:Y:Z) to the affine pair (X/Z, Y/Z). One field
+ * inversion per scalar mul -- the most expensive single op, so
+ * callers fold to once-per-result rather than once-per-step.
+ * Precondition: Z != 0; calling with the identity yields garbage
+ * (callers detect Z = 0 and reject before reaching us).
+ */
+static void
+p384PointToAffine(P384field *x, P384field *y, P384point *p)
+{
+	P384field zrecip;
+
+	p384FieldInv(&zrecip, &p->z);	/* 1/Z, shared by both coordinates */
+	p384FieldMul(x, &p->x, &zrecip);	/* x = X/Z */
+	p384FieldMul(y, &p->y, &zrecip);	/* y = Y/Z */
+}
+
+/*
+ * P-384 scalar multiplication via the Montgomery ladder.
+ * Constant-time wrt the scalar: loop count is fixed at 384 (the
+ * bit length of the group order n, FIPS 186-4 D.1.2.4), every
+ * iteration runs one point add and one point double regardless
+ * of the bit value, and the branch on each scalar bit is replaced
+ * by a bit-mask conditional swap of (R0, R1).
+ *
+ * Identity in this representation is (0:1:0), not (0:0:0): RCB
+ * 2016 Algorithm 4 requires Y != 0 even when Z = 0, otherwise
+ * the formula degenerates. R0 is initialised to (0:1:0).
+ *
+ * Reference: SEC 1 v2.0 Section 3.2; Joye-Yen "The Montgomery
+ * powering ladder", CHES 2002.
+ */
+
+static void
+p384PointCswap(P384point *a, P384point *b, int swap)
+{
+	P384point na, nb;	/* na = swap ? b : a;  nb = swap ? a : b */
+
+	p384FieldCmov(&na.x, &a->x, &b->x, swap);
+	p384FieldCmov(&nb.x, &b->x, &a->x, swap);
+	p384FieldCmov(&na.y, &a->y, &b->y, swap);
+	p384FieldCmov(&nb.y, &b->y, &a->y, swap);
+	p384FieldCmov(&na.z, &a->z, &b->z, swap);
+	p384FieldCmov(&nb.z, &b->z, &a->z, swap);
+	*a = na;	/* both selections are complete before either input is overwritten */
+	*b = nb;
+}
+
+static void
+p384ScalarMul(P384point *r, mpint *k, P384point *P)
+{
+	P384point R0, R1;
+	uchar k_be[48];
+	int i, bit, prev_bit, swap;
+
+	/* R0 = identity (0:1:0); R1 = P. */
+	memset(&R0, 0, sizeof R0);
+	R0.y.v[0] = 1;
+	R1 = *P;
+
+	/* Serialize k as 48 big-endian bytes. mptober left-pads with
+	 * zeros if k is shorter than 48 bytes; if k were ever larger
+	 * than 2^384 the high bytes would be clipped, but callers feed
+	 * scalars already reduced mod n (n < 2^384). */
+	mptober(k, k_be, sizeof k_be);
+
+	prev_bit = 0;
+	for(i = 383; i >= 0; i--){
+		bit = (k_be[47 - i/8] >> (i & 7)) & 1;
+		swap = prev_bit ^ bit;	/* swap only when the bit changes */
+		p384PointCswap(&R0, &R1, swap);
+		prev_bit = bit;
+		p384PointAdd(&R1, &R0, &R1);
+		p384PointDouble(&R0, &R0);
+	}
+	/* Undo the swap still pending from the last scalar bit. */
+	p384PointCswap(&R0, &R1, prev_bit);
+	*r = R0;
+	/* Scrub the serialized secret scalar before the frame is reused. */
+	memset(k_be, 0, sizeof k_be);
+}
+
+/*
+ * P-384 constant-time scalar multiply: s = k*a. Sign and
+ * identity normalisation match the generic ecmul path so callers
+ * see one ecmul contract regardless of curve. This is the only
+ * non-static symbol in this file; ecc.c declares it extern and
+ * dispatches to it when dom->p matches the P-384 prime.
+ */
+void
+ecmul_p384(ECdomain *dom, ECpoint *a, mpint *k, ECpoint *s)
+{
+ P384point P, R;
+ P384field rx, ry, zero;
+ mpint *kk, *yneg;
+
+ kk = mpcopy(k);
+ kk->sign = 1; /* multiply by |k|; the sign is applied to y at the end */
+ mpmod(kk, dom->n, kk);
+ mpToP384Point(&P, a->x, a->y); /* NOTE(review): a->z and a->inf are never consulted here -- confirm callers only pass finite affine points */
+ p384ScalarMul(&R, kk, &P);
+ mpfree(kk);
+
+ /* Identity from k*P = O. Non-CT check is fine: the only
+ * secret is k, and "k*P = O" only fires for k a multiple of
+ * the group order n; TLS callers feed k in [1, n-1] so the
+ * leak surface is empty in practice. */
+ memset(&zero, 0, sizeof zero);
+ if(memcmp(&R.z, &zero, sizeof zero) == 0){
+ s->inf = 1; /* s->x, s->y, s->z are left untouched on this path */
+ return;
+ }
+ p384PointToAffine(&rx, &ry, &R);
+ s->inf = 0;
+ p384FieldToMp(s->x, &rx);
+ p384FieldToMp(s->y, &ry);
+ if(s->z != nil)
+ mpassign(mpone, s->z); /* result is affine, so a present z is set to 1 */
+ if(k->sign < 0){
+ /* (-y) mod p == p - y for y in [0, p). */
+ yneg = mpnew(0);
+ mpsub(dom->p, s->y, yneg);
+ mpassign(yneg, s->y);
+ mpfree(yneg);
+ }
+}
--- sys/src/libsec/port/p384test.c
+++ sys/src/libsec/port/p384test.c
@@ -0,0 +1,322 @@
+#include <u.h>
+#include <libc.h>
+#include <mp.h>
+#include <libsec.h>
+
+/*
+ * Regression vectors for the constant-time P-384 path in libsec.
+ * Drives ecmul_p384 (via ecdsaverify and a sign+verify round-trip)
+ * so any miscompile or aliasing fault in p384.c surfaces here, not
+ * in TLS handshake debugging.
+ *
+ * Vector sources cited per-row:
+ * RFC 6979 Appendix A.2.6 -- deterministic ECDSA over P-384;
+ * given (d, msg, hash) the signature (r, s) is uniquely
+ * determined and reproducible across implementations.
+ * FIPS 186-4 Appendix D.1.2.4 -- curve parameters.
+ *
+ * What is exercised:
+ * 1. RFC 6979 verify: each row's (r, s) verifies against the
+ * RFC's published public key (Qx, Qy). ecdsaverify drives
+ * ecmul on the generator and on the public point; the
+ * constant-time P-384 path receives both.
+ * 2. Sign+verify round-trip: ecdsasign produces (r, s) with a
+ * random k; the same key set then verifies the result.
+ * Drives ecmul through ecgen (signing direction) plus
+ * ecdsaverify.
+ * 3. Negative tests: tampered (r, s) or tampered digest must
+ * fail verify. Catches a permissive ecdsaverify or a
+ * degenerate ecmul that returns identity for all inputs.
+ */
+
+typedef struct EcdsaVector EcdsaVector;
+struct EcdsaVector {
+ char *name; /* label printed by report() */
+ char *msg; /* message text handed to hash */
+ int (*hash)(uchar *in, ulong inlen, uchar *out); /* writes the digest to out, returns its length */
+ int dlen; /* digest length the hash adaptor is expected to return */
+ char *r_hex; /* expected signature r, hexadecimal */
+ char *s_hex; /* expected signature s, hexadecimal */
+};
+
+/*
+ * RFC 6979 Appendix A.2.6 key pair shared by every vector below,
+ * as hex strings: private scalar d, public point (Qx, Qy).
+ * d = 6B9D3DAD2E1B8C1C05B19875B6659F4DE23C3B667BF297BA9AA47740
+ * 787137D896D5724E4C70A825F872C9EA60D2EDF5
+ * Qx = EC3A4E415B4E19A4568618029F427FA5DA9A8BC4AE92E02E06AAE528
+ * 6B300C64DEF8F0EA9055866064A254515480BC13
+ * Qy = 8015D9B72D7D57244EA8EF9AC0C621896708A59367F9DFB9F54CA84B
+ * 3F1C9DB1288B231C3AE0D4FE7344FD2533264720
+ */
+static char *rfc6979_d =
+ "6B9D3DAD2E1B8C1C05B19875B6659F4DE23C3B667BF297BA9AA47740"
+ "787137D896D5724E4C70A825F872C9EA60D2EDF5";
+static char *rfc6979_Qx =
+ "EC3A4E415B4E19A4568618029F427FA5DA9A8BC4AE92E02E06AAE528"
+ "6B300C64DEF8F0EA9055866064A254515480BC13";
+static char *rfc6979_Qy =
+ "8015D9B72D7D57244EA8EF9AC0C621896708A59367F9DFB9F54CA84B"
+ "3F1C9DB1288B231C3AE0D4FE7344FD2533264720";
+
+/*
+ * Hash adaptors: project each digest function into a uniform
+ * (in, len, out) -> dlen signature so the table stays narrow.
+ */
+static int
+hash_sha256(uchar *in, ulong inlen, uchar *out)
+{
+ sha2_256(in, inlen, out, nil); /* digest of in[0..inlen) goes to out */
+ return SHA2_256dlen; /* length reported to the caller as the digest size */
+}
+
+static int
+hash_sha384(uchar *in, ulong inlen, uchar *out)
+{
+ sha2_384(in, inlen, out, nil); /* digest of in[0..inlen) goes to out */
+ return SHA2_384dlen; /* length reported to the caller as the digest size */
+}
+
+static int
+hash_sha512(uchar *in, ulong inlen, uchar *out)
+{
+ sha2_512(in, inlen, out, nil); /* digest of in[0..inlen) goes to out */
+ return SHA2_512dlen; /* length reported to the caller as the digest size */
+}
+
+static EcdsaVector rfc6979_vectors[] = { /* each row: name; msg, hash, dlen; r_hex; s_hex */
+ {
+ "RFC 6979 A.2.6 sample SHA-256",
+ "sample", hash_sha256, SHA2_256dlen,
+ "21B13D1E013C7FA1392D03C5F99AF8B30C570C6F98D4EA8E354B63A21D3DAA33"
+ "BDE1E888E63355D92FA2B3C36D8FB2CD", /* r */
+ "F3AA443FB107745BF4BD77CB3891674632068A10CA67E3D45DB2266FA7D1FEEB"
+ "EFDC63ECCD1AC42EC0CB8668A4FA0AB0", /* s */
+ },
+ {
+ "RFC 6979 A.2.6 sample SHA-384",
+ "sample", hash_sha384, SHA2_384dlen,
+ "94EDBB92A5ECB8AAD4736E56C691916B3F88140666CE9FA73D64C4EA95AD133C"
+ "81A648152E44ACF96E36DD1E80FABE46", /* r */
+ "99EF4AEB15F178CEA1FE40DB2603138F130E740A19624526203B6351D0A3A94F"
+ "A329C145786E679E7B82C71A38628AC8", /* s */
+ },
+ {
+ "RFC 6979 A.2.6 sample SHA-512",
+ "sample", hash_sha512, SHA2_512dlen,
+ "ED0959D5880AB2D869AE7F6C2915C6D60F96507F9CB3E047C0046861DA4A799C"
+ "FE30F35CC900056D7C99CD7882433709", /* r */
+ "512C8CCEEE3890A84058CE1E22DBC2198F42323CE8ACA9135329F03C068E5112"
+ "DC7CC3EF3446DEFCEB01A45C2667FDD5", /* s */
+ },
+ {
+ "RFC 6979 A.2.6 test SHA-256",
+ "test", hash_sha256, SHA2_256dlen,
+ "6D6DEFAC9AB64DABAFE36C6BF510352A4CC27001263638E5B16D9BB51D451559"
+ "F918EEDAF2293BE5B475CC8F0188636B", /* r */
+ "2D46F3BECBCC523D5F1A1256BF0C9B024D879BA9E838144C8BA6BAEB4B53B47D"
+ "51AB373F9845C0514EEFB14024787265", /* s */
+ },
+ {
+ "RFC 6979 A.2.6 test SHA-384",
+ "test", hash_sha384, SHA2_384dlen,
+ "8203B63D3C853E8D77227FB377BCF7B7B772E97892A80F36AB775D509D7A5FEB"
+ "0542A7F0812998DA8F1DD3CA3CF023DB", /* r */
+ "DDD0760448D42D8A43AF45AF836FCE4DE8BE06B485E9B61B827C2F13173923E0"
+ "6A739F040649A667BF3B828246BAA5A5", /* s */
+ },
+ {
+ "RFC 6979 A.2.6 test SHA-512",
+ "test", hash_sha512, SHA2_512dlen,
+ "A0D5D090C9980FAF3C2CE57B7AE951D31977DD11C775D314AF55F76C676447D0"
+ "6FB6495CD21B4B6E340FC236584FB277", /* r */
+ "976984E59B4C77B0E8E4460DCA3D9F20E07B9BB1F63BEEFAF576F6B2E8B22463"
+ "4A2092CD3792E0159AD9CEE37659C736", /* s */
+ },
+};
+
+static int passed;
+static int failed;
+
+/* Print one verdict line for the named check and bump the matching tally. */
+static void
+report(char *name, int ok)
+{
+	int *tally = ok ? &passed : &failed;
+
+	print(" %s: %s\n", name, ok ? "ok" : "BAD");
+	++*tally;
+}
+
+/*
+ * Positive check for one table row: digest msg, parse the expected
+ * (r, s), and require ecdsaverify to return 1 under the shared key.
+ */
+static void
+run_rfc6979(ECdomain *dom, ECpub *Q, EcdsaVector *v)
+{
+	uchar digest[SHA2_512dlen];
+	mpint *sig_r, *sig_s;
+	int got, good;
+
+	good = 0;
+	got = v->hash((uchar*)v->msg, strlen(v->msg), digest);
+	/* a hash adaptor reporting an unexpected length fails the row */
+	if(got == v->dlen){
+		sig_r = strtomp(v->r_hex, nil, 16, nil);
+		sig_s = strtomp(v->s_hex, nil, 16, nil);
+		good = ecdsaverify(dom, Q, digest, got, sig_r, sig_s) == 1;
+		mpfree(sig_r);
+		mpfree(sig_s);
+	}
+	report(v->name, good);
+}
+
+/*
+ * Negative check: the row's signature with r replaced by r+1 must
+ * be rejected (ecdsaverify returns 0). Guards against a permissive
+ * verify and against a degenerate ecmul that yields the same value
+ * for every scalar. Were (r+1, s) also a valid signature the check
+ * would fail spuriously; for a valid pair that chance is negligible.
+ */
+static void
+run_negative_r(ECdomain *dom, ECpub *Q, EcdsaVector *v)
+{
+	char label[128];
+	uchar digest[SHA2_512dlen];
+	mpint *bad_r, *sig_s;
+	int got, rejected;
+
+	snprint(label, sizeof label, "%s tamper-r", v->name);
+	got = v->hash((uchar*)v->msg, strlen(v->msg), digest);
+	sig_s = strtomp(v->s_hex, nil, 16, nil);
+	bad_r = strtomp(v->r_hex, nil, 16, nil);
+
+	/* with r off by one, the verify equation u1*G + u2*Q = R is
+	 * reshaped and the recomputed R.x mod n no longer matches r */
+	mpadd(bad_r, mpone, bad_r);
+	rejected = ecdsaverify(dom, Q, digest, got, bad_r, sig_s) == 0;
+	report(label, rejected);
+	mpfree(bad_r);
+	mpfree(sig_s);
+}
+
+/*
+ * Negative check: the genuine (r, s) presented with a digest whose
+ * first byte has its low bit flipped must be rejected by ecdsaverify.
+ */
+static void
+run_negative_digest(ECdomain *dom, ECpub *Q, EcdsaVector *v)
+{
+	char label[128];
+	uchar digest[SHA2_512dlen];
+	mpint *sig_r, *sig_s;
+	int got, rejected;
+
+	snprint(label, sizeof label, "%s tamper-digest", v->name);
+	sig_r = strtomp(v->r_hex, nil, 16, nil);
+	sig_s = strtomp(v->s_hex, nil, 16, nil);
+	got = v->hash((uchar*)v->msg, strlen(v->msg), digest);
+	digest[0] ^= 0x01;
+	rejected = ecdsaverify(dom, Q, digest, got, sig_r, sig_s) == 0;
+	report(label, rejected);
+	mpfree(sig_r);
+	mpfree(sig_s);
+}
+
+/*
+ * Sign-then-verify loop: for each of `rounds` iterations, hash a
+ * per-round message with SHA-384, sign the digest with priv via
+ * ecdsasign, and require ecdsaverify to return 1 for the result
+ * under pub. Each outcome is reported as "<tag> round-trip #i".
+ */
+static void
+run_round_trip(ECdomain *dom, ECpriv *priv, ECpub *pub, char *tag, int rounds)
+{
+	char msg[64], label[128];
+	uchar digest[SHA2_384dlen];
+	mpint *sig_r, *sig_s;
+	int round, verified;
+
+	for(round = 0; round < rounds; round++){
+		/* a distinct message text for every round */
+		snprint(msg, sizeof msg, "round-trip-%d", round);
+		snprint(label, sizeof label, "%s round-trip #%d", tag, round);
+		sha2_384((uchar*)msg, strlen(msg), digest, nil);
+		sig_r = mpnew(0);
+		sig_s = mpnew(0);
+		ecdsasign(dom, priv, digest, SHA2_384dlen, sig_r, sig_s);
+		verified = ecdsaverify(dom, pub, digest, SHA2_384dlen, sig_r, sig_s) == 1;
+		report(label, verified);
+		mpfree(sig_r);
+		mpfree(sig_s);
+	}
+}
+
+void
+main(int argc, char **argv)
+{
+	ECdomain dom;
+	ECpub pub;
+	ECpriv key;
+	EcdsaVector *v, *end;
+
+	USED(argc);
+	USED(argv);
+	fmtinstall('B', mpfmt);
+
+	print("p384test:\n");
+	ecdominit(&dom, secp384r1);
+
+	/*
+	 * Public half of the RFC 6979 A.2.6 key, as the ECpub handed
+	 * to ecdsaverify. z is allocated and set to one so that code
+	 * expecting a populated z sees an affine point.
+	 */
+	memset(&pub, 0, sizeof pub);
+	pub.x = mpnew(0);
+	pub.y = mpnew(0);
+	pub.z = mpnew(0);
+	strtomp(rfc6979_Qx, nil, 16, pub.x);
+	strtomp(rfc6979_Qy, nil, 16, pub.y);
+	mpassign(mpone, pub.z);
+	pub.inf = 0;
+
+	/* Private half for ecdsasign: the same point plus the scalar d.
+	 * Only x, y and d are allocated; z is left zeroed by the memset. */
+	memset(&key, 0, sizeof key);
+	key.x = mpnew(0);
+	key.y = mpnew(0);
+	key.d = mpnew(0);
+	strtomp(rfc6979_Qx, nil, 16, key.x);
+	strtomp(rfc6979_Qy, nil, 16, key.y);
+	strtomp(rfc6979_d, nil, 16, key.d);
+	key.inf = 0;
+
+	if(!ecpubverify(&dom, &pub)){
+		print(" RFC 6979 public key fails ecpubverify\n");
+		exits("bad pub");
+	}
+
+	/* every positive check first, then each family of negative checks */
+	end = rfc6979_vectors + nelem(rfc6979_vectors);
+	for(v = rfc6979_vectors; v < end; v++)
+		run_rfc6979(&dom, &pub, v);
+	for(v = rfc6979_vectors; v < end; v++)
+		run_negative_r(&dom, &pub, v);
+	for(v = rfc6979_vectors; v < end; v++)
+		run_negative_digest(&dom, &pub, v);
+
+	run_round_trip(&dom, &key, &pub, "RFC 6979 keypair", 5);
+
+	mpfree(key.d);
+	mpfree(key.y);
+	mpfree(key.x);
+	mpfree(pub.z);
+	mpfree(pub.y);
+	mpfree(pub.x);
+	ecdomfree(&dom);
+
+	print("passed: %d/%d\n", passed, passed+failed);
+	exits(failed ? "fail" : nil);
+}
|