/usr/web/sources/contrib/mospak/tls-1.2/libsec-ecmul-ct-p384.diff

Plan 9 from Bell Labs’s /usr/web/sources/contrib/mospak/tls-1.2/libsec-ecmul-ct-p384.diff

--- sys/src/libsec/port/ecc.c
+++ sys/src/libsec/port/ecc.c
@@ -13,6 +13,7 @@ extern void ecmul_p256(ECdomain *dom, ECpoint *a, mpin
 	mpint *X2, mpint *Y2, mpint *Z2,
 	mpint *X3, mpint *Y3, mpint *Z3);
 extern void ecmul_p256(ECdomain *dom, ECpoint *a, mpint *k, ECpoint *s);
+extern void ecmul_p384(ECdomain *dom, ECpoint *a, mpint *k, ECpoint *s);
 
 void
 ecassign(ECdomain *dom, ECpoint *a, ECpoint *b)
@@ -90,6 +91,22 @@ p256dom_prime(void)
 	return p;
 }
 
+/*
+ * P-384 prime, lazily parsed.  Same dispatch trick as
+ * p256dom_prime: ecmul detects P-384 by mpcmp against dom->p.
+ */
+static mpint*
+p384dom_prime(void)
+{
+	static mpint *p;
+
+	if(p == nil)
+		p = strtomp(
+		"FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFFFF0000000000000000FFFFFFFF",
+			nil, 16, nil);
+	return p;
+}
+
 void
 ecmul(ECdomain *dom, ECpoint *a, mpint *k, ECpoint *s)
 {
@@ -102,6 +119,10 @@ ecmul(ECdomain *dom, ECpoint *a, mpint *k, ECpoint *s)
 	}
 	if(mpcmp(dom->p, p256dom_prime()) == 0){
 		ecmul_p256(dom, a, k, s);
+		return;
+	}
+	if(mpcmp(dom->p, p384dom_prime()) == 0){
+		ecmul_p384(dom, a, k, s);
 		return;
 	}
 	ns.inf = 1;
--- sys/src/libsec/port/mkfile
+++ sys/src/libsec/port/mkfile
@@ -7,6 +7,7 @@ CFILES = des.c desmodes.c desECB.c desCBC.c des3ECB.c 
 	curve25519.c curve25519_dh.c\
 	ecc.c jacobian.c secp256r1.c secp384r1.c\
 	p256.c\
+	p384.c\
 	hmac.c md5.c md5block.c md4.c sha1.c sha1block.c\
 	sha2_64.c sha2_128.c sha2block64.c sha2block128.c\
 	sha1pickle.c md5pickle.c\
@@ -60,4 +61,7 @@ $O.p256timetest: p256timetest.$O
 	$LD -o $target $prereq
 
 $O.p256timetest: p256timetest.$O
+	$LD -o $target $prereq
+
+$O.p384test: p384test.$O
 	$LD -o $target $prereq
--- sys/src/libsec/port/p384.c
+++ sys/src/libsec/port/p384.c
@@ -0,0 +1,883 @@
+#include <u.h>
+#include <libc.h>
+#include <mp.h>
+#include <libsec.h>
+
+/*
+ * P-384 (secp384r1, FIPS 186-4 D.1.2.4) constant-time field
+ * arithmetic and scalar multiplication.  Mirrors p256.c structure
+ * for the larger 384-bit prime; same constant-time discipline,
+ * same homogeneous projective coordinates, same Montgomery ladder.
+ *
+ * Limb layout: 12 little-endian uint limbs (v[0] = LSB), 48 bytes
+ * per field element.  Same width on 386 and amd64; an aliased
+ * v64[6] view is exposed for an ASM accelerator that may layer
+ * atop this code.  All field/point ops use v[].
+ *
+ * P-384 prime from FIPS 186-4 Section D.1.2.4 / RFC 5480
+ * Section 2.1.1.1: p = 2^384 - 2^128 - 2^96 + 2^32 - 1.  Solinas
+ * reduction recipe: NIST SP 800-186 Section F.6.7.
+ *
+ * The single non-static entry point is ecmul_p384, called from
+ * ecc.c's ecmul dispatch when dom->p matches the P-384 prime.
+ * The file holds, in order: boundary conversion, field ops
+ * (additive plus Solinas-based multiplicative), point ops, the
+ * scalar multiplier, and the ecmul_p384 entry point.
+ */
+
+typedef struct P384field P384field;
+typedef struct P384point P384point;
+
+struct P384field {
+	union {
+		uint	v[12];
+		uvlong	v64[6];
+	};
+};
+
+struct P384point {
+	P384field	x;
+	P384field	y;
+	P384field	z;	/* homogeneous projective; z=0 is identity */
+};
+
+/* p = 2^384 - 2^128 - 2^96 + 2^32 - 1, big-endian 48 bytes */
+static uchar p384_p_be[48] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe,
+	0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+};
+
+/* same prime, 32-bit-LE limb form */
+static const uint p384_p[12] = {
+	0xffffffff, 0x00000000, 0x00000000, 0xffffffff,
+	0xfffffffe, 0xffffffff, 0xffffffff, 0xffffffff,
+	0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+};
+
+/*
+ * b = b3312fa7 e23ee7e4 988e056b e3f82d19
+ *     181d9c6e fe814112 0314088f 5013875a
+ *     c656398d 8a2ed19d 2a85c8ed d3ec2aef
+ * Stored as 12 little-endian uint limbs (v[0] = LSB), matching
+ * the layout produced by mpToP384Field on the same 48-byte BE
+ * literal.  Cross-check against secp384r1.c.
+ */
+static P384field p384_b = { { {
+	0xd3ec2aef, 0x2a85c8ed, 0x8a2ed19d, 0xc656398d,
+	0x5013875a, 0x0314088f, 0xfe814112, 0x181d9c6e,
+	0xe3f82d19, 0x988e056b, 0xe23ee7e4, 0xb3312fa7,
+} } };
+
+/*
+ * mp <-> P384field/P384point boundary conversion.  Big-endian byte
+ * order at the mp interface; little-endian-limb (v[0] holds the
+ * least-significant 32 bits) inside.
+ */
+
+/*
+ * Reduce in modulo p384_p and write the residue as 48 big-endian
+ * bytes into buf.
+ */
+static void
+mpToBE48(uchar *buf, mpint *in)
+{
+	mpint *p, *r;
+
+	p = betomp(p384_p_be, sizeof p384_p_be, nil);	/* temporary mpint for the prime; freed below */
+	r = mpnew(0);
+	mpmod(in, p, r);	/* r = in mod p */
+	mptober(r, buf, 48);	/* residue into the 48-byte big-endian buffer */
+	mpfree(r);
+	mpfree(p);
+}
+
+static void
+mpToP384Field(P384field *out, mpint *in)
+{
+	uchar buf[48];
+	int i;
+
+	mpToBE48(buf, in);
+	for(i = 0; i < 12; i++)
+		out->v[i] = ((uint)buf[44-4*i]<<24) |
+			((uint)buf[45-4*i]<<16) |
+			((uint)buf[46-4*i]<<8) |
+			 (uint)buf[47-4*i];
+}
+
+static void
+p384FieldToMp(mpint *out, P384field *in)
+{
+	uchar buf[48];
+	int i;
+
+	for(i = 0; i < 12; i++){
+		buf[47-4*i] = in->v[i];
+		buf[46-4*i] = in->v[i] >> 8;
+		buf[45-4*i] = in->v[i] >> 16;
+		buf[44-4*i] = in->v[i] >> 24;
+	}
+	betomp(buf, sizeof buf, out);
+}
+
+static void
+mpToP384Point(P384point *out, mpint *x, mpint *y)
+{
+	mpToP384Field(&out->x, x);
+	mpToP384Field(&out->y, y);
+	memset(&out->z, 0, sizeof out->z);
+	out->z.v[0] = 1;	/* Z = 1 in 32-bit-limb form */
+}
+
+/*
+ * Precondition: p is already affine (Z = 1 canonical encoding).
+ * Otherwise errstr is set and x and y are zeroed.  A projective
+ * point goes through p384PointToAffine and p384FieldToMp instead.
+ */
+static void
+p384PointToMp(mpint *x, mpint *y, P384point *p)
+{
+	static P384field one = { { { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } };
+
+	if(memcmp(&p->z, &one, sizeof one) != 0){
+		werrstr("p384PointToMp: input not affine");
+		mpassign(mpzero, x);
+		mpassign(mpzero, y);
+		return;
+	}
+	p384FieldToMp(x, &p->x);
+	p384FieldToMp(y, &p->y);
+}
+
+/*
+ * P-384 field operations.  Additive ops (Add/Sub/Neg/Cmov) use
+ * full-length carry-chain loops over 12 limbs; multiplicative ops
+ * (Mul/Sqr/Inv) follow further down and use Solinas reduction.  No
+ * branches or table lookups depend on input limbs; final reductions
+ * select via precomputed masks.
+ *
+ * Carry and borrow bits are pulled from the high half of a 64-bit
+ * uvlong accumulator: a 32-bit add or subtract cannot overflow 64
+ * bits, so the carry-out is exactly the low bit of (acc >> 32).
+ */
+
+static void
+p384FieldAdd(P384field *r, P384field *a, P384field *b)
+{
+	uvlong acc;
+	uint s[12], t[12], carry, borrow, mask, m;
+	int i;
+
+	/* s = a + b mod 2^384; carry holds the 385th bit. */
+	carry = 0;
+	for(i = 0; i < 12; i++){
+		acc = (uvlong)a->v[i] + b->v[i] + carry;
+		s[i] = (uint)acc;
+		carry = (uint)(acc >> 32);
+	}
+
+	/* t = s - p mod 2^384; borrow=1 iff s < p as a 384-bit int. */
+	borrow = 0;
+	for(i = 0; i < 12; i++){
+		acc = (uvlong)s[i] - p384_p[i] - borrow;
+		t[i] = (uint)acc;
+		borrow = (uint)(acc >> 32) & 1;
+	}
+
+	/* Pick s when carry=0 and borrow=1 (s fits in 384 bits and
+	 * is less than p); pick t otherwise (s >= p as a 385-bit
+	 * value, so the wrap of s-p gives the correct residue). */
+	m = borrow & (carry ^ 1);
+	mask = -m;
+	for(i = 0; i < 12; i++)
+		r->v[i] = (s[i] & mask) | (t[i] & ~mask);
+}
+
+static void
+p384FieldSub(P384field *r, P384field *a, P384field *b)
+{
+	uvlong acc;
+	uint t[12], s[12], carry, borrow, mask;
+	int i;
+
+	/* t = a - b mod 2^384; borrow=1 iff true value is negative. */
+	borrow = 0;
+	for(i = 0; i < 12; i++){
+		acc = (uvlong)a->v[i] - b->v[i] - borrow;
+		t[i] = (uint)acc;
+		borrow = (uint)(acc >> 32) & 1;
+	}
+
+	/* s = t + p mod 2^384.  When t underflowed (borrow=1) the
+	 * mathematical t is a-b+2^384, so s = a-b+2^384+p mod 2^384
+	 * = a-b+p, the correct non-negative residue. */
+	carry = 0;
+	for(i = 0; i < 12; i++){
+		acc = (uvlong)t[i] + p384_p[i] + carry;
+		s[i] = (uint)acc;
+		carry = (uint)(acc >> 32);
+	}
+
+	/* Pick s when borrow=1, else t (already in [0, p)). */
+	mask = -borrow;
+	for(i = 0; i < 12; i++)
+		r->v[i] = (s[i] & mask) | (t[i] & ~mask);
+}
+
+static void
+p384FieldNeg(P384field *r, P384field *a)	/* r = -a mod p */
+{
+	P384field zero;
+
+	memset(&zero, 0, sizeof zero);
+	p384FieldSub(r, &zero, a);	/* 0 - a; Sub adds p back when the subtract borrows */
+}
+
+static void
+p384FieldCmov(P384field *r, P384field *a, P384field *b, int cond)
+{
+	uint c, nz, mask;
+	int i;
+
+	/* Collapse any nonzero cond to all-1s without branching:
+	 * for unsigned 32-bit c, (c | -c) has bit 31 set iff c != 0;
+	 * shifting that down to bit 0 and negating gives the full
+	 * mask.  The behaviour matches the C ternary cond ? b : a. */
+	c = (uint)cond;
+	nz = (c | (0u - c)) >> 31;
+	mask = -nz;
+	for(i = 0; i < 12; i++)
+		r->v[i] = (a->v[i] & ~mask) | (b->v[i] & mask);
+}
+
+/*
+ * 12x12 schoolbook multiply: T[0..23] = a[0..11] * b[0..11] as a
+ * 24-limb 32-bit-LE bignum.  T limbs are uvlong-typed but each
+ * stays in [0, 2^32) at the boundaries; the inner accumulator
+ * carries into the next limb.  Inputs are canonical (each limb
+ * < 2^32; full value < 2^384), so the partial-product sum stays
+ * below 2^768 and every accumulator step fits in 64 bits.
+ */
+static void
+p384_mul_12x12(uvlong T[24], uint *a, uint *b)
+{
+	uvlong acc;
+	int i, j;
+
+	for(i = 0; i < 24; i++)
+		T[i] = 0;
+	for(i = 0; i < 12; i++){
+		acc = 0;
+		for(j = 0; j < 12; j++){
+			acc += T[i+j] + (uvlong)a[i] * b[j];
+			T[i+j] = acc & 0xffffffff;
+			acc >>= 32;
+		}
+		T[i+12] = acc;
+	}
+}
+
+/*
+ * NIST P-384 Solinas reduction.  Folds the 12 high limbs of T
+ * (T[12..23]) into the lower 12 using the prime structure
+ *     2^384 = 2^128 + 2^96 - 2^32 + 1   (mod p).
+ *
+ * Recipe (FIPS 186-4 Appendix D.2.4 / NIST SP 800-186 Section
+ * F.6.7).  Each Si/Di is a 12-limb word with v[0] LSB; the NIST
+ * text lists each as a high-to-low concatenation of selected
+ * input limbs (a0 = T[0] = LSB).
+ *
+ *   T  = (a11 a10 a9  a8  a7  a6  a5  a4  a3  a2  a1  a0)
+ *   S1 = (0   0   0   0   0   a23 a22 a21 0   0   0   0 )
+ *   S2 = (a23 a22 a21 a20 a19 a18 a17 a16 a15 a14 a13 a12)
+ *   S3 = (a20 a19 a18 a17 a16 a15 a14 a13 a12 a23 a22 a21)
+ *   S4 = (a19 a18 a17 a16 a15 a14 a13 a12 a20 0   a23 0 )
+ *   S5 = (0   0   0   0   a23 a22 a21 a20 0   0   0   0 )
+ *   S6 = (0   0   0   0   0   0   a23 a22 a21 0   0   a20)
+ *   D1 = (a22 a21 a20 a19 a18 a17 a16 a15 a14 a13 a12 a23)
+ *   D2 = (0   0   0   0   0   0   0   a23 a22 a21 a20 0 )
+ *   D3 = (0   0   0   0   0   0   0   a23 a23 0   0   0 )
+ *
+ * R = T + 2*S1 + S2 + S3 + S4 + S5 + S6 - D1 - D2 - D3 (mod p).
+ *
+ * The recipe was verified symbolically against full-precision
+ * reduction for 200 random 768-bit inputs and 200 random 12x12
+ * products (200/200 + 200/200 matches) before this code was
+ * written.
+ *
+ * pos = T + 2*S1 + S2 + S3 + S4 + S5 + S6 fits below 8 * 2^384
+ * (max coefficient sum 1+2+1+1+1+1+1 = 8 against a 12-limb max);
+ * neg = D1 + D2 + D3 fits below 3 * 2^384.  After biasing pos
+ * with 3*p the signed difference pos + 3p - neg is non-negative
+ * and below ~11*p; a fixed sequence of conditional subtractions
+ * of p brings the residue into [0, p).  Working accumulators are
+ * 13 32-bit limbs wide to capture the carry above 2^384.
+ */
+
+/* Add a 12-limb little-endian value t into a 13-limb accumulator
+ * buf, weighted by `weight` (1 or 2). */
+static void
+p384_add_term(uvlong buf[13], uint t[12], int weight)
+{
+	uvlong acc;
+	int i;
+
+	acc = 0;
+	for(i = 0; i < 12; i++){
+		acc += buf[i] + (uvlong)t[i] * weight;
+		buf[i] = acc & 0xffffffff;
+		acc >>= 32;
+	}
+	buf[12] += acc;
+}
+
+/* Add 3*p into a 13-limb accumulator (used to bias the signed
+ * difference non-negative).  3*p fits in 386 bits; the top limb
+ * holds 2 bits.  Limb-encoded form of 3 * (2^384 - 2^128 - 2^96
+ * + 2^32 - 1). */
+static void
+p384_add_3p(uvlong buf[13])
+{
+	static const uvlong threep[13] = {
+		0xfffffffdULL, 0x00000002ULL, 0x00000000ULL, 0xfffffffdULL,
+		0xfffffffcULL, 0xffffffffULL, 0xffffffffULL, 0xffffffffULL,
+		0xffffffffULL, 0xffffffffULL, 0xffffffffULL, 0xffffffffULL,
+		0x00000002ULL,
+	};
+	uvlong acc;
+	int i;
+
+	acc = 0;
+	for(i = 0; i < 13; i++){
+		acc += buf[i] + threep[i];
+		buf[i] = acc & 0xffffffff;
+		acc >>= 32;
+	}
+}
+
+/* sub_buf13: out = a - b in 13-limb arithmetic.  Returns the
+ * borrow-out (always 0 for our pre-biased values, but kept for
+ * defensive use). */
+static uint
+p384_sub_buf13(uvlong out[13], uvlong a[13], uvlong b[13])
+{
+	uvlong acc;
+	uint borrow;
+	int i;
+
+	borrow = 0;
+	for(i = 0; i < 13; i++){
+		acc = a[i] - b[i];
+		acc -= borrow;
+		out[i] = acc & 0xffffffff;
+		borrow = (uint)(acc >> 32) & 1;
+	}
+	return borrow;
+}
+
+/* Constant-time reduction of a 13-limb value into [0, p).  Each of
+ * the `rounds` passes subtracts p into scratch and mask-selects
+ * scratch when buf >= p (no borrow), else keeps buf.  buf limbs
+ * must each be < 2^32; buf is clobbered; the low 12 limbs land in
+ * r.  The result is fully reduced when buf < (rounds+1)*p on entry. */
+static void
+p384_final_reduce(uint r[12], uvlong buf[13], int rounds)
+{
+	uvlong scratch[13];
+	uvlong acc, mask;
+	uint borrow;
+	int i, k;
+	static const uvlong p13[13] = {
+		0xffffffffULL, 0x00000000ULL, 0x00000000ULL, 0xffffffffULL,
+		0xfffffffeULL, 0xffffffffULL, 0xffffffffULL, 0xffffffffULL,
+		0xffffffffULL, 0xffffffffULL, 0xffffffffULL, 0xffffffffULL,
+		0x00000000ULL,
+	};
+
+	for(k = 0; k < rounds; k++){
+		borrow = 0;
+		for(i = 0; i < 13; i++){
+			acc = buf[i] - p13[i];
+			acc -= borrow;
+			scratch[i] = acc & 0xffffffff;
+			borrow = (uint)(acc >> 32) & 1;
+		}
+		/* borrow=0 means buf >= p: take scratch.  borrow=1 means
+		 * buf < p: keep buf.  mask is uvlong like the limbs so
+		 * the select never depends on implicit zero-extension. */
+		mask = -(uvlong)(borrow ^ 1);
+		for(i = 0; i < 13; i++)
+			buf[i] = (scratch[i] & mask) | (buf[i] & ~mask);
+	}
+	for(i = 0; i < 12; i++)
+		r[i] = (uint)buf[i];
+}
+
+static void
+p384FieldMul(P384field *r, P384field *a, P384field *b)
+{
+	uvlong T[24];
+	uvlong pos[13], neg[13], diff[13];
+	uint S1[12], S2[12], S3[12], S4[12], S5[12], S6[12];
+	uint D1[12], D2[12], D3[12];
+	uint t[24];
+	int i;
+
+	/* 12x12 schoolbook product into 24 limbs. */
+	p384_mul_12x12(T, a->v, b->v);
+	for(i = 0; i < 24; i++)
+		t[i] = (uint)T[i];
+
+	/* Build the six S vectors and three D vectors per the recipe.
+	 * Indexing: position i in the limb array carries 2^(32i); the
+	 * NIST table reads MSB-first (position 11 is leftmost).  Empty
+	 * slots are zero. */
+
+	/* S1: a21 a22 a23 at limbs 4..6. */
+	for(i = 0; i < 12; i++) S1[i] = 0;
+	S1[4] = t[21]; S1[5] = t[22]; S1[6] = t[23];
+
+	/* S2 = high half of T (limbs 12..23). */
+	for(i = 0; i < 12; i++) S2[i] = t[12+i];
+
+	/* S3: rotation of high half. */
+	S3[0]=t[21]; S3[1]=t[22]; S3[2]=t[23]; S3[3]=t[12];
+	S3[4]=t[13]; S3[5]=t[14]; S3[6]=t[15]; S3[7]=t[16];
+	S3[8]=t[17]; S3[9]=t[18]; S3[10]=t[19]; S3[11]=t[20];
+
+	/* S4. */
+	S4[0]=0;     S4[1]=t[23]; S4[2]=0;     S4[3]=t[20];
+	S4[4]=t[12]; S4[5]=t[13]; S4[6]=t[14]; S4[7]=t[15];
+	S4[8]=t[16]; S4[9]=t[17]; S4[10]=t[18]; S4[11]=t[19];
+
+	/* S5: a20..a23 at limbs 4..7. */
+	for(i = 0; i < 12; i++) S5[i] = 0;
+	S5[4]=t[20]; S5[5]=t[21]; S5[6]=t[22]; S5[7]=t[23];
+
+	/* S6. */
+	for(i = 0; i < 12; i++) S6[i] = 0;
+	S6[0]=t[20]; S6[3]=t[21]; S6[4]=t[22]; S6[5]=t[23];
+
+	/* D1: rotated high half with a23 at the low limb. */
+	D1[0]=t[23]; D1[1]=t[12]; D1[2]=t[13]; D1[3]=t[14];
+	D1[4]=t[15]; D1[5]=t[16]; D1[6]=t[17]; D1[7]=t[18];
+	D1[8]=t[19]; D1[9]=t[20]; D1[10]=t[21]; D1[11]=t[22];
+
+	/* D2: a20..a23 at limbs 1..4. */
+	for(i = 0; i < 12; i++) D2[i] = 0;
+	D2[1]=t[20]; D2[2]=t[21]; D2[3]=t[22]; D2[4]=t[23];
+
+	/* D3: a23 twice at limbs 3..4. */
+	for(i = 0; i < 12; i++) D3[i] = 0;
+	D3[3]=t[23]; D3[4]=t[23];
+
+	/* pos = T + 2*S1 + S2 + S3 + S4 + S5 + S6. */
+	for(i = 0; i < 13; i++)
+		pos[i] = 0;
+	for(i = 0; i < 12; i++)
+		pos[i] = t[i];
+	p384_add_term(pos, S1, 2);
+	p384_add_term(pos, S2, 1);
+	p384_add_term(pos, S3, 1);
+	p384_add_term(pos, S4, 1);
+	p384_add_term(pos, S5, 1);
+	p384_add_term(pos, S6, 1);
+
+	/* neg = D1 + D2 + D3. */
+	for(i = 0; i < 13; i++)
+		neg[i] = 0;
+	p384_add_term(neg, D1, 1);
+	p384_add_term(neg, D2, 1);
+	p384_add_term(neg, D3, 1);
+
+	/* Bias non-negative: D2 and D3 only reach limbs 0..4, so
+	 * neg < 2^384 + 2^162 < 3p and pos + 3p - neg >= 0. */
+	p384_add_3p(pos);
+
+	/* diff = pos - neg.  Result lies in [0, ~11p). */
+	(void)p384_sub_buf13(diff, pos, neg);
+
+	/* 14 rounds of CT subtract-or-keep brings diff into [0, p);
+	 * 11 would suffice for the bound above, slack guards reasoning
+	 * gaps in the worst-case analysis. */
+	p384_final_reduce(r->v, diff, 14);
+}
+
+static void
+p384FieldSqr(P384field *r, P384field *a)
+{
+	/* BUGlet: squaring delegates to Mul; a dedicated symmetric
+	 * kernel that exploits a*b == b*a in the partial-product
+	 * matrix would save ~25% on this hot path.  Out of scope
+	 * for the initial constant-time landing. */
+	p384FieldMul(r, a, a);
+}
+
+/*
+ * Inversion via Fermat's little theorem: a^(p-2) = a^-1 mod p.
+ * Exponent
+ *     p - 2 = 2^384 - 2^128 - 2^96 + 2^32 - 3
+ * has the bit pattern (MSB to LSB):
+ *     255 ones | 1 zero | 32 ones | 64 zeros | 30 ones | 1 zero | 1 one
+ *
+ * Decomposition:
+ *     p - 2 = (2^255 - 1) << 129
+ *           + (2^32  - 1) << 96
+ *           + (2^32  - 3)
+ * (verified symbolically; sums to p - 2).
+ *
+ * Plan, working with running exponent of `acc`:
+ *   1. Build the helper f_n = a^(2^n - 1) for n in
+ *      {2,3,6,12,15,16,30,32,48,64,128,192,240,252,255}.
+ *   2. acc = f255; square 129 times -> acc = a^(2^384 - 2^129).
+ *   3. Multiply by (f32 squared 96 times) = a^(2^128 - 2^96).
+ *   4. Multiply by (f30 squared 2 times then * a) = a^(2^32 - 3).
+ *
+ * Cost: ~498 squarings + 18 multiplications.  All operations run
+ * in time independent of the operand value; the exponent is the
+ * public prime constant, so unrolling on its bit pattern leaks
+ * nothing.
+ */
+static void
+p384FieldInv(P384field *r, P384field *a)
+{
+	P384field f1, f2, f3, f6, f12, f15, f16, f30, f32, f48;
+	P384field f64, f128, f192, f240, f252, f255;
+	P384field acc, t;
+	int i;
+
+	f1 = *a;					/* a^(2^1 - 1) */
+
+	p384FieldSqr(&t, &f1);				/* a^2 */
+	p384FieldMul(&f2, &t, &f1);			/* a^(2^2 - 1) */
+
+	p384FieldSqr(&t, &f2);				/* a^6 */
+	p384FieldMul(&f3, &t, &f1);			/* a^(2^3 - 1) */
+
+	t = f3;
+	for(i = 0; i < 3; i++)
+		p384FieldSqr(&t, &t);
+	p384FieldMul(&f6, &t, &f3);			/* a^(2^6 - 1) */
+
+	t = f6;
+	for(i = 0; i < 6; i++)
+		p384FieldSqr(&t, &t);
+	p384FieldMul(&f12, &t, &f6);			/* a^(2^12 - 1) */
+
+	t = f12;
+	for(i = 0; i < 3; i++)
+		p384FieldSqr(&t, &t);
+	p384FieldMul(&f15, &t, &f3);			/* a^(2^15 - 1) */
+
+	p384FieldSqr(&t, &f15);
+	p384FieldMul(&f16, &t, &f1);			/* a^(2^16 - 1) */
+
+	t = f15;
+	for(i = 0; i < 15; i++)
+		p384FieldSqr(&t, &t);
+	p384FieldMul(&f30, &t, &f15);			/* a^(2^30 - 1) */
+
+	t = f30;
+	for(i = 0; i < 2; i++)
+		p384FieldSqr(&t, &t);
+	p384FieldMul(&f32, &t, &f2);			/* a^(2^32 - 1) */
+
+	t = f32;
+	for(i = 0; i < 16; i++)
+		p384FieldSqr(&t, &t);
+	p384FieldMul(&f48, &t, &f16);			/* a^(2^48 - 1) */
+
+	t = f32;
+	for(i = 0; i < 32; i++)
+		p384FieldSqr(&t, &t);
+	p384FieldMul(&f64, &t, &f32);			/* a^(2^64 - 1) */
+
+	t = f64;
+	for(i = 0; i < 64; i++)
+		p384FieldSqr(&t, &t);
+	p384FieldMul(&f128, &t, &f64);			/* a^(2^128 - 1) */
+
+	t = f128;
+	for(i = 0; i < 64; i++)
+		p384FieldSqr(&t, &t);
+	p384FieldMul(&f192, &t, &f64);			/* a^(2^192 - 1) */
+
+	t = f192;
+	for(i = 0; i < 48; i++)
+		p384FieldSqr(&t, &t);
+	p384FieldMul(&f240, &t, &f48);			/* a^(2^240 - 1) */
+
+	t = f240;
+	for(i = 0; i < 12; i++)
+		p384FieldSqr(&t, &t);
+	p384FieldMul(&f252, &t, &f12);			/* a^(2^252 - 1) */
+
+	t = f252;
+	for(i = 0; i < 3; i++)
+		p384FieldSqr(&t, &t);
+	p384FieldMul(&f255, &t, &f3);			/* a^(2^255 - 1) */
+
+	/* acc = a^((2^255 - 1) * 2^129) = a^(2^384 - 2^129). */
+	acc = f255;
+	for(i = 0; i < 129; i++)
+		p384FieldSqr(&acc, &acc);
+
+	/* t = a^((2^32 - 1) * 2^96) = a^(2^128 - 2^96). */
+	t = f32;
+	for(i = 0; i < 96; i++)
+		p384FieldSqr(&t, &t);
+	p384FieldMul(&acc, &acc, &t);
+	/* acc = a^(2^384 - 2^128 - 2^96). */
+
+	/* t = a^((2^30 - 1) * 4 + 1) = a^(2^32 - 3). */
+	t = f30;
+	p384FieldSqr(&t, &t);
+	p384FieldSqr(&t, &t);
+	p384FieldMul(&t, &t, &f1);
+	p384FieldMul(r, &acc, &t);
+	/* r = a^(p - 2). */
+}
+
+/*
+ * P-384 point arithmetic in homogeneous projective coordinates
+ * (X:Y:Z) with affine map x = X/Z, y = Y/Z; identity is Z = 0.
+ * All routines run in time independent of operand value: every
+ * field op executes regardless of input, and identity / doubling
+ * / inverse-pair cases emerge from the algebra without a branch.
+ */
+
+static void
+p384PointAdd(P384point *r, P384point *a, P384point *b)
+{
+	P384field X1, Y1, Z1, X2, Y2, Z2, X3, Y3, Z3;
+	P384field t0, t1, t2, t3, t4;
+
+	X1 = a->x; Y1 = a->y; Z1 = a->z;
+	X2 = b->x; Y2 = b->y; Z2 = b->z;
+
+	/* RCB 2016 Algorithm 4, a = -3: 43 field operations.  Mirrors
+	 * p256PointAdd; only the field width differs. */
+	p384FieldMul(&t0, &X1, &X2);		/* t0 = X1*X2 */
+	p384FieldMul(&t1, &Y1, &Y2);		/* t1 = Y1*Y2 */
+	p384FieldMul(&t2, &Z1, &Z2);		/* t2 = Z1*Z2 */
+	p384FieldAdd(&t3, &X1, &Y1);		/* t3 = X1+Y1 */
+	p384FieldAdd(&t4, &X2, &Y2);		/* t4 = X2+Y2 */
+	p384FieldMul(&t3, &t3, &t4);		/* t3 = t3*t4 */
+	p384FieldAdd(&t4, &t0, &t1);		/* t4 = t0+t1 */
+	p384FieldSub(&t3, &t3, &t4);		/* t3 = t3-t4 */
+	p384FieldAdd(&t4, &Y1, &Z1);		/* t4 = Y1+Z1 */
+	p384FieldAdd(&X3, &Y2, &Z2);		/* X3 = Y2+Z2 */
+	p384FieldMul(&t4, &t4, &X3);		/* t4 = t4*X3 */
+	p384FieldAdd(&X3, &t1, &t2);		/* X3 = t1+t2 */
+	p384FieldSub(&t4, &t4, &X3);		/* t4 = t4-X3 */
+	p384FieldAdd(&X3, &X1, &Z1);		/* X3 = X1+Z1 */
+	p384FieldAdd(&Y3, &X2, &Z2);		/* Y3 = X2+Z2 */
+	p384FieldMul(&X3, &X3, &Y3);		/* X3 = X3*Y3 */
+	p384FieldAdd(&Y3, &t0, &t2);		/* Y3 = t0+t2 */
+	p384FieldSub(&Y3, &X3, &Y3);		/* Y3 = X3-Y3 */
+	p384FieldMul(&Z3, &p384_b, &t2);	/* Z3 = b*t2 */
+	p384FieldSub(&X3, &Y3, &Z3);		/* X3 = Y3-Z3 */
+	p384FieldAdd(&Z3, &X3, &X3);		/* Z3 = X3+X3 */
+	p384FieldAdd(&X3, &X3, &Z3);		/* X3 = X3+Z3 */
+	p384FieldSub(&Z3, &t1, &X3);		/* Z3 = t1-X3 */
+	p384FieldAdd(&X3, &t1, &X3);		/* X3 = t1+X3 */
+	p384FieldMul(&Y3, &p384_b, &Y3);	/* Y3 = b*Y3 */
+	p384FieldAdd(&t1, &t2, &t2);		/* t1 = t2+t2 */
+	p384FieldAdd(&t2, &t1, &t2);		/* t2 = t1+t2 */
+	p384FieldSub(&Y3, &Y3, &t2);		/* Y3 = Y3-t2 */
+	p384FieldSub(&Y3, &Y3, &t0);		/* Y3 = Y3-t0 */
+	p384FieldAdd(&t1, &Y3, &Y3);		/* t1 = Y3+Y3 */
+	p384FieldAdd(&Y3, &t1, &Y3);		/* Y3 = t1+Y3 */
+	p384FieldAdd(&t1, &t0, &t0);		/* t1 = t0+t0 */
+	p384FieldAdd(&t0, &t1, &t0);		/* t0 = t1+t0 */
+	p384FieldSub(&t0, &t0, &t2);		/* t0 = t0-t2 */
+	p384FieldMul(&t1, &t4, &Y3);		/* t1 = t4*Y3 */
+	p384FieldMul(&t2, &t0, &Y3);		/* t2 = t0*Y3 */
+	p384FieldMul(&Y3, &X3, &Z3);		/* Y3 = X3*Z3 */
+	p384FieldAdd(&Y3, &Y3, &t2);		/* Y3 = Y3+t2 */
+	p384FieldMul(&X3, &t3, &X3);		/* X3 = t3*X3 */
+	p384FieldSub(&X3, &X3, &t1);		/* X3 = X3-t1 */
+	p384FieldMul(&Z3, &t4, &Z3);		/* Z3 = t4*Z3 */
+	p384FieldMul(&t1, &t3, &t0);		/* t1 = t3*t0 */
+	p384FieldAdd(&Z3, &Z3, &t1);		/* Z3 = Z3+t1 */
+
+	r->x = X3; r->y = Y3; r->z = Z3;
+}
+
+static void
+p384PointDouble(P384point *r, P384point *a)
+{
+	P384field X1, Y1, Z1, X3, Y3, Z3;
+	P384field t0, t1, t2, t3;
+
+	X1 = a->x; Y1 = a->y; Z1 = a->z;
+
+	/* RCB 2016 Algorithm 6, a = -3.  Exception-free doubling. */
+	p384FieldSqr(&t0, &X1);			/* t0 = X^2 */
+	p384FieldSqr(&t1, &Y1);			/* t1 = Y^2 */
+	p384FieldSqr(&t2, &Z1);			/* t2 = Z^2 */
+	p384FieldMul(&t3, &X1, &Y1);		/* t3 = X*Y */
+	p384FieldAdd(&t3, &t3, &t3);		/* t3 = t3+t3 */
+	p384FieldMul(&Z3, &X1, &Z1);		/* Z3 = X*Z */
+	p384FieldAdd(&Z3, &Z3, &Z3);		/* Z3 = Z3+Z3 */
+	p384FieldMul(&Y3, &p384_b, &t2);	/* Y3 = b*t2 */
+	p384FieldSub(&Y3, &Y3, &Z3);		/* Y3 = Y3-Z3 */
+	p384FieldAdd(&X3, &Y3, &Y3);		/* X3 = Y3+Y3 */
+	p384FieldAdd(&Y3, &X3, &Y3);		/* Y3 = X3+Y3 */
+	p384FieldSub(&X3, &t1, &Y3);		/* X3 = t1-Y3 */
+	p384FieldAdd(&Y3, &t1, &Y3);		/* Y3 = t1+Y3 */
+	p384FieldMul(&Y3, &X3, &Y3);		/* Y3 = X3*Y3 */
+	p384FieldMul(&X3, &X3, &t3);		/* X3 = X3*t3 */
+	p384FieldAdd(&t3, &t2, &t2);		/* t3 = t2+t2 */
+	p384FieldAdd(&t2, &t2, &t3);		/* t2 = t2+t3 */
+	p384FieldMul(&Z3, &p384_b, &Z3);	/* Z3 = b*Z3 */
+	p384FieldSub(&Z3, &Z3, &t2);		/* Z3 = Z3-t2 */
+	p384FieldSub(&Z3, &Z3, &t0);		/* Z3 = Z3-t0 */
+	p384FieldAdd(&t3, &Z3, &Z3);		/* t3 = Z3+Z3 */
+	p384FieldAdd(&Z3, &Z3, &t3);		/* Z3 = Z3+t3 */
+	p384FieldAdd(&t3, &t0, &t0);		/* t3 = t0+t0 */
+	p384FieldAdd(&t0, &t3, &t0);		/* t0 = t3+t0 */
+	p384FieldSub(&t0, &t0, &t2);		/* t0 = t0-t2 */
+	p384FieldMul(&t0, &t0, &Z3);		/* t0 = t0*Z3 */
+	p384FieldAdd(&Y3, &Y3, &t0);		/* Y3 = Y3+t0 */
+	p384FieldMul(&t0, &Y1, &Z1);		/* t0 = Y*Z */
+	p384FieldAdd(&t0, &t0, &t0);		/* t0 = t0+t0 */
+	p384FieldMul(&Z3, &t0, &Z3);		/* Z3 = t0*Z3 */
+	p384FieldSub(&X3, &X3, &Z3);		/* X3 = X3-Z3 */
+	p384FieldMul(&Z3, &t0, &t1);		/* Z3 = t0*t1 */
+	p384FieldAdd(&Z3, &Z3, &Z3);		/* Z3 = Z3+Z3 */
+	p384FieldAdd(&Z3, &Z3, &Z3);		/* Z3 = Z3+Z3 */
+
+	r->x = X3; r->y = Y3; r->z = Z3;
+}
+
+/*
+ * Project (X:Y:Z) to the affine pair (X/Z, Y/Z).  One field
+ * inversion per scalar mul -- the most expensive single op, so
+ * callers fold to once-per-result rather than once-per-step.
+ * Precondition: Z != 0; calling with the identity yields garbage
+ * (callers detect Z = 0 and reject before reaching us).
+ */
+static void
+p384PointToAffine(P384field *x, P384field *y, P384point *p)
+{
+	P384field zinv;
+
+	p384FieldInv(&zinv, &p->z);	/* zinv = Z^(p-2) = 1/Z */
+	p384FieldMul(x, &p->x, &zinv);	/* x = X/Z */
+	p384FieldMul(y, &p->y, &zinv);	/* y = Y/Z */
+}
+
+/*
+ * P-384 scalar multiplication via the Montgomery ladder.
+ * Constant-time wrt the scalar: loop count is fixed at 384 (the
+ * bit length of the group order n, FIPS 186-4 D.1.2.4), every
+ * iteration runs one point add and one point double regardless
+ * of the bit value, and the branch on each scalar bit is replaced
+ * by a bit-mask conditional swap of (R0, R1).
+ *
+ * Identity in this representation is (0:1:0), not (0:0:0): RCB
+ * 2016 Algorithm 4 requires Y != 0 even when Z = 0, otherwise
+ * the formula degenerates.  R0 is initialised to (0:1:0).
+ *
+ * Reference: SEC 1 v2.0 Section 3.2; Joye-Yen "The Montgomery
+ * powering ladder", CHES 2002.
+ */
+
+static void
+p384PointCswap(P384point *a, P384point *b, int swap)
+{
+	P384point tmpA, tmpB;	/* staging copies so a and b stay intact while read */
+
+	p384FieldCmov(&tmpA.x, &a->x, &b->x, swap);	/* tmpA = swap ? b : a */
+	p384FieldCmov(&tmpA.y, &a->y, &b->y, swap);
+	p384FieldCmov(&tmpA.z, &a->z, &b->z, swap);
+	p384FieldCmov(&tmpB.x, &b->x, &a->x, swap);	/* tmpB = swap ? a : b */
+	p384FieldCmov(&tmpB.y, &b->y, &a->y, swap);
+	p384FieldCmov(&tmpB.z, &b->z, &a->z, swap);
+	*a = tmpA;	/* write back only after both selections are complete */
+	*b = tmpB;
+}
+
+static void
+p384ScalarMul(P384point *r, mpint *k, P384point *P)
+{
+	P384point R0, R1;
+	uchar k_be[48];
+	int i, bit, prev_bit, swap;
+
+	/* R0 = identity (0:1:0); R1 = P. */
+	memset(&R0, 0, sizeof R0);
+	R0.y.v[0] = 1;
+	R1 = *P;
+
+	/* Serialize k as 48 big-endian bytes.  mptober left-pads with
+	 * zeros if k is shorter than 48 bytes; if k were ever larger
+	 * than 2^384 the high bytes would be clipped, but callers feed
+	 * scalars already reduced mod n (n < 2^384). */
+	mptober(k, k_be, sizeof k_be);
+
+	prev_bit = 0;
+	for(i = 383; i >= 0; i--){
+		bit = (k_be[47 - i/8] >> (i & 7)) & 1;
+		swap = prev_bit ^ bit;
+		p384PointCswap(&R0, &R1, swap);
+		prev_bit = bit;
+		p384PointAdd(&R1, &R0, &R1);
+		p384PointDouble(&R0, &R0);
+	}
+	/* If the final scalar bit was 1, R0 and R1 are swapped from
+	 * what the invariant requires; undo. */
+	p384PointCswap(&R0, &R1, prev_bit);
+
+	*r = R0;
+}
+
+/*
+ * P-384 constant-time scalar multiply: s = k*a.  Sign and
+ * identity normalisation match the generic ecmul path so callers
+ * see one ecmul contract regardless of curve.  This is the only
+ * non-static symbol in this file; ecc.c declares it extern and
+ * dispatches to it when dom->p matches the P-384 prime.
+ */
+void
+ecmul_p384(ECdomain *dom, ECpoint *a, mpint *k, ECpoint *s)
+{
+	P384point P, R;
+	P384field rx, ry, zero;
+	mpint *kk, *yneg;
+
+	kk = mpcopy(k);	/* private copy; the caller's k is not modified */
+	kk->sign = 1;	/* multiply by |k|; the sign is reapplied to y below */
+	mpmod(kk, dom->n, kk);	/* reduce |k| mod n so it fits the 384-bit ladder */
+	mpToP384Point(&P, a->x, a->y);	/* affine input with Z = 1; a->inf and a->z are not read */
+	p384ScalarMul(&R, kk, &P);
+	mpfree(kk);
+
+	/* Identity from k*P = O.  Non-CT check is fine: the only
+	 * secret is k, and "k*P = O" only fires for k a multiple of
+	 * the group order n; TLS callers feed k in [1, n-1] so the
+	 * leak surface is empty in practice. */
+	memset(&zero, 0, sizeof zero);
+	if(memcmp(&R.z, &zero, sizeof zero) == 0){
+		s->inf = 1;	/* s->x, s->y and s->z are left untouched on this path */
+		return;
+	}
+	p384PointToAffine(&rx, &ry, &R);
+	s->inf = 0;
+	p384FieldToMp(s->x, &rx);
+	p384FieldToMp(s->y, &ry);
+	if(s->z != nil)
+		mpassign(mpone, s->z);	/* result is affine, so Z = 1 when the caller keeps a z */
+	if(k->sign < 0){
+		/* (-y) mod p == p - y for y in (0, p); y = 0 would give p. */
+		yneg = mpnew(0);
+		mpsub(dom->p, s->y, yneg);
+		mpassign(yneg, s->y);
+		mpfree(yneg);
+	}
+}
--- sys/src/libsec/port/p384test.c
+++ sys/src/libsec/port/p384test.c
@@ -0,0 +1,322 @@
+#include <u.h>
+#include <libc.h>
+#include <mp.h>
+#include <libsec.h>
+
+/*
+ * Regression vectors for the constant-time P-384 path in libsec.
+ * Drives ecmul_p384 (via ecdsaverify and a sign+verify round-trip)
+ * so any miscompile or aliasing fault in p384.c surfaces here, not
+ * in TLS handshake debugging.
+ *
+ * Vector sources cited per-row:
+ *   RFC 6979 Appendix A.2.6 -- deterministic ECDSA over P-384;
+ *     given (d, msg, hash) the signature (r, s) is uniquely
+ *     determined and reproducible across implementations.
+ *   FIPS 186-4 Appendix D.1.2.4 -- curve parameters.
+ *
+ * What is exercised:
+ *   1. RFC 6979 verify: each row's (r, s) verifies against the
+ *      public key derived from d.  ecdsaverify drives ecmul on
+ *      the generator and on the public point; the constant-time
+ *      P-384 path receives both.
+ *   2. Sign+verify round-trip: ecdsasign produces (r, s) with a
+ *      random k; the same key set then verifies the result.
+ *      Drives ecmul through ecgen (signing direction) plus
+ *      ecdsaverify.
+ *   3. Negative tests: a perturbed r (r+1) or a tampered digest
+ *      must fail verify.  Catches a permissive ecdsaverify or a
+ *      degenerate ecmul that returns identity for all inputs.
+ */
+
+typedef struct EcdsaVector EcdsaVector;
+struct EcdsaVector {
+	char	*name;	/* label printed by report() */
+	char	*msg;	/* NUL-terminated message that gets hashed */
+	int	(*hash)(uchar *in, ulong inlen, uchar *out);	/* returns digest length */
+	int	dlen;	/* digest length the hash is expected to return */
+	char	*r_hex;	/* signature r as a hex string */
+	char	*s_hex;	/* signature s as a hex string */
+};
+
+/*
+ * RFC 6979 Appendix A.2.6 shared key pair.
+ *
+ *   d  = 6B9D3DAD2E1B8C1C05B19875B6659F4DE23C3B667BF297BA9AA47740
+ *        787137D896D5724E4C70A825F872C9EA60D2EDF5
+ *   Qx = EC3A4E415B4E19A4568618029F427FA5DA9A8BC4AE92E02E06AAE528
+ *        6B300C64DEF8F0EA9055866064A254515480BC13
+ *   Qy = 8015D9B72D7D57244EA8EF9AC0C621896708A59367F9DFB9F54CA84B
+ *        3F1C9DB1288B231C3AE0D4FE7344FD2533264720
+ */
+static char *rfc6979_d  =	/* private key d, hex */
+	"6B9D3DAD2E1B8C1C05B19875B6659F4DE23C3B667BF297BA9AA47740"
+	"787137D896D5724E4C70A825F872C9EA60D2EDF5";
+static char *rfc6979_Qx =	/* public key x coordinate, hex */
+	"EC3A4E415B4E19A4568618029F427FA5DA9A8BC4AE92E02E06AAE528"
+	"6B300C64DEF8F0EA9055866064A254515480BC13";
+static char *rfc6979_Qy =	/* public key y coordinate, hex */
+	"8015D9B72D7D57244EA8EF9AC0C621896708A59367F9DFB9F54CA84B"
+	"3F1C9DB1288B231C3AE0D4FE7344FD2533264720";
+
+/*
+ * Hash adaptors: project each digest function into a uniform
+ * (in, len, out) -> dlen signature so the table stays narrow.
+ */
+static int
+hash_sha256(uchar *in, ulong inlen, uchar *out)
+{
+	sha2_256(in, inlen, out, nil);	/* nil state: presumably one-shot, see sechash(2) */
+	return SHA2_256dlen;	/* digest length, compared with the row's dlen */
+}
+
+static int
+hash_sha384(uchar *in, ulong inlen, uchar *out)
+{
+	sha2_384(in, inlen, out, nil);	/* nil state: presumably one-shot, see sechash(2) */
+	return SHA2_384dlen;	/* digest length, compared with the row's dlen */
+}
+
+static int
+hash_sha512(uchar *in, ulong inlen, uchar *out)
+{
+	sha2_512(in, inlen, out, nil);	/* nil state: presumably one-shot, see sechash(2) */
+	return SHA2_512dlen;	/* digest length, compared with the row's dlen */
+}
+
+static EcdsaVector rfc6979_vectors[] = {
+	{
+		"RFC 6979 A.2.6 sample SHA-256",
+		"sample", hash_sha256, SHA2_256dlen,
+		"21B13D1E013C7FA1392D03C5F99AF8B30C570C6F98D4EA8E354B63A21D3DAA33"
+		"BDE1E888E63355D92FA2B3C36D8FB2CD",	/* r */
+		"F3AA443FB107745BF4BD77CB3891674632068A10CA67E3D45DB2266FA7D1FEEB"
+		"EFDC63ECCD1AC42EC0CB8668A4FA0AB0",	/* s */
+	},
+	{
+		"RFC 6979 A.2.6 sample SHA-384",
+		"sample", hash_sha384, SHA2_384dlen,
+		"94EDBB92A5ECB8AAD4736E56C691916B3F88140666CE9FA73D64C4EA95AD133C"
+		"81A648152E44ACF96E36DD1E80FABE46",	/* r */
+		"99EF4AEB15F178CEA1FE40DB2603138F130E740A19624526203B6351D0A3A94F"
+		"A329C145786E679E7B82C71A38628AC8",	/* s */
+	},
+	{
+		"RFC 6979 A.2.6 sample SHA-512",
+		"sample", hash_sha512, SHA2_512dlen,
+		"ED0959D5880AB2D869AE7F6C2915C6D60F96507F9CB3E047C0046861DA4A799C"
+		"FE30F35CC900056D7C99CD7882433709",	/* r */
+		"512C8CCEEE3890A84058CE1E22DBC2198F42323CE8ACA9135329F03C068E5112"
+		"DC7CC3EF3446DEFCEB01A45C2667FDD5",	/* s */
+	},
+	{
+		"RFC 6979 A.2.6 test SHA-256",
+		"test", hash_sha256, SHA2_256dlen,
+		"6D6DEFAC9AB64DABAFE36C6BF510352A4CC27001263638E5B16D9BB51D451559"
+		"F918EEDAF2293BE5B475CC8F0188636B",	/* r */
+		"2D46F3BECBCC523D5F1A1256BF0C9B024D879BA9E838144C8BA6BAEB4B53B47D"
+		"51AB373F9845C0514EEFB14024787265",	/* s */
+	},
+	{
+		"RFC 6979 A.2.6 test SHA-384",
+		"test", hash_sha384, SHA2_384dlen,
+		"8203B63D3C853E8D77227FB377BCF7B7B772E97892A80F36AB775D509D7A5FEB"
+		"0542A7F0812998DA8F1DD3CA3CF023DB",	/* r */
+		"DDD0760448D42D8A43AF45AF836FCE4DE8BE06B485E9B61B827C2F13173923E0"
+		"6A739F040649A667BF3B828246BAA5A5",	/* s */
+	},
+	{
+		"RFC 6979 A.2.6 test SHA-512",
+		"test", hash_sha512, SHA2_512dlen,
+		"A0D5D090C9980FAF3C2CE57B7AE951D31977DD11C775D314AF55F76C676447D0"
+		"6FB6495CD21B4B6E340FC236584FB277",	/* r */
+		"976984E59B4C77B0E8E4460DCA3D9F20E07B9BB1F63BEEFAF576F6B2E8B22463"
+		"4A2092CD3792E0159AD9CEE37659C736",	/* s */
+	},
+};
+
+static int passed;
+static int failed;
+
+static void
+report(char *name, int ok)
+{
+	char *verdict = ok ? "ok" : "BAD";	/* text printed for this check */
+	int *tally = ok ? &passed : &failed;	/* counter this check advances */
+
+	print("  %s: %s\n", name, verdict);
+	++*tally;
+}
+
+/*
+ * Run one RFC 6979 row: hash msg, parse expected (r, s), call
+ * ecdsaverify against the shared public key, expect 1.
+ */
+static void
+run_rfc6979(ECdomain *dom, ECpub *Q, EcdsaVector *v)
+{
+	uchar dig[SHA2_512dlen];	/* sized for the largest digest in the table */
+	mpint *r, *s;
+	int n, ok;
+
+	n = v->hash((uchar*)v->msg, strlen(v->msg), dig);
+	if(n != v->dlen){	/* table row and hash adaptor disagree */
+		report(v->name, 0);
+		return;
+	}
+	r = strtomp(v->r_hex, nil, 16, nil);
+	s = strtomp(v->s_hex, nil, 16, nil);
+	ok = ecdsaverify(dom, Q, dig, n, r, s);
+	report(v->name, ok == 1);	/* pass only on an exact 1 */
+	mpfree(r);
+	mpfree(s);
+}
+
+/*
+ * A perturbed r must fail verify.  Catches a permissive verify
+ * and a degenerate ecmul returning the same value for every
+ * scalar.  We replace r with r+1 (s is left untouched); if r+1
+ * happened to be a valid signature too the test would report a
+ * spurious BAD, but against a random valid pair that is negligible.
+ */
+static void
+run_negative_r(ECdomain *dom, ECpub *Q, EcdsaVector *v)
+{
+	uchar dig[SHA2_512dlen];	/* sized for the largest digest in the table */
+	mpint *r, *s;
+	char name[128];
+	int n, ok;
+
+	n = v->hash((uchar*)v->msg, strlen(v->msg), dig);
+	r = strtomp(v->r_hex, nil, 16, nil);
+	s = strtomp(v->s_hex, nil, 16, nil);
+	/* perturb r by 1: any change to r reshapes the verify
+	 * equation u1*G + u2*Q = R and the recomputed R.x mod n
+	 * no longer matches.  Probability of accidental hit: 2^-384. */
+	mpadd(r, mpone, r);
+	ok = ecdsaverify(dom, Q, dig, n, r, s);
+	snprint(name, sizeof name, "%s tamper-r", v->name);
+	report(name, ok == 0);	/* pass only when verify rejects */
+	mpfree(r);
+	mpfree(s);
+}
+
+/*
+ * Tamper with the digest: any bit flip yields a different E and
+ * must fail verify against the original (r, s).
+ */
+static void
+run_negative_digest(ECdomain *dom, ECpub *Q, EcdsaVector *v)
+{
+	uchar dig[SHA2_512dlen];	/* sized for the largest digest in the table */
+	mpint *r, *s;
+	char name[128];
+	int n, ok;
+
+	n = v->hash((uchar*)v->msg, strlen(v->msg), dig);
+	dig[0] ^= 0x01;	/* flip the low bit of the first digest byte */
+	r = strtomp(v->r_hex, nil, 16, nil);
+	s = strtomp(v->s_hex, nil, 16, nil);
+	ok = ecdsaverify(dom, Q, dig, n, r, s);
+	snprint(name, sizeof name, "%s tamper-digest", v->name);
+	report(name, ok == 0);	/* pass only when verify rejects */
+	mpfree(r);
+	mpfree(s);
+}
+
+/*
+ * Sign a digest with priv, then verify the resulting (r, s)
+ * against pub.  Drives ecmul through both the signing direction
+ * (ecgen builds k*G via ecmul) and the verifying direction.
+ * Repeat several times to exercise multiple random k.
+ */
+static void
+run_round_trip(ECdomain *dom, ECpriv *priv, ECpub *pub, char *tag, int rounds)
+{
+	uchar dig[SHA2_384dlen];
+	uchar buf[64];	/* holds the per-round message text */
+	mpint *r, *s;
+	char name[128];
+	int i, ok;
+
+	for(i = 0; i < rounds; i++){
+		snprint((char*)buf, sizeof buf, "round-trip-%d", i);	/* distinct message per round */
+		sha2_384(buf, strlen((char*)buf), dig, nil);
+		r = mpnew(0);	/* fresh signature outputs each round */
+		s = mpnew(0);
+		ecdsasign(dom, priv, dig, SHA2_384dlen, r, s);
+		ok = ecdsaverify(dom, pub, dig, SHA2_384dlen, r, s);
+		snprint(name, sizeof name, "%s round-trip #%d", tag, i);
+		report(name, ok == 1);	/* pass only on an exact 1 */
+		mpfree(r);
+		mpfree(s);
+	}
+}
+
+void
+main(int argc, char **argv)
+{
+	ECdomain dom;
+	ECpriv priv;
+	ECpub Q;
+	int i;
+
+	USED(argc); USED(argv);	/* no command-line arguments are consumed */
+	fmtinstall('B', mpfmt);	/* NOTE(review): no print in this file uses %B */
+
+	print("p384test:\n");
+
+	ecdominit(&dom, secp384r1);
+
+	/*
+	 * RFC 6979 A.2.6 fixed key pair: parse d, Qx, Qy into the
+	 * libsec ECpub / ECpriv shape used by ecdsaverify and
+	 * ecdsasign.  ECpub is a typedef of ECpoint so x/y/z fields
+	 * carry over; z is allocated and set to one (affine) so any
+	 * code path that assumes a populated z stays well-defined.
+	 */
+	memset(&priv, 0, sizeof priv);
+	priv.x = mpnew(0);
+	priv.y = mpnew(0);
+	priv.d = mpnew(0);
+	strtomp(rfc6979_d,  nil, 16, priv.d);	/* parse in place into the preallocated mpint */
+	strtomp(rfc6979_Qx, nil, 16, priv.x);
+	strtomp(rfc6979_Qy, nil, 16, priv.y);
+	priv.inf = 0;
+
+	memset(&Q, 0, sizeof Q);
+	Q.x = mpnew(0);
+	Q.y = mpnew(0);
+	Q.z = mpnew(0);
+	strtomp(rfc6979_Qx, nil, 16, Q.x);
+	strtomp(rfc6979_Qy, nil, 16, Q.y);
+	mpassign(mpone, Q.z);
+	Q.inf = 0;
+
+	if(!ecpubverify(&dom, &Q)){	/* stop early: every later check relies on Q */
+		print("  RFC 6979 public key fails ecpubverify\n");
+		exits("bad pub");
+	}
+
+	for(i = 0; i < nelem(rfc6979_vectors); i++)	/* known-answer verifies */
+		run_rfc6979(&dom, &Q, &rfc6979_vectors[i]);
+
+	for(i = 0; i < nelem(rfc6979_vectors); i++)	/* r+1 must be rejected */
+		run_negative_r(&dom, &Q, &rfc6979_vectors[i]);
+
+	for(i = 0; i < nelem(rfc6979_vectors); i++)	/* flipped digest must be rejected */
+		run_negative_digest(&dom, &Q, &rfc6979_vectors[i]);
+
+	run_round_trip(&dom, &priv, &Q, "RFC 6979 keypair", 5);
+
+	mpfree(priv.x);
+	mpfree(priv.y);
+	mpfree(priv.d);
+	mpfree(Q.x);
+	mpfree(Q.y);
+	mpfree(Q.z);
+	ecdomfree(&dom);
+
+	print("passed: %d/%d\n", passed, passed+failed);
+	exits(failed ? "fail" : nil);	/* non-nil exit status only when a check failed */
+}
(Return to Plan 9 Home Page)