/usr/web/sources/contrib/mospak/tls-1.2/libsec-ecmul-ct-p256.diff

Plan 9 from Bell Labs’s /usr/web/sources/contrib/mospak/tls-1.2/libsec-ecmul-ct-p256.diff

--- sys/src/libsec/port/ecc.c
+++ sys/src/libsec/port/ecc.c
@@ -12,6 +12,7 @@ extern void jacobian_add(mpint *p, mpint *a,
 	mpint *X1, mpint *Y1, mpint *Z1,
 	mpint *X2, mpint *Y2, mpint *Z2,
 	mpint *X3, mpint *Y3, mpint *Z3);
+extern void ecmul_p256(ECdomain *dom, ECpoint *a, mpint *k, ECpoint *s);
 
 void
 ecassign(ECdomain *dom, ECpoint *a, ECpoint *b)
@@ -70,6 +71,25 @@ ecadd(ECdomain *dom, ECpoint *a, ECpoint *b, ECpoint *
 	s->inf = mpcmp(s->z, mpzero) == 0;
 }
 
+/*
+ * Return the P-256 field prime as an mpint.  The hex literal is
+ * parsed on the first call and cached in a function-local
+ * static; later calls return the cached value, which is never
+ * freed.  ecmul compares dom->p against it with one mpcmp to
+ * decide whether to hand the multiply to ecmul_p256 (p256.c).
+ */
+static mpint*
+p256dom_prime(void)
+{
+	static mpint *prime;
+
+	if(prime != nil)
+		return prime;
+	prime = strtomp("FFFFFFFF00000001000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFF",
+		nil, 16, nil);
+	return prime;
+}
+
 void
 ecmul(ECdomain *dom, ECpoint *a, mpint *k, ECpoint *s)
 {
@@ -78,6 +98,10 @@ ecmul(ECdomain *dom, ECpoint *a, mpint *k, ECpoint *s)
 
 	if(a->inf || mpcmp(k, mpzero) == 0){
 		s->inf = 1;
+		return;
+	}
+	if(mpcmp(dom->p, p256dom_prime()) == 0){
+		ecmul_p256(dom, a, k, s);
 		return;
 	}
 	ns.inf = 1;
--- sys/src/libsec/port/mkfile
+++ sys/src/libsec/port/mkfile
@@ -6,6 +6,7 @@ CFILES = des.c desmodes.c desECB.c desCBC.c des3ECB.c 
 	aes.c aes_gcm.c blowfish.c chacha.c \
 	curve25519.c curve25519_dh.c\
 	ecc.c jacobian.c secp256r1.c secp384r1.c\
+	p256.c\
 	hmac.c md5.c md5block.c md4.c sha1.c sha1block.c\
 	sha2_64.c sha2_128.c sha2block64.c sha2block128.c\
 	sha1pickle.c md5pickle.c\
@@ -53,4 +54,10 @@ $O.rsatest: rsatest.$O
 	mpc $prereq >> $target
 
 $O.rsatest: rsatest.$O
+	$LD -o $target $prereq
+
+$O.p256test: p256test.$O
+	$LD -o $target $prereq
+
+$O.p256timetest: p256timetest.$O
 	$LD -o $target $prereq
--- sys/src/libsec/port/p256.c
+++ sys/src/libsec/port/p256.c
@@ -0,0 +1,806 @@
+#include <u.h>
+#include <libc.h>
+#include <mp.h>
+#include <libsec.h>
+
+/*
+ * P-256 (secp256r1, FIPS 186-4 D.1.2.3) constant-time field
+ * arithmetic and scalar multiplication.  Used by ecmul on the
+ * P-256 curve to give a side-channel-resistant scalar mul for
+ * ECDHE and ECDSA signing (verify is public-input and stays on
+ * the generic path).
+ *
+ * Limb layout: 8 little-endian uint limbs (v[0] = LSB), 32 bytes
+ * per field element.  Same width on 386 and amd64 since Plan 9
+ * keeps int at 32 bits on both; an aliased v64[4] view is exposed
+ * for an ASM accelerator that may layer atop this code.  All
+ * field/point ops use v[].
+ *
+ * All field ops run in time independent of operand value.  Point
+ * Add and Double use the complete formulas of Renes-Costello-
+ * Batina, "Complete addition formulas for prime order elliptic
+ * curves", EUROCRYPT 2016, eprint 2015/1060: Algorithm 4 for the
+ * addition (P-256 has a = -3, appendix A.2) and Algorithm 6 for
+ * the doubling.  Scalar multiplication is the Montgomery ladder
+ * (SEC 1 v2.0 Section 3.2; Joye-Yen "The Montgomery powering
+ * ladder", CHES 2002), constant-time wrt the scalar.
+ *
+ * P-256 prime from FIPS 186-4 Section D.1.2.3 / RFC 5480
+ * Section 2.1.1.1: p = 2^256 - 2^224 + 2^192 + 2^96 - 1.
+ * Curve parameter b is from FIPS 186-4 Appendix D.1.2.3.
+ *
+ * The single non-static entry point is ecmul_p256, called from
+ * ecc.c's ecmul dispatch when dom->p matches the P-256 prime.
+ */
+
+typedef struct P256field P256field;
+typedef struct P256point P256point;
+
+struct P256field {
+	union {
+		uint	v[8];	/* 32-bit limbs, v[0] least significant */
+		uvlong	v64[4];	/* overlay of the same 32 bytes; not referenced by the C code in this file */
+	};
+};
+
+struct P256point {
+	P256field	x;
+	P256field	y;
+	P256field	z;	/* homogeneous projective; z=0 is identity */
+};
+
+/* p = 2^256 - 2^224 + 2^192 + 2^96 - 1, big-endian 32 bytes; fed to betomp by mpToBE32 */
+static uchar p256_p_be[32] = {
+	0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+};
+
+/* same prime as 32-bit little-endian limbs; used by p256FieldAdd and p256FieldSub */
+static const uint p256_p[8] = {
+	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
+	0x00000000, 0x00000000, 0x00000001, 0xffffffff,
+};
+
+/*
+ * b = 5ac635d8 aa3a93e7 b3ebbd55 769886bc
+ *     651d06b0 cc53b0f6 3bce3c3e 27d2604b
+ * Stored as 8 little-endian uint limbs (v[0] = LSB), the same
+ * layout mpToP256Field produces.  Multiplied in by p256PointAdd
+ * and p256PointDouble.  Cross-check against secp256r1.c.
+ */
+static P256field p256_b = { {
+	0x27d2604b, 0x3bce3c3e, 0xcc53b0f6, 0x651d06b0,
+	0x769886bc, 0xb3ebbd55, 0xaa3a93e7, 0x5ac635d8,
+} };
+
+/*
+ * mp <-> P256field/P256point boundary conversion.  Big-endian
+ * byte order at the mp interface; little-endian-limb (v[0]
+ * holds the least-significant 32 bits) inside.
+ */
+
+/*
+ * Write (in mod p) into buf as exactly 32 big-endian bytes.
+ * The prime is rebuilt from p256_p_be on each call and freed.
+ */
+static void
+mpToBE32(uchar *buf, mpint *in)
+{
+	mpint *prime, *residue;
+
+	residue = mpnew(0);
+	prime = betomp(p256_p_be, sizeof p256_p_be, nil);
+	mpmod(in, prime, residue);
+	mptober(residue, buf, 32);
+	mpfree(prime);
+	mpfree(residue);
+}
+
+static void
+mpToP256Field(P256field *out, mpint *in)
+{
+	uchar be[32], *w;
+	int limb;
+
+	mpToBE32(be, in);
+	for(limb = 0; limb < 8; limb++){
+		w = be + 28 - 4*limb;	/* limb 0 is the last four bytes */
+		out->v[limb] = ((uint)w[0]<<24) | ((uint)w[1]<<16) |
+			((uint)w[2]<<8) | (uint)w[3];
+	}
+}
+
+static void
+p256FieldToMp(mpint *out, P256field *in)
+{
+	uchar be[32];
+	uint limb;
+	int i, j;
+
+	for(i = 0; i < 8; i++){	/* limb i fills be[28-4i .. 31-4i] */
+		limb = in->v[i];
+		for(j = 0; j < 4; j++, limb >>= 8)
+			be[31 - 4*i - j] = limb;
+	}
+	betomp(be, sizeof be, out);
+}
+
+static void
+mpToP256Point(P256point *out, mpint *x, mpint *y)
+{
+	/* affine input: Z is the field element 1 (limb 0 = 1, rest 0) */
+	memset(&out->z, 0, sizeof out->z);
+	out->z.v[0] = 1;
+	mpToP256Field(&out->y, y);
+	mpToP256Field(&out->x, x);
+}
+
+/*
+ * P-256 field operations.  Additive ops (Add/Sub/Neg/Cmov) use
+ * full-length carry-chain loops; multiplicative ops (Mul/Sqr/
+ * Inv) use 8x8 schoolbook with NIST Solinas reduction and a
+ * Fermat addition chain for inversion.  No branches or table
+ * lookups depend on input limbs; final reductions select via
+ * precomputed masks.
+ *
+ * Carry and borrow bits are pulled from the high half of a
+ * 64-bit uvlong accumulator: a 32-bit add or subtract cannot
+ * overflow 64 bits, so the carry-out is exactly the low bit of
+ * (acc >> 32).
+ */
+/* r = a + b mod p; one subtraction of p, so needs a + b < 2p.  r may alias a or b. */
+static void
+p256FieldAdd(P256field *r, P256field *a, P256field *b)
+{
+	uvlong acc;
+	uint s[8], t[8], carry, borrow, mask, m;
+	int i;
+
+	/* s = a + b mod 2^256; carry holds the 257th bit. */
+	carry = 0;
+	for(i = 0; i < 8; i++){
+		acc = (uvlong)a->v[i] + b->v[i] + carry;
+		s[i] = (uint)acc;
+		carry = (uint)(acc >> 32);
+	}
+
+	/* t = s - p mod 2^256; borrow=1 iff s < p as a 256-bit int. */
+	borrow = 0;
+	for(i = 0; i < 8; i++){
+		acc = (uvlong)s[i] - p256_p[i] - borrow;
+		t[i] = (uint)acc;
+		borrow = (uint)(acc >> 32) & 1;
+	}
+
+	/* Pick s when carry=0 and borrow=1 (s fits in 256 bits and
+	 * is less than p); pick t otherwise (s >= p as a 257-bit
+	 * value, so the wrap of s-p gives the correct residue). */
+	m = borrow & (carry ^ 1);	/* 1 -> keep s, 0 -> take t */
+	mask = -m;			/* all-ones when m = 1, else zero */
+	for(i = 0; i < 8; i++)
+		r->v[i] = (s[i] & mask) | (t[i] & ~mask);
+}
+/* r = a - b mod p for a, b in [0, p); r may alias a or b. */
+static void
+p256FieldSub(P256field *r, P256field *a, P256field *b)
+{
+	uvlong acc;
+	uint t[8], s[8], carry, borrow, mask;
+	int i;
+
+	/* t = a - b mod 2^256; borrow=1 iff true value is negative. */
+	borrow = 0;
+	for(i = 0; i < 8; i++){
+		acc = (uvlong)a->v[i] - b->v[i] - borrow;
+		t[i] = (uint)acc;
+		borrow = (uint)(acc >> 32) & 1;
+	}
+
+	/* s = t + p mod 2^256.  When t underflowed (borrow=1) the
+	 * mathematical t is a-b+2^256, so s = a-b+2^256+p mod 2^256
+	 * = a-b+p, the correct non-negative residue. */
+	carry = 0;
+	for(i = 0; i < 8; i++){
+		acc = (uvlong)t[i] + p256_p[i] + carry;
+		s[i] = (uint)acc;
+		carry = (uint)(acc >> 32);	/* carry out of limb 7 is never read */
+	}
+
+	/* Pick s when borrow=1, else t (already in [0, p)). */
+	mask = -borrow;
+	for(i = 0; i < 8; i++)
+		r->v[i] = (s[i] & mask) | (t[i] & ~mask);
+}
+/* r = (0 - a) mod p, delegated to p256FieldSub; a = 0 gives 0. */
+static void
+p256FieldNeg(P256field *r, P256field *a)
+{
+	P256field z0;
+
+	memset(z0.v, 0, sizeof z0.v);
+	p256FieldSub(r, &z0, a);
+}
+/* r = cond ? b : a, limb by limb, with no branch on cond; r may alias a or b. */
+static void
+p256FieldCmov(P256field *r, P256field *a, P256field *b, int cond)
+{
+	uint c, nz, mask;
+	int i;
+
+	/* Collapse any nonzero cond to all-1s without branching:
+	 * for unsigned 32-bit c, (c | -c) has bit 31 set iff c != 0;
+	 * shifting that down to bit 0 and negating gives the full
+	 * mask.  The behaviour matches the C ternary cond ? b : a. */
+	c = (uint)cond;
+	nz = (c | (0u - c)) >> 31;	/* 1 iff cond != 0 */
+	mask = -nz;			/* all-ones selects b, zero selects a */
+	for(i = 0; i < 8; i++)
+		r->v[i] = (a->v[i] & ~mask) | (b->v[i] & mask);
+}
+
+/*
+ * 8x8 schoolbook multiply: T[0..15] = a[0..7] * b[0..7] as a
+ * 16-limb 32-bit-LE bignum.  T limbs are uvlong-typed but each
+ * holds a value below 2^32 once its row is done; acc carries
+ * into the next limb.  One inner step adds a carry (< 2^32), a
+ * T limb (< 2^32) and a 32x32 product (<= (2^32-1)^2), at most
+ * 2^64 - 1 in total, so the 64-bit accumulator never overflows.
+ */
+static void
+p256_mul_8x8(uvlong T[16], uint *a, uint *b)
+{
+	uvlong acc;
+	int i, j;
+
+	for(i = 0; i < 16; i++)
+		T[i] = 0;
+	for(i = 0; i < 8; i++){
+		acc = 0;
+		for(j = 0; j < 8; j++){
+			acc += T[i+j] + (uvlong)a[i] * b[j];
+			T[i+j] = acc & 0xffffffff;
+			acc >>= 32;
+		}
+		T[i+8] = acc;	/* row's final carry; T[i+8] was still 0 */
+	}
+}
+
+/*
+ * NIST P-256 Solinas reduction.  Folds the upper 8 limbs of T
+ * (T[8..15]) into the lower 8 using the identity
+ *     2^256 = 2^224 - 2^192 - 2^96 + 1   (mod p).
+ *
+ * Recipe (FIPS 186-4 Appendix D / Solinas 1999): write nine
+ * 8-limb words S1..S9 from selected T limbs, then compute
+ *     R = S1 + 2*S2 + 2*S3 + S4 + S5 - S6 - S7 - S8 - S9 (mod p).
+ *
+ *   S1 = (T0,  T1,  T2,  T3,  T4,  T5,  T6,  T7)
+ *   S2 = (0,   0,   0,   T11, T12, T13, T14, T15)
+ *   S3 = (0,   0,   0,   T12, T13, T14, T15, 0  )
+ *   S4 = (T8,  T9,  T10, 0,   0,   0,   T14, T15)
+ *   S5 = (T9,  T10, T11, T13, T14, T15, T13, T8 )
+ *   S6 = (T11, T12, T13, 0,   0,   0,   T8,  T10)
+ *   S7 = (T12, T13, T14, T15, 0,   0,   T9,  T11)
+ *   S8 = (T13, T14, T15, T8,  T9,  T10, 0,   T12)
+ *   S9 = (T14, T15, 0,   T9,  T10, T11, 0,   T13)
+ *
+ * pos = S1 + 2*S2 + 2*S3 + S4 + S5 is at most 7*(2^256 - 1)
+ * (weights 1+2+2+1+1 = 7) and neg = S6 + S7 + S8 + S9 at most
+ * 4*(2^256 - 1), so each fits in a 9-limb accumulator.  The
+ * signed difference pos - neg is congruent to T mod p and lies
+ * in (-4*p, 7*2^256); adding 4*p makes it non-negative and below
+ * 12*p, then a fixed number of conditional subtractions of p
+ * brings the residue into [0, p).
+ */
+
+/* Add weight * t into the 9-limb accumulator buf, where t is an
+ * 8-limb little-endian value; p256FieldMul only passes weight 1
+ * or 2.  Limbs 0..7 of buf are left masked to 32 bits; the carry
+ * out of limb 7 is added to buf[8], which is not masked. */
+static void
+add_term(uvlong buf[9], uint t[8], int weight)
+{
+	uvlong acc;
+	int i;
+
+	acc = 0;
+	for(i = 0; i < 8; i++){
+		acc += buf[i] + (uvlong)t[i] * weight;
+		buf[i] = acc & 0xffffffff;
+		acc >>= 32;
+	}
+	buf[8] += acc;
+}
+
+/* Add 4*p into a 9-limb buffer (used to bias diff non-negative
+ * before final reductions).  4*p = (2^258 - 2^226 + 2^194 +
+ * 2^98 - 4); represented as 8 32-bit limbs plus a top limb
+ * holding bits 256..257. */
+static void
+add_4p(uvlong buf[9])
+{
+	/* fourp is p shifted left by 2, as 32-bit LE limbs.  p is
+	 * (ffffffff, ffffffff, ffffffff, 00000000, 00000000, 00000000,
+	 *  00000001, ffffffff); each limb's top two bits move into
+	 * the bottom of the next limb:
+	 *   limbs 0..2: ffffffff<<2 -> fffffffc, ffffffff, ffffffff
+	 *   limb 3:     the 3 carried out of limb 2
+	 *   limb 6:     1<<2 = 4
+	 *   limb 7:     fffffffc, carrying 3 into limb 8
+	 * Any carry out of limb 8 is dropped (acc is not stored). */
+	static const uvlong fourp[9] = {
+		0xfffffffcULL, 0xffffffffULL, 0xffffffffULL, 0x00000003ULL,
+		0x00000000ULL, 0x00000000ULL, 0x00000004ULL, 0xfffffffcULL,
+		0x00000003ULL,
+	};
+	uvlong acc;
+	int i;
+
+	acc = 0;
+	for(i = 0; i < 9; i++){
+		acc += buf[i] + fourp[i];
+		buf[i] = acc & 0xffffffff;
+		acc >>= 32;
+	}
+}
+
+/* sub_buf9: out = a - b over 9 limbs, propagating the borrow
+ * from limb to limb; every out limb is masked to 32 bits.
+ * Returns the borrow out of limb 8 (p256FieldMul pre-biases a
+ * so that this is 0, and ignores the return value). */
+static uint
+sub_buf9(uvlong out[9], uvlong a[9], uvlong b[9])
+{
+	uvlong acc;
+	uint borrow;
+	int i;
+
+	borrow = 0;
+	for(i = 0; i < 9; i++){
+		acc = a[i] - b[i];
+		acc -= borrow;
+		out[i] = acc & 0xffffffff;
+		borrow = (uint)(acc >> 32) & 1;
+	}
+	return borrow;
+}
+
+/* Subtract p from the 9-limb value buf up to `rounds` times,
+ * each time in constant time: the subtraction always runs into
+ * scratch, then a mask picks scratch (buf >= p) or the old buf
+ * (buf < p).  The low 8 limbs are then copied to r, so r is in
+ * [0, p) only if buf < (rounds+1)*p on entry.  Every buf limb
+ * must be below 2^32 because the select mask is 32 bits wide. */
+static void
+final_reduce(uint r[8], uvlong buf[9], int rounds)
+{
+	uvlong scratch[9];
+	uvlong acc;
+	uint borrow, mask;
+	int i, k;
+	static const uvlong p9[9] = {
+		0xffffffffULL, 0xffffffffULL, 0xffffffffULL, 0x00000000ULL,
+		0x00000000ULL, 0x00000000ULL, 0x00000001ULL, 0xffffffffULL,
+		0x00000000ULL,
+	};
+
+	for(k = 0; k < rounds; k++){
+		borrow = 0;
+		for(i = 0; i < 9; i++){
+			acc = buf[i] - p9[i];
+			acc -= borrow;
+			scratch[i] = acc & 0xffffffff;
+			borrow = (uint)(acc >> 32) & 1;
+		}
+		/* If buf >= p, scratch holds the smaller representative
+		 * and borrow=0.  If buf < p, borrow=1 and we keep buf.
+		 * mask = 0xffffffff when borrow=0 (take scratch). */
+		mask = -(borrow ^ 1);
+		for(i = 0; i < 9; i++)
+			buf[i] = (scratch[i] & mask) | (buf[i] & ~mask);
+	}
+	for(i = 0; i < 8; i++)
+		r[i] = (uint)buf[i];
+}
+/* r = a * b mod p for a, b in [0, p); r may alias a and/or b (written last). */
+static void
+p256FieldMul(P256field *r, P256field *a, P256field *b)
+{
+	uvlong T[16];
+	uvlong pos[9], neg[9], diff[9];
+	uint S1[8], S2[8], S3[8], S4[8], S5[8];
+	uint S6[8], S7[8], S8[8], S9[8];
+	uint t[16];
+	int i;
+
+	/* 8x8 schoolbook product into 16 limbs. */
+	p256_mul_8x8(T, a->v, b->v);
+	for(i = 0; i < 16; i++)
+		t[i] = (uint)T[i];
+
+	/* Build the nine S vectors per the recipe above. */
+	S1[0]=t[0];  S1[1]=t[1];  S1[2]=t[2];  S1[3]=t[3];
+	S1[4]=t[4];  S1[5]=t[5];  S1[6]=t[6];  S1[7]=t[7];
+
+	S2[0]=0;     S2[1]=0;     S2[2]=0;     S2[3]=t[11];
+	S2[4]=t[12]; S2[5]=t[13]; S2[6]=t[14]; S2[7]=t[15];
+
+	S3[0]=0;     S3[1]=0;     S3[2]=0;     S3[3]=t[12];
+	S3[4]=t[13]; S3[5]=t[14]; S3[6]=t[15]; S3[7]=0;
+
+	S4[0]=t[8];  S4[1]=t[9];  S4[2]=t[10]; S4[3]=0;
+	S4[4]=0;     S4[5]=0;     S4[6]=t[14]; S4[7]=t[15];
+
+	S5[0]=t[9];  S5[1]=t[10]; S5[2]=t[11]; S5[3]=t[13];
+	S5[4]=t[14]; S5[5]=t[15]; S5[6]=t[13]; S5[7]=t[8];
+
+	S6[0]=t[11]; S6[1]=t[12]; S6[2]=t[13]; S6[3]=0;
+	S6[4]=0;     S6[5]=0;     S6[6]=t[8];  S6[7]=t[10];
+
+	S7[0]=t[12]; S7[1]=t[13]; S7[2]=t[14]; S7[3]=t[15];
+	S7[4]=0;     S7[5]=0;     S7[6]=t[9];  S7[7]=t[11];
+
+	S8[0]=t[13]; S8[1]=t[14]; S8[2]=t[15]; S8[3]=t[8];
+	S8[4]=t[9];  S8[5]=t[10]; S8[6]=0;     S8[7]=t[12];
+
+	S9[0]=t[14]; S9[1]=t[15]; S9[2]=0;     S9[3]=t[9];
+	S9[4]=t[10]; S9[5]=t[11]; S9[6]=0;     S9[7]=t[13];
+
+	/* pos = S1 + 2*S2 + 2*S3 + S4 + S5 in a 9-limb accumulator. */
+	for(i = 0; i < 9; i++)
+		pos[i] = 0;
+	add_term(pos, S1, 1);
+	add_term(pos, S2, 2);
+	add_term(pos, S3, 2);
+	add_term(pos, S4, 1);
+	add_term(pos, S5, 1);
+
+	/* neg = S6 + S7 + S8 + S9. */
+	for(i = 0; i < 9; i++)
+		neg[i] = 0;
+	add_term(neg, S6, 1);
+	add_term(neg, S7, 1);
+	add_term(neg, S8, 1);
+	add_term(neg, S9, 1);
+
+	/* Bias non-negative: add 4*p.  pos - neg > -4*p because only
+	 * T9..T13 have negative net weight, summing to > -2^226. */
+	add_4p(pos);
+
+	/* diff = pos - neg, in [0, 12*p); the borrow-out is 0. */
+	(void)sub_buf9(diff, pos, neg);
+
+	/* 12 rounds of CT subtract-or-keep bring diff into [0, p):
+	 * diff < 12*p needs at most 11 subtractions, so one round
+	 * is spare. */
+	final_reduce(r->v, diff, 12);
+}
+
+static void
+p256FieldSqr(P256field *r, P256field *a)
+{
+	/* r = a^2 mod p, computed as a*a through p256FieldMul.
+	 * BUGlet: a dedicated kernel exploiting the symmetry of the
+	 * partial-product matrix would be faster (the author
+	 * estimates ~25%); deferred from the initial landing. */
+	p256FieldMul(r, a, a);
+}
+
+/*
+ * Inversion via Fermat's little theorem: a^(p-2) = a^-1 mod p.
+ * The exponent p-2 = 2^256 - 2^224 + 2^192 + 2^96 - 3 is fixed,
+ * so the chain runs the same field ops for every operand.  It
+ * builds e_I = a^(2^I - 1) for I = 2..32 (e64 is the exception:
+ * it holds a^(2^64 - 2^32)) and combines them to reach p-2.
+ * Total: 287 squarings + 13 multiplications.  (Attributed to
+ * Bernstein 2014 by the author.)  Comments give the exponent.
+ */
+static void
+p256FieldInv(P256field *r, P256field *a)
+{
+	P256field ftmp, ftmp2, e2, e4, e8, e16, e32, e64;
+	int i;
+
+	p256FieldSqr(&ftmp, a);			/* 2^1 */
+	p256FieldMul(&ftmp, &ftmp, a);		/* 2^2 - 2^0 */
+	e2 = ftmp;
+	p256FieldSqr(&ftmp, &ftmp);		/* 2^3 - 2^1 */
+	p256FieldSqr(&ftmp, &ftmp);		/* 2^4 - 2^2 */
+	p256FieldMul(&ftmp, &ftmp, &e2);	/* 2^4 - 2^0 */
+	e4 = ftmp;
+	p256FieldSqr(&ftmp, &ftmp);		/* 2^5 - 2^1 */
+	p256FieldSqr(&ftmp, &ftmp);		/* 2^6 - 2^2 */
+	p256FieldSqr(&ftmp, &ftmp);		/* 2^7 - 2^3 */
+	p256FieldSqr(&ftmp, &ftmp);		/* 2^8 - 2^4 */
+	p256FieldMul(&ftmp, &ftmp, &e4);	/* 2^8 - 2^0 */
+	e8 = ftmp;
+	for(i = 0; i < 8; i++)
+		p256FieldSqr(&ftmp, &ftmp);	/* 2^16 - 2^8 */
+	p256FieldMul(&ftmp, &ftmp, &e8);	/* 2^16 - 2^0 */
+	e16 = ftmp;
+	for(i = 0; i < 16; i++)
+		p256FieldSqr(&ftmp, &ftmp);	/* 2^32 - 2^16 */
+	p256FieldMul(&ftmp, &ftmp, &e16);	/* 2^32 - 2^0 */
+	e32 = ftmp;
+	for(i = 0; i < 32; i++)
+		p256FieldSqr(&ftmp, &ftmp);	/* 2^64 - 2^32 */
+	e64 = ftmp;				/* 2^64 - 2^32, not 2^64 - 1 */
+	p256FieldMul(&ftmp, &ftmp, a);		/* 2^64 - 2^32 + 2^0 */
+	for(i = 0; i < 192; i++)
+		p256FieldSqr(&ftmp, &ftmp);
+	/* ftmp = 2^256 - 2^224 + 2^192 */
+
+	p256FieldMul(&ftmp2, &e64, &e32);	/* 2^64 - 2^0 */
+	for(i = 0; i < 16; i++)
+		p256FieldSqr(&ftmp2, &ftmp2);	/* 2^80 - 2^16 */
+	p256FieldMul(&ftmp2, &ftmp2, &e16);	/* 2^80 - 2^0 */
+	for(i = 0; i < 8; i++)
+		p256FieldSqr(&ftmp2, &ftmp2);	/* 2^88 - 2^8 */
+	p256FieldMul(&ftmp2, &ftmp2, &e8);	/* 2^88 - 2^0 */
+	for(i = 0; i < 4; i++)
+		p256FieldSqr(&ftmp2, &ftmp2);	/* 2^92 - 2^4 */
+	p256FieldMul(&ftmp2, &ftmp2, &e4);	/* 2^92 - 2^0 */
+	p256FieldSqr(&ftmp2, &ftmp2);		/* 2^93 - 2^1 */
+	p256FieldSqr(&ftmp2, &ftmp2);		/* 2^94 - 2^2 */
+	p256FieldMul(&ftmp2, &ftmp2, &e2);	/* 2^94 - 2^0 */
+	p256FieldSqr(&ftmp2, &ftmp2);		/* 2^95 - 2^1 */
+	p256FieldSqr(&ftmp2, &ftmp2);		/* 2^96 - 2^2 */
+	p256FieldMul(&ftmp2, &ftmp2, a);	/* 2^96 - 3 */
+
+	p256FieldMul(r, &ftmp2, &ftmp);
+	/* r = 2^256 - 2^224 + 2^192 + 2^96 - 3 = p - 2 */
+}
+
+/*
+ * P-256 point arithmetic in homogeneous projective coordinates
+ * (X:Y:Z) with affine map x = X/Z, y = Y/Z; identity is Z = 0.
+ * All routines run in time independent of operand value: every
+ * field op executes regardless of input, and identity / doubling
+ * / inverse-pair cases emerge from the algebra without a branch.
+ */
+
+static void
+p256PointAdd(P256point *r, P256point *a, P256point *b)
+{
+	P256field X1, Y1, Z1, X2, Y2, Z2, X3, Y3, Z3;
+	P256field t0, t1, t2, t3, t4;
+
+	X1 = a->x; Y1 = a->y; Z1 = a->z;	/* inputs copied first: r may alias a or b */
+	X2 = b->x; Y2 = b->y; Z2 = b->z;
+
+	/* RCB 2016 Algorithm 4, a = -3.  Each line is one field
+	 * op; the entire 43-step sequence runs unconditionally. */
+	p256FieldMul(&t0, &X1, &X2);		/* t0 = X1*X2 */
+	p256FieldMul(&t1, &Y1, &Y2);		/* t1 = Y1*Y2 */
+	p256FieldMul(&t2, &Z1, &Z2);		/* t2 = Z1*Z2 */
+	p256FieldAdd(&t3, &X1, &Y1);		/* t3 = X1+Y1 */
+	p256FieldAdd(&t4, &X2, &Y2);		/* t4 = X2+Y2 */
+	p256FieldMul(&t3, &t3, &t4);		/* t3 = t3*t4 */
+	p256FieldAdd(&t4, &t0, &t1);		/* t4 = t0+t1 */
+	p256FieldSub(&t3, &t3, &t4);		/* t3 = t3-t4 */
+	p256FieldAdd(&t4, &Y1, &Z1);		/* t4 = Y1+Z1 */
+	p256FieldAdd(&X3, &Y2, &Z2);		/* X3 = Y2+Z2 */
+	p256FieldMul(&t4, &t4, &X3);		/* t4 = t4*X3 */
+	p256FieldAdd(&X3, &t1, &t2);		/* X3 = t1+t2 */
+	p256FieldSub(&t4, &t4, &X3);		/* t4 = t4-X3 */
+	p256FieldAdd(&X3, &X1, &Z1);		/* X3 = X1+Z1 */
+	p256FieldAdd(&Y3, &X2, &Z2);		/* Y3 = X2+Z2 */
+	p256FieldMul(&X3, &X3, &Y3);		/* X3 = X3*Y3 */
+	p256FieldAdd(&Y3, &t0, &t2);		/* Y3 = t0+t2 */
+	p256FieldSub(&Y3, &X3, &Y3);		/* Y3 = X3-Y3 */
+	p256FieldMul(&Z3, &p256_b, &t2);	/* Z3 = b*t2 */
+	p256FieldSub(&X3, &Y3, &Z3);		/* X3 = Y3-Z3 */
+	p256FieldAdd(&Z3, &X3, &X3);		/* Z3 = X3+X3 */
+	p256FieldAdd(&X3, &X3, &Z3);		/* X3 = X3+Z3 */
+	p256FieldSub(&Z3, &t1, &X3);		/* Z3 = t1-X3 */
+	p256FieldAdd(&X3, &t1, &X3);		/* X3 = t1+X3 */
+	p256FieldMul(&Y3, &p256_b, &Y3);	/* Y3 = b*Y3 */
+	p256FieldAdd(&t1, &t2, &t2);		/* t1 = t2+t2 */
+	p256FieldAdd(&t2, &t1, &t2);		/* t2 = t1+t2 */
+	p256FieldSub(&Y3, &Y3, &t2);		/* Y3 = Y3-t2 */
+	p256FieldSub(&Y3, &Y3, &t0);		/* Y3 = Y3-t0 */
+	p256FieldAdd(&t1, &Y3, &Y3);		/* t1 = Y3+Y3 */
+	p256FieldAdd(&Y3, &t1, &Y3);		/* Y3 = t1+Y3 */
+	p256FieldAdd(&t1, &t0, &t0);		/* t1 = t0+t0 */
+	p256FieldAdd(&t0, &t1, &t0);		/* t0 = t1+t0 */
+	p256FieldSub(&t0, &t0, &t2);		/* t0 = t0-t2 */
+	p256FieldMul(&t1, &t4, &Y3);		/* t1 = t4*Y3 */
+	p256FieldMul(&t2, &t0, &Y3);		/* t2 = t0*Y3 */
+	p256FieldMul(&Y3, &X3, &Z3);		/* Y3 = X3*Z3 */
+	p256FieldAdd(&Y3, &Y3, &t2);		/* Y3 = Y3+t2 */
+	p256FieldMul(&X3, &t3, &X3);		/* X3 = t3*X3 */
+	p256FieldSub(&X3, &X3, &t1);		/* X3 = X3-t1 */
+	p256FieldMul(&Z3, &t4, &Z3);		/* Z3 = t4*Z3 */
+	p256FieldMul(&t1, &t3, &t0);		/* t1 = t3*t0 */
+	p256FieldAdd(&Z3, &Z3, &t1);		/* Z3 = Z3+t1 */
+
+	r->x = X3; r->y = Y3; r->z = Z3;
+}
+
+static void
+p256PointDouble(P256point *r, P256point *a)
+{
+	P256field X1, Y1, Z1, X3, Y3, Z3;
+	P256field t0, t1, t2, t3;
+
+	X1 = a->x; Y1 = a->y; Z1 = a->z;	/* input copied first: r may alias a */
+
+	/* RCB 2016 Algorithm 6, a = -3: 34 steps, no special cases. */
+	p256FieldSqr(&t0, &X1);			/* t0 = X^2 */
+	p256FieldSqr(&t1, &Y1);			/* t1 = Y^2 */
+	p256FieldSqr(&t2, &Z1);			/* t2 = Z^2 */
+	p256FieldMul(&t3, &X1, &Y1);		/* t3 = X*Y */
+	p256FieldAdd(&t3, &t3, &t3);		/* t3 = t3+t3 */
+	p256FieldMul(&Z3, &X1, &Z1);		/* Z3 = X*Z */
+	p256FieldAdd(&Z3, &Z3, &Z3);		/* Z3 = Z3+Z3 */
+	p256FieldMul(&Y3, &p256_b, &t2);	/* Y3 = b*t2 */
+	p256FieldSub(&Y3, &Y3, &Z3);		/* Y3 = Y3-Z3 */
+	p256FieldAdd(&X3, &Y3, &Y3);		/* X3 = Y3+Y3 */
+	p256FieldAdd(&Y3, &X3, &Y3);		/* Y3 = X3+Y3 */
+	p256FieldSub(&X3, &t1, &Y3);		/* X3 = t1-Y3 */
+	p256FieldAdd(&Y3, &t1, &Y3);		/* Y3 = t1+Y3 */
+	p256FieldMul(&Y3, &X3, &Y3);		/* Y3 = X3*Y3 */
+	p256FieldMul(&X3, &X3, &t3);		/* X3 = X3*t3 */
+	p256FieldAdd(&t3, &t2, &t2);		/* t3 = t2+t2 */
+	p256FieldAdd(&t2, &t2, &t3);		/* t2 = t2+t3 */
+	p256FieldMul(&Z3, &p256_b, &Z3);	/* Z3 = b*Z3 */
+	p256FieldSub(&Z3, &Z3, &t2);		/* Z3 = Z3-t2 */
+	p256FieldSub(&Z3, &Z3, &t0);		/* Z3 = Z3-t0 */
+	p256FieldAdd(&t3, &Z3, &Z3);		/* t3 = Z3+Z3 */
+	p256FieldAdd(&Z3, &Z3, &t3);		/* Z3 = Z3+t3 */
+	p256FieldAdd(&t3, &t0, &t0);		/* t3 = t0+t0 */
+	p256FieldAdd(&t0, &t3, &t0);		/* t0 = t3+t0 */
+	p256FieldSub(&t0, &t0, &t2);		/* t0 = t0-t2 */
+	p256FieldMul(&t0, &t0, &Z3);		/* t0 = t0*Z3 */
+	p256FieldAdd(&Y3, &Y3, &t0);		/* Y3 = Y3+t0 */
+	p256FieldMul(&t0, &Y1, &Z1);		/* t0 = Y*Z */
+	p256FieldAdd(&t0, &t0, &t0);		/* t0 = t0+t0 */
+	p256FieldMul(&Z3, &t0, &Z3);		/* Z3 = t0*Z3 */
+	p256FieldSub(&X3, &X3, &Z3);		/* X3 = X3-Z3 */
+	p256FieldMul(&Z3, &t0, &t1);		/* Z3 = t0*t1 */
+	p256FieldAdd(&Z3, &Z3, &Z3);		/* Z3 = Z3+Z3 */
+	p256FieldAdd(&Z3, &Z3, &Z3);		/* Z3 = Z3+Z3 */
+
+	r->x = X3; r->y = Y3; r->z = Z3;
+}
+
+/*
+ * Project (X:Y:Z) to the affine pair (X/Z, Y/Z).  One field
+ * inversion -- the most expensive single op in the scalar mul,
+ * so callers fold it to once-per-result rather than once-per-
+ * ladder-step.  Precondition: Z != 0.  For Z = 0 the Fermat
+ * inversion returns 0 and both outputs come back as 0, which
+ * is not a point; ecmul_p256 tests Z before calling.
+ */
+static void
+p256PointToAffine(P256field *x, P256field *y, P256point *p)
+{
+	P256field zinv;
+
+	p256FieldInv(&zinv, &p->z);
+	p256FieldMul(x, &p->x, &zinv);	/* x = X/Z */
+	p256FieldMul(y, &p->y, &zinv);	/* y = Y/Z */
+}
+
+/*
+ * P-256 scalar multiplication via the Montgomery ladder.
+ * Constant-time wrt the scalar: loop count is fixed at 256 (the
+ * bit length of the group order n, FIPS 186-4 D.1.2.3), every
+ * iteration runs one point add and one point double regardless
+ * of the bit value, and the branch on each scalar bit is
+ * replaced by a bit-mask conditional swap of (R0, R1).
+ *
+ * Identity in this representation is (0:1:0), not (0:0:0): RCB
+ * 2016 Algorithm 4 requires Y != 0 even when Z = 0, otherwise
+ * the formula degenerates.  R0 is initialised to (0:1:0).
+ *
+ * The classic ladder maintains R1 = R0 + P throughout.  Reading
+ * the scalar from MSB to LSB, each step doubles R0 and lets R1
+ * track via R1 = R0 + R1 = R0 + (R0 + P) = 2*R0 + P; if the
+ * current bit is 1 we cswap before the step so that the doubling
+ * absorbs the +P contribution.  After bitlen(n) iterations and
+ * one final unswap, R0 = k*P.
+ */
+
+/*
+ * Constant-time conditional swap of two projective points.
+ * Both selections are built in tmpA/tmpB before either input
+ * is overwritten, because each of a and b is a source for
+ * both results.  swap is treated as boolean by p256FieldCmov:
+ * 0 -> keep, anything else -> swap.
+ */
+static void
+p256PointCswap(P256point *a, P256point *b, int swap)
+{
+	P256point tmpA, tmpB;
+
+	p256FieldCmov(&tmpA.x, &a->x, &b->x, swap);	/* tmpA = swap ? b : a */
+	p256FieldCmov(&tmpA.y, &a->y, &b->y, swap);
+	p256FieldCmov(&tmpA.z, &a->z, &b->z, swap);
+	p256FieldCmov(&tmpB.x, &b->x, &a->x, swap);	/* tmpB = swap ? a : b */
+	p256FieldCmov(&tmpB.y, &b->y, &a->y, swap);
+	p256FieldCmov(&tmpB.z, &b->z, &a->z, swap);
+	*a = tmpA;
+	*b = tmpB;
+}
+
+/* r = k*P via a fixed 256-step Montgomery ladder; r may alias P. */
+static void
+p256ScalarMul(P256point *r, mpint *k, P256point *P)
+{
+	P256point R0, R1;
+	uchar k_be[32];
+	int i, bit, prev_bit, swap;
+
+	/* R0 = identity (0:1:0); R1 = P. */
+	memset(&R0, 0, sizeof R0);
+	R0.y.v[0] = 1;
+	R1 = *P;
+
+	/* k as 32 big-endian bytes, zero-padded on the left; callers
+	 * pass k already reduced mod n, so nothing is clipped. */
+	mptober(k, k_be, sizeof k_be);
+
+	prev_bit = 0;
+	for(i = 255; i >= 0; i--){
+		bit = (k_be[31 - i/8] >> (i & 7)) & 1;
+		swap = prev_bit ^ bit;
+		p256PointCswap(&R0, &R1, swap);
+		prev_bit = bit;
+		p256PointAdd(&R1, &R0, &R1);
+		p256PointDouble(&R0, &R0);
+	}
+	/* Undo the swap still pending from the last scalar bit. */
+	p256PointCswap(&R0, &R1, prev_bit);
+	*r = R0;
+	/* Scrub the scalar bytes and ladder state from the stack. */
+	memset(k_be, 0, sizeof k_be);
+	memset(&R0, 0, sizeof R0);
+	memset(&R1, 0, sizeof R1);
+}
+
+/*
+ * P-256 constant-time scalar multiply: s = k*a.  |k| is reduced
+ * mod dom->n for the ladder and the result is negated when
+ * k < 0.  This is the only non-static symbol in this file.
+ * NOTE(review): only a->x and a->y are read; a->z is ignored,
+ * so a is assumed affine (z nil or 1) -- confirm callers.
+ */
+void
+ecmul_p256(ECdomain *dom, ECpoint *a, mpint *k, ECpoint *s)
+{
+	P256point P, R;
+	P256field rx, ry, zero;
+	mpint *kk, *yneg;
+
+	kk = mpcopy(k);
+	kk->sign = 1;
+	mpmod(kk, dom->n, kk);
+	mpToP256Point(&P, a->x, a->y);
+	p256ScalarMul(&R, kk, &P);
+	mpfree(kk);
+
+	/* Identity from k*P = O.  Non-CT check is fine: the only
+	 * secret is k, and "k*P = O" only fires for k a multiple of
+	 * the group order n; TLS callers feed k in [1, n-1] so the
+	 * leak surface is empty in practice. */
+	memset(&zero, 0, sizeof zero);
+	if(memcmp(&R.z, &zero, sizeof zero) == 0){
+		s->inf = 1;
+		return;
+	}
+	p256PointToAffine(&rx, &ry, &R);
+	s->inf = 0;
+	p256FieldToMp(s->x, &rx);
+	p256FieldToMp(s->y, &ry);
+	if(s->z != nil)
+		mpassign(mpone, s->z);
+	if(k->sign < 0 && mpcmp(s->y, mpzero) != 0){
+		/* -y mod p is p - y for y in (0, p); y = 0 stays 0. */
+		yneg = mpnew(0);
+		mpsub(dom->p, s->y, yneg);
+		mpassign(yneg, s->y);
+		mpfree(yneg);
+	}
+}
--- sys/src/libsec/port/p256test.c
+++ sys/src/libsec/port/p256test.c
@@ -0,0 +1,310 @@
+#include <u.h>
+#include <libc.h>
+#include <mp.h>
+#include <libsec.h>
+
+/*
+ * Regression vectors for the constant-time P-256 path in libsec.
+ * Drives ecmul (via ecdsaverify and a sign+verify round-trip) so
+ * any miscompile or aliasing fault in p256.c surfaces
+ * here, not in TLS handshake debugging.
+ *
+ * Vector sources cited per-row:
+ *   RFC 6979 Appendix A.2.5 -- deterministic ECDSA over P-256;
+ *     given (d, msg, hash) the signature (r, s) is uniquely
+ *     determined and reproducible across implementations.
+ *   FIPS 186-4 Appendix D.1.2.3 -- curve parameters.
+ *
+ * What is exercised:
+ *   1. RFC 6979 verify: each row's (r, s) verifies against the
+ *      public key derived from d.  ecdsaverify drives ecmul on
+ *      the generator and on the public point; the constant-time
+ *      P-256 path receives both.
+ *   2. Sign+verify round-trip: ecdsasign produces (r, s) with a
+ *      random k; the same key set then verifies the result.
+ *      Drives ecmul through ecgen (signing direction) plus
+ *      ecdsaverify.
+ *   3. Negative tests: a perturbed r (r+1) or a bit-flipped digest
+ *      must fail verify.  Catches a permissive ecdsaverify or a
+ *      degenerate ecmul that returns identity for all inputs.
+ *
+ * NIST CAVP SigVer.rsp coverage is intentionally deferred:
+ *   the archive (186-4ecdsatestvectors.zip) is many MB and not
+ *   reproduced here; RFC 6979 vectors plus round-trip cover the
+ *   same ecmul code path with values that can be re-checked from
+ *   the RFC text alone.
+ */
+
+typedef struct EcdsaVector EcdsaVector;
+struct EcdsaVector {
+	char	*name;	/* row label passed to report() */
+	char	*msg;	/* message text; strlen(msg) bytes are hashed */
+	int	(*hash)(uchar *in, ulong inlen, uchar *out);	/* digest adaptor, returns digest length */
+	int	dlen;	/* digest length the adaptor is expected to return */
+	char	*r_hex;	/* expected signature r; parsed by strtomp in base 16 */
+	char	*s_hex;	/* expected signature s; parsed by strtomp in base 16 */
+};
+
+/*
+ * RFC 6979 Appendix A.2.5 shared key pair, kept as hex strings:
+ * the private scalar d and the public point coordinates Qx, Qy.
+ *   d  = C9AFA9D845BA75166B5C215767B1D6934E50C3DB36E89B127B8A622B120F6721
+ *   Qx = 60FED4BA255A9D31C961EB74C6356D68C049B8923B61FA6CE669622E60F29FB6
+ *   Qy = 7903FE1008B8BC99A41AE9E95628BC64F2F1B20C2D7E9F5177A3C294D4462299
+ */
+static char *rfc6979_d  =
+	"C9AFA9D845BA75166B5C215767B1D6934E50C3DB36E89B127B8A622B120F6721";
+static char *rfc6979_Qx =
+	"60FED4BA255A9D31C961EB74C6356D68C049B8923B61FA6CE669622E60F29FB6";
+static char *rfc6979_Qy =
+	"7903FE1008B8BC99A41AE9E95628BC64F2F1B20C2D7E9F5177A3C294D4462299";
+
+/*
+ * Hash adaptors: give every digest the same (in, inlen, out)
+ * shape, returning the digest length, so the table stays narrow.
+ */
+static int
+hash_sha256(uchar *in, ulong inlen, uchar *out)
+{
+	sha2_256(in, inlen, out, nil);	/* SHA-256 of in[0..inlen) goes to out */
+	return SHA2_256dlen;	/* digest length handed back to the caller */
+}
+
+static int	/* SHA-384 adaptor; same shape as the other hash adaptors */
+hash_sha384(uchar *in, ulong inlen, uchar *out)
+{
+	sha2_384(in, inlen, out, nil);	/* SHA-384 of in[0..inlen) goes to out */
+	return SHA2_384dlen;	/* digest length handed back to the caller */
+}
+
+static int	/* SHA-512 adaptor; same shape as the other hash adaptors */
+hash_sha512(uchar *in, ulong inlen, uchar *out)
+{
+	sha2_512(in, inlen, out, nil);	/* SHA-512 of in[0..inlen) goes to out */
+	return SHA2_512dlen;	/* digest length handed back to the caller */
+}
+
+static EcdsaVector rfc6979_vectors[] = {	/* rows: name, msg, hash, dlen, r_hex, s_hex */
+	{
+		"RFC 6979 A.2.5 sample SHA-256",
+		"sample", hash_sha256, SHA2_256dlen,
+		"EFD48B2AACB6A8FD1140DD9CD45E81D69D2C877B56AAF991C34D0EA84EAF3716",	/* r */
+		"F7CB1C942D657C41D436C7A1B6E29F65F3E900DBB9AFF4064DC4AB2F843ACDA8",	/* s */
+	},
+	{
+		"RFC 6979 A.2.5 sample SHA-384",
+		"sample", hash_sha384, SHA2_384dlen,
+		"0EAFEA039B20E9B42309FB1D89E213057CBF973DC0CFC8F129EDDDC800EF7719",	/* r */
+		"4861F0491E6998B9455193E34E7B0D284DDD7149A74B95B9261F13ABDE940954",	/* s */
+	},
+	{
+		"RFC 6979 A.2.5 sample SHA-512",
+		"sample", hash_sha512, SHA2_512dlen,
+		"8496A60B5E9B47C825488827E0495B0E3FA109EC4568FD3F8D1097678EB97F00",	/* r */
+		"2362AB1ADBE2B8ADF9CB9EDAB740EA6049C028114F2460F96554F61FAE3302FE",	/* s */
+	},
+	{
+		"RFC 6979 A.2.5 test SHA-256",
+		"test", hash_sha256, SHA2_256dlen,
+		"F1ABB023518351CD71D881567B1EA663ED3EFCF6C5132B354F28D3B0B7D38367",	/* r */
+		"019F4113742A2B14BD25926B49C649155F267E60D3814B4C0CC84250E46F0083",	/* s */
+	},
+	{
+		"RFC 6979 A.2.5 test SHA-384",
+		"test", hash_sha384, SHA2_384dlen,
+		"83910E8B48BB0C74244EBDF7F07A1C5413D61472BD941EF3920E623FBCCEBEB6",	/* r */
+		"8DDBEC54CF8CD5874883841D712142A56A8D0F218F5003CB0296B6B509619F2C",	/* s */
+	},
+	{
+		"RFC 6979 A.2.5 test SHA-512",
+		"test", hash_sha512, SHA2_512dlen,
+		"461D93F31B6540894788FD206C07CFA0CC35F46FA3C91816FFF1040AD1581A04",	/* r */
+		"39AF9F15DE0DB8D97E72719C74820D304CE5226E32DEDAE67519E840D1194E55",	/* s */
+	},
+};
+
+static int passed, failed;	/* tallies of ok / BAD result lines */
+
+/* Print one result line for name and bump the matching tally. */
+static void
+report(char *name, int ok)
+{
+	int *tally;
+
+	tally = ok ? &passed : &failed;
+	print("  %s: %s\n", name, ok ? "ok" : "BAD");
+	++*tally;
+}
+
+/*
+ * Positive check for one RFC 6979 row: digest the message, load
+ * the expected (r, s), and require ecdsaverify to return 1 for Q.
+ */
+static void
+run_rfc6979(ECdomain *dom, ECpub *Q, EcdsaVector *v)
+{
+	uchar digest[SHA2_512dlen];	/* sized for the widest hash used */
+	mpint *sigr, *sigs;
+	int dlen, good;
+
+	good = 0;
+	dlen = v->hash((uchar*)v->msg, strlen(v->msg), digest);
+	if(dlen == v->dlen){
+		sigr = strtomp(v->r_hex, nil, 16, nil);
+		sigs = strtomp(v->s_hex, nil, 16, nil);
+		good = ecdsaverify(dom, Q, digest, dlen, sigr, sigs) == 1;
+		mpfree(sigr);
+		mpfree(sigs);
+	}
+	/* a digest-length mismatch is reported as a failure */
+	report(v->name, good);
+}
+
+/*
+ * Negative check: replace r with r+1 and require ecdsaverify to
+ * reject the pair.  Catches a permissive verify and a degenerate
+ * ecmul returning the same value for every scalar.  Should
+ * (r+1, s) happen to be a valid signature for this digest the
+ * row would be reported BAD spuriously, but the odds of that
+ * are negligible (on the order of 2^-256).
+ */
+static void
+run_negative_r(ECdomain *dom, ECpub *Q, EcdsaVector *v)
+{
+	uchar digest[SHA2_512dlen];
+	char label[128];
+	mpint *sigr, *sigs;
+	int dlen, accepted;
+
+	snprint(label, sizeof label, "%s tamper-r", v->name);
+	dlen = v->hash((uchar*)v->msg, strlen(v->msg), digest);
+	sigs = strtomp(v->s_hex, nil, 16, nil);
+	sigr = strtomp(v->r_hex, nil, 16, nil);
+	/* r+1 no longer matches the x-coordinate (mod n) that the
+	 * verifier recomputes from u1*G + u2*Q, so the signature
+	 * must be refused. */
+	mpadd(sigr, mpone, sigr);
+	accepted = ecdsaverify(dom, Q, digest, dlen, sigr, sigs);
+	report(label, accepted == 0);
+	mpfree(sigs);
+	mpfree(sigr);
+}
+
+/*
+ * Negative check: flip the low bit of the first digest byte and
+ * require ecdsaverify to reject the untouched (r, s).
+ */
+static void
+run_negative_digest(ECdomain *dom, ECpub *Q, EcdsaVector *v)
+{
+	uchar digest[SHA2_512dlen];
+	char label[128];
+	mpint *sigr, *sigs;
+	int dlen, accepted;
+
+	snprint(label, sizeof label, "%s tamper-digest", v->name);
+	sigr = strtomp(v->r_hex, nil, 16, nil);
+	sigs = strtomp(v->s_hex, nil, 16, nil);
+	dlen = v->hash((uchar*)v->msg, strlen(v->msg), digest);
+	digest[0] ^= 0x01;
+	accepted = ecdsaverify(dom, Q, digest, dlen, sigr, sigs);
+	report(label, accepted == 0);
+	mpfree(sigr);
+	mpfree(sigs);
+}
+
+/*
+ * Round-trip check: sign a per-round digest with priv, then
+ * verify the fresh (r, s) against pub.  Covers ecmul in the
+ * signing direction (ecgen builds k*G via ecmul) as well as in
+ * verification.  Several rounds exercise several random k.
+ */
+static void
+run_round_trip(ECdomain *dom, ECpriv *priv, ECpub *pub, char *tag, int rounds)
+{
+	uchar digest[SHA2_256dlen];
+	char msg[64];
+	char label[128];
+	mpint *sigr, *sigs;
+	int round, good;
+
+	for(round = 0; round < rounds; round++){
+		snprint(label, sizeof label, "%s round-trip #%d", tag, round);
+		snprint(msg, sizeof msg, "round-trip-%d", round);
+		sha2_256((uchar*)msg, strlen(msg), digest, nil);
+		sigr = mpnew(0);
+		sigs = mpnew(0);
+		ecdsasign(dom, priv, digest, SHA2_256dlen, sigr, sigs);
+		good = ecdsaverify(dom, pub, digest, SHA2_256dlen, sigr, sigs) == 1;
+		mpfree(sigr);
+		mpfree(sigs);
+		report(label, good);
+	}
+}
+
+void
+main(int argc, char **argv)
+{
+	ECdomain dom;
+	ECpriv priv;
+	ECpub Q;
+	EcdsaVector *v, *end;
+
+	USED(argc);
+	USED(argv);
+	fmtinstall('B', mpfmt);
+	print("p256test:\n");
+	ecdominit(&dom, secp256r1);
+	end = rfc6979_vectors + nelem(rfc6979_vectors);
+
+	/*
+	 * Public half of the RFC 6979 A.2.5 key pair, in the ECpub
+	 * shape ecdsaverify takes.  z is allocated and set to one
+	 * (affine) so any code path that assumes a populated z
+	 * stays well-defined.
+	 */
+	memset(&Q, 0, sizeof Q);
+	Q.inf = 0;
+	Q.x = mpnew(0);
+	Q.y = mpnew(0);
+	Q.z = mpnew(0);
+	strtomp(rfc6979_Qx, nil, 16, Q.x);
+	strtomp(rfc6979_Qy, nil, 16, Q.y);
+	mpassign(mpone, Q.z);
+	if(!ecpubverify(&dom, &Q)){
+		print("  RFC 6979 public key fails ecpubverify\n");
+		exits("bad pub");
+	}
+
+	/*
+	 * Same pair as an ECpriv for ecdsasign: the public point
+	 * plus the private scalar d.  Anything not set stays zero.
+	 */
+	memset(&priv, 0, sizeof priv);
+	priv.inf = 0;
+	priv.d = mpnew(0);
+	priv.x = mpnew(0);
+	priv.y = mpnew(0);
+	strtomp(rfc6979_d, nil, 16, priv.d);
+	strtomp(rfc6979_Qx, nil, 16, priv.x);
+	strtomp(rfc6979_Qy, nil, 16, priv.y);
+
+	/* Positive rows first, then the two tamper passes. */
+	for(v = rfc6979_vectors; v < end; v++)
+		run_rfc6979(&dom, &Q, v);
+	for(v = rfc6979_vectors; v < end; v++)
+		run_negative_r(&dom, &Q, v);
+	for(v = rfc6979_vectors; v < end; v++)
+		run_negative_digest(&dom, &Q, v);
+	run_round_trip(&dom, &priv, &Q, "RFC 6979 keypair", 5);
+
+	mpfree(Q.x);
+	mpfree(Q.y);
+	mpfree(Q.z);
+	mpfree(priv.x);
+	mpfree(priv.y);
+	mpfree(priv.d);
+	ecdomfree(&dom);
+
+	print("passed: %d/%d\n", passed, passed+failed);
+	exits(failed ? "fail" : nil);
+}
--- sys/src/libsec/port/p256timetest.c
+++ sys/src/libsec/port/p256timetest.c
@@ -0,0 +1,194 @@
+#include <u.h>
+#include <libc.h>
+#include <mp.h>
+#include <libsec.h>
+
+/*
+ * Light side-channel sanity probe for the constant-time P-256
+ * scalar multiplier.  Compares cycle-counter samples between a
+ * fixed scalar (k = 1) and random 256-bit scalars; if the means
+ * agree within 5% (the printed spread is for the operator to
+ * compare), the probe reports no measurable leak at the
+ * resolution of the cycle counter on the host running the test.
+ *
+ * This is the cheap end of the spectrum.  It does not replace a
+ * statistical test such as dudect (Reparaz, Balasch, Verbauwhede,
+ * "Dude, is my code constant time?", DATE 2017): a few thousand
+ * samples cannot bound a sub-cycle leak, and the operator must
+ * still run dudect or an equivalent for a strong CT claim.  It
+ * does catch coarse breakage -- a missing cmov, a bit-dependent
+ * loop bound, a branchful field op -- which is the failure mode
+ * worth ruling out before TLS handshake debugging.
+ *
+ * QEMU caveat: virtualised cycle counters are noisy and biased
+ * by the host scheduler.  An out-of-tolerance result on QEMU is
+ * suggestive but not conclusive; rerun on bare hardware before
+ * declaring a leak.  A clean (within-tolerance) result on QEMU
+ * is the strongest claim this probe can make on its own.
+ *
+ * Driver: ecmul on a P-256 ECdomain.  The dispatch in ecmul
+ * routes to the constant-time ladder via ecmul_p256, so the
+ * timing window covers exactly the production path.  The
+ * ECpoint setup happens once before the timing loop and is not
+ * counted; only the ecmul call is between the cycles() reads.
+ *
+ * Reference: cycles(2) reads the per-CPU cycle counter (TSC on
+ * 386/amd64); resolution is one CPU cycle.
+ */
+
+#define N	1000	/* timed samples kept per group */
+#define WARMUP	16	/* leading iterations per group, discarded, to prime caches */
+
+/* qsort comparator for uvlong cycle counts: negative, zero or
+ * positive as *a is below, equal to or above *b. */
+static int
+ulcmp(void *a, void *b)
+{
+	uvlong lhs, rhs;
+
+	lhs = *(uvlong*)a;
+	rhs = *(uvlong*)b;
+	return (lhs > rhs) - (lhs < rhs);
+}
+
+/*
+ * Time N + WARMUP calls to ecmul on P and store the cycle delta
+ * of each of the last N in out; the first WARMUP deltas are
+ * dropped.  With fixed != 0 every call uses the scalar k = 1,
+ * otherwise genrandom supplies 32 fresh bytes per call.  Only
+ * the ecmul call sits between the two cycles() reads; scalar
+ * setup happens outside the timed window.
+ */
+static void
+measure(int fixed, ECdomain *dom, ECpoint *P, uvlong *out)
+{
+	ECpoint prod;
+	mpint *scalar;
+	uvlong start, stop;
+	uchar raw[32];
+	int iter;
+
+	memset(&prod, 0, sizeof prod);
+	prod.x = mpnew(0);
+	prod.y = mpnew(0);
+	prod.z = mpnew(0);
+	scalar = mpnew(0);
+	for(iter = -(WARMUP); iter < N; iter++){
+		if(!fixed)
+			genrandom(raw, sizeof raw);
+		else{
+			memset(raw, 0, sizeof raw);
+			raw[sizeof raw - 1] = 1;
+		}
+		betomp(raw, sizeof raw, scalar);
+
+		cycles(&start);
+		ecmul(dom, P, scalar, &prod);
+		cycles(&stop);
+
+		if(iter >= 0)
+			out[iter] = stop - start;
+	}
+	mpfree(scalar);
+	mpfree(prod.x);
+	mpfree(prod.y);
+	mpfree(prod.z);
+}
+
+/*
+ * Compute the mean, the Bessel-corrected sample standard
+ * deviation and the median (element N/2 after sorting) of N
+ * samples.  Sorts samples in place, so their order is lost.
+ */
+static void
+stats(uvlong *samples, double *mean, double *stddev, uvlong *median)
+{
+	double sum, avg, sumsq, dev;
+	uvlong *p, *end;
+
+	qsort(samples, N, sizeof samples[0], ulcmp);
+	end = samples + N;
+	sum = 0.0;
+	for(p = samples; p < end; p++)
+		sum += (double)*p;
+	avg = sum / N;
+
+	sumsq = 0.0;
+	for(p = samples; p < end; p++){
+		dev = (double)*p - avg;
+		sumsq += dev * dev;
+	}
+	*median = samples[N/2];
+	*mean = avg;
+	*stddev = sqrt(sumsq / (N - 1));
+}
+
+/*
+ * Time the fixed and the random scalar groups on the P-256
+ * generator, print their statistics, and give a verdict from
+ * the relative difference of the two means (5% tolerance).
+ */
+void
+main(int argc, char **argv)
+{
+	ECdomain dom;
+	ECpoint G;
+	uvlong *fix, *rnd;
+	uvlong fmed, rmed;
+	double fmean, fsd, rmean, rsd, ratio;
+
+	USED(argc); USED(argv);
+	fmtinstall('B', mpfmt);
+	print("p256timetest:\n");
+	print("  N=%d samples per group, WARMUP=%d\n", N, WARMUP);
+	ecdominit(&dom, secp256r1);
+
+	/* Private copy of the generator dom.G with its own mpints,
+	 * so measure() gets a stable pointer for the timed calls. */
+	memset(&G, 0, sizeof G);
+	G.x = mpnew(0);
+	G.y = mpnew(0);
+	G.z = mpnew(0);
+	mpassign(dom.G.x, G.x);
+	mpassign(dom.G.y, G.y);
+	mpassign(mpone, G.z);
+	G.inf = 0;
+
+	fix = mallocz(N * sizeof *fix, 1);
+	rnd = mallocz(N * sizeof *rnd, 1);
+	if(fix == nil || rnd == nil){
+		print("  out of memory\n");
+		exits("nomem");
+	}
+
+	measure(1, &dom, &G, fix);
+	measure(0, &dom, &G, rnd);
+	stats(fix, &fmean, &fsd, &fmed);
+	stats(rnd, &rmean, &rsd, &rmed);
+	print("  fixed   mean=%.0f stddev=%.0f median=%llud\n",
+		fmean, fsd, fmed);
+	print("  random  mean=%.0f stddev=%.0f median=%llud\n",
+		rmean, rsd, rmed);
+
+	ratio = fmean > 0.0 ? (rmean - fmean) / fmean : 0.0;
+	if(ratio < 0.0)
+		ratio = -ratio;
+	print("  ratio   |random-fixed|/fixed = %.4f\n", ratio);
+
+	/* A zero fixed mean makes the ratio a placeholder, not a
+	 * measurement: never let it pass as a clean result. */
+	if(fmean <= 0.0)
+		print("  inconclusive -- no usable cycle counts\n");
+	else if(ratio < 0.05)
+		print("  no measurable leak at this resolution\n");
+	else
+		print("  inconclusive -- repeat on bare hardware\n");
+
+	mpfree(G.x);
+	mpfree(G.y);
+	mpfree(G.z);
+	ecdomfree(&dom);
+	free(fix);
+	free(rnd);
+	exits(nil);
+}
(Return to Plan 9 Home Page)