--- sys/src/libsec/port/ecc.c
+++ sys/src/libsec/port/ecc.c
@@ -12,6 +12,7 @@ extern void jacobian_add(mpint *p, mpint *a,
mpint *X1, mpint *Y1, mpint *Z1,
mpint *X2, mpint *Y2, mpint *Z2,
mpint *X3, mpint *Y3, mpint *Z3);
+extern void ecmul_p256(ECdomain *dom, ECpoint *a, mpint *k, ECpoint *s);
void
ecassign(ECdomain *dom, ECpoint *a, ECpoint *b)
@@ -70,6 +71,25 @@ ecadd(ECdomain *dom, ECpoint *a, ECpoint *b, ECpoint *
s->inf = mpcmp(s->z, mpzero) == 0;
}
+/*
+ * Return the P-256 field prime as an mpint. The hex literal is
+ * parsed on the first call and cached in a function-local static
+ * afterwards. ecmul compares dom->p against this value (one
+ * mpcmp) to route P-256 domains to ecmul_p256 in p256.c, which
+ * avoids adding a curve-id field to ECdomain.
+ */
+static mpint*
+p256dom_prime(void)
+{
+ static mpint *prime;
+
+ if(prime != nil)
+ return prime;
+ prime = strtomp(
+ "FFFFFFFF00000001000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFF", nil, 16, nil);
+ return prime;
+}
+
void
ecmul(ECdomain *dom, ECpoint *a, mpint *k, ECpoint *s)
{
@@ -78,6 +98,10 @@ ecmul(ECdomain *dom, ECpoint *a, mpint *k, ECpoint *s)
if(a->inf || mpcmp(k, mpzero) == 0){
s->inf = 1;
+ return;
+ }
+ if(mpcmp(dom->p, p256dom_prime()) == 0){
+ ecmul_p256(dom, a, k, s);
return;
}
ns.inf = 1;
--- sys/src/libsec/port/mkfile
+++ sys/src/libsec/port/mkfile
@@ -6,6 +6,7 @@ CFILES = des.c desmodes.c desECB.c desCBC.c des3ECB.c
aes.c aes_gcm.c blowfish.c chacha.c \
curve25519.c curve25519_dh.c\
ecc.c jacobian.c secp256r1.c secp384r1.c\
+ p256.c\
hmac.c md5.c md5block.c md4.c sha1.c sha1block.c\
sha2_64.c sha2_128.c sha2block64.c sha2block128.c\
sha1pickle.c md5pickle.c\
@@ -53,4 +54,10 @@ $O.rsatest: rsatest.$O
mpc $prereq >> $target
$O.rsatest: rsatest.$O
+ $LD -o $target $prereq
+
+$O.p256test: p256test.$O
+ $LD -o $target $prereq
+
+$O.p256timetest: p256timetest.$O
$LD -o $target $prereq
--- sys/src/libsec/port/p256.c
+++ sys/src/libsec/port/p256.c
@@ -0,0 +1,806 @@
+#include <u.h>
+#include <libc.h>
+#include <mp.h>
+#include <libsec.h>
+
+/*
+ * P-256 (secp256r1, FIPS 186-4 D.1.2.3) constant-time field
+ * arithmetic and scalar multiplication. Used by ecmul on the
+ * P-256 curve to give a side-channel-resistant scalar mul for
+ * ECDHE and ECDSA signing. ecmul dispatches on dom->p alone, so
+ * public-input callers such as verify take this path as well.
+ *
+ * Limb layout: 8 little-endian uint limbs (v[0] = LSB), 32 bytes
+ * per field element. Same width on 386 and amd64 since Plan 9
+ * keeps int at 32 bits on both; an aliased v64[4] view is exposed
+ * for an ASM accelerator that may layer atop this code. All
+ * field/point ops use v[].
+ *
+ * All field ops run in time independent of operand value. Point
+ * Add and Double use the complete formulas of Renes-Costello-
+ * Batina, "Complete addition formulas for prime order elliptic
+ * curves", EUROCRYPT 2016, eprint 2015/1060: Algorithm 4 for the
+ * addition (P-256 has a = -3, appendix A.2) and Algorithm 6 for
+ * the doubling. Scalar multiplication is the Montgomery ladder
+ * (SEC 1 v2.0 Section 3.2; Joye-Yen "The Montgomery powering
+ * ladder", CHES 2002), constant-time wrt the scalar.
+ *
+ * P-256 prime from FIPS 186-4 Section D.1.2.3 / RFC 5480
+ * Section 2.1.1.1: p = 2^256 - 2^224 + 2^192 + 2^96 - 1.
+ * Curve parameter b is from FIPS 186-4 Appendix D.1.2.3.
+ *
+ * The single non-static entry point is ecmul_p256, called from
+ * ecc.c's ecmul dispatch when dom->p matches the P-256 prime.
+ */
+
+typedef struct P256field P256field;
+typedef struct P256point P256point;
+
+struct P256field {
+ union {
+ uint v[8]; /* 32-bit limbs, v[0] least significant; used by every op in this file */
+ uvlong v64[4]; /* same 32 bytes viewed as 64-bit words; not referenced in this file */
+ };
+};
+
+struct P256point {
+ P256field x; /* X */
+ P256field y; /* Y; the ladder's identity is (0:1:0) */
+ P256field z; /* homogeneous projective; z=0 is identity */
+};
+
+/* p = 2^256 - 2^224 + 2^192 + 2^96 - 1 as 32 big-endian bytes; parsed by mpToBE32 */
+static uchar p256_p_be[32] = {
+ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+};
+
+/* same prime as 8 little-endian 32-bit limbs; read by p256FieldAdd and p256FieldSub */
+static const uint p256_p[8] = {
+ 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000001, 0xffffffff,
+};
+
+/*
+ * Curve coefficient b, read by p256PointAdd and p256PointDouble:
+ * b = 5ac635d8 aa3a93e7 b3ebbd55 769886bc
+ * 651d06b0 cc53b0f6 3bce3c3e 27d2604b
+ * Stored as 8 little-endian uint limbs (v[0] = LSB), i.e. the
+ * 32-bit words above in reverse order. Cross-check against secp256r1.c.
+ */
+static P256field p256_b = { {
+ 0x27d2604b, 0x3bce3c3e, 0xcc53b0f6, 0x651d06b0,
+ 0x769886bc, 0xb3ebbd55, 0xaa3a93e7, 0x5ac635d8,
+} };
+
+/*
+ * mp <-> P256field/P256point boundary conversion. Big-endian
+ * byte order at the mp interface; little-endian-limb (v[0]
+ * holds the least-significant 32 bits) inside.
+ */
+
+/*
+ * Store (in mod p) into buf as 32 big-endian bytes. The prime
+ * is rebuilt from p256_p_be on every call and freed before return.
+ */
+static void
+mpToBE32(uchar *buf, mpint *in)
+{
+ mpint *prime, *res;
+
+ res = mpnew(0);
+ prime = betomp(p256_p_be, sizeof p256_p_be, nil);
+ mpmod(in, prime, res);
+ mpfree(prime);
+ mptober(res, buf, 32);
+ mpfree(res);
+}
+
+static void
+mpToP256Field(P256field *out, mpint *in)
+{
+ uchar be[32], *q;
+ int limb;
+
+ mpToBE32(be, in);
+ for(limb = 0; limb < 8; limb++){
+ q = &be[28 - 4*limb]; /* limb 0 comes from the last four bytes */
+ out->v[limb] = ((uint)q[0]<<24) | ((uint)q[1]<<16) |
+ ((uint)q[2]<<8) | (uint)q[3];
+ }
+}
+
+static void
+p256FieldToMp(mpint *out, P256field *in)
+{
+ uchar be[32], *q;
+ uint w;
+ int limb;
+
+ for(limb = 0; limb < 8; limb++){
+ w = in->v[limb];
+ q = &be[28 - 4*limb]; /* limb 0 fills the last four bytes */
+ q[3] = w; q[2] = w >> 8; q[1] = w >> 16; q[0] = w >> 24;
+ }
+ betomp(be, sizeof be, out);
+}
+
+static void
+mpToP256Point(P256point *out, mpint *x, mpint *y)
+{
+ memset(&out->z, 0, sizeof out->z);
+ out->z.v[0] = 1; /* input is taken as affine, so Z = 1 */
+ mpToP256Field(&out->x, x);
+ mpToP256Field(&out->y, y);
+}
+
+/*
+ * P-256 field operations. Additive ops (Add/Sub/Neg/Cmov) use
+ * full-length carry-chain loops; multiplicative ops (Mul/Sqr/
+ * Inv) use 8x8 schoolbook with NIST Solinas reduction and a
+ * Fermat addition chain for inversion. No branches or table
+ * lookups depend on input limbs; final reductions select via
+ * precomputed masks.
+ *
+ * Carry and borrow bits are pulled from the high half of a
+ * 64-bit uvlong accumulator: a 32-bit add or subtract cannot
+ * overflow 64 bits, so the carry-out is exactly the low bit of
+ * (acc >> 32).
+ */
+
+static void /* r = (a + b) mod p; r is written last, so it may alias a or b */
+p256FieldAdd(P256field *r, P256field *a, P256field *b)
+{
+ uvlong acc;
+ uint s[8], t[8], carry, borrow, mask, m;
+ int i;
+
+ /* s = a + b mod 2^256; carry holds the 257th bit. */
+ carry = 0;
+ for(i = 0; i < 8; i++){
+ acc = (uvlong)a->v[i] + b->v[i] + carry;
+ s[i] = (uint)acc;
+ carry = (uint)(acc >> 32);
+ }
+
+ /* t = s - p mod 2^256; borrow=1 iff s < p as a 256-bit int. */
+ borrow = 0;
+ for(i = 0; i < 8; i++){
+ acc = (uvlong)s[i] - p256_p[i] - borrow;
+ t[i] = (uint)acc;
+ borrow = (uint)(acc >> 32) & 1;
+ }
+
+ /* Pick s when carry=0 and borrow=1 (s fits in 256 bits and
+ * is less than p); pick t otherwise. Only one subtraction of
+ * p is made, so a canonical result needs a, b < p (a+b < 2p). */
+ m = borrow & (carry ^ 1);
+ mask = -m; /* all ones when s is kept, zero when t is kept */
+ for(i = 0; i < 8; i++)
+ r->v[i] = (s[i] & mask) | (t[i] & ~mask);
+}
+
+static void /* r = (a - b) mod p; r is written last, so it may alias a or b */
+p256FieldSub(P256field *r, P256field *a, P256field *b)
+{
+ uvlong acc;
+ uint t[8], s[8], carry, borrow, mask;
+ int i;
+
+ /* t = a - b mod 2^256; borrow=1 iff true value is negative. */
+ borrow = 0;
+ for(i = 0; i < 8; i++){
+ acc = (uvlong)a->v[i] - b->v[i] - borrow;
+ t[i] = (uint)acc;
+ borrow = (uint)(acc >> 32) & 1;
+ }
+
+ /* s = t + p mod 2^256. When t underflowed (borrow=1) the
+ * mathematical t is a-b+2^256, so s = a-b+2^256+p mod 2^256
+ * = a-b+p, the correct non-negative residue. */
+ carry = 0;
+ for(i = 0; i < 8; i++){
+ acc = (uvlong)t[i] + p256_p[i] + carry;
+ s[i] = (uint)acc;
+ carry = (uint)(acc >> 32); /* carry out of limb 7 is discarded */
+ }
+
+ /* Pick s when borrow=1, else t (in [0, p) when a, b are). */
+ mask = -borrow;
+ for(i = 0; i < 8; i++)
+ r->v[i] = (s[i] & mask) | (t[i] & ~mask);
+}
+
+static void
+p256FieldNeg(P256field *r, P256field *a)
+{
+ P256field fzero; /* additive identity */
+
+ memset(&fzero, 0, sizeof fzero);
+ p256FieldSub(r, &fzero, a); /* r = 0 - a mod p */
+}
+
+static void /* r = cond ? b : a, selected by mask rather than by a branch */
+p256FieldCmov(P256field *r, P256field *a, P256field *b, int cond)
+{
+ uint c, nz, mask;
+ int i;
+
+ /* Collapse any nonzero cond to all-1s without branching:
+ * for unsigned 32-bit c, (c | -c) has bit 31 set iff c != 0;
+ * shifting that down to bit 0 and negating gives the full
+ * mask. The behaviour matches the C ternary cond ? b : a. */
+ c = (uint)cond;
+ nz = (c | (0u - c)) >> 31; /* 1 when cond != 0, else 0 */
+ mask = -nz; /* 0xffffffff selects b, 0 selects a */
+ for(i = 0; i < 8; i++)
+ r->v[i] = (a->v[i] & ~mask) | (b->v[i] & mask);
+}
+
+/*
+ * 8x8 schoolbook multiply: T[0..15] = a[0..7] * b[0..7] as a
+ * 16-limb 32-bit-LE bignum. T limbs are uvlong-typed but each
+ * stays in [0, 2^32) at the boundaries; the inner accumulator
+ * carries into the next limb. Inputs are canonical (each limb
+ * < 2^32; full value < 2^256), so the partial-product sum stays
+ * below 2^512 and every accumulator step fits in 64 bits.
+ */
+static void /* T[0..15] = a[0..7] * b[0..7], one 32-bit limb per uvlong slot */
+p256_mul_8x8(uvlong T[16], uint *a, uint *b)
+{
+ uvlong acc;
+ int i, j;
+
+ for(i = 0; i < 16; i++)
+ T[i] = 0;
+ for(i = 0; i < 8; i++){ /* row i adds a[i] * b shifted up by i limbs */
+ acc = 0;
+ for(j = 0; j < 8; j++){
+ acc += T[i+j] + (uvlong)a[i] * b[j]; /* at most 2^64 - 1: no overflow */
+ T[i+j] = acc & 0xffffffff;
+ acc >>= 32; /* carry into the next limb */
+ }
+ T[i+8] = acc; /* first write to this slot; earlier rows stop at i+7 */
+ }
+}
+
+/*
+ * NIST P-256 Solinas reduction. Folds the upper 8 limbs of T
+ * (T[8..15]) into the lower 8 using the identity
+ * 2^256 = 2^224 - 2^192 - 2^96 + 1 (mod p).
+ *
+ * Recipe (FIPS 186-4 Appendix D / Solinas 1999): write nine
+ * 8-limb words S1..S9 from selected T limbs, then compute
+ * R = S1 + 2*S2 + 2*S3 + S4 + S5 - S6 - S7 - S8 - S9 (mod p).
+ *
+ * S1 = (T0, T1, T2, T3, T4, T5, T6, T7)
+ * S2 = (0, 0, 0, T11, T12, T13, T14, T15)
+ * S3 = (0, 0, 0, T12, T13, T14, T15, 0 )
+ * S4 = (T8, T9, T10, 0, 0, 0, T14, T15)
+ * S5 = (T9, T10, T11, T13, T14, T15, T13, T8 )
+ * S6 = (T11, T12, T13, 0, 0, 0, T8, T10)
+ * S7 = (T12, T13, T14, T15, 0, 0, T9, T11)
+ * S8 = (T13, T14, T15, T8, T9, T10, 0, T12)
+ * S9 = (T14, T15, 0, T9, T10, T11, 0, T13)
+ *
+ * pos = S1 + 2*S2 + 2*S3 + S4 + S5 is below 7*2^256 and neg =
+ * S6+S7+S8+S9 is below 4*2^256, so each fits in 9 limbs. 4*p
+ * is about 2^226 short of 4*2^256, so the bound on neg alone
+ * does not prove pos + 4*p - neg >= 0; that leans on T10..T13,
+ * which dominate neg, also entering pos through S2, S3 and S5.
+ * The biased value is below 11*2^256 < 12*p, and a fixed run of
+ * conditional subtractions of p brings it into [0, p).
+ */
+
+/* Add weight * t into the 9-limb accumulator buf, where t is an
+ * 8-limb little-endian value and weight is 1 or 2 at every call
+ * site. Limbs 0..7 of buf are left below 2^32; the carry out of
+ * limb 7 is added into buf[8]. */
+static void
+add_term(uvlong buf[9], uint t[8], int weight)
+{
+ uvlong acc;
+ int i;
+
+ acc = 0;
+ for(i = 0; i < 8; i++){
+ acc += buf[i] + (uvlong)t[i] * weight;
+ buf[i] = acc & 0xffffffff;
+ acc >>= 32;
+ }
+ buf[8] += acc; /* top limb collects the overflow past 2^256 */
+}
+
+/* Add 4*p into a 9-limb buffer (used to bias diff non-negative
+ * before final reductions). 4*p = (2^258 - 2^226 + 2^194 +
+ * 2^98 - 4); represented as 8 32-bit limbs plus a top limb
+ * holding bits 256..257. */
+static void
+add_4p(uvlong buf[9])
+{
+ /* fourp is p shifted left by two bits. p as 32-bit LE limbs
+ * is (ffffffff, ffffffff, ffffffff, 00000000, 00000000,
+ * 00000000, 00000001, ffffffff). Each limb contributes its
+ * low 30 bits shifted up by two, plus the top two bits of
+ * the limb below it: limb 0 becomes fffffffc, limbs 1-2
+ * stay ffffffff, limb 3 picks up 3 from limb 2, limb 6
+ * becomes 4, limb 7 becomes fffffffc, and the ninth limb
+ * holds the two bits shifted out of limb 7 (3). Any carry
+ * out of buf[8] is dropped. */
+ static const uvlong fourp[9] = {
+ 0xfffffffcULL, 0xffffffffULL, 0xffffffffULL, 0x00000003ULL,
+ 0x00000000ULL, 0x00000000ULL, 0x00000004ULL, 0xfffffffcULL,
+ 0x00000003ULL,
+ };
+ uvlong acc;
+ int i;
+
+ acc = 0;
+ for(i = 0; i < 9; i++){
+ acc += buf[i] + fourp[i];
+ buf[i] = acc & 0xffffffff;
+ acc >>= 32;
+ }
+}
+
+/* sub_buf9: out = a - b over 9 limbs; assumes every limb of a
+ * and b is below 2^32. Returns the borrow out of limb 8: 0 when
+ * a >= b, 1 when the result wrapped. p256FieldMul biases a so
+ * the borrow is expected to be 0 and discards the return value. */
+static uint
+sub_buf9(uvlong out[9], uvlong a[9], uvlong b[9])
+{
+ uvlong acc;
+ uint borrow;
+ int i;
+
+ borrow = 0;
+ for(i = 0; i < 9; i++){
+ acc = a[i] - b[i];
+ acc -= borrow;
+ out[i] = acc & 0xffffffff;
+ borrow = (uint)(acc >> 32) & 1; /* bit 32 is set iff the limb went negative */
+ }
+ return borrow;
+}
+
+/* Reduce the 9-limb value buf modulo p and store the low 8 limbs
+ * in r. Each round subtracts p (as a 9-limb value with top limb
+ * 0) into scratch unconditionally, then mask-selects scratch when
+ * no borrow occurred (buf >= p) and buf otherwise, so a round
+ * lowers buf by at most p. The result is in [0, p) provided buf
+ * entered below (rounds+1)*p. buf is overwritten. */
+static void
+final_reduce(uint r[8], uvlong buf[9], int rounds)
+{
+ uvlong scratch[9];
+ uvlong acc;
+ uint borrow, mask;
+ int i, k;
+ static const uvlong p9[9] = {
+ 0xffffffffULL, 0xffffffffULL, 0xffffffffULL, 0x00000000ULL,
+ 0x00000000ULL, 0x00000000ULL, 0x00000001ULL, 0xffffffffULL,
+ 0x00000000ULL,
+ };
+
+ for(k = 0; k < rounds; k++){
+ borrow = 0;
+ for(i = 0; i < 9; i++){
+ acc = buf[i] - p9[i];
+ acc -= borrow;
+ scratch[i] = acc & 0xffffffff;
+ borrow = (uint)(acc >> 32) & 1;
+ }
+ /* If buf >= p, scratch holds the smaller representative
+ * and borrow=0. If buf < p, borrow=1 and we keep buf.
+ * mask = 0xffffffff when borrow=0 (take scratch). */
+ mask = -(borrow ^ 1);
+ for(i = 0; i < 9; i++)
+ buf[i] = (scratch[i] & mask) | (buf[i] & ~mask); /* limbs stay below 2^32 */
+ }
+ for(i = 0; i < 8; i++)
+ r[i] = (uint)buf[i]; /* buf[8] is not copied out */
+}
+
+static void /* r = a * b mod p; r is written only by final_reduce, so it may alias a or b */
+p256FieldMul(P256field *r, P256field *a, P256field *b)
+{
+ uvlong T[16];
+ uvlong pos[9], neg[9], diff[9];
+ uint S1[8], S2[8], S3[8], S4[8], S5[8];
+ uint S6[8], S7[8], S8[8], S9[8];
+ uint t[16];
+ int i;
+
+ /* 8x8 schoolbook product into 16 limbs. */
+ p256_mul_8x8(T, a->v, b->v);
+ for(i = 0; i < 16; i++)
+ t[i] = (uint)T[i];
+
+ /* Build the nine S vectors per the recipe above. */
+ S1[0]=t[0]; S1[1]=t[1]; S1[2]=t[2]; S1[3]=t[3];
+ S1[4]=t[4]; S1[5]=t[5]; S1[6]=t[6]; S1[7]=t[7];
+
+ S2[0]=0; S2[1]=0; S2[2]=0; S2[3]=t[11];
+ S2[4]=t[12]; S2[5]=t[13]; S2[6]=t[14]; S2[7]=t[15];
+
+ S3[0]=0; S3[1]=0; S3[2]=0; S3[3]=t[12];
+ S3[4]=t[13]; S3[5]=t[14]; S3[6]=t[15]; S3[7]=0;
+
+ S4[0]=t[8]; S4[1]=t[9]; S4[2]=t[10]; S4[3]=0;
+ S4[4]=0; S4[5]=0; S4[6]=t[14]; S4[7]=t[15];
+
+ S5[0]=t[9]; S5[1]=t[10]; S5[2]=t[11]; S5[3]=t[13];
+ S5[4]=t[14]; S5[5]=t[15]; S5[6]=t[13]; S5[7]=t[8];
+
+ S6[0]=t[11]; S6[1]=t[12]; S6[2]=t[13]; S6[3]=0;
+ S6[4]=0; S6[5]=0; S6[6]=t[8]; S6[7]=t[10];
+
+ S7[0]=t[12]; S7[1]=t[13]; S7[2]=t[14]; S7[3]=t[15];
+ S7[4]=0; S7[5]=0; S7[6]=t[9]; S7[7]=t[11];
+
+ S8[0]=t[13]; S8[1]=t[14]; S8[2]=t[15]; S8[3]=t[8];
+ S8[4]=t[9]; S8[5]=t[10]; S8[6]=0; S8[7]=t[12];
+
+ S9[0]=t[14]; S9[1]=t[15]; S9[2]=0; S9[3]=t[9];
+ S9[4]=t[10]; S9[5]=t[11]; S9[6]=0; S9[7]=t[13];
+
+ /* pos = S1 + 2*S2 + 2*S3 + S4 + S5 in a 9-limb accumulator. */
+ for(i = 0; i < 9; i++)
+ pos[i] = 0;
+ add_term(pos, S1, 1);
+ add_term(pos, S2, 2);
+ add_term(pos, S3, 2);
+ add_term(pos, S4, 1);
+ add_term(pos, S5, 1);
+
+ /* neg = S6 + S7 + S8 + S9. */
+ for(i = 0; i < 9; i++)
+ neg[i] = 0;
+ add_term(neg, S6, 1);
+ add_term(neg, S7, 1);
+ add_term(neg, S8, 1);
+ add_term(neg, S9, 1);
+
+ /* Bias by 4*p so pos - neg is non-negative. NOTE(review): 4*p
+ * < 4*2^256, so this leans on T10..T13 feeding both pos and neg. */
+ add_4p(pos);
+
+ /* diff = pos - neg (pos now includes 4p); below 11*2^256 < 12*p. */
+ (void)sub_buf9(diff, pos, neg);
+
+ /* Each round subtracts p at most once, so 12 rounds reduce
+ * any value below 13*p into [0, p); the bound above needs
+ * 11, leaving one round of slack. */
+ final_reduce(r->v, diff, 12);
+}
+
+static void /* r = a^2 mod p; r may alias a, as with p256FieldMul */
+p256FieldSqr(P256field *r, P256field *a)
+{
+ /* BUGlet: squaring delegates to Mul; a dedicated symmetric
+ * kernel that exploits a*b == b*a in the partial-product
+ * matrix would save ~25% on this hot path. Out of scope
+ * for the initial constant-time landing. */
+ p256FieldMul(r, a, a);
+}
+
+/*
+ * Inversion via Fermat's little theorem: a^(p-2) = a^-1 mod p.
+ * The exponent p-2 = 2^256 - 2^224 + 2^192 + 2^96 - 3 is fixed,
+ * so a windowed addition chain runs in time independent of the
+ * operand value. Addition chain follows Bernstein 2014: build
+ * windows e_I = a^(2^I - 1) and combine them to land on the bit-
+ * pattern of p-2. Total: 287 squarings + 13 multiplications.
+ * Annotations show the exponent of a each variable holds.
+ */
+static void
+p256FieldInv(P256field *r, P256field *a)
+{
+ P256field ftmp, ftmp2, e2, e4, e8, e16, e32, e64;
+ int i;
+
+ p256FieldSqr(&ftmp, a); /* 2^1 */
+ p256FieldMul(&ftmp, &ftmp, a); /* 2^2 - 2^0 */
+ e2 = ftmp;
+ p256FieldSqr(&ftmp, &ftmp); /* 2^3 - 2^1 */
+ p256FieldSqr(&ftmp, &ftmp); /* 2^4 - 2^2 */
+ p256FieldMul(&ftmp, &ftmp, &e2); /* 2^4 - 2^0 */
+ e4 = ftmp;
+ p256FieldSqr(&ftmp, &ftmp); /* 2^5 - 2^1 */
+ p256FieldSqr(&ftmp, &ftmp); /* 2^6 - 2^2 */
+ p256FieldSqr(&ftmp, &ftmp); /* 2^7 - 2^3 */
+ p256FieldSqr(&ftmp, &ftmp); /* 2^8 - 2^4 */
+ p256FieldMul(&ftmp, &ftmp, &e4); /* 2^8 - 2^0 */
+ e8 = ftmp;
+ for(i = 0; i < 8; i++)
+ p256FieldSqr(&ftmp, &ftmp); /* 2^16 - 2^8 */
+ p256FieldMul(&ftmp, &ftmp, &e8); /* 2^16 - 2^0 */
+ e16 = ftmp;
+ for(i = 0; i < 16; i++)
+ p256FieldSqr(&ftmp, &ftmp); /* 2^32 - 2^16 */
+ p256FieldMul(&ftmp, &ftmp, &e16); /* 2^32 - 2^0 */
+ e32 = ftmp;
+ for(i = 0; i < 32; i++)
+ p256FieldSqr(&ftmp, &ftmp); /* 2^64 - 2^32 */
+ e64 = ftmp; /* holds 2^64 - 2^32, not 2^64 - 1 */
+ p256FieldMul(&ftmp, &ftmp, a); /* 2^64 - 2^32 + 2^0 */
+ for(i = 0; i < 192; i++)
+ p256FieldSqr(&ftmp, &ftmp);
+ /* ftmp = 2^256 - 2^224 + 2^192 */
+
+ p256FieldMul(&ftmp2, &e64, &e32); /* 2^64 - 2^0 */
+ for(i = 0; i < 16; i++)
+ p256FieldSqr(&ftmp2, &ftmp2); /* 2^80 - 2^16 */
+ p256FieldMul(&ftmp2, &ftmp2, &e16); /* 2^80 - 2^0 */
+ for(i = 0; i < 8; i++)
+ p256FieldSqr(&ftmp2, &ftmp2); /* 2^88 - 2^8 */
+ p256FieldMul(&ftmp2, &ftmp2, &e8); /* 2^88 - 2^0 */
+ for(i = 0; i < 4; i++)
+ p256FieldSqr(&ftmp2, &ftmp2); /* 2^92 - 2^4 */
+ p256FieldMul(&ftmp2, &ftmp2, &e4); /* 2^92 - 2^0 */
+ p256FieldSqr(&ftmp2, &ftmp2); /* 2^93 - 2^1 */
+ p256FieldSqr(&ftmp2, &ftmp2); /* 2^94 - 2^2 */
+ p256FieldMul(&ftmp2, &ftmp2, &e2); /* 2^94 - 2^0 */
+ p256FieldSqr(&ftmp2, &ftmp2); /* 2^95 - 2^1 */
+ p256FieldSqr(&ftmp2, &ftmp2); /* 2^96 - 2^2 */
+ p256FieldMul(&ftmp2, &ftmp2, a); /* 2^96 - 3 */
+
+ p256FieldMul(r, &ftmp2, &ftmp); /* the only write to r, so r may alias a */
+ /* r = 2^256 - 2^224 + 2^192 + 2^96 - 3 = p - 2 */
+}
+
+/*
+ * P-256 point arithmetic in homogeneous projective coordinates
+ * (X:Y:Z) with affine map x = X/Z, y = Y/Z; identity is Z = 0.
+ * All routines run in time independent of operand value: every
+ * field op executes regardless of input, and identity / doubling
+ * / inverse-pair cases emerge from the algebra without a branch.
+ */
+
+static void /* r = a + b in homogeneous projective coordinates */
+p256PointAdd(P256point *r, P256point *a, P256point *b)
+{
+ P256field X1, Y1, Z1, X2, Y2, Z2, X3, Y3, Z3;
+ P256field t0, t1, t2, t3, t4;
+
+ X1 = a->x; Y1 = a->y; Z1 = a->z; /* inputs are copied, so r may alias a or b */
+ X2 = b->x; Y2 = b->y; Z2 = b->z;
+
+ /* RCB 2016 Algorithm 4, a = -3. Each line is one field
+ * op; the entire 43-step sequence runs unconditionally. */
+ p256FieldMul(&t0, &X1, &X2); /* t0 = X1*X2 */
+ p256FieldMul(&t1, &Y1, &Y2); /* t1 = Y1*Y2 */
+ p256FieldMul(&t2, &Z1, &Z2); /* t2 = Z1*Z2 */
+ p256FieldAdd(&t3, &X1, &Y1); /* t3 = X1+Y1 */
+ p256FieldAdd(&t4, &X2, &Y2); /* t4 = X2+Y2 */
+ p256FieldMul(&t3, &t3, &t4); /* t3 = t3*t4 */
+ p256FieldAdd(&t4, &t0, &t1); /* t4 = t0+t1 */
+ p256FieldSub(&t3, &t3, &t4); /* t3 = t3-t4 */
+ p256FieldAdd(&t4, &Y1, &Z1); /* t4 = Y1+Z1 */
+ p256FieldAdd(&X3, &Y2, &Z2); /* X3 = Y2+Z2 */
+ p256FieldMul(&t4, &t4, &X3); /* t4 = t4*X3 */
+ p256FieldAdd(&X3, &t1, &t2); /* X3 = t1+t2 */
+ p256FieldSub(&t4, &t4, &X3); /* t4 = t4-X3 */
+ p256FieldAdd(&X3, &X1, &Z1); /* X3 = X1+Z1 */
+ p256FieldAdd(&Y3, &X2, &Z2); /* Y3 = X2+Z2 */
+ p256FieldMul(&X3, &X3, &Y3); /* X3 = X3*Y3 */
+ p256FieldAdd(&Y3, &t0, &t2); /* Y3 = t0+t2 */
+ p256FieldSub(&Y3, &X3, &Y3); /* Y3 = X3-Y3 */
+ p256FieldMul(&Z3, &p256_b, &t2); /* Z3 = b*t2 */
+ p256FieldSub(&X3, &Y3, &Z3); /* X3 = Y3-Z3 */
+ p256FieldAdd(&Z3, &X3, &X3); /* Z3 = X3+X3 */
+ p256FieldAdd(&X3, &X3, &Z3); /* X3 = X3+Z3 */
+ p256FieldSub(&Z3, &t1, &X3); /* Z3 = t1-X3 */
+ p256FieldAdd(&X3, &t1, &X3); /* X3 = t1+X3 */
+ p256FieldMul(&Y3, &p256_b, &Y3); /* Y3 = b*Y3 */
+ p256FieldAdd(&t1, &t2, &t2); /* t1 = t2+t2 */
+ p256FieldAdd(&t2, &t1, &t2); /* t2 = t1+t2 */
+ p256FieldSub(&Y3, &Y3, &t2); /* Y3 = Y3-t2 */
+ p256FieldSub(&Y3, &Y3, &t0); /* Y3 = Y3-t0 */
+ p256FieldAdd(&t1, &Y3, &Y3); /* t1 = Y3+Y3 */
+ p256FieldAdd(&Y3, &t1, &Y3); /* Y3 = t1+Y3 */
+ p256FieldAdd(&t1, &t0, &t0); /* t1 = t0+t0 */
+ p256FieldAdd(&t0, &t1, &t0); /* t0 = t1+t0 */
+ p256FieldSub(&t0, &t0, &t2); /* t0 = t0-t2 */
+ p256FieldMul(&t1, &t4, &Y3); /* t1 = t4*Y3 */
+ p256FieldMul(&t2, &t0, &Y3); /* t2 = t0*Y3 */
+ p256FieldMul(&Y3, &X3, &Z3); /* Y3 = X3*Z3 */
+ p256FieldAdd(&Y3, &Y3, &t2); /* Y3 = Y3+t2 */
+ p256FieldMul(&X3, &t3, &X3); /* X3 = t3*X3 */
+ p256FieldSub(&X3, &X3, &t1); /* X3 = X3-t1 */
+ p256FieldMul(&Z3, &t4, &Z3); /* Z3 = t4*Z3 */
+ p256FieldMul(&t1, &t3, &t0); /* t1 = t3*t0 */
+ p256FieldAdd(&Z3, &Z3, &t1); /* Z3 = Z3+t1 */
+
+ r->x = X3; r->y = Y3; r->z = Z3;
+}
+
+static void /* r = 2*a in homogeneous projective coordinates */
+p256PointDouble(P256point *r, P256point *a)
+{
+ P256field X1, Y1, Z1, X3, Y3, Z3;
+ P256field t0, t1, t2, t3;
+
+ X1 = a->x; Y1 = a->y; Z1 = a->z; /* input is copied, so r may alias a */
+
+ /* RCB 2016 Algorithm 6, a = -3. Exception-free doubling; 34 field ops. */
+ p256FieldSqr(&t0, &X1); /* t0 = X^2 */
+ p256FieldSqr(&t1, &Y1); /* t1 = Y^2 */
+ p256FieldSqr(&t2, &Z1); /* t2 = Z^2 */
+ p256FieldMul(&t3, &X1, &Y1); /* t3 = X*Y */
+ p256FieldAdd(&t3, &t3, &t3); /* t3 = t3+t3 */
+ p256FieldMul(&Z3, &X1, &Z1); /* Z3 = X*Z */
+ p256FieldAdd(&Z3, &Z3, &Z3); /* Z3 = Z3+Z3 */
+ p256FieldMul(&Y3, &p256_b, &t2); /* Y3 = b*t2 */
+ p256FieldSub(&Y3, &Y3, &Z3); /* Y3 = Y3-Z3 */
+ p256FieldAdd(&X3, &Y3, &Y3); /* X3 = Y3+Y3 */
+ p256FieldAdd(&Y3, &X3, &Y3); /* Y3 = X3+Y3 */
+ p256FieldSub(&X3, &t1, &Y3); /* X3 = t1-Y3 */
+ p256FieldAdd(&Y3, &t1, &Y3); /* Y3 = t1+Y3 */
+ p256FieldMul(&Y3, &X3, &Y3); /* Y3 = X3*Y3 */
+ p256FieldMul(&X3, &X3, &t3); /* X3 = X3*t3 */
+ p256FieldAdd(&t3, &t2, &t2); /* t3 = t2+t2 */
+ p256FieldAdd(&t2, &t2, &t3); /* t2 = t2+t3 */
+ p256FieldMul(&Z3, &p256_b, &Z3); /* Z3 = b*Z3 */
+ p256FieldSub(&Z3, &Z3, &t2); /* Z3 = Z3-t2 */
+ p256FieldSub(&Z3, &Z3, &t0); /* Z3 = Z3-t0 */
+ p256FieldAdd(&t3, &Z3, &Z3); /* t3 = Z3+Z3 */
+ p256FieldAdd(&Z3, &Z3, &t3); /* Z3 = Z3+t3 */
+ p256FieldAdd(&t3, &t0, &t0); /* t3 = t0+t0 */
+ p256FieldAdd(&t0, &t3, &t0); /* t0 = t3+t0 */
+ p256FieldSub(&t0, &t0, &t2); /* t0 = t0-t2 */
+ p256FieldMul(&t0, &t0, &Z3); /* t0 = t0*Z3 */
+ p256FieldAdd(&Y3, &Y3, &t0); /* Y3 = Y3+t0 */
+ p256FieldMul(&t0, &Y1, &Z1); /* t0 = Y*Z */
+ p256FieldAdd(&t0, &t0, &t0); /* t0 = t0+t0 */
+ p256FieldMul(&Z3, &t0, &Z3); /* Z3 = t0*Z3 */
+ p256FieldSub(&X3, &X3, &Z3); /* X3 = X3-Z3 */
+ p256FieldMul(&Z3, &t0, &t1); /* Z3 = t0*t1 */
+ p256FieldAdd(&Z3, &Z3, &Z3); /* Z3 = Z3+Z3 */
+ p256FieldAdd(&Z3, &Z3, &Z3); /* Z3 = Z3+Z3 */
+
+ r->x = X3; r->y = Y3; r->z = Z3;
+}
+
+/*
+ * Project (X:Y:Z) to the affine pair (X/Z, Y/Z). One field
+ * inversion -- the most expensive single op in the scalar mul,
+ * so callers fold it to once-per-result rather than once-per-
+ * ladder-step. Precondition: Z != 0; calling with the identity
+ * yields meaningless output (the caller is expected to detect
+ * Z = 0 and reject before reaching us).
+ */
+static void /* (*x, *y) = (X/Z, Y/Z) for the projective point p */
+p256PointToAffine(P256field *x, P256field *y, P256point *p)
+{
+ P256field zinv;
+
+ p256FieldInv(&zinv, &p->z); /* zinv = Z^(p-2), i.e. 1/Z when Z != 0 */
+ p256FieldMul(x, &p->x, &zinv);
+ p256FieldMul(y, &p->y, &zinv);
+}
+
+/*
+ * P-256 scalar multiplication via the Montgomery ladder.
+ * Constant-time wrt the scalar: loop count is fixed at 256 (the
+ * bit length of the group order n, FIPS 186-4 D.1.2.3), every
+ * iteration runs one point add and one point double regardless
+ * of the bit value, and the branch on each scalar bit is
+ * replaced by a bit-mask conditional swap of (R0, R1).
+ *
+ * Identity in this representation is (0:1:0), not (0:0:0): RCB
+ * 2016 Algorithm 4 requires Y != 0 even when Z = 0, otherwise
+ * the formula degenerates. R0 is initialised to (0:1:0).
+ *
+ * The classic ladder maintains R1 = R0 + P throughout. Reading
+ * the scalar from MSB to LSB, each step doubles R0 and lets R1
+ * track via R1 = R0 + R1 = R0 + (R0 + P) = 2*R0 + P; if the
+ * current bit is 1 we cswap before the step so that the doubling
+ * absorbs the +P contribution. After bitlen(n) iterations and
+ * one final unswap, R0 = k*P.
+ */
+
+/*
+ * Constant-time conditional swap of two projective points.
+ * Writes the result through tmpA/tmpB so the field cmovs see
+ * stable inputs even though one source aliases an output (we
+ * read a->x before writing it). swap is treated as boolean by
+ * p256FieldCmov: 0 -> keep, anything else -> swap.
+ */
+static void /* exchange *a and *b when swap != 0; same operations either way */
+p256PointCswap(P256point *a, P256point *b, int swap)
+{
+ P256point tmpA, tmpB;
+
+ p256FieldCmov(&tmpA.x, &a->x, &b->x, swap); /* tmpA = swap ? b : a */
+ p256FieldCmov(&tmpA.y, &a->y, &b->y, swap);
+ p256FieldCmov(&tmpA.z, &a->z, &b->z, swap);
+ p256FieldCmov(&tmpB.x, &b->x, &a->x, swap); /* tmpB = swap ? a : b */
+ p256FieldCmov(&tmpB.y, &b->y, &a->y, swap);
+ p256FieldCmov(&tmpB.z, &b->z, &a->z, swap);
+ *a = tmpA; /* a and b are written only after both selections are complete */
+ *b = tmpB;
+}
+
+static void /* r = k*P via a 256-step ladder; expects 0 <= k < 2^256 */
+p256ScalarMul(P256point *r, mpint *k, P256point *P)
+{
+ P256point R0, R1;
+ uchar k_be[32];
+ int i, bit, prev_bit, swap;
+
+ /* R0 = identity (0:1:0); R1 = P. */
+ memset(&R0, 0, sizeof R0);
+ R0.y.v[0] = 1;
+ R1 = *P;
+
+ /* Serialize k as 32 big-endian bytes. mptober left-pads
+ * with zeros if k is shorter than 32 bytes; if k were ever
+ * larger than 2^256 the high bytes would be clipped, but
+ * callers are required to feed us scalars already reduced
+ * mod n (n < 2^256). */
+ mptober(k, k_be, sizeof k_be);
+
+ prev_bit = 0;
+ for(i = 255; i >= 0; i--){
+ bit = (k_be[31 - i/8] >> (i & 7)) & 1; /* bit i of k; k_be[0] holds bits 255..248 */
+ swap = prev_bit ^ bit; /* swap only when the bit differs from the last one */
+ p256PointCswap(&R0, &R1, swap);
+ prev_bit = bit;
+ p256PointAdd(&R1, &R0, &R1); /* R1 = R0 + R1 */
+ p256PointDouble(&R0, &R0); /* R0 = 2*R0 */
+ }
+ /* If the final scalar bit was 1, R0 and R1 are swapped from
+ * what the invariant requires; undo. */
+ p256PointCswap(&R0, &R1, prev_bit);
+
+ *r = R0; /* still projective; the caller converts to affine */
+}
+
+/*
+ * s = k*a on P-256, constant-time in k; called from ecmul in ecc.c.
+ * a is affine (z nil or 1) or Jacobian (X/Z^2, Y/Z^3), as ecadd leaves it.
+ */
+void
+ecmul_p256(ECdomain *dom, ECpoint *a, mpint *k, ECpoint *s)
+{
+ P256point P, R;
+ P256field rx, ry, zero, zi, zi2;
+ mpint *kk, *yneg;
+
+ kk = mpcopy(k);
+ kk->sign = 1; /* multiply by |k| mod n; the sign is applied to y at the end */
+ mpmod(kk, dom->n, kk);
+ mpToP256Point(&P, a->x, a->y);
+ if(a->z != nil && mpcmp(a->z, mpone) != 0){
+ mpToP256Field(&zi, a->z); /* Jacobian input: the ladder needs Z = 1 */
+ p256FieldInv(&zi, &zi); /* 1/Z */
+ p256FieldSqr(&zi2, &zi); /* 1/Z^2 */
+ p256FieldMul(&P.x, &P.x, &zi2);
+ p256FieldMul(&zi2, &zi2, &zi); /* 1/Z^3 */
+ p256FieldMul(&P.y, &P.y, &zi2);
+ }
+ p256ScalarMul(&R, kk, &P);
+ mpfree(kk);
+ /* R is O only when n divides k, so this branch leaks nothing for k in [1, n-1]. */
+ memset(&zero, 0, sizeof zero);
+ if(memcmp(&R.z, &zero, sizeof zero) == 0){
+ s->inf = 1;
+ return;
+ }
+ p256PointToAffine(&rx, &ry, &R);
+ s->inf = 0;
+ p256FieldToMp(s->x, &rx);
+ p256FieldToMp(s->y, &ry);
+ if(s->z != nil)
+ mpassign(mpone, s->z); /* the result is affine */
+ if(k->sign < 0){
+ yneg = mpnew(0);
+ mpsub(dom->p, s->y, yneg); /* (-y) mod p == p - y for y in (0, p) */
+ mpassign(yneg, s->y);
+ mpfree(yneg);
+ }
+}
--- sys/src/libsec/port/p256test.c
+++ sys/src/libsec/port/p256test.c
@@ -0,0 +1,310 @@
+#include <u.h>
+#include <libc.h>
+#include <mp.h>
+#include <libsec.h>
+
+/*
+ * Regression vectors for the constant-time P-256 path in libsec.
+ * Drives ecmul (via ecdsaverify and a sign+verify round-trip) so
+ * any miscompile or aliasing fault in p256.c surfaces
+ * here, not in TLS handshake debugging.
+ *
+ * Vector sources cited per-row:
+ * RFC 6979 Appendix A.2.5 -- deterministic ECDSA over P-256;
+ * given (d, msg, hash) the signature (r, s) is uniquely
+ * determined and reproducible across implementations.
+ * FIPS 186-4 Appendix D.1.2.3 -- curve parameters.
+ *
+ * What is exercised:
+ * 1. RFC 6979 verify: each row's (r, s) verifies against the
+ * public key derived from d. ecdsaverify drives ecmul on
+ * the generator and on the public point; the constant-time
+ * P-256 path receives both.
+ * 2. Sign+verify round-trip: ecdsasign produces (r, s) with a
+ * random k; the same key set then verifies the result.
+ * Drives ecmul through ecgen (signing direction) plus
+ * ecdsaverify.
+ * 3. Negative tests: a tampered r or a tampered digest must
+ * fail verify. Catches a permissive ecdsaverify or a
+ * degenerate ecmul that returns identity for all inputs.
+ *
+ * NIST CAVP SigVer.rsp coverage is intentionally deferred:
+ * the archive (186-4ecdsatestvectors.zip) is many MB and not
+ * reproduced here; RFC 6979 vectors plus round-trip cover the
+ * same ecmul code path with values that can be re-checked from
+ * the RFC text alone.
+ */
+
+typedef struct EcdsaVector EcdsaVector;
+struct EcdsaVector {
+ char *name;	/* label printed by report() */
+ char *msg;	/* message text that gets hashed */
+ int (*hash)(uchar *in, ulong inlen, uchar *out);	/* digest adaptor; returns the digest length */
+ int dlen;	/* digest length the adaptor is expected to return */
+ char *r_hex;	/* signature component r, hex, parsed with strtomp */
+ char *s_hex;	/* signature component s, hex, parsed with strtomp */
+};
+
+/*
+ * RFC 6979 Appendix A.2.5 key pair, shared by every vector
+ * below: private scalar d and public point (Qx, Qy), in hex.
+ * d = C9AFA9D845BA75166B5C215767B1D6934E50C3DB36E89B127B8A622B120F6721
+ * Qx = 60FED4BA255A9D31C961EB74C6356D68C049B8923B61FA6CE669622E60F29FB6
+ * Qy = 7903FE1008B8BC99A41AE9E95628BC64F2F1B20C2D7E9F5177A3C294D4462299
+ */
+static char *rfc6979_d =
+ "C9AFA9D845BA75166B5C215767B1D6934E50C3DB36E89B127B8A622B120F6721";
+static char *rfc6979_Qx =
+ "60FED4BA255A9D31C961EB74C6356D68C049B8923B61FA6CE669622E60F29FB6";
+static char *rfc6979_Qy =
+ "7903FE1008B8BC99A41AE9E95628BC64F2F1B20C2D7E9F5177A3C294D4462299";
+
+/*
+ * Hash adaptors: wrap each digest function in a uniform
+ * (in, inlen, out) -> digest-length shape for the vector table.
+ */
+static int
+hash_sha256(uchar *in, ulong inlen, uchar *out)
+{
+ sha2_256(in, inlen, out, nil);
+ return SHA2_256dlen;
+}
+
+static int
+hash_sha384(uchar *in, ulong inlen, uchar *out)
+{
+ sha2_384(in, inlen, out, nil);
+ return SHA2_384dlen;
+}
+
+static int
+hash_sha512(uchar *in, ulong inlen, uchar *out)
+{
+ sha2_512(in, inlen, out, nil);
+ return SHA2_512dlen;
+}
+
+static EcdsaVector rfc6979_vectors[] = {	/* row fields: name, msg, hash, dlen, r_hex, s_hex */
+ {
+ "RFC 6979 A.2.5 sample SHA-256",
+ "sample", hash_sha256, SHA2_256dlen,
+ "EFD48B2AACB6A8FD1140DD9CD45E81D69D2C877B56AAF991C34D0EA84EAF3716",
+ "F7CB1C942D657C41D436C7A1B6E29F65F3E900DBB9AFF4064DC4AB2F843ACDA8",
+ },
+ {
+ "RFC 6979 A.2.5 sample SHA-384",
+ "sample", hash_sha384, SHA2_384dlen,
+ "0EAFEA039B20E9B42309FB1D89E213057CBF973DC0CFC8F129EDDDC800EF7719",
+ "4861F0491E6998B9455193E34E7B0D284DDD7149A74B95B9261F13ABDE940954",
+ },
+ {
+ "RFC 6979 A.2.5 sample SHA-512",
+ "sample", hash_sha512, SHA2_512dlen,
+ "8496A60B5E9B47C825488827E0495B0E3FA109EC4568FD3F8D1097678EB97F00",
+ "2362AB1ADBE2B8ADF9CB9EDAB740EA6049C028114F2460F96554F61FAE3302FE",
+ },
+ {
+ "RFC 6979 A.2.5 test SHA-256",
+ "test", hash_sha256, SHA2_256dlen,
+ "F1ABB023518351CD71D881567B1EA663ED3EFCF6C5132B354F28D3B0B7D38367",
+ "019F4113742A2B14BD25926B49C649155F267E60D3814B4C0CC84250E46F0083",
+ },
+ {
+ "RFC 6979 A.2.5 test SHA-384",
+ "test", hash_sha384, SHA2_384dlen,
+ "83910E8B48BB0C74244EBDF7F07A1C5413D61472BD941EF3920E623FBCCEBEB6",
+ "8DDBEC54CF8CD5874883841D712142A56A8D0F218F5003CB0296B6B509619F2C",
+ },
+ {
+ "RFC 6979 A.2.5 test SHA-512",
+ "test", hash_sha512, SHA2_512dlen,
+ "461D93F31B6540894788FD206C07CFA0CC35F46FA3C91816FFF1040AD1581A04",
+ "39AF9F15DE0DB8D97E72719C74820D304CE5226E32DEDAE67519E840D1194E55",
+ },
+};
+
+/* Running tallies: report() bumps them, main() prints the total. */
+static int passed;
+static int failed;
+
+/* Log one check's verdict and fold it into the tallies. */
+static void
+report(char *name, int ok)
+{
+	print(" %s: %s\n", name, ok ? "ok" : "BAD");
+	passed += ok != 0;
+	failed += ok == 0;
+}
+
+/*
+ * Check one RFC 6979 row: digest msg, load the expected (r, s)
+ * and require ecdsaverify to accept them under public key Q.
+ */
+static void
+run_rfc6979(ECdomain *dom, ECpub *Q, EcdsaVector *v)
+{
+	uchar digest[SHA2_512dlen];	/* sized for the widest hash in the table */
+	mpint *sigr, *sigs;
+	int len;
+
+	len = v->hash((uchar*)v->msg, strlen(v->msg), digest);
+	if(len != v->dlen){
+		/* adaptor and table disagree on the digest size */
+		report(v->name, 0);
+		return;
+	}
+	sigr = strtomp(v->r_hex, nil, 16, nil);
+	sigs = strtomp(v->s_hex, nil, 16, nil);
+	report(v->name, ecdsaverify(dom, Q, digest, len, sigr, sigs) == 1);
+	mpfree(sigr);
+	mpfree(sigs);
+}
+
+/*
+ * An altered r must be rejected. This catches a permissive
+ * verify and a degenerate ecmul that yields the same point for
+ * every scalar. r is replaced by r+1; were r+1 also a valid
+ * signature the check would spuriously fail, but against a
+ * random valid pair that has probability about 2^-256.
+ */
+static void
+run_negative_r(ECdomain *dom, ECpub *Q, EcdsaVector *v)
+{
+	uchar dig[SHA2_512dlen];
+	mpint *r, *s;
+	char name[128];
+	int n, ok;
+
+	snprint(name, sizeof name, "%s tamper-r", v->name);
+	n = v->hash((uchar*)v->msg, strlen(v->msg), dig);
+	r = strtomp(v->r_hex, nil, 16, nil);
+	s = strtomp(v->s_hex, nil, 16, nil);
+	/* r+1 changes the verify equation u1*G + u2*Q = R, so the
+	 * recomputed R.x mod n no longer matches. A digest-length
+	 * mismatch would make the rejection vacuous: count it BAD. */
+	mpadd(r, mpone, r);
+	ok = n == v->dlen && ecdsaverify(dom, Q, dig, n, r, s) == 0;
+	report(name, ok);
+	mpfree(r);
+	mpfree(s);
+}
+
+/*
+ * An altered digest must be rejected: flipping one bit of dig
+ * changes E, so the original (r, s) no longer verifies. A
+ * digest-length mismatch is reported BAD, as in run_rfc6979.
+ */
+static void
+run_negative_digest(ECdomain *dom, ECpub *Q, EcdsaVector *v)
+{
+	uchar dig[SHA2_512dlen];
+	mpint *r, *s;
+	char name[128];
+	int n, ok;
+
+	snprint(name, sizeof name, "%s tamper-digest", v->name);
+	n = v->hash((uchar*)v->msg, strlen(v->msg), dig);
+	dig[0] ^= 0x01;	/* flip the low bit of the first digest byte */
+	r = strtomp(v->r_hex, nil, 16, nil);
+	s = strtomp(v->s_hex, nil, 16, nil);
+	ok = n == v->dlen && ecdsaverify(dom, Q, dig, n, r, s) == 0;
+	report(name, ok);
+	mpfree(r);
+	mpfree(s);
+}
+
+/*
+ * Sign-then-verify loop: each round hashes a distinct label,
+ * signs the digest with priv and requires pub to accept the
+ * resulting (r, s). Signing reaches ecmul through ecgen's k*G,
+ * verifying reaches it as well; the rounds exercise several
+ * random k.
+ */
+static void
+run_round_trip(ECdomain *dom, ECpriv *priv, ECpub *pub, char *tag, int rounds)
+{
+	char label[128], text[64];
+	uchar digest[SHA2_256dlen];
+	mpint *sigr, *sigs;
+	int iter;
+
+	for(iter = 0; iter < rounds; iter++){
+		snprint(text, sizeof text, "round-trip-%d", iter);
+		sha2_256((uchar*)text, strlen(text), digest, nil);
+		snprint(label, sizeof label, "%s round-trip #%d", tag, iter);
+
+		sigr = mpnew(0);
+		sigs = mpnew(0);
+		ecdsasign(dom, priv, digest, SHA2_256dlen, sigr, sigs);
+		report(label, ecdsaverify(dom, pub, digest, SHA2_256dlen, sigr, sigs) == 1);
+		mpfree(sigr);
+		mpfree(sigs);
+	}
+}
+
+void
+main(int argc, char **argv)
+{
+	EcdsaVector *v, *end;
+	ECdomain dom;
+	ECpriv priv;
+	ECpub Q;
+
+	USED(argc); USED(argv);
+	fmtinstall('B', mpfmt);
+
+	print("p256test:\n");
+
+	ecdominit(&dom, secp256r1);
+
+	/*
+	 * Load the RFC 6979 A.2.5 key pair. priv holds d plus the
+	 * public coordinates and leaves z nil; Q is the same
+	 * point as an ECpub (a typedef of ECpoint) with z
+	 * allocated and set to one, so code that assumes a
+	 * populated z stays well-defined.
+	 */
+	memset(&priv, 0, sizeof priv);
+	priv.inf = 0;
+	priv.d = strtomp(rfc6979_d, nil, 16, nil);
+	priv.x = strtomp(rfc6979_Qx, nil, 16, nil);
+	priv.y = strtomp(rfc6979_Qy, nil, 16, nil);
+
+	memset(&Q, 0, sizeof Q);
+	Q.inf = 0;
+	Q.x = strtomp(rfc6979_Qx, nil, 16, nil);
+	Q.y = strtomp(rfc6979_Qy, nil, 16, nil);
+	Q.z = mpnew(0);
+	mpassign(mpone, Q.z);
+
+	/* every row depends on Q: give up at once if it is invalid */
+	if(!ecpubverify(&dom, &Q)){
+		print(" RFC 6979 public key fails ecpubverify\n");
+		exits("bad pub");
+	}
+
+	/* all positive rows, then all tamper-r, then all tamper-digest */
+	end = rfc6979_vectors + nelem(rfc6979_vectors);
+	for(v = rfc6979_vectors; v < end; v++)
+		run_rfc6979(&dom, &Q, v);
+
+	for(v = rfc6979_vectors; v < end; v++)
+		run_negative_r(&dom, &Q, v);
+
+	for(v = rfc6979_vectors; v < end; v++)
+		run_negative_digest(&dom, &Q, v);
+
+	/* five sign+verify rounds under the same key pair */
+	run_round_trip(&dom, &priv, &Q, "RFC 6979 keypair", 5);
+
+	mpfree(Q.z);
+	mpfree(Q.y);
+	mpfree(Q.x);
+	mpfree(priv.d);
+	mpfree(priv.y);
+	mpfree(priv.x);
+	ecdomfree(&dom);
+
+	/* a non-nil exit status signals at least one BAD check */
+	print("passed: %d/%d\n", passed, passed+failed);
+	exits(failed ? "fail" : nil);
+}
--- sys/src/libsec/port/p256timetest.c
+++ sys/src/libsec/port/p256timetest.c
@@ -0,0 +1,194 @@
+#include <u.h>
+#include <libc.h>
+#include <mp.h>
+#include <libsec.h>
+
+/*
+ * Light side-channel sanity probe for the constant-time P-256
+ * scalar multiplier. Compares cycle-counter samples between a
+ * fixed scalar (k = 1) and uniformly-random 256-bit scalars; if
+ * the means agree within 5% (the spread is printed but not
+ * tested), the ladder shows no timing signal at the resolution
+ * of the cycle counter on the host running the test.
+ *
+ * This is the cheap end of the spectrum. It does not replace a
+ * statistical test such as dudect (Reparaz, Balasch, Verbauwhede,
+ * "Dude, is my code constant time?", DATE 2017): a few thousand
+ * samples cannot bound a sub-cycle leak, and the operator must
+ * still run dudect or an equivalent for a strong CT claim. It
+ * does catch coarse breakage -- a missing cmov, a bit-dependent
+ * loop bound, a branchful field op -- which is the failure mode
+ * worth ruling out before TLS handshake debugging.
+ *
+ * QEMU caveat: virtualised cycle counters are noisy and biased
+ * by the host scheduler. A negative result here on QEMU is
+ * suggestive but not conclusive; rerun on bare hardware before
+ * declaring a leak. A clean (within-tolerance) result on QEMU
+ * is the strongest claim this probe can make on its own.
+ *
+ * Driver: ecmul on a P-256 ECdomain. The dispatch in ecmul
+ * routes to the constant-time ladder via ecmul_p256, so the
+ * timing window covers exactly the production path. The
+ * ECpoint setup happens once before the timing loop and is not
+ * counted; only the ecmul call is between the cycles() reads.
+ *
+ * Reference: cycles(2) reads the per-CPU cycle counter (TSC on
+ * 386/amd64); resolution is one CPU cycle.
+ */
+
+#define N 1000 /* samples recorded per group */
+#define WARMUP 16 /* leading iterations per group whose samples are discarded */
+
+static int
+ulcmp(void *a, void *b)
+{
+	uvlong lhs, rhs;
+
+	/* qsort comparator for uvlong samples; compares instead of
+	 * subtracting, since a uvlong difference need not fit an int */
+	lhs = *(uvlong*)a;
+	rhs = *(uvlong*)b;
+	return (lhs > rhs) - (lhs < rhs);
+}
+
+/*
+ * Time N + WARMUP calls of ecmul(dom, P, k, &R) and store the
+ * cycle deltas of the last N in out; the first WARMUP deltas
+ * are dropped. With fixed != 0 every call uses k = 1, else k
+ * is 32 bytes from genrandom, redrawn per call. Only ecmul
+ * sits between the two cycles() reads; building k does not.
+ * P is shared by all calls and R receives each result.
+ */
+static void
+measure(int fixed, ECdomain *dom, ECpoint *P, uvlong *out)
+{
+	uchar kbytes[32];
+	uvlong start, stop;
+	ECpoint R;
+	mpint *k;
+	int iter;
+
+	memset(&R, 0, sizeof R);
+	R.x = mpnew(0);
+	R.y = mpnew(0);
+	R.z = mpnew(0);
+	k = mpnew(0);
+	for(iter = -WARMUP; iter < N; iter++){
+		if(!fixed)
+			genrandom(kbytes, sizeof kbytes);
+		else{
+			memset(kbytes, 0, sizeof kbytes);
+			kbytes[sizeof kbytes - 1] = 1;
+		}
+		betomp(kbytes, sizeof kbytes, k);
+
+		cycles(&start);
+		ecmul(dom, P, k, &R);
+		cycles(&stop);
+
+		if(iter >= 0)
+			out[iter] = stop - start;
+	}
+	mpfree(k);
+	mpfree(R.x);
+	mpfree(R.y);
+	mpfree(R.z);
+}
+
+/*
+ * Summarise N cycle counts: mean, sample standard deviation
+ * (divisor N-1) and the element at index N/2 once sorted. The
+ * array is sorted in place, so its original order is lost.
+ */
+static void
+stats(uvlong *samples, double *mean, double *stddev, uvlong *median)
+{
+	double avg, sumsq, dev;
+	uvlong *sp, *end = samples + N;
+
+	qsort(samples, N, sizeof samples[0], ulcmp);
+	*median = samples[N/2];
+
+	avg = 0.0;
+	for(sp = samples; sp < end; sp++)
+		avg += (double)*sp;
+	avg /= N;
+	*mean = avg;
+
+	sumsq = 0.0;
+	for(sp = samples; sp < end; sp++){
+		dev = (double)*sp - avg;
+		sumsq += dev * dev;
+	}
+	*stddev = sqrt(sumsq / (N - 1));
+}
+
+/* Time both groups, print their summaries and a coarse verdict. */
+void
+main(int argc, char **argv)
+{
+	ECdomain dom;
+	ECpoint G;
+	uvlong *fix, *rnd;
+	uvlong fmed, rmed;
+	double fmean, fsd, rmean, rsd, ratio;
+
+	USED(argc); USED(argv);
+	fmtinstall('B', mpfmt);
+
+	print("p256timetest:\n");
+	print(" N=%d samples per group, WARMUP=%d\n", N, WARMUP);
+
+	ecdominit(&dom, secp256r1);
+
+	/* G is already populated as dom.G by ecdominit; copy it into
+	 * a local ECpoint with its own mpints so measure() can pass
+	 * a stable pointer through the timing window. */
+	memset(&G, 0, sizeof G);
+	G.x = mpnew(0);
+	G.y = mpnew(0);
+	G.z = mpnew(0);
+	mpassign(dom.G.x, G.x);
+	mpassign(dom.G.y, G.y);
+	mpassign(mpone, G.z);
+	G.inf = 0;
+
+	fix = mallocz(N * sizeof *fix, 1);
+	rnd = mallocz(N * sizeof *rnd, 1);
+	if(fix == nil || rnd == nil){
+		print(" out of memory\n");
+		exits("nomem");
+	}
+
+	measure(1, &dom, &G, fix);
+	measure(0, &dom, &G, rnd);
+
+	stats(fix, &fmean, &fsd, &fmed);
+	stats(rnd, &rmean, &rsd, &rmed);
+
+	print(" fixed mean=%.0f stddev=%.0f median=%llud\n",
+		fmean, fsd, fmed);
+	print(" random mean=%.0f stddev=%.0f median=%llud\n",
+		rmean, rsd, rmed);
+
+	/* A zero fixed-group mean means no cycles were counted (e.g.
+	 * cycles() storing 0): print ratio 0 but never call it clean. */
+	ratio = fmean > 0.0 ? (rmean - fmean) / fmean : 0.0;
+	if(ratio < 0.0)
+		ratio = -ratio;
+	print(" ratio |random-fixed|/fixed = %.4f\n", ratio);
+
+	if(fmean > 0.0 && ratio < 0.05)
+		print(" no measurable leak at this resolution\n");
+	else
+		print(" inconclusive -- repeat on bare hardware\n");
+
+	mpfree(G.x);
+	mpfree(G.y);
+	mpfree(G.z);
+	ecdomfree(&dom);
+
+	free(fix);
+	free(rnd);
+	exits(nil);
+}
|