Skip to content

Commit c4a3a9c

Browse files
vkrasnov authored and bradfitz committed
crypto/elliptic: improve P256 implementation on amd64 a bit
Minor modifications to the optimized amd64 implementation.

* Reduce window size: reduces size of the lookup tables by 40%
* Revised scalar inversion formula, with fewer operations
* Field square function now uses internal loop, saving call overhead

This change will serve as a basis for an arm64 implementation.

Performance results on Skylake MacBook Pro:

pkg:crypto/elliptic goos:darwin goarch:amd64
BaseMultP256 17.8µs ± 1% 17.5µs ± 1% -1.41% (p=0.003 n=10+10)
ScalarMultP256 70.7µs ± 1% 68.9µs ± 2% -2.57% (p=0.000 n=9+9)
pkg:crypto/ecdsa goos:darwin goarch:amd64
SignP256 32.7µs ± 1% 31.4µs ± 1% -3.96% (p=0.000 n=10+8)
VerifyP256 95.1µs ± 1% 93.5µs ± 2% -1.73% (p=0.001 n=10+9)

name old alloc/op new alloc/op delta
pkg:crypto/elliptic goos:darwin goarch:amd64
BaseMultP256 288B ± 0% 288B ± 0% ~ (all equal)
ScalarMultP256 256B ± 0% 256B ± 0% ~ (all equal)
pkg:crypto/ecdsa goos:darwin goarch:amd64
SignP256 2.90kB ± 0% 2.90kB ± 0% ~ (all equal)
VerifyP256 976B ± 0% 976B ± 0% ~ (all equal)

name old allocs/op new allocs/op delta
pkg:crypto/elliptic goos:darwin goarch:amd64
BaseMultP256 6.00 ± 0% 6.00 ± 0% ~ (all equal)
ScalarMultP256 5.00 ± 0% 5.00 ± 0% ~ (all equal)
pkg:crypto/ecdsa goos:darwin goarch:amd64
SignP256 34.0 ± 0% 34.0 ± 0% ~ (all equal)
VerifyP256 17.0 ± 0% 17.0 ± 0% ~ (all equal)

Change-Id: I3f0e2e197a54e7bc7916dedc5dbf085e2c4aea24
Reviewed-on: https://go-review.googlesource.com/99622
Reviewed-by: Vlad Krasnov <vlad@cloudflare.com>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Run-TryBot: Vlad Krasnov <vlad@cloudflare.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
1 parent efa0d1f commit c4a3a9c

File tree

2 files changed

+99
-115
lines changed

2 files changed

+99
-115
lines changed

src/crypto/elliptic/p256_amd64.go

Lines changed: 90 additions & 113 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ type (
3131

3232
var (
3333
p256 p256Curve
34-
p256Precomputed *[37][64 * 8]uint64
34+
p256Precomputed *[43][32 * 8]uint64
3535
precomputeOnce sync.Once
3636
)
3737

@@ -50,14 +50,14 @@ func (curve p256Curve) Params() *CurveParams {
5050
return curve.CurveParams
5151
}
5252

53-
// Functions implemented in p256_asm_amd64.s
53+
// Functions implemented in p256_asm_*64.s
5454
// Montgomery multiplication modulo P256
5555
//go:noescape
5656
func p256Mul(res, in1, in2 []uint64)
5757

58-
// Montgomery square modulo P256
58+
// Montgomery square modulo P256, repeated n times (n >= 1)
5959
//go:noescape
60-
func p256Sqr(res, in []uint64)
60+
func p256Sqr(res, in []uint64, n int)
6161

6262
// Montgomery multiplication by 1
6363
//go:noescape
@@ -121,65 +121,70 @@ func (curve p256Curve) Inverse(k *big.Int) *big.Int {
121121
k = new(big.Int).Mod(k, p256.N)
122122
}
123123

124-
// table will store precomputed powers of x. The four words at index
125-
// 4×i store x^(i+1).
126-
var table [4 * 15]uint64
124+
// table will store precomputed powers of x.
125+
var table [4 * 9]uint64
126+
var (
127+
_1 = table[4*0 : 4*1]
128+
_11 = table[4*1 : 4*2]
129+
_101 = table[4*2 : 4*3]
130+
_111 = table[4*3 : 4*4]
131+
_1111 = table[4*4 : 4*5]
132+
_10101 = table[4*5 : 4*6]
133+
_101111 = table[4*6 : 4*7]
134+
x = table[4*7 : 4*8]
135+
t = table[4*8 : 4*9]
136+
)
127137

128-
x := make([]uint64, 4)
129138
fromBig(x[:], k)
130139
// This code operates in the Montgomery domain where R = 2^256 mod n
131140
// and n is the order of the scalar field. (See initP256 for the
132141
// value.) Elements in the Montgomery domain take the form a×R and
133142
// multiplication of x and y in the domain calculates (x × y × R^-1) mod n. RR
134143
// is R×R mod n, thus the Montgomery multiplication of x and RR gives x×R,
135144
// i.e. converts x into the Montgomery domain.
145+
// Window values borrowed from https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
136146
RR := []uint64{0x83244c95be79eea2, 0x4699799c49bd6fa6, 0x2845b2392b6bec59, 0x66e12d94f3d95620}
137-
p256OrdMul(table[:4], x, RR)
138-
139-
// Prepare the table, no need in constant time access, because the
140-
// power is not a secret. (Entry 0 is never used.)
141-
for i := 2; i < 16; i += 2 {
142-
p256OrdSqr(table[4*(i-1):], table[4*((i/2)-1):], 1)
143-
p256OrdMul(table[4*i:], table[4*(i-1):], table[:4])
144-
}
145-
146-
x[0] = table[4*14+0] // f
147-
x[1] = table[4*14+1]
148-
x[2] = table[4*14+2]
149-
x[3] = table[4*14+3]
150-
151-
p256OrdSqr(x, x, 4)
152-
p256OrdMul(x, x, table[4*14:4*14+4]) // ff
153-
t := make([]uint64, 4, 4)
154-
t[0] = x[0]
155-
t[1] = x[1]
156-
t[2] = x[2]
157-
t[3] = x[3]
158-
159-
p256OrdSqr(x, x, 8)
160-
p256OrdMul(x, x, t) // ffff
161-
t[0] = x[0]
162-
t[1] = x[1]
163-
t[2] = x[2]
164-
t[3] = x[3]
165-
166-
p256OrdSqr(x, x, 16)
167-
p256OrdMul(x, x, t) // ffffffff
168-
t[0] = x[0]
169-
t[1] = x[1]
170-
t[2] = x[2]
171-
t[3] = x[3]
172-
173-
p256OrdSqr(x, x, 64) // ffffffff0000000000000000
174-
p256OrdMul(x, x, t) // ffffffff00000000ffffffff
175-
p256OrdSqr(x, x, 32) // ffffffff00000000ffffffff00000000
176-
p256OrdMul(x, x, t) // ffffffff00000000ffffffffffffffff
177-
178-
// Remaining 32 windows
179-
expLo := [32]byte{0xb, 0xc, 0xe, 0x6, 0xf, 0xa, 0xa, 0xd, 0xa, 0x7, 0x1, 0x7, 0x9, 0xe, 0x8, 0x4, 0xf, 0x3, 0xb, 0x9, 0xc, 0xa, 0xc, 0x2, 0xf, 0xc, 0x6, 0x3, 0x2, 0x5, 0x4, 0xf}
180-
for i := 0; i < 32; i++ {
181-
p256OrdSqr(x, x, 4)
182-
p256OrdMul(x, x, table[4*(expLo[i]-1):])
147+
p256OrdMul(_1, x, RR) // _1
148+
p256OrdSqr(x, _1, 1) // _10
149+
p256OrdMul(_11, x, _1) // _11
150+
p256OrdMul(_101, x, _11) // _101
151+
p256OrdMul(_111, x, _101) // _111
152+
p256OrdSqr(x, _101, 1) // _1010
153+
p256OrdMul(_1111, _101, x) // _1111
154+
155+
p256OrdSqr(t, x, 1) // _10100
156+
p256OrdMul(_10101, t, _1) // _10101
157+
p256OrdSqr(x, _10101, 1) // _101010
158+
p256OrdMul(_101111, _101, x) // _101111
159+
p256OrdMul(x, _10101, x) // _111111 = x6
160+
p256OrdSqr(t, x, 2) // _11111100
161+
p256OrdMul(t, t, _11) // _11111111 = x8
162+
p256OrdSqr(x, t, 8) // _ff00
163+
p256OrdMul(x, x, t) // _ffff = x16
164+
p256OrdSqr(t, x, 16) // _ffff0000
165+
p256OrdMul(t, t, x) // _ffffffff = x32
166+
167+
p256OrdSqr(x, t, 64)
168+
p256OrdMul(x, x, t)
169+
p256OrdSqr(x, x, 32)
170+
p256OrdMul(x, x, t)
171+
172+
sqrs := []uint8{
173+
6, 5, 4, 5, 5,
174+
4, 3, 3, 5, 9,
175+
6, 2, 5, 6, 5,
176+
4, 5, 5, 3, 10,
177+
2, 5, 5, 3, 7, 6}
178+
muls := [][]uint64{
179+
_101111, _111, _11, _1111, _10101,
180+
_101, _101, _101, _111, _101111,
181+
_1111, _1, _1, _1111, _111,
182+
_111, _111, _101, _11, _101111,
183+
_11, _11, _11, _1, _10101, _1111}
184+
185+
for i, s := range sqrs {
186+
p256OrdSqr(x, x, int(s))
187+
p256OrdMul(x, x, muls[i])
183188
}
184189

185190
// Multiplying by one in the Montgomery domain converts a Montgomery
@@ -309,7 +314,7 @@ func (p *p256Point) p256PointToAffine() (x, y *big.Int) {
309314
zInv := make([]uint64, 4)
310315
zInvSq := make([]uint64, 4)
311316
p256Inverse(zInv, p.xyz[8:12])
312-
p256Sqr(zInvSq, zInv)
317+
p256Sqr(zInvSq, zInv, 1)
313318
p256Mul(zInv, zInv, zInvSq)
314319

315320
p256Mul(zInvSq, p.xyz[0:4], zInvSq)
@@ -346,71 +351,43 @@ func p256Inverse(out, in []uint64) {
346351
p16 := stack[4*3 : 4*3+4]
347352
p32 := stack[4*4 : 4*4+4]
348353

349-
p256Sqr(out, in)
354+
p256Sqr(out, in, 1)
350355
p256Mul(p2, out, in) // 3*p
351356

352-
p256Sqr(out, p2)
353-
p256Sqr(out, out)
357+
p256Sqr(out, p2, 2)
354358
p256Mul(p4, out, p2) // f*p
355359

356-
p256Sqr(out, p4)
357-
p256Sqr(out, out)
358-
p256Sqr(out, out)
359-
p256Sqr(out, out)
360+
p256Sqr(out, p4, 4)
360361
p256Mul(p8, out, p4) // ff*p
361362

362-
p256Sqr(out, p8)
363-
364-
for i := 0; i < 7; i++ {
365-
p256Sqr(out, out)
366-
}
363+
p256Sqr(out, p8, 8)
367364
p256Mul(p16, out, p8) // ffff*p
368365

369-
p256Sqr(out, p16)
370-
for i := 0; i < 15; i++ {
371-
p256Sqr(out, out)
372-
}
366+
p256Sqr(out, p16, 16)
373367
p256Mul(p32, out, p16) // ffffffff*p
374368

375-
p256Sqr(out, p32)
376-
377-
for i := 0; i < 31; i++ {
378-
p256Sqr(out, out)
379-
}
369+
p256Sqr(out, p32, 32)
380370
p256Mul(out, out, in)
381371

382-
for i := 0; i < 32*4; i++ {
383-
p256Sqr(out, out)
384-
}
372+
p256Sqr(out, out, 128)
385373
p256Mul(out, out, p32)
386374

387-
for i := 0; i < 32; i++ {
388-
p256Sqr(out, out)
389-
}
375+
p256Sqr(out, out, 32)
390376
p256Mul(out, out, p32)
391377

392-
for i := 0; i < 16; i++ {
393-
p256Sqr(out, out)
394-
}
378+
p256Sqr(out, out, 16)
395379
p256Mul(out, out, p16)
396380

397-
for i := 0; i < 8; i++ {
398-
p256Sqr(out, out)
399-
}
381+
p256Sqr(out, out, 8)
400382
p256Mul(out, out, p8)
401383

402-
p256Sqr(out, out)
403-
p256Sqr(out, out)
404-
p256Sqr(out, out)
405-
p256Sqr(out, out)
384+
p256Sqr(out, out, 4)
406385
p256Mul(out, out, p4)
407386

408-
p256Sqr(out, out)
409-
p256Sqr(out, out)
387+
p256Sqr(out, out, 2)
410388
p256Mul(out, out, p2)
411389

412-
p256Sqr(out, out)
413-
p256Sqr(out, out)
390+
p256Sqr(out, out, 2)
414391
p256Mul(out, out, in)
415392
}
416393

@@ -426,16 +403,16 @@ func boothW5(in uint) (int, int) {
426403
return int(d), int(s & 1)
427404
}
428405

429-
func boothW7(in uint) (int, int) {
430-
var s uint = ^((in >> 7) - 1)
431-
var d uint = (1 << 8) - in - 1
406+
func boothW6(in uint) (int, int) {
407+
var s uint = ^((in >> 6) - 1)
408+
var d uint = (1 << 7) - in - 1
432409
d = (d & s) | (in & (^s))
433410
d = (d >> 1) + (d & 1)
434411
return int(d), int(s & 1)
435412
}
436413

437414
func initTable() {
438-
p256Precomputed = new([37][64 * 8]uint64)
415+
p256Precomputed = new([43][32 * 8]uint64)
439416

440417
basePoint := []uint64{
441418
0x79e730d418a9143c, 0x75ba95fc5fedb601, 0x79fb732b77622510, 0x18905f76a53755c6,
@@ -448,19 +425,19 @@ func initTable() {
448425

449426
zInv := make([]uint64, 4)
450427
zInvSq := make([]uint64, 4)
451-
for j := 0; j < 64; j++ {
428+
for j := 0; j < 32; j++ {
452429
copy(t1, t2)
453-
for i := 0; i < 37; i++ {
454-
// The window size is 7 so we need to double 7 times.
430+
for i := 0; i < 43; i++ {
431+
// The window size is 6 so we need to double 6 times.
455432
if i != 0 {
456-
for k := 0; k < 7; k++ {
433+
for k := 0; k < 6; k++ {
457434
p256PointDoubleAsm(t1, t1)
458435
}
459436
}
460437
// Convert the point to affine form. (Its values are
461438
// still in Montgomery form however.)
462439
p256Inverse(zInv, t1[8:12])
463-
p256Sqr(zInvSq, zInv)
440+
p256Sqr(zInvSq, zInv, 1)
464441
p256Mul(zInv, zInv, zInvSq)
465442

466443
p256Mul(t1[:4], t1[:4], zInvSq)
@@ -481,8 +458,8 @@ func initTable() {
481458
func (p *p256Point) p256BaseMult(scalar []uint64) {
482459
precomputeOnce.Do(initTable)
483460

484-
wvalue := (scalar[0] << 1) & 0xff
485-
sel, sign := boothW7(uint(wvalue))
461+
wvalue := (scalar[0] << 1) & 0x7f
462+
sel, sign := boothW6(uint(wvalue))
486463
p256SelectBase(p.xyz[0:8], p256Precomputed[0][0:], sel)
487464
p256NegCond(p.xyz[4:8], sign)
488465

@@ -499,17 +476,17 @@ func (p *p256Point) p256BaseMult(scalar []uint64) {
499476
t0.xyz[10] = 0xffffffffffffffff
500477
t0.xyz[11] = 0x00000000fffffffe
501478

502-
index := uint(6)
479+
index := uint(5)
503480
zero := sel
504481

505-
for i := 1; i < 37; i++ {
482+
for i := 1; i < 43; i++ {
506483
if index < 192 {
507-
wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0xff
484+
wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x7f
508485
} else {
509-
wvalue = (scalar[index/64] >> (index % 64)) & 0xff
486+
wvalue = (scalar[index/64] >> (index % 64)) & 0x7f
510487
}
511-
index += 7
512-
sel, sign = boothW7(uint(wvalue))
488+
index += 6
489+
sel, sign = boothW6(uint(wvalue))
513490
p256SelectBase(t0.xyz[0:8], p256Precomputed[i][0:], sel)
514491
p256PointAddAffineAsm(p.xyz[0:12], p.xyz[0:12], t0.xyz[0:8], sign, sel, zero)
515492
zero |= sel

src/crypto/elliptic/p256_asm_amd64.s

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -162,10 +162,14 @@ TEXT ·p256NegCond(SB),NOSPLIT,$0
162162

163163
RET
164164
/* ---------------------------------------*/
165-
// func p256Sqr(res, in []uint64)
165+
// func p256Sqr(res, in []uint64, n int)
166166
TEXT ·p256Sqr(SB),NOSPLIT,$0
167167
MOVQ res+0(FP), res_ptr
168168
MOVQ in+24(FP), x_ptr
169+
MOVQ n+48(FP), BX
170+
171+
sqrLoop:
172+
169173
// y[1:] * y[0]
170174
MOVQ (8*0)(x_ptr), t0
171175

@@ -316,6 +320,9 @@ TEXT ·p256Sqr(SB),NOSPLIT,$0
316320
MOVQ acc1, (8*1)(res_ptr)
317321
MOVQ acc2, (8*2)(res_ptr)
318322
MOVQ acc3, (8*3)(res_ptr)
323+
MOVQ res_ptr, x_ptr
324+
DECQ BX
325+
JNE sqrLoop
319326

320327
RET
321328
/* ---------------------------------------*/
@@ -677,7 +684,7 @@ TEXT ·p256SelectBase(SB),NOSPLIT,$0
677684
PXOR X1, X1
678685
PXOR X2, X2
679686
PXOR X3, X3
680-
MOVQ $32, AX
687+
MOVQ $16, AX
681688

682689
MOVOU X15, X13
683690

0 commit comments

Comments
 (0)