31
31
32
32
var (
33
33
p256 p256Curve
34
- p256Precomputed * [37 ][ 64 * 8 ]uint64
34
+ p256Precomputed * [43 ][ 32 * 8 ]uint64
35
35
precomputeOnce sync.Once
36
36
)
37
37
@@ -50,14 +50,14 @@ func (curve p256Curve) Params() *CurveParams {
50
50
return curve .CurveParams
51
51
}
52
52
53
- // Functions implemented in p256_asm_amd64 .s
53
+ // Functions implemented in p256_asm_*64 .s
54
54
// Montgomery multiplication modulo P256
55
55
//go:noescape
56
56
func p256Mul (res , in1 , in2 []uint64 )
57
57
58
- // Montgomery square modulo P256
58
+ // Montgomery square modulo P256, repeated n times (n >= 1)
59
59
//go:noescape
60
- func p256Sqr (res , in []uint64 )
60
+ func p256Sqr (res , in []uint64 , n int )
61
61
62
62
// Montgomery multiplication by 1
63
63
//go:noescape
@@ -121,65 +121,70 @@ func (curve p256Curve) Inverse(k *big.Int) *big.Int {
121
121
k = new (big.Int ).Mod (k , p256 .N )
122
122
}
123
123
124
- // table will store precomputed powers of x. The four words at index
125
- // 4×i store x^(i+1).
126
- var table [4 * 15 ]uint64
124
+ // table will store precomputed powers of x.
125
+ var table [4 * 9 ]uint64
126
+ var (
127
+ _1 = table [4 * 0 : 4 * 1 ]
128
+ _11 = table [4 * 1 : 4 * 2 ]
129
+ _101 = table [4 * 2 : 4 * 3 ]
130
+ _111 = table [4 * 3 : 4 * 4 ]
131
+ _1111 = table [4 * 4 : 4 * 5 ]
132
+ _10101 = table [4 * 5 : 4 * 6 ]
133
+ _101111 = table [4 * 6 : 4 * 7 ]
134
+ x = table [4 * 7 : 4 * 8 ]
135
+ t = table [4 * 8 : 4 * 9 ]
136
+ )
127
137
128
- x := make ([]uint64 , 4 )
129
138
fromBig (x [:], k )
130
139
// This code operates in the Montgomery domain where R = 2^256 mod n
131
140
// and n is the order of the scalar field. (See initP256 for the
132
141
// value.) Elements in the Montgomery domain take the form a×R and
133
142
// multiplication of x and y in the domain calculates (x × y × R^-1) mod n. RR
134
143
// is R×R mod n, thus the Montgomery multiplication of x and RR gives x×R,
135
144
// i.e. converts x into the Montgomery domain.
145
+ // Window values borrowed from https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
136
146
RR := []uint64 {0x83244c95be79eea2 , 0x4699799c49bd6fa6 , 0x2845b2392b6bec59 , 0x66e12d94f3d95620 }
137
- p256OrdMul (table [:4 ], x , RR )
138
-
139
- // Prepare the table, no need in constant time access, because the
140
- // power is not a secret. (Entry 0 is never used.)
141
- for i := 2 ; i < 16 ; i += 2 {
142
- p256OrdSqr (table [4 * (i - 1 ):], table [4 * ((i / 2 )- 1 ):], 1 )
143
- p256OrdMul (table [4 * i :], table [4 * (i - 1 ):], table [:4 ])
144
- }
145
-
146
- x [0 ] = table [4 * 14 + 0 ] // f
147
- x [1 ] = table [4 * 14 + 1 ]
148
- x [2 ] = table [4 * 14 + 2 ]
149
- x [3 ] = table [4 * 14 + 3 ]
150
-
151
- p256OrdSqr (x , x , 4 )
152
- p256OrdMul (x , x , table [4 * 14 :4 * 14 + 4 ]) // ff
153
- t := make ([]uint64 , 4 , 4 )
154
- t [0 ] = x [0 ]
155
- t [1 ] = x [1 ]
156
- t [2 ] = x [2 ]
157
- t [3 ] = x [3 ]
158
-
159
- p256OrdSqr (x , x , 8 )
160
- p256OrdMul (x , x , t ) // ffff
161
- t [0 ] = x [0 ]
162
- t [1 ] = x [1 ]
163
- t [2 ] = x [2 ]
164
- t [3 ] = x [3 ]
165
-
166
- p256OrdSqr (x , x , 16 )
167
- p256OrdMul (x , x , t ) // ffffffff
168
- t [0 ] = x [0 ]
169
- t [1 ] = x [1 ]
170
- t [2 ] = x [2 ]
171
- t [3 ] = x [3 ]
172
-
173
- p256OrdSqr (x , x , 64 ) // ffffffff0000000000000000
174
- p256OrdMul (x , x , t ) // ffffffff00000000ffffffff
175
- p256OrdSqr (x , x , 32 ) // ffffffff00000000ffffffff00000000
176
- p256OrdMul (x , x , t ) // ffffffff00000000ffffffffffffffff
177
-
178
- // Remaining 32 windows
179
- expLo := [32 ]byte {0xb , 0xc , 0xe , 0x6 , 0xf , 0xa , 0xa , 0xd , 0xa , 0x7 , 0x1 , 0x7 , 0x9 , 0xe , 0x8 , 0x4 , 0xf , 0x3 , 0xb , 0x9 , 0xc , 0xa , 0xc , 0x2 , 0xf , 0xc , 0x6 , 0x3 , 0x2 , 0x5 , 0x4 , 0xf }
180
- for i := 0 ; i < 32 ; i ++ {
181
- p256OrdSqr (x , x , 4 )
182
- p256OrdMul (x , x , table [4 * (expLo [i ]- 1 ):])
147
+ p256OrdMul (_1 , x , RR ) // _1
148
+ p256OrdSqr (x , _1 , 1 ) // _10
149
+ p256OrdMul (_11 , x , _1 ) // _11
150
+ p256OrdMul (_101 , x , _11 ) // _101
151
+ p256OrdMul (_111 , x , _101 ) // _111
152
+ p256OrdSqr (x , _101 , 1 ) // _1010
153
+ p256OrdMul (_1111 , _101 , x ) // _1111
154
+
155
+ p256OrdSqr (t , x , 1 ) // _10100
156
+ p256OrdMul (_10101 , t , _1 ) // _10101
157
+ p256OrdSqr (x , _10101 , 1 ) // _101010
158
+ p256OrdMul (_101111 , _101 , x ) // _101111
159
+ p256OrdMul (x , _10101 , x ) // _111111 = x6
160
+ p256OrdSqr (t , x , 2 ) // _11111100
161
+ p256OrdMul (t , t , _11 ) // _11111111 = x8
162
+ p256OrdSqr (x , t , 8 ) // _ff00
163
+ p256OrdMul (x , x , t ) // _ffff = x16
164
+ p256OrdSqr (t , x , 16 ) // _ffff0000
165
+ p256OrdMul (t , t , x ) // _ffffffff = x32
166
+
167
+ p256OrdSqr (x , t , 64 )
168
+ p256OrdMul (x , x , t )
169
+ p256OrdSqr (x , x , 32 )
170
+ p256OrdMul (x , x , t )
171
+
172
+ sqrs := []uint8 {
173
+ 6 , 5 , 4 , 5 , 5 ,
174
+ 4 , 3 , 3 , 5 , 9 ,
175
+ 6 , 2 , 5 , 6 , 5 ,
176
+ 4 , 5 , 5 , 3 , 10 ,
177
+ 2 , 5 , 5 , 3 , 7 , 6 }
178
+ muls := [][]uint64 {
179
+ _101111 , _111 , _11 , _1111 , _10101 ,
180
+ _101 , _101 , _101 , _111 , _101111 ,
181
+ _1111 , _1 , _1 , _1111 , _111 ,
182
+ _111 , _111 , _101 , _11 , _101111 ,
183
+ _11 , _11 , _11 , _1 , _10101 , _1111 }
184
+
185
+ for i , s := range sqrs {
186
+ p256OrdSqr (x , x , int (s ))
187
+ p256OrdMul (x , x , muls [i ])
183
188
}
184
189
185
190
// Multiplying by one in the Montgomery domain converts a Montgomery
@@ -309,7 +314,7 @@ func (p *p256Point) p256PointToAffine() (x, y *big.Int) {
309
314
zInv := make ([]uint64 , 4 )
310
315
zInvSq := make ([]uint64 , 4 )
311
316
p256Inverse (zInv , p .xyz [8 :12 ])
312
- p256Sqr (zInvSq , zInv )
317
+ p256Sqr (zInvSq , zInv , 1 )
313
318
p256Mul (zInv , zInv , zInvSq )
314
319
315
320
p256Mul (zInvSq , p .xyz [0 :4 ], zInvSq )
@@ -346,71 +351,43 @@ func p256Inverse(out, in []uint64) {
346
351
p16 := stack [4 * 3 : 4 * 3 + 4 ]
347
352
p32 := stack [4 * 4 : 4 * 4 + 4 ]
348
353
349
- p256Sqr (out , in )
354
+ p256Sqr (out , in , 1 )
350
355
p256Mul (p2 , out , in ) // 3*p
351
356
352
- p256Sqr (out , p2 )
353
- p256Sqr (out , out )
357
+ p256Sqr (out , p2 , 2 )
354
358
p256Mul (p4 , out , p2 ) // f*p
355
359
356
- p256Sqr (out , p4 )
357
- p256Sqr (out , out )
358
- p256Sqr (out , out )
359
- p256Sqr (out , out )
360
+ p256Sqr (out , p4 , 4 )
360
361
p256Mul (p8 , out , p4 ) // ff*p
361
362
362
- p256Sqr (out , p8 )
363
-
364
- for i := 0 ; i < 7 ; i ++ {
365
- p256Sqr (out , out )
366
- }
363
+ p256Sqr (out , p8 , 8 )
367
364
p256Mul (p16 , out , p8 ) // ffff*p
368
365
369
- p256Sqr (out , p16 )
370
- for i := 0 ; i < 15 ; i ++ {
371
- p256Sqr (out , out )
372
- }
366
+ p256Sqr (out , p16 , 16 )
373
367
p256Mul (p32 , out , p16 ) // ffffffff*p
374
368
375
- p256Sqr (out , p32 )
376
-
377
- for i := 0 ; i < 31 ; i ++ {
378
- p256Sqr (out , out )
379
- }
369
+ p256Sqr (out , p32 , 32 )
380
370
p256Mul (out , out , in )
381
371
382
- for i := 0 ; i < 32 * 4 ; i ++ {
383
- p256Sqr (out , out )
384
- }
372
+ p256Sqr (out , out , 128 )
385
373
p256Mul (out , out , p32 )
386
374
387
- for i := 0 ; i < 32 ; i ++ {
388
- p256Sqr (out , out )
389
- }
375
+ p256Sqr (out , out , 32 )
390
376
p256Mul (out , out , p32 )
391
377
392
- for i := 0 ; i < 16 ; i ++ {
393
- p256Sqr (out , out )
394
- }
378
+ p256Sqr (out , out , 16 )
395
379
p256Mul (out , out , p16 )
396
380
397
- for i := 0 ; i < 8 ; i ++ {
398
- p256Sqr (out , out )
399
- }
381
+ p256Sqr (out , out , 8 )
400
382
p256Mul (out , out , p8 )
401
383
402
- p256Sqr (out , out )
403
- p256Sqr (out , out )
404
- p256Sqr (out , out )
405
- p256Sqr (out , out )
384
+ p256Sqr (out , out , 4 )
406
385
p256Mul (out , out , p4 )
407
386
408
- p256Sqr (out , out )
409
- p256Sqr (out , out )
387
+ p256Sqr (out , out , 2 )
410
388
p256Mul (out , out , p2 )
411
389
412
- p256Sqr (out , out )
413
- p256Sqr (out , out )
390
+ p256Sqr (out , out , 2 )
414
391
p256Mul (out , out , in )
415
392
}
416
393
@@ -426,16 +403,16 @@ func boothW5(in uint) (int, int) {
426
403
return int (d ), int (s & 1 )
427
404
}
428
405
429
- func boothW7 (in uint ) (int , int ) {
430
- var s uint = ^ ((in >> 7 ) - 1 )
431
- var d uint = (1 << 8 ) - in - 1
406
// boothW6 recodes a window value into a (digit, sign) pair.
//
// When bit 6 of in is set, s becomes all ones and the complemented value
// (1<<7)-1-in is selected; when bit 6 is clear, s is zero and in itself is
// selected. The selected value is then halved, rounding up.
//
// The first result is that halved value; the second result is the low bit
// of s, i.e. 1 when the complemented value was used and 0 otherwise.
//
// NOTE(review): assumes in < 1<<7 — the visible callers mask the window
// with 0x7f before calling; confirm no other caller passes a wider value.
+ func boothW6 (in uint ) (int , int ) {
407
// s is an all-ones mask when in>>6 == 1, and zero when in>>6 == 0.
+ var s uint = ^ ((in >> 6 ) - 1 )
408
// d is the 7-bit complement of in.
+ var d uint = (1 << 7 ) - in - 1
432
409
// Pick the complement when the mask is set, otherwise keep in.
d = (d & s ) | (in & (^ s ))
433
410
// Halve, rounding up.
d = (d >> 1 ) + (d & 1 )
434
411
return int (d ), int (s & 1 )
435
412
}
436
413
437
414
func initTable () {
438
- p256Precomputed = new ([37 ][ 64 * 8 ]uint64 )
415
+ p256Precomputed = new ([43 ][ 32 * 8 ]uint64 )
439
416
440
417
basePoint := []uint64 {
441
418
0x79e730d418a9143c , 0x75ba95fc5fedb601 , 0x79fb732b77622510 , 0x18905f76a53755c6 ,
@@ -448,19 +425,19 @@ func initTable() {
448
425
449
426
zInv := make ([]uint64 , 4 )
450
427
zInvSq := make ([]uint64 , 4 )
451
- for j := 0 ; j < 64 ; j ++ {
428
+ for j := 0 ; j < 32 ; j ++ {
452
429
copy (t1 , t2 )
453
- for i := 0 ; i < 37 ; i ++ {
454
- // The window size is 7 so we need to double 7 times.
430
+ for i := 0 ; i < 43 ; i ++ {
431
+ // The window size is 6 so we need to double 6 times.
455
432
if i != 0 {
456
- for k := 0 ; k < 7 ; k ++ {
433
+ for k := 0 ; k < 6 ; k ++ {
457
434
p256PointDoubleAsm (t1 , t1 )
458
435
}
459
436
}
460
437
// Convert the point to affine form. (Its values are
461
438
// still in Montgomery form however.)
462
439
p256Inverse (zInv , t1 [8 :12 ])
463
- p256Sqr (zInvSq , zInv )
440
+ p256Sqr (zInvSq , zInv , 1 )
464
441
p256Mul (zInv , zInv , zInvSq )
465
442
466
443
p256Mul (t1 [:4 ], t1 [:4 ], zInvSq )
@@ -481,8 +458,8 @@ func initTable() {
481
458
func (p * p256Point ) p256BaseMult (scalar []uint64 ) {
482
459
precomputeOnce .Do (initTable )
483
460
484
- wvalue := (scalar [0 ] << 1 ) & 0xff
485
- sel , sign := boothW7 (uint (wvalue ))
461
+ wvalue := (scalar [0 ] << 1 ) & 0x7f
462
+ sel , sign := boothW6 (uint (wvalue ))
486
463
p256SelectBase (p .xyz [0 :8 ], p256Precomputed [0 ][0 :], sel )
487
464
p256NegCond (p .xyz [4 :8 ], sign )
488
465
@@ -499,17 +476,17 @@ func (p *p256Point) p256BaseMult(scalar []uint64) {
499
476
t0 .xyz [10 ] = 0xffffffffffffffff
500
477
t0 .xyz [11 ] = 0x00000000fffffffe
501
478
502
- index := uint (6 )
479
+ index := uint (5 )
503
480
zero := sel
504
481
505
- for i := 1 ; i < 37 ; i ++ {
482
+ for i := 1 ; i < 43 ; i ++ {
506
483
if index < 192 {
507
- wvalue = ((scalar [index / 64 ] >> (index % 64 )) + (scalar [index / 64 + 1 ] << (64 - (index % 64 )))) & 0xff
484
+ wvalue = ((scalar [index / 64 ] >> (index % 64 )) + (scalar [index / 64 + 1 ] << (64 - (index % 64 )))) & 0x7f
508
485
} else {
509
- wvalue = (scalar [index / 64 ] >> (index % 64 )) & 0xff
486
+ wvalue = (scalar [index / 64 ] >> (index % 64 )) & 0x7f
510
487
}
511
- index += 7
512
- sel , sign = boothW7 (uint (wvalue ))
488
+ index += 6
489
+ sel , sign = boothW6 (uint (wvalue ))
513
490
p256SelectBase (t0 .xyz [0 :8 ], p256Precomputed [i ][0 :], sel )
514
491
p256PointAddAffineAsm (p .xyz [0 :12 ], p .xyz [0 :12 ], t0 .xyz [0 :8 ], sign , sel , zero )
515
492
zero |= sel
0 commit comments