@@ -2498,7 +2498,7 @@ verified for 1 billion random inputs with n=5. [7]
2498
2498
static inline double
2499
2499
vector_norm (Py_ssize_t n , double * vec , double max , int found_nan )
2500
2500
{
2501
- double x , h , scale , oldcsum , csum = 1.0 , frac1 = 0.0 , frac2 = 0.0 ;
2501
+ double x , h , scale , csum = 1.0 , frac1 = 0.0 , frac2 = 0.0 ;
2502
2502
DoubleLength pr , sm ;
2503
2503
int max_e ;
2504
2504
Py_ssize_t i ;
@@ -2513,49 +2513,42 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
2513
2513
return max ;
2514
2514
}
2515
2515
frexp (max , & max_e );
2516
- if (max_e >= -1023 ) {
2517
- scale = ldexp (1.0 , - max_e );
2518
- assert (max * scale >= 0.5 );
2519
- assert (max * scale < 1.0 );
2516
+ if (max_e < -1023 ) {
2517
+ /* When max_e < -1023, ldexp(1.0, -max_e) would overflow.
2518
+ So we first perform lossless scaling from subnormals back to normals,
2519
+ then recurse back to vector_norm(), and then finally undo the scaling.
2520
+ */
2520
2521
for (i = 0 ; i < n ; i ++ ) {
2521
- x = vec [i ];
2522
- assert (Py_IS_FINITE (x ) && fabs (x ) <= max );
2522
+ vec [i ] /= DBL_MIN ;
2523
+ }
2524
+ return DBL_MIN * vector_norm (n , vec , max / DBL_MIN , found_nan );
2525
+ }
2526
+ scale = ldexp (1.0 , - max_e );
2527
+ assert (max * scale >= 0.5 );
2528
+ assert (max * scale < 1.0 );
2529
+ for (i = 0 ; i < n ; i ++ ) {
2530
+ x = vec [i ];
2531
+ assert (Py_IS_FINITE (x ) && fabs (x ) <= max );
2523
2532
2524
- x *= scale ;
2525
- assert (fabs (x ) < 1.0 );
2533
+ x *= scale ;
2534
+ assert (fabs (x ) < 1.0 );
2526
2535
2527
- pr = dl_mul (x , x );
2528
- assert (pr .hi <= 1.0 );
2536
+ pr = dl_mul (x , x );
2537
+ assert (pr .hi <= 1.0 );
2529
2538
2530
- sm = dl_fast_sum (csum , pr .hi );
2531
- csum = sm .hi ;
2532
- frac1 += pr .lo ;
2533
- frac2 += sm .lo ;
2534
- }
2535
- h = sqrt (csum - 1.0 + (frac1 + frac2 ));
2536
- pr = dl_mul (- h , h );
2537
2539
sm = dl_fast_sum (csum , pr .hi );
2538
2540
csum = sm .hi ;
2539
2541
frac1 += pr .lo ;
2540
2542
frac2 += sm .lo ;
2541
- x = csum - 1.0 + (frac1 + frac2 );
2542
- return (h + x / (2.0 * h )) / scale ;
2543
2543
}
2544
- /* When max_e < -1023, ldexp(1.0, -max_e) overflows.
2545
- So instead of multiplying by a scale, we just divide by *max*.
2546
- */
2547
- for (i = 0 ; i < n ; i ++ ) {
2548
- x = vec [i ];
2549
- assert (Py_IS_FINITE (x ) && fabs (x ) <= max );
2550
- x /= max ;
2551
- x = x * x ;
2552
- assert (x <= 1.0 );
2553
- assert (fabs (csum ) >= fabs (x ));
2554
- oldcsum = csum ;
2555
- csum += x ;
2556
- frac1 += (oldcsum - csum ) + x ;
2557
- }
2558
- return max * sqrt (csum - 1.0 + frac1 );
2544
+ h = sqrt (csum - 1.0 + (frac1 + frac2 ));
2545
+ pr = dl_mul (- h , h );
2546
+ sm = dl_fast_sum (csum , pr .hi );
2547
+ csum = sm .hi ;
2548
+ frac1 += pr .lo ;
2549
+ frac2 += sm .lo ;
2550
+ x = csum - 1.0 + (frac1 + frac2 );
2551
+ return (h + x / (2.0 * h )) / scale ;
2559
2552
}
2560
2553
2561
2554
#define NUM_STACK_ELEMS 16
0 commit comments