Skip to content

Commit 72186aa

Browse files
authored
Simplify and improve accuracy for subnormals in hypot() (GH-102785)
1 parent 174c4bf commit 72186aa

File tree

1 file changed

+28
-35
lines changed

1 file changed

+28
-35
lines changed

Modules/mathmodule.c

+28-35
Original file line numberDiff line numberDiff line change
@@ -2498,7 +2498,7 @@ verified for 1 billion random inputs with n=5. [7]
24982498
static inline double
24992499
vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
25002500
{
2501-
double x, h, scale, oldcsum, csum = 1.0, frac1 = 0.0, frac2 = 0.0;
2501+
double x, h, scale, csum = 1.0, frac1 = 0.0, frac2 = 0.0;
25022502
DoubleLength pr, sm;
25032503
int max_e;
25042504
Py_ssize_t i;
@@ -2513,49 +2513,42 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
25132513
return max;
25142514
}
25152515
frexp(max, &max_e);
2516-
if (max_e >= -1023) {
2517-
scale = ldexp(1.0, -max_e);
2518-
assert(max * scale >= 0.5);
2519-
assert(max * scale < 1.0);
2516+
if (max_e < -1023) {
2517+
/* When max_e < -1023, ldexp(1.0, -max_e) would overflow.
2518+
So we first perform lossless scaling from subnormals back to normals,
2519+
then recurse back to vector_norm(), and then finally undo the scaling.
2520+
*/
25202521
for (i=0 ; i < n ; i++) {
2521-
x = vec[i];
2522-
assert(Py_IS_FINITE(x) && fabs(x) <= max);
2522+
vec[i] /= DBL_MIN;
2523+
}
2524+
return DBL_MIN * vector_norm(n, vec, max / DBL_MIN, found_nan);
2525+
}
2526+
scale = ldexp(1.0, -max_e);
2527+
assert(max * scale >= 0.5);
2528+
assert(max * scale < 1.0);
2529+
for (i=0 ; i < n ; i++) {
2530+
x = vec[i];
2531+
assert(Py_IS_FINITE(x) && fabs(x) <= max);
25232532

2524-
x *= scale;
2525-
assert(fabs(x) < 1.0);
2533+
x *= scale;
2534+
assert(fabs(x) < 1.0);
25262535

2527-
pr = dl_mul(x, x);
2528-
assert(pr.hi <= 1.0);
2536+
pr = dl_mul(x, x);
2537+
assert(pr.hi <= 1.0);
25292538

2530-
sm = dl_fast_sum(csum, pr.hi);
2531-
csum = sm.hi;
2532-
frac1 += pr.lo;
2533-
frac2 += sm.lo;
2534-
}
2535-
h = sqrt(csum - 1.0 + (frac1 + frac2));
2536-
pr = dl_mul(-h, h);
25372539
sm = dl_fast_sum(csum, pr.hi);
25382540
csum = sm.hi;
25392541
frac1 += pr.lo;
25402542
frac2 += sm.lo;
2541-
x = csum - 1.0 + (frac1 + frac2);
2542-
return (h + x / (2.0 * h)) / scale;
25432543
}
2544-
/* When max_e < -1023, ldexp(1.0, -max_e) overflows.
2545-
So instead of multiplying by a scale, we just divide by *max*.
2546-
*/
2547-
for (i=0 ; i < n ; i++) {
2548-
x = vec[i];
2549-
assert(Py_IS_FINITE(x) && fabs(x) <= max);
2550-
x /= max;
2551-
x = x*x;
2552-
assert(x <= 1.0);
2553-
assert(fabs(csum) >= fabs(x));
2554-
oldcsum = csum;
2555-
csum += x;
2556-
frac1 += (oldcsum - csum) + x;
2557-
}
2558-
return max * sqrt(csum - 1.0 + frac1);
2544+
h = sqrt(csum - 1.0 + (frac1 + frac2));
2545+
pr = dl_mul(-h, h);
2546+
sm = dl_fast_sum(csum, pr.hi);
2547+
csum = sm.hi;
2548+
frac1 += pr.lo;
2549+
frac2 += sm.lo;
2550+
x = csum - 1.0 + (frac1 + frac2);
2551+
return (h + x / (2.0 * h)) / scale;
25592552
}
25602553

25612554
#define NUM_STACK_ELEMS 16

0 commit comments

Comments
 (0)