From ac1cec4689836581575ce7b3267c3ef69594ebd6 Mon Sep 17 00:00:00 2001 From: auderson Date: Fri, 25 Mar 2022 14:00:15 +0800 Subject: [PATCH 01/16] fix merge --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/_libs/window/aggregations.pyx | 34 ++++++--- pandas/core/_numba/kernels/var_.py | 59 +++++++++++++--- pandas/core/window/rolling.py | 38 +++++----- pandas/tests/window/test_rolling.py | 100 +++++++++++++++++++++++++++ 5 files changed, 192 insertions(+), 40 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 50a5bf383de77..e393eda48b239 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -46,6 +46,7 @@ Other enhancements - Improved error message in :class:`~pandas.core.window.Rolling` when ``window`` is a frequency and ``NaT`` is in the rolling axis (:issue:`46087`) - :class:`Series` and :class:`DataFrame` with ``IntegerDtype`` now supports bitwise operations (:issue:`34463`) - Add ``milliseconds`` field support for :class:`~pandas.DateOffset` (:issue:`43371`) +- :meth:`DataFrame.rolling.var`, :meth:`DataFrame.rolling.std`, :meth:`Series.rolling.var`, :meth:`Series.rolling.std` now generate correct result 0 for window containing same values (:issue:`42064`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 8191084da924f..69285e06fa7ba 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -278,15 +278,15 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start, cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs, - float64_t ssqdm_x) nogil: + float64_t ssqdm_x, int64_t num_consecutive_same_value) nogil: cdef: float64_t result # Variance is unchanged if no observation is added or removed if (nobs >= minp) and (nobs > ddof): - # pathological case - if nobs == 1: + # pathological case & repeatedly same values case + if nobs == 1 or num_consecutive_same_value >= nobs: result = 0 else: result = ssqdm_x / (nobs - ddof) @@ -297,7 +297,8 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs, cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x, - float64_t *ssqdm_x, float64_t *compensation) nogil: + float64_t *ssqdm_x, float64_t *compensation, + int64_t *num_consecutive_same_value, float64_t *prev_value) nogil: """ add a value from the var calc """ cdef: float64_t delta, prev_mean, y, t @@ -307,6 +308,15 @@ cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x, return nobs[0] = nobs[0] + 1 + + # GH#42064, record num of same values to remove floating point artifacts + if val == prev_value[0]: + num_consecutive_same_value[0] += 1 + else: + # reset to 1 (include current value itself) + num_consecutive_same_value[0] = 1 + prev_value[0] = val + # Welford's method for the online variance-calculation # using Kahan summation # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance @@ -352,9 +362,8 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, """ cdef: float64_t mean_x, ssqdm_x, nobs, compensation_add, - float64_t compensation_remove, - float64_t val, prev, delta, mean_x_old - int64_t s, e + float64_t compensation_remove, prev_value + int64_t s, e, num_consecutive_same_value Py_ssize_t i, j, N = len(start) ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -376,9 +385,13 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, # never removed if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: + prev_value = values[s] + num_consecutive_same_value = 0 + mean_x = ssqdm_x = nobs = compensation_add = compensation_remove = 0 for j in range(s, e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add) + add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add, + &num_consecutive_same_value, &prev_value) else: @@ -392,9 +405,10 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, # calculate adds for j in range(end[i - 1], e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add) + add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add, + &num_consecutive_same_value, &prev_value) - output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + output[i] = calc_var(minp, ddof, nobs, ssqdm_x, num_consecutive_same_value) if not is_monotonic_increasing_bounds: nobs = 0.0 diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py index 2e5660673701b..b1c72832d249b 100644 --- a/pandas/core/_numba/kernels/var_.py +++ b/pandas/core/_numba/kernels/var_.py @@ -16,9 +16,22 @@ @numba.jit(nopython=True, nogil=True, parallel=False) def add_var( - val: float, nobs: int, mean_x: float, ssqdm_x: float, compensation: float -) -> tuple[int, float, float, float]: + val: float, + nobs: int, + mean_x: float, + ssqdm_x: float, + compensation: float, + num_consecutive_same_value: int, + prev_value: float, +) -> tuple[int, float, float, float, int, float]: if not np.isnan(val): + + if val == prev_value: + num_consecutive_same_value += 1 + else: + num_consecutive_same_value = 1 + prev_value = val + nobs += 1 prev_mean = mean_x - compensation y = val - compensation @@ -30,7 +43,7 @@ def add_var( else: mean_x = 0 ssqdm_x += (val - prev_mean) * (val - mean_x) - return nobs, mean_x, ssqdm_x, compensation + return nobs, mean_x, ssqdm_x, compensation, num_consecutive_same_value, prev_value @numba.jit(nopython=True, nogil=True, parallel=False) @@ -79,10 +92,27 @@ def sliding_var( s = start[i] e = end[i] if i == 0 or not is_monotonic_increasing_bounds: + + prev_value = values[s] + num_consecutive_same_value = 0 + for j in range(s, e): val = values[j] - nobs, mean_x, ssqdm_x, compensation_add = add_var( - val, nobs, mean_x, ssqdm_x, compensation_add + ( + nobs, + mean_x, + ssqdm_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_var( + val, + nobs, + mean_x, + ssqdm_x, + compensation_add, + num_consecutive_same_value, + prev_value, ) else: for j in range(start[i - 1], s): @@ -93,12 +123,25 @@ def sliding_var( for j in range(end[i - 1], e): val = values[j] - nobs, mean_x, ssqdm_x, compensation_add = add_var( - val, nobs, mean_x, ssqdm_x, compensation_add + ( + nobs, + mean_x, + ssqdm_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_var( + val, + nobs, + mean_x, + ssqdm_x, + compensation_add, + num_consecutive_same_value, + prev_value, ) if nobs >= min_periods and nobs > ddof: - if nobs == 1: + if nobs == 1 or num_consecutive_same_value >= nobs: result = 0.0 else: result = ssqdm_x / (nobs - ddof) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index d4569816f9f7a..023fd7630952b 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2150,10 +2150,7 @@ def median( The default ``ddof`` of 1 used in :meth:`Series.std` is different than the default ``ddof`` of 0 in :func:`numpy.std`. - A minimum of one period is required for the rolling calculation. - - The implementation is susceptible to floating point imprecision as - shown in the example below.\n + A minimum of one period is required for the rolling calculation.\n """ ).replace("\n", "", 1), create_section_header("Examples"), @@ -2161,13 +2158,13 @@ def median( """ >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) >>> s.rolling(3).std() - 0 NaN - 1 NaN - 2 5.773503e-01 - 3 1.000000e+00 - 4 1.000000e+00 - 5 1.154701e+00 - 6 2.580957e-08 + 0 NaN + 1 NaN + 2 0.577350 + 3 1.000000 + 4 1.000000 + 5 1.154701 + 6 0.000000 dtype: float64 """ ).replace("\n", "", 1), @@ -2212,10 +2209,7 @@ def std( The default ``ddof`` of 1 used in :meth:`Series.var` is different than the default ``ddof`` of 0 in :func:`numpy.var`. - A minimum of one period is required for the rolling calculation. - - The implementation is susceptible to floating point imprecision as - shown in the example below.\n + A minimum of one period is required for the rolling calculation.\n """ ).replace("\n", "", 1), create_section_header("Examples"), @@ -2223,13 +2217,13 @@ def std( """ >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) >>> s.rolling(3).var() - 0 NaN - 1 NaN - 2 3.333333e-01 - 3 1.000000e+00 - 4 1.000000e+00 - 5 1.333333e+00 - 6 6.661338e-16 + 0 NaN + 1 NaN + 2 0.333333 + 3 1.000000 + 4 1.000000 + 5 1.333333 + 6 0.000000 dtype: float64 """ ).replace("\n", "", 1), diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 7e9bc121f06ff..df1c5e42de41e 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1781,3 +1781,103 @@ def test_step_not_integer_raises(): def test_step_not_positive_raises(): with pytest.raises(ValueError, match="step must be >= 0"): DataFrame(range(2)).rolling(1, step=-1) + + +@pytest.mark.parametrize( + ["values", "window", "min_periods", "expected"], + [ + [ + np.array([20, 10, 10, np.inf, 1, 1, 2, 3]), + 3, + 1, + np.array( + [ + np.nan, + 50.0, + 33.33333333333333, + 0.0, + 40.5, + 0.0, + 0.3333333333333333, + 1.0, + ] + ), + ], + [ + np.array([20, 10, 10, np.nan, 10, 1, 2, 3]), + 3, + 1, + np.array( + [ + np.nan, + 50.0, + 33.33333333333333, + 0.0, + 0.0, + 40.5, + 24.333333333333332, + 1.0, + ] + ), + ], + [ + np.array([np.nan, 5, 6, 7, 5, 5, 5]), + 3, + 3, + np.array([np.nan, np.nan, np.nan, 1.0, 1.0, 1.3333333333333335, 0.0]), + ], + [ + np.array([5, 7, 7, 7, np.nan, np.inf, 4, 3, 3, 3]), + 3, + 3, + np.array( + [ + np.nan, + np.nan, + 1.3333333333333335, + 0.0, + np.nan, + np.nan, + np.nan, + np.nan, + 0.33333333333333337, + 0.0, + ] + ), + ], + [ + np.array([5, 7, 7, 7, np.nan, np.inf, 7, 3, 3, 3]), + 3, + 3, + np.array( + [ + np.nan, + np.nan, + 1.3333333333333335, + 0.0, + np.nan, + np.nan, + np.nan, + np.nan, + 5.333333333333333, + 0.0, + ] + ), + ], + ], +) +def test_rolling_var_same_value_count_logic(values, window, min_periods, expected): + # GH 42064 + + sr = Series(values) + result_var = sr.rolling(window, min_periods=min_periods).var() + # 1. result should be close to correct value + # non-zero values can still differ slightly as the result of online algorithm + assert np.isclose(result_var, expected, equal_nan=True).all() + # 2. zeros should be exactly the same since the new algo takes effect here + assert (result_var[expected == 0] == 0).all() + + # std should also pass as it's just a sqrt of var + result_std = sr.rolling(window, min_periods=min_periods).std() + assert np.isclose(result_std, np.sqrt(expected), equal_nan=True).all() + assert (result_std[expected == 0] == 0).all() From b381516fce4447957cc1cf5408f520ddcebdd382 Mon Sep 17 00:00:00 2001 From: auderson Date: Sat, 26 Mar 2022 11:36:14 +0800 Subject: [PATCH 02/16] bug fix: nobs == 0 == min_periods --- pandas/core/_numba/kernels/sum_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py index c2e81b4990ba9..b67452e4071d9 100644 --- a/pandas/core/_numba/kernels/sum_.py +++ b/pandas/core/_numba/kernels/sum_.py @@ -81,7 +81,7 @@ def sliding_sum( val, nobs, sum_x, compensation_add ) - if nobs == 0 == nobs: + if nobs == 0 == min_periods: result = 0.0 elif nobs >= min_periods: result = sum_x From 85f3fcd96ff539f983733f6728a6593b8828d304 Mon Sep 17 00:00:00 2001 From: auderson Date: Sat, 26 Mar 2022 11:37:05 +0800 Subject: [PATCH 03/16] use Py_ssize_t in place of int64_t --- pandas/_libs/window/aggregations.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 69285e06fa7ba..04d41e6d65146 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -278,7 +278,7 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start, cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs, - float64_t ssqdm_x, int64_t num_consecutive_same_value) nogil: + float64_t ssqdm_x, Py_ssize_t num_consecutive_same_value) nogil: cdef: float64_t result @@ -298,7 +298,7 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs, cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x, float64_t *ssqdm_x, float64_t *compensation, - int64_t *num_consecutive_same_value, float64_t *prev_value) nogil: + Py_ssize_t *num_consecutive_same_value, float64_t *prev_value) nogil: """ add a value from the var calc """ cdef: float64_t delta, prev_mean, y, t @@ -363,8 +363,8 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, cdef: float64_t mean_x, ssqdm_x, nobs, compensation_add, float64_t compensation_remove, prev_value - int64_t s, e, num_consecutive_same_value - Py_ssize_t i, j, N = len(start) + int64_t s, e + Py_ssize_t i, j, N = len(start), num_consecutive_same_value ndarray[float64_t] output bint is_monotonic_increasing_bounds From 318adee426a2109236aa47ef903c71bc1e526d10 Mon Sep 17 00:00:00 2001 From: auderson Date: Sat, 26 Mar 2022 11:37:25 +0800 Subject: [PATCH 04/16] use tm.assert_series_equal --- pandas/tests/window/test_rolling.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index df1c5e42de41e..aa595e9c28bf5 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1794,12 +1794,12 @@ def test_step_not_positive_raises(): [ np.nan, 50.0, - 33.33333333333333, + 33.333333333333336, 0.0, - 40.5, + 40.500000000000014, 0.0, - 0.3333333333333333, - 1.0, + 0.3333333333333405, + 1.000000000000007, ] ), ], @@ -1811,12 +1811,12 @@ def test_step_not_positive_raises(): [ np.nan, 50.0, - 33.33333333333333, + 33.333333333333336, 0.0, 0.0, - 40.5, - 24.333333333333332, - 1.0, + 40.500000000000014, + 24.33333333333334, + 1.000000000000004, ] ), ], @@ -1868,16 +1868,18 @@ def test_step_not_positive_raises(): ) def test_rolling_var_same_value_count_logic(values, window, min_periods, expected): # GH 42064 + expected = Series(expected) sr = Series(values) result_var = sr.rolling(window, min_periods=min_periods).var() # 1. result should be close to correct value - # non-zero values can still differ slightly as the result of online algorithm - assert np.isclose(result_var, expected, equal_nan=True).all() + # non-zero values can still differ slightly from "truth" + # as the result of online algorithm + tm.assert_series_equal(result_var, expected, check_exact=True) # 2. zeros should be exactly the same since the new algo takes effect here - assert (result_var[expected == 0] == 0).all() + tm.assert_series_equal(expected == 0, result_var == 0) # std should also pass as it's just a sqrt of var result_std = sr.rolling(window, min_periods=min_periods).std() - assert np.isclose(result_std, np.sqrt(expected), equal_nan=True).all() - assert (result_std[expected == 0] == 0).all() + tm.assert_series_equal(result_std, np.sqrt(expected), check_exact=True) + tm.assert_series_equal(expected == 0, result_std == 0) From 7324cc144a89244acec0198b6eb875d5d6701f5a Mon Sep 17 00:00:00 2001 From: auderson Date: Sat, 26 Mar 2022 11:43:01 +0800 Subject: [PATCH 05/16] add more checks in roll numba methods --- pandas/tests/window/test_numba.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 0e7fe24420171..ca08931bdd69e 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -79,7 +79,20 @@ def f(x, *args): tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "data", [DataFrame(np.eye(5)), Series(range(5), name="foo")] + "data", + [ + DataFrame(np.eye(5)), + DataFrame( + [ + [5, 7, 7, 7, np.nan, np.inf, 4, 3, 3, 3], + [5, 7, 7, 7, np.nan, np.inf, 7, 3, 3, 3], + [np.nan, np.nan, 5, 6, 7, 5, 5, 5], + ] + ).T, + Series(range(5), name="foo"), + Series([20, 10, 10, np.inf, 1, 1, 2, 3]), + Series([20, 10, 10, np.nan, 10, 1, 2, 3]), + ], ) def test_numba_vs_cython_rolling_methods( self, @@ -95,7 +108,7 @@ def test_numba_vs_cython_rolling_methods( engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - roll = data.rolling(2, step=step) + roll = data.rolling(3, step=step) result = getattr(roll, method)( engine="numba", engine_kwargs=engine_kwargs, **kwargs ) From ad3a87c0c9b4b37c0e1780589f254831b664553a Mon Sep 17 00:00:00 2001 From: auderson Date: Sat, 26 Mar 2022 11:47:59 +0800 Subject: [PATCH 06/16] add more checks in roll numba methods --- pandas/tests/window/test_numba.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index ca08931bdd69e..4cf6306dc39e5 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -86,7 +86,7 @@ def f(x, *args): [ [5, 7, 7, 7, np.nan, np.inf, 4, 3, 3, 3], [5, 7, 7, 7, np.nan, np.inf, 7, 3, 3, 3], - [np.nan, np.nan, 5, 6, 7, 5, 5, 5], + [np.nan, np.nan, 5, 6, 7, 5, 5, 5, 5, 5], ] ).T, Series(range(5), name="foo"), From 66318d175c8e0e71d2cb7b4143399612f81acbd3 Mon Sep 17 00:00:00 2001 From: auderson Date: Sat, 26 Mar 2022 12:45:50 +0800 Subject: [PATCH 07/16] cancel check exact --- pandas/tests/window/test_rolling.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index aa595e9c28bf5..cd654cdcfb7a5 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1794,12 +1794,12 @@ def test_step_not_positive_raises(): [ np.nan, 50.0, - 33.333333333333336, + 33.33333333333333, 0.0, - 40.500000000000014, + 40.5, 0.0, - 0.3333333333333405, - 1.000000000000007, + 0.3333333333333333, + 1.0, ] ), ], @@ -1811,12 +1811,12 @@ def test_step_not_positive_raises(): [ np.nan, 50.0, - 33.333333333333336, + 33.33333333333333, 0.0, 0.0, - 40.500000000000014, - 24.33333333333334, - 1.000000000000004, + 40.5, + 24.333333333333332, + 1.0, ] ), ], @@ -1875,11 +1875,11 @@ def test_rolling_var_same_value_count_logic(values, window, min_periods, expecte # 1. result should be close to correct value # non-zero values can still differ slightly from "truth" # as the result of online algorithm - tm.assert_series_equal(result_var, expected, check_exact=True) + tm.assert_series_equal(result_var, expected) # 2. zeros should be exactly the same since the new algo takes effect here tm.assert_series_equal(expected == 0, result_var == 0) # std should also pass as it's just a sqrt of var result_std = sr.rolling(window, min_periods=min_periods).std() - tm.assert_series_equal(result_std, np.sqrt(expected), check_exact=True) + tm.assert_series_equal(result_std, np.sqrt(expected)) tm.assert_series_equal(expected == 0, result_std == 0) From 7909ffe13192c8bfa05b98b90a81a8fe1a9eebdd Mon Sep 17 00:00:00 2001 From: auderson Date: Sat, 26 Mar 2022 13:36:13 +0800 Subject: [PATCH 08/16] test 32bit: undo Py_ssize_t --- pandas/_libs/window/aggregations.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 04d41e6d65146..69285e06fa7ba 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -278,7 +278,7 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start, cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs, - float64_t ssqdm_x, Py_ssize_t num_consecutive_same_value) nogil: + float64_t ssqdm_x, int64_t num_consecutive_same_value) nogil: cdef: float64_t result @@ -298,7 +298,7 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs, cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x, float64_t *ssqdm_x, float64_t *compensation, - Py_ssize_t *num_consecutive_same_value, float64_t *prev_value) nogil: + int64_t *num_consecutive_same_value, float64_t *prev_value) nogil: """ add a value from the var calc """ cdef: float64_t delta, prev_mean, y, t @@ -363,8 +363,8 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, cdef: float64_t mean_x, ssqdm_x, nobs, compensation_add, float64_t compensation_remove, prev_value - int64_t s, e - Py_ssize_t i, j, N = len(start), num_consecutive_same_value + int64_t s, e, num_consecutive_same_value + Py_ssize_t i, j, N = len(start) ndarray[float64_t] output bint is_monotonic_increasing_bounds From 5bd7eac74125ae3cb89528fc63a85d42e5d1fdb7 Mon Sep 17 00:00:00 2001 From: auderson Date: Tue, 29 Mar 2022 20:05:41 +0800 Subject: [PATCH 09/16] add explanation for test_rolling_var_same_value_count_logic --- pandas/tests/window/test_rolling.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index cd654cdcfb7a5..0a4661feee504 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1867,7 +1867,18 @@ def test_step_not_positive_raises(): ], ) def test_rolling_var_same_value_count_logic(values, window, min_periods, expected): - # GH 42064 + """ + For GH 42064. + + With new algo implemented, result will be set to .0 in rolling var + if sufficient amount of consecutively same values are found. + + Below, use `assert_series_equal` twice to check for equality, + because `check_exact=True` will fail in 32-bit tests due to + precision loss. + + """ + expected = Series(expected) sr = Series(values) From 8b9bf760395b67b3e23d30f7a9b4e39d62421110 Mon Sep 17 00:00:00 2001 From: auderson Date: Wed, 30 Mar 2022 09:58:47 +0800 Subject: [PATCH 10/16] comments updated --- pandas/tests/window/test_rolling.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 0a4661feee504..832c118a1dabe 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1867,22 +1867,19 @@ def test_step_not_positive_raises(): ], ) def test_rolling_var_same_value_count_logic(values, window, min_periods, expected): - """ - For GH 42064. - - With new algo implemented, result will be set to .0 in rolling var - if sufficient amount of consecutively same values are found. - - Below, use `assert_series_equal` twice to check for equality, - because `check_exact=True` will fail in 32-bit tests due to - precision loss. - - """ + # GH 42064. expected = Series(expected) - sr = Series(values) + + # With new algo implemented, result will be set to .0 in rolling var + # if sufficient amount of consecutively same values are found. result_var = sr.rolling(window, min_periods=min_periods).var() + + # use `assert_series_equal` twice to check for equality, + # because `check_exact=True` will fail in 32-bit tests due to + # precision loss. + # 1. result should be close to correct value # non-zero values can still differ slightly from "truth" # as the result of online algorithm From fd68eb45edf1a1ce7c3ddcf0cba985aae3782fa2 Mon Sep 17 00:00:00 2001 From: auderson Date: Fri, 1 Apr 2022 10:10:32 +0800 Subject: [PATCH 11/16] add equal-zero test for test_rolling_var_numerical_issues & test_rolling_var_floating_artifact_precision --- pandas/tests/window/test_rolling.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 832c118a1dabe..ae5493b61e042 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1203,6 +1203,9 @@ def test_rolling_var_numerical_issues(func, third_value, values): result = getattr(ds.rolling(2), func)() expected = Series([np.nan] + values) tm.assert_series_equal(result, expected) + # GH 42064 + # new `roll_var` will output 0.0 correctly + tm.assert_series_equal(result == 0, expected == 0) def test_timeoffset_as_window_parameter_for_corr(): @@ -1521,6 +1524,9 @@ def test_rolling_var_floating_artifact_precision(): result = s.rolling(3).var() expected = Series([np.nan, np.nan, 4 / 3, 0]) tm.assert_series_equal(result, expected, atol=1.0e-15, rtol=1.0e-15) + # GH 42064 + # new `roll_var` will output 0.0 correctly + tm.assert_series_equal(result == 0, expected == 0) def test_rolling_std_small_values(): From c49d7854365151f92b2512113603eceb79daf98f Mon Sep 17 00:00:00 2001 From: auderson Date: Sun, 10 Apr 2022 11:13:49 +0800 Subject: [PATCH 12/16] update docs --- doc/source/whatsnew/v1.5.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 7e9bcbf66777a..895c127a48698 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -96,7 +96,8 @@ Other enhancements - :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`) - Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`) - :meth:`DataFrame.rolling.var`, :meth:`DataFrame.rolling.std`, :meth:`Series.rolling.var`, :meth:`Series.rolling.std` now generate correct result 0 for window containing same values (:issue:`42064`) -- +- :meth:`DataFrame.rolling.skew`, :meth:`Series.rolling.skew` now generate correct result 0 for window containing same values (:issue:`42064`) +- :meth:`DataFrame.rolling.kurt`, :meth:`Series.rolling.kurt` now generate correct result -3 for window containing same values (:issue:`42064`, :issue:`30993`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: From 4294c1692fbae132bb9fd4bc65d96b50a71c59bd Mon Sep 17 00:00:00 2001 From: auderson Date: Mon, 11 Apr 2022 11:01:19 +0800 Subject: [PATCH 13/16] add more tests --- pandas/tests/window/test_rolling.py | 80 ++++++++--------------------- 1 file changed, 22 insertions(+), 58 deletions(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 640d721de24a2..5f7709c62a1e2 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1781,82 +1781,46 @@ def test_step_not_positive_raises(): ["values", "window", "min_periods", "expected"], [ [ - np.array([20, 10, 10, np.inf, 1, 1, 2, 3]), + [20, 10, 10, np.inf, 1, 1, 2, 3], 3, 1, - np.array( - [ - np.nan, - 50.0, - 33.33333333333333, - 0.0, - 40.5, - 0.0, - 0.3333333333333333, - 1.0, - ] - ), + [np.nan, 50, 100 / 3, 0, 40.5, 0, 1 / 3, 1], ], [ - np.array([20, 10, 10, np.nan, 10, 1, 2, 3]), + [20, 10, 10, np.nan, 10, 1, 2, 3], 3, 1, - np.array( - [ - np.nan, - 50.0, - 33.33333333333333, - 0.0, - 0.0, - 40.5, - 24.333333333333332, - 1.0, - ] - ), + [np.nan, 50, 100 / 3, 0, 0, 40.5, 73 / 3, 1], ], [ - np.array([np.nan, 5, 6, 7, 5, 5, 5]), + [np.nan, 5, 6, 7, 5, 5, 5], 3, 3, - np.array([np.nan, np.nan, np.nan, 1.0, 1.0, 1.3333333333333335, 0.0]), + [np.nan] * 3 + [1, 1, 4 / 3, 0], ], [ - np.array([5, 7, 7, 7, np.nan, np.inf, 4, 3, 3, 3]), + [5, 7, 7, 7, np.nan, np.inf, 4, 3, 3, 3], 3, 3, - np.array( - [ - np.nan, - np.nan, - 1.3333333333333335, - 0.0, - np.nan, - np.nan, - np.nan, - np.nan, - 0.33333333333333337, - 0.0, - ] - ), + [np.nan] * 2 + [4 / 3, 0] + [np.nan] * 4 + [1 / 3, 0], ], [ - np.array([5, 7, 7, 7, np.nan, np.inf, 7, 3, 3, 3]), + [5, 7, 7, 7, np.nan, np.inf, 7, 3, 3, 3], 3, 3, - np.array( - [ - np.nan, - np.nan, - 1.3333333333333335, - 0.0, - np.nan, - np.nan, - np.nan, - np.nan, - 5.333333333333333, - 0.0, - ] - ), + [np.nan] * 2 + [4 / 3, 0] + [np.nan] * 4 + [16 / 3, 0], + ], + [ + [5, 7] * 4, + 3, + 3, + [np.nan] * 2 + [4 / 3] * 6, + ], + [ + [5, 7, 5, np.nan, 7, 5, 7], + 3, + 2, + [np.nan, 2, 4 / 3] + [2] * 3 + [4 / 3], ], ], ) From 738c45ba7aa281c0edfd3ac3304e02d0d2a81a51 Mon Sep 17 00:00:00 2001 From: auderson Date: Mon, 11 Apr 2022 12:00:33 +0800 Subject: [PATCH 14/16] update whats new --- doc/source/whatsnew/v1.5.0.rst | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 895c127a48698..cdd3f01361447 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -95,9 +95,6 @@ Other enhancements - :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`) - :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`) - Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`) -- :meth:`DataFrame.rolling.var`, :meth:`DataFrame.rolling.std`, :meth:`Series.rolling.var`, :meth:`Series.rolling.std` now generate correct result 0 for window containing same values (:issue:`42064`) -- :meth:`DataFrame.rolling.skew`, :meth:`Series.rolling.skew` now generate correct result 0 for window containing same values (:issue:`42064`) -- :meth:`DataFrame.rolling.kurt`, :meth:`Series.rolling.kurt` now generate correct result -3 for window containing same values (:issue:`42064`, :issue:`30993`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: @@ -183,8 +180,9 @@ Styler .. _whatsnew_150.notable_bug_fixes.notable_bug_fix2: -notable_bug_fix2 -^^^^^^^^^^^^^^^^ +Groupby/resample/rolling +^^^^^^^^^^^^^^^^^^^^^^^^ +- rolling mean / sum / std / var / skew / kurt now output correct result for window with same values (:issue:`42064`, :issue:`30993`) .. --------------------------------------------------------------------------- .. _whatsnew_150.api_breaking: From ca1ee1a0f0629b95d14f7d3e4f33f89dedff26b9 Mon Sep 17 00:00:00 2001 From: auderson Date: Mon, 11 Apr 2022 12:01:10 +0800 Subject: [PATCH 15/16] update whats new --- doc/source/whatsnew/v1.5.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index cdd3f01361447..041ef79e327a4 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -182,6 +182,7 @@ Styler Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ + - rolling mean / sum / std / var / skew / kurt now output correct result for window with same values (:issue:`42064`, :issue:`30993`) .. --------------------------------------------------------------------------- From 8f0eba6281e6c47d618a265ca8d2730e32459ada Mon Sep 17 00:00:00 2001 From: auderson Date: Tue, 12 Apr 2022 09:38:58 +0800 Subject: [PATCH 16/16] update doc --- doc/source/whatsnew/v1.5.0.rst | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 01f6dd71ea397..02eb4b7e7c9c5 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -180,10 +180,8 @@ Styler .. _whatsnew_150.notable_bug_fixes.notable_bug_fix2: -Groupby/resample/rolling -^^^^^^^^^^^^^^^^^^^^^^^^ - -- rolling mean / sum / std / var / skew / kurt now output correct result for window with same values (:issue:`42064`, :issue:`30993`) +notable_bug_fix2 +^^^^^^^^^^^^^^^^ .. --------------------------------------------------------------------------- .. _whatsnew_150.api_breaking: @@ -601,7 +599,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.cummax` with ``int64`` dtype with leading value being the smallest possible int64 (:issue:`46382`) - Bug in :meth:`GroupBy.max` with empty groups and ``uint64`` dtype incorrectly raising ``RuntimeError`` (:issue:`46408`) - Bug in :meth:`.GroupBy.apply` would fail when ``func`` was a string and args or kwargs were supplied (:issue:`46479`) -- +- Bug in :meth:`Rolling.var` and :meth:`Rolling.std` would give non-zero result with window of same values (:issue:`42064`) Reshaping ^^^^^^^^^