Skip to content

Commit db062da

Browse files
jbrockmendel authored and jreback committed
REF: implement cumulative ops block-wise (#29872)
1 parent a6a8440 commit db062da

File tree

4 files changed

+79
-37
lines changed

4 files changed

+79
-37
lines changed

doc/source/whatsnew/v1.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -765,6 +765,7 @@ Numeric
765765
- Bug in :class:`NumericIndex` construction that caused :class:`UInt64Index` to be cast to :class:`Float64Index` when integers in the ``np.uint64`` range were used to index a :class:`DataFrame` (:issue:`28279`)
766766
- Bug in :meth:`Series.interpolate` when using ``method='index'`` with an unsorted index, which would previously return incorrect results (:issue:`21037`)
767767
- Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`)
768+
- Bug in :class:`DataFrame` cumulative operations (e.g. ``cumsum``, ``cummax``) incorrectly casting to object-dtype (:issue:`19296`)
768769

769770
Conversion
770771
^^^^^^^^^^

pandas/core/generic.py

Lines changed: 57 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -11086,44 +11086,66 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs):
1108611086
else:
1108711087
axis = self._get_axis_number(axis)
1108811088

11089-
y = com.values_from_object(self).copy()
11090-
d = self._construct_axes_dict()
11091-
d["copy"] = False
11089+
if axis == 1:
11090+
return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T
11091+
11092+
def na_accum_func(blk_values):
11093+
# We will be applying this function to block values
11094+
if blk_values.dtype.kind in ["m", "M"]:
11095+
# numpy 1.18 started sorting NaTs at the end instead of beginning,
11096+
# so we need to work around to maintain backwards-consistency.
11097+
orig_dtype = blk_values.dtype
11098+
11099+
# We need to define mask before masking NaTs
11100+
mask = isna(blk_values)
11101+
11102+
if accum_func == np.minimum.accumulate:
11103+
# Note: the accum_func comparison fails as an "is" comparison
11104+
y = blk_values.view("i8")
11105+
y[mask] = np.iinfo(np.int64).max
11106+
changed = True
11107+
else:
11108+
y = blk_values
11109+
changed = False
11110+
11111+
result = accum_func(y.view("i8"), axis)
11112+
if skipna:
11113+
np.putmask(result, mask, iNaT)
11114+
elif accum_func == np.minimum.accumulate:
11115+
# Restore NaTs that we masked previously
11116+
nz = (~np.asarray(mask)).nonzero()[0]
11117+
if len(nz):
11118+
# everything up to the first non-na entry stays NaT
11119+
result[: nz[0]] = iNaT
11120+
11121+
if changed:
11122+
# restore NaT elements
11123+
y[mask] = iNaT # TODO: could try/finally for this?
11124+
11125+
if isinstance(blk_values, np.ndarray):
11126+
result = result.view(orig_dtype)
11127+
else:
11128+
# DatetimeArray
11129+
result = type(blk_values)._from_sequence(result, dtype=orig_dtype)
11130+
11131+
elif skipna and not issubclass(
11132+
blk_values.dtype.type, (np.integer, np.bool_)
11133+
):
11134+
vals = blk_values.copy().T
11135+
mask = isna(vals)
11136+
np.putmask(vals, mask, mask_a)
11137+
result = accum_func(vals, axis)
11138+
np.putmask(result, mask, mask_b)
11139+
else:
11140+
result = accum_func(blk_values.T, axis)
1109211141

11093-
if issubclass(y.dtype.type, (np.datetime64, np.timedelta64)):
11094-
# numpy 1.18 started sorting NaTs at the end instead of beginning,
11095-
# so we need to work around to maintain backwards-consistency.
11096-
orig_dtype = y.dtype
11097-
if accum_func == np.minimum.accumulate:
11098-
# Note: the accum_func comparison fails as an "is" comparison
11099-
# Note that "y" is always a copy, so we can safely modify it
11100-
mask = isna(self)
11101-
y = y.view("i8")
11102-
y[mask] = np.iinfo(np.int64).max
11103-
11104-
result = accum_func(y.view("i8"), axis).view(orig_dtype)
11105-
if skipna:
11106-
mask = isna(self)
11107-
np.putmask(result, mask, iNaT)
11108-
elif accum_func == np.minimum.accumulate:
11109-
# Restore NaTs that we masked previously
11110-
nz = (~np.asarray(mask)).nonzero()[0]
11111-
if len(nz):
11112-
# everything up to the first non-na entry stays NaT
11113-
result[: nz[0]] = iNaT
11142+
# transpose back for ndarray, not for EA
11143+
return result.T if hasattr(result, "T") else result
1111411144

11115-
if self.ndim == 1:
11116-
# restore dt64tz dtype
11117-
d["dtype"] = self.dtype
11118-
11119-
elif skipna and not issubclass(y.dtype.type, (np.integer, np.bool_)):
11120-
mask = isna(self)
11121-
np.putmask(y, mask, mask_a)
11122-
result = accum_func(y, axis)
11123-
np.putmask(result, mask, mask_b)
11124-
else:
11125-
result = accum_func(y, axis)
11145+
result = self._data.apply(na_accum_func)
1112611146

11147+
d = self._construct_axes_dict()
11148+
d["copy"] = False
1112711149
return self._constructor(result, **d).__finalize__(self)
1112811150

1112911151
return set_function_name(cum_func, name, cls)

pandas/tests/frame/test_apply.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1331,8 +1331,8 @@ def test_agg_cython_table(self, df, func, expected, axis):
13311331
_get_cython_table_params(
13321332
DataFrame([[np.nan, 1], [1, 2]]),
13331333
[
1334-
("cumprod", DataFrame([[np.nan, 1], [1.0, 2.0]])),
1335-
("cumsum", DataFrame([[np.nan, 1], [1.0, 3.0]])),
1334+
("cumprod", DataFrame([[np.nan, 1], [1, 2]])),
1335+
("cumsum", DataFrame([[np.nan, 1], [1, 3]])),
13361336
],
13371337
),
13381338
),
@@ -1341,6 +1341,10 @@ def test_agg_cython_table_transform(self, df, func, expected, axis):
13411341
# GH 21224
13421342
# test transforming functions in
13431343
# pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
1344+
if axis == "columns" or axis == 1:
1345+
# operating blockwise doesn't let us preserve dtypes
1346+
expected = expected.astype("float64")
1347+
13441348
result = df.agg(func, axis=axis)
13451349
tm.assert_frame_equal(result, expected)
13461350

pandas/tests/frame/test_cumulative.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,3 +118,18 @@ def test_cummax(self, datetime_frame):
118118
# fix issue
119119
cummax_xs = datetime_frame.cummax(axis=1)
120120
assert np.shape(cummax_xs) == np.shape(datetime_frame)
121+
122+
def test_cumulative_ops_preserve_dtypes(self):
123+
# GH#19296 dont incorrectly upcast to object
124+
df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3.0], "C": [True, False, False]})
125+
126+
result = df.cumsum()
127+
128+
expected = DataFrame(
129+
{
130+
"A": Series([1, 3, 6], dtype=np.int64),
131+
"B": Series([1, 3, 6], dtype=np.float64),
132+
"C": df["C"].cumsum(),
133+
}
134+
)
135+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)