REF: implement cumulative ops block-wise (#29872)

jbrockmendel · jreback · commit db062dabde47 · 2019-12-30T08:30:18.000-05:00
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -765,6 +765,7 @@ Numeric
 - Bug in :class:`NumericIndex` construction that caused :class:`UInt64Index` to be casted to :class:`Float64Index` when integers in the ``np.uint64`` range were used to index a :class:`DataFrame` (:issue:`28279`)
 - Bug in :meth:`Series.interpolate` when using method=`index` with an unsorted index, would previously return incorrect results. (:issue:`21037`)
 - Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`)
+- Bug in :class:`DataFrame` cumulative operations (e.g. cumsum, cummax) incorrect casting to object-dtype (:issue:`19296`)
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -11086,44 +11086,66 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs):
         else:
             axis = self._get_axis_number(axis)
 
-        y = com.values_from_object(self).copy()
-        d = self._construct_axes_dict()
-        d["copy"] = False
+        if axis == 1:
+            return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T
+
+        def na_accum_func(blk_values):
+            # We will be applying this function to block values
+            if blk_values.dtype.kind in ["m", "M"]:
+                # numpy 1.18 started sorting NaTs at the end instead of beginning,
+                #  so we need to work around to maintain backwards-consistency.
+                orig_dtype = blk_values.dtype
+
+                # We need to define mask before masking NaTs
+                mask = isna(blk_values)
+
+                if accum_func == np.minimum.accumulate:
+                    # Note: the accum_func comparison fails as an "is" comparison
+                    y = blk_values.view("i8")
+                    y[mask] = np.iinfo(np.int64).max
+                    changed = True
+                else:
+                    y = blk_values
+                    changed = False
+
+                result = accum_func(y.view("i8"), axis)
+                if skipna:
+                    np.putmask(result, mask, iNaT)
+                elif accum_func == np.minimum.accumulate:
+                    # Restore NaTs that we masked previously
+                    nz = (~np.asarray(mask)).nonzero()[0]
+                    if len(nz):
+                        # everything up to the first non-na entry stays NaT
+                        result[: nz[0]] = iNaT
+
+                if changed:
+                    # restore NaT elements
+                    y[mask] = iNaT  # TODO: could try/finally for this?
+
+                if isinstance(blk_values, np.ndarray):
+                    result = result.view(orig_dtype)
+                else:
+                    # DatetimeArray
+                    result = type(blk_values)._from_sequence(result, dtype=orig_dtype)
+
+            elif skipna and not issubclass(
+                blk_values.dtype.type, (np.integer, np.bool_)
+            ):
+                vals = blk_values.copy().T
+                mask = isna(vals)
+                np.putmask(vals, mask, mask_a)
+                result = accum_func(vals, axis)
+                np.putmask(result, mask, mask_b)
+            else:
+                result = accum_func(blk_values.T, axis)
 
-        if issubclass(y.dtype.type, (np.datetime64, np.timedelta64)):
-            # numpy 1.18 started sorting NaTs at the end instead of beginning,
-            #  so we need to work around to maintain backwards-consistency.
-            orig_dtype = y.dtype
-            if accum_func == np.minimum.accumulate:
-                # Note: the accum_func comparison fails as an "is" comparison
-                # Note that "y" is always a copy, so we can safely modify it
-                mask = isna(self)
-                y = y.view("i8")
-                y[mask] = np.iinfo(np.int64).max
-
-            result = accum_func(y.view("i8"), axis).view(orig_dtype)
-            if skipna:
-                mask = isna(self)
-                np.putmask(result, mask, iNaT)
-            elif accum_func == np.minimum.accumulate:
-                # Restore NaTs that we masked previously
-                nz = (~np.asarray(mask)).nonzero()[0]
-                if len(nz):
-                    # everything up to the first non-na entry stays NaT
-                    result[: nz[0]] = iNaT
+            # transpose back for ndarray, not for EA
+            return result.T if hasattr(result, "T") else result
 
-            if self.ndim == 1:
-                # restore dt64tz dtype
-                d["dtype"] = self.dtype
-
-        elif skipna and not issubclass(y.dtype.type, (np.integer, np.bool_)):
-            mask = isna(self)
-            np.putmask(y, mask, mask_a)
-            result = accum_func(y, axis)
-            np.putmask(result, mask, mask_b)
-        else:
-            result = accum_func(y, axis)
+        result = self._data.apply(na_accum_func)
 
+        d = self._construct_axes_dict()
+        d["copy"] = False
         return self._constructor(result, **d).__finalize__(self)
 
     return set_function_name(cum_func, name, cls)
diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py
@@ -1331,8 +1331,8 @@ def test_agg_cython_table(self, df, func, expected, axis):
             _get_cython_table_params(
                 DataFrame([[np.nan, 1], [1, 2]]),
                 [
-                    ("cumprod", DataFrame([[np.nan, 1], [1.0, 2.0]])),
-                    ("cumsum", DataFrame([[np.nan, 1], [1.0, 3.0]])),
+                    ("cumprod", DataFrame([[np.nan, 1], [1, 2]])),
+                    ("cumsum", DataFrame([[np.nan, 1], [1, 3]])),
                 ],
             ),
         ),
@@ -1341,6 +1341,10 @@ def test_agg_cython_table_transform(self, df, func, expected, axis):
         # GH 21224
         # test transforming functions in
         # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
+        if axis == "columns" or axis == 1:
+            # operating blockwise doesn't let us preserve dtypes
+            expected = expected.astype("float64")
+
         result = df.agg(func, axis=axis)
         tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/frame/test_cumulative.py b/pandas/tests/frame/test_cumulative.py
@@ -118,3 +118,18 @@ def test_cummax(self, datetime_frame):
         # fix issue
         cummax_xs = datetime_frame.cummax(axis=1)
         assert np.shape(cummax_xs) == np.shape(datetime_frame)
+
+    def test_cumulative_ops_preserve_dtypes(self):
+        # GH#19296 dont incorrectly upcast to object
+        df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3.0], "C": [True, False, False]})
+
+        result = df.cumsum()
+
+        expected = DataFrame(
+            {
+                "A": Series([1, 3, 6], dtype=np.int64),
+                "B": Series([1, 3, 6], dtype=np.float64),
+                "C": df["C"].cumsum(),
+            }
+        )
+        tm.assert_frame_equal(result, expected)