Add tests to asv; doc changes; fastpath if no dups

h-vetinari · h-vetinari · commit 094139bffa07 · 2018-06-27T20:05:25.000+02:00
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
@@ -420,14 +420,33 @@ def setup(self):
         self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n),
                              'b': np.random.choice(t, n),
                              'c': np.random.choice(xs, n)})
-        self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T
+        # df2 will not have any duplicates
+        self.df2 = DataFrame(np.random.randn(100, 1000).astype(str))
+
+        df3 = DataFrame(np.random.randint(0, 10, (2 ** 18, 5)),
+                        columns=list('ABCDE'))
+        df3.loc[:, 'F'] = Series('', index=df3.index).str.cat(df3.astype(str))
+        self.df3 = df3
 
     def time_frame_duplicated(self):
         self.df.duplicated()
 
     def time_frame_duplicated_wide(self):
         self.df2.duplicated()
 
+    def time_frame_duplicated_wide_inverse(self):
+        # will take fastpath for no duplicates
+        self.df2.duplicated(return_inverse=True)
+
+    def time_frame_duplicated_mixed(self):
+        self.df3.duplicated()
+
+    def time_frame_duplicated_mixed_inverse(self):
+        self.df3.duplicated(return_inverse=True)
+
+    def time_frame_duplicated_mixed_inverse_last(self):
+        self.df3.duplicated(return_inverse=True, keep='last')
+
 
 class XS(object):
 
diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
@@ -1,7 +1,7 @@
 .. _whatsnew_0231:
 
-v0.23.1
--------
+v0.23.1 (June 12, 2018)
+-----------------------
 
 This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes
 and bug fixes. We recommend that all users upgrade to this version.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -17,14 +17,15 @@ Previously, there was no way to determine how duplicate rows in a ``DataFrame``
 information that was calculated on the deduplicated values (e.g. aggregation) back to the original dataset. And while the ``numpy.unique``-method provides such a
 ``return_inverse``-kwarg, it fails to work with ``object`` data.
 
-The method has now gained a ``return_inverse`` keyword -- specifying ``return_inverse=True`` will change the output from a single Series to a tuple of two Series:
+Therefore, the ``duplicated``-method has now gained a ``return_inverse`` keyword. Specifying ``return_inverse=True`` will change the output from a single Series
+to a tuple of two Series (the following example deliberately uses a non-default index, to illustrate that the inverse correctly takes it into account):
 
 .. ipython:: python
 
     df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
                       index=[1, 4, 9, 16, 25])
     df
-    isdup, inv = df.duplicated(return_inverse=True)
+    isdup, inv = df.duplicated(return_inverse=True)  # default: keep='first'
     isdup
     inv
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4407,6 +4407,10 @@ def f(vals):
         isdup = Series(duplicated_int64(ids, keep), index=self.index)
         if not return_inverse:
             return isdup
+        elif not isdup.any():
+            # without duplicates, the inverse is trivially the index itself
+            inv = Series(self.index, index=self.index)
+            return isdup, inv
 
         if keep == 'first':
             # o2u: original indices to indices of ARRAY of unique values
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -1654,6 +1654,18 @@ def test_duplicated_inverse_large(self, subset, keep):
         reconstr = unique.reindex(inv.values).set_index(inv.index)
         tm.assert_frame_equal(reconstr, df[subset])
 
+    @pytest.mark.parametrize('keep', ['first', 'last'])
+    def test_duplicated_inverse_fastpath(self, keep):
+        df = DataFrame({'A': range(10)})  # no duplicates
+
+        expected_isdup = df.duplicated(keep=keep)
+        result_isdup, result_inv = df.duplicated(keep=keep,
+                                                 return_inverse=True)
+        tm.assert_series_equal(result_isdup, expected_isdup)
+
+        expected_inv = Series(range(10))
+        tm.assert_series_equal(result_inv, expected_inv)
+
     def test_drop_duplicates(self):
         df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
                                 'foo', 'bar', 'bar', 'foo'],