Skip to content

Commit 094139b

Browse files
committed
Add tests to asv; doc changes; fastpath if no dups
1 parent 740df13 commit 094139b

File tree

5 files changed

+41
-5
lines changed

5 files changed

+41
-5
lines changed

asv_bench/benchmarks/frame_methods.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -420,14 +420,33 @@ def setup(self):
420420
self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n),
421421
'b': np.random.choice(t, n),
422422
'c': np.random.choice(xs, n)})
423-
self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T
423+
# df2 will not have any duplicates
424+
self.df2 = DataFrame(np.random.randn(100, 1000).astype(str))
425+
426+
df3 = DataFrame(np.random.randint(0, 10, (2 ** 18, 5)),
427+
columns=list('ABCDE'))
428+
df3.loc[:, 'F'] = Series('', index=df3.index).str.cat(df3.astype(str))
429+
self.df3 = df3
424430

425431
def time_frame_duplicated(self):
426432
self.df.duplicated()
427433

428434
def time_frame_duplicated_wide(self):
429435
self.df2.duplicated()
430436

437+
def time_frame_duplicated_wide_inverse(self):
438+
# will take fastpath for no duplicates
439+
self.df2.duplicated(return_inverse=True)
440+
441+
def time_frame_duplicated_mixed(self):
442+
self.df3.duplicated()
443+
444+
def time_frame_duplicated_mixed_inverse(self):
445+
self.df3.duplicated(return_inverse=True)
446+
447+
def time_frame_duplicated_mixed_inverse_last(self):
448+
self.df3.duplicated(return_inverse=True, keep='last')
449+
431450

432451
class XS(object):
433452

doc/source/whatsnew/v0.23.1.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
.. _whatsnew_0231:
22

3-
v0.23.1
4-
-------
3+
v0.23.1 (June 12, 2018)
4+
-----------------------
55

66
This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes
77
and bug fixes. We recommend that all users upgrade to this version.

doc/source/whatsnew/v0.24.0.txt

+3-2
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,15 @@ Previously, there was no way to determine how duplicate rows in a ``DataFrame``
1717
information that was calculated on the deduplicated values (e.g. aggregation) back to the original dataset. And while the ``numpy.unique`` function provides such a
1818
``return_inverse``-kwarg, it fails to work with ``object`` data.
1919

20-
The method has now gained a ``return_inverse`` keyword -- specifying ``return_inverse=True`` will change the output from a single Series to a tuple of two Series:
20+
Therefore, the ``duplicated`` method has gained a ``return_inverse`` keyword. Specifying ``return_inverse=True`` changes the output from a single Series
21+
to a tuple of two Series (in the following example, the index is deliberately not a default integer range, to illustrate that the inverse correctly refers to the index labels rather than to positions):
2122

2223
.. ipython:: python
2324

2425
df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
2526
index=[1, 4, 9, 16, 25])
2627
df
27-
isdup, inv = df.duplicated(return_inverse=True)
28+
isdup, inv = df.duplicated(return_inverse=True) # default: keep='first'
2829
isdup
2930
inv
3031

pandas/core/frame.py

+4
Original file line numberDiff line numberDiff line change
@@ -4407,6 +4407,10 @@ def f(vals):
44074407
isdup = Series(duplicated_int64(ids, keep), index=self.index)
44084408
if not return_inverse:
44094409
return isdup
4410+
elif not isdup.any():
4411+
# no need to calculate inverse if no duplicates
4412+
inv = Series(self.index, index=self.index)
4413+
return isdup, inv
44104414

44114415
if keep == 'first':
44124416
# o2u: original indices to indices of ARRAY of unique values

pandas/tests/frame/test_analytics.py

+12
Original file line numberDiff line numberDiff line change
@@ -1654,6 +1654,18 @@ def test_duplicated_inverse_large(self, subset, keep):
16541654
reconstr = unique.reindex(inv.values).set_index(inv.index)
16551655
tm.assert_frame_equal(reconstr, df[subset])
16561656

1657+
@pytest.mark.parametrize('keep', ['first', 'last'])
1658+
def test_duplicated_inverse_fastpath(self, keep):
1659+
df = DataFrame({'A': range(10)}) # no duplicates
1660+
1661+
expected_isdup = df.duplicated(keep=keep)
1662+
result_isdup, result_inv = df.duplicated(keep=keep,
1663+
return_inverse=True)
1664+
tm.assert_series_equal(result_isdup, expected_isdup)
1665+
1666+
expected_inv = Series(range(10))
1667+
tm.assert_series_equal(result_inv, expected_inv)
1668+
16571669
def test_drop_duplicates(self):
16581670
df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
16591671
'foo', 'bar', 'bar', 'foo'],

0 commit comments

Comments
 (0)