From e8f539a231db9ba240bc91c41a4afcb1810f6d32 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 20 Jan 2020 13:52:11 +0100 Subject: [PATCH 01/16] API: generalized check_array_indexer for validating array-like indexers --- pandas/api/indexers/__init__.py | 4 +- pandas/core/arrays/categorical.py | 11 ++--- pandas/core/arrays/datetimelike.py | 6 ++- pandas/core/arrays/interval.py | 3 ++ pandas/core/arrays/masked.py | 14 ++++-- pandas/core/arrays/numpy_.py | 8 +-- pandas/core/arrays/sparse/array.py | 5 ++ pandas/core/indexers.py | 66 ++++++++++++++++++++++++- pandas/tests/extension/base/getitem.py | 32 ++++++++++++ pandas/tests/extension/decimal/array.py | 8 +-- pandas/tests/extension/json/array.py | 7 +-- 11 files changed, 131 insertions(+), 33 deletions(-) diff --git a/pandas/api/indexers/__init__.py b/pandas/api/indexers/__init__.py index 10654eb0888ee..e0050326e791a 100644 --- a/pandas/api/indexers/__init__.py +++ b/pandas/api/indexers/__init__.py @@ -2,7 +2,7 @@ Public API for Rolling Window Indexers. 
""" -from pandas.core.indexers import check_bool_array_indexer +from pandas.core.indexers import check_array_indexer, check_bool_array_indexer from pandas.core.window.indexers import BaseIndexer -__all__ = ["check_bool_array_indexer", "BaseIndexer"] +__all__ = ["check_array_indexer", "check_bool_array_indexer", "BaseIndexer"] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9d7359dd9c614..3c717458797b5 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -39,7 +39,7 @@ ) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries -from pandas.core.dtypes.inference import is_array_like, is_hashable +from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna from pandas.core import ops @@ -54,7 +54,7 @@ from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array -from pandas.core.indexers import check_bool_array_indexer +from pandas.core.indexers import check_array_indexer from pandas.core.missing import interpolate_2d from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort @@ -2001,11 +2001,8 @@ def __getitem__(self, key): else: return self.categories[i] - if is_list_like(key) and not is_array_like(key): - key = np.asarray(key) - - if com.is_bool_indexer(key): - key = check_bool_array_indexer(self, key) + if is_list_like(key): + key = check_array_indexer(self, key) result = self._codes[key] if result.ndim > 1: diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 70637026c278d..7fb3d31f34e8e 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -42,7 +42,7 @@ from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts from 
pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin import pandas.core.common as com -from pandas.core.indexers import check_bool_array_indexer +from pandas.core.indexers import check_array_indexer from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid import invalid_comparison, make_invalid_op @@ -517,8 +517,10 @@ def __getitem__(self, key): return self._box_func(val) return type(self)(val, dtype=self.dtype) + if is_list_like(key): + key = check_array_indexer(self, key) + if com.is_bool_indexer(key): - key = check_bool_array_indexer(self, key) if key.all(): key = slice(0, None, None) else: diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 37d2baed2c09e..23f2151bb593a 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -40,6 +40,7 @@ from pandas.core.arrays.categorical import Categorical import pandas.core.common as com from pandas.core.construction import array +from pandas.core.indexers import check_array_indexer from pandas.core.indexes.base import ensure_index _VALID_CLOSED = {"left", "right", "both", "neither"} @@ -495,6 +496,8 @@ def __len__(self) -> int: return len(self.left) def __getitem__(self, value): + if is_list_like(value): + value = check_array_indexer(self, value) left = self.left[value] right = self.right[value] diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 47605413ff1a6..50beb58d79b0e 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -4,13 +4,17 @@ from pandas._libs import lib, missing as libmissing -from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype +from pandas.core.dtypes.common import ( + is_integer, + is_list_like, + is_object_dtype, + is_string_dtype, +) from pandas.core.dtypes.missing import isna, notna from pandas.core.algorithms import take from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin -import pandas.core.common as 
com -from pandas.core.indexers import check_bool_array_indexer +from pandas.core.indexers import check_array_indexer if TYPE_CHECKING: from pandas._typing import Scalar @@ -35,8 +39,8 @@ def __getitem__(self, item): return self.dtype.na_value return self._data[item] - elif com.is_bool_indexer(item): - item = check_bool_array_indexer(self, item) + elif is_list_like(item): + item = check_array_indexer(self, item) return type(self)(self._data[item], self._mask[item]) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 4db3d3010adaf..903b4ad851907 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -9,6 +9,7 @@ from pandas.util._decorators import Appender from pandas.util._validators import validate_fillna_kwargs +from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_array_like @@ -18,9 +19,8 @@ from pandas.core import nanops from pandas.core.algorithms import searchsorted, take, unique from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin -import pandas.core.common as com from pandas.core.construction import extract_array -from pandas.core.indexers import check_bool_array_indexer +from pandas.core.indexers import check_array_indexer from pandas.core.missing import backfill_1d, pad_1d @@ -235,8 +235,8 @@ def __getitem__(self, item): if isinstance(item, type(self)): item = item._ndarray - elif com.is_bool_indexer(item): - item = check_bool_array_indexer(self, item) + elif is_list_like(item): + item = check_array_indexer(self, item) result = self._ndarray[item] if not lib.is_scalar(item): diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index e2562a375515d..0f21e4e03b8d2 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -29,6 +29,7 @@ is_datetime64_any_dtype, 
is_dtype_equal, is_integer, + is_list_like, is_object_dtype, is_scalar, is_string_dtype, @@ -43,6 +44,7 @@ from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.construction import sanitize_array +from pandas.core.indexers import check_array_indexer from pandas.core.missing import interpolate_2d import pandas.core.ops as ops from pandas.core.ops.common import unpack_zerodim_and_defer @@ -768,6 +770,9 @@ def __getitem__(self, key): else: key = np.asarray(key) + if is_list_like(key): + key = check_array_indexer(self, key) + if com.is_bool_indexer(key): key = check_bool_indexer(self, key) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 4d45769d2fea9..c59f6536683b3 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -5,7 +5,12 @@ from pandas._typing import AnyArrayLike -from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.common import ( + is_array_like, + is_bool_dtype, + is_integer_dtype, + is_list_like, +) from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries # ----------------------------------------------------------- @@ -307,3 +312,62 @@ def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndar if len(result) != len(array): raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.") return result + + +def check_array_indexer(array, indexer) -> np.ndarray: + """ + Check if `indexer` is a valid array indexer for `array`. + + `array` and `indexer` are checked to have the same length, and the + dtype is validated. If it is an integer or boolean ExtensionArray, it is + checked if there are missing values present, and it is converted to + the appropriate numpy array. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + array : array + The array that's being indexed (only used for the length). + indexer : array-like + The array-like that's used to index. + + Returns + ------- + numpy.ndarray + The validated indexer. 
+ + Raises + ------ + IndexError + When the lengths don't match. + ValueError + When `indexer` cannot be converted to a numpy ndarray. + + """ + import pandas as pd + + if not is_array_like(indexer): + indexer = pd.array(indexer) + dtype = indexer.dtype + if is_bool_dtype(dtype): + try: + indexer = np.asarray(indexer, dtype=bool) + except ValueError: + raise ValueError("Cannot mask with a boolean indexer containing NA values") + + # GH26658 + if len(indexer) != len(array): + raise IndexError( + f"Item wrong length {len(indexer)} instead of {len(array)}." + ) + + elif is_integer_dtype(dtype): + try: + indexer = np.asarray(indexer, dtype=int) + except ValueError: + raise ValueError( + "Cannot index with an integer indexer containing NA values" + ) + + return indexer diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index dc1f62c4c97c5..5c31d6f04335e 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -160,6 +160,38 @@ def test_getitem_boolean_array_mask_raises(self, data): with pytest.raises(ValueError): s[mask] + @pytest.mark.parametrize( + "idx", + [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])], + ids=["list", "integer-array", "numpy-array"], + ) + def test_getitem_integer_array(self, data, idx): + result = data[idx] + assert len(result) == 3 + assert isinstance(result, type(data)) + expected = data.take([0, 1, 2]) + self.assert_extension_array_equal(result, expected) + + expected = pd.Series(expected) + result = pd.Series(data)[idx] + self.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "idx", + [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")], + ids=["list", "integer-array"], + ) + def test_getitem_integer_with_missing_raises(self, data, idx): + msg = "Cannot index with an integer indexer containing NA values" + with pytest.raises(ValueError, match=msg): + data[idx] + + # TODO this raises KeyError about labels not 
found (it tries label-based) + # import pandas._testing as tm + # s = pd.Series(data, index=[tm.rands(4) for _ in range(len(data))]) + # with pytest.raises(ValueError, match=msg): + # s[idx] + def test_getitem_slice(self, data): # getitem[slice] should return an array result = data[slice(0)] # empty diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 85bd5f7a33fe1..12a906b5afbe1 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -117,13 +117,7 @@ def __getitem__(self, item): else: # array, slice. if pd.api.types.is_list_like(item): - if not pd.api.types.is_array_like(item): - item = pd.array(item) - dtype = item.dtype - if pd.api.types.is_bool_dtype(dtype): - item = pd.api.indexers.check_bool_array_indexer(self, item) - elif pd.api.types.is_integer_dtype(dtype): - item = np.asarray(item, dtype="int") + item = pd.api.indexers.check_array_indexer(self, item) return type(self)(self._data[item]) def take(self, indexer, allow_fill=False, fill_value=None): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 17bc2773aad19..fe83200152794 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -82,11 +82,8 @@ def __getitem__(self, item): # slice return type(self)(self.data[item]) else: - if not pd.api.types.is_array_like(item): - item = pd.array(item) - dtype = item.dtype - if pd.api.types.is_bool_dtype(dtype): - item = pd.api.indexers.check_bool_array_indexer(self, item) + item = pd.api.indexers.check_array_indexer(self, item) + if pd.api.types.is_bool_dtype(item.dtype): return self._from_sequence([x for x, m in zip(self, item) if m]) # integer return type(self)([self.data[i] for i in item]) From 4fa9f5a79d93ca0a2dabdb6e06d7d340d4003790 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 20 Jan 2020 14:04:00 +0100 Subject: [PATCH 02/16] test boolean message as well --- 
pandas/tests/extension/base/getitem.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 5c31d6f04335e..337c5bf93ab72 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -152,7 +152,9 @@ def test_getitem_boolean_array_mask(self, data): def test_getitem_boolean_array_mask_raises(self, data): mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") mask[:2] = pd.NA - with pytest.raises(ValueError): + + msg = "Cannot mask with a boolean indexer containing NA values" + with pytest.raises(ValueError, match=msg): data[mask] s = pd.Series(data) From b55dfd2d78358475132dca2214e1a4641f9a305d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 20 Jan 2020 15:52:46 +0100 Subject: [PATCH 03/16] fixes for failing tests --- pandas/core/arrays/categorical.py | 4 ++-- pandas/core/arrays/datetimelike.py | 2 ++ pandas/tests/indexes/categorical/test_category.py | 5 +++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3c717458797b5..2a7293f23278f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2001,12 +2001,12 @@ def __getitem__(self, key): else: return self.categories[i] - if is_list_like(key): + if is_list_like(key) and not isinstance(key, tuple): key = check_array_indexer(self, key) result = self._codes[key] if result.ndim > 1: - return result + raise IndexError("Cannot user indexer with multiple dimensions") return self._constructor(result, dtype=self.dtype, fastpath=True) def __setitem__(self, key, value): diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 7fb3d31f34e8e..e11053330771b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -521,6 +521,8 @@ def __getitem__(self, key): key = 
check_array_indexer(self, key) if com.is_bool_indexer(key): + # can still have object dtype + key = np.asarray(key, dtype=bool) if key.all(): key = slice(0, None, None) else: diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index e027641288bb9..20e114f2b1fc2 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -976,8 +976,9 @@ def test_engine_type(self, dtype, engine_type): assert np.issubdtype(ci.codes.dtype, dtype) assert isinstance(ci._engine, engine_type) - def test_getitem_2d_deprecated(self): + def test_getitem_raise_2d(self): # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable idx = self.create_index() - with pytest.raises(ValueError, match="cannot mask with array containing NA"): + msg = "Cannot user indexer with multiple dimensions" + with pytest.raises(IndexError, match=msg): idx[:, None] From 58bfe780130ebbfb3c9e0ec06f0ef357632f7333 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 22 Jan 2020 10:59:59 +0100 Subject: [PATCH 04/16] remove previous check_bool_array_indexer --- doc/source/reference/extensions.rst | 2 +- pandas/api/indexers/__init__.py | 4 +- pandas/core/common.py | 4 +- pandas/core/indexers.py | 84 +++++++++++------------------ pandas/core/indexing.py | 4 +- 5 files changed, 39 insertions(+), 59 deletions(-) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index c072237850d82..78fdfbfd28144 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -66,7 +66,7 @@ behaves correctly. .. 
autosummary:: :toctree: api/ - api.indexers.check_bool_array_indexer + api.indexers.check_array_indexer The sentinel ``pandas.api.extensions.no_default`` is used as the default diff --git a/pandas/api/indexers/__init__.py b/pandas/api/indexers/__init__.py index e0050326e791a..826297e6b498f 100644 --- a/pandas/api/indexers/__init__.py +++ b/pandas/api/indexers/__init__.py @@ -2,7 +2,7 @@ Public API for Rolling Window Indexers. """ -from pandas.core.indexers import check_array_indexer, check_bool_array_indexer +from pandas.core.indexers import check_array_indexer from pandas.core.window.indexers import BaseIndexer -__all__ = ["check_array_indexer", "check_bool_array_indexer", "BaseIndexer"] +__all__ = ["check_array_indexer", "BaseIndexer"] diff --git a/pandas/core/common.py b/pandas/core/common.py index f0fcb736586d6..d8b082e7c0f79 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -121,8 +121,8 @@ def is_bool_indexer(key: Any) -> bool: See Also -------- - check_bool_array_indexer : Check that `key` - is a valid mask for an array, and convert to an ndarray. + check_array_indexer : Check that `key` is a valid array to index, + and convert to an ndarray. """ na_msg = "cannot mask with array containing NA / NaN values" if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index c59f6536683b3..85a00fca0eccd 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -249,33 +249,36 @@ def length_of_indexer(indexer, target=None) -> int: raise AssertionError("cannot find the length of the indexer") -def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndarray: +def check_array_indexer(array: AnyArrayLike, indexer) -> np.ndarray: """ - Check if `mask` is a valid boolean indexer for `array`. + Check if `indexer` is a valid array indexer for `array`. - `array` and `mask` are checked to have the same length, and the - dtype is validated. 
+ `array` and `indexer` are checked to have the same length, and the + dtype is validated. If it is an integer or boolean ExtensionArray, it is + checked if there are missing values present, and it is converted to + the appropriate numpy array. .. versionadded:: 1.0.0 Parameters ---------- array : array - The array that's being masked. - mask : array - The boolean array that's masking. + The array that's being indexed (only used for the length). + indexer : array-like or list-like + The array-like that's used to index. Returns ------- numpy.ndarray - The validated boolean mask. + The validated indexer. Raises ------ IndexError When the lengths don't match. ValueError - When `mask` cannot be converted to a bool-dtype ndarray. + When `indexer` cannot be converted to a numpy ndarray to index + (e.g. presence of missing values). See Also -------- @@ -283,17 +286,18 @@ def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndar Examples -------- - A boolean ndarray is returned when the arguments are all valid. + When checking a boolean mask, a boolean ndarray is returned when the + arguments are all valid. >>> mask = pd.array([True, False]) >>> arr = pd.array([1, 2]) - >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + >>> pd.api.indexers.check_array_indexer(arr, mask) array([ True, False]) An IndexError is raised when the lengths don't match. >>> mask = pd.array([True, False, True]) - >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + >>> pd.api.indexers.check_array_indexer(arr, mask) Traceback (most recent call last): ... IndexError: Item wrong length 3 instead of 2. @@ -302,53 +306,29 @@ def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndar a bool-dtype ndarray. >>> mask = pd.array([True, pd.NA]) - >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + >>> pd.api.indexers.check_array_indexer(arr, mask) Traceback (most recent call last): ... 
- ValueError: cannot convert to bool numpy array in presence of missing values - """ - result = np.asarray(mask, dtype=bool) - # GH26658 - if len(result) != len(array): - raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.") - return result + ValueError: Cannot mask with a boolean indexer containing NA values + Similarly for integer indexers, an integer ndarray is returned when it is + a valid indexer, otherwise an error is (for integer indexers, a matching + length is not required): -def check_array_indexer(array, indexer) -> np.ndarray: - """ - Check if `indexer` is a valid array indexer for `array`. - - `array` and `indexer` are checked to have the same length, and the - dtype is validated. If it is an integer or boolean ExtensionArray, it is - checked if there are missing values present, and it is converted to - the appropriate numpy array. - - .. versionadded:: 1.0.0 - - Parameters - ---------- - array : array - The array that's being indexed (only used for the length). - indexer : array-like - The array-like that's used to index. - - Returns - ------- - numpy.ndarray - The validated indexer. - - Raises - ------ - IndexError - When the lengths don't match. - ValueError - When `indexer` cannot be converted to a numpy ndarray. + >>> indexer = pd.array([0, 2], dtype="Int64") + >>> arr = pd.array([1, 2, 3]) + >>> pd.api.indexers.check_array_indexer(arr, indexer) + array([0, 2]) + >>> indexer = pd.array([0, pd.NA], dtype="Int64") + Traceback (most recent call last): + ... 
+ ValueError: Cannot index with an integer indexer containing NA values """ - import pandas as pd + from pandas.core.construction import array as pd_array if not is_array_like(indexer): - indexer = pd.array(indexer) + indexer = pd_array(indexer) dtype = indexer.dtype if is_bool_dtype(dtype): try: @@ -364,7 +344,7 @@ def check_array_indexer(array, indexer) -> np.ndarray: elif is_integer_dtype(dtype): try: - indexer = np.asarray(indexer, dtype=int) + indexer = np.asarray(indexer, dtype=np.intp) except ValueError: raise ValueError( "Cannot index with an integer indexer containing NA values" diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 63a86792082da..a33724c8d2123 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -23,7 +23,7 @@ import pandas.core.common as com from pandas.core.indexers import ( - check_bool_array_indexer, + check_array_indexer, is_list_like_indexer, length_of_indexer, ) @@ -2232,7 +2232,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: else: if is_sparse(result): result = result.to_dense() - result = check_bool_array_indexer(index, result) + result = check_array_indexer(index, result) return result From 5ce8d857dd909934774042898cbd783137308a79 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 22 Jan 2020 11:08:03 +0100 Subject: [PATCH 05/16] don't convert tuples to avoid warning from numpy --- pandas/core/arrays/datetimelike.py | 2 +- pandas/tests/indexes/categorical/test_category.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e11053330771b..12e06d75548c2 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -517,7 +517,7 @@ def __getitem__(self, key): return self._box_func(val) return type(self)(val, dtype=self.dtype) - if is_list_like(key): + if is_list_like(key) and not isinstance(key, tuple): key = check_array_indexer(self, key) if 
com.is_bool_indexer(key): diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 20e114f2b1fc2..92169a49af838 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -976,7 +976,7 @@ def test_engine_type(self, dtype, engine_type): assert np.issubdtype(ci.codes.dtype, dtype) assert isinstance(ci._engine, engine_type) - def test_getitem_raise_2d(self): + def test_getitem_2d_deprecated(self): # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable idx = self.create_index() msg = "Cannot user indexer with multiple dimensions" From ebc21507bda3bc2d0228ec42553a3efa12d65cd4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 22 Jan 2020 11:54:41 +0100 Subject: [PATCH 06/16] ensure check_bool_indexer returns numpy array --- pandas/core/indexers.py | 4 ++-- pandas/core/indexing.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 85a00fca0eccd..997d229932cf3 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -262,8 +262,8 @@ def check_array_indexer(array: AnyArrayLike, indexer) -> np.ndarray: Parameters ---------- - array : array - The array that's being indexed (only used for the length). + array : array-like + The array that is being indexed (only used for the length). indexer : array-like or list-like The array-like that's used to index. 
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index a33724c8d2123..9d18080493d56 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2232,7 +2232,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: else: if is_sparse(result): result = result.to_dense() - result = check_array_indexer(index, result) + result = np.asarray(check_array_indexer(index, result), dtype=bool) return result From 4a51d9700bbc850d8a6d484d1326e83146520435 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 22 Jan 2020 12:46:29 +0100 Subject: [PATCH 07/16] raise warning for categorical --- pandas/core/arrays/categorical.py | 5 ++++- pandas/tests/indexes/categorical/test_category.py | 7 ------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 2a7293f23278f..4f50c9efc83b6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2006,7 +2006,10 @@ def __getitem__(self, key): result = self._codes[key] if result.ndim > 1: - raise IndexError("Cannot user indexer with multiple dimensions") + from pandas.core.indexes.base import deprecate_ndim_indexing + + deprecate_ndim_indexing(result) + return result return self._constructor(result, dtype=self.dtype, fastpath=True) def __setitem__(self, key, value): diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 92169a49af838..d09dc586fe056 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -975,10 +975,3 @@ def test_engine_type(self, dtype, engine_type): ci.values._codes = ci.values._codes.astype("int64") assert np.issubdtype(ci.codes.dtype, dtype) assert isinstance(ci._engine, engine_type) - - def test_getitem_2d_deprecated(self): - # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable - idx = self.create_index() - msg = 
"Cannot user indexer with multiple dimensions" - with pytest.raises(IndexError, match=msg): - idx[:, None] From c979df806de96e2d7fab1a29df4211157e82d67a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 24 Jan 2020 17:17:07 +0100 Subject: [PATCH 08/16] move deprecate_ndim_indexing --- pandas/core/arrays/categorical.py | 4 +--- pandas/core/indexers.py | 21 +++++++++++++++++++++ pandas/core/indexes/base.py | 16 +--------------- pandas/core/indexes/extension.py | 3 ++- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4f50c9efc83b6..03e9b5cacf4b3 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -54,7 +54,7 @@ from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array -from pandas.core.indexers import check_array_indexer +from pandas.core.indexers import check_array_indexer, deprecate_ndim_indexing from pandas.core.missing import interpolate_2d from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort @@ -2006,8 +2006,6 @@ def __getitem__(self, key): result = self._codes[key] if result.ndim > 1: - from pandas.core.indexes.base import deprecate_ndim_indexing - deprecate_ndim_indexing(result) return result return self._constructor(result, dtype=self.dtype, fastpath=True) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 997d229932cf3..450c046da404f 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -1,6 +1,8 @@ """ Low-dependency indexing utilities. 
""" +import warnings + import numpy as np from pandas._typing import AnyArrayLike @@ -249,6 +251,25 @@ def length_of_indexer(indexer, target=None) -> int: raise AssertionError("cannot find the length of the indexer") +def deprecate_ndim_indexing(result): + """ + Helper function to raise the deprecation warning for multi-dimensional + indexing on 1D Series/Index. + + GH#27125 indexer like idx[:, None] expands dim, but we cannot do that + and keep an index, so we currently return ndarray, which is deprecated + (Deprecation GH#30588). + """ + if np.ndim(result) > 1: + warnings.warn( + "Support for multi-dimensional indexing (e.g. `index[:, None]`) " + "on an Index is deprecated and will be removed in a future " + "version. Convert to a numpy array before indexing instead.", + DeprecationWarning, + stacklevel=3, + ) + + def check_array_indexer(array: AnyArrayLike, indexer) -> np.ndarray: """ Check if `indexer` is a valid array indexer for `array`. diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bab3d2d1b5431..95fa03c60d48e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -67,7 +67,7 @@ from pandas.core.arrays import ExtensionArray from pandas.core.base import IndexOpsMixin, PandasObject import pandas.core.common as com -from pandas.core.indexers import maybe_convert_indices +from pandas.core.indexers import deprecate_ndim_indexing, maybe_convert_indices from pandas.core.indexes.frozen import FrozenList import pandas.core.missing as missing from pandas.core.ops import get_op_result_name @@ -5819,17 +5819,3 @@ def _try_convert_to_int_array( pass raise ValueError - - -def deprecate_ndim_indexing(result): - if np.ndim(result) > 1: - # GH#27125 indexer like idx[:, None] expands dim, but we - # cannot do that and keep an index, so return ndarray - # Deprecation GH#30588 - warnings.warn( - "Support for multi-dimensional indexing (e.g. 
`index[:, None]`) " - "on an Index is deprecated and will be removed in a future " - "version. Convert to a numpy array before indexing instead.", - DeprecationWarning, - stacklevel=3, - ) diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 9ddc5c01030b1..c99f2a1e47b54 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -12,7 +12,8 @@ from pandas.core.dtypes.generic import ABCSeries from pandas.core.arrays import ExtensionArray -from pandas.core.indexes.base import Index, deprecate_ndim_indexing +from pandas.core.indexers import deprecate_ndim_indexing +from pandas.core.indexes.base import Index from pandas.core.ops import get_op_result_name From ce2e04297182be3041aa272695dbc7236924bda7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 24 Jan 2020 17:38:24 +0100 Subject: [PATCH 09/16] cleanup; ensure output of check_array_indexer is always an ndarray --- pandas/core/indexers.py | 41 +++++++++++++++++++++++------- pandas/tests/indexing/test_iloc.py | 6 ++++- pandas/tests/indexing/test_loc.py | 6 ++++- 3 files changed, 42 insertions(+), 11 deletions(-) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 450c046da404f..164d4be849d35 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -270,14 +270,18 @@ def deprecate_ndim_indexing(result): ) +# ----------------------------------------------------------- +# Public indexer validation + + def check_array_indexer(array: AnyArrayLike, indexer) -> np.ndarray: """ Check if `indexer` is a valid array indexer for `array`. - `array` and `indexer` are checked to have the same length, and the - dtype is validated. If it is an integer or boolean ExtensionArray, it is - checked if there are missing values present, and it is converted to - the appropriate numpy array. + For a boolean mask, `array` and `indexer` are checked to have the same + length. 
The dtype is validated, and if it is an integer or boolean + ExtensionArray, it is checked if there are missing values present, and + it is converted to the appropriate numpy array. .. versionadded:: 1.0.0 @@ -286,12 +290,14 @@ def check_array_indexer(array: AnyArrayLike, indexer) -> np.ndarray: array : array-like The array that is being indexed (only used for the length). indexer : array-like or list-like - The array-like that's used to index. + The array-like that's used to index. The function assumes this is an + array-like, and input that is not yet an numpy array or an ExtensionArray + is converted to one. Returns ------- numpy.ndarray - The validated indexer. + The validated indexer as a numpy array that can be used to index. Raises ------ @@ -321,7 +327,7 @@ def check_array_indexer(array: AnyArrayLike, indexer) -> np.ndarray: >>> pd.api.indexers.check_array_indexer(arr, mask) Traceback (most recent call last): ... - IndexError: Item wrong length 3 instead of 2. + IndexError: Boolean index has wrong length: 3 instead of 2. A ValueError is raised when the mask cannot be converted to a bool-dtype ndarray. @@ -332,6 +338,12 @@ def check_array_indexer(array: AnyArrayLike, indexer) -> np.ndarray: ... ValueError: Cannot mask with a boolean indexer containing NA values + A numpy boolean mask will get passed through (if the length is correct): + + >>> mask = np.array([True, False]) + >>> pd.api.indexers.check_array_indexer(arr, mask) + array([ True, False]) + Similarly for integer indexers, an integer ndarray is returned when it is a valid indexer, otherwise an error is (for integer indexers, a matching length is not required): @@ -342,9 +354,18 @@ def check_array_indexer(array: AnyArrayLike, indexer) -> np.ndarray: array([0, 2]) >>> indexer = pd.array([0, pd.NA], dtype="Int64") + >>> pd.api.indexers.check_array_indexer(arr, indexer) Traceback (most recent call last): ... 
ValueError: Cannot index with an integer indexer containing NA values + + For non-integer/boolean dtypes, an appropriate error is raised: + + >>> indexer = np.array([0., 2.], dtype="float64") + >>> pd.api.indexers.check_array_indexer(arr, indexer) + Traceback (most recent call last): + ... + IndexError: arrays used as indices must be of integer or boolean type """ from pandas.core.construction import array as pd_array @@ -360,9 +381,9 @@ def check_array_indexer(array: AnyArrayLike, indexer) -> np.ndarray: # GH26658 if len(indexer) != len(array): raise IndexError( - f"Item wrong length {len(indexer)} instead of {len(array)}." + f"Boolean index has wrong length: " + f"{len(indexer)} instead of {len(array)}" ) - elif is_integer_dtype(dtype): try: indexer = np.asarray(indexer, dtype=np.intp) @@ -370,5 +391,7 @@ def check_array_indexer(array: AnyArrayLike, indexer) -> np.ndarray: raise ValueError( "Cannot index with an integer indexer containing NA values" ) + else: + raise IndexError("arrays used as indices must be of integer or boolean type") return indexer diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 48c25ec034653..11c9d5213fe95 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -251,7 +251,11 @@ def test_iloc_getitem_bool_diff_len(self, index): s = Series([1, 2, 3]) with pytest.raises( IndexError, - match=("Item wrong length {} instead of {}.".format(len(index), len(s))), + match=( + "Boolean index has wrong length: {} instead of {}".format( + len(index), len(s) + ) + ), ): _ = s.iloc[index] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 4c1436b800fc3..d5a890a8694a6 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -202,7 +202,11 @@ def test_loc_getitem_bool_diff_len(self, index): s = Series([1, 2, 3]) with pytest.raises( IndexError, - match=("Item wrong length {} instead of {}.".format(len(index), 
len(s))), + match=( + "Boolean index has wrong length: {} instead of {}".format( + len(index), len(s) + ) + ), ): _ = s.loc[index] From 4d447bf030b8eb5886fd7622cc930d9a076f3f7d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 24 Jan 2020 17:41:05 +0100 Subject: [PATCH 10/16] clean-up black reformatting --- pandas/tests/indexing/test_iloc.py | 12 ++++-------- pandas/tests/indexing/test_loc.py | 12 ++++-------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 11c9d5213fe95..d67259e8b7d40 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -249,14 +249,10 @@ def test_iloc_getitem_bool(self): def test_iloc_getitem_bool_diff_len(self, index): # GH26658 s = Series([1, 2, 3]) - with pytest.raises( - IndexError, - match=( - "Boolean index has wrong length: {} instead of {}".format( - len(index), len(s) - ) - ), - ): + msg = "Boolean index has wrong length: {} instead of {}".format( + len(index), len(s) + ) + with pytest.raises(IndexError, match=msg): _ = s.iloc[index] def test_iloc_getitem_slice(self): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index d5a890a8694a6..b9dc96adfa738 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -200,14 +200,10 @@ def test_loc_getitem_bool(self): def test_loc_getitem_bool_diff_len(self, index): # GH26658 s = Series([1, 2, 3]) - with pytest.raises( - IndexError, - match=( - "Boolean index has wrong length: {} instead of {}".format( - len(index), len(s) - ) - ), - ): + msg = "Boolean index has wrong length: {} instead of {}".format( + len(index), len(s) + ) + with pytest.raises(IndexError, match=msg): _ = s.loc[index] def test_loc_getitem_int_slice(self): From 9ed8fe9c34c0898be95faf3e79a69caacf84c2e4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 28 Jan 2020 07:39:45 +0100 Subject: [PATCH 11/16] fix 
check_bool_indexer --- pandas/core/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 11c5725b5ddce..fcdaeb68dbde2 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2232,7 +2232,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: else: if is_sparse(result): result = np.asarray(result) - result = np.asarray(check_array_indexer(index, result), dtype=bool) + result = check_array_indexer(index, np.asarray(result, dtype=bool)) return result From 2f8cd276e6e7bc2be3c8dfa4a8d94855187c5654 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 28 Jan 2020 08:03:52 +0100 Subject: [PATCH 12/16] add comment to check_bool_indexer --- pandas/core/indexing.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index fcdaeb68dbde2..fb49da0352b60 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -15,7 +15,6 @@ is_numeric_dtype, is_scalar, is_sequence, - is_sparse, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries @@ -2230,9 +2229,9 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: ) result = result.astype(bool)._values else: - if is_sparse(result): - result = np.asarray(result) - result = check_array_indexer(index, np.asarray(result, dtype=bool)) + # key might be sparse / object-dtype bool, check_array_indexer needs bool array + result = np.asarray(result, dtype=bool) + result = check_array_indexer(index, result) return result From 4d9a201664adbf46833f3765f0f18e286c033d91 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 28 Jan 2020 10:12:14 +0100 Subject: [PATCH 13/16] fix empty list case --- pandas/core/arrays/datetimelike.py | 9 +++++---- pandas/core/indexers.py | 3 +++ pandas/tests/extension/base/getitem.py | 14 +++++++++++++- 3 files changed, 21 insertions(+), 5 
deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 12e06d75548c2..2c1aad436b6c9 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -517,16 +517,17 @@ def __getitem__(self, key): return self._box_func(val) return type(self)(val, dtype=self.dtype) - if is_list_like(key) and not isinstance(key, tuple): - key = check_array_indexer(self, key) - if com.is_bool_indexer(key): - # can still have object dtype + # first check for boolean, because check_array_indexer doesn't + # allow object dtype key = np.asarray(key, dtype=bool) + key = check_array_indexer(self, key) if key.all(): key = slice(0, None, None) else: key = lib.maybe_booleans_to_slice(key.view(np.uint8)) + elif is_list_like(key) and not isinstance(key, tuple): + key = check_array_indexer(self, key) is_period = is_period_dtype(self) if is_period: diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 164d4be849d35..5adb58d6defcb 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -371,6 +371,9 @@ def check_array_indexer(array: AnyArrayLike, indexer) -> np.ndarray: if not is_array_like(indexer): indexer = pd_array(indexer) + if len(indexer) == 0: + # empty list is converted to float array by pd.array + indexer = np.array([], dtype=np.intp) dtype = indexer.dtype if is_bool_dtype(dtype): try: diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 8a05ab4ddaa1b..8615a8df22dcc 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -97,6 +97,15 @@ def test_getitem_scalar_na(self, data_missing, na_cmp, na_value): result = data_missing[0] assert na_cmp(result, na_value) + def test_getitem_empty(self, data): + # Indexing with empty list + result = data[[]] + assert len(result) == 0 + assert isinstance(result, type(data)) + + expected = data[np.array([], dtype="int64")] + self.assert_extension_array_equal(result, 
expected) + def test_getitem_mask(self, data): # Empty mask, raw array mask = np.zeros(len(data), dtype=bool) @@ -153,7 +162,10 @@ def test_getitem_boolean_array_mask_raises(self, data): mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") mask[:2] = pd.NA - msg = "Cannot mask with a boolean indexer containing NA values" + msg = ( + "Cannot mask with a boolean indexer containing NA values|" + "cannot mask with array containing NA / NaN values" + ) with pytest.raises(ValueError, match=msg): data[mask] From 097d221f3e49d3346ea78687f46cea1698d14e3c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 28 Jan 2020 10:35:13 +0100 Subject: [PATCH 14/16] add specific tests for check_array_indexer --- pandas/tests/indexing/test_check_indexer.py | 87 +++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 pandas/tests/indexing/test_check_indexer.py diff --git a/pandas/tests/indexing/test_check_indexer.py b/pandas/tests/indexing/test_check_indexer.py new file mode 100644 index 0000000000000..33c6bc3372083 --- /dev/null +++ b/pandas/tests/indexing/test_check_indexer.py @@ -0,0 +1,87 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.api.indexers import check_array_indexer + + +@pytest.mark.parametrize( + "indexer, expected", + [ + # integer + ([1, 2], np.array([1, 2], dtype=np.intp)), + (np.array([1, 2], dtype="int64"), np.array([1, 2], dtype=np.intp)), + (pd.array([1, 2], dtype="Int32"), np.array([1, 2], dtype=np.intp)), + (pd.Index([1, 2]), np.array([1, 2], dtype=np.intp)), + # boolean + ([True, False, True], np.array([True, False, True], dtype=np.bool_)), + (np.array([True, False, True]), np.array([True, False, True], dtype=np.bool_)), + ( + pd.array([True, False, True], dtype="boolean"), + np.array([True, False, True], dtype=np.bool_), + ), + # other + ([], np.array([], dtype=np.intp)), + ], +) +def test_valid_input(indexer, expected): + array = np.array([1, 2, 3]) + result = 
check_array_indexer(array, indexer) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "indexer", [[True, False, None], pd.array([True, False, None], dtype="boolean")], +) +def test_bool_raise_missing_values(indexer): + array = np.array([1, 2, 3]) + + msg = "Cannot mask with a boolean indexer containing NA values" + with pytest.raises(ValueError, match=msg): + check_array_indexer(array, indexer) + + +@pytest.mark.parametrize( + "indexer", + [ + [True, False], + pd.array([True, False], dtype="boolean"), + np.array([True, False], dtype=np.bool_), + ], +) +def test_bool_raise_length(indexer): + array = np.array([1, 2, 3]) + + msg = "Boolean index has wrong length" + with pytest.raises(IndexError, match=msg): + check_array_indexer(array, indexer) + + +@pytest.mark.parametrize( + "indexer", [[0, 1, None], pd.array([0, 1, pd.NA], dtype="Int64")], +) +def test_int_raise_missing_values(indexer): + array = np.array([1, 2, 3]) + + msg = "Cannot index with an integer indexer containing NA values" + with pytest.raises(ValueError, match=msg): + check_array_indexer(array, indexer) + + +@pytest.mark.parametrize( + "indexer", + [ + [0.0, 1.0], + np.array([1.0, 2.0], dtype="float64"), + np.array([True, False], dtype=object), + pd.Index([True, False], dtype=object), + pd.array(["a", "b"], dtype="string"), + ], +) +def test_raise_invalid_array_dtypes(indexer): + array = np.array([1, 2, 3]) + + msg = "arrays used as indices must be of integer or boolean type" + with pytest.raises(IndexError, match=msg): + check_array_indexer(array, indexer) From 3c5e4c68873e641f0f2dffed35d74e0a64908aa1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 28 Jan 2020 12:03:09 +0100 Subject: [PATCH 15/16] allow list-length-1-with-slice corner case --- pandas/core/arrays/datetimelike.py | 6 +++++- pandas/core/arrays/interval.py | 2 +- pandas/core/arrays/masked.py | 2 +- pandas/core/arrays/numpy_.py | 2 +- pandas/core/arrays/sparse/array.py | 2 +- 5 files changed, 
9 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 2c1aad436b6c9..319caa53dc3c1 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -518,7 +518,7 @@ def __getitem__(self, key): return type(self)(val, dtype=self.dtype) if com.is_bool_indexer(key): - # first check for boolean, because check_array_indexer doesn't + # first convert to boolean, because check_array_indexer doesn't # allow object dtype key = np.asarray(key, dtype=bool) key = check_array_indexer(self, key) @@ -526,6 +526,10 @@ def __getitem__(self, key): key = slice(0, None, None) else: key = lib.maybe_booleans_to_slice(key.view(np.uint8)) + elif isinstance(key, list) and len(key) == 1 and isinstance(key[0], slice): + # see https://github.com/pandas-dev/pandas/issues/31299, need to allow + # this for now (would otherwise raise in check_array_indexer) + pass elif is_list_like(key) and not isinstance(key, tuple): key = check_array_indexer(self, key) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 23f2151bb593a..055c772d66485 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -496,7 +496,7 @@ def __len__(self) -> int: return len(self.left) def __getitem__(self, value): - if is_list_like(value): + if is_list_like(value) and not isinstance(value, tuple): value = check_array_indexer(self, value) left = self.left[value] right = self.right[value] diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 6bd21df434528..2c3e51960622d 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -39,7 +39,7 @@ def __getitem__(self, item): return self.dtype.na_value return self._data[item] - elif is_list_like(item): + elif is_list_like(item) and not isinstance(item, tuple): item = check_array_indexer(self, item) return type(self)(self._data[item], self._mask[item]) diff --git 
a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 56b362e3db7a3..73bc22d199f37 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -234,7 +234,7 @@ def __getitem__(self, item): if isinstance(item, type(self)): item = item._ndarray - elif is_list_like(item): + elif is_list_like(item) and not isinstance(item, tuple): item = check_array_indexer(self, item) result = self._ndarray[item] diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c499218c2e947..078e6c2f119d2 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -770,7 +770,7 @@ def __getitem__(self, key): else: key = np.asarray(key) - if is_list_like(key): + if is_list_like(key) and not isinstance(key, tuple): key = check_array_indexer(self, key) if com.is_bool_indexer(key): From 1ca35d15baffa2e8869b238f57f29e2b01c91332 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 28 Jan 2020 16:12:57 +0100 Subject: [PATCH 16/16] move list-like check inside --- pandas/core/arrays/categorical.py | 3 +-- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/interval.py | 3 +-- pandas/core/arrays/masked.py | 10 ++------ pandas/core/arrays/numpy_.py | 4 +-- pandas/core/arrays/sparse/array.py | 4 +-- pandas/core/indexers.py | 28 ++++++++++++++++----- pandas/tests/extension/decimal/array.py | 3 +-- pandas/tests/indexing/test_check_indexer.py | 10 ++++++++ 9 files changed, 40 insertions(+), 27 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 03e9b5cacf4b3..412422397af06 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2001,8 +2001,7 @@ def __getitem__(self, key): else: return self.categories[i] - if is_list_like(key) and not isinstance(key, tuple): - key = check_array_indexer(self, key) + key = check_array_indexer(self, key) result = self._codes[key] if result.ndim > 1: diff --git 
a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 319caa53dc3c1..0ea707e1ae69d 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -530,7 +530,7 @@ def __getitem__(self, key): # see https://github.com/pandas-dev/pandas/issues/31299, need to allow # this for now (would otherwise raise in check_array_indexer) pass - elif is_list_like(key) and not isinstance(key, tuple): + else: key = check_array_indexer(self, key) is_period = is_period_dtype(self) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 055c772d66485..d890c0c16aecc 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -496,8 +496,7 @@ def __len__(self) -> int: return len(self.left) def __getitem__(self, value): - if is_list_like(value) and not isinstance(value, tuple): - value = check_array_indexer(self, value) + value = check_array_indexer(self, value) left = self.left[value] right = self.right[value] diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 2c3e51960622d..80e317123126a 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -4,12 +4,7 @@ from pandas._libs import lib, missing as libmissing -from pandas.core.dtypes.common import ( - is_integer, - is_list_like, - is_object_dtype, - is_string_dtype, -) +from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype from pandas.core.dtypes.missing import isna, notna from pandas.core.algorithms import take @@ -39,8 +34,7 @@ def __getitem__(self, item): return self.dtype.na_value return self._data[item] - elif is_list_like(item) and not isinstance(item, tuple): - item = check_array_indexer(self, item) + item = check_array_indexer(self, item) return type(self)(self._data[item], self._mask[item]) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 73bc22d199f37..8b1d1e58dc36c 100644 --- a/pandas/core/arrays/numpy_.py +++ 
b/pandas/core/arrays/numpy_.py @@ -9,7 +9,6 @@ from pandas.util._decorators import Appender from pandas.util._validators import validate_fillna_kwargs -from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_array_like @@ -234,8 +233,7 @@ def __getitem__(self, item): if isinstance(item, type(self)): item = item._ndarray - elif is_list_like(item) and not isinstance(item, tuple): - item = check_array_indexer(self, item) + item = check_array_indexer(self, item) result = self._ndarray[item] if not lib.is_scalar(item): diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 078e6c2f119d2..b476a019c66cc 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -29,7 +29,6 @@ is_datetime64_any_dtype, is_dtype_equal, is_integer, - is_list_like, is_object_dtype, is_scalar, is_string_dtype, @@ -770,8 +769,7 @@ def __getitem__(self, key): else: key = np.asarray(key) - if is_list_like(key) and not isinstance(key, tuple): - key = check_array_indexer(self, key) + key = check_array_indexer(self, key) if com.is_bool_indexer(key): key = check_bool_indexer(self, key) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 5adb58d6defcb..fe475527f4596 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -5,7 +5,7 @@ import numpy as np -from pandas._typing import AnyArrayLike +from pandas._typing import Any, AnyArrayLike from pandas.core.dtypes.common import ( is_array_like, @@ -274,14 +274,18 @@ def deprecate_ndim_indexing(result): # Public indexer validation -def check_array_indexer(array: AnyArrayLike, indexer) -> np.ndarray: +def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any: """ Check if `indexer` is a valid array indexer for `array`. For a boolean mask, `array` and `indexer` are checked to have the same length. 
The dtype is validated, and if it is an integer or boolean ExtensionArray, it is checked if there are missing values present, and - it is converted to the appropriate numpy array. + it is converted to the appropriate numpy array. Other dtypes will raise + an error. + + Non-array indexers (integer, slice, Ellipsis, tuples, ..) are passed + through as is. .. versionadded:: 1.0.0 @@ -290,9 +294,9 @@ def check_array_indexer(array: AnyArrayLike, indexer) -> np.ndarray: array : array-like The array that is being indexed (only used for the length). indexer : array-like or list-like - The array-like that's used to index. The function assumes this is an - array-like, and input that is not yet an numpy array or an ExtensionArray - is converted to one. + The array-like that's used to index. List-like input that is not yet + a numpy array or an ExtensionArray is converted to one. Other input + types are passed through as is. Returns ------- @@ -369,11 +373,23 @@ def check_array_indexer(array: AnyArrayLike, indexer) -> np.ndarray: """ from pandas.core.construction import array as pd_array + # whatever is not an array-like is returned as-is (possible valid array + # indexers that are not array-like: integer, slice, Ellipsis, None) + # In this context, tuples are not considered as array-like, as they have + # a specific meaning in indexing (multi-dimensional indexing) + if is_list_like(indexer): + if isinstance(indexer, tuple): + return indexer + else: + return indexer + + # convert list-likes to array if not is_array_like(indexer): indexer = pd_array(indexer) if len(indexer) == 0: # empty list is converted to float array by pd.array indexer = np.array([], dtype=np.intp) + dtype = indexer.dtype if is_bool_dtype(dtype): try: diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 12a906b5afbe1..743852c35dbd8 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -116,8 +116,7 @@ def
__getitem__(self, item): return self._data[item] else: # array, slice. - if pd.api.types.is_list_like(item): - item = pd.api.indexers.check_array_indexer(self, item) + item = pd.api.indexers.check_array_indexer(self, item) return type(self)(self._data[item]) def take(self, indexer, allow_fill=False, fill_value=None): diff --git a/pandas/tests/indexing/test_check_indexer.py b/pandas/tests/indexing/test_check_indexer.py index 33c6bc3372083..82f8c12229824 100644 --- a/pandas/tests/indexing/test_check_indexer.py +++ b/pandas/tests/indexing/test_check_indexer.py @@ -85,3 +85,13 @@ def test_raise_invalid_array_dtypes(indexer): msg = "arrays used as indices must be of integer or boolean type" with pytest.raises(IndexError, match=msg): check_array_indexer(array, indexer) + + +@pytest.mark.parametrize( + "indexer", [None, Ellipsis, slice(0, 3), (None,)], +) +def test_pass_through_non_array_likes(indexer): + array = np.array([1, 2, 3]) + + result = check_array_indexer(array, indexer) + assert result == indexer