diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
new file mode 100644
index 0000000000000..3d926357b81f4
--- /dev/null
+++ b/pandas/core/arrays/boolean.py
@@ -0,0 +1,63 @@
+import numpy as np
+
+from pandas.core.arrays.integer import IntegerArray, _IntegerDtype
+from pandas.core.dtypes.dtypes import registry
+from pandas.util._decorators import cache_readonly
+
+
+class BooleanDtype(_IntegerDtype):
+    """
+    Extension dtype for boolean data, registered under the name "EABool".
+
+    Subclasses ``_IntegerDtype`` but reports itself as neither a signed
+    nor an unsigned integer dtype. Missing values are ``np.nan``.
+    """
+    name = "EABool"
+    # np.bool_ is the NumPy boolean scalar type; the bare ``np.bool`` alias
+    # of the builtin was deprecated in NumPy 1.20 and removed in 1.24
+    type = np.bool_
+    na_value = np.nan
+
+    @cache_readonly
+    def is_signed_integer(self):
+        return False
+
+    @cache_readonly
+    def is_unsigned_integer(self):
+        return False
+
+    @classmethod
+    def construct_array_type(cls):
+        """Return the array type associated with this dtype."""
+        return BooleanArray
+
+
+class BooleanArray(IntegerArray):
+    """IntegerArray subclass whose ``dtype`` is always ``BooleanDtype()``."""
+
+    @cache_readonly
+    def dtype(self):
+        return BooleanDtype()
+
+
+def to_boolean_array(values):
+    """
+    Infer and return a boolean array of the values.
+
+    Parameters
+    ----------
+    values : 1D list-like
+
+    Returns
+    -------
+    BooleanArray
+
+    Raises
+    ------
+    TypeError if incompatible types
+    """
+    # the values are handed to the IntegerArray constructor as 'uint8'
+    return BooleanArray(values, dtype='uint8', copy=False)
+
+
+registry.register(BooleanDtype)
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index c126117060c3d..e5c0fd22e4674 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -11,6 +11,7 @@ from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass
 
 from pandas.core.dtypes.common import (
     is_integer, is_scalar, is_float,
+    is_bool_dtype,
     is_float_dtype,
     is_integer_dtype,
     is_object_dtype,
@@ -158,7 +159,8 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
             raise TypeError("{} cannot be converted to an IntegerDtype".format(
                 values.dtype))
 
-    elif not (is_integer_dtype(values) or is_float_dtype(values)):
+    elif not (is_integer_dtype(values) or is_float_dtype(values) or
+              is_bool_dtype(values)):
         raise TypeError("{} cannot be converted to an IntegerDtype".format(
             values.dtype))
 
diff --git a/pandas/tests/extension/base/dtype.py 
b/pandas/tests/extension/base/dtype.py
index 2125458e8a0ba..1b5c6722f51f2 100644
--- a/pandas/tests/extension/base/dtype.py
+++ b/pandas/tests/extension/base/dtype.py
@@ -13,6 +13,7 @@ def test_name(self, dtype):
 
     def test_kind(self, dtype):
         valid = set('biufcmMOSUV')
+
         if dtype.kind is not None:
             assert dtype.kind in valid
 
diff --git a/pandas/tests/extension/boolean/test_boolean.py b/pandas/tests/extension/boolean/test_boolean.py
new file mode 100644
index 0000000000000..8301a6f75ddcc
--- /dev/null
+++ b/pandas/tests/extension/boolean/test_boolean.py
@@ -0,0 +1,639 @@
+import numpy as np
+import pandas as pd
+import pandas.util.testing as tm
+import pytest
+
+from pandas.tests.extension import base
+from pandas.tests.extension.integer.test_integer import BaseInteger
+from pandas.api.types import (
+    is_integer, is_scalar, is_float, is_float_dtype)
+from pandas.core.dtypes.generic import ABCIndexClass
+
+from pandas.core.arrays.boolean import (
+    BooleanDtype, BooleanArray, to_boolean_array)
+
+
+@pytest.fixture
+def dtype(request):
+    """Return a BooleanDtype instance (not the class itself)."""
+    return BooleanDtype()
+
+
+@pytest.fixture
+def data(dtype):
+    return BooleanArray([True, False])
+
+
+@pytest.fixture
+def data_missing(dtype):
+    return BooleanArray([np.nan, True], dtype=dtype)
+
+
+@pytest.fixture
+def data_repeated(data):
+    def gen(count):
+        for _ in range(count):
+            yield data
+    yield gen
+
+
+@pytest.fixture
+def data_for_sorting(dtype):
+    return BooleanArray([True, False])
+
+
+@pytest.fixture
+def data_missing_for_sorting(dtype):
+    return BooleanArray([True, np.nan, False])
+
+
+@pytest.fixture
+def na_cmp():
+    # we are np.nan
+    return lambda x, y: np.isnan(x) and np.isnan(y)
+
+
+@pytest.fixture
+def na_value():
+    return np.nan
+
+
+@pytest.fixture
+def data_for_grouping(dtype):
+    b = False
+    a = True
+    na = np.nan
+    return BooleanArray([b, b, na, na, a, a, b, a])
+
+
+class TestDtype(BaseInteger, base.BaseDtypeTests):
+
+    def test_is_dtype_unboxes_dtype(self):
+        pytest.skip("is_dtype unboxing is not implemented for BooleanDtype")
+
+    def test_array_type_with_arg(self, data, dtype):
+        assert dtype.construct_array_type() is BooleanArray
+
+
+class TestArithmeticOps(BaseInteger, base.BaseArithmeticOpsTests):
+
+    def _check_divmod_op(self, s, op, other, exc=None):
+        super(TestArithmeticOps, self)._check_divmod_op(s, op, other, None)
+
+    def _check_op(self, s, op_name, other, exc=None):
+        # run the op, then hand off to the float or integer result check
+        op = self.get_op_from_name(op_name)
+        result = op(s, other)
+
+        # compute expected
+        mask = s.isna()
+
+        # other array is a BooleanArray
+        if isinstance(other, BooleanArray):
+            omask = getattr(other, 'mask', None)
+            # NOTE(review): this rebinds the NA mask of ``s`` computed above
+            # to ``other.data`` (or ``other`` itself) - confirm intended
+            mask = getattr(other, 'data', other)
+            if omask is not None:
+                mask |= omask
+
+        # float result type or float op
+        if ((is_float_dtype(other) or is_float(other) or
+             op_name in ['__rtruediv__', '__truediv__',
+                         '__rdiv__', '__div__'])):
+            rs = s.astype('float')
+            expected = op(rs, other)
+            self._check_op_float(result, expected, mask, s, op_name, other)
+
+        # integer result type
+        else:
+            rs = pd.Series(s.values._data)
+            expected = op(rs, other)
+            self._check_op_integer(result, expected, mask, s, op_name, other)
+
+    def _check_op_float(self, result, expected, mask, s, op_name, other):
+        # check comparisons that are resulting in float dtypes
+
+        expected[mask] = np.nan
+        self.assert_series_equal(result, expected)
+
+    def _check_op_integer(self, result, expected, mask, s, op_name, other):
+        # check comparisons that are resulting in integer dtypes
+
+        # to compare properly, we convert the expected
+        # to float, mask to nans and convert infs
+        # if we have uints then we process as uints
+        # then convert to float
+        # and we ultimately want to create an IntArray
+        # for comparisons
+
+        fill_value = 0
+
+        # mod/rmod turn floating 0 into NaN while
+        # integer works as expected (no nan)
+        if op_name in ['__mod__', '__rmod__']:
+            if is_scalar(other):
+                if other == 0:
+                    expected[s.values == 0] = 0
+                else:
+                    expected = expected.fillna(0)
+            else:
+                expected[(s.values == 0) &
+                         ((expected == 0) | expected.isna())] = 0
+
+        try:
+            expected[(expected == np.inf) | (expected == -np.inf)] = fill_value
+            original = expected
+            expected = expected.astype(s.dtype)
+
+        except ValueError:
+
+            expected = expected.astype(float)
+            expected[(expected == np.inf) | (expected == -np.inf)] = fill_value
+            original = expected
+            expected = expected.astype(s.dtype)
+
+        expected[mask] = np.nan
+
+        # assert that the expected astype is ok
+        # (skip for unsigned as they have wrap around)
+        if not s.dtype.is_unsigned_integer:
+            original = pd.Series(original)
+
+            # we need to fill with 0's to emulate what an astype('int') does
+            # (truncation) for certain ops
+            if op_name in ['__rtruediv__', '__rdiv__']:
+                mask |= original.isna()
+                original = original.fillna(0).astype('int')
+
+            original = original.astype('float')
+            original[mask] = np.nan
+            self.assert_series_equal(original, expected.astype('float'))
+
+        # assert our expected result
+        self.assert_series_equal(result, expected)
+
+    def test_arith_integer_array(self, data, all_arithmetic_operators):
+        # we operate with a rhs of an integer array
+
+        op = all_arithmetic_operators
+
+        s = pd.Series(data)
+        rhs = pd.Series([1] * len(data), dtype=data.dtype)
+        rhs.iloc[-1] = np.nan
+
+        self._check_op(s, op, rhs)
+
+    def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
+        # scalar
+        op = all_arithmetic_operators
+
+        s = pd.Series(data)
+        self._check_op(s, op, 1, exc=TypeError)
+
+    @pytest.mark.xfail(run=False, reason="_reduce needs implementation")
+    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
+        # frame & scalar
+        op = all_arithmetic_operators
+
+        df = pd.DataFrame({'A': data})
+        self._check_op(df, op, 1, exc=TypeError)
+
+    def test_arith_series_with_array(self, data, all_arithmetic_operators):
+        # ndarray & other series
+        op = all_arithmetic_operators
+
+        s = pd.Series(data)
+        other = np.ones(len(s), dtype=s.dtype.type)
+        self._check_op(s, op, other, exc=TypeError)
+
+    def test_arith_coerce_scalar(self, data, all_arithmetic_operators):
+
+        op = all_arithmetic_operators
+        s = pd.Series(data)
+
+        other = 0.01
+        self._check_op(s, op, other)
+
+    @pytest.mark.parametrize("other", [1., 1.0, np.array(1.), np.array([1.])])
+    def test_arithmetic_conversion(self, all_arithmetic_operators, other):
+        # if we have a float operand we should have a float result
+        # even if that is equal to an integer
+        op = self.get_op_from_name(all_arithmetic_operators)
+
+        s = pd.Series([1, 2, 3], dtype='Int64')
+        result = op(s, other)
+        assert result.dtype is np.dtype('float')
+
+    def test_error(self, data, all_arithmetic_operators):
+        # invalid ops
+
+        op = all_arithmetic_operators
+        s = pd.Series(data)
+        ops = getattr(s, op)
+        opa = getattr(data, op)
+
+        # invalid scalars
+        with pytest.raises(TypeError):
+            ops('foo')
+        with pytest.raises(TypeError):
+            ops(pd.Timestamp('20180101'))
+
+        # invalid array-likes
+        with pytest.raises(TypeError):
+            ops(pd.Series('foo', index=s.index))
+
+        if op != '__rpow__':
+            # TODO(extension)
+            # rpow with a datetimelike coerces the integer array incorrectly
+            with pytest.raises(TypeError):
+                ops(pd.Series(pd.date_range('20180101', periods=len(s))))
+
+        # 2d
+        with pytest.raises(NotImplementedError):
+            opa(pd.DataFrame({'A': s}))
+        with pytest.raises(NotImplementedError):
+            opa(np.arange(len(s)).reshape(-1, len(s)))
+
+
+class TestComparisonOps(BaseInteger, base.BaseComparisonOpsTests):
+
+    def _compare_other(self, s, data, op_name, other):
+        op = self.get_op_from_name(op_name)
+
+        # array
+        result = op(s, other)
+        expected = pd.Series(op(data._data, other))
+
+        # fill the nan locations
+        expected[data._mask] = True if op_name == '__ne__' else False
+
+        tm.assert_series_equal(result, expected)
+
+        # series
+        s = pd.Series(data)
+        result = op(s, other)
+
+        expected = pd.Series(data._data)
+        expected = op(expected, other)
+
+        # fill the nan locations
+        expected[data._mask] = True if op_name == '__ne__' else False
+
+        tm.assert_series_equal(result, expected)
+
+
+class TestInterface(BaseInteger, base.BaseInterfaceTests):
+
+    def test_repr_array(self, data):
+        result = repr(data)
+
+        # not long
+        assert '...' not in result
+
+        assert 'dtype=' in result
+        assert 'BooleanArray' in result
+
+    def test_repr_array_long(self, data):
+        # some arrays may be able to assert a ... in the repr
+        with pd.option_context('display.max_seq_items', 1):
+            result = repr(data)
+
+            assert '...' in result
+            assert 'length' in result
+
+
+class TestConstructors(BaseInteger, base.BaseConstructorsTests):
+
+    def test_from_dtype_from_float(self, data):
+        # construct from our dtype & string dtype
+        dtype = data.dtype
+
+        # from float
+        expected = pd.Series(data)
+        result = pd.Series(np.array(data).astype('float'), dtype=str(dtype))
+        self.assert_series_equal(result, expected)
+
+        # from int / list
+        expected = pd.Series(data)
+        result = pd.Series(np.array(data).tolist(), dtype=str(dtype))
+        self.assert_series_equal(result, expected)
+
+        # from int / array
+        expected = pd.Series(data).dropna().reset_index(drop=True)
+        dropped = np.array(data.dropna()).astype(np.dtype((dtype.type)))
+        result = pd.Series(dropped, dtype=str(dtype))
+        self.assert_series_equal(result, expected)
+
+
+class TestReshaping(BaseInteger, base.BaseReshapingTests):
+
+    def test_concat_mixed_dtypes(self, data):
+        # https://github.com/pandas-dev/pandas/issues/20762
+        df1 = pd.DataFrame({'A': data[:3]})
+        df2 = pd.DataFrame({"A": [1, 2, 3]})
+        df3 = pd.DataFrame({"A": ['a', 'b', 'c']}).astype('category')
+        df4 = pd.DataFrame({"A": pd.SparseArray([1, 2, 3])})
+        dfs = [df1, df2, df3, df4]
+
+        # dataframes
+        result = pd.concat(dfs)
+        expected = pd.concat([x.astype(object) for x in dfs])
+        self.assert_frame_equal(result, expected)
+
+        # series
+        result = pd.concat([x['A'] for x in dfs])
+        expected = pd.concat([x['A'].astype(object) for x in dfs])
+        self.assert_series_equal(result, expected)
+
+        result = pd.concat([df1, df2])
+        expected = pd.concat([df1.astype('object'), df2.astype('object')])
+        self.assert_frame_equal(result, expected)
+
+        # concat of an Integer and Int coerces to object dtype
+        # TODO(jreback) once integrated this would
+        # be a result of Integer
+        result = pd.concat([df1['A'], df2['A']])
+        expected = pd.concat([df1['A'].astype('object'),
+                              df2['A'].astype('object')])
+        self.assert_series_equal(result, expected)
+
+
+class TestGetitem(BaseInteger, base.BaseGetitemTests):
+    pass
+
+
+class TestMissing(BaseInteger, base.BaseMissingTests):
+    pass
+
+
+class TestMethods(BaseInteger, base.BaseMethodsTests):
+
+    @pytest.mark.parametrize('dropna', [True, False])
+    def test_value_counts(self, all_data, dropna):
+        all_data = all_data[:10]
+        if dropna:
+            other = np.array(all_data[~all_data.isna()])
+        else:
+            other = all_data
+
+        result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
+        expected = pd.Series(other).value_counts(
+            dropna=dropna).sort_index()
+        expected.index = expected.index.astype(all_data.dtype)
+
+        self.assert_series_equal(result, expected)
+
+    def test_combine_add(self, data_repeated):
+        # GH 20825
+        orig_data1, orig_data2 = data_repeated(2)
+        s1 = pd.Series(orig_data1)
+        s2 = pd.Series(orig_data2)
+
+        # fundamentally this is not a great operation
+        # as overflow / underflow can easily happen here
+        # e.g. int8 + int8
+        def scalar_add(a, b):
+
+            # TODO; should really be a type specific NA
+            if pd.isna(a) or pd.isna(b):
+                return np.nan
+            if is_integer(a):
+                a = int(a)
+            if is_integer(b):
+                b = int(b)
+            return a + b
+
+        result = s1.combine(s2, scalar_add)
+        expected = pd.Series(
+            orig_data1._from_sequence([scalar_add(a, b) for (a, b) in
+                                       zip(orig_data1,
+                                           orig_data2)]))
+        self.assert_series_equal(result, expected)
+
+        val = s1.iloc[0]
+        result = s1.combine(val, lambda x1, x2: x1 + x2)
+        expected = pd.Series(
+            orig_data1._from_sequence([a + val for a in list(orig_data1)]))
+        self.assert_series_equal(result, expected)
+
+
+class TestCasting(BaseInteger, base.BaseCastingTests):
+
+    @pytest.mark.parametrize('dropna', [True, False])
+    def test_construct_index(self, all_data, dropna):
+        # ensure that we do not coerce to Float64Index, rather
+        # keep as Index
+
+        all_data = all_data[:10]
+        if dropna:
+            other = np.array(all_data[~all_data.isna()])
+        else:
+            other = all_data
+
+        result = pd.Index(BooleanArray(other,
+                                       dtype=all_data.dtype))
+        expected = pd.Index(other, dtype=object)
+
+        self.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize('dropna', [True, False])
+    def test_astype_index(self, all_data, dropna):
+        # as an int/uint index to Index
+
+        all_data = all_data[:10]
+        if dropna:
+            other = all_data[~all_data.isna()]
+        else:
+            other = all_data
+
+        dtype = all_data.dtype
+        idx = pd.Index(np.array(other))
+        assert isinstance(idx, ABCIndexClass)
+
+        result = idx.astype(dtype)
+        expected = idx.astype(object).astype(dtype)
+        self.assert_index_equal(result, expected)
+
+    def test_astype(self, all_data):
+        all_data = all_data[:10]
+
+        ints = all_data[~all_data.isna()]
+        mixed = all_data
+        dtype = BooleanDtype()
+
+        # coerce to same type - ints
+        s = pd.Series(ints)
+        result = s.astype(all_data.dtype)
+        expected = pd.Series(ints)
+        self.assert_series_equal(result, expected)
+
+        # coerce to same other - ints
+        s = pd.Series(ints)
+        result = s.astype(dtype)
+        expected = pd.Series(ints, dtype=dtype)
+        self.assert_series_equal(result, expected)
+
+        # coerce to same numpy_dtype - ints
+        s = pd.Series(ints)
+        result = s.astype(all_data.dtype.numpy_dtype)
+        expected = pd.Series(ints._data.astype(
+            all_data.dtype.numpy_dtype))
+        tm.assert_series_equal(result, expected)
+
+        # coerce to same type - mixed
+        s = pd.Series(mixed)
+        result = s.astype(all_data.dtype)
+        expected = pd.Series(mixed)
+        self.assert_series_equal(result, expected)
+
+        # coerce to same other - mixed
+        s = pd.Series(mixed)
+        result = s.astype(dtype)
+        expected = pd.Series(mixed, dtype=dtype)
+        self.assert_series_equal(result, expected)
+
+        # coerce to same numpy_dtype - mixed
+        s = pd.Series(mixed)
+        with pytest.raises(ValueError):
+            s.astype(all_data.dtype.numpy_dtype)
+
+        # coerce to object
+        s = pd.Series(mixed)
+        result = s.astype('object')
+        expected = pd.Series(np.asarray(mixed))
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize('dtype', [BooleanDtype(), 'EABool'])
+    def test_astype_specific_casting(self, dtype):
+        s = pd.Series([1, 2, 3], dtype='Int64')
+        result = s.astype(dtype)
+        expected = pd.Series([True, True, True], dtype='EABool')
+        self.assert_series_equal(result, expected)
+
+        s = pd.Series([1, 2, 3, None], dtype='Int64')
+        result = s.astype(dtype)
+        expected = pd.Series([True, True, True, None], dtype='EABool')
+        self.assert_series_equal(result, expected)
+
+    def test_construct_cast_invalid(self, dtype):
+
+        msg = "cannot safely"
+        arr = [1.2, 2.3, 3.7]
+        with tm.assert_raises_regex(TypeError, msg):
+            BooleanArray(arr, dtype=dtype)
+
+        with tm.assert_raises_regex(TypeError, msg):
+            pd.Series(arr).astype(dtype)
+
+        arr = [1.2, 2.3, 3.7, np.nan]
+        with tm.assert_raises_regex(TypeError, msg):
+            BooleanArray(arr, dtype=dtype)
+
+        with tm.assert_raises_regex(TypeError, msg):
+            pd.Series(arr).astype(dtype)
+
+
+class TestGroupby(BaseInteger, base.BaseGroupbyTests):
+
+    @pytest.mark.xfail(reason="groupby not working", strict=True)
+    def test_groupby_extension_no_sort(self, data_for_grouping):
+        super(TestGroupby, self).test_groupby_extension_no_sort(
+            data_for_grouping)
+
+    @pytest.mark.parametrize('as_index', [
+        pytest.param(True,
+                     marks=pytest.mark.xfail(reason="groupby not working",
+                                             strict=True)),
+        False
+    ])
+    def test_groupby_extension_agg(self, as_index, data_for_grouping):
+        super(TestGroupby, self).test_groupby_extension_agg(
+            as_index, data_for_grouping)
+
+
+def test_frame_repr(data_missing):
+
+    df = pd.DataFrame({'A': data_missing})
+    result = repr(df)
+    expected = '     A\n0  NaN\n1    1'
+    assert result == expected
+
+
+def test_conversions(data_missing):
+
+    # astype to object series
+    df = pd.DataFrame({'A': data_missing})
+    result = df['A'].astype('object')
+    expected = pd.Series(np.array([np.nan, 1], dtype=object), name='A')
+    tm.assert_series_equal(result, expected)
+
+    # convert to object ndarray
+    # we assert that we are exactly equal
+    # including type conversions of scalars
+    result = df['A'].astype('object').values
+    expected = np.array([np.nan, 1], dtype=object)
+    tm.assert_numpy_array_equal(result, expected)
+
+    for r, e in zip(result, expected):
+        if pd.isnull(r):
+            assert pd.isnull(e)
+        elif is_integer(r):
+            # PY2 can be int or long
+            assert r == e
+            assert is_integer(e)
+        else:
+            assert r == e
+            assert type(r) == type(e)
+
+
+@pytest.mark.parametrize(
+    'values',
+    [
+        ['foo', 'bar'],
+        'foo',
+        1,
+        1.0,
+        pd.date_range('20130101', periods=2),
+        np.array(['foo'])])
+def test_to_boolean_array_error(values):
+    # error in converting existing arrays to BooleanArrays
+    with pytest.raises(TypeError):
+        to_boolean_array(values)
+
+
+@pytest.mark.parametrize(
+    'values, to_dtype, result_dtype',
+    [
+        (np.array([1, np.nan]), 'EABool', BooleanDtype)])
+def test_to_boolean_array(values, to_dtype, result_dtype):
+    # convert existing arrays to BooleanArrays
+    result = to_boolean_array(values)
+    expected = BooleanArray(values, dtype=result_dtype())
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_cross_type_arithmetic():
+
+    df = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'),
+                       'B': pd.Series([1, np.nan, 3], dtype='UInt8'),
+                       'C': [1, 2, 3]})
+
+    result = df.A + df.C
+    expected = pd.Series([2, 4, np.nan], dtype='Int64')
+    tm.assert_series_equal(result, expected)
+
+    result = (df.A + df.C) * 3 == 12
+    expected = pd.Series([False, True, False])
+    tm.assert_series_equal(result, expected)
+
+    result = df.A + df.B
+    expected = pd.Series([2, np.nan, np.nan], dtype='Int64')
+    tm.assert_series_equal(result, expected)
+
+
+# TODO(jreback) - these need testing / are broken
+
+# shift
+
+# set_index (destroys type)