From f65a70dd6474e1510100a6f3f239d89a4095af01 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 5 Dec 2019 09:03:52 +0100 Subject: [PATCH] BUG/TST: fix arrow roundtrip / parquet tests for recent pyarrow --- pandas/core/arrays/boolean.py | 18 ++++++++++++++++++ pandas/core/arrays/string_.py | 7 +++++-- pandas/tests/arrays/string_/test_string.py | 4 ++-- pandas/tests/arrays/test_boolean.py | 21 +++++++++++++++++++-- pandas/tests/io/test_parquet.py | 1 - 5 files changed, 44 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index aec3397bddd16..05f68356163dc 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -101,6 +101,24 @@ def __repr__(self) -> str: def _is_boolean(self) -> bool: return True + def __from_arrow__(self, array): + """Construct BooleanArray from passed pyarrow Array/ChunkedArray""" + import pyarrow + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + # TODO should optimize this without going through object array + bool_arr = BooleanArray._from_sequence(np.array(arr)) + results.append(bool_arr) + + return BooleanArray._concat_same_type(results) + def coerce_to_array(values, mask=None, copy: bool = False): """ diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f6af05ab4d9e7..c2f797cb4b6c2 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -86,7 +86,7 @@ def __from_arrow__(self, array): results = [] for arr in chunks: - # using _from_sequence to ensure None is convered to np.nan + # using _from_sequence to ensure None is converted to NA str_arr = StringArray._from_sequence(np.array(arr)) results.append(str_arr) @@ -197,7 +197,10 @@ def __arrow_array__(self, type=None): if type is None: type = pa.string() - return pa.array(self._ndarray, type=type, from_pandas=True) + + values = 
self._ndarray.copy() + values[self.isna()] = None + return pa.array(values, type=type, from_pandas=True) def _values_for_factorize(self): arr = self._ndarray.copy() diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 0dfd75a2042b0..aaa6d1cacd0a7 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -204,5 +204,5 @@ def test_arrow_roundtrip(): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) tm.assert_frame_equal(result, df) - # ensure the missing value is represented by NaN and not None - assert np.isnan(result.loc[2, "a"]) + # ensure the missing value is represented by NA and not np.nan or None + assert result.loc[2, "a"] is pd.NA diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index a13bb8edc8e48..1e331429975a5 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -546,12 +546,29 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): # result = arr[mask] -@pytest.mark.skip(reason="broken test") @td.skip_if_no("pyarrow", min_version="0.15.0") def test_arrow_array(data): # protocol added in 0.15.0 import pyarrow as pa arr = pa.array(data) - expected = pa.array(np.array(data, dtype=object), type=pa.bool_(), from_pandas=True) + + # TODO use to_numpy(na_value=None) here + data_object = np.array(data, dtype=object) + data_object[data.isna()] = None + expected = pa.array(data_object, type=pa.bool_(), from_pandas=True) assert arr.equals(expected) + + +@td.skip_if_no("pyarrow", min_version="0.15.1.dev") +def test_arrow_roundtrip(): + # roundtrip possible from arrow 1.0.0 + import pyarrow as pa + + data = pd.array([True, False, None], dtype="boolean") + df = pd.DataFrame({"a": data}) + table = pa.table(df) + assert table.field("a").type == "bool" + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.BooleanDtype) + 
tm.assert_frame_equal(result, df) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index a98c93c250070..3e687d185df84 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -504,7 +504,6 @@ def test_empty_dataframe(self, pa): df = pd.DataFrame() check_round_trip(df, pa) - @pytest.mark.skip(reason="broken test") @td.skip_if_no("pyarrow", min_version="0.15.0") def test_additional_extension_arrays(self, pa): # test additional ExtensionArrays that are supported through the