From 411664197a27206c4a73b9b3cf211cd2078b5d95 Mon Sep 17 00:00:00 2001 From: Matthew Barber Date: Tue, 2 Aug 2022 10:27:50 +0100 Subject: [PATCH 01/13] Specify supported dtypes as opposed to excluded ones --- tests/conftest.py | 2 ++ tests/strategies.py | 5 ++--- tests/test_column_object.py | 20 ++++++++++++-------- tests/test_from_dataframe.py | 4 ++-- tests/wrappers.py | 28 ++++++++++++++-------------- 5 files changed, 32 insertions(+), 27 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 9e89fe7..844e889 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -58,6 +58,8 @@ def pytest_configure(config): "test_column_object.py::test_size[vaex]", # https://github.com/rapidsai/cudf/issues/11389 "test_column_object.py::test_dtype[cudf]", + # recent spec change https://github.com/data-apis/dataframe-api/pull/74 + "test_column_object.py::test_describe_categorical", # Raises RuntimeError, which is technically correct, but the spec will # require TypeError soon. # See https://github.com/data-apis/dataframe-api/pull/74 diff --git a/tests/strategies.py b/tests/strategies.py index 1125574..c02bd2a 100644 --- a/tests/strategies.py +++ b/tests/strategies.py @@ -77,7 +77,7 @@ def __repr__(self) -> str: def mock_dataframes( draw, *, - exclude_dtypes: Collection[NominalDtype] = [], + dtypes: Collection[NominalDtype] = set(NominalDtype), allow_zero_cols: bool = True, allow_zero_rows: bool = True, ncols: Optional[int] = None, @@ -96,9 +96,8 @@ def mock_dataframes( min_nrows = 0 if allow_zero_rows else 1 nrows = draw(st.integers(min_nrows, 5)) name_to_column = {} - valid_dtypes = [e for e in NominalDtype if e not in exclude_dtypes] for colname in colnames: - nominal_dtype = draw(st.sampled_from(valid_dtypes)) + nominal_dtype = draw(st.sampled_from(list(dtypes))) dtype = nominal_dtype.value elements = None if nominal_dtype == NominalDtype.CATEGORY: diff --git a/tests/test_column_object.py b/tests/test_column_object.py index 94439df..110506a 100644 --- a/tests/test_column_object.py +++ b/tests/test_column_object.py @@ -15,10 +15,10 @@ def draw_column_and_mock( - libinfo: LibraryInfo, data: st.DataObject + libinfo: LibraryInfo, data: st.DataObject, **kwargs ) -> Tuple[Column, MockColumn]: mock_df = data.draw( - mock_dataframes(**{**libinfo.mock_dataframes_kwargs, "ncols": 1}), + mock_dataframes(**{**libinfo.mock_dataframes_kwargs, **kwargs, "ncols": 1}), label="mock_df", ) df = libinfo.mock_to_interchange(mock_df) @@ -106,18 +106,22 @@ def test_dtype(libinfo: LibraryInfo, data: st.DataObject): @given(data=st.data()) def test_describe_categorical(libinfo: LibraryInfo, data: st.DataObject): - # TODO: bias generation for categorical columns - col, mock_col = draw_column_and_mock(libinfo, data) + if NominalDtype.CATEGORY not in libinfo.supported_dtypes: + pytest.skip(f"categorical columns not generated for {libinfo.name}") + if data.draw(st.booleans()): + dtypes = {NominalDtype.CATEGORY} + else: + dtypes = libinfo.supported_dtypes # TODO: removing categorical here is flaky? + col, mock_col = draw_column_and_mock(libinfo, data, dtypes=dtypes) if mock_col.nominal_dtype == NominalDtype.CATEGORY: catinfo = col.describe_categorical assert isinstance(catinfo, dict) - for key in ["is_ordered", "is_dictionary", "mapping"]: + for key in ["is_ordered", "is_dictionary", "categories"]: assert key in catinfo.keys() assert isinstance(catinfo["is_ordered"], bool) assert isinstance(catinfo["is_dictionary"], bool) - mapping = catinfo["mapping"] - if mapping is not None: - assert isinstance(mapping, dict) + if not catinfo["is_dictionary"]: + assert catinfo["categories"] is None else: with pytest.raises(TypeError): col.describe_categorical diff --git a/tests/test_from_dataframe.py b/tests/test_from_dataframe.py index bd9766a..3dcbc9e 100644 --- a/tests/test_from_dataframe.py +++ b/tests/test_from_dataframe.py @@ -18,12 +18,12 @@ def test_from_dataframe_roundtrip( Round trip of dataframe interchange results in a dataframe identical to the original dataframe. """ - exclude_dtypes = set(orig_libinfo.exclude_dtypes) | set(dest_libinfo.exclude_dtypes) + dtypes = set(orig_libinfo.supported_dtypes) & set(dest_libinfo.supported_dtypes) allow_zero_cols = orig_libinfo.allow_zero_cols and dest_libinfo.allow_zero_cols allow_zero_rows = orig_libinfo.allow_zero_rows and dest_libinfo.allow_zero_rows mock_df = data.draw( mock_dataframes( - exclude_dtypes=exclude_dtypes, + dtypes=dtypes, allow_zero_cols=allow_zero_cols, allow_zero_rows=allow_zero_rows, ), diff --git a/tests/wrappers.py b/tests/wrappers.py index 7f0bb44..e0ceb53 100644 --- a/tests/wrappers.py +++ b/tests/wrappers.py @@ -1,6 +1,6 @@ import re from copy import copy -from typing import Any, Callable, Dict, List, NamedTuple, Tuple +from typing import Any, Callable, Dict, List, NamedTuple, Set, Tuple import numpy as np import pytest @@ -19,7 +19,7 @@ class LibraryInfo(NamedTuple): mock_to_toplevel: Callable[[MockDataFrame], TopLevelDataFrame] from_dataframe: Callable[[TopLevelDataFrame], DataFrame] frame_equal: Callable[[TopLevelDataFrame, DataFrame], bool] - exclude_dtypes: List[NominalDtype] = [] + supported_dtypes: Set[NominalDtype] = set(NominalDtype) allow_zero_cols: bool = True allow_zero_rows: bool = True @@ -30,7 +30,7 @@ def mock_to_interchange(self, mock_dataframe: MockDataFrame) -> DataFrame: @property def mock_dataframes_kwargs(self) -> Dict[str, Any]: return { - "exclude_dtypes": self.exclude_dtypes, + "dtypes": self.supported_dtypes, "allow_zero_cols": self.allow_zero_cols, "allow_zero_rows": self.allow_zero_rows, } @@ -80,7 +80,7 @@ def pandas_mock_to_toplevel(mock_df: MockDataFrame) -> pd.DataFrame: mock_to_toplevel=pandas_mock_to_toplevel, from_dataframe=pandas_from_dataframe, frame_equal=lambda df1, df2: df1.equals(df2), - exclude_dtypes=[NominalDtype.DATETIME64NS], + supported_dtypes=set(NominalDtype) ^ {NominalDtype.DATETIME64NS}, ) libinfo_params.append(pytest.param(pandas_libinfo, id=pandas_libinfo.name)) @@ -137,9 +137,7 @@ def vaex_frame_equal(df1, df2) -> bool: mock_to_toplevel=vaex_mock_to_toplevel, from_dataframe=vaex_from_dataframe, frame_equal=vaex_frame_equal, - exclude_dtypes=[ - NominalDtype.DATETIME64NS, - ], + supported_dtypes=set(NominalDtype) ^ {NominalDtype.DATETIME64NS}, # https://github.com/vaexio/vaex/issues/2094 allow_zero_cols=False, allow_zero_rows=False, @@ -218,13 +216,14 @@ def modin_frame_equal(df1: mpd.DataFrame, df2: mpd.DataFrame) -> bool: mock_to_toplevel=modin_mock_to_toplevel, from_dataframe=modin_from_dataframe, frame_equal=modin_frame_equal, - # https://github.com/modin-project/modin/issues/4654 - # https://github.com/modin-project/modin/issues/4652 - exclude_dtypes=[ - NominalDtype.UTF8, + supported_dtypes=set(NominalDtype) + ^ { NominalDtype.DATETIME64NS, + # https://github.com/modin-project/modin/issues/4654 + NominalDtype.UTF8, + # https://github.com/modin-project/modin/issues/4652 NominalDtype.CATEGORY, - ], + }, # https://github.com/modin-project/modin/issues/4643 allow_zero_rows=False, ) @@ -294,11 +293,12 @@ def cudf_mock_to_toplevel(mock_df: MockDataFrame) -> cudf.DataFrame: mock_to_toplevel=cudf_mock_to_toplevel, from_dataframe=cudf_from_dataframe, frame_equal=lambda df1, df2: df1.equals(df2), # NaNs considered equal - exclude_dtypes=[ + supported_dtypes=set(NominalDtype) + ^ { NominalDtype.DATETIME64NS, # https://github.com/rapidsai/cudf/issues/11308 NominalDtype.UTF8, - ], + }, ) libinfo_params.append(pytest.param(cudf_libinfo, id=cudf_libinfo.name)) From df06bcc23d7210d33b178feb2b54dc818a5393b6 Mon Sep 17 00:00:00 2001 From: Matthew Barber Date: Mon, 8 Aug 2022 11:33:32 +0100 Subject: [PATCH 02/13] Update vendored spec --- tests/api.py | 214 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 149 insertions(+), 65 deletions(-) diff --git a/tests/api.py b/tests/api.py index a407e9f..2860f35 100644 --- a/tests/api.py +++ b/tests/api.py @@ -1,15 +1,107 @@ from __future__ import annotations import enum -from typing import Any, Iterable, Sequence +from abc import ABC, abstractmethod +from typing import Any, Iterable, Sequence, TypedDict -__all__ = ["Buffer", "Column", "DataFrame"] +class DlpackDeviceType(enum.IntEnum): + """Integer enum for device type codes matching DLPack.""" -# TODO: load classes at runtime from submodule of df protocol repo + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 -class Buffer: +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: tuple[Buffer, Any] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + validity: tuple[Buffer, Any] | None + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: tuple[Buffer, Any] | None + + +class CategoricalDescription(TypedDict): + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects exists + is_dictionary: bool + # Python-level only (e.g. ``{int: str}``). + # None if not a dictionary-style categorical. + categories: Column | None + + +class Buffer(ABC): """ Data in the buffer is guaranteed to be contiguous in memory. @@ -25,17 +117,20 @@ class Buffer: """ @property + @abstractmethod def bufsize(self) -> int: """ Buffer size in bytes. """ @property + @abstractmethod def ptr(self) -> int: """ Pointer to start of the buffer as an integer. """ + @abstractmethod def __dlpack__(self): """ Produce DLPack capsule (see array API standard). @@ -50,26 +145,16 @@ def __dlpack__(self): """ raise NotImplementedError("__dlpack__") - def __dlpack_device__(self) -> tuple[enum.IntEnum, int]: + @abstractmethod + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: """ Device type and device ID for where the data in the buffer resides. - - Uses device type codes matching DLPack. Enum members are:: - - - CPU = 1 - - CUDA = 2 - - CPU_PINNED = 3 - - OPENCL = 4 - - VULKAN = 7 - - METAL = 8 - - VPI = 9 - - ROCM = 10 - + Uses device type codes matching DLPack. Note: must be implemented even if ``__dlpack__`` is not. """ -class Column: +class Column(ABC): """ A column object, with only the methods and properties required by the interchange protocol defined. @@ -111,19 +196,22 @@ class Column: Note: this Column object can only be produced by ``__dataframe__``, so doesn't need its own version or ``__column__`` protocol. - """ - @property - def size(self) -> int | None: + @abstractmethod + def size(self) -> int: """ Size of the column, in elements. Corresponds to DataFrame.num_rows() if column is a single chunk; equal to size of this current chunk otherwise. + + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. """ @property + @abstractmethod def offset(self) -> int: """ Offset of first element. @@ -134,27 +222,17 @@ def offset(self) -> int: """ @property - def dtype(self) -> tuple[enum.IntEnum, int, str, str]: + @abstractmethod + def dtype(self) -> tuple[DtypeKind, int, str, str]: """ Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. - Kind : - - - INT = 0 - - UINT = 1 - - FLOAT = 2 - - BOOL = 20 - - STRING = 21 # UTF-8 - - DATETIME = 22 - - CATEGORICAL = 23 - Bit-width : the number of bits as an integer Format string : data type description format string in Apache Arrow C Data Interface format. Endianness : current only native endianness (``=``) is supported Notes: - - Kind specifiers are aligned with DLPack where possible (hence the jump to 20, leave enough room for future extension) - Masks must be specified as boolean with either bit width 1 (for bit @@ -174,49 +252,42 @@ def dtype(self) -> tuple[enum.IntEnum, int, str, str]: and nested (list, struct, map, union) dtypes. """ - # TODO: What should the dict key be? @property - def describe_categorical(self) -> dict[str, tuple[bool, bool, dict | None]]: + @abstractmethod + def describe_categorical(self) -> CategoricalDescription: """ If the dtype is categorical, there are two options: - - There are only values in the data buffer. - - There is a separate dictionary-style encoding for categorical values. + - There is a separate non-categorical Column encoding categorical values. - Raises RuntimeError if the dtype is not categorical - - Content of returned dict: + Raises TypeError if the dtype is not categorical + Returns the dictionary with description on how to interpret the data buffer: - "is_ordered" : bool, whether the ordering of dictionary indices is semantically meaningful. - - "is_dictionary" : bool, whether a dictionary-style mapping of + - "is_dictionary" : bool, whether a mapping of categorical values to other objects exists - - "mapping" : dict, Python-level only (e.g. ``{int: str}``). - None if not a dictionary-style categorical. + - "categories" : Column representing the (implicit) mapping of indices to + category values (e.g. an array of cat1, cat2, ...). + None if not a dictionary-style categorical. TBD: are there any other in-memory representations that are needed? """ @property - def describe_null(self) -> tuple[int, Any]: + @abstractmethod + def describe_null(self) -> tuple[ColumnNullType, Any]: """ Return the missing value (or "null") representation the column dtype uses, as a tuple ``(kind, value)``. - Kind: - - - 0 : non-nullable - - 1 : NaN/NaT - - 2 : sentinel value - - 3 : bit mask - - 4 : byte mask - Value : if kind is "sentinel value", the actual value. If kind is a bit mask or a byte mask, the value (0 or 1) indicating a missing value. None otherwise. """ @property + @abstractmethod def null_count(self) -> int | None: """ Number of null elements, if known. @@ -225,16 +296,19 @@ def null_count(self) -> int | None: """ @property + @abstractmethod def metadata(self) -> dict[str, Any]: """ The metadata for the column. See `DataFrame.metadata` for more details. """ + @abstractmethod def num_chunks(self) -> int: """ Return the number of chunks the column consists of. """ + @abstractmethod def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]: """ Return an iterator yielding the chunks. @@ -242,7 +316,8 @@ def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]: See `DataFrame.get_chunks` for details on ``n_chunks``. """ - def get_buffers(self) -> dict[str, Any]: + @abstractmethod + def get_buffers(self) -> ColumnBuffers: """ Return a dictionary containing the underlying buffers. @@ -270,10 +345,10 @@ def get_buffers(self) -> dict[str, Any]: # Children columns underneath the column, each object in this iterator # must adhere to the column specification. # """ -# +# pass -class DataFrame: +class DataFrame(ABC): """ A data frame class, with only the methods required by the interchange protocol defined. @@ -288,28 +363,27 @@ class DataFrame: to the dataframe interchange protocol specification. """ - def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True) -> dict: + version = 0 # version of the protocol + + @abstractmethod + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> DataFrame: """ - Produces a dictionary object following the dataframe protocol specification. + Construct a new exchange object, potentially changing the parameters. ``nan_as_null`` is a keyword intended for the consumer to tell the - producer to overwrite null values in the data with ``NaN`` (or ``NaT``). + producer to overwrite null values in the data with ``NaN``. It is intended for cases where the consumer does not support the bit mask or byte mask that is the producer's native representation. - ``allow_copy`` is a keyword that defines whether or not the library is allowed to make a copy of the data. For example, copying data would be necessary if a library supports strided buffers, given that this protocol specifies contiguous buffers. """ - self._nan_as_null = nan_as_null - self._allow_zero_zopy = allow_copy - return { - "dataframe": self, # DataFrame object adhering to the protocol - "version": 0, # Version number of the protocol - } @property + @abstractmethod def metadata(self) -> dict[str, Any]: """ The metadata for the data frame, as a dictionary with string keys. The @@ -321,11 +395,13 @@ def metadata(self) -> dict[str, Any]: followed by a period and the desired name, e.g, ``pandas.indexcol``. """ + @abstractmethod def num_columns(self) -> int: """ Return the number of columns in the DataFrame. """ + @abstractmethod def num_rows(self) -> int | None: # TODO: not happy with Optional, but need to flag it may be expensive # why include it if it may be None - what do we expect consumers @@ -334,41 +410,49 @@ def num_rows(self) -> int | None: Return the number of rows in the DataFrame, if available. """ + @abstractmethod def num_chunks(self) -> int: """ Return the number of chunks the DataFrame consists of. """ + @abstractmethod def column_names(self) -> Iterable[str]: """ Return an iterator yielding the column names. """ + @abstractmethod def get_column(self, i: int) -> Column: """ Return the column at the indicated position. """ + @abstractmethod def get_column_by_name(self, name: str) -> Column: """ Return the column whose name is the indicated name. """ + @abstractmethod def get_columns(self) -> Iterable[Column]: """ Return an iterator yielding the columns. """ + @abstractmethod def select_columns(self, indices: Sequence[int]) -> DataFrame: """ Create a new DataFrame by selecting a subset of columns by index. """ + @abstractmethod def select_columns_by_name(self, names: Sequence[str]) -> DataFrame: """ Create a new DataFrame by selecting a subset of columns by name. """ + @abstractmethod def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]: """ Return an iterator yielding the chunks. From 7593804ce6512575bb29d0fb6778006a4af95229 Mon Sep 17 00:00:00 2001 From: Matthew Barber Date: Mon, 8 Aug 2022 11:33:57 +0100 Subject: [PATCH 03/13] Update `test_size` --- tests/test_column_object.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_column_object.py b/tests/test_column_object.py index 110506a..9411d1b 100644 --- a/tests/test_column_object.py +++ b/tests/test_column_object.py @@ -30,10 +30,9 @@ def draw_column_and_mock( @given(data=st.data()) def test_size(libinfo: LibraryInfo, data: st.DataObject): col, mock_col = draw_column_and_mock(libinfo, data) - size = col.size - if size is not None: - assert isinstance(size, int) - assert size == mock_col.array.size + size = col.size() + assert isinstance(size, int) + assert size == mock_col.array.size @given(data=st.data()) From 557e2d0b04e218788bc0457ad44460f9b18744ee Mon Sep 17 00:00:00 2001 From: Matthew Barber Date: Mon, 8 Aug 2022 11:52:14 +0100 Subject: [PATCH 04/13] `MockDataFrame` doesn't mirror nrow/ncol semantics from spec --- tests/strategies.py | 16 +++++----------- tests/test_dataframe_object.py | 4 ++-- tests/wrappers.py | 10 +++++----- 3 files changed, 12 insertions(+), 18 deletions(-) diff --git a/tests/strategies.py b/tests/strategies.py index c02bd2a..74d4da6 100644 --- a/tests/strategies.py +++ b/tests/strategies.py @@ -34,16 +34,16 @@ class MockColumn(NamedTuple): class MockDataFrame(Mapping): def __init__(self, name_to_column: Dict[str, MockColumn]): if len(name_to_column) == 0: - self._ncols = 0 - self._nrows = 0 + self.ncols = 0 + self.nrows = 0 else: arrays = [x for x, _ in name_to_column.values()] - self._ncols = len(arrays) - self._nrows = arrays[0].size + self.ncols = len(arrays) + self.nrows = arrays[0].size for x in arrays: # sanity checks assert x.ndim == 1 - assert x.size == self._nrows + assert x.size == self.nrows self._name_to_column = name_to_column def __getitem__(self, key: str): @@ -55,12 +55,6 @@ def __iter__(self): def __len__(self): return len(self._name_to_column) - def num_columns(self) -> int: - return self._ncols - - def num_rows(self) -> int: - return self._nrows - def __repr__(self) -> str: col_reprs = [] for name, col in self.items(): diff --git a/tests/test_dataframe_object.py b/tests/test_dataframe_object.py index 4c3ecb0..a18ccb1 100644 --- a/tests/test_dataframe_object.py +++ b/tests/test_dataframe_object.py @@ -27,7 +27,7 @@ def test_num_columns(libinfo: LibraryInfo, data: st.DataObject): df = libinfo.mock_to_interchange(mock_df) out = df.num_columns() assert isinstance(out, int) - assert out == mock_df.num_columns() + assert out == mock_df.ncols @given(data=st.data()) @@ -40,7 +40,7 @@ def test_num_rows(libinfo: LibraryInfo, data: st.DataObject): out = df.num_rows() assume(out is not None) assert isinstance(out, int) - assert out == mock_df.num_rows() + assert out == mock_df.nrows @given(data=st.data()) diff --git a/tests/wrappers.py b/tests/wrappers.py index e0ceb53..f6e5c29 100644 --- a/tests/wrappers.py +++ b/tests/wrappers.py @@ -62,7 +62,7 @@ def __repr__(self) -> str: else: def pandas_mock_to_toplevel(mock_df: MockDataFrame) -> pd.DataFrame: - if mock_df.num_columns() == 0: + if mock_df.ncols == 0: return pd.DataFrame() serieses = [] for name, (array, nominal_dtype) in mock_df.items(): @@ -96,7 +96,7 @@ def pandas_mock_to_toplevel(mock_df: MockDataFrame) -> pd.DataFrame: else: def vaex_mock_to_toplevel(mock_df: MockDataFrame) -> TopLevelDataFrame: - if mock_df.num_columns() == 0 or mock_df.num_rows() == 0: + if mock_df.ncols == 0 or mock_df.nrows == 0: raise ValueError(f"{mock_df=} not supported by vaex") items: List[Tuple[str, np.ndarray]] = [] for name, (array, _) in mock_df.items(): @@ -178,9 +178,9 @@ def vaex_frame_equal(df1, df2) -> bool: else: def modin_mock_to_toplevel(mock_df: MockDataFrame) -> mpd.DataFrame: - if mock_df.num_columns() == 0: + if mock_df.ncols == 0: return mpd.DataFrame() - if mock_df.num_rows() == 0: + if mock_df.nrows == 0: raise ValueError(f"{mock_df=} not supported by modin") serieses: List[mpd.Series] = [] for name, (array, nominal_dtype) in mock_df.items(): @@ -270,7 +270,7 @@ def register_extension_type(*a, **kw): else: def cudf_mock_to_toplevel(mock_df: MockDataFrame) -> cudf.DataFrame: - if mock_df.num_columns() == 0: + if mock_df.ncols == 0: return cudf.DataFrame() serieses = [] for name, (array, nominal_dtype) in mock_df.items(): From 87c5cc08f38a388e92573664afb9473913affda2 Mon Sep 17 00:00:00 2001 From: Matthew Barber Date: Mon, 8 Aug 2022 13:21:58 +0100 Subject: [PATCH 05/13] Improve ergonomics of testing columns and buffers --- tests/strategies.py | 73 +++++++++++++++++++++++-------------- tests/test_buffer_object.py | 22 ++--------- tests/test_column_object.py | 48 +++++++----------------- tests/test_meta.py | 11 +++++- tests/test_signatures.py | 15 ++------ tests/wrappers.py | 39 +++++++++++++++++--- 6 files changed, 110 insertions(+), 98 deletions(-) diff --git a/tests/strategies.py b/tests/strategies.py index 74d4da6..61b1b86 100644 --- a/tests/strategies.py +++ b/tests/strategies.py @@ -1,12 +1,18 @@ from collections.abc import Mapping from enum import Enum -from typing import Collection, Dict, NamedTuple, Optional +from typing import Collection, Dict, NamedTuple import numpy as np from hypothesis import strategies as st from hypothesis.extra import numpy as nps -__all__ = ["mock_dataframes", "MockDataFrame", "MockColumn", "NominalDtype"] +__all__ = [ + "mock_dataframes", + "mock_single_col_dataframes", + "MockDataFrame", + "MockColumn", + "NominalDtype", +] class NominalDtype(Enum): @@ -62,45 +68,58 @@ def __repr__(self) -> str: return "MockDataFrame({" + ", ".join(col_reprs) + "})" -utf8_strat = st.from_regex(r"[a-zA-Z\_]{1,8}", fullmatch=True).filter( - lambda b: b[-1:] != "\0" -) +def utf8_strings() -> st.SearchStrategy[str]: + return st.from_regex(r"[a-zA-Z\_]{1,8}", fullmatch=True).filter( + lambda b: b[-1:] != "\0" + ) + + +def mock_columns( + nominal_dtype: NominalDtype, size: int +) -> st.SearchStrategy[MockColumn]: + dtype = nominal_dtype.value + elements = None + if nominal_dtype == NominalDtype.CATEGORY: + dtype = np.int8 + elements = st.integers(0, 15) + elif nominal_dtype == NominalDtype.UTF8: + # nps.arrays(dtype="U8") doesn't skip surrogates by default + elements = utf8_strings() + x_strat = nps.arrays(dtype=dtype, shape=size, elements=elements) + return x_strat.map(lambda x: MockColumn(x, nominal_dtype)) @st.composite def mock_dataframes( - draw, + draw: st.DrawFn, *, dtypes: Collection[NominalDtype] = set(NominalDtype), allow_zero_cols: bool = True, allow_zero_rows: bool = True, - ncols: Optional[int] = None, ) -> MockDataFrame: - if ncols is None: - min_ncols = 0 if allow_zero_cols else 1 - max_ncols = 5 - else: - if ncols == 0 and not allow_zero_cols: - raise ValueError(f"ncols cannot be 0 when {allow_zero_cols=}") - min_ncols = ncols - max_ncols = ncols + min_ncols = 0 if allow_zero_cols else 1 colnames = draw( - st.lists(utf8_strat, min_size=min_ncols, max_size=max_ncols, unique=True) + st.lists(utf8_strings(), min_size=min_ncols, max_size=5, unique=True) ) min_nrows = 0 if allow_zero_rows else 1 nrows = draw(st.integers(min_nrows, 5)) name_to_column = {} for colname in colnames: nominal_dtype = draw(st.sampled_from(list(dtypes))) - dtype = nominal_dtype.value - elements = None - if nominal_dtype == NominalDtype.CATEGORY: - dtype = np.int8 - elements = st.integers(0, 15) - elif nominal_dtype == NominalDtype.UTF8: - # nps.arrays(dtype="U8") doesn't skip surrogates by default - elements = utf8_strat - x = draw(nps.arrays(dtype=dtype, shape=nrows, elements=elements)) - assert not isinstance(nominal_dtype, str) - name_to_column[colname] = MockColumn(x, nominal_dtype) + name_to_column[colname] = draw(mock_columns(nominal_dtype, nrows)) return MockDataFrame(name_to_column) + + +@st.composite +def mock_single_col_dataframes( + draw: st.DrawFn, + *, + dtypes: Collection[NominalDtype] = set(NominalDtype), + allow_zero_rows: bool = True, +) -> MockDataFrame: + colname = draw(utf8_strings()) + nominal_dtype = draw(st.sampled_from(list(dtypes))) + min_size = 0 if allow_zero_rows else 1 + size = draw(st.integers(min_size, 5)) + mock_col = draw(mock_columns(nominal_dtype, size)) + return MockDataFrame({colname: mock_col}) diff --git a/tests/test_buffer_object.py b/tests/test_buffer_object.py index 4f2d9f2..59273a6 100644 --- a/tests/test_buffer_object.py +++ b/tests/test_buffer_object.py @@ -3,42 +3,26 @@ from hypothesis import given from hypothesis import strategies as st -from tests.api import Buffer - -from .strategies import mock_dataframes from .wrappers import LibraryInfo -def draw_buffer(libinfo: LibraryInfo, data: st.DataObject) -> Buffer: - mock_df = data.draw( - mock_dataframes(**{**libinfo.mock_dataframes_kwargs, "ncols": 1}), - label="mock_df", - ) - df = libinfo.mock_to_interchange(mock_df) - name = next(iter(mock_df.keys())) - col = df.get_column_by_name(name) - bufinfo = col.get_buffers() - buf, _ = bufinfo["data"] - return buf - - @given(data=st.data()) def test_bufsize(libinfo: LibraryInfo, data: st.DataObject): - buf = draw_buffer(libinfo, data) + buf = data.draw(libinfo.buffers(), label="buf") bufsize = buf.bufsize assert isinstance(bufsize, int) @given(data=st.data()) def test_ptr(libinfo: LibraryInfo, data: st.DataObject): - buf = draw_buffer(libinfo, data) + buf = data.draw(libinfo.buffers(), label="buf") ptr = buf.ptr assert isinstance(ptr, int) @given(data=st.data()) def test_dlpack_device(libinfo: LibraryInfo, data: st.DataObject): - buf = draw_buffer(libinfo, data) + buf = data.draw(libinfo.buffers(), label="buf") dlpack_device = buf.__dlpack_device__() assert isinstance(dlpack_device, tuple) assert len(dlpack_device) == 2 diff --git a/tests/test_column_object.py b/tests/test_column_object.py index 9411d1b..9ff742b 100644 --- a/tests/test_column_object.py +++ b/tests/test_column_object.py @@ -1,35 +1,18 @@ from enum import IntEnum -from typing import Dict, Tuple +from typing import Dict import numpy as np import pytest -from hypothesis import given, note +from hypothesis import given from hypothesis import strategies as st -from tests.api import Column - -from .strategies import MockColumn, NominalDtype, mock_dataframes +from .strategies import NominalDtype from .wrappers import LibraryInfo -# TODO: helpful assertion messages - - -def draw_column_and_mock( - libinfo: LibraryInfo, data: st.DataObject, **kwargs -) -> Tuple[Column, MockColumn]: - mock_df = data.draw( - mock_dataframes(**{**libinfo.mock_dataframes_kwargs, **kwargs, "ncols": 1}), - label="mock_df", - ) - df = libinfo.mock_to_interchange(mock_df) - name = next(iter(mock_df.keys())) - note(f"{libinfo.mock_to_toplevel(mock_df)[name]=}") - return df.get_column_by_name(name), mock_df[name] - @given(data=st.data()) def test_size(libinfo: LibraryInfo, data: st.DataObject): - col, mock_col = draw_column_and_mock(libinfo, data) + col, mock_col = data.draw(libinfo.columns_and_mock_columns(), label="col, mock_col") size = col.size() assert isinstance(size, int) assert size == mock_col.array.size @@ -37,7 +20,7 @@ def test_size(libinfo: LibraryInfo, data: st.DataObject): @given(data=st.data()) def test_offset(libinfo: LibraryInfo, data: st.DataObject): - col, _ = draw_column_and_mock(libinfo, data) + col = data.draw(libinfo.columns(), label="col") offset = col.offset assert isinstance(offset, int) @@ -85,7 +68,7 @@ class DtypeKind(IntEnum): @given(data=st.data()) def test_dtype(libinfo: LibraryInfo, data: st.DataObject): - col, mock_col = draw_column_and_mock(libinfo, data) + col, mock_col = data.draw(libinfo.columns_and_mock_columns(), label="col, mock_col") dtype = col.dtype assert isinstance(dtype, tuple) assert len(dtype) == 4 @@ -107,11 +90,8 @@ def test_dtype(libinfo: LibraryInfo, data: st.DataObject): def test_describe_categorical(libinfo: LibraryInfo, data: st.DataObject): if NominalDtype.CATEGORY not in libinfo.supported_dtypes: pytest.skip(f"categorical columns not generated for {libinfo.name}") - if data.draw(st.booleans()): - dtypes = {NominalDtype.CATEGORY} - else: - dtypes = libinfo.supported_dtypes # TODO: removing categorical here is flaky? - col, mock_col = draw_column_and_mock(libinfo, data, dtypes=dtypes) + # TODO: bias categorical generation + col, mock_col = data.draw(libinfo.columns_and_mock_columns(), label="col, mock_col") if mock_col.nominal_dtype == NominalDtype.CATEGORY: catinfo = col.describe_categorical assert isinstance(catinfo, dict) @@ -128,14 +108,14 @@ def test_describe_categorical(libinfo: LibraryInfo, data: st.DataObject): @given(data=st.data()) def test_describe_null(libinfo: LibraryInfo, data: st.DataObject): - col, _ = draw_column_and_mock(libinfo, data) + col = data.draw(libinfo.columns(), label="col") nullinfo = col.describe_null assert isinstance(nullinfo, tuple) assert len(nullinfo) == 2 kind, value = nullinfo assert isinstance(kind, int) assert kind in [0, 1, 2, 3, 4] - if kind in [0, 1]: # noll-nullable or NaN/NaT + if kind in [0, 1]: # noll-nullable or NaN assert value is None elif kind in [3, 4]: # bit or byte mask assert isinstance(value, int) @@ -144,7 +124,7 @@ def test_describe_null(libinfo: LibraryInfo, data: st.DataObject): @given(data=st.data()) def test_null_count(libinfo: LibraryInfo, data: st.DataObject): - col, mock_col = draw_column_and_mock(libinfo, data) + col, mock_col = data.draw(libinfo.columns_and_mock_columns(), label="col, mock_col") null_count = col.null_count if null_count is not None: assert isinstance(null_count, int) @@ -154,14 +134,14 @@ def test_null_count(libinfo: LibraryInfo, data: st.DataObject): @given(data=st.data()) def test_num_chunks(libinfo: LibraryInfo, data: st.DataObject): - col, _ = draw_column_and_mock(libinfo, data) + col = data.draw(libinfo.columns(), label="col") num_chunks = col.num_chunks() assert isinstance(num_chunks, int) @given(data=st.data()) def test_get_chunks(libinfo: LibraryInfo, data: st.DataObject): - col, _ = draw_column_and_mock(libinfo, data) + col = data.draw(libinfo.columns(), label="col") num_chunks = col.num_chunks() n_chunks = data.draw( st.none() | st.integers(1, 2).map(lambda n: n * num_chunks), @@ -176,7 +156,7 @@ def test_get_chunks(libinfo: LibraryInfo, data: st.DataObject): @given(data=st.data()) def test_get_buffers(libinfo: LibraryInfo, data: st.DataObject): - col, _ = draw_column_and_mock(libinfo, data) + col = data.draw(libinfo.columns(), label="col") bufinfo = col.get_buffers() assert isinstance(bufinfo, dict) for key in ["data", "validity", "offsets"]: diff --git a/tests/test_meta.py b/tests/test_meta.py index 895c833..e6993c4 100644 --- a/tests/test_meta.py +++ b/tests/test_meta.py @@ -16,7 +16,16 @@ def test_mock_dataframes(mock_df): @pytest.mark.parametrize( - "func_name", ["mock_dataframes", "toplevel_dataframes", "interchange_dataframes"] + "func_name", + [ + "mock_dataframes", + "toplevel_dataframes", + "interchange_dataframes", + "mock_single_col_dataframes", + "columns", + "columns_and_mock_columns", + "buffers", + ], ) @given(data=st.data()) def test_strategy(libinfo: LibraryInfo, func_name: str, data: st.DataObject): diff --git a/tests/test_signatures.py b/tests/test_signatures.py index 6a220e0..c165961 100644 --- a/tests/test_signatures.py +++ b/tests/test_signatures.py @@ -3,7 +3,7 @@ from typing import Callable # See https://github.com/python/mypy/issues/6864 import pytest -from hypothesis import assume, given, note, settings +from hypothesis import given, settings from hypothesis import strategies as st from .api import * @@ -108,10 +108,7 @@ def test_dataframe_method( @given(data=st.data()) @settings(max_examples=1) def test_column_method(libinfo: LibraryInfo, stub: FunctionType, data: st.DataObject): - df = data.draw(libinfo.interchange_dataframes(), label="df") - assume(df.num_columns() > 0) - col = df.get_column(0) - note(f"{col=}") + col = data.draw(libinfo.columns(), label="col") assert hasattr(col, stub.__name__) method = getattr(col, stub.__name__) assert isinstance(method, Callable) # type: ignore @@ -128,13 +125,7 @@ def test_column_method(libinfo: LibraryInfo, stub: FunctionType, data: st.DataOb @given(data=st.data()) @settings(max_examples=1) def test_buffer_method(libinfo: LibraryInfo, stub: FunctionType, data: st.DataObject): - df = data.draw(libinfo.interchange_dataframes(), label="df") - assume(df.num_columns() > 0) - col = df.get_column(0) - note(f"{col=}") - bufinfo = col.get_buffers() - buf, _ = bufinfo["data"] - note(f"{buf=}") + buf = data.draw(libinfo.buffers(), label="buf") assert hasattr(buf, stub.__name__) method = getattr(buf, stub.__name__) assert isinstance(method, Callable) # type: ignore diff --git a/tests/wrappers.py b/tests/wrappers.py index f6e5c29..4746ff4 100644 --- a/tests/wrappers.py +++ b/tests/wrappers.py @@ -6,8 +6,14 @@ import pytest from hypothesis import strategies as st -from .api import DataFrame -from .strategies import MockDataFrame, NominalDtype, mock_dataframes +from .api import Buffer, Column, DataFrame +from .strategies import ( + MockColumn, + MockDataFrame, + NominalDtype, + mock_dataframes, + mock_single_col_dataframes, +) __all__ = ["libname_to_libinfo", "libinfo_params", "LibraryInfo"] @@ -44,6 +50,31 @@ def toplevel_dataframes(self) -> st.SearchStrategy[TopLevelDataFrame]: def interchange_dataframes(self) -> st.SearchStrategy[TopLevelDataFrame]: return self.toplevel_dataframes().map(lambda df: df.__dataframe__()) + def mock_single_col_dataframes(self) -> st.SearchStrategy[MockDataFrame]: + return mock_single_col_dataframes( + dtypes=self.supported_dtypes, allow_zero_rows=self.allow_zero_rows + ) + + def columns(self) -> st.SearchStrategy[Column]: + return ( + self.mock_single_col_dataframes() + .map(self.mock_to_interchange) + .map(lambda df: df.get_column(0)) + ) + + def columns_and_mock_columns(self) -> st.SearchStrategy[Tuple[Column, MockColumn]]: + mock_df_strat = st.shared(self.mock_single_col_dataframes()) + col_strat = mock_df_strat.map(self.mock_to_interchange).map( + lambda df: df.get_column(0) + ) + mock_col_strat = mock_df_strat.map( + lambda mock_df: next(col for col in mock_df.values()) + ) + return st.tuples(col_strat, mock_col_strat) + + def buffers(self) -> st.SearchStrategy[Buffer]: + return self.columns().map(lambda col: col.get_buffers()["data"][0]) + def __repr__(self) -> str: return f"LibraryInfo(<{self.name}>)" @@ -181,7 +212,7 @@ def modin_mock_to_toplevel(mock_df: MockDataFrame) -> mpd.DataFrame: if mock_df.ncols == 0: return mpd.DataFrame() if mock_df.nrows == 0: - raise ValueError(f"{mock_df=} not supported by modin") + raise ValueError(f"{mock_df.nrows=} not supported by modin") serieses: List[mpd.Series] = [] for name, (array, nominal_dtype) in mock_df.items(): if nominal_dtype == NominalDtype.UTF8: @@ -221,8 +252,6 @@ def modin_frame_equal(df1: mpd.DataFrame, df2: mpd.DataFrame) -> bool: NominalDtype.DATETIME64NS, # https://github.com/modin-project/modin/issues/4654 NominalDtype.UTF8, - # https://github.com/modin-project/modin/issues/4652 - NominalDtype.CATEGORY, }, # https://github.com/modin-project/modin/issues/4643 allow_zero_rows=False, From ecbc1d4565bdcb203e9055906b485a091eb81476 Mon Sep 17 00:00:00 2001 From: Matthew Barber Date: Mon, 8 Aug 2022 15:19:46 +0100 Subject: [PATCH 06/13] Update wrappers --- tests/wrappers.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/wrappers.py b/tests/wrappers.py index 4746ff4..b5f81ee 100644 --- a/tests/wrappers.py +++ b/tests/wrappers.py @@ -87,7 +87,7 @@ def __repr__(self) -> str: try: import pandas as pd - from pandas.api.exchange import from_dataframe as pandas_from_dataframe + from pandas.api.interchange import from_dataframe as pandas_from_dataframe except ImportError as e: libinfo_params.append(pytest.param("pandas", marks=pytest.mark.skip(reason=e.msg))) else: @@ -111,7 +111,6 @@ def pandas_mock_to_toplevel(mock_df: MockDataFrame) -> pd.DataFrame: mock_to_toplevel=pandas_mock_to_toplevel, from_dataframe=pandas_from_dataframe, frame_equal=lambda df1, df2: df1.equals(df2), - supported_dtypes=set(NominalDtype) ^ {NominalDtype.DATETIME64NS}, ) libinfo_params.append(pytest.param(pandas_libinfo, id=pandas_libinfo.name)) @@ -183,15 +182,19 @@ def vaex_frame_equal(df1, df2) -> bool: try: import modin # noqa: F401 - # One issue modin has with pandas upstream is an outdated import of an - # exception class, so we try monkey-patching the class to the old path. try: + import pandas from pandas.core import base from pandas.errors import DataError except ImportError: pass else: + # One issue modin has with pandas upstream is an outdated import of an + # exception class, so we try monkey-patching the class to the old path. setattr(base, "DataError", DataError) + # modin also hard checks for supported pandas versions, so we + # monkey-patch a supported version. + setattr(pandas, "__version__", "1.4.3") import ray From 3e176c6a05b9da097f7f72fb9f096803b135a80b Mon Sep 17 00:00:00 2001 From: Matthew Barber Date: Tue, 9 Aug 2022 11:27:08 +0100 Subject: [PATCH 07/13] Improved `describe_categorical` testing --- tests/strategies.py | 2 +- tests/test_column_object.py | 58 ++++++++++++++++++++++++++----------- 2 files changed, 42 insertions(+), 18 deletions(-) diff --git a/tests/strategies.py b/tests/strategies.py index 61b1b86..2ddff8c 100644 --- a/tests/strategies.py +++ b/tests/strategies.py @@ -69,7 +69,7 @@ def __repr__(self) -> str: def utf8_strings() -> st.SearchStrategy[str]: - return st.from_regex(r"[a-zA-Z\_]{1,8}", fullmatch=True).filter( + return st.from_regex(r"[a-zA-Z][a-zA-Z\_]", fullmatch=True).filter( lambda b: b[-1:] != "\0" ) diff --git a/tests/test_column_object.py b/tests/test_column_object.py index 9ff742b..f69e359 100644 --- a/tests/test_column_object.py +++ b/tests/test_column_object.py @@ -3,10 +3,10 @@ import numpy as np import pytest -from hypothesis import given +from hypothesis import given, note from hypothesis import strategies as st -from .strategies import NominalDtype +from .strategies import NominalDtype, mock_single_col_dataframes from .wrappers import LibraryInfo @@ -87,23 +87,47 @@ def test_dtype(libinfo: LibraryInfo, data: st.DataObject): @given(data=st.data()) -def test_describe_categorical(libinfo: LibraryInfo, data: st.DataObject): +def test_describe_categorical_on_categorical(libinfo: LibraryInfo, data: st.DataObject): if NominalDtype.CATEGORY not in libinfo.supported_dtypes: pytest.skip(f"categorical columns not generated for {libinfo.name}") - # TODO: bias categorical generation - col, mock_col = data.draw(libinfo.columns_and_mock_columns(), label="col, mock_col") - if mock_col.nominal_dtype == NominalDtype.CATEGORY: - catinfo = col.describe_categorical - assert isinstance(catinfo, dict) - for key in ["is_ordered", "is_dictionary", "categories"]: - assert key in catinfo.keys() - assert isinstance(catinfo["is_ordered"], bool) - assert isinstance(catinfo["is_dictionary"], bool) - if not catinfo["is_dictionary"]: - assert catinfo["categories"] is None - else: - with pytest.raises(TypeError): - col.describe_categorical + mock_df = data.draw( + mock_single_col_dataframes( + dtypes={NominalDtype.CATEGORY}, + allow_zero_rows=libinfo.allow_zero_rows, + ), + label="mock_df", + ) + df = libinfo.mock_to_interchange(mock_df) + col = df.get_column(0) + note(f"{col=}") + catinfo = col.describe_categorical + assert isinstance(catinfo, dict) + for key in ["is_ordered", "is_dictionary", "categories"]: + assert key in catinfo.keys() + assert isinstance(catinfo["is_ordered"], bool) + assert isinstance(catinfo["is_dictionary"], bool) + if not catinfo["is_dictionary"]: + assert catinfo["categories"] is None + + +@given(data=st.data()) +def test_describe_categorical_on_non_categorical( + libinfo: LibraryInfo, data: st.DataObject +): + dtypes = libinfo.supported_dtypes + if NominalDtype.CATEGORY in libinfo.supported_dtypes: + dtypes.remove(NominalDtype.CATEGORY) + mock_df = data.draw( + mock_single_col_dataframes( + dtypes=dtypes, allow_zero_rows=libinfo.allow_zero_rows + ), + label="mock_df", + ) + df = libinfo.mock_to_interchange(mock_df) + col = df.get_column(0) + note(f"{col=}") + with pytest.raises(TypeError): + col.describe_categorical @given(data=st.data()) From afc7c77717be9a6fae2249b8d8be893a21cb07f0 Mon Sep 17 00:00:00 2001 From: Matthew Barber Date: Wed, 10 Aug 2022 18:22:44 +0100 Subject: [PATCH 08/13] Fix regex used to generate utf8 strings --- tests/strategies.py | 4 +--- tests/test_meta.py | 8 +++++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/strategies.py b/tests/strategies.py index 2ddff8c..6a2424f 100644 --- a/tests/strategies.py +++ b/tests/strategies.py @@ -69,9 +69,7 @@ def __repr__(self) -> str: def utf8_strings() -> st.SearchStrategy[str]: - return st.from_regex(r"[a-zA-Z][a-zA-Z\_]", fullmatch=True).filter( - lambda b: b[-1:] != "\0" - ) + return st.from_regex(r"[a-zA-Z][a-zA-Z\_]{0,7}", fullmatch=True) def mock_columns( diff --git a/tests/test_meta.py b/tests/test_meta.py index e6993c4..7856240 100644 --- a/tests/test_meta.py +++ b/tests/test_meta.py @@ -6,10 +6,16 @@ from hypothesis import given from hypothesis import strategies as st -from .strategies import MockDataFrame, mock_dataframes +from .strategies import MockDataFrame, mock_dataframes, utf8_strings from .wrappers import LibraryInfo +@given(utf8_strings()) +def test_utf8_strings(string): + assert isinstance(string, str) + assert string[-1:] != "\0" + + @given(mock_dataframes()) def test_mock_dataframes(mock_df): assert isinstance(mock_df, MockDataFrame) From 540c91a38d4bc6268c4e8442da88b79c9eecdeab Mon Sep 17 00:00:00 2001 From: Matthew Barber Date: Wed, 14 Sep 2022 10:20:56 +0100 Subject: [PATCH 09/13] Assert datetime columns don't describe null as `kind=1` --- tests/test_column_object.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_column_object.py b/tests/test_column_object.py index f69e359..1205730 100644 --- a/tests/test_column_object.py +++ b/tests/test_column_object.py @@ -132,13 +132,19 @@ def test_describe_categorical_on_non_categorical( @given(data=st.data()) def test_describe_null(libinfo: LibraryInfo, data: st.DataObject): - col = data.draw(libinfo.columns(), label="col") + col, mock_col = data.draw(libinfo.columns_and_mock_columns(), label="col, mock_col") nullinfo = col.describe_null assert isinstance(nullinfo, tuple) assert len(nullinfo) == 2 kind, value = nullinfo assert isinstance(kind, int) assert kind in [0, 1, 2, 3, 4] + if mock_col.nominal_dtype == NominalDtype.DATETIME64NS: + # The spec previously treated kind=1 as NaNs AND NaTs, but has since + # been updated to exclude NaTs. This means datetime columns should + # never have nulls represented as kind=1, as NaNs are a floating-point + # concept. See https://github.com/data-apis/dataframe-api/issues/64 + assert kind != 1 if kind in [0, 1]: # noll-nullable or NaN assert value is None elif kind in [3, 4]: # bit or byte mask From f3d8c35e0fce0a85e55edad6f0fae694ca40788a Mon Sep 17 00:00:00 2001 From: Matthew Barber Date: Wed, 14 Sep 2022 10:38:45 +0100 Subject: [PATCH 10/13] Update xfails --- tests/conftest.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 844e889..12faf24 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -45,29 +45,31 @@ def pytest_configure(config): "test_signatures.py::test_dataframe_method[vaex-__dataframe__]", "test_dataframe_object.py::test_dunder_dataframe[cudf]", "test_signatures.py::test_dataframe_method[cudf-__dataframe__]", + # https://github.com/vaexio/vaex/pull/2150 + "tests/test_signatures.py::test_column_method[vaex-size]", # https://github.com/rapidsai/cudf/issues/11320 "test_signatures.py::test_buffer_method[cudf-__dlpack__]", "test_signatures.py::test_buffer_method[cudf-__dlpack_device__]", # https://github.com/vaexio/vaex/issues/2083 # https://github.com/vaexio/vaex/issues/2093 # https://github.com/vaexio/vaex/issues/2113 + # https://github.com/vaexio/vaex/pull/2150 "test_from_dataframe.py::test_from_dataframe_roundtrip[pandas-vaex]", + "test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-modin]", "test_from_dataframe.py::test_from_dataframe_roundtrip[modin-vaex]", "test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-pandas]", - # https://github.com/vaexio/vaex/issues/2093 + # https://github.com/vaexio/vaex/pull/2150 "test_column_object.py::test_size[vaex]", # https://github.com/rapidsai/cudf/issues/11389 "test_column_object.py::test_dtype[cudf]", - # recent spec change https://github.com/data-apis/dataframe-api/pull/74 - "test_column_object.py::test_describe_categorical", + # https://github.com/vaexio/vaex/pull/2150 + "test_column_object.py::test_describe_categorical_on_categorical[vaex]", # Raises RuntimeError, which is technically correct, but the spec will # require TypeError soon. # See https://github.com/data-apis/dataframe-api/pull/74 "test_column_object.py::test_describe_categorical[modin]", # https://github.com/vaexio/vaex/issues/2113 "test_column_object.py::test_describe_categorical[vaex]", - # https://github.com/pandas-dev/pandas/issues/47789 - "test_column_object.py::test_null_count[pandas]", # https://github.com/modin-project/modin/issues/4687 "test_column_object.py::test_null_count[modin]", # https://github.com/vaexio/vaex/issues/2121 From 38d69c21fd0e17ddb24da338ffa386514d0c11a8 Mon Sep 17 00:00:00 2001 From: Matthew Barber Date: Wed, 14 Sep 2022 11:20:25 +0100 Subject: [PATCH 11/13] Meta test to check `--ci` is testing intended libraries --- tests/test_meta.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_meta.py b/tests/test_meta.py index 7856240..57bb8f0 100644 --- a/tests/test_meta.py +++ b/tests/test_meta.py @@ -7,7 +7,13 @@ from hypothesis import strategies as st from .strategies import MockDataFrame, mock_dataframes, utf8_strings -from .wrappers import LibraryInfo +from .wrappers import LibraryInfo, libname_to_libinfo + + +def test_ci_has_correct_library_params(pytestconfig): + if not pytestconfig.getoption("--ci"): + pytest.skip("only intended for --ci runs") + assert set(libname_to_libinfo.keys()) == {"pandas", "vaex", "modin"} @given(utf8_strings()) From c9ce3c7fd42bfc03bf5ffff55356e78ec44695e4 Mon Sep 17 00:00:00 2001 From: Matthew Barber Date: Wed, 14 Sep 2022 12:28:13 +0100 Subject: [PATCH 12/13] Build pandas on CI for now --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6ef023e..67194e0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -17,7 +17,7 @@ jobs: run: | pip install ray git+https://github.com/modin-project/modin pip install vaex # use stable as no nightly builds and long build time - pip install -i https://pypi.anaconda.org/scipy-wheels-nightly/simple pandas --ignore-installed --no-deps + pip install git+https://github.com/pandas-dev/pandas --no-deps --ignore-installed # TODO: use nightly builds again - name: Run tests run: | pytest tests/ -v --ci From a86a635b653e7d7bc2e069a508aa1fd0bbaecfce Mon Sep 17 00:00:00 2001 From: Matthew Barber Date: Wed, 14 Sep 2022 15:18:34 +0100 Subject: [PATCH 13/13] Run workflows on just push --- .github/workflows/lint.yml | 2 +- .github/workflows/test.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 4cf809f..03fcbfe 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -1,5 +1,5 @@ name: Linting -on: [push, pull_request] +on: [push] jobs: build: runs-on: ubuntu-latest diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 67194e0..706c2a6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,5 +1,5 @@ name: Run tests -on: [push, pull_request] +on: [push] jobs: test: runs-on: ubuntu-latest