diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 4cf809f..03fcbfe 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -1,5 +1,5 @@
 name: Linting
-on: [push, pull_request]
+on: [push]
 jobs:
   build:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 6ef023e..706c2a6 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -1,5 +1,5 @@
 name: Run tests
-on: [push, pull_request]
+on: [push]
 jobs:
   test:
     runs-on: ubuntu-latest
@@ -17,7 +17,7 @@ jobs:
       run: |
         pip install ray git+https://github.com/modin-project/modin
         pip install vaex  # use stable as no nightly builds and long build time
-        pip install -i https://pypi.anaconda.org/scipy-wheels-nightly/simple pandas --ignore-installed --no-deps
+        pip install git+https://github.com/pandas-dev/pandas --no-deps --ignore-installed  # TODO: use nightly builds again
     - name: Run tests
       run: |
         pytest tests/ -v --ci
diff --git a/tests/api.py b/tests/api.py
index a407e9f..2860f35 100644
--- a/tests/api.py
+++ b/tests/api.py
@@ -1,15 +1,107 @@
 from __future__ import annotations

 import enum
-from typing import Any, Iterable, Sequence
+from abc import ABC, abstractmethod
+from typing import Any, Iterable, Sequence, TypedDict

-__all__ = ["Buffer", "Column", "DataFrame"]
+class DlpackDeviceType(enum.IntEnum):
+    """Integer enum for device type codes matching DLPack."""

-# TODO: load classes at runtime from submodule of df protocol repo
+    CPU = 1
+    CUDA = 2
+    CPU_PINNED = 3
+    OPENCL = 4
+    VULKAN = 7
+    METAL = 8
+    VPI = 9
+    ROCM = 10

-class Buffer:
+
+class DtypeKind(enum.IntEnum):
+    """
+    Integer enum for data types.
+
+    Attributes
+    ----------
+    INT : int
+        Matches signed integer data type.
+    UINT : int
+        Matches unsigned integer data type.
+    FLOAT : int
+        Matches floating point data type.
+    BOOL : int
+        Matches boolean data type.
+    STRING : int
+        Matches string data type (UTF-8 encoded).
+    DATETIME : int
+        Matches datetime data type.
+    CATEGORICAL : int
+        Matches categorical data type.
+    """
+
+    INT = 0
+    UINT = 1
+    FLOAT = 2
+    BOOL = 20
+    STRING = 21  # UTF-8
+    DATETIME = 22
+    CATEGORICAL = 23
+
+
+class ColumnNullType(enum.IntEnum):
+    """
+    Integer enum for null type representation.
+
+    Attributes
+    ----------
+    NON_NULLABLE : int
+        Non-nullable column.
+    USE_NAN : int
+        Use an explicit float NaN value.
+    USE_SENTINEL : int
+        Use a sentinel value other than NaN.
+    USE_BITMASK : int
+        A bit being set/unset at a given position represents a null.
+    USE_BYTEMASK : int
+        A byte being set/unset at a given position represents a null.
+    """
+
+    NON_NULLABLE = 0
+    USE_NAN = 1
+    USE_SENTINEL = 2
+    USE_BITMASK = 3
+    USE_BYTEMASK = 4
+
+
+class ColumnBuffers(TypedDict):
+    # first element is a buffer containing the column data;
+    # second element is the data buffer's associated dtype
+    data: tuple[Buffer, Any]
+
+    # first element is a buffer containing mask values indicating missing data;
+    # second element is the mask value buffer's associated dtype.
+    # None if the null representation is not a bit or byte mask
+    validity: tuple[Buffer, Any] | None
+
+    # first element is a buffer containing the offset values for
+    # variable-size binary data (e.g., variable-length strings);
+    # second element is the offsets buffer's associated dtype.
+    # None if the data buffer does not have an associated offsets buffer
+    offsets: tuple[Buffer, Any] | None
+
+
+class CategoricalDescription(TypedDict):
+    # whether the ordering of dictionary indices is semantically meaningful
+    is_ordered: bool
+    # whether a dictionary-style mapping of categorical values to other objects exists
+    is_dictionary: bool
+    # Column of category values (e.g. an array of cat1, cat2, ...);
+    # None if not a dictionary-style categorical.
+    categories: Column | None
+
+
+class Buffer(ABC):
     """
     Data in the buffer is guaranteed to be contiguous in memory.
@@ -25,17 +117,20 @@ class Buffer:
     """

     @property
+    @abstractmethod
     def bufsize(self) -> int:
         """
         Buffer size in bytes.
         """

     @property
+    @abstractmethod
     def ptr(self) -> int:
         """
         Pointer to start of the buffer as an integer.
         """

+    @abstractmethod
     def __dlpack__(self):
         """
         Produce DLPack capsule (see array API standard).
@@ -50,26 +145,16 @@ def __dlpack__(self):
         """
         raise NotImplementedError("__dlpack__")

-    def __dlpack_device__(self) -> tuple[enum.IntEnum, int]:
+    @abstractmethod
+    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
         """
         Device type and device ID for where the data in the buffer resides.
-
-        Uses device type codes matching DLPack. Enum members are::
-
-        - CPU = 1
-        - CUDA = 2
-        - CPU_PINNED = 3
-        - OPENCL = 4
-        - VULKAN = 7
-        - METAL = 8
-        - VPI = 9
-        - ROCM = 10
-
+        Uses device type codes matching DLPack.
         Note: must be implemented even if ``__dlpack__`` is not.
         """


-class Column:
+class Column(ABC):
     """
     A column object, with only the methods and properties required by the
     interchange protocol defined.
@@ -111,19 +196,22 @@ class Column:
     Note: this Column object can only be produced by ``__dataframe__``, so
           doesn't need its own version or ``__column__`` protocol.
-
     """

-    @property
-    def size(self) -> int | None:
+    @abstractmethod
+    def size(self) -> int:
         """
         Size of the column, in elements.

         Corresponds to DataFrame.num_rows() if column is a single chunk;
         equal to size of this current chunk otherwise.
+
+        Is a method rather than a property because it may cause a (potentially
+        expensive) computation for some dataframe implementations.
         """

     @property
+    @abstractmethod
     def offset(self) -> int:
         """
         Offset of first element.
@@ -134,27 +222,17 @@ def offset(self) -> int:
         """

     @property
-    def dtype(self) -> tuple[enum.IntEnum, int, str, str]:
+    @abstractmethod
+    def dtype(self) -> tuple[DtypeKind, int, str, str]:
         """
         Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.

-        Kind :
-
-            - INT = 0
-            - UINT = 1
-            - FLOAT = 2
-            - BOOL = 20
-            - STRING = 21   # UTF-8
-            - DATETIME = 22
-            - CATEGORICAL = 23
-
         Bit-width : the number of bits as an integer
         Format string : data type description format string in Apache Arrow C
                         Data Interface format.
         Endianness : current only native endianness (``=``) is supported

         Notes:
-
         - Kind specifiers are aligned with DLPack where possible (hence the
           jump to 20, leave enough room for future extension)
         - Masks must be specified as boolean with either bit width 1 (for bit
@@ -174,49 +252,42 @@ def dtype(self) -> tuple[enum.IntEnum, int, str, str]:
         and nested (list, struct, map, union) dtypes.
         """

-    # TODO: What should the dict key be?
     @property
-    def describe_categorical(self) -> dict[str, tuple[bool, bool, dict | None]]:
+    @abstractmethod
+    def describe_categorical(self) -> CategoricalDescription:
         """
         If the dtype is categorical, there are two options:
-
         - There are only values in the data buffer.
-        - There is a separate dictionary-style encoding for categorical values.
+        - There is a separate non-categorical Column encoding categorical values.

-        Raises RuntimeError if the dtype is not categorical
-
-        Content of returned dict:
+        Raises TypeError if the dtype is not categorical
+        Returns a dictionary describing how to interpret the data buffer:
         - "is_ordered" : bool, whether the ordering of dictionary indices is
           semantically meaningful.
-        - "is_dictionary" : bool, whether a dictionary-style mapping of
+        - "is_dictionary" : bool, whether a mapping of
           categorical values to other objects exists
-        - "mapping" : dict, Python-level only (e.g. ``{int: str}``).
-          None if not a dictionary-style categorical.
+        - "categories" : Column representing the (implicit) mapping of indices to
+          category values (e.g. an array of cat1, cat2, ...).
+          None if not a dictionary-style categorical.

         TBD: are there any other in-memory representations that are needed?
         """

     @property
-    def describe_null(self) -> tuple[int, Any]:
+    @abstractmethod
+    def describe_null(self) -> tuple[ColumnNullType, Any]:
         """
         Return the missing value (or "null") representation the column dtype
         uses, as a tuple ``(kind, value)``.

-        Kind:
-
-            - 0 : non-nullable
-            - 1 : NaN/NaT
-            - 2 : sentinel value
-            - 3 : bit mask
-            - 4 : byte mask
-
         Value : if kind is "sentinel value", the actual value. If kind is a bit
         mask or a byte mask, the value (0 or 1) indicating a missing value. None
         otherwise.
         """

     @property
+    @abstractmethod
     def null_count(self) -> int | None:
         """
         Number of null elements, if known.
@@ -225,16 +296,19 @@ def null_count(self) -> int | None:
         """

     @property
+    @abstractmethod
     def metadata(self) -> dict[str, Any]:
         """
         The metadata for the column. See `DataFrame.metadata` for more details.
         """

+    @abstractmethod
     def num_chunks(self) -> int:
         """
         Return the number of chunks the column consists of.
         """

+    @abstractmethod
     def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]:
         """
         Return an iterator yielding the chunks.

         See `DataFrame.get_chunks` for details on ``n_chunks``.
         """

-    def get_buffers(self) -> dict[str, Any]:
+    @abstractmethod
+    def get_buffers(self) -> ColumnBuffers:
         """
         Return a dictionary containing the underlying buffers.
@@ -270,10 +345,10 @@
 #     Children columns underneath the column, each object in this iterator
 #     must adhere to the column specification.
 #     """
-#
+#     pass


-class DataFrame:
+class DataFrame(ABC):
     """
     A data frame class, with only the methods required by the interchange
     protocol defined.
@@ -288,28 +363,27 @@ class DataFrame:
     to the dataframe interchange protocol specification.
     """

-    def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True) -> dict:
+    version = 0  # version of the protocol
+
+    @abstractmethod
+    def __dataframe__(
+        self, nan_as_null: bool = False, allow_copy: bool = True
+    ) -> DataFrame:
         """
-        Produces a dictionary object following the dataframe protocol specification.
+        Construct a new exchange object, potentially changing the parameters.

         ``nan_as_null`` is a keyword intended for the consumer to tell the
-        producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
+        producer to overwrite null values in the data with ``NaN``.
         It is intended for cases where the consumer does not support the bit
         mask or byte mask that is the producer's native representation.
-
         ``allow_copy`` is a keyword that defines whether or not the library is
         allowed to make a copy of the data.
         For example, copying data would be necessary if a library supports
         strided buffers, given that this protocol specifies contiguous buffers.
         """
-        self._nan_as_null = nan_as_null
-        self._allow_zero_zopy = allow_copy
-        return {
-            "dataframe": self,  # DataFrame object adhering to the protocol
-            "version": 0,  # Version number of the protocol
-        }

     @property
+    @abstractmethod
     def metadata(self) -> dict[str, Any]:
         """
         The metadata for the data frame, as a dictionary with string keys. The
@@ -321,11 +395,13 @@ def metadata(self) -> dict[str, Any]:
         followed by a period and the desired name, e.g, ``pandas.indexcol``.
         """

+    @abstractmethod
     def num_columns(self) -> int:
         """
         Return the number of columns in the DataFrame.
         """

+    @abstractmethod
     def num_rows(self) -> int | None:
         # TODO: not happy with Optional, but need to flag it may be expensive
         #       why include it if it may be None - what do we expect consumers
@@ -334,41 +410,49 @@ def num_rows(self) -> int | None:
         Return the number of rows in the DataFrame, if available.
         """

+    @abstractmethod
     def num_chunks(self) -> int:
         """
         Return the number of chunks the DataFrame consists of.
         """

+    @abstractmethod
     def column_names(self) -> Iterable[str]:
         """
         Return an iterator yielding the column names.
         """

+    @abstractmethod
     def get_column(self, i: int) -> Column:
         """
         Return the column at the indicated position.
         """

+    @abstractmethod
     def get_column_by_name(self, name: str) -> Column:
         """
         Return the column whose name is the indicated name.
         """

+    @abstractmethod
     def get_columns(self) -> Iterable[Column]:
         """
         Return an iterator yielding the columns.
         """

+    @abstractmethod
     def select_columns(self, indices: Sequence[int]) -> DataFrame:
         """
         Create a new DataFrame by selecting a subset of columns by index.
         """

+    @abstractmethod
     def select_columns_by_name(self, names: Sequence[str]) -> DataFrame:
         """
         Create a new DataFrame by selecting a subset of columns by name.
         """

+    @abstractmethod
     def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]:
         """
         Return an iterator yielding the chunks.
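
For orientation, here is a minimal sketch (not part of the diff) of how a consumer might walk the ABCs defined in tests/api.py above. The `summarize` helper is hypothetical; it only uses methods and properties the spec defines.

# Hypothetical consumer sketch, assuming a concrete implementation of the
# DataFrame ABC above is passed in (e.g. the result of `obj.__dataframe__()`).
from tests.api import ColumnNullType, DataFrame

def summarize(df: DataFrame) -> None:
    for name in df.column_names():
        col = df.get_column_by_name(name)
        kind, bit_width, fmt, _ = col.dtype  # (kind, bit-width, format, endianness)
        print(f"{name}: kind={kind!r}, bits={bit_width}, format={fmt!r}")
        bufs = col.get_buffers()
        data_buf, _ = bufs["data"]  # the data buffer is always present
        print(f"  data: {data_buf.bufsize} bytes at 0x{data_buf.ptr:x}")
        null_kind, _ = col.describe_null
        if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
            # mask-based null representations come with a validity buffer
            assert bufs["validity"] is not None

Note how the null handling branches on `describe_null`: only the bit/byte mask kinds guarantee a non-None "validity" entry in the buffers dict.
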
diff --git a/tests/conftest.py b/tests/conftest.py
index 9e89fe7..12faf24 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -45,27 +45,31 @@ def pytest_configure(config):
         "test_signatures.py::test_dataframe_method[vaex-__dataframe__]",
         "test_dataframe_object.py::test_dunder_dataframe[cudf]",
         "test_signatures.py::test_dataframe_method[cudf-__dataframe__]",
+        # https://github.com/vaexio/vaex/pull/2150
+        "test_signatures.py::test_column_method[vaex-size]",
         # https://github.com/rapidsai/cudf/issues/11320
         "test_signatures.py::test_buffer_method[cudf-__dlpack__]",
         "test_signatures.py::test_buffer_method[cudf-__dlpack_device__]",
         # https://github.com/vaexio/vaex/issues/2083
         # https://github.com/vaexio/vaex/issues/2093
         # https://github.com/vaexio/vaex/issues/2113
+        # https://github.com/vaexio/vaex/pull/2150
         "test_from_dataframe.py::test_from_dataframe_roundtrip[pandas-vaex]",
+        "test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-modin]",
         "test_from_dataframe.py::test_from_dataframe_roundtrip[modin-vaex]",
         "test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-pandas]",
-        # https://github.com/vaexio/vaex/issues/2093
+        # https://github.com/vaexio/vaex/pull/2150
         "test_column_object.py::test_size[vaex]",
         # https://github.com/rapidsai/cudf/issues/11389
         "test_column_object.py::test_dtype[cudf]",
+        # https://github.com/vaexio/vaex/pull/2150
+        "test_column_object.py::test_describe_categorical_on_categorical[vaex]",
         # Raises RuntimeError, which is technically correct, but the spec will
         # require TypeError soon.
         # See https://github.com/data-apis/dataframe-api/pull/74
         "test_column_object.py::test_describe_categorical[modin]",
         # https://github.com/vaexio/vaex/issues/2113
         "test_column_object.py::test_describe_categorical[vaex]",
-        # https://github.com/pandas-dev/pandas/issues/47789
-        "test_column_object.py::test_null_count[pandas]",
         # https://github.com/modin-project/modin/issues/4687
         "test_column_object.py::test_null_count[modin]",
         # https://github.com/vaexio/vaex/issues/2121
diff --git a/tests/strategies.py b/tests/strategies.py
index 1125574..6a2424f 100644
--- a/tests/strategies.py
+++ b/tests/strategies.py
@@ -1,12 +1,18 @@
 from collections.abc import Mapping
 from enum import Enum
-from typing import Collection, Dict, NamedTuple, Optional
+from typing import Collection, Dict, NamedTuple

 import numpy as np
 from hypothesis import strategies as st
 from hypothesis.extra import numpy as nps

-__all__ = ["mock_dataframes", "MockDataFrame", "MockColumn", "NominalDtype"]
+__all__ = [
+    "mock_dataframes",
+    "mock_single_col_dataframes",
+    "MockDataFrame",
+    "MockColumn",
+    "NominalDtype",
+]


 class NominalDtype(Enum):
@@ -34,16 +40,16 @@ class MockColumn(NamedTuple):
 class MockDataFrame(Mapping):
     def __init__(self, name_to_column: Dict[str, MockColumn]):
         if len(name_to_column) == 0:
-            self._ncols = 0
-            self._nrows = 0
+            self.ncols = 0
+            self.nrows = 0
         else:
             arrays = [x for x, _ in name_to_column.values()]
-            self._ncols = len(arrays)
-            self._nrows = arrays[0].size
+            self.ncols = len(arrays)
+            self.nrows = arrays[0].size
             for x in arrays:
                 # sanity checks
                 assert x.ndim == 1
-                assert x.size == self._nrows
+                assert x.size == self.nrows
         self._name_to_column = name_to_column

     def __getitem__(self, key: str):
@@ -55,12 +61,6 @@ def __iter__(self):
     def __len__(self):
         return len(self._name_to_column)

-    def num_columns(self) -> int:
-        return self._ncols
-
-    def num_rows(self) -> int:
-        return self._nrows
-
     def __repr__(self) -> str:
         col_reprs = []
         for name, col in self.items():
@@ -68,46 +68,56 @@ def __repr__(self) -> str:
         return "MockDataFrame({" + ", ".join(col_reprs) + "})"


-utf8_strat = st.from_regex(r"[a-zA-Z\_]{1,8}", fullmatch=True).filter(
-    lambda b: b[-1:] != "\0"
-)
+def utf8_strings() -> st.SearchStrategy[str]:
+    return st.from_regex(r"[a-zA-Z][a-zA-Z\_]{0,7}", fullmatch=True)
+
+
+def mock_columns(
+    nominal_dtype: NominalDtype, size: int
+) -> st.SearchStrategy[MockColumn]:
+    dtype = nominal_dtype.value
+    elements = None
+    if nominal_dtype == NominalDtype.CATEGORY:
+        dtype = np.int8
+        elements = st.integers(0, 15)
+    elif nominal_dtype == NominalDtype.UTF8:
+        # nps.arrays(dtype="U8") doesn't skip surrogates by default
+        elements = utf8_strings()
+    x_strat = nps.arrays(dtype=dtype, shape=size, elements=elements)
+    return x_strat.map(lambda x: MockColumn(x, nominal_dtype))


 @st.composite
 def mock_dataframes(
-    draw,
+    draw: st.DrawFn,
     *,
-    exclude_dtypes: Collection[NominalDtype] = [],
+    dtypes: Collection[NominalDtype] = set(NominalDtype),
     allow_zero_cols: bool = True,
     allow_zero_rows: bool = True,
-    ncols: Optional[int] = None,
 ) -> MockDataFrame:
-    if ncols is None:
-        min_ncols = 0 if allow_zero_cols else 1
-        max_ncols = 5
-    else:
-        if ncols == 0 and not allow_zero_cols:
-            raise ValueError(f"ncols cannot be 0 when {allow_zero_cols=}")
-        min_ncols = ncols
-        max_ncols = ncols
+    min_ncols = 0 if allow_zero_cols else 1
     colnames = draw(
-        st.lists(utf8_strat, min_size=min_ncols, max_size=max_ncols, unique=True)
+        st.lists(utf8_strings(), min_size=min_ncols, max_size=5, unique=True)
     )
     min_nrows = 0 if allow_zero_rows else 1
     nrows = draw(st.integers(min_nrows, 5))
     name_to_column = {}
-    valid_dtypes = [e for e in NominalDtype if e not in exclude_dtypes]
     for colname in colnames:
-        nominal_dtype = draw(st.sampled_from(valid_dtypes))
-        dtype = nominal_dtype.value
-        elements = None
-        if nominal_dtype == NominalDtype.CATEGORY:
-            dtype = np.int8
-            elements = st.integers(0, 15)
-        elif nominal_dtype == NominalDtype.UTF8:
-            # nps.arrays(dtype="U8") doesn't skip surrogates by default
-            elements = utf8_strat
-        x = draw(nps.arrays(dtype=dtype, shape=nrows, elements=elements))
-        assert not isinstance(nominal_dtype, str)
-        name_to_column[colname] = MockColumn(x, nominal_dtype)
+        nominal_dtype = draw(st.sampled_from(list(dtypes)))
+        name_to_column[colname] = draw(mock_columns(nominal_dtype, nrows))
     return MockDataFrame(name_to_column)
+
+
+@st.composite
+def mock_single_col_dataframes(
+    draw: st.DrawFn,
+    *,
+    dtypes: Collection[NominalDtype] = set(NominalDtype),
+    allow_zero_rows: bool = True,
+) -> MockDataFrame:
+    colname = draw(utf8_strings())
+    nominal_dtype = draw(st.sampled_from(list(dtypes)))
+    min_size = 0 if allow_zero_rows else 1
+    size = draw(st.integers(min_size, 5))
+    mock_col = draw(mock_columns(nominal_dtype, size))
+    return MockDataFrame({colname: mock_col})
diff --git a/tests/test_buffer_object.py b/tests/test_buffer_object.py
index 4f2d9f2..59273a6 100644
--- a/tests/test_buffer_object.py
+++ b/tests/test_buffer_object.py
@@ -3,42 +3,26 @@
 from hypothesis import given
 from hypothesis import strategies as st

-from tests.api import Buffer
-
-from .strategies import mock_dataframes
 from .wrappers import LibraryInfo


-def draw_buffer(libinfo: LibraryInfo, data: st.DataObject) -> Buffer:
-    mock_df = data.draw(
-        mock_dataframes(**{**libinfo.mock_dataframes_kwargs, "ncols": 1}),
-        label="mock_df",
-    )
-    df = libinfo.mock_to_interchange(mock_df)
-    name = next(iter(mock_df.keys()))
-    col = df.get_column_by_name(name)
-    bufinfo = col.get_buffers()
-    buf, _ = bufinfo["data"]
-    return buf
-
-
 @given(data=st.data())
 def test_bufsize(libinfo: LibraryInfo, data: st.DataObject):
-    buf = draw_buffer(libinfo, data)
+    buf = data.draw(libinfo.buffers(), label="buf")
     bufsize = buf.bufsize
     assert isinstance(bufsize, int)


 @given(data=st.data())
 def test_ptr(libinfo: LibraryInfo, data: st.DataObject):
-    buf = draw_buffer(libinfo, data)
+    buf = data.draw(libinfo.buffers(), label="buf")
     ptr = buf.ptr
     assert isinstance(ptr, int)


 @given(data=st.data())
 def test_dlpack_device(libinfo: LibraryInfo, data: st.DataObject):
-    buf = draw_buffer(libinfo, data)
+    buf = data.draw(libinfo.buffers(), label="buf")
     dlpack_device = buf.__dlpack_device__()
     assert isinstance(dlpack_device, tuple)
     assert len(dlpack_device) == 2
diff --git a/tests/test_column_object.py b/tests/test_column_object.py
index 94439df..1205730 100644
--- a/tests/test_column_object.py
+++ b/tests/test_column_object.py
@@ -1,44 +1,26 @@
 from enum import IntEnum
-from typing import Dict, Tuple
+from typing import Dict

 import numpy as np
 import pytest
 from hypothesis import given, note
 from hypothesis import strategies as st

-from tests.api import Column
-
-from .strategies import MockColumn, NominalDtype, mock_dataframes
+from .strategies import NominalDtype, mock_single_col_dataframes
 from .wrappers import LibraryInfo

-# TODO: helpful assertion messages
-
-
-def draw_column_and_mock(
-    libinfo: LibraryInfo, data: st.DataObject
-) -> Tuple[Column, MockColumn]:
-    mock_df = data.draw(
-        mock_dataframes(**{**libinfo.mock_dataframes_kwargs, "ncols": 1}),
-        label="mock_df",
-    )
-    df = libinfo.mock_to_interchange(mock_df)
-    name = next(iter(mock_df.keys()))
-    note(f"{libinfo.mock_to_toplevel(mock_df)[name]=}")
-    return df.get_column_by_name(name), mock_df[name]
-

 @given(data=st.data())
 def test_size(libinfo: LibraryInfo, data: st.DataObject):
-    col, mock_col = draw_column_and_mock(libinfo, data)
-    size = col.size
-    if size is not None:
-        assert isinstance(size, int)
-        assert size == mock_col.array.size
+    col, mock_col = data.draw(libinfo.columns_and_mock_columns(), label="col, mock_col")
+    size = col.size()
+    assert isinstance(size, int)
+    assert size == mock_col.array.size


 @given(data=st.data())
 def test_offset(libinfo: LibraryInfo, data: st.DataObject):
-    col, _ = draw_column_and_mock(libinfo, data)
+    col = data.draw(libinfo.columns(), label="col")
     offset = col.offset
     assert isinstance(offset, int)

@@ -86,7 +68,7 @@ class DtypeKind(IntEnum):

 @given(data=st.data())
 def test_dtype(libinfo: LibraryInfo, data: st.DataObject):
-    col, mock_col = draw_column_and_mock(libinfo, data)
+    col, mock_col = data.draw(libinfo.columns_and_mock_columns(), label="col, mock_col")
     dtype = col.dtype
     assert isinstance(dtype, tuple)
     assert len(dtype) == 4
@@ -105,34 +87,65 @@ def test_dtype(libinfo: LibraryInfo, data: st.DataObject):

 @given(data=st.data())
-def test_describe_categorical(libinfo: LibraryInfo, data: st.DataObject):
-    # TODO: bias generation for categorical columns
-    col, mock_col = draw_column_and_mock(libinfo, data)
-    if mock_col.nominal_dtype == NominalDtype.CATEGORY:
-        catinfo = col.describe_categorical
-        assert isinstance(catinfo, dict)
-        for key in ["is_ordered", "is_dictionary", "mapping"]:
-            assert key in catinfo.keys()
-        assert isinstance(catinfo["is_ordered"], bool)
-        assert isinstance(catinfo["is_dictionary"], bool)
-        mapping = catinfo["mapping"]
-        if mapping is not None:
-            assert isinstance(mapping, dict)
-    else:
-        with pytest.raises(TypeError):
-            col.describe_categorical
+def test_describe_categorical_on_categorical(libinfo: LibraryInfo, data: st.DataObject):
+    if NominalDtype.CATEGORY not in libinfo.supported_dtypes:
+        pytest.skip(f"categorical columns not generated for {libinfo.name}")
+    mock_df = data.draw(
+        mock_single_col_dataframes(
+            dtypes={NominalDtype.CATEGORY},
+            allow_zero_rows=libinfo.allow_zero_rows,
+        ),
+        label="mock_df",
+    )
+    df = libinfo.mock_to_interchange(mock_df)
+    col = df.get_column(0)
+    note(f"{col=}")
+    catinfo = col.describe_categorical
+    assert isinstance(catinfo, dict)
+    for key in ["is_ordered", "is_dictionary", "categories"]:
+        assert key in catinfo.keys()
+    assert isinstance(catinfo["is_ordered"], bool)
+    assert isinstance(catinfo["is_dictionary"], bool)
+    if not catinfo["is_dictionary"]:
+        assert catinfo["categories"] is None
+
+
+@given(data=st.data())
+def test_describe_categorical_on_non_categorical(
+    libinfo: LibraryInfo, data: st.DataObject
+):
+    # copy the set so we don't mutate libinfo's shared supported_dtypes
+    dtypes = set(libinfo.supported_dtypes)
+    dtypes.discard(NominalDtype.CATEGORY)
+    mock_df = data.draw(
+        mock_single_col_dataframes(
+            dtypes=dtypes, allow_zero_rows=libinfo.allow_zero_rows
+        ),
+        label="mock_df",
+    )
+    df = libinfo.mock_to_interchange(mock_df)
+    col = df.get_column(0)
+    note(f"{col=}")
+    with pytest.raises(TypeError):
+        col.describe_categorical


 @given(data=st.data())
 def test_describe_null(libinfo: LibraryInfo, data: st.DataObject):
-    col, _ = draw_column_and_mock(libinfo, data)
+    col, mock_col = data.draw(libinfo.columns_and_mock_columns(), label="col, mock_col")
     nullinfo = col.describe_null
     assert isinstance(nullinfo, tuple)
     assert len(nullinfo) == 2
     kind, value = nullinfo
     assert isinstance(kind, int)
     assert kind in [0, 1, 2, 3, 4]
-    if kind in [0, 1]:  # noll-nullable or NaN/NaT
+    if mock_col.nominal_dtype == NominalDtype.DATETIME64NS:
+        # The spec previously treated kind=1 as NaNs AND NaTs, but has since
+        # been updated to exclude NaTs. This means datetime columns should
+        # never have nulls represented as kind=1, as NaNs are a floating-point
+        # concept. See https://github.com/data-apis/dataframe-api/issues/64
+        assert kind != 1
+    if kind in [0, 1]:  # non-nullable or NaN
         assert value is None
     elif kind in [3, 4]:  # bit or byte mask
         assert isinstance(value, int)
@@ -141,7 +154,7 @@ def test_describe_null(libinfo: LibraryInfo, data: st.DataObject):

 @given(data=st.data())
 def test_null_count(libinfo: LibraryInfo, data: st.DataObject):
-    col, mock_col = draw_column_and_mock(libinfo, data)
+    col, mock_col = data.draw(libinfo.columns_and_mock_columns(), label="col, mock_col")
     null_count = col.null_count
     if null_count is not None:
         assert isinstance(null_count, int)
@@ -151,14 +164,14 @@ def test_null_count(libinfo: LibraryInfo, data: st.DataObject):

 @given(data=st.data())
 def test_num_chunks(libinfo: LibraryInfo, data: st.DataObject):
-    col, _ = draw_column_and_mock(libinfo, data)
+    col = data.draw(libinfo.columns(), label="col")
     num_chunks = col.num_chunks()
     assert isinstance(num_chunks, int)


 @given(data=st.data())
 def test_get_chunks(libinfo: LibraryInfo, data: st.DataObject):
-    col, _ = draw_column_and_mock(libinfo, data)
+    col = data.draw(libinfo.columns(), label="col")
     num_chunks = col.num_chunks()
     n_chunks = data.draw(
         st.none() | st.integers(1, 2).map(lambda n: n * num_chunks),
@@ -173,7 +186,7 @@ def test_get_chunks(libinfo: LibraryInfo, data: st.DataObject):

 @given(data=st.data())
 def test_get_buffers(libinfo: LibraryInfo, data: st.DataObject):
-    col, _ = draw_column_and_mock(libinfo, data)
+    col = data.draw(libinfo.columns(), label="col")
     bufinfo = col.get_buffers()
     assert isinstance(bufinfo, dict)
     for key in ["data", "validity", "offsets"]:
diff --git a/tests/test_dataframe_object.py b/tests/test_dataframe_object.py
index 4c3ecb0..a18ccb1 100644
--- a/tests/test_dataframe_object.py
+++ b/tests/test_dataframe_object.py
@@ -27,7 +27,7 @@ def test_num_columns(libinfo: LibraryInfo, data: st.DataObject):
     df = libinfo.mock_to_interchange(mock_df)
     out = df.num_columns()
     assert isinstance(out, int)
-    assert out == mock_df.num_columns()
+    assert out == mock_df.ncols


 @given(data=st.data())
@@ -40,7 +40,7 @@ def test_num_rows(libinfo: LibraryInfo, data: st.DataObject):
     out = df.num_rows()
     assume(out is not None)
     assert isinstance(out, int)
-    assert out == mock_df.num_rows()
+    assert out == mock_df.nrows


 @given(data=st.data())
diff --git a/tests/test_from_dataframe.py b/tests/test_from_dataframe.py
index bd9766a..3dcbc9e 100644
--- a/tests/test_from_dataframe.py
+++ b/tests/test_from_dataframe.py
@@ -18,12 +18,12 @@ def test_from_dataframe_roundtrip(
     Round trip of dataframe interchange results in a dataframe identical to
     the original dataframe.
""" - exclude_dtypes = set(orig_libinfo.exclude_dtypes) | set(dest_libinfo.exclude_dtypes) + dtypes = set(orig_libinfo.supported_dtypes) & set(dest_libinfo.supported_dtypes) allow_zero_cols = orig_libinfo.allow_zero_cols and dest_libinfo.allow_zero_cols allow_zero_rows = orig_libinfo.allow_zero_rows and dest_libinfo.allow_zero_rows mock_df = data.draw( mock_dataframes( - exclude_dtypes=exclude_dtypes, + dtypes=dtypes, allow_zero_cols=allow_zero_cols, allow_zero_rows=allow_zero_rows, ), diff --git a/tests/test_meta.py b/tests/test_meta.py index 895c833..57bb8f0 100644 --- a/tests/test_meta.py +++ b/tests/test_meta.py @@ -6,8 +6,20 @@ from hypothesis import given from hypothesis import strategies as st -from .strategies import MockDataFrame, mock_dataframes -from .wrappers import LibraryInfo +from .strategies import MockDataFrame, mock_dataframes, utf8_strings +from .wrappers import LibraryInfo, libname_to_libinfo + + +def test_ci_has_correct_library_params(pytestconfig): + if not pytestconfig.getoption("--ci"): + pytest.skip("only intended for --ci runs") + assert set(libname_to_libinfo.keys()) == {"pandas", "vaex", "modin"} + + +@given(utf8_strings()) +def test_utf8_strings(string): + assert isinstance(string, str) + assert string[-1:] != "\0" @given(mock_dataframes()) @@ -16,7 +28,16 @@ def test_mock_dataframes(mock_df): @pytest.mark.parametrize( - "func_name", ["mock_dataframes", "toplevel_dataframes", "interchange_dataframes"] + "func_name", + [ + "mock_dataframes", + "toplevel_dataframes", + "interchange_dataframes", + "mock_single_col_dataframes", + "columns", + "columns_and_mock_columns", + "buffers", + ], ) @given(data=st.data()) def test_strategy(libinfo: LibraryInfo, func_name: str, data: st.DataObject): diff --git a/tests/test_signatures.py b/tests/test_signatures.py index 6a220e0..c165961 100644 --- a/tests/test_signatures.py +++ b/tests/test_signatures.py @@ -3,7 +3,7 @@ from typing import Callable # See https://github.com/python/mypy/issues/6864 import pytest -from hypothesis import assume, given, note, settings +from hypothesis import given, settings from hypothesis import strategies as st from .api import * @@ -108,10 +108,7 @@ def test_dataframe_method( @given(data=st.data()) @settings(max_examples=1) def test_column_method(libinfo: LibraryInfo, stub: FunctionType, data: st.DataObject): - df = data.draw(libinfo.interchange_dataframes(), label="df") - assume(df.num_columns() > 0) - col = df.get_column(0) - note(f"{col=}") + col = data.draw(libinfo.columns(), label="col") assert hasattr(col, stub.__name__) method = getattr(col, stub.__name__) assert isinstance(method, Callable) # type: ignore @@ -128,13 +125,7 @@ def test_column_method(libinfo: LibraryInfo, stub: FunctionType, data: st.DataOb @given(data=st.data()) @settings(max_examples=1) def test_buffer_method(libinfo: LibraryInfo, stub: FunctionType, data: st.DataObject): - df = data.draw(libinfo.interchange_dataframes(), label="df") - assume(df.num_columns() > 0) - col = df.get_column(0) - note(f"{col=}") - bufinfo = col.get_buffers() - buf, _ = bufinfo["data"] - note(f"{buf=}") + buf = data.draw(libinfo.buffers(), label="buf") assert hasattr(buf, stub.__name__) method = getattr(buf, stub.__name__) assert isinstance(method, Callable) # type: ignore diff --git a/tests/wrappers.py b/tests/wrappers.py index 7f0bb44..b5f81ee 100644 --- a/tests/wrappers.py +++ b/tests/wrappers.py @@ -1,13 +1,19 @@ import re from copy import copy -from typing import Any, Callable, Dict, List, NamedTuple, Tuple +from typing import Any, 
+from typing import Any, Callable, Dict, List, NamedTuple, Set, Tuple

 import numpy as np
 import pytest
 from hypothesis import strategies as st

-from .api import DataFrame
-from .strategies import MockDataFrame, NominalDtype, mock_dataframes
+from .api import Buffer, Column, DataFrame
+from .strategies import (
+    MockColumn,
+    MockDataFrame,
+    NominalDtype,
+    mock_dataframes,
+    mock_single_col_dataframes,
+)

 __all__ = ["libname_to_libinfo", "libinfo_params", "LibraryInfo"]

@@ -19,7 +25,7 @@ class LibraryInfo(NamedTuple):
     mock_to_toplevel: Callable[[MockDataFrame], TopLevelDataFrame]
     from_dataframe: Callable[[TopLevelDataFrame], DataFrame]
     frame_equal: Callable[[TopLevelDataFrame, DataFrame], bool]
-    exclude_dtypes: List[NominalDtype] = []
+    supported_dtypes: Set[NominalDtype] = set(NominalDtype)
     allow_zero_cols: bool = True
     allow_zero_rows: bool = True

@@ -30,7 +36,7 @@ def mock_to_interchange(self, mock_dataframe: MockDataFrame) -> DataFrame:
     @property
     def mock_dataframes_kwargs(self) -> Dict[str, Any]:
         return {
-            "exclude_dtypes": self.exclude_dtypes,
+            "dtypes": self.supported_dtypes,
             "allow_zero_cols": self.allow_zero_cols,
             "allow_zero_rows": self.allow_zero_rows,
         }
@@ -44,6 +50,31 @@ def toplevel_dataframes(self) -> st.SearchStrategy[TopLevelDataFrame]:
     def interchange_dataframes(self) -> st.SearchStrategy[TopLevelDataFrame]:
         return self.toplevel_dataframes().map(lambda df: df.__dataframe__())

+    def mock_single_col_dataframes(self) -> st.SearchStrategy[MockDataFrame]:
+        return mock_single_col_dataframes(
+            dtypes=self.supported_dtypes, allow_zero_rows=self.allow_zero_rows
+        )
+
+    def columns(self) -> st.SearchStrategy[Column]:
+        return (
+            self.mock_single_col_dataframes()
+            .map(self.mock_to_interchange)
+            .map(lambda df: df.get_column(0))
+        )
+
+    def columns_and_mock_columns(self) -> st.SearchStrategy[Tuple[Column, MockColumn]]:
+        mock_df_strat = st.shared(self.mock_single_col_dataframes())
+        col_strat = mock_df_strat.map(self.mock_to_interchange).map(
+            lambda df: df.get_column(0)
+        )
+        mock_col_strat = mock_df_strat.map(
+            lambda mock_df: next(col for col in mock_df.values())
+        )
+        return st.tuples(col_strat, mock_col_strat)
+
+    def buffers(self) -> st.SearchStrategy[Buffer]:
+        return self.columns().map(lambda col: col.get_buffers()["data"][0])
+
     def __repr__(self) -> str:
         return f"LibraryInfo(<{self.name}>)"

@@ -56,13 +87,13 @@ def __repr__(self) -> str:

 try:
     import pandas as pd
-    from pandas.api.exchange import from_dataframe as pandas_from_dataframe
+    from pandas.api.interchange import from_dataframe as pandas_from_dataframe
 except ImportError as e:
     libinfo_params.append(pytest.param("pandas", marks=pytest.mark.skip(reason=e.msg)))
 else:

     def pandas_mock_to_toplevel(mock_df: MockDataFrame) -> pd.DataFrame:
-        if mock_df.num_columns() == 0:
+        if mock_df.ncols == 0:
             return pd.DataFrame()
         serieses = []
         for name, (array, nominal_dtype) in mock_df.items():
@@ -80,7 +111,6 @@ def pandas_mock_to_toplevel(mock_df: MockDataFrame) -> pd.DataFrame:
         mock_to_toplevel=pandas_mock_to_toplevel,
         from_dataframe=pandas_from_dataframe,
         frame_equal=lambda df1, df2: df1.equals(df2),
-        exclude_dtypes=[NominalDtype.DATETIME64NS],
     )
     libinfo_params.append(pytest.param(pandas_libinfo, id=pandas_libinfo.name))

@@ -96,7 +126,7 @@ def pandas_mock_to_toplevel(mock_df: MockDataFrame) -> pd.DataFrame:
 else:

     def vaex_mock_to_toplevel(mock_df: MockDataFrame) -> TopLevelDataFrame:
-        if mock_df.num_columns() == 0 or mock_df.num_rows() == 0:
+        if mock_df.ncols == 0 or mock_df.nrows == 0:
             raise ValueError(f"{mock_df=} not supported by vaex")
         items: List[Tuple[str, np.ndarray]] = []
         for name, (array, _) in mock_df.items():
@@ -137,9 +167,7 @@ def vaex_frame_equal(df1, df2) -> bool:
         mock_to_toplevel=vaex_mock_to_toplevel,
         from_dataframe=vaex_from_dataframe,
         frame_equal=vaex_frame_equal,
-        exclude_dtypes=[
-            NominalDtype.DATETIME64NS,
-        ],
+        supported_dtypes=set(NominalDtype) ^ {NominalDtype.DATETIME64NS},
         # https://github.com/vaexio/vaex/issues/2094
         allow_zero_cols=False,
         allow_zero_rows=False,
@@ -154,15 +182,19 @@ def vaex_frame_equal(df1, df2) -> bool:
 try:
     import modin  # noqa: F401

-    # One issue modin has with pandas upstream is an outdated import of an
-    # exception class, so we try monkey-patching the class to the old path.
     try:
+        import pandas
         from pandas.core import base
         from pandas.errors import DataError
     except ImportError:
         pass
     else:
+        # One issue modin has with pandas upstream is an outdated import of an
+        # exception class, so we try monkey-patching the class to the old path.
         setattr(base, "DataError", DataError)
+        # modin also hard checks for supported pandas versions, so we
+        # monkey-patch a supported version.
+        setattr(pandas, "__version__", "1.4.3")

     import ray

@@ -180,10 +212,10 @@ def vaex_frame_equal(df1, df2) -> bool:
 else:

     def modin_mock_to_toplevel(mock_df: MockDataFrame) -> mpd.DataFrame:
-        if mock_df.num_columns() == 0:
+        if mock_df.ncols == 0:
             return mpd.DataFrame()
-        if mock_df.num_rows() == 0:
-            raise ValueError(f"{mock_df=} not supported by modin")
+        if mock_df.nrows == 0:
+            raise ValueError(f"{mock_df.nrows=} not supported by modin")
         serieses: List[mpd.Series] = []
         for name, (array, nominal_dtype) in mock_df.items():
             if nominal_dtype == NominalDtype.UTF8:
@@ -218,13 +250,12 @@ def modin_frame_equal(df1: mpd.DataFrame, df2: mpd.DataFrame) -> bool:
         mock_to_toplevel=modin_mock_to_toplevel,
         from_dataframe=modin_from_dataframe,
         frame_equal=modin_frame_equal,
-        # https://github.com/modin-project/modin/issues/4654
-        # https://github.com/modin-project/modin/issues/4652
-        exclude_dtypes=[
-            NominalDtype.UTF8,
+        supported_dtypes=set(NominalDtype)
+        ^ {
             NominalDtype.DATETIME64NS,
-            NominalDtype.CATEGORY,
-        ],
+            # https://github.com/modin-project/modin/issues/4654
+            NominalDtype.UTF8,
+        },
         # https://github.com/modin-project/modin/issues/4643
         allow_zero_rows=False,
     )
@@ -271,7 +302,7 @@ def register_extension_type(*a, **kw):
 else:

     def cudf_mock_to_toplevel(mock_df: MockDataFrame) -> cudf.DataFrame:
-        if mock_df.num_columns() == 0:
+        if mock_df.ncols == 0:
             return cudf.DataFrame()
         serieses = []
         for name, (array, nominal_dtype) in mock_df.items():
@@ -294,11 +325,12 @@ def cudf_mock_to_toplevel(mock_df: MockDataFrame) -> cudf.DataFrame:
         mock_to_toplevel=cudf_mock_to_toplevel,
         from_dataframe=cudf_from_dataframe,
         frame_equal=lambda df1, df2: df1.equals(df2),  # NaNs considered equal
-        exclude_dtypes=[
+        supported_dtypes=set(NominalDtype)
+        ^ {
             NominalDtype.DATETIME64NS,
             # https://github.com/rapidsai/cudf/issues/11308
             NominalDtype.UTF8,
-        },
+        },
     )
     libinfo_params.append(pytest.param(cudf_libinfo, id=cudf_libinfo.name))
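
The new LibraryInfo strategy helpers compose as in the sketch below (not part of the diff). It assumes pandas is installed, so wrappers.py registers it in `libname_to_libinfo`, and that the installed pandas implements `Column.size()` as a method per the updated spec; `test_sketch` is a hypothetical test, not suite code.

# Illustrative usage of the new strategy helpers in tests/wrappers.py.
from hypothesis import given
from hypothesis import strategies as st

from tests.wrappers import libname_to_libinfo

libinfo = libname_to_libinfo["pandas"]  # assumes the pandas import succeeded

@given(data=st.data())
def test_sketch(data: st.DataObject):
    # columns_and_mock_columns() maps both members of the tuple off one
    # st.shared(...) strategy, so they always describe the same drawn data
    col, mock_col = data.draw(libname_to_libinfo["pandas"].columns_and_mock_columns())
    assert col.size() == mock_col.array.size
    # buffers() goes one step further down: mock df -> column -> data buffer
    buf = data.draw(libinfo.buffers())
    assert isinstance(buf.bufsize, int)

The `st.shared` wrapper is what guarantees the interchange column and the mock column observe the same underlying single-column mock dataframe within one example, which is why the old `draw_column_and_mock` helpers could be deleted from the test modules.
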