Skip to content

SAS7BDAT parser: Fast byteswap #47403

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Oct 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
5b9cd4b
Fast byteswap
jonashaag Jun 17, 2022
17c965f
Add types
jonashaag Jun 17, 2022
51499fb
Merge branch 'main' into sas/byteswap
jonashaag Jul 9, 2022
435a003
Review feedback
jonashaag Jul 9, 2022
10ab87f
Slightly faster variant (one fewer bytes object construction)
jonashaag Jul 9, 2022
ad74f5c
Make MyPy happy?
jonashaag Jul 10, 2022
9c5b4b3
Update sas7bdat.py
jonashaag Jul 11, 2022
21c364c
Merge branch 'main' into sas/byteswap
jonashaag Jul 11, 2022
148fa75
Merge branch 'main' into sas/byteswap
jonashaag Jul 15, 2022
f3c63f0
Use intrinsics
jonashaag Jul 21, 2022
78de495
Merge branch 'main' into sas/byteswap
jonashaag Aug 8, 2022
4ef928e
Merge branch 'main' into sas/byteswap
jonashaag Sep 10, 2022
c310c0d
Lint
jonashaag Sep 10, 2022
3b7ba83
Add tests + move byteswap to module
jonashaag Sep 10, 2022
53fbce2
Add float tests + refactoring
jonashaag Sep 10, 2022
9cbc5be
Undo unrelated changes
jonashaag Sep 10, 2022
4802848
Undo unrelated changes
jonashaag Sep 10, 2022
41abe02
Lint
jonashaag Sep 11, 2022
2abd8e0
Merge branch 'main' into sas/byteswap
jonashaag Sep 15, 2022
bf0976a
Update v1.6.0.rst
jonashaag Sep 15, 2022
c725d49
Merge branch 'main' into sas/byteswap
jonashaag Sep 30, 2022
c7c1a2f
read_int -> read_uint
jonashaag Oct 4, 2022
6a4a556
Lint
jonashaag Oct 4, 2022
9f5ba3f
Merge branch 'main' into sas/byteswap
jonashaag Oct 4, 2022
a439434
Update sas7bdat.py
jonashaag Oct 4, 2022
55bd863
Merge branch 'main' into sas/byteswap
jonashaag Oct 4, 2022
bdf8203
Merge branch 'main' into sas/byteswap
jonashaag Oct 5, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.6.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
- Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
- Performance improvement in ``var`` for nullable dtypes (:issue:`48379`).
- Performance improvement to :func:`read_sas` with ``blank_missing=True`` (:issue:`48502`)
- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
- Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)

.. ---------------------------------------------------------------------------
Expand Down
5 changes: 5 additions & 0 deletions pandas/io/sas/_byteswap.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Type stubs for the Cython extension module pandas/io/sas/byteswap.pyx.
# Each reader decodes one fixed-width value from `data` at `offset`,
# byte-swapping it first when `byteswap` is True.
def read_float_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ...
def read_double_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ...
def read_uint16_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ...
def read_uint32_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ...
def read_uint64_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ...
92 changes: 92 additions & 0 deletions pandas/io/sas/byteswap.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""
The following are faster versions of struct.unpack that avoid the overhead of Python function calls.

In the SAS7BDAT parser, they may be called up to (n_rows * n_cols) times.
"""
from cython cimport Py_ssize_t
from libc.stdint cimport (
uint16_t,
uint32_t,
uint64_t,
)


def read_float_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
    """Read a 4-byte float from ``data`` at ``offset``, swapping bytes if requested."""
    # The read spans bytes [offset, offset + 4); it fits iff offset + 4 <= len(data).
    # The previous `<` comparison wrongly rejected a read ending at the last byte.
    assert offset + 4 <= len(data)
    cdef:
        const char *data_ptr = data
        float res = (<float*>(data_ptr + offset))[0]
    if byteswap:
        res = _byteswap_float(res)
    return res


def read_double_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
    """Read an 8-byte double from ``data`` at ``offset``, swapping bytes if requested."""
    # The read spans bytes [offset, offset + 8); it fits iff offset + 8 <= len(data).
    # The previous `<` comparison wrongly rejected a read ending at the last byte.
    assert offset + 8 <= len(data)
    cdef:
        const char *data_ptr = data
        double res = (<double*>(data_ptr + offset))[0]
    if byteswap:
        res = _byteswap_double(res)
    return res


def read_uint16_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
    """Read a 2-byte unsigned int from ``data`` at ``offset``, swapping bytes if requested."""
    # The read spans bytes [offset, offset + 2); it fits iff offset + 2 <= len(data).
    # The previous `<` comparison wrongly rejected a read ending at the last byte.
    assert offset + 2 <= len(data)
    cdef:
        const char *data_ptr = data
        uint16_t res = (<uint16_t *>(data_ptr + offset))[0]
    if byteswap:
        res = _byteswap2(res)
    return res


def read_uint32_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
    """Read a 4-byte unsigned int from ``data`` at ``offset``, swapping bytes if requested."""
    # The read spans bytes [offset, offset + 4); it fits iff offset + 4 <= len(data).
    # The previous `<` comparison wrongly rejected a read ending at the last byte.
    assert offset + 4 <= len(data)
    cdef:
        const char *data_ptr = data
        uint32_t res = (<uint32_t *>(data_ptr + offset))[0]
    if byteswap:
        res = _byteswap4(res)
    return res


def read_uint64_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
    """Read an 8-byte unsigned int from ``data`` at ``offset``, swapping bytes if requested."""
    # The read spans bytes [offset, offset + 8); it fits iff offset + 8 <= len(data).
    # The previous `<` comparison wrongly rejected a read ending at the last byte.
    assert offset + 8 <= len(data)
    cdef:
        const char *data_ptr = data
        uint64_t res = (<uint64_t *>(data_ptr + offset))[0]
    if byteswap:
        res = _byteswap8(res)
    return res


# Byteswapping

# Map the integer byteswap helpers onto compiler intrinsics so the swap
# compiles to a single instruction: MSVC's _byteswap_* on Windows,
# GCC/Clang's __builtin_bswap* everywhere else.
cdef extern from *:
    """
    #ifdef _MSC_VER
    #define _byteswap2 _byteswap_ushort
    #define _byteswap4 _byteswap_ulong
    #define _byteswap8 _byteswap_uint64
    #else
    #define _byteswap2 __builtin_bswap16
    #define _byteswap4 __builtin_bswap32
    #define _byteswap8 __builtin_bswap64
    #endif
    """
    uint16_t _byteswap2(uint16_t)
    uint32_t _byteswap4(uint32_t)
    uint64_t _byteswap8(uint64_t)


cdef inline float _byteswap_float(float num):
    # Reverse the byte order of a float: reinterpret its storage as a
    # uint32, swap in place via the intrinsic, and return the local copy.
    cdef uint32_t *intptr = <uint32_t *>&num
    intptr[0] = _byteswap4(intptr[0])
    return num


cdef inline double _byteswap_double(double num):
    # Reverse the byte order of a double: reinterpret its storage as a
    # uint64, swap in place via the intrinsic, and return the local copy.
    cdef uint64_t *intptr = <uint64_t *>&num
    intptr[0] = _byteswap8(intptr[0])
    return num
130 changes: 75 additions & 55 deletions pandas/io/sas/sas7bdat.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
datetime,
timedelta,
)
import struct
import sys
from typing import cast

import numpy as np
Expand All @@ -42,6 +42,13 @@
)

from pandas.io.common import get_handle
from pandas.io.sas._byteswap import (
read_double_with_byteswap,
read_float_with_byteswap,
read_uint16_with_byteswap,
read_uint32_with_byteswap,
read_uint64_with_byteswap,
)
from pandas.io.sas._sas import (
Parser,
get_subheader_index,
Expand Down Expand Up @@ -263,8 +270,10 @@ def _get_properties(self) -> None:
buf = self._read_bytes(const.endianness_offset, const.endianness_length)
if buf == b"\x01":
self.byte_order = "<"
self.need_byteswap = sys.byteorder == "big"
else:
self.byte_order = ">"
self.need_byteswap = sys.byteorder == "little"

# Get encoding information
buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
Expand All @@ -286,7 +295,7 @@ def _get_properties(self) -> None:
)
self.date_modified = epoch + pd.to_timedelta(x, unit="s")

self.header_length = self._read_int(
self.header_length = self._read_uint(
const.header_size_offset + align1, const.header_size_length
)

Expand All @@ -298,7 +307,7 @@ def _get_properties(self) -> None:
if len(self._cached_page) != self.header_length: # type: ignore[arg-type]
raise ValueError("The SAS7BDAT file appears to be truncated.")

self._page_length = self._read_int(
self._page_length = self._read_uint(
const.page_size_offset + align1, const.page_size_length
)

Expand All @@ -311,37 +320,46 @@ def __next__(self) -> DataFrame:

# Read a single float of the given width (4 or 8).
def _read_float(self, offset: int, width: int):
if width not in (4, 8):
assert self._cached_page is not None
if width == 4:
return read_float_with_byteswap(
self._cached_page, offset, self.need_byteswap
)
elif width == 8:
return read_double_with_byteswap(
self._cached_page, offset, self.need_byteswap
)
else:
self.close()
raise ValueError("invalid float width")
buf = self._read_bytes(offset, width)
fd = "f" if width == 4 else "d"
return struct.unpack(self.byte_order + fd, buf)[0]

# Read a single signed integer of the given width (1, 2, 4 or 8).
def _read_int(self, offset: int, width: int) -> int:
if width not in (1, 2, 4, 8):
# Read a single unsigned integer of the given width (1, 2, 4 or 8).
def _read_uint(self, offset: int, width: int) -> int:
assert self._cached_page is not None
if width == 1:
return self._read_bytes(offset, 1)[0]
elif width == 2:
return read_uint16_with_byteswap(
self._cached_page, offset, self.need_byteswap
)
elif width == 4:
return read_uint32_with_byteswap(
self._cached_page, offset, self.need_byteswap
)
elif width == 8:
return read_uint64_with_byteswap(
self._cached_page, offset, self.need_byteswap
)
else:
self.close()
raise ValueError("invalid int width")
buf = self._read_bytes(offset, width)
it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
iv = struct.unpack(self.byte_order + it, buf)[0]
return iv

def _read_bytes(self, offset: int, length: int):
if self._cached_page is None:
self._path_or_buf.seek(offset)
buf = self._path_or_buf.read(length)
if len(buf) < length:
self.close()
msg = f"Unable to read {length:d} bytes from file position {offset:d}."
raise ValueError(msg)
return buf
else:
if offset + length > len(self._cached_page):
self.close()
raise ValueError("The cached page is too small.")
return self._cached_page[offset : offset + length]
assert self._cached_page is not None
if offset + length > len(self._cached_page):
self.close()
raise ValueError("The cached page is too small.")
return self._cached_page[offset : offset + length]

def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes:
return self._convert_header_text(
Expand Down Expand Up @@ -375,12 +393,12 @@ def _read_page_header(self) -> None:
bit_offset = self._page_bit_offset
tx = const.page_type_offset + bit_offset
self._current_page_type = (
self._read_int(tx, const.page_type_length) & const.page_type_mask2
self._read_uint(tx, const.page_type_length) & const.page_type_mask2
)
tx = const.block_count_offset + bit_offset
self._current_page_block_count = self._read_int(tx, const.block_count_length)
self._current_page_block_count = self._read_uint(tx, const.block_count_length)
tx = const.subheader_count_offset + bit_offset
self._current_page_subheaders_count = self._read_int(
self._current_page_subheaders_count = self._read_uint(
tx, const.subheader_count_length
)

Expand All @@ -391,16 +409,16 @@ def _process_page_metadata(self) -> None:
offset = const.subheader_pointers_offset + bit_offset
total_offset = offset + self._subheader_pointer_length * i

subheader_offset = self._read_int(total_offset, self._int_length)
subheader_offset = self._read_uint(total_offset, self._int_length)
total_offset += self._int_length

subheader_length = self._read_int(total_offset, self._int_length)
subheader_length = self._read_uint(total_offset, self._int_length)
total_offset += self._int_length

subheader_compression = self._read_int(total_offset, 1)
subheader_compression = self._read_uint(total_offset, 1)
total_offset += 1

subheader_type = self._read_int(total_offset, 1)
subheader_type = self._read_uint(total_offset, 1)

if (
subheader_length == 0
Expand Down Expand Up @@ -442,29 +460,29 @@ def _process_rowsize_subheader(self, offset: int, length: int) -> None:
lcs_offset += 354
lcp_offset += 378

self.row_length = self._read_int(
self.row_length = self._read_uint(
offset + const.row_length_offset_multiplier * int_len,
int_len,
)
self.row_count = self._read_int(
self.row_count = self._read_uint(
offset + const.row_count_offset_multiplier * int_len,
int_len,
)
self.col_count_p1 = self._read_int(
self.col_count_p1 = self._read_uint(
offset + const.col_count_p1_multiplier * int_len, int_len
)
self.col_count_p2 = self._read_int(
self.col_count_p2 = self._read_uint(
offset + const.col_count_p2_multiplier * int_len, int_len
)
mx = const.row_count_on_mix_page_offset_multiplier * int_len
self._mix_page_row_count = self._read_int(offset + mx, int_len)
self._lcs = self._read_int(lcs_offset, 2)
self._lcp = self._read_int(lcp_offset, 2)
self._mix_page_row_count = self._read_uint(offset + mx, int_len)
self._lcs = self._read_uint(lcs_offset, 2)
self._lcp = self._read_uint(lcp_offset, 2)

def _process_columnsize_subheader(self, offset: int, length: int) -> None:
int_len = self._int_length
offset += int_len
self.column_count = self._read_int(offset, int_len)
self.column_count = self._read_uint(offset, int_len)
if self.col_count_p1 + self.col_count_p2 != self.column_count:
print(
f"Warning: column count mismatch ({self.col_count_p1} + "
Expand All @@ -478,7 +496,7 @@ def _process_subheader_counts(self, offset: int, length: int) -> None:
def _process_columntext_subheader(self, offset: int, length: int) -> None:

offset += self._int_length
text_block_size = self._read_int(offset, const.text_block_size_length)
text_block_size = self._read_uint(offset, const.text_block_size_length)

buf = self._read_bytes(offset, text_block_size)
cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
Expand Down Expand Up @@ -542,13 +560,13 @@ def _process_columnname_subheader(self, offset: int, length: int) -> None:
+ const.column_name_length_offset
)

idx = self._read_int(
idx = self._read_uint(
text_subheader, const.column_name_text_subheader_length
)
col_offset = self._read_int(
col_offset = self._read_uint(
col_name_offset, const.column_name_offset_length
)
col_len = self._read_int(col_name_length, const.column_name_length_length)
col_len = self._read_uint(col_name_length, const.column_name_length_length)

name_raw = self.column_names_raw[idx]
cname = name_raw[col_offset : col_offset + col_len]
Expand All @@ -571,13 +589,13 @@ def _process_columnattributes_subheader(self, offset: int, length: int) -> None:
offset + 2 * int_len + const.column_type_offset + i * (int_len + 8)
)

x = self._read_int(col_data_offset, int_len)
x = self._read_uint(col_data_offset, int_len)
self._column_data_offsets.append(x)

x = self._read_int(col_data_len, const.column_data_length_length)
x = self._read_uint(col_data_len, const.column_data_length_length)
self._column_data_lengths.append(x)

x = self._read_int(col_types, const.column_type_length)
x = self._read_uint(col_types, const.column_type_length)
self._column_types.append(b"d" if x == 1 else b"s")

def _process_columnlist_subheader(self, offset: int, length: int) -> None:
Expand All @@ -597,23 +615,25 @@ def _process_format_subheader(self, offset: int, length: int) -> None:
col_label_offset = offset + const.column_label_offset_offset + 3 * int_len
col_label_len = offset + const.column_label_length_offset + 3 * int_len

x = self._read_int(
x = self._read_uint(
text_subheader_format, const.column_format_text_subheader_index_length
)
format_idx = min(x, len(self.column_names_raw) - 1)

format_start = self._read_int(
format_start = self._read_uint(
col_format_offset, const.column_format_offset_length
)
format_len = self._read_int(col_format_len, const.column_format_length_length)
format_len = self._read_uint(col_format_len, const.column_format_length_length)

label_idx = self._read_int(
label_idx = self._read_uint(
text_subheader_label, const.column_label_text_subheader_index_length
)
label_idx = min(label_idx, len(self.column_names_raw) - 1)

label_start = self._read_int(col_label_offset, const.column_label_offset_length)
label_len = self._read_int(col_label_len, const.column_label_length_length)
label_start = self._read_uint(
col_label_offset, const.column_label_offset_length
)
label_len = self._read_uint(col_label_len, const.column_label_length_length)

label_names = self.column_names_raw[label_idx]
column_label = self._convert_header_text(
Expand Down
Loading