Skip to content

Commit 10ab87f

Browse files
committed
Slightly faster variant (one fewer bytes object constructed per read)
1 parent 435a003 commit 10ab87f

File tree

3 files changed

+39
-33
lines changed

3 files changed

+39
-33
lines changed

pandas/io/sas/_sas.pyi

+5-5
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ class Parser:
44
def __init__(self, parser: SAS7BDATReader) -> None: ...
55
def read(self, nrows: int) -> None: ...
66

# Signatures removed by this commit: each reader took a ``bytes`` object that
# the caller had already sliced out of the page (see the old call sites, which
# passed ``self._read_bytes(offset, width)``).
7-
def read_float_with_byteswap(data: bytes, byteswap: bool) -> float: ...
8-
def read_double_with_byteswap(data: bytes, byteswap: bool) -> float: ...
9-
def read_uint16_with_byteswap(data: bytes, byteswap: bool) -> int: ...
10-
def read_uint32_with_byteswap(data: bytes, byteswap: bool) -> int: ...
11-
def read_uint64_with_byteswap(data: bytes, byteswap: bool) -> int: ...
# Signatures added by this commit: each reader takes the whole buffer plus the
# ``offset`` at which the value starts, so the caller no longer has to build a
# temporary ``bytes`` slice for every value it decodes.
7+
def read_float_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ...
8+
def read_double_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ...
9+
def read_uint16_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ...
10+
def read_uint32_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ...
11+
def read_uint64_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ...

pandas/io/sas/sas.pyx

+25-10
Original file line numberDiff line numberDiff line change
@@ -441,36 +441,51 @@ cdef class Parser:
441441

442442
# The following are faster versions of struct.unpack that avoid the overhead of
443443
# Python function calls. They may be called up to (n_rows * n_cols) times.
def read_float_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
    """
    Decode the 4-byte float stored in ``data`` at byte position ``offset``.

    Parameters
    ----------
    data : bytes
        Buffer to read from.
    offset : Py_ssize_t
        Index within ``data`` of the first byte of the value.
    byteswap : bint
        If True, the value is passed through ``_byteswap_float`` before
        being returned.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If the 4 bytes starting at ``offset`` do not lie entirely inside
        ``data``.
    """
    cdef:
        const char *data_ptr
        float res

    # This check guards a raw pointer read, so it is an explicit ``raise``
    # rather than an ``assert`` that can be disabled.  ``>`` (not ``>=``) keeps
    # a value that occupies the final bytes of the buffer readable, and the
    # ``offset < 0`` test stops reads from before the start of the buffer.
    if offset < 0 or offset + 4 > len(data):
        raise ValueError(
            f"Cannot read 4 bytes at offset {offset} from a buffer of "
            f"{len(data)} bytes."
        )
    data_ptr = data
    res = (<float*>(data_ptr + offset))[0]
    if byteswap:
        res = _byteswap_float(res)
    return res
449452

450453

def read_double_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
    """
    Decode the 8-byte double stored in ``data`` at byte position ``offset``.

    Parameters
    ----------
    data : bytes
        Buffer to read from.
    offset : Py_ssize_t
        Index within ``data`` of the first byte of the value.
    byteswap : bint
        If True, the value is passed through ``_byteswap_double`` before
        being returned.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If the 8 bytes starting at ``offset`` do not lie entirely inside
        ``data``.
    """
    cdef:
        const char *data_ptr
        double res

    # This check guards a raw pointer read, so it is an explicit ``raise``
    # rather than an ``assert`` that can be disabled.  ``>`` (not ``>=``) keeps
    # a value that occupies the final bytes of the buffer readable, and the
    # ``offset < 0`` test stops reads from before the start of the buffer.
    if offset < 0 or offset + 8 > len(data):
        raise ValueError(
            f"Cannot read 8 bytes at offset {offset} from a buffer of "
            f"{len(data)} bytes."
        )
    data_ptr = data
    res = (<double*>(data_ptr + offset))[0]
    if byteswap:
        res = _byteswap_double(res)
    return res
456462

457463

def read_uint16_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
    """
    Decode the unsigned 16-bit integer stored in ``data`` at ``offset``.

    Parameters
    ----------
    data : bytes
        Buffer to read from.
    offset : Py_ssize_t
        Index within ``data`` of the first byte of the value.
    byteswap : bint
        If True, the value is passed through ``_byteswap2`` before being
        returned.

    Returns
    -------
    int

    Raises
    ------
    ValueError
        If the 2 bytes starting at ``offset`` do not lie entirely inside
        ``data``.
    """
    cdef:
        const char *data_ptr
        uint16_t res

    # This check guards a raw pointer read, so it is an explicit ``raise``
    # rather than an ``assert`` that can be disabled.  ``>`` (not ``>=``) keeps
    # a value that occupies the final bytes of the buffer readable, and the
    # ``offset < 0`` test stops reads from before the start of the buffer.
    if offset < 0 or offset + 2 > len(data):
        raise ValueError(
            f"Cannot read 2 bytes at offset {offset} from a buffer of "
            f"{len(data)} bytes."
        )
    data_ptr = data
    res = (<uint16_t *>(data_ptr + offset))[0]
    if byteswap:
        res = _byteswap2(res)
    return res
463472

464473

def read_uint32_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
    """
    Decode the unsigned 32-bit integer stored in ``data`` at ``offset``.

    Parameters
    ----------
    data : bytes
        Buffer to read from.
    offset : Py_ssize_t
        Index within ``data`` of the first byte of the value.
    byteswap : bint
        If True, the value is passed through ``_byteswap4`` before being
        returned.

    Returns
    -------
    int

    Raises
    ------
    ValueError
        If the 4 bytes starting at ``offset`` do not lie entirely inside
        ``data``.
    """
    cdef:
        const char *data_ptr
        uint32_t res

    # This check guards a raw pointer read, so it is an explicit ``raise``
    # rather than an ``assert`` that can be disabled.  ``>`` (not ``>=``) keeps
    # a value that occupies the final bytes of the buffer readable, and the
    # ``offset < 0`` test stops reads from before the start of the buffer.
    if offset < 0 or offset + 4 > len(data):
        raise ValueError(
            f"Cannot read 4 bytes at offset {offset} from a buffer of "
            f"{len(data)} bytes."
        )
    data_ptr = data
    res = (<uint32_t *>(data_ptr + offset))[0]
    if byteswap:
        res = _byteswap4(res)
    return res
470482

471483

def read_uint64_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
    """
    Decode the unsigned 64-bit integer stored in ``data`` at ``offset``.

    Parameters
    ----------
    data : bytes
        Buffer to read from.
    offset : Py_ssize_t
        Index within ``data`` of the first byte of the value.
    byteswap : bint
        If True, the value is passed through ``_byteswap8`` before being
        returned.

    Returns
    -------
    int

    Raises
    ------
    ValueError
        If the 8 bytes starting at ``offset`` do not lie entirely inside
        ``data``.
    """
    cdef:
        const char *data_ptr
        uint64_t res

    # This check guards a raw pointer read, so it is an explicit ``raise``
    # rather than an ``assert`` that can be disabled.  ``>`` (not ``>=``) keeps
    # a value that occupies the final bytes of the buffer readable, and the
    # ``offset < 0`` test stops reads from before the start of the buffer.
    if offset < 0 or offset + 8 > len(data):
        raise ValueError(
            f"Cannot read 8 bytes at offset {offset} from a buffer of "
            f"{len(data)} bytes."
        )
    data_ptr = data
    res = (<uint64_t *>(data_ptr + offset))[0]
    if byteswap:
        res = _byteswap8(res)
    return res

pandas/io/sas/sas7bdat.py

+9-18
Original file line numberDiff line numberDiff line change
@@ -356,11 +356,11 @@ def __next__(self) -> DataFrame:
356356
def _read_float(self, offset: int, width: int):
357357
if width == 4:
358358
return read_float_with_byteswap(
359-
self._read_bytes(offset, 4), self.need_byteswap
359+
self._cached_page, offset, self.need_byteswap
360360
)
361361
elif width == 8:
362362
return read_double_with_byteswap(
363-
self._read_bytes(offset, 8), self.need_byteswap
363+
self._cached_page, offset, self.need_byteswap
364364
)
365365
else:
366366
self.close()
@@ -372,34 +372,25 @@ def _read_int(self, offset: int, width: int) -> int:
372372
return self._read_bytes(offset, 1)[0]
373373
elif width == 2:
374374
return read_uint16_with_byteswap(
375-
self._read_bytes(offset, 2), self.need_byteswap
375+
self._cached_page, offset, self.need_byteswap
376376
)
377377
elif width == 4:
378378
return read_uint32_with_byteswap(
379-
self._read_bytes(offset, 4), self.need_byteswap
379+
self._cached_page, offset, self.need_byteswap
380380
)
381381
elif width == 8:
382382
return read_uint64_with_byteswap(
383-
self._read_bytes(offset, 8), self.need_byteswap
383+
self._cached_page, offset, self.need_byteswap
384384
)
385385
else:
386386
self.close()
387387
raise ValueError("invalid int width")
388388

def _read_bytes(self, offset: int, length: int):
    """
    Return ``length`` bytes of the cached page, starting at ``offset``.

    Parameters
    ----------
    offset : int
        Index of the first byte to return within ``self._cached_page``.
    length : int
        Number of bytes to return.

    Returns
    -------
    The slice ``self._cached_page[offset : offset + length]``.

    Raises
    ------
    ValueError
        If ``offset`` or ``length`` is negative, or if the requested range
        extends past the end of the cached page. ``self.close()`` is called
        before the exception is raised.
    """
    page = self._cached_page
    end = offset + length
    # Negative values would not fail on their own: slicing wraps around and
    # would silently hand back the wrong (or too few) bytes.
    if offset < 0 or length < 0:
        self.close()
        raise ValueError(
            f"Invalid read of {length:d} bytes at page offset {offset:d}."
        )
    if end > len(page):
        self.close()
        raise ValueError("The cached page is too small.")
    return page[offset:end]
403394

404395
def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes:
405396
return self._convert_header_text(

0 commit comments

Comments
 (0)