Skip to content

Commit 5b9cd4b

Browse files
committed
Fast byteswap
1 parent bd9a6f0 commit 5b9cd4b

File tree

2 files changed

+114
-14
lines changed

2 files changed

+114
-14
lines changed

pandas/io/sas/sas.pyx

+79-3
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,19 @@
11
# cython: profile=False
22
# cython: boundscheck=False, initializedcheck=False
33
from cython cimport Py_ssize_t
4+
from libc.stdint cimport (
5+
int64_t,
6+
uint8_t,
7+
uint16_t,
8+
uint32_t,
9+
uint64_t,
10+
)
11+
from libc.string cimport memcpy
12+
413
import numpy as np
514

615
import pandas.io.sas.sas_constants as const
716

8-
ctypedef signed long long int64_t
9-
ctypedef unsigned char uint8_t
10-
ctypedef unsigned short uint16_t
1117

1218
# rle_decompress decompresses data using a Run Length Encoding
1319
# algorithm. It is partially documented here:
@@ -433,3 +439,73 @@ cdef class Parser:
433439
self.current_row_on_page_index += 1
434440
self.current_row_in_chunk_index += 1
435441
self.current_row_in_file_index += 1
442+
443+
444+
def read_float_with_byteswap(const uint8_t *data, bint byteswap):
    """
    Decode the first four bytes at ``data`` as a C ``float``.

    Parameters
    ----------
    data : const uint8_t *
        Buffer to read from. The caller must supply at least four readable
        bytes; the length cannot be verified from a bare pointer.
    byteswap : bint
        If true, reverse the byte order of the loaded value.

    Returns
    -------
    float
        The decoded value.
    """
    cdef float res
    # Copy the bytes instead of dereferencing a cast pointer: ``data`` has no
    # alignment guarantee for ``float`` and the cast violates C aliasing rules.
    memcpy(&res, data, sizeof(float))
    if byteswap:
        res = _byteswap_float(res)
    return res
449+
450+
451+
def read_double_with_byteswap(const uint8_t *data, bint byteswap):
    """
    Decode the first eight bytes at ``data`` as a C ``double``.

    Parameters
    ----------
    data : const uint8_t *
        Buffer to read from. The caller must supply at least eight readable
        bytes; the length cannot be verified from a bare pointer.
    byteswap : bint
        If true, reverse the byte order of the loaded value.

    Returns
    -------
    float
        The decoded value.
    """
    cdef double res
    # Copy the bytes instead of dereferencing a cast pointer: ``data`` has no
    # alignment guarantee for ``double`` and the cast violates C aliasing rules.
    memcpy(&res, data, sizeof(double))
    if byteswap:
        res = _byteswap_double(res)
    return res
456+
457+
458+
def read_uint16_with_byteswap(const uint8_t *data, bint byteswap):
    """
    Decode the first two bytes at ``data`` as an unsigned 16-bit integer.

    Parameters
    ----------
    data : const uint8_t *
        Buffer to read from. The caller must supply at least two readable
        bytes; the length cannot be verified from a bare pointer.
    byteswap : bint
        If true, reverse the byte order of the loaded value.

    Returns
    -------
    int
        The decoded unsigned value.
    """
    cdef uint16_t res
    # Copy the bytes instead of dereferencing a cast pointer: ``data`` has no
    # alignment guarantee for ``uint16_t``.
    memcpy(&res, data, sizeof(uint16_t))
    if byteswap:
        res = _byteswap2(res)
    return res
463+
464+
465+
def read_uint32_with_byteswap(const uint8_t *data, bint byteswap):
    """
    Decode the first four bytes at ``data`` as an unsigned 32-bit integer.

    Parameters
    ----------
    data : const uint8_t *
        Buffer to read from. The caller must supply at least four readable
        bytes; the length cannot be verified from a bare pointer.
    byteswap : bint
        If true, reverse the byte order of the loaded value.

    Returns
    -------
    int
        The decoded unsigned value.
    """
    cdef uint32_t res
    # Copy the bytes instead of dereferencing a cast pointer: ``data`` has no
    # alignment guarantee for ``uint32_t``.
    memcpy(&res, data, sizeof(uint32_t))
    if byteswap:
        res = _byteswap4(res)
    return res
470+
471+
472+
def read_uint64_with_byteswap(const uint8_t *data, bint byteswap):
    """
    Decode the first eight bytes at ``data`` as an unsigned 64-bit integer.

    Parameters
    ----------
    data : const uint8_t *
        Buffer to read from. The caller must supply at least eight readable
        bytes; the length cannot be verified from a bare pointer.
    byteswap : bint
        If true, reverse the byte order of the loaded value.

    Returns
    -------
    int
        The decoded unsigned value.
    """
    cdef uint64_t res
    # Copy the bytes instead of dereferencing a cast pointer: ``data`` has no
    # alignment guarantee for ``uint64_t``.
    memcpy(&res, data, sizeof(uint64_t))
    if byteswap:
        res = _byteswap8(res)
    return res
477+
478+
479+
# Byteswapping
480+
# From https://github.com/WizardMac/ReadStat/blob/master/src/readstat_bits.c
481+
# Copyright (c) 2013-2016 Evan Miller, Apache 2 License
482+
483+
cdef inline uint16_t _byteswap2(uint16_t num):
    # Exchange the two bytes of a 16-bit value.
    cdef uint16_t from_high = (num & 0xFF00) >> 8
    cdef uint16_t from_low = (num & 0x00FF) << 8
    return from_high | from_low
485+
486+
487+
cdef inline uint32_t _byteswap4(uint32_t num):
    # Reverse the four bytes of a 32-bit value in two passes:
    # first exchange the 16-bit halves, then the bytes inside each half.
    cdef uint32_t halves_swapped = (
        ((num & <uint32_t>0xFFFF0000) >> 16)
        | ((num & <uint32_t>0x0000FFFF) << 16)
    )
    return (
        ((halves_swapped & <uint32_t>0xFF00FF00) >> 8)
        | ((halves_swapped & <uint32_t>0x00FF00FF) << 8)
    )
490+
491+
492+
cdef inline uint64_t _byteswap8(uint64_t num):
    # Reverse the eight bytes of a 64-bit value in three passes:
    # exchange 32-bit halves, then 16-bit pairs, then adjacent bytes.
    cdef uint64_t words_swapped = (
        ((num & <uint64_t>0xFFFFFFFF00000000) >> 32)
        | ((num & <uint64_t>0x00000000FFFFFFFF) << 32)
    )
    cdef uint64_t pairs_swapped = (
        ((words_swapped & <uint64_t>0xFFFF0000FFFF0000) >> 16)
        | ((words_swapped & <uint64_t>0x0000FFFF0000FFFF) << 16)
    )
    return (
        ((pairs_swapped & <uint64_t>0xFF00FF00FF00FF00) >> 8)
        | ((pairs_swapped & <uint64_t>0x00FF00FF00FF00FF) << 8)
    )
496+
497+
498+
cdef inline float _byteswap_float(float num):
    # Reverse the byte order of a float by round-tripping its raw bytes
    # through a 32-bit integer; memcpy moves the bits without conversion.
    cdef uint32_t raw_bits = 0
    memcpy(&raw_bits, &num, 4)
    raw_bits = _byteswap4(raw_bits)
    memcpy(&num, &raw_bits, 4)
    return num
504+
505+
506+
cdef inline double _byteswap_double(double num):
    # Reverse the byte order of a double by round-tripping its raw bytes
    # through a 64-bit integer; memcpy moves the bits without conversion.
    cdef uint64_t raw_bits = 0
    memcpy(&raw_bits, &num, 8)
    raw_bits = _byteswap8(raw_bits)
    memcpy(&num, &raw_bits, 8)
    return num

pandas/io/sas/sas7bdat.py

+35-11
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
datetime,
2121
timedelta,
2222
)
23-
import struct
23+
import sys
2424
from typing import cast
2525

2626
import numpy as np
@@ -42,7 +42,14 @@
4242
)
4343

4444
from pandas.io.common import get_handle
45-
from pandas.io.sas._sas import Parser
45+
from pandas.io.sas._sas import (
46+
Parser,
47+
read_double_with_byteswap,
48+
read_float_with_byteswap,
49+
read_uint16_with_byteswap,
50+
read_uint32_with_byteswap,
51+
read_uint64_with_byteswap,
52+
)
4653
import pandas.io.sas.sas_constants as const
4754
from pandas.io.sas.sasreader import ReaderBase
4855

@@ -259,8 +266,10 @@ def _get_properties(self) -> None:
259266
buf = self._read_bytes(const.endianness_offset, const.endianness_length)
260267
if buf == b"\x01":
261268
self.byte_order = "<"
269+
self.need_byteswap = sys.byteorder == "big"
262270
else:
263271
self.byte_order = ">"
272+
self.need_byteswap = sys.byteorder == "little"
264273

265274
# Get encoding information
266275
buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
@@ -345,22 +354,37 @@ def __next__(self) -> DataFrame:
345354

346355
# Read a single float of the given width (4 or 8).
347356
def _read_float(self, offset: int, width: int):
348-
if width not in (4, 8):
357+
if width == 4:
358+
return read_float_with_byteswap(
359+
self._read_bytes(offset, 4), self.need_byteswap
360+
)
361+
elif width == 8:
362+
return read_double_with_byteswap(
363+
self._read_bytes(offset, 8), self.need_byteswap
364+
)
365+
else:
349366
self.close()
350367
raise ValueError("invalid float width")
351-
buf = self._read_bytes(offset, width)
352-
fd = "f" if width == 4 else "d"
353-
return struct.unpack(self.byte_order + fd, buf)[0]
354368

355369
# Read a single unsigned integer of the given width (1, 2, 4 or 8).
356370
def _read_int(self, offset: int, width: int) -> int:
357-
if width not in (1, 2, 4, 8):
371+
if width == 1:
372+
return self._read_bytes(offset, 1)[0]
373+
elif width == 2:
374+
return read_uint16_with_byteswap(
375+
self._read_bytes(offset, 2), self.need_byteswap
376+
)
377+
elif width == 4:
378+
return read_uint32_with_byteswap(
379+
self._read_bytes(offset, 4), self.need_byteswap
380+
)
381+
elif width == 8:
382+
return read_uint64_with_byteswap(
383+
self._read_bytes(offset, 8), self.need_byteswap
384+
)
385+
else:
358386
self.close()
359387
raise ValueError("invalid int width")
360-
buf = self._read_bytes(offset, width)
361-
it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
362-
iv = struct.unpack(self.byte_order + it, buf)[0]
363-
return iv
364388

365389
def _read_bytes(self, offset: int, length: int):
366390
if self._cached_page is None:

0 commit comments

Comments
 (0)