Skip to content

Commit 0b78ae8

Browse files
Owen Anderson, facebook-github-bot
Owen Anderson
authored and committed
Cleanup byte swapping utilities to generate optimal code on the platforms we care about. (pytorch#11394)
Summary: While the use of memcpy as part of the byte swapping sequence looks funky, all major compilers recognize and optimize this pattern reliably, resulting in essentially optimal code generation.

For example, decodeUInt32LE goes from this on iOS arm64:

> ldrb w8, [x0, #3]
> ldrb w9, [x0, #2]
> bfi w8, w9, #8, #8
> ldrb w9, [x0, #1]
> bfi w8, w9, #16, #8
> ldrb w9, [x0]
> bfi w8, w9, #24, #8
> mov x0, x8
> ret

To this:

> ldr w8, [x0]
> rev w0, w8
> ret

Pull Request resolved: pytorch#11394
Reviewed By: SsnL
Differential Revision: D9728659
Pulled By: resistor
fbshipit-source-id: 9afbd4adfad1d1fb7b01f1179e6707ee21fa726f
1 parent a0d4106 commit 0b78ae8

File tree

1 file changed

+77
-29
lines changed

1 file changed

+77
-29
lines changed

torch/csrc/byte_order.cpp

Lines changed: 77 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2,34 +2,94 @@
22

33
#include <string.h>
44

5+
#if defined(_MSC_VER)
6+
#include <stdlib.h>
7+
#endif
8+
9+
// Reverse the two bytes of the 16-bit value stored at `ptr`, in place.
//
// The value is moved in and out of a local with memcpy instead of being
// accessed through a cast pointer, so `ptr` may point at any two writable
// bytes regardless of alignment.
static inline void swapBytes16(void *ptr)
{
  uint16_t value;
  memcpy(&value, ptr, sizeof(value));
#if defined(_MSC_VER) && !defined(_DEBUG)
  value = _byteswap_ushort(value);
#elif defined(__llvm__) || (defined(__GNUC__) && !defined(__ICC))
  value = __builtin_bswap16(value);
#else
  // Portable fallback: exchange the high and low byte; the cast drops the
  // bits that the integer promotion shifted above bit 15.
  value = (uint16_t)((value >> 8) | (value << 8));
#endif
  memcpy(ptr, &value, sizeof(value));
}
24+
25+
// Reverse the four bytes of the 32-bit value stored at `ptr`, in place.
//
// The value is moved in and out of a local with memcpy instead of being
// accessed through a cast pointer, so `ptr` may point at any four writable
// bytes regardless of alignment.
static inline void swapBytes32(void *ptr)
{
  uint32_t value;
  memcpy(&value, ptr, sizeof(value));
#if defined(_MSC_VER) && !defined(_DEBUG)
  value = _byteswap_ulong(value);
#elif defined(__llvm__) || (defined(__GNUC__) && !defined(__ICC))
  value = __builtin_bswap32(value);
#else
  // Portable fallback: isolate each byte and move it to the mirrored position.
  value = ((value & 0x000000FF) << 24) |
          ((value & 0x0000FF00) << 8) |
          ((value & 0x00FF0000) >> 8) |
          ((value & 0xFF000000) >> 24);
#endif
  memcpy(ptr, &value, sizeof(value));
}
42+
43+
// Reverse the eight bytes of the 64-bit value stored at `ptr`, in place.
//
// The value is moved in and out of a local with memcpy instead of being
// accessed through a cast pointer, so `ptr` may point at any eight writable
// bytes regardless of alignment.
static inline void swapBytes64(void *ptr)
{
  uint64_t output;
  memcpy(&output, ptr, sizeof(uint64_t));
#if defined(_MSC_VER) && !defined(_DEBUG)
  output = _byteswap_uint64(output);
#elif defined(__llvm__) || (defined(__GNUC__) && !defined(__ICC))
  output = __builtin_bswap64(output);
#else
  // Portable fallback. The previous version of this branch referenced an
  // undeclared `value` and helper and returned a value from a void function,
  // so it could not compile. Swap the 32-bit halves, then the 16-bit pairs,
  // then adjacent bytes, and keep the result in `output` so it is written back.
  output = (output >> 32) | (output << 32);
  output = ((output & 0xFFFF0000FFFF0000ULL) >> 16) |
           ((output & 0x0000FFFF0000FFFFULL) << 16);
  output = ((output & 0xFF00FF00FF00FF00ULL) >> 8) |
           ((output & 0x00FF00FF00FF00FFULL) << 8);
#endif
  memcpy(ptr, &output, sizeof(uint64_t));
}
58+
559
// Decode the 16-bit unsigned integer stored least-significant byte first in
// data[0..1]. `data` does not need to be aligned.
static inline uint16_t decodeUInt16LE(const uint8_t *data) {
  uint16_t output;
  memcpy(&output, data, sizeof(uint16_t));
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
    (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  // memcpy produces the host's byte order; on a big-endian host that is the
  // reverse of the little-endian layout being decoded, so swap it back.
  output = __builtin_bswap16(output);
#endif
  return output;
}
864

965
static inline uint16_t decodeUInt16BE(const uint8_t *data) {
10-
return (data[1]<<0) | (data[0]<<8);
66+
uint16_t output = decodeUInt16LE(data);
67+
swapBytes16(&output);
68+
return output;
1169
}
1270

1371
// Decode the 32-bit unsigned integer stored least-significant byte first in
// data[0..3]. `data` does not need to be aligned.
static inline uint32_t decodeUInt32LE(const uint8_t *data) {
  uint32_t output;
  memcpy(&output, data, sizeof(uint32_t));
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
    (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  // memcpy produces the host's byte order; on a big-endian host that is the
  // reverse of the little-endian layout being decoded, so swap it back.
  output = __builtin_bswap32(output);
#endif
  return output;
}
1676

1777
static inline uint32_t decodeUInt32BE(const uint8_t *data) {
18-
return (data[3]<<0) | (data[2]<<8) | (data[1]<<16) | (data[0]<<24);
78+
uint32_t output = decodeUInt32LE(data);
79+
swapBytes32(&output);
80+
return output;
1981
}
2082

2183
// Decode the 64-bit unsigned integer stored least-significant byte first in
// data[0..7]. `data` does not need to be aligned.
static inline uint64_t decodeUInt64LE(const uint8_t *data) {
  uint64_t output;
  memcpy(&output, data, sizeof(uint64_t));
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
    (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  // memcpy produces the host's byte order; on a big-endian host that is the
  // reverse of the little-endian layout being decoded, so swap it back.
  output = __builtin_bswap64(output);
#endif
  return output;
}
2788

2889
static inline uint64_t decodeUInt64BE(const uint8_t *data) {
29-
return (((uint64_t)data[7])<< 0) | (((uint64_t)data[6])<< 8) |
30-
(((uint64_t)data[5])<<16) | (((uint64_t)data[4])<<24) |
31-
(((uint64_t)data[3])<<32) | (((uint64_t)data[2])<<40) |
32-
(((uint64_t)data[1])<<48) | (((uint64_t)data[0])<<56);
90+
uint64_t output = decodeUInt64LE(data);
91+
swapBytes64(&output);
92+
return output;
3393
}
3494

3595
THPByteOrder THP_nativeByteOrder()
@@ -92,24 +152,12 @@ void THP_decodeDoubleBuffer(double* dst, const uint8_t* src, THPByteOrder order,
92152
}
93153
}
94154

95-
template<size_t size>
96-
static void swapBytes(uint8_t *ptr)
97-
{
98-
uint8_t tmp;
99-
for (size_t i = 0; i < size / 2; i++) {
100-
tmp = ptr[i];
101-
ptr[i] = ptr[size-i];
102-
ptr[size-i] = tmp;
103-
}
104-
}
105-
106-
107155
void THP_encodeInt16Buffer(uint8_t* dst, const int16_t* src, THPByteOrder order, size_t len)
108156
{
109157
memcpy(dst, src, sizeof(int16_t) * len);
110158
if (order != THP_nativeByteOrder()) {
111159
for (size_t i = 0; i < len; i++) {
112-
swapBytes<sizeof(int16_t)>(dst);
160+
swapBytes16(dst);
113161
dst += sizeof(int16_t);
114162
}
115163
}
@@ -120,7 +168,7 @@ void THP_encodeInt32Buffer(uint8_t* dst, const int32_t* src, THPByteOrder order,
120168
memcpy(dst, src, sizeof(int32_t) * len);
121169
if (order != THP_nativeByteOrder()) {
122170
for (size_t i = 0; i < len; i++) {
123-
swapBytes<sizeof(int32_t)>(dst);
171+
swapBytes32(dst);
124172
dst += sizeof(int32_t);
125173
}
126174
}
@@ -131,7 +179,7 @@ void THP_encodeInt64Buffer(uint8_t* dst, const int64_t* src, THPByteOrder order,
131179
memcpy(dst, src, sizeof(int64_t) * len);
132180
if (order != THP_nativeByteOrder()) {
133181
for (size_t i = 0; i < len; i++) {
134-
swapBytes<sizeof(int64_t)>(dst);
182+
swapBytes64(dst);
135183
dst += sizeof(int64_t);
136184
}
137185
}
@@ -142,7 +190,7 @@ void THP_encodeFloatBuffer(uint8_t* dst, const float* src, THPByteOrder order, s
142190
memcpy(dst, src, sizeof(float) * len);
143191
if (order != THP_nativeByteOrder()) {
144192
for (size_t i = 0; i < len; i++) {
145-
swapBytes<sizeof(float)>(dst);
193+
swapBytes32(dst);
146194
dst += sizeof(float);
147195
}
148196
}
@@ -153,7 +201,7 @@ void THP_encodeDoubleBuffer(uint8_t* dst, const double* src, THPByteOrder order,
153201
memcpy(dst, src, sizeof(double) * len);
154202
if (order != THP_nativeByteOrder()) {
155203
for (size_t i = 0; i < len; i++) {
156-
swapBytes<sizeof(double)>(dst);
204+
swapBytes64(dst);
157205
dst += sizeof(double);
158206
}
159207
}

0 commit comments

Comments
 (0)