-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Optimize BSON decoding #1667
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optimize BSON decoding #1667
Changes from 16 commits
9e109b4
c91b341
04965cd
955b8c5
dda00e3
579e2b1
8914813
9422cf9
95e890f
84e7728
1306fe8
a3d3f44
0266a90
bdfe6d2
95ab04b
7199945
d43b83d
7fc074f
5792d35
0084f4f
cf51555
9c20c99
a094261
ce754d6
4718ea9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
/* | ||
* Copyright 2008-present MongoDB, Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.bson.internal; | ||
|
||
/** | ||
* Utility class for platform-specific operations. | ||
* This class is not part of the public API and may be removed or changed at any time. | ||
*/ | ||
public final class PlatformUtil { | ||
|
||
private PlatformUtil() {} | ||
|
||
// These architectures support unaligned memory access. | ||
// While others might as well, it's safer to assume they don't. | ||
private static final String[] ARCHITECTURES_ALLOWING_UNALIGNED_ACCESS = { | ||
"x86", | ||
"amd64", | ||
"i386", | ||
"x86_64", | ||
"arm64", // evergreen dbx-perf-distro uses this architecture | ||
"aarch64"}; | ||
|
||
public static boolean isUnalignedAccessAllowed() { | ||
try { | ||
String processArch = System.getProperty("os.arch"); | ||
for (String supportedArch : ARCHITECTURES_ALLOWING_UNALIGNED_ACCESS) { | ||
if (supportedArch.equals(processArch)) { | ||
return true; | ||
} | ||
} | ||
return false; | ||
} catch (Exception e) { | ||
// Ignore security exception and proceed with default value | ||
return false; | ||
} | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,7 @@ | |
import java.nio.charset.StandardCharsets; | ||
|
||
import static java.lang.String.format; | ||
import static org.bson.internal.PlatformUtil.isUnalignedAccessAllowed; | ||
|
||
/** | ||
* An implementation of {@code BsonInput} that is backed by a {@code ByteBuf}. | ||
|
@@ -33,6 +34,14 @@ | |
public class ByteBufferBsonInput implements BsonInput { | ||
|
||
private static final String[] ONE_BYTE_ASCII_STRINGS = new String[Byte.MAX_VALUE + 1]; | ||
private static final boolean UNALIGNED_ACCESS_SUPPORTED = isUnalignedAccessAllowed(); | ||
/* A dynamically sized scratch buffer, that is reused across BSON String reads: | ||
* 1. Reduces garbage collection by avoiding new byte array creation. | ||
* 2. Improves cache utilization through temporal locality. | ||
* 3. Avoids JVM allocation and zeroing cost for new memory allocations. | ||
*/ | ||
private byte[] scratchBuffer; | ||
NathanQingyangXu marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
static { | ||
for (int b = 0; b < ONE_BYTE_ASCII_STRINGS.length; b++) { | ||
|
@@ -127,15 +136,12 @@ public String readString() { | |
|
||
@Override | ||
public String readCString() { | ||
int mark = buffer.position(); | ||
skipCString(); | ||
int size = buffer.position() - mark; | ||
buffer.position(mark); | ||
int size = computeCStringLength(buffer.position()); | ||
return readString(size); | ||
} | ||
|
||
private String readString(final int size) { | ||
if (size == 2) { | ||
private String readString(final int bsonStringSize) { | ||
if (bsonStringSize == 2) { | ||
byte asciiByte = buffer.get(); // if only one byte in the string, it must be ascii. | ||
byte nullByte = buffer.get(); // read null terminator | ||
if (nullByte != 0) { | ||
|
@@ -146,26 +152,74 @@ private String readString(final int size) { | |
} | ||
return ONE_BYTE_ASCII_STRINGS[asciiByte]; // this will throw if asciiByte is negative | ||
} else { | ||
byte[] bytes = new byte[size - 1]; | ||
buffer.get(bytes); | ||
byte nullByte = buffer.get(); | ||
if (nullByte != 0) { | ||
throw new BsonSerializationException("Found a BSON string that is not null-terminated"); | ||
if (buffer.isBackedByArray()) { | ||
int position = buffer.position(); | ||
int arrayOffset = buffer.arrayOffset(); | ||
int newPosition = position + bsonStringSize; | ||
buffer.position(newPosition); | ||
|
||
byte[] array = buffer.array(); | ||
if (array[arrayOffset + newPosition - 1] != 0) { | ||
throw new BsonSerializationException("Found a BSON string that is not null-terminated"); | ||
} | ||
return new String(array, arrayOffset + position, bsonStringSize - 1, StandardCharsets.UTF_8); | ||
} else if (scratchBuffer == null || bsonStringSize > scratchBuffer.length) { | ||
int scratchBufferSize = bsonStringSize + (bsonStringSize >>> 1); //1.5 times the size | ||
scratchBuffer = new byte[scratchBufferSize]; | ||
} | ||
return new String(bytes, StandardCharsets.UTF_8); | ||
|
||
buffer.get(scratchBuffer, 0, bsonStringSize); | ||
if (scratchBuffer[bsonStringSize - 1] != 0) { | ||
throw new BsonSerializationException("BSON string not null-terminated"); | ||
} | ||
return new String(scratchBuffer, 0, bsonStringSize - 1, StandardCharsets.UTF_8); | ||
} | ||
} | ||
|
||
@Override | ||
public void skipCString() { | ||
ensureOpen(); | ||
boolean checkNext = true; | ||
while (checkNext) { | ||
if (!buffer.hasRemaining()) { | ||
throw new BsonSerializationException("Found a BSON string that is not null-terminated"); | ||
int pos = buffer.position(); | ||
int length = computeCStringLength(pos); | ||
buffer.position(pos + length); | ||
} | ||
|
||
/* | ||
This method uses the SWAR (SIMD Within A Register) technique when aligned access is supported. | ||
SWAR finds a null terminator by processing 8 bytes at once. | ||
*/ | ||
public int computeCStringLength(final int prevPos) { | ||
ensureOpen(); | ||
int pos = buffer.position(); | ||
int limit = buffer.limit(); | ||
|
||
if (UNALIGNED_ACCESS_SUPPORTED) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So if we did this for all platforms, it would be slower on the ones that don't allow unaligned access? Slower than just going byte by byte? Just wondering if it's worth it to have two code paths to maintain. I also don't see a test for when this value is false, since we don't run on any platforms that would make it so. It's a bit concerning that we don't, even though by inspection it seems obvious, at least with the code as it is, that it's correct. If we did want to add a test, we would have to add a testing backdoor to PlatformUtil to override the default behavior of examining There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There might be some performance penalty, as Nearly all modern cloud providers provide architectures that support unaligned access. The ones that don’t are typically limited to embedded systems or legacy hardware. Given how rare such platforms are, I’m actually in favor of removing the platform check altogether - I think the likelihood of hitting such an architecture is extremely low. @jyemin @NathanQingyangXu What do you think? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am ok with this. Keeping expanding the CPU list in the long future doesn't make much sense to me. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, let's remove the platform check. |
||
int chunks = (limit - pos) >>> 3; | ||
// Process 8 bytes at a time. | ||
for (int i = 0; i < chunks; i++) { | ||
long word = buffer.getLong(pos); | ||
long mask = word - 0x0101010101010101L; | ||
mask &= ~word; | ||
mask &= 0x8080808080808080L; | ||
if (mask != 0) { | ||
// first null terminator found in the Little Endian long | ||
int offset = Long.numberOfTrailingZeros(mask) >>> 3; | ||
// Found the null at pos + offset; reset buffer's position. | ||
return (pos - prevPos) + offset + 1; | ||
} | ||
pos += 8; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the above bitwise magic is cool; but it would be also great to leave some reference to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure whether it is an idea to present some proof that the above bitwise magic does help perf significantly so the tradeoff between perf and readability is an easy decision (Or you have provided it in the description of the PR?).
Then it would be more convincing that the bitwise magic is really justified. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I think we could delay this cryptic @jyemin , how do you think? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The Java IO gains are indeed modest, but I can't come up with a hypothesis for why Netty would be so much better in this case. Since the gains from the other optimizations still seem worthwhile, perhaps we should just revert the SWAR piece and consider it in follow-on ticket and get another set of eyes on it (perhaps our Netty expert collaborator from other PRs). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unresolving this, as there is still a discussion.
From my perspective, the maintenance burden is quite minimal - the code spans just ~8 lines with bitwise complexity that might not be immediately obvious, however, i believe would not require deep study. For a ~30% improvement in the Netty case, the trade-off seems worthwhile.
I ran a local benchmark with JIT compilation (C2 on ARM), which gave some visibility into the generated assembly for both Java’s ByteBuffer and Netty’s ByteBuf. TLDR; The original Before SWAR (Netty):In its original pre-SWAR
Netty pre-SWAR (byte-by-byte) assembly
Before SWAR (JDK):In contrast, Java’s ByteBuffer pre-SWAR After SWAR (Netty):
This addressed Netty’s significant overhead After SWAR (JDK):The SWAR version used a native Non-inlined getLongUnaligned ~10 cycles of overhead`0x000000011424bc34: bl 0x000000011424bf00 ; Call getLongUnaligned (stub)`
Java’s efficient pre-SWAR loop leaves little room for improvement, and SWAR’s unlined native call limits gains. Assembly for JDK's already inlined `getByte() in pre-SWAR`
That being said, I’m in favor of keeping this optimization given the substantial performance improvement for Netty users and the low risk associated with unaligned access. However, I’m also fine with moving it to a separate ticket to unblock the rest of the optimizations. I am reverting the SWAR changes for now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Is getLong better because the method is not virtual, or because it's invoked 1/8 of the time? (It seems like if getByte is virtual then getLong would have to be as well.) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I don't see it reverted yet, but as for me this is convincing rationale to keep it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Ah, actually Java interpretation of compiled code
Most of gains comes from 1/8 fewer calls reducing virtual call overhead, checks and component lookups. I also noticed, that JDK's ByteBuffer had 4 unrolled There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
} | ||
checkNext = buffer.get() != 0; | ||
} | ||
|
||
// Process remaining bytes one-by-one. | ||
NathanQingyangXu marked this conversation as resolved.
Show resolved
Hide resolved
|
||
while (pos < limit) { | ||
if (buffer.get(pos++) == 0) { | ||
return (pos - prevPos); | ||
} | ||
} | ||
|
||
buffer.position(pos); | ||
throw new BsonSerializationException("Found a BSON string that is not null-terminated"); | ||
} | ||
|
||
@Override | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
/* | ||
* Copyright 2008-present MongoDB, Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.bson.internal; | ||
|
||
import org.junit.jupiter.api.DisplayName; | ||
import org.junit.jupiter.api.Test; | ||
import org.junit.jupiter.params.ParameterizedTest; | ||
import org.junit.jupiter.params.provider.ValueSource; | ||
|
||
import static org.junit.Assert.assertFalse; | ||
import static org.junit.Assert.assertTrue; | ||
|
||
class PlatformUtilTest { | ||
|
||
@ParameterizedTest | ||
@ValueSource(strings = {"arm", "ppc", "ppc64", "sparc", "mips"}) | ||
@DisplayName("Should not allow unaligned access for unsupported architectures") | ||
void shouldNotAllowUnalignedAccessForUnsupportedArchitecture(final String architecture) { | ||
withSystemProperty("os.arch", architecture, () -> { | ||
boolean result = PlatformUtil.isUnalignedAccessAllowed(); | ||
assertFalse(result); | ||
}); | ||
} | ||
|
||
@Test | ||
@DisplayName("Should not allow unaligned access when system property is undefined") | ||
void shouldNotAllowUnalignedAccessWhenSystemPropertyIsUndefined() { | ||
withSystemProperty("os.arch", null, () -> { | ||
boolean result = PlatformUtil.isUnalignedAccessAllowed(); | ||
assertFalse(result); | ||
}); | ||
} | ||
|
||
@ParameterizedTest | ||
@ValueSource(strings = {"x86", "amd64", "i386", "x86_64", "arm64", "aarch64"}) | ||
@DisplayName("Should allow unaligned access for supported architectures") | ||
void shouldAllowUnalignedAccess(final String architecture) { | ||
withSystemProperty("os.arch", architecture, () -> { | ||
boolean result = PlatformUtil.isUnalignedAccessAllowed(); | ||
assertTrue(result); | ||
}); | ||
} | ||
|
||
public static void withSystemProperty(final String name, final String value, final Runnable testCode) { | ||
String original = System.getProperty(name); | ||
if (value == null) { | ||
System.clearProperty(name); | ||
} else { | ||
System.setProperty(name, value); | ||
} | ||
try { | ||
testCode.run(); | ||
} finally { | ||
System.setProperty(name, original); | ||
} | ||
} | ||
} |
Uh oh!
There was an error while loading. Please reload this page.