
bswap and movebe equivalents? #1426

Open
creationix opened this issue Jul 30, 2021 · 14 comments

@creationix

creationix commented Jul 30, 2021

Related to #422 and #725, I'm wondering what I should do to get fast reads of big-endian numbers from a buffer.

My use case: I have a large data structure that uses network byte ordering (aka big-endian), while wasm is always little-endian.

I'm trying to write C code for clang that emits efficient bytecode, but it gets really nasty whenever I need to load a big-endian uint64_t from memory into a native number.

Suppose the following test program:

#include <stdint.h>

#define memcpy(dst, src, len) __builtin_memcpy(dst, src, len)

int popcnt_u32(uint32_t val) {
  int count = 0;
  while (val) {
    count++;
    val &= (val - (uint32_t)1);
  }
  return count;
}

int popcnt_u64(uint64_t val) {
  int count = 0;
  while (val) {
    count++;
    val &= (val - (uint64_t)1);
  }
  return count;
}

int is_big_endian(void) {
  union {
    uint32_t i;
    char c[4];
  } bint = {0x01020304};

  return bint.c[0] == 1;
}

uint32_t swap_u32(uint32_t val) {
  if (is_big_endian())
    return val;
  return ((val & 0xFF000000u) >> 24u) | ((val & 0x00FF0000u) >> 8u) | ((val & 0x0000FF00u) << 8u) |
         ((val & 0x000000FFu) << 24u);
}

uint64_t swap_u64(uint64_t val) {
  if (is_big_endian())
    return val;
  return ((val & 0xFF00000000000000u) >> 56u) | ((val & 0x00FF000000000000u) >> 40u) |
         ((val & 0x0000FF0000000000u) >> 24u) | ((val & 0x000000FF00000000u) >> 8u) |
         ((val & 0x00000000FF000000u) << 8u) | ((val & 0x0000000000FF0000u) << 24u) |
         ((val & 0x000000000000FF00u) << 40u) | ((val & 0x00000000000000FFu) << 56u);
}

uint32_t read_u32_swap(uint8_t* storage) {
  uint32_t val;
  memcpy(&val, storage, sizeof(uint32_t));
  return swap_u32(val);
}

uint32_t read_u32(uint8_t* storage) {
  uint8_t* data = storage;
  if (is_big_endian())
    return (uint64_t)data[0] << 0U | (uint64_t)data[1] << 8U | (uint64_t)data[2] << 16U | (uint64_t)data[3] << 24U;
  else
    return (uint64_t)data[0] << 24U | (uint64_t)data[1] << 16U | (uint64_t)data[2] << 8U | (uint64_t)data[3] << 0U;
}

uint64_t read_u64_swap(uint8_t* storage) {
  uint64_t val;
  memcpy(&val, storage, sizeof(uint64_t));
  return swap_u64(val);
}

uint64_t read_u64(uint8_t* storage) {
  uint8_t* data = storage;
  if (is_big_endian())
    return (uint64_t)data[0] << 0U | (uint64_t)data[1] << 8U | (uint64_t)data[2] << 16U | (uint64_t)data[3] << 24U |
           (uint64_t)data[4] << 32U | (uint64_t)data[5] << 40U | (uint64_t)data[6] << 48U | (uint64_t)data[7] << 56U;
  else
    return (uint64_t)data[0] << 56U | (uint64_t)data[1] << 48U | (uint64_t)data[2] << 40U | (uint64_t)data[3] << 32U |
           (uint64_t)data[4] << 24U | (uint64_t)data[5] << 16U | (uint64_t)data[6] << 8U | (uint64_t)data[7] << 0U;
}

If I compile this to native assembly, clang/llvm detects the byte-swapping patterns and generates optimal instructions:

popcnt_u32:                             # @popcnt_u32
        popcnt  eax, edi
        cmove   eax, edi
        ret
popcnt_u64:                             # @popcnt_u64
        popcnt  rax, rdi
        ret
is_big_endian:                          # @is_big_endian
        xor     eax, eax
        ret
swap_u32:                               # @swap_u32
        mov     eax, edi
        bswap   eax
        ret
swap_u64:                               # @swap_u64
        mov     rax, rdi
        bswap   rax
        ret
read_u32_swap:                          # @read_u32_swap
        movbe   eax, dword ptr [rdi]
        ret
read_u32:                               # @read_u32
        movbe   eax, dword ptr [rdi]
        ret
read_u64_swap:                          # @read_u64_swap
        movbe   rax, qword ptr [rdi]
        ret
read_u64:                               # @read_u64
        movbe   rax, qword ptr [rdi]
        ret

Notice how the functions reduce to single instructions like movbe, bswap, and popcnt!

But if I compile the same thing to WebAssembly, the swaps aren't optimized at all:

popcnt_u32:                             # @popcnt_u32
        local.get       0
        i32.popcnt
        i32.const       0
        local.get       0
        i32.select
        end_function
popcnt_u64:                             # @popcnt_u64
        local.get       0
        i64.popcnt
        i32.wrap_i64
        end_function
is_big_endian:                          # @is_big_endian
        i32.const       0
        end_function
swap_u32:                               # @swap_u32
        local.get       0
        i32.const       24
        i32.shl 
        local.get       0
        i32.const       8
        i32.shl 
        i32.const       16711680
        i32.and 
        i32.or  
        local.get       0
        i32.const       8
        i32.shr_u
        i32.const       65280
        i32.and 
        local.get       0
        i32.const       24
        i32.shr_u
        i32.or  
        i32.or  
        end_function
swap_u64:                               # @swap_u64
        local.get       0
        i64.const       56
        i64.shl 
        local.get       0
        i64.const       40
        i64.shl 
        i64.const       71776119061217280
        i64.and 
        i64.or  
        local.get       0
        i64.const       24
        i64.shl 
        i64.const       280375465082880
        i64.and 
        local.get       0
        i64.const       8
        i64.shl 
        i64.const       1095216660480
        i64.and 
        i64.or  
        i64.or  
        local.get       0
        i64.const       8
        i64.shr_u
        i64.const       4278190080
        i64.and 
        local.get       0
        i64.const       24
        i64.shr_u
        i64.const       16711680
        i64.and 
        i64.or  
        local.get       0
        i64.const       40
        i64.shr_u
        i64.const       65280
        i64.and 
        local.get       0
        i64.const       56
        i64.shr_u
        i64.or  
        i64.or  
        i64.or  
        end_function
read_u32_swap:                          # @read_u32_swap
        local.get       0
        i32.load        0:p2align=0
        local.tee       0
        i32.const       24
        i32.shl 
        local.get       0
        i32.const       8
        i32.shl 
        i32.const       16711680
        i32.and 
        i32.or  
        local.get       0
        i32.const       8
        i32.shr_u
        i32.const       65280
        i32.and 
        local.get       0
        i32.const       24
        i32.shr_u
        i32.or  
        i32.or  
        end_function
read_u32:                               # @read_u32
        local.get       0
        i32.load        0:p2align=0
        local.tee       0
        i32.const       24
        i32.shl 
        local.get       0
        i32.const       8
        i32.shl 
        i32.const       16711680
        i32.and 
        i32.or  
        local.get       0
        i32.const       8
        i32.shr_u
        i32.const       65280
        i32.and 
        local.get       0
        i32.const       24
        i32.shr_u
        i32.or  
        i32.or  
        end_function
read_u64_swap:                          # @read_u64_swap
        local.get       0
        i64.load        0:p2align=0
        local.tee       1
        i64.const       56
        i64.shl 
        local.get       1
        i64.const       40
        i64.shl 
        i64.const       71776119061217280
        i64.and 
        i64.or  
        local.get       1
        i64.const       24
        i64.shl 
        i64.const       280375465082880
        i64.and 
        local.get       1
        i64.const       8
        i64.shl 
        i64.const       1095216660480
        i64.and 
        i64.or  
        i64.or  
        local.get       1
        i64.const       8
        i64.shr_u
        i64.const       4278190080
        i64.and 
        local.get       1
        i64.const       24
        i64.shr_u
        i64.const       16711680
        i64.and 
        i64.or  
        local.get       1
        i64.const       40
        i64.shr_u
        i64.const       65280
        i64.and 
        local.get       1
        i64.const       56
        i64.shr_u
        i64.or  
        i64.or  
        i64.or  
        end_function
read_u64:                               # @read_u64
        local.get       0
        i64.load        0:p2align=0
        local.tee       1
        i64.const       56
        i64.shl 
        local.get       1
        i64.const       40
        i64.shl 
        i64.const       71776119061217280
        i64.and 
        i64.or  
        local.get       1
        i64.const       24
        i64.shl 
        i64.const       280375465082880
        i64.and 
        local.get       1
        i64.const       8
        i64.shl 
        i64.const       1095216660480
        i64.and 
        i64.or  
        i64.or  
        local.get       1
        i64.const       8
        i64.shr_u
        i64.const       4278190080
        i64.and 
        local.get       1
        i64.const       24
        i64.shr_u
        i64.const       16711680
        i64.and 
        i64.or  
        local.get       1
        i64.const       40
        i64.shr_u
        i64.const       65280
        i64.and 
        local.get       1
        i64.const       56
        i64.shr_u
        i64.or  
        i64.or  
        i64.or  
        end_function

Is there anything I can do to keep this from bloating so badly? Are there proposals for adding bswap and/or loadbe to the instruction set? The linked issues above seem to have been closed with no progress in years.

I understand that not all real machines have these instructions, but many do; I would say the vast majority of machines that support wasm have these fast instructions natively. And even if the underlying machine can't handle it with a single native instruction, the engine compiling the wasm can emit whatever is optimal for the platform. Either way, the real effect is much smaller wasm code downloaded over the network!

Please reconsider adding these to the spec, or if there is an existing primitive that gets me close, please point me in the right direction.
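
(For reference, the closest existing C-level primitives are the compiler builtins. A minimal sketch, assuming clang/gcc; the helper names are made up:)

#include <stdint.h>

// Hypothetical helpers (not part of the test program above): __builtin_bswap32
// and __builtin_bswap64 express the swap directly instead of relying on the
// compiler to pattern-match shifts and ors.
static inline uint32_t load_be32(const uint8_t* p) {
  uint32_t v;
  __builtin_memcpy(&v, p, sizeof v);   // unaligned-safe load
  return __builtin_bswap32(v);         // always swaps; assumes a little-endian target like wasm
}

static inline uint64_t load_be64(const uint8_t* p) {
  uint64_t v;
  __builtin_memcpy(&v, p, sizeof v);
  return __builtin_bswap64(v);
}

This keeps the source compact, but with no byte-swap instruction in wasm to lower to, the backend still expands these into the same shift/or sequences shown above.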

@creationix
Author

creationix commented Jul 30, 2021

I guess another option is to change the data format to store values as little-endian, since most machines in my use case are little-endian already (and wasm always is). I just find it odd that "network" byte ordering is different from "web assembly" byte ordering.

If the native format of the data structure is LE, then the generated code obviously gets a lot better:

popcnt_u32:                             # @popcnt_u32
        local.get       0
        i32.popcnt
        i32.const       0
        local.get       0
        i32.select
        end_function
popcnt_u64:                             # @popcnt_u64
        local.get       0
        i64.popcnt
        i32.wrap_i64
        end_function
is_little_endian:                       # @is_little_endian
        i32.const       1
        end_function
swap_u32:                               # @swap_u32
        local.get       0
        end_function
swap_u64:                               # @swap_u64
        local.get       0
        end_function
read_u32_swap:                          # @read_u32_swap
        local.get       0
        i32.load        0:p2align=0
        end_function
read_u32:                               # @read_u32
        local.get       0
        i32.load        0:p2align=0
        end_function
read_u64_swap:                          # @read_u64_swap
        local.get       0
        i64.load        0:p2align=0
        end_function
read_u64:                               # @read_u64
        local.get       0
        i64.load        0:p2align=0
        end_function

But changing the data format is often not an option, especially for network protocols that use "network" byte ordering.

@penzn

penzn commented Jul 30, 2021

There is no byteswap equivalent yet, but there is a note about it in Future Features.

@MaxGraey

MaxGraey commented Jul 30, 2021

Btw Wasm MVP supports popcnt ops (i32.popcnt / i64.popcnt). Just use __builtin_popcount / __builtin_popcountll.

Special-casing the zero input is really redundant here:

        local.get       0
        i32.popcnt
        i32.const       0
        local.get       0
        i32.select

Perhaps LLVM should improve this.
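
For example, a minimal sketch (assuming clang/gcc; the function names are made up) that avoids the loop entirely:

#include <stdint.h>

// Hypothetical variants of the popcnt helpers from the test program above,
// written with the builtins so no pattern matching is needed; on wasm they
// lower straight to i32.popcnt / i64.popcnt.
int popcnt_u32_builtin(uint32_t val) {
  return __builtin_popcount(val);
}

int popcnt_u64_builtin(uint64_t val) {
  return __builtin_popcountll(val);
}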

@creationix
Author

Btw Wasm MVP supports popcnt ops

Good to know which builtins are available, but as you can see in my code, clang already detected the pattern and generated i32.popcnt for me in wasm.

@dcodeIO

dcodeIO commented Jul 30, 2021

Fwiw, we have been anticipating the addition of bswap over at AssemblyScript and are currently polyfilling it. We'd also be interested in having these made available as instructions.

@creationix
Author

There is no byteswap equivalent yet, but there is a note about it in Future Features.

That's great! Though I wonder if there is any hope it will land any time soon. Also, from that document, I'd love to have map_file(addr, length, Blob, file-offset).

@dschuff
Member

dschuff commented Aug 3, 2021

In general, proposals advance when sufficiently motivated people push them forward. That person could be you! bswap seems like it could be a relatively straightforward proposal.

  1. It's a simple arithmetic instruction (or a small set of them).
  2. IIUC, modern processors all offer similar instructions with well-defined semantics, so implementation in engines is likely to be straightforward.

This means that the cost of the proposal is relatively low.
You'd want to make the case that the benefits outweigh the cost.

  1. You've shown that the alternative user-space expansion of this is fairly large, and as the future features document says, it reads non-constant input multiple times, which makes optimization harder.
  2. It's a pattern recognized by common compilers, meaning many compiled programs are likely to be able to take advantage of it. (Even better would be to present more details and/or numbers on this, e.g. what common or critical use case does it serve, and how often might it end up being used in common programs?)

It's of course some work to advance a proposal, and there is some fixed amount of overhead per proposal across the board. This could maybe be amortized if there are other related instructions being proposed, but it's certainly not required. You'd also probably be able to get some help once you advance to a certain point (e.g. getting implementations, semantics, etc.). Especially for uncontroversial proposals, this often mostly involves nagging the right people at the right time.

@titzer

titzer commented Aug 3, 2021

+1 to what @dschuff says here. It looks like it'd be easy to make a case for byte-swap instructions.

@creationix
Author

What code bases would need a PR to prototype this? I assume at a minimum a compiler (maybe the wasm backend in LLVM) and then a wasm runtime (one of the many interpreters or JIT/AOT compilers). Any tips on which ones would be good for proving out the feature technically?

@lars-t-hansen
Contributor

LLVM, maybe Binaryen, and then at least one good VM that targets native code (to make the performance case) - V8, SpiderMonkey, and wasmtime are obvious choices. I don't know about the others, but adding single instructions to SpiderMonkey for this kind of prototyping is not particularly hard, though sometimes laborious.

(At the last SIMD meeting we discussed very briefly whether we could have a lightweight process for moving single instruction proposals such as this along with minimal friction, and @dtig said she would bring it to the whole group. Maybe part of that process would be some guidelines on how to go about making the case for a new instruction.)

@dtig
Member

dtig commented Aug 10, 2021

I've opened WebAssembly/meetings#857 to gather ideas on how we can streamline the process. Based on the responses there, I can add to the process documentation if we decide there is enough overhead we can get rid of for smaller proposals.

Prototyping a single opcode in V8 should also be fairly straightforward; there are some layers to plumb through, and @ngzhian put together some helpful documentation on how to do this in V8. To prove the feature technically, +1 to @lars-t-hansen's reply above; adding to that, some spec tests or a side-by-side comparison of the code-size reduction from adding bswap and movbe would be helpful too.

@tlively
Member

tlively commented Aug 16, 2021

We have a checklist of things to do to add new instructions to Binaryen as well: https://github.com/WebAssembly/binaryen/blob/main/Contributing.md#adding-support-for-new-instructions. For prototyping on the tools side, it would probably be sufficient to model the new instructions as function calls in C/C++/LLVM and only implement the instruction in Binaryen. Then you could add a trivial Binaryen pass to lower the import calls emitted by LLVM to the new instructions.

Edit: Although it's not that hard to add a new instruction to Clang/LLVM either.
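
As a rough sketch of that modeling approach (the import module/name and function names here are invented, not part of any toolchain): declare the would-be instruction as an import so LLVM emits a plain call, then have a custom Binaryen pass rewrite those calls into the prototype instruction.

#include <stdint.h>

// Hypothetical import standing in for a proposed i64 byte-swap instruction.
// The module/name pair is arbitrary; a custom Binaryen pass would match calls
// to it and replace them with the new opcode.
__attribute__((import_module("prototype"), import_name("i64_bswap")))
uint64_t prototype_i64_bswap(uint64_t v);

uint64_t read_u64_be(const uint8_t* p) {
  uint64_t v;
  __builtin_memcpy(&v, p, sizeof v);   // plain little-endian i64.load
  return prototype_i64_bswap(v);       // call site the pass would lower
}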

@creationix
Author

creationix commented Aug 19, 2021

Thanks everyone. I'm not sure when I'll have availability to prototype this, but it sounds like a lot of fun!

at least one good VM that targets native code (to make the performance case)

I would think it would help performance for both native and interpreted execution. Surely a single interpreted instruction is much faster than dozens of them. But yeah, proving both should be helpful.

My hope is to prototype the following:

  • Design a proposed encoding in the wire format
  • Add the instructions to Binaryen for both code generation and the interpreter
  • Patch LLVM to emit the new instructions for wasm
  • Add the instructions to V8

But I'm doing this in my sporadic free time, so if anyone else wants to speed things up, feel free to jump in.

Then, to test it, write a codec for some existing wire format that requires network byte ordering (a rough sketch follows at the end of this comment) and compare:

  • size of the generated wasm binary
  • size of the generated native assembly
  • runtime performance in both the interpreter and native code

Assuming I get all that done, I'll probably need to focus on other projects for a bit and will likely need help pushing through all the processes required. But my hope is this will give the idea a jump start.
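
For illustration only, the benchmark codec could decode something like this hypothetical fixed-layout, network-byte-order record header (the field names are invented), reusing read_u32/read_u64 from the test program above:

#include <stdint.h>

// Hypothetical wire format used only as a benchmark target; all fields are
// big-endian on the wire.
typedef struct {
  uint32_t magic;     // bytes 0-3
  uint32_t length;    // bytes 4-7, payload length
  uint64_t sequence;  // bytes 8-15, sequence number
} record_header;

uint32_t read_u32(uint8_t* storage);  // big-endian loads from the test program
uint64_t read_u64(uint8_t* storage);

void decode_header(uint8_t* wire, record_header* out) {
  out->magic    = read_u32(wire);
  out->length   = read_u32(wire + 4);
  out->sequence = read_u64(wire + 8);
}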

@MaxGraey

I could help with the Binaryen part.
