-
Notifications
You must be signed in to change notification settings - Fork 3.4k
Description
C++20 has added built-in wait/notify operations on atomics, both the generic std::atomic<T>
, as well as a boolean-like std::atomic_flag
.
On the WebAssembly target I'd expect those methods to compile to thin wrappers around the native atomic.wait
/ atomic.notify
instructions, but it looks like it results in a much more complex codegen. I'm guessing it implements whatever those instructions do natively under the hood, but embedded into the WebAssembly binary.
Example:
#include <atomic>
__attribute__((export_name("wait")))
extern "C" void wait(std::atomic<int>* x) {
x->wait(123);
}
int main() {}
with emcc temp.cpp -Os -g2 -o temp.js -pthread
results in the following codegen + tons of the helper functions it references:
(func $wait (type $t0) (param $p0 i32)
(local $l1 i32) (local $l2 i32) (local $l3 i32) (local $l4 i32) (local $l5 i32) (local $l6 i32) (local $l7 i32) (local $l8 i32) (local $l9 i32) (local $l10 i32) (local $l11 i32) (local $l12 i32) (local $l13 i32) (local $l14 i32) (local $l15 i32) (local $l16 i32) (local $l17 i32) (local $l18 i32) (local $l19 i64) (local $l20 i64) (local $l21 i64)
global.get $__stack_pointer
i32.const 32
i32.sub
local.tee $l4
global.set $__stack_pointer
local.get $l4
i64.const 21474836603
i64.store offset=8 align=4
local.get $l4
local.get $p0
i32.store offset=4
local.get $l4
i32.const 5
i32.store offset=28
local.get $l4
local.get $p0
i32.store offset=16
local.get $l4
local.get $l4
i64.load offset=4 align=4
i64.store offset=20 align=4
local.get $l4
i32.const 4
i32.add
local.set $l11
local.get $l4
i32.const 16
i32.add
local.set $l7
i32.const 0
local.set $p0
call $std::__2::chrono::__libcpp_steady_clock_now__
local.set $l20
loop $L0
local.get $p0
i32.const 63
i32.gt_u
local.set $l12
loop $L1
block $B2
local.get $l11
call $std::__2::__cxx_atomic_wait_test_fn_impl<std::__2::__cxx_atomic_impl<int__std::__2::__cxx_atomic_base_impl<int>>__int>::operator___abi:v160006____const
br_if $B2
local.get $l12
i32.eqz
if $I3
local.get $p0
i32.const 1
i32.add
local.set $p0
br $L0
end
call $std::__2::chrono::__libcpp_steady_clock_now__
local.get $l20
i64.sub
local.tee $l19
i64.const 0
i64.gt_s
i32.const 0
i32.and
br_if $B2
block $B4 (result i32)
i32.const 0
local.set $l1
block $B5
local.get $l19
i64.const 64001
i64.ge_s
if $I6
i32.const 1
local.set $l1
local.get $l7
i32.load
call $std::__2::__libcpp_contention_state_void_const_volatile*_
i64.atomic.load offset=8
local.set $l19
local.get $l7
i32.const 4
i32.add
call $std::__2::__cxx_atomic_wait_test_fn_impl<std::__2::__cxx_atomic_impl<int__std::__2::__cxx_atomic_base_impl<int>>__int>::operator___abi:v160006____const
br_if $B5
i32.const 0
local.set $l8
local.get $l7
i32.load
call $std::__2::__libcpp_contention_state_void_const_volatile*_
local.tee $l1
local.tee $l13
i64.const 1
i64.atomic.rmw.add
drop
global.get $__stack_pointer
i32.const 32
i32.sub
local.tee $l5
global.set $__stack_pointer
local.get $l5
local.get $l19
i64.store offset=24
local.get $l5
local.get $l1
i32.const 8
i32.add
i32.store offset=16
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>::zero_abi:v160006___
local.set $l19
global.get $__stack_pointer
i32.const 32
i32.sub
local.tee $l3
global.set $__stack_pointer
local.get $l3
local.get $l19
i64.store offset=24
local.get $l3
call $std::__2::chrono::__libcpp_steady_clock_now__
i64.store offset=16
loop $L7
local.get $l8
i32.const 63
i32.gt_u
local.set $l14
loop $L8
block $B9
global.get $__stack_pointer
i32.const 16
i32.sub
local.tee $l1
global.set $__stack_pointer
local.get $l1
local.get $l5
i32.load offset=16
i64.atomic.load
i64.store offset=8
local.get $l1
i64.load offset=8
local.get $l5
i64.load offset=24
i64.eq
local.set $l16
local.get $l1
i32.const 16
i32.add
global.set $__stack_pointer
local.get $l16
i32.eqz
br_if $B9
local.get $l14
i32.eqz
if $I10
local.get $l8
i32.const 1
i32.add
local.set $l8
br $L7
end
local.get $l3
call $std::__2::chrono::__libcpp_steady_clock_now__
i64.store
global.get $__stack_pointer
i32.const 16
i32.sub
local.tee $l2
global.set $__stack_pointer
local.get $l2
local.get $l3
i64.load
i64.store offset=8
local.get $l2
local.get $l3
i64.load offset=16
i64.store
global.get $__stack_pointer
i32.const 32
i32.sub
local.tee $l1
global.set $__stack_pointer
local.get $l1
local.get $l2
i64.load offset=8
i64.store offset=8
local.get $l1
i64.load offset=8
local.set $l19
local.get $l1
local.get $l2
i64.load
i64.store
local.get $l1
local.get $l19
local.get $l1
i64.load
i64.sub
i64.store offset=16
local.get $l1
i32.const 24
i32.add
local.get $l1
i32.const 16
i32.add
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>::duration_abi:v160006_<long_long>_long_long_const&__std::__2::enable_if<is_convertible<long_long_const&__long_long>::value_&&__std::__2::integral_constant<bool__false>::value_||_!treat_as_floating_point<long_long>::value___void>::type*_
i64.load
local.set $l19
local.get $l1
i32.const 32
i32.add
global.set $__stack_pointer
local.get $l2
i32.const 16
i32.add
global.set $__stack_pointer
local.get $l3
local.get $l19
i64.store offset=8
local.get $l3
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>::zero_abi:v160006___
i64.store
global.get $__stack_pointer
i32.const 16
i32.sub
local.tee $l1
global.set $__stack_pointer
local.get $l3
i32.const 24
i32.add
local.tee $l2
i64.load
local.get $l3
i64.load
i64.eq
local.set $l17
local.get $l1
i32.const 16
i32.add
global.set $__stack_pointer
local.get $l17
i32.const 1
i32.xor
if $I11
global.get $__stack_pointer
i32.const 16
i32.sub
local.tee $l1
global.set $__stack_pointer
local.get $l2
i64.load
local.get $l3
i64.load offset=8
i64.lt_s
local.set $l18
local.get $l1
i32.const 16
i32.add
global.set $__stack_pointer
local.get $l18
br_if $B9
end
local.get $l3
i64.load offset=8
local.set $l19
global.get $__stack_pointer
i32.const 32
i32.sub
local.tee $l1
global.set $__stack_pointer
local.get $l1
local.get $l19
i64.store offset=24
local.get $l1
i32.const 128
i32.store offset=8
local.get $l1
i32.const 16
i32.add
local.tee $l6
local.get $l1
i32.const 8
i32.add
local.tee $l9
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000ll>>::duration_abi:v160006_<int>_int_const&__std::__2::enable_if<is_convertible<int_const&__long_long>::value_&&__std::__2::integral_constant<bool__false>::value_||_!treat_as_floating_point<int>::value___void>::type*_
local.set $l10
global.get $__stack_pointer
i32.const 16
i32.sub
local.tee $l15
global.set $__stack_pointer
global.get $__stack_pointer
i32.const 16
i32.sub
local.tee $l2
global.set $__stack_pointer
local.get $l2
i32.const 8
i32.add
local.get $l10
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>::duration_abi:v160006_<long_long__std::__2::ratio<1ll__1000ll>>_std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000ll>>_const&__std::__2::enable_if<__no_overflow<std::__2::ratio<1ll__1000ll>__std::__2::ratio<1ll__1000000000ll>>::value_&&__std::__2::integral_constant<bool__false>::value_||___no_overflow<std::__2::ratio<1ll__1000ll>__std::__2::ratio<1ll__1000000000ll>>::type::den_==_1_&&_!treat_as_floating_point<long_long>::value___void>::type*_
i64.load
local.set $l19
local.get $l2
local.get $l1
i64.load offset=24
i64.store
local.get $l2
i64.load
local.set $l21
local.get $l2
i32.const 16
i32.add
global.set $__stack_pointer
local.get $l15
i32.const 16
i32.add
global.set $__stack_pointer
block $B12
local.get $l19
local.get $l21
i64.lt_s
if $I13
local.get $l1
i32.const 8
i32.store offset=4
local.get $l6
local.get $l9
local.get $l1
i32.const 4
i32.add
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000ll>>::duration_abi:v160006_<int>_int_const&__std::__2::enable_if<is_convertible<int_const&__long_long>::value_&&__std::__2::integral_constant<bool__false>::value_||_!treat_as_floating_point<int>::value___void>::type*_
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>::duration_abi:v160006_<long_long__std::__2::ratio<1ll__1000ll>>_std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000ll>>_const&__std::__2::enable_if<__no_overflow<std::__2::ratio<1ll__1000ll>__std::__2::ratio<1ll__1000000000ll>>::value_&&__std::__2::integral_constant<bool__false>::value_||___no_overflow<std::__2::ratio<1ll__1000ll>__std::__2::ratio<1ll__1000000000ll>>::type::den_==_1_&&_!treat_as_floating_point<long_long>::value___void>::type*_
call $std::__2::__libcpp_thread_sleep_for_abi:v160006__std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>_const&_
br $B12
end
local.get $l1
i32.const 64
i32.store offset=8
local.get $l1
i32.const 24
i32.add
local.tee $l6
local.get $l1
i32.const 16
i32.add
local.tee $l9
local.get $l1
i32.const 8
i32.add
local.tee $l10
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000ll>>::duration_abi:v160006_<int>_int_const&__std::__2::enable_if<is_convertible<int_const&__long_long>::value_&&__std::__2::integral_constant<bool__false>::value_||_!treat_as_floating_point<int>::value___void>::type*_
call $bool_std::__2::chrono::operator>_abi:v160006_<long_long__std::__2::ratio<1ll__1000000000ll>__long_long__std::__2::ratio<1ll__1000000ll>>_std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>_const&__std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000ll>>_const&_
if $I14
local.get $l1
i32.const 2
i32.store offset=8
global.get $__stack_pointer
i32.const 32
i32.sub
local.tee $l2
global.set $__stack_pointer
local.get $l2
local.get $l6
i64.load
i64.store offset=8
local.get $l2
local.get $l2
i64.load offset=8
local.get $l10
i64.load32_s
i64.div_s
i64.store offset=16
local.get $l2
i32.const 24
i32.add
local.get $l2
i32.const 16
i32.add
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>::duration_abi:v160006_<long_long>_long_long_const&__std::__2::enable_if<is_convertible<long_long_const&__long_long>::value_&&__std::__2::integral_constant<bool__false>::value_||_!treat_as_floating_point<long_long>::value___void>::type*_
i64.load
local.set $l19
local.get $l2
i32.const 32
i32.add
global.set $__stack_pointer
local.get $l1
local.get $l19
i64.store offset=16
local.get $l9
call $std::__2::__libcpp_thread_sleep_for_abi:v160006__std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>_const&_
br $B12
end
local.get $l1
i32.const 4
i32.store offset=8
local.get $l1
i32.const 24
i32.add
local.get $l1
i32.const 16
i32.add
local.get $l1
i32.const 8
i32.add
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000ll>>::duration_abi:v160006_<int>_int_const&__std::__2::enable_if<is_convertible<int_const&__long_long>::value_&&__std::__2::integral_constant<bool__false>::value_||_!treat_as_floating_point<int>::value___void>::type*_
call $bool_std::__2::chrono::operator>_abi:v160006_<long_long__std::__2::ratio<1ll__1000000000ll>__long_long__std::__2::ratio<1ll__1000000ll>>_std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>_const&__std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000ll>>_const&_
i32.eqz
br_if $B12
call $emscripten_get_now
call $_emscripten_yield
end
local.get $l1
i32.const 32
i32.add
global.set $__stack_pointer
br $L8
end
end
end
local.get $l3
i32.const 32
i32.add
global.set $__stack_pointer
local.get $l5
i32.const 32
i32.add
global.set $__stack_pointer
local.get $l13
i64.const 1
i64.atomic.rmw.sub
drop
i32.const 0
br $B4
end
local.get $l19
i64.const 4001
i64.lt_s
br_if $B5
call $emscripten_get_now
call $_emscripten_yield
end
local.get $l1
end
i32.eqz
br_if $L1
end
end
end
local.get $l4
i32.const 32
i32.add
global.set $__stack_pointer)
I know there is at least emscripten/atomic.h
which uses Clang builtins to get native atomic wait/notify, but it would be nice to be able to use those optimised instructions via the standard library too.
Activity
sbc100 commentedon Nov 29, 2023
That would be pretty cool to, assuming the semantics match.
We would probably want to implement it here and then upstream it to llvm's libc++.
RReverser commentedon Nov 30, 2023
The semantics are slightly different - e.g. C++ wait actually verifies that the value has changed - but it should be still a pretty thin wrapper/loop around the
atomic.wait
instruction rather than whatever it generates today.tlively commentedon Nov 30, 2023
Since the Wasm wait instructions do not allow spurious wakeups, the loop shouldn't even be necessary. (assuming I am interpreting the C++ docs correctly... I believe a notify should still wake a waiter even if the value hasn't changed. I think the note about not waking up if the value hasn't changed only applies to spurious wakeups.)
RReverser commentedon Nov 30, 2023
Hm I'm not sure about that bit. I thought it's more about general consistency.
Like, if something in Wasm performs (for whatever reason) manual
emscripten_atomic_notify
/ its__builtin_*
variant / itsAtomics.notify
JS variant on the address ofstd::atomic<int>
, it reads like it would be a spec violation foratomic->wait()
to return if the value hasn't actually changed.RReverser commentedon Nov 30, 2023
Those can probably count as "spurious wakeups" in a sense.
RReverser commentedon Nov 30, 2023
Thinking about it a bit more, you don't even need to use external wakeup to run into issues.
Even via C++ APIs, if one thread does
atomic->notify_one();
then another waiting onatomic->wait(...);
should still continue waiting unless the value has changed:In this regard it's different from Wasm instruction.