3
3
module XoshiroSimd
4
4
# Getting the xoroshiro RNG to reliably vectorize is somewhat of a hassle without Simd.jl.
5
5
import .. Random: rand!
6
- using .. Random: TaskLocalRNG, rand, Xoshiro, CloseOpen01, UnsafeView, SamplerType, SamplerTrivial, getstate, setstate!
6
+ using .. Random: TaskLocalRNG, rand, Xoshiro, CloseOpen01, UnsafeView, SamplerType, SamplerTrivial, getstate, setstate!, _uint2float
7
7
using Base: BitInteger_types
8
8
using Base. Libc: memcpy
9
9
using Core. Intrinsics: llvmcall
@@ -30,7 +30,12 @@ simdThreshold(::Type{Bool}) = 640
30
30
Tuple{UInt64, Int64},
31
31
x, y)
32
32
33
- @inline _bits2float (x:: UInt64 , :: Type{Float64} ) = reinterpret (UInt64, Float64 (x >>> 11 ) * 0x1 .0 p- 53 )
33
+ # `_bits2float(x::UInt64, T)` takes `x::UInt64` as input, it splits it in `N` parts where
34
+ # `N = sizeof(UInt64) / sizeof(T)` (`N = 1` for `Float64`, `N = 2` for `Float32, etc...), it
35
+ # truncates each part to the unsigned type of the same size as `T`, scales all of these
36
+ # numbers to a value of type `T` in the range [0,1) with `_uint2float`, and then
37
+ # recomposes another `UInt64` using all these parts.
38
+ @inline _bits2float (x:: UInt64 , :: Type{Float64} ) = reinterpret (UInt64, _uint2float (x, Float64))
34
39
@inline function _bits2float (x:: UInt64 , :: Type{Float32} )
35
40
#=
36
41
# this implementation uses more high bits, but is harder to vectorize
@@ -40,19 +45,19 @@ simdThreshold(::Type{Bool}) = 640
40
45
=#
41
46
ui = (x>>> 32 ) % UInt32
42
47
li = x % UInt32
43
- u = Float32 (ui >>> 8 ) * Float32 ( 0x1 . 0 p - 24 )
44
- l = Float32 (li >>> 8 ) * Float32 ( 0x1 . 0 p - 24 )
48
+ u = _uint2float (ui, Float32)
49
+ l = _uint2float (ui, Float32)
45
50
(UInt64 (reinterpret (UInt32, u)) << 32 ) | UInt64 (reinterpret (UInt32, l))
46
51
end
47
52
@inline function _bits2float (x:: UInt64 , :: Type{Float16} )
48
53
i1 = (x>>> 48 ) % UInt16
49
54
i2 = (x>>> 32 ) % UInt16
50
55
i3 = (x>>> 16 ) % UInt16
51
56
i4 = x % UInt16
52
- f1 = Float16 (i1 >>> 5 ) * Float16 ( 0x1 . 0 p - 11 )
53
- f2 = Float16 (i2 >>> 5 ) * Float16 ( 0x1 . 0 p - 11 )
54
- f3 = Float16 (i3 >>> 5 ) * Float16 ( 0x1 . 0 p - 11 )
55
- f4 = Float16 (i4 >>> 5 ) * Float16 ( 0x1 . 0 p - 11 )
57
+ f1 = _uint2float (i1, Float16)
58
+ f2 = _uint2float (i2, Float16)
59
+ f3 = _uint2float (i3, Float16)
60
+ f4 = _uint2float (i4, Float16)
56
61
return (UInt64 (reinterpret (UInt16, f1)) << 48 ) | (UInt64 (reinterpret (UInt16, f2)) << 32 ) | (UInt64 (reinterpret (UInt16, f3)) << 16 ) | UInt64 (reinterpret (UInt16, f4))
57
62
end
58
63
0 commit comments