@@ -974,42 +974,53 @@ preprocess(dest, x) = extrude(broadcast_unalias(dest, x))
974
974
return dest
975
975
end
976
976
977
- # Performance optimization: for BitArray outputs, we cache the result
977
+ # Performance optimization: for BitVector outputs, we cache the result
978
978
# in a 64-bit register before writing into memory (to bypass the load-store queue, LSQ)
979
# Fill a `BitVector` from a broadcast that cannot be evaluated chunk-wise.
#
# Results are accumulated 64 at a time in a local `UInt64` and each finished word
# is stored into `dest.chunks` with one assignment. Every result goes through
# `convert(Bool, ...)`, matching what storing into a `Bool` array does, so a
# broadcast that yields e.g. `0`/`1` integers is accepted rather than rejected
# by a type assertion.
#
# Calls `throwdm` when `axes(dest) != axes(bc)`. Returns `dest`.
@inline function copyto!(dest::BitVector, bc::Broadcasted{Nothing})
    axes(dest) == axes(bc) || throwdm(axes(dest), axes(bc))
    ischunkedbroadcast(dest, bc) && return chunkedcopyto!(dest, bc)
    destc = dest.chunks
    bcp = preprocess(dest, bc)
    nbits = Int(length(bcp))
    nbits <= 0 && return dest
    len = Base.num_bit_chunks(nbits)
    # Every chunk except the last is full: pack exactly 64 results into each.
    @inbounds for i = 0:(len - 2)
        z = UInt64(0)
        for j = 0:63
            z |= UInt64(convert(Bool, bcp[i * 64 + j + 1])) << (j & 63)
        end
        destc[i + 1] = z
    end
    # The last chunk receives the remaining 1..64 results. `z` starts at zero and
    # only the low `((nbits - 1) & 63) + 1` bits are set, so the rest stay zero.
    @inbounds let i = len - 1
        z = UInt64(0)
        for j = 0:((nbits - 1) & 63)
            z |= UInt64(convert(Bool, bcp[i * 64 + j + 1])) << (j & 63)
        end
        destc[i + 1] = z
    end
    return dest
end
1002
+
1003
+ # Performance optimization: for BitArray outputs, we cache the result
1004
+ # in a "small" Vector{Bool}, and then copy in chunks into the output
979
1005
# copyto! for a BitArray destination and a Broadcasted{Nothing} source.
# This span is a diff hunk: lines prefixed `-` belong to the implementation being
# removed, lines prefixed `+` to its replacement, and unprefixed lines are shared
# by both. The bare integers are old/new line numbers from the diff view.
# Both versions call throwdm when the axes differ, hand off to chunkedcopyto!
# when ischunkedbroadcast(dest, bc) holds, and return dest.
@inline function copyto! (dest:: BitArray , bc:: Broadcasted{Nothing} )
980
1006
axes (dest) == axes (bc) || throwdm (axes (dest), axes (bc))
981
1007
ischunkedbroadcast (dest, bc) && return chunkedcopyto! (dest, bc)
982
- ndims (dest) == 0 && (dest[] = bc[]; return dest)
1008
# New version: destinations with fewer than 256 elements are passed to the
# copyto! method for AbstractArray destinations via invoke.
+ length (dest) < 256 && return invoke (copyto!, Tuple{AbstractArray, Broadcasted{Nothing}}, dest, bc)
1009
# Scratch buffer of bitcache_size Bools, reused for every partition below.
+ tmp = Vector {Bool} (undef, bitcache_size)
1010
+ destc = dest. chunks
1011
# cind is the position passed to dumpbitcache; it starts at 1.
+ cind = 1
983
1012
bc′ = preprocess (dest, bc)
984
# Removed version: walks the first axis for each index of the trailing axes,
# ORs each result (via convert(Bool, ...)) into a UInt64 word (`z`, or `remain`
# for a partially filled word), and stores finished words into destc.
- ax = axes (bc′)
985
- ax1, out = ax[1 ], CartesianIndices (tail (ax))
986
- destc, indc = dest. chunks, 0
987
- bitst, remain = 0 , UInt64 (0 )
988
- for I in out
989
- i = first (ax1) - 1
990
- if ndims (bc) == 1 || bitst >= 64 - length (ax1)
991
- if ndims (bc) > 1 && bitst != 0
992
- @inbounds @simd for j = bitst: 63
993
- remain |= UInt64 (convert (Bool, bc′[i+= 1 , I])) << (j & 63 )
994
- end
995
- @inbounds destc[indc+= 1 ] = remain
996
- bitst, remain = 0 , UInt64 (0 )
997
- end
998
- while i <= last (ax1) - 64
999
- z = UInt64 (0 )
1000
- @inbounds @simd for j = 0 : 63
1001
- z |= UInt64 (convert (Bool, bc′[i+= 1 , I])) << (j & 63 )
1002
- end
1003
- @inbounds destc[indc+= 1 ] = z
1004
- end
1013
# New version: eachindex(bc′) is split into runs of at most bitcache_size
# indices, and each run is evaluated into tmp[1], tmp[2], ...
+ @inbounds for P in Iterators. partition (eachindex (bc′), bitcache_size)
1014
+ ind = 1
1015
+ @simd for I in P
1016
+ tmp[ind] = bc′[I]
1017
+ ind += 1
1005
1018
end
1006
- @inbounds @simd for j = i+ 1 : last (ax1)
1007
- remain |= UInt64 (convert (Bool, bc′[j, I])) << (bitst & 63 )
1008
- bitst += 1
1019
# Slots of tmp not written by this run (a run shorter than bitcache_size) are
# set to false before the buffer is handed on.
+ @simd for i in ind: bitcache_size
1020
+ tmp[i] = false
1009
1021
end
1010
- end
1011
- @inbounds if bitst != 0
1012
- destc[indc+= 1 ] = remain
1022
# NOTE(review): dumpbitcache is defined elsewhere; presumably it packs tmp into
# destc starting at chunk cind, with bitcache_chunks chunks per full buffer - confirm.
+ dumpbitcache (destc, cind, tmp)
1023
+ cind += bitcache_chunks
1013
1024
end
1014
1025
return dest
1015
1026
end
0 commit comments