diff --git a/src/int/specialized_div_rem/delegate.rs b/src/int/specialized_div_rem/delegate.rs
index 8310c1429..135d3402a 100644
--- a/src/int/specialized_div_rem/delegate.rs
+++ b/src/int/specialized_div_rem/delegate.rs
@@ -185,3 +185,133 @@ macro_rules! impl_delegate {
         }
     };
 }
+
+/// Returns `n / d` and sets `*rem = n % d`.
+///
+/// This specialization exists because:
+/// - The LLVM backend for 32-bit SPARC cannot compile functions that return `(u128, u128)`,
+///   so we have to use an old-fashioned `&mut u128` argument to return the remainder.
+/// - 64-bit SPARC does not have u64 * u64 => u128 widening multiplication, which makes the
+///   delegate algorithm strategy the only reasonably fast way to perform `u128` division.
+#[doc(hidden)]
+pub fn u128_divide_sparc(duo: u128, div: u128, rem: &mut u128) -> u128 {
+    use super::*;
+    let duo_lo = duo as u64;
+    let duo_hi = (duo >> 64) as u64;
+    let div_lo = div as u64;
+    let div_hi = (div >> 64) as u64;
+
+    match (div_lo == 0, div_hi == 0, duo_hi == 0) {
+        (true, true, _) => zero_div_fn(),
+        (_, false, true) => {
+            *rem = duo;
+            return 0;
+        }
+        (false, true, true) => {
+            let tmp = u64_by_u64_div_rem(duo_lo, div_lo);
+            *rem = tmp.1 as u128;
+            return tmp.0 as u128;
+        }
+        (false, true, false) => {
+            if duo_hi < div_lo {
+                let norm_shift = u64_normalization_shift(div_lo, duo_hi, false);
+                let shl = if norm_shift == 0 {
+                    64 - 1
+                } else {
+                    64 - norm_shift
+                };
+
+                let mut div: u128 = div << shl;
+                let mut pow_lo: u64 = 1 << shl;
+                let mut quo_lo: u64 = 0;
+                let mut duo = duo;
+                loop {
+                    let sub = duo.wrapping_sub(div);
+                    if 0 <= (sub as i128) {
+                        duo = sub;
+                        quo_lo |= pow_lo;
+                        let duo_hi = (duo >> 64) as u64;
+                        if duo_hi == 0 {
+                            let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
+                            *rem = tmp.1 as u128;
+                            return (quo_lo | tmp.0) as u128;
+                        }
+                    }
+                    div >>= 1;
+                    pow_lo >>= 1;
+                }
+            } else if duo_hi == div_lo {
+                let tmp = u64_by_u64_div_rem(duo as u64, div as u64);
+                *rem = tmp.1 as u128;
+                return (1 << 64) | (tmp.0 as u128);
+            } else {
+                if (div_lo >> 32) == 0 {
+                    let div_0 = div_lo as u32 as u64;
+                    let (quo_hi, rem_3) = u64_by_u64_div_rem(duo_hi, div_0);
+
+                    let duo_mid = ((duo >> 32) as u32 as u64) | (rem_3 << 32);
+                    let (quo_1, rem_2) = u64_by_u64_div_rem(duo_mid, div_0);
+
+                    let duo_lo = (duo as u32 as u64) | (rem_2 << 32);
+                    let (quo_0, rem_1) = u64_by_u64_div_rem(duo_lo, div_0);
+
+                    *rem = rem_1 as u128;
+                    return (quo_0 as u128) | ((quo_1 as u128) << 32) | ((quo_hi as u128) << 64);
+                }
+
+                let duo_lo = duo as u64;
+                let tmp = u64_by_u64_div_rem(duo_hi, div_lo);
+                let quo_hi = tmp.0;
+                let mut duo = (duo_lo as u128) | ((tmp.1 as u128) << 64);
+                if duo < div {
+                    *rem = duo;
+                    return (quo_hi as u128) << 64;
+                }
+
+                let mut div: u128 = div << (64 - 1);
+                let mut pow_lo: u64 = 1 << (64 - 1);
+                let mut quo_lo: u64 = 0;
+                loop {
+                    let sub = duo.wrapping_sub(div);
+                    if 0 <= (sub as i128) {
+                        duo = sub;
+                        quo_lo |= pow_lo;
+                        let duo_hi = (duo >> 64) as u64;
+                        if duo_hi == 0 {
+                            let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
+                            *rem = tmp.1 as u128;
+                            return (tmp.0 as u128) | (quo_lo as u128) | ((quo_hi as u128) << 64);
+                        }
+                    }
+                    div >>= 1;
+                    pow_lo >>= 1;
+                }
+            }
+        }
+        (_, false, false) => {
+            if duo < div {
+                *rem = duo;
+                return 0;
+            }
+            let div_original = div;
+            let shl = u64_normalization_shift(duo_hi, div_hi, false);
+            let mut duo = duo;
+            let mut div: u128 = div << shl;
+            let mut pow_lo: u64 = 1 << shl;
+            let mut quo_lo: u64 = 0;
+            loop {
+                let sub = duo.wrapping_sub(div);
+                if 0 <= (sub as i128) {
+                    duo = sub;
+                    quo_lo |= pow_lo;
+                    if duo < div_original {
+                        *rem = duo;
+                        return quo_lo as u128;
+                    }
+                }
+                div >>= 1;
+                pow_lo >>= 1;
+            }
+        }
+    }
+}
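For readers new to the algorithm: every loop in `u128_divide_sparc` above is restoring binary long division. Normalize the divisor so its most significant set bit lines up with the dividend's, then walk the shift back down, subtracting the shifted divisor and setting a quotient bit whenever it still fits. Below is a minimal sketch of that loop shape, scaled down to `u32`; the helper name is illustrative and the code is not part of the patch.

    // Same shift-and-subtract loop as `u128_divide_sparc`, at u32 width so it
    // is easy to trace. Returns (quotient, remainder).
    fn u32_div_rem_binary_long(mut duo: u32, div: u32) -> (u32, u32) {
        assert!(div != 0, "division by zero");
        if duo < div {
            return (0, duo);
        }
        // Normalization: align the most significant set bits of `div` and `duo`.
        let shl = div.leading_zeros() - duo.leading_zeros();
        let mut div = div << shl;
        let mut pow: u32 = 1 << shl;
        let mut quo: u32 = 0;
        loop {
            // If the shifted divisor still fits, subtract it and set the
            // corresponding quotient bit. The sign test is sound because the
            // loop maintains the invariant `duo < 2 * div`.
            let sub = duo.wrapping_sub(div);
            if (sub as i32) >= 0 {
                duo = sub;
                quo |= pow;
            }
            if pow == 1 {
                return (quo, duo);
            }
            div >>= 1;
            pow >>= 1;
        }
    }

For example, `u32_div_rem_binary_long(100, 7)` normalizes `7` up to `112`, then peels off quotient bits `8 + 4 + 2` and returns `(14, 2)`.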
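The `(div_lo >> 32) == 0` branch above is schoolbook short division instead: when the divisor fits in half a machine word, the dividend can be divided digit by digit from the top, feeding each partial remainder into the next digit. The same idea at a smaller width (illustrative, not part of the patch):

    // u32 / u8 done in u16-by-u16 steps, the way the patch does u128 / u32 in
    // u64-by-u64 steps. Each partial remainder is less than `div`, so
    // `(rem << 8) | digit` cannot overflow u16, and each partial quotient
    // fits back into its 8-bit digit slot.
    fn u32_div_rem_by_u8(duo: u32, div: u8) -> (u32, u8) {
        assert!(div != 0, "division by zero");
        let div = div as u16;
        let mut quo: u32 = 0;
        let mut rem: u16 = 0;
        // Process the four 8-bit digits from most significant to least.
        for i in (0..4).rev() {
            let part = (rem << 8) | (duo >> (8 * i)) as u8 as u16;
            quo |= ((part / div) as u32) << (8 * i);
            rem = part % div;
        }
        (quo, rem as u8)
    }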
diff --git a/src/int/specialized_div_rem/mod.rs b/src/int/specialized_div_rem/mod.rs
index 7f37f6220..eaeb030e3 100644
--- a/src/int/specialized_div_rem/mod.rs
+++ b/src/int/specialized_div_rem/mod.rs
@@ -46,6 +46,7 @@ mod binary_long;
 
 #[macro_use]
 mod delegate;
+pub use self::delegate::u128_divide_sparc;
 
 #[macro_use]
 mod trifecta;
@@ -60,27 +61,31 @@ fn zero_div_fn() -> ! {
     unsafe { core::hint::unreachable_unchecked() }
 }
 
-// The `B` extension on RISC-V determines if a CLZ assembly instruction exists
-#[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
-const USE_LZ: bool = cfg!(target_feature = "b");
-
-#[cfg(target_arch = "arm")]
-const USE_LZ: bool = if cfg!(target_feature = "thumb-mode") {
-    // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is supported. This
-    // is needed to successfully differentiate between targets like `thumbv8.base` and
-    // `thumbv8.main`.
-    cfg!(target_feature = "v6t2")
-} else {
-    // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is supported.
-    // Technically, ARMv5T was the first to have CLZ, but the "v5t" target feature does not seem to
-    // work.
-    cfg!(target_feature = "v5te")
+const USE_LZ: bool = {
+    if cfg!(target_arch = "arm") {
+        if cfg!(target_feature = "thumb-mode") {
+            // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is
+            // supported. This is needed to successfully differentiate between targets like
+            // `thumbv8m.base` and `thumbv8m.main`.
+            cfg!(target_feature = "v6t2")
+        } else {
+            // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is
+            // supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target
+            // feature does not seem to work.
+            cfg!(target_feature = "v5te")
+        }
+    } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
+        // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later.
+        cfg!(target_feature = "vis3")
+    } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
+        // The `B` extension on RISC-V determines if a CLZ assembly instruction exists
+        cfg!(target_feature = "b")
+    } else {
+        // All other common targets Rust supports should have CLZ instructions
+        true
+    }
 };
 
-// All other targets Rust supports have CLZ instructions
-#[cfg(not(any(target_arch = "arm", target_arch = "riscv32", target_arch = "riscv64")))]
-const USE_LZ: bool = true;
-
 impl_normalization_shift!(
     u32_normalization_shift,
     USE_LZ,
@@ -115,8 +120,9 @@ fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
 // microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
 // faster if the target pointer width is at least 64.
 #[cfg(all(
+    not(any(target_pointer_width = "16", target_pointer_width = "32")),
     not(all(not(feature = "no-asm"), target_arch = "x86_64")),
-    not(any(target_pointer_width = "16", target_pointer_width = "32"))
+    not(any(target_arch = "sparc", target_arch = "sparc64"))
 ))]
 impl_trifecta!(
     u128_div_rem,
@@ -131,8 +137,9 @@ impl_trifecta!(
 // If the pointer width less than 64, then the target architecture almost certainly does not have
 // the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster.
 #[cfg(all(
+    any(target_pointer_width = "16", target_pointer_width = "32"),
     not(all(not(feature = "no-asm"), target_arch = "x86_64")),
-    any(target_pointer_width = "16", target_pointer_width = "32")
+    not(any(target_arch = "sparc", target_arch = "sparc64"))
 ))]
 impl_delegate!(
     u128_div_rem,
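Collapsing the three mutually exclusive `#[cfg]`-attributed `USE_LZ` constants into a single `const` block works because `cfg!(...)` expands to a plain `true`/`false` literal, so the whole `if`/`else if` chain folds at compile time and the final `else` guarantees every target gets exactly one definition. The pattern in miniature (illustrative values, not from the patch):

    // `cfg!` is an ordinary boolean after macro expansion, so it can drive
    // `if`/`else` inside a `const` initializer; exactly one branch survives
    // constant folding on any given target.
    const POINTER_BITS: u32 = {
        if cfg!(target_pointer_width = "16") {
            16
        } else if cfg!(target_pointer_width = "32") {
            32
        } else {
            64
        }
    };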
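As for what `USE_LZ` actually selects: the normalization shift is the distance between the most significant set bits of the dividend and divisor, which is two `leading_zeros` calls when a CLZ instruction is cheap, and a slower search with shifts and comparisons when it is not. A sketch of the quantity itself (an illustrative helper, not the real `impl_normalization_shift!` expansion):

    // Distance between the most significant set bits, assuming `duo >= div`.
    // With a hardware CLZ instruction, `leading_zeros` is a single instruction.
    fn normalization_shift(duo: u64, div: u64) -> u32 {
        debug_assert!(div != 0 && duo >= div);
        div.leading_zeros() - duo.leading_zeros()
    }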
diff --git a/src/int/udiv.rs b/src/int/udiv.rs
index 3cd9be93c..d97178078 100644
--- a/src/int/udiv.rs
+++ b/src/int/udiv.rs
@@ -1,3 +1,4 @@
+pub use int::specialized_div_rem::u128_divide_sparc;
 use int::specialized_div_rem::*;
 
 intrinsics! {
@@ -46,25 +47,50 @@ intrinsics! {
         quo_rem.0
     }
 
+    // Note: we use `#[cfg]` blocks and not `if cfg!(...)`, because we need to entirely disable
+    // the existence of `u128_div_rem` to get 32-bit SPARC to compile; see the `u128_divide_sparc` docs.
+
     #[win64_128bit_abi_hack]
     /// Returns `n / d`
     pub extern "C" fn __udivti3(n: u128, d: u128) -> u128 {
-        u128_div_rem(n, d).0
+        #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
+            u128_div_rem(n, d).0
+        }
+        #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
+            u128_divide_sparc(n, d, &mut 0)
+        }
     }
 
     #[win64_128bit_abi_hack]
     /// Returns `n % d`
     pub extern "C" fn __umodti3(n: u128, d: u128) -> u128 {
-        u128_div_rem(n, d).1
+        #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
+            u128_div_rem(n, d).1
+        }
+        #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
+            let mut rem = 0;
+            u128_divide_sparc(n, d, &mut rem);
+            rem
+        }
     }
 
     #[win64_128bit_abi_hack]
     /// Returns `n / d` and sets `*rem = n % d`
    pub extern "C" fn __udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 {
-        let quo_rem = u128_div_rem(n, d);
-        if let Some(rem) = rem {
-            *rem = quo_rem.1;
+        #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
+            let quo_rem = u128_div_rem(n, d);
+            if let Some(rem) = rem {
+                *rem = quo_rem.1;
+            }
+            quo_rem.0
+        }
+        #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
+            let mut tmp = 0;
+            let quo = u128_divide_sparc(n, d, &mut tmp);
+            if let Some(rem) = rem {
+                *rem = tmp;
+            }
+            quo
         }
-        quo_rem.0
     }
 }
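A quick smoke test for the rewritten intrinsics (a sketch, assuming it is placed somewhere `__udivmodti4` is in scope; the values are arbitrary):

    #[test]
    fn udivmodti4_smoke() {
        // 3 * 2^64 + 7 divided by 2^64 is quotient 3, remainder 7. This
        // exercises whichever of the two `#[cfg]` paths was compiled in.
        let n: u128 = (3u128 << 64) + 7;
        let d: u128 = 1u128 << 64;
        let mut rem = 0u128;
        let quo = __udivmodti4(n, d, Some(&mut rem));
        assert_eq!((quo, rem), (3, 7));
    }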