diff --git a/.travis.yml b/.travis.yml
index c91e4f34de..f97609447c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,7 +8,7 @@ matrix:
     - env: TARGET=i586-unknown-linux-gnu
     - env: TARGET=i686-unknown-linux-gnu
     - env: TARGET=x86_64-unknown-linux-gnu NO_ADD=1
-    - env: TARGET=x86_64-unknown-linux-gnu-emulated NO_ADD=1 STDSIMD_TEST_EVERYTHING=1
+    - env: TARGET=x86_64-unknown-linux-gnu-emulated NO_ADD=1 STDSIMD_TEST_EVERYTHING=1 FEATURES="intel_sde"
     - env: TARGET=arm-unknown-linux-gnueabihf
     - env: TARGET=armv7-unknown-linux-gnueabihf
     - env: TARGET=aarch64-unknown-linux-gnu
@@ -33,7 +33,7 @@ install:
 
 script:
   - cargo generate-lockfile
-  - ci/run-docker.sh $TARGET
+  - ci/run-docker.sh $TARGET $FEATURES
 
 notifications:
   email:
diff --git a/Cargo.toml b/Cargo.toml
index 874f9d4f08..16c995e7ad 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -28,8 +28,9 @@ opt-level = 3
 
 [dev-dependencies]
 stdsimd-test = { version = "0.*", path = "stdsimd-test" }
-cupid = "0.3"
+cupid = "0.4.0"
 
 [features]
 strict = []
-std = []
\ No newline at end of file
+std = []
+intel_sde = []
\ No newline at end of file
diff --git a/ci/docker/aarch64-unknown-linux-gnu/Dockerfile b/ci/docker/aarch64-unknown-linux-gnu/Dockerfile
index 4e3bff0ac0..68261a2f03 100644
--- a/ci/docker/aarch64-unknown-linux-gnu/Dockerfile
+++ b/ci/docker/aarch64-unknown-linux-gnu/Dockerfile
@@ -8,6 +8,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
   qemu-user \
   make \
   file
+
 ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc \
     CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER="qemu-aarch64 -L /usr/aarch64-linux-gnu" \
     OBJDUMP=aarch64-linux-gnu-objdump
diff --git a/ci/run-docker.sh b/ci/run-docker.sh
index d5ea59e4cd..d08a164be3 100755
--- a/ci/run-docker.sh
+++ b/ci/run-docker.sh
@@ -4,10 +4,11 @@
 set -ex
 
 run() {
-    echo $1
+    echo "Building docker container for TARGET=${1}"
     docker build -t stdsimd ci/docker/$1
     mkdir -p target
     target=$(echo $1 | sed 's/-emulated//')
+    echo "Running docker"
     docker run \
       --user `id -u`:`id -g` \
       --rm \
@@ -16,6 +17,7 @@ run() {
       --env CARGO_HOME=/cargo \
       --volume `rustc --print sysroot`:/rust:ro \
       --env TARGET=$target \
+      --env FEATURES=$2 \
       --env STDSIMD_TEST_EVERYTHING \
       --volume `pwd`:/checkout:ro \
       --volume `pwd`/target:/checkout/target \
@@ -31,5 +33,5 @@ if [ -z "$1" ]; then
     run $d
   done
 else
-  run $1
+  run $1 $2
 fi
diff --git a/ci/run.sh b/ci/run.sh
index f4d3382cb0..8c8f18c37d 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -15,10 +15,15 @@ case ${TARGET} in
         ;;
 esac
 
+FEATURES="strict,$FEATURES"
+FEATURES_STD="${FEATURES},std"
+
 echo "RUSTFLAGS=${RUSTFLAGS}"
+echo "FEATURES=${FEATURES}"
+echo "OBJDUMP=${OBJDUMP}"
 
-cargo test --target $TARGET --features "strict"
-cargo test --release --target $TARGET --features "strict"
+cargo test --target $TARGET --features $FEATURES --verbose -- --nocapture
+cargo test --release --target $TARGET --features $FEATURES --verbose -- --nocapture
 
-cargo test --target $TARGET --features "strict,std"
-cargo test --release --target $TARGET --features "strict,std"
+cargo test --target $TARGET --features $FEATURES_STD --verbose -- --nocapture
+cargo test --release --target $TARGET --features $FEATURES_STD --verbose -- --nocapture
diff --git a/src/lib.rs b/src/lib.rs
index ed18b00d3f..add34c2742 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -121,7 +121,7 @@
 #![feature(const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd,
            simd_ffi, target_feature, cfg_target_feature, i128_type, asm,
            const_atomic_usize_new, stmt_expr_attributes)]
-#![cfg_attr(test, feature(proc_macro, test))]
+#![cfg_attr(test, feature(proc_macro, test, repr_align, attr_literals))]
 #![cfg_attr(feature = "cargo-clippy",
             allow(inline_always, too_many_arguments, cast_sign_loss,
                   cast_lossless, cast_possible_wrap,
@@ -159,8 +159,29 @@ pub mod vendor {
     pub use aarch64::*;
 
     pub use nvptx::*;
+
+    #[cfg(any(
+        // x86/x86_64:
+        any(target_arch = "x86", target_arch = "x86_64"),
+        // linux + std + (arm|aarch64):
+        all(target_os = "linux",
+            feature = "std",
+            any(target_arch = "arm", target_arch = "aarch64"))
+    ))]
+    pub use runtime::{__unstable_detect_feature, __Feature};
 }
 
+#[cfg(any(
+    // x86/x86_64:
+    any(target_arch = "x86", target_arch = "x86_64"),
+    // linux + std + (arm|aarch64):
+    all(target_os = "linux",
+        feature = "std",
+        any(target_arch = "arm", target_arch = "aarch64"))
+))]
+#[macro_use]
+mod runtime;
+
 #[macro_use]
 mod macros;
 mod simd_llvm;
@@ -204,7 +225,6 @@ mod v16 {
 }
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-#[macro_use]
 mod x86;
 
 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
diff --git a/src/macros.rs b/src/macros.rs
index c2009fa939..563e196b64 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -373,56 +373,6 @@ macro_rules! define_casts {
     }
 }
 
-/// Is a feature supported by the host CPU?
-///
-/// This macro performs run-time feature detection. It returns true if the host
-/// CPU in which the binary is running on supports a particular feature.
-#[macro_export]
-macro_rules! cfg_feature_enabled {
-    ($name:tt) => (
-        {
-            #[cfg(target_feature = $name)]
-            {
-                true
-            }
-            #[cfg(not(target_feature = $name))]
-            {
-                __unstable_detect_feature!($name)
-            }
-        }
-    )
-}
-
-/// On ARM features are only detected at compile-time using
-/// cfg(target_feature), so if this macro is executed the
-/// feature is not supported.
-#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
-#[macro_export]
-#[doc(hidden)]
-macro_rules! __unstable_detect_feature {
-    ("neon") => { false };
-    ($t:tt) => { compile_error!(concat!("unknown target feature: ", $t)) };
-}
-
-/// In all unsupported architectures using the macro is an error
-#[cfg(not(any(target_arch = "x86", target_arch = "x86_64",
-              target_arch = "arm", target_arch = "aarch64")))]
-#[macro_export]
-#[doc(hidden)]
-macro_rules! __unstable_detect_feature {
-    ($t:tt) => { compile_error!(concat!("unknown target feature: ", $t)) };
-}
-
-#[cfg(test)]
-mod tests {
-    #[cfg(target_arch = "x86_64")]
-    #[test]
-    fn test_macros() {
-        assert!(cfg_feature_enabled!("sse"));
-    }
-}
-
-
 #[cfg(test)]
 #[macro_export]
 macro_rules! test_arithmetic_ {
diff --git a/src/runtime/aarch64.rs b/src/runtime/aarch64.rs
new file mode 100644
index 0000000000..5d10fc06e3
--- /dev/null
+++ b/src/runtime/aarch64.rs
@@ -0,0 +1,56 @@
+//! Run-time feature detection on ARM Aarch64.
+use super::{bit, linux};
+
+#[macro_export]
+#[doc(hidden)]
+macro_rules! __unstable_detect_feature {
+    ("neon") => {
+        // FIXME: this should be removed once we rename Aarch64 neon to asimd
+        $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::asimd{})
+    };
+    ("asimd") => {
+        $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::asimd{})
+    };
+    ("pmull") => {
+        $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::pmull{})
+    };
+    ($t:tt) => { compile_error!(concat!("unknown arm target feature: ", $t)) };
+}
+
+/// ARM Aarch64 CPU Feature enum. Each variant denotes a position in a bitset
+/// for a particular feature.
+///
+/// PLEASE: do not use this, it is an implementation detail subject to change.
+#[doc(hidden)]
+#[allow(non_camel_case_types)]
+#[repr(u8)]
+pub enum __Feature {
+    /// ARM Advanced SIMD (ASIMD) - Aarch64
+    asimd,
+    /// Polynomial Multiply
+    pmull,
+}
+
+pub fn detect_features<T: linux::FeatureQuery>(mut x: T) -> usize {
+    let value: usize = 0;
+    {
+        let mut enable_feature = |f| {
+            if x.has_feature(&f) {
+                bit::set(value, f as u32);
+            }
+        };
+        enable_feature(__Feature::asimd);
+        enable_feature(__Feature::pmull);
+    }
+    value
+}
+
+impl linux::FeatureQuery for linux::CpuInfo {
+    fn has_feature(&mut self, x: &__Feature) -> bool {
+        use self::__Feature::*;
+        match *x {
+            asimd => self.field("Features").has("asimd"),
+            pmull => self.field("Features").has("pmull"),
+        }
+    }
+}
diff --git a/src/runtime/arm.rs b/src/runtime/arm.rs
new file mode 100644
index 0000000000..60ef909fca
--- /dev/null
+++ b/src/runtime/arm.rs
@@ -0,0 +1,66 @@
+//! Run-time feature detection on ARM Aarch32.
+
+use super::{bit, linux};
+
+#[macro_export]
+#[doc(hidden)]
+macro_rules! __unstable_detect_feature {
+    ("neon") => {
+        $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::neon{})
+    };
+    ("pmull") => {
+        $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::pmull{})
+    };
+    ($t:tt) => { compile_error!(concat!("unknown arm target feature: ", $t)) };
+}
+
+/// ARM CPU Feature enum. Each variant denotes a position in a bitset for a
+/// particular feature.
+///
+/// PLEASE: do not use this, it is an implementation detail subject to change.
+#[doc(hidden)]
+#[allow(non_camel_case_types)]
+#[repr(u8)]
+pub enum __Feature {
+    /// ARM Advanced SIMD (NEON) - Aarch32
+    neon,
+    /// Polynomial Multiply
+    pmull,
+}
+
+pub fn detect_features<T: linux::FeatureQuery>(mut x: T) -> usize {
+    let value: usize = 0;
+    {
+        let mut enable_feature = |f| {
+            if x.has_feature(&f) {
+                bit::set(value, f as u32);
+            }
+        };
+        enable_feature(__Feature::neon);
+        enable_feature(__Feature::pmull);
+    }
+    value
+}
+
+/// Is the CPU known to have a broken NEON unit?
+///
+/// See https://crbug.com/341598.
+fn has_broken_neon(cpuinfo: &linux::CpuInfo) -> bool {
+    cpuinfo.field("CPU implementer") == "0x51"
+        && cpuinfo.field("CPU architecture") == "7"
+        && cpuinfo.field("CPU variant") == "0x1"
+        && cpuinfo.field("CPU part") == "0x04d"
+        && cpuinfo.field("CPU revision") == "0"
+}
+
+impl linux::FeatureQuery for linux::CpuInfo {
+    fn has_feature(&mut self, x: &__Feature) -> bool {
+        use self::__Feature::*;
+        match *x {
+            neon => {
+                self.field("Features").has("neon") && !has_broken_neon(self)
+            }
+            pmull => self.field("Features").has("pmull"),
+        }
+    }
+}
diff --git a/src/runtime/bit.rs b/src/runtime/bit.rs
new file mode 100644
index 0000000000..42483e5225
--- /dev/null
+++ b/src/runtime/bit.rs
@@ -0,0 +1,11 @@
+//! Bit manipulation utilities
+
+/// Sets the `bit` of `x`.
+pub const fn set(x: usize, bit: u32) -> usize {
+    x | 1 << bit
+}
+
+/// Tests the `bit` of `x`.
+pub const fn test(x: usize, bit: u32) -> bool {
+    x & (1 << bit) != 0
+}
diff --git a/src/runtime/cache.rs b/src/runtime/cache.rs
new file mode 100644
index 0000000000..6ec39e98e8
--- /dev/null
+++ b/src/runtime/cache.rs
@@ -0,0 +1,29 @@
+//! Cache of run-time feature detection
+
+use super::bit;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+/// This global variable is a bitset used to cache the features supported by
+/// the
+/// CPU.
+static CACHE: AtomicUsize = AtomicUsize::new(::std::usize::MAX);
+
+/// Test the `bit` of the storage. If the storage has not been initialized,
+/// initializes it with the result of `f()`.
+///
+/// On its first invocation, it detects the CPU features and caches them in the
+/// `FEATURES` global variable as an `AtomicUsize`.
+///
+/// It uses the `__Feature` variant to index into this variable as a bitset. If
+/// the bit is set, the feature is enabled, and otherwise it is disabled.
+///
+/// PLEASE: do not use this, it is an implementation detail subject to change.
+pub fn test<F>(bit: u32, f: F) -> bool
+where
+    F: FnOnce() -> usize,
+{
+    if CACHE.load(Ordering::Relaxed) == ::std::usize::MAX {
+        CACHE.store(f(), Ordering::Relaxed);
+    }
+    bit::test(CACHE.load(Ordering::Relaxed), bit)
+}
diff --git a/src/runtime/linux/cpuinfo.rs b/src/runtime/linux/cpuinfo.rs
new file mode 100644
index 0000000000..0b18c41cef
--- /dev/null
+++ b/src/runtime/linux/cpuinfo.rs
@@ -0,0 +1,211 @@
+//! Reads /proc/cpuinfo on Linux systems
+
+/// cpuinfo
+pub struct CpuInfo {
+    raw: String,
+}
+
+/// Field of cpuinfo
+#[derive(Debug)]
+pub struct CpuInfoField<'a>(Option<&'a str>);
+
+impl<'a> PartialEq<&'a str> for CpuInfoField<'a> {
+    fn eq(&self, other: &&'a str) -> bool {
+        match self.0 {
+            None => other.len() == 0,
+            Some(f) => f == other.trim(),
+        }
+    }
+}
+
+impl<'a> CpuInfoField<'a> {
+    pub fn new<'b>(v: Option<&'b str>) -> CpuInfoField<'b> {
+        match v {
+            None => CpuInfoField::<'b>(None),
+            Some(f) => CpuInfoField::<'b>(Some(f.trim())),
+        }
+    }
+    /// Does the field exist?
+    pub fn exists(&self) -> bool {
+        self.0.is_some()
+    }
+    /// Does the field contain `other`?
+    pub fn has(&self, other: &str) -> bool {
+        match self.0 {
+            None => other.len() == 0,
+            Some(f) => {
+                let other = other.trim();
+                for v in f.split(" ") {
+                    if v == other {
+                        return true;
+                    }
+                }
+                false
+            }
+        }
+    }
+}
+
+impl CpuInfo {
+    /// Reads /proc/cpuinfo into CpuInfo.
+    pub fn new() -> Result<CpuInfo, ::std::io::Error> {
+        use std::io::Read;
+        let mut file = ::std::fs::File::open("/proc/cpuinfo")?;
+        let mut cpui = CpuInfo { raw: String::new() };
+        file.read_to_string(&mut cpui.raw)?;
+        Ok(cpui)
+    }
+    /// Returns the value of the cpuinfo `field`.
+    pub fn field(&self, field: &str) -> CpuInfoField {
+        for l in self.raw.lines() {
+            if l.trim().starts_with(field) {
+                return CpuInfoField(l.split(": ").skip(1).next());
+            }
+        }
+        CpuInfoField(None)
+    }
+
+    /// Returns the `raw` contents of `/proc/cpuinfo`
+    fn raw(&self) -> &String {
+        &self.raw
+    }
+
+    fn from_str(other: &str) -> Result<CpuInfo, ::std::io::Error> {
+        Ok(CpuInfo {
+            raw: String::from(other),
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[cfg(target_os = "linux")]
+    #[test]
+    fn test_cpuinfo_linux() {
+        let cpuinfo = CpuInfo::new().unwrap();
+        if cpuinfo.field("vendor_id") == "GenuineIntel" {
+            assert!(cpuinfo.field("flags").exists());
+            assert!(!cpuinfo.field("vendor33_id").exists());
+            assert!(cpuinfo.field("flags").has("sse"));
+            assert!(!cpuinfo.field("flags").has("avx314"));
+        }
+        println!("{}", cpuinfo.raw());
+    }
+
+
+    const CORE_DUO_T6500: &str = r"processor       : 0
+vendor_id       : GenuineIntel
+cpu family      : 6
+model           : 23
+model name      : Intel(R) Core(TM)2 Duo CPU     T6500  @ 2.10GHz
+stepping        : 10
+microcode       : 0xa0b
+cpu MHz         : 1600.000
+cache size      : 2048 KB
+physical id     : 0
+siblings        : 2
+core id         : 0
+cpu cores       : 2
+apicid          : 0
+initial apicid  : 0
+fdiv_bug        : no
+hlt_bug         : no
+f00f_bug        : no
+coma_bug        : no
+fpu             : yes
+fpu_exception   : yes
+cpuid level     : 13
+wp              : yes
+flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe nx lm constant_tsc arch_perfmon pebs bts aperfmperf pni dtes64 monitor ds_cpl est tm2 ssse3 cx16 xtpr pdcm sse4_1 xsave lahf_lm dtherm
+bogomips        : 4190.43
+clflush size    : 64
+cache_alignment : 64
+address sizes   : 36 bits physical, 48 bits virtual
+power management:
+";
+
+    #[test]
+    fn test_cpuinfo_linux_core_duo_t6500() {
+        let cpuinfo = CpuInfo::from_str(CORE_DUO_T6500).unwrap();
+        assert_eq!(cpuinfo.field("vendor_id"), "GenuineIntel");
+        assert_eq!(cpuinfo.field("cpu family"), "6");
+        assert_eq!(cpuinfo.field("model"), "23");
+        assert_eq!(
+            cpuinfo.field("model name"),
+            "Intel(R) Core(TM)2 Duo CPU     T6500  @ 2.10GHz"
+        );
+        assert_eq!(
+            cpuinfo.field("flags"),
+            "fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe nx lm constant_tsc arch_perfmon pebs bts aperfmperf pni dtes64 monitor ds_cpl est tm2 ssse3 cx16 xtpr pdcm sse4_1 xsave lahf_lm dtherm"
+        );
+        assert!(cpuinfo.field("flags").has("fpu"));
+        assert!(cpuinfo.field("flags").has("dtherm"));
+        assert!(cpuinfo.field("flags").has("sse2"));
+        assert!(!cpuinfo.field("flags").has("avx"));
+    }
+
+    const ARM_CORTEX_A53: &str = r"Processor   : AArch64 Processor rev 3 (aarch64)
+        processor   : 0
+        processor   : 1
+        processor   : 2
+        processor   : 3
+        processor   : 4
+        processor   : 5
+        processor   : 6
+        processor   : 7
+        Features    : fp asimd evtstrm aes pmull sha1 sha2 crc32
+        CPU implementer : 0x41
+        CPU architecture: AArch64
+        CPU variant : 0x0
+        CPU part    : 0xd03
+        CPU revision    : 3
+
+        Hardware    : HiKey Development Board
+        ";
+
+    #[test]
+    fn test_cpuinfo_linux_arm_cortex_a53() {
+        let cpuinfo = CpuInfo::from_str(ARM_CORTEX_A53).unwrap();
+        assert_eq!(
+            cpuinfo.field("Processor"),
+            "AArch64 Processor rev 3 (aarch64)"
+        );
+        assert_eq!(
+            cpuinfo.field("Features"),
+            "fp asimd evtstrm aes pmull sha1 sha2 crc32"
+        );
+        assert!(cpuinfo.field("Features").has("pmull"));
+        assert!(!cpuinfo.field("Features").has("neon"));
+        assert!(cpuinfo.field("Features").has("asimd"));
+    }
+
+    const ARM_CORTEX_A57: &str = r"Processor	: Cortex A57 Processor rev 1 (aarch64)
+processor	: 0
+processor	: 1
+processor	: 2
+processor	: 3
+Features	: fp asimd aes pmull sha1 sha2 crc32 wp half thumb fastmult vfp edsp neon vfpv3 tlsi vfpv4 idiva idivt
+CPU implementer	: 0x41
+CPU architecture: 8
+CPU variant	: 0x1
+CPU part	: 0xd07
+CPU revision	: 1";
+
+    #[test]
+    fn test_cpuinfo_linux_arm_cortex_a57() {
+        let cpuinfo = CpuInfo::from_str(ARM_CORTEX_A57).unwrap();
+        assert_eq!(
+            cpuinfo.field("Processor"),
+            "Cortex A57 Processor rev 1 (aarch64)"
+        );
+        assert_eq!(
+            cpuinfo.field("Features"),
+            "fp asimd aes pmull sha1 sha2 crc32 wp half thumb fastmult vfp edsp neon vfpv3 tlsi vfpv4 idiva idivt"
+        );
+        assert!(cpuinfo.field("Features").has("pmull"));
+        assert!(cpuinfo.field("Features").has("neon"));
+        assert!(cpuinfo.field("Features").has("asimd"));
+    }
+}
diff --git a/src/runtime/linux/mod.rs b/src/runtime/linux/mod.rs
new file mode 100644
index 0000000000..6625152baf
--- /dev/null
+++ b/src/runtime/linux/mod.rs
@@ -0,0 +1,31 @@
+//! Run-time feature detection for ARM on linux
+mod cpuinfo;
+pub use self::cpuinfo::CpuInfo;
+
+use super::__Feature;
+
+pub trait FeatureQuery {
+    fn has_feature(&mut self, x: &__Feature) -> bool;
+}
+
+fn detect_features_impl<T: FeatureQuery>(x: T) -> usize {
+    #[cfg(target_arch = "arm")]
+    {
+        super::arm::detect_features(x)
+    }
+    #[cfg(target_arch = "aarch64")]
+    {
+        super::aarch64::detect_features(x)
+    }
+}
+
+/// Detects ARM features:
+pub fn detect_features() -> usize {
+    // FIXME: use libc::getauxval, and if that fails /proc/auxv
+    // Try to read /proc/cpuinfo
+    if let Ok(v) = cpuinfo::CpuInfo::new() {
+        return detect_features_impl(v);
+    }
+    // Otherwise all features are disabled
+    0
+}
diff --git a/src/runtime/macros.rs b/src/runtime/macros.rs
new file mode 100644
index 0000000000..e8278bb295
--- /dev/null
+++ b/src/runtime/macros.rs
@@ -0,0 +1,39 @@
+//! Run-time feature detection macros.
+
+/// Is a feature supported by the host CPU?
+///
+/// This macro performs run-time feature detection. It returns true if the host
+/// CPU in which the binary is running on supports a particular feature.
+#[macro_export]
+macro_rules! cfg_feature_enabled {
+    ($name:tt) => (
+        {
+            #[cfg(target_feature = $name)]
+            {
+                true
+            }
+            #[cfg(not(target_feature = $name))]
+            {
+                __unstable_detect_feature!($name)
+            }
+        }
+    )
+}
+
+/// In all unsupported architectures using the macro is an error
+#[cfg(not(any(target_arch = "x86", target_arch = "x86_64",
+              target_arch = "arm", target_arch = "aarch64")))]
+#[macro_export]
+#[doc(hidden)]
+macro_rules! __unstable_detect_feature {
+    ($t:tt) => { compile_error!(concat!("unknown target feature: ", $t)) };
+}
+
+#[cfg(test)]
+mod tests {
+    #[cfg(target_arch = "x86_64")]
+    #[test]
+    fn test_macros() {
+        assert!(cfg_feature_enabled!("sse"));
+    }
+}
diff --git a/src/runtime/mod.rs b/src/runtime/mod.rs
new file mode 100644
index 0000000000..a48b7b20ce
--- /dev/null
+++ b/src/runtime/mod.rs
@@ -0,0 +1,40 @@
+//! Run-time feature detection
+mod cache;
+mod bit;
+
+#[macro_use]
+mod macros;
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[macro_use]
+mod x86;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+pub use self::x86::__Feature;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+use self::x86::detect_features;
+
+#[cfg(all(target_arch = "arm", target_os = "linux", feature = "std"))]
+#[macro_use]
+mod arm;
+#[cfg(all(target_arch = "arm", target_os = "linux", feature = "std"))]
+pub use self::arm::__Feature;
+
+#[cfg(all(target_arch = "aarch64", target_os = "linux", feature = "std"))]
+#[macro_use]
+mod aarch64;
+#[cfg(all(target_arch = "aarch64", target_os = "linux", feature = "std"))]
+pub use self::aarch64::__Feature;
+
+#[cfg(all(feature = "std", target_os = "linux",
+          any(target_arch = "arm", target_arch = "aarch64")))]
+mod linux;
+
+#[cfg(all(feature = "std", target_os = "linux",
+          any(target_arch = "arm", target_arch = "aarch64")))]
+pub use self::linux::detect_features;
+
+/// Performs run-time feature detection.
+#[doc(hidden)]
+pub fn __unstable_detect_feature(x: __Feature) -> bool {
+    cache::test(x as u32, detect_features)
+}
diff --git a/src/runtime/x86.rs b/src/runtime/x86.rs
new file mode 100644
index 0000000000..6d16a5398f
--- /dev/null
+++ b/src/runtime/x86.rs
@@ -0,0 +1,461 @@
+//! This module implements minimal run-time feature detection for x86.
+//!
+//! The features are detected using the `detect_features` function below.
+//! This function uses the CPUID instruction to read the feature flags from the
+//! CPU and encodes them in an `usize` where each bit position represents
+//! whether a feature is available (bit is set) or unavaiable (bit is cleared).
+//!
+//! The enum `__Feature` is used to map bit positions to feature names, and the
+//! the `__unstable_detect_feature!` macro is used to map string literals (e.g.
+//! "avx") to these bit positions (e.g. `__Feature::avx`).
+//!
+//!
+//! The run-time feature detection is performed by the
+//! `__unstable_detect_feature(__Feature) -> bool` function. On its first call,
+//! this functions queries the CPU for the available features and stores them
+//! in a global `AtomicUsize` variable. The query is performed by just checking
+//! whether the feature bit in this global variable is set or cleared.
+
+use super::bit;
+
+/// This macro maps the string-literal feature names to values of the
+/// `__Feature` enum at compile-time. The feature names used are the same as
+/// those of rustc `target_feature` and `cfg_target_feature` features.
+///
+/// PLESE: do not use this, it is an implementation detail subjected to change.
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[macro_export]
+#[doc(hidden)]
+macro_rules! __unstable_detect_feature {
+    ("sse") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::sse{})  };
+    ("sse2") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::sse2{})
+    };
+    ("sse3") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::sse3{})
+    };
+    ("ssse3") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::ssse3{})
+    };
+    ("sse4.1") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::sse4_1{})
+    };
+    ("sse4.2") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::sse4_2{})
+    };
+    ("sse4a") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::sse4a{})
+    };
+    ("avx") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::avx{})
+    };
+    ("avx2") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::avx2{})
+    };
+    ("avx512f") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::avx512f{})
+    };
+    ("avx512cd") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::avx512cd{})
+    };
+    ("avx512er") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::avx512er{})
+    };
+    ("avx512pf") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::avx512pf{})
+    };
+    ("avx512bw") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::avx512bw{})
+    };
+    ("avx512dq") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::avx512dq{})
+    };
+    ("avx512vl") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::avx512vl{})
+    };
+    ("avx512ifma") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::avx512_ifma{})
+    };
+    ("avx512vbmi") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::avx512_vbmi{})
+    };
+    ("avx512vpopcntdq") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::avx512_vpopcntdq{})
+    };
+    ("fma") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::fma{})
+    };
+    ("bmi") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::bmi{})
+    };
+    ("bmi2") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::bmi2{})
+    };
+    ("abm") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::abm{})
+    };
+    ("lzcnt") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::abm{})
+    };
+    ("tbm") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::tbm{})
+    };
+    ("popcnt") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::popcnt{})
+    };
+    ("xsave") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::xsave{})
+    };
+    ("xsaveopt") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::xsaveopt{})
+    };
+    ("xsave") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::xsave{})
+    };
+    ("xsaveopt") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::xsaveopt{})
+    };
+    ("xsaves") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::xsaves{})
+    };
+    ("xsavec") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::xsavec{})
+    };
+    ($t:tt) => {
+        compile_error!(concat!("unknown target feature: ", $t))
+    };
+}
+
+/// X86 CPU Feature enum. Each variant denotes a position in a bitset for a
+/// particular feature.
+///
+/// PLEASE: do not use this, it is an implementation detail subject to change.
+#[doc(hidden)]
+#[allow(non_camel_case_types)]
+#[repr(u8)]
+pub enum __Feature {
+    /// SSE (Streaming SIMD Extensions)
+    sse,
+    /// SSE2 (Streaming SIMD Extensions 2)
+    sse2,
+    /// SSE3 (Streaming SIMD Extensions 3)
+    sse3,
+    /// SSSE3 (Supplemental Streaming SIMD Extensions 3)
+    ssse3,
+    /// SSE4.1 (Streaming SIMD Extensions 4.1)
+    sse4_1,
+    /// SSE4.2 (Streaming SIMD Extensions 4.2)
+    sse4_2,
+    /// SSE4a (Streaming SIMD Extensions 4a)
+    sse4a,
+    /// AVX (Advanced Vector Extensions)
+    avx,
+    /// AVX2 (Advanced Vector Extensions 2)
+    avx2,
+    /// AVX-512 F (Foundation)
+    avx512f,
+    /// AVX-512 CD (Conflict Detection Instructions)
+    avx512cd,
+    /// AVX-512 ER (Exponential and Reciprocal Instructions)
+    avx512er,
+    /// AVX-512 PF (Prefetch Instructions)
+    avx512pf,
+    /// AVX-512 BW (Byte and Word Instructions)
+    avx512bw,
+    /// AVX-512 DQ (Doubleword and Quadword)
+    avx512dq,
+    /// AVX-512 VL (Vector Length Extensions)
+    avx512vl,
+    /// AVX-512 IFMA (Integer Fused Multiply Add)
+    avx512_ifma,
+    /// AVX-512 VBMI (Vector Byte Manipulation Instructions)
+    avx512_vbmi,
+    /// AVX-512 VPOPCNTDQ (Vector Population Count Doubleword and Quadword)
+    avx512_vpopcntdq,
+    /// FMA (Fused Multiply Add)
+    fma,
+    /// BMI1 (Bit Manipulation Instructions 1)
+    bmi,
+    /// BMI1 (Bit Manipulation Instructions 2)
+    bmi2,
+    /// ABM (Advanced Bit Manipulation) on AMD / LZCNT (Leading Zero
+    /// Count) on Intel
+    abm,
+    /// TBM (Trailing Bit Manipulation)
+    tbm,
+    /// POPCNT (Population Count)
+    popcnt,
+    /// XSAVE (Save Processor Extended States)
+    xsave,
+    /// XSAVEOPT (Save Processor Extended States Optimized)
+    xsaveopt,
+    /// XSAVES (Save Processor Extended States Supervisor)
+    xsaves,
+    /// XSAVEC (Save Processor Extended States Compacted)
+    xsavec,
+    #[doc(hidden)] __NonExhaustive,
+}
+
+/// Run-time feature detection on x86 works by using the CPUID instruction.
+///
+/// The [CPUID Wikipedia page][wiki_cpuid] contains
+/// all the information about which flags to set to query which values, and in
+/// which registers these are reported.
+///
+/// The definitive references are:
+/// - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+///   Instruction Set Reference, A-Z][intel64_ref].
+/// - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+///   System Instructions][amd64_ref].
+///
+/// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID
+/// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+/// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+pub fn detect_features() -> usize {
+    use vendor::{__cpuid, __cpuid_count, has_cpuid, CpuidResult};
+    use vendor::_xgetbv;
+    let mut value: usize = 0;
+
+    // If the x86 CPU does not support the CPUID instruction then it is too
+    // old to support any of the currently-detectable features.
+    if !has_cpuid() {
+        return value;
+    }
+
+    // Calling `__cpuid`/`__cpuid_count` from here on is safe because the CPU
+    // has `cpuid` support.
+
+    // 0. EAX = 0: Basic Information:
+    // - EAX returns the "Highest Function Parameter", that is, the maximum
+    // leaf value for subsequent calls of `cpuinfo` in range [0,
+    // 0x8000_0000]. - The vendor ID is stored in 12 u8 ascii chars,
+    // returned in EBX, EDX, and   ECX (in that order):
+    let (max_leaf, vendor_id) = unsafe {
+        let CpuidResult {
+            eax: max_leaf,
+            ebx,
+            ecx,
+            edx,
+        } = __cpuid(0);
+        let vendor_id: [[u8; 4]; 3] = [
+            ::std::mem::transmute(ebx),
+            ::std::mem::transmute(edx),
+            ::std::mem::transmute(ecx),
+        ];
+        let vendor_id: [u8; 12] = ::std::mem::transmute(vendor_id);
+        (max_leaf, vendor_id)
+    };
+
+    if max_leaf < 1 {
+        // Earlier Intel 486, CPUID not implemented
+        return value;
+    }
+
+    // EAX = 1, ECX = 0: Queries "Processor Info and Feature Bits";
+    // Contains information about most x86 features.
+    let CpuidResult {
+        ecx: proc_info_ecx,
+        edx: proc_info_edx,
+        ..
+    } = unsafe { __cpuid(0x0000_0001_u32) };
+
+    // EAX = 7, ECX = 0: Queries "Extended Features";
+    // Contains information about bmi,bmi2, and avx2 support.
+    let (extended_features_ebx, extended_features_ecx) = if max_leaf >= 7 {
+        let CpuidResult { ebx, ecx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
+        (ebx, ecx)
+    } else {
+        (0, 0) // CPUID does not support "Extended Features"
+    };
+
+    // EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported
+    // - EAX returns the max leaf value for extended information, that is,
+    // `cpuid` calls in range [0x8000_0000; u32::MAX]:
+    let CpuidResult {
+        eax: extended_max_leaf,
+        ..
+    } = unsafe { __cpuid(0x8000_0000_u32) };
+
+    // EAX = 0x8000_0001, ECX=0: Queries "Extended Processor Info and Feature
+    // Bits"
+    let extended_proc_info_ecx = if extended_max_leaf >= 1 {
+        let CpuidResult { ecx, .. } = unsafe { __cpuid(0x8000_0001_u32) };
+        ecx
+    } else {
+        0
+    };
+
+    {
+        // borrows value till the end of this scope:
+        let mut enable = |r, rb, f| if bit::test(r as usize, rb) {
+            value = bit::set(value, f as u32);
+        };
+
+        enable(proc_info_ecx, 0, __Feature::sse3);
+        enable(proc_info_ecx, 9, __Feature::ssse3);
+        enable(proc_info_ecx, 12, __Feature::fma);
+        enable(proc_info_ecx, 19, __Feature::sse4_1);
+        enable(proc_info_ecx, 20, __Feature::sse4_2);
+        enable(proc_info_ecx, 23, __Feature::popcnt);
+        enable(proc_info_edx, 25, __Feature::sse);
+        enable(proc_info_edx, 26, __Feature::sse2);
+
+        enable(extended_features_ebx, 3, __Feature::bmi);
+        enable(extended_features_ebx, 8, __Feature::bmi2);
+
+        // `XSAVE` and `AVX` support:
+        if bit::test(proc_info_ecx as usize, 26) {
+            // 0. Here the CPU supports `XSAVE`.
+
+            // 1. Detect `OSXSAVE`, that is, whether the OS is AVX enabled and
+            // supports saving the state of the AVX/AVX2 vector registers on
+            // context-switches, see:
+            //
+            // - https://software.intel.
+            // com/en-us/blogs/2011/04/14/is-avx-enabled
+            // - https://hg.mozilla.
+            // org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190
+            let cpu_osxsave = bit::test(proc_info_ecx as usize, 27);
+
+            // 2. The OS must have signaled the CPU that it supports saving and
+            // restoring the SSE and AVX registers by setting `XCR0.SSE[1]` and
+            // `XCR0.AVX[2]` to `1`.
+            //
+            // This is safe because the CPU supports `xsave`
+            let xcr0 = unsafe { _xgetbv(0) };
+            let os_avx_support = xcr0 & 6 == 6;
+            let os_avx512_support = xcr0 & 224 == 224;
+
+            if cpu_osxsave && os_avx_support {
+                // Only if the OS and the CPU support saving/restoring the AVX
+                // registers we enable `xsave` support:
+                enable(proc_info_ecx, 26, __Feature::xsave);
+
+                // And AVX/AVX2:
+                enable(proc_info_ecx, 28, __Feature::avx);
+                enable(extended_features_ebx, 5, __Feature::avx2);
+
+                // For AVX-512 the OS also needs to support saving/restoring
+                // the
+                // extended state, only then we enable AVX-512 support:
+                if os_avx512_support {
+                    enable(extended_features_ebx, 16, __Feature::avx512f);
+                    enable(extended_features_ebx, 17, __Feature::avx512dq);
+                    enable(extended_features_ebx, 21, __Feature::avx512_ifma);
+                    enable(extended_features_ebx, 26, __Feature::avx512pf);
+                    enable(extended_features_ebx, 27, __Feature::avx512er);
+                    enable(extended_features_ebx, 28, __Feature::avx512cd);
+                    enable(extended_features_ebx, 30, __Feature::avx512bw);
+                    enable(extended_features_ebx, 31, __Feature::avx512vl);
+                    enable(extended_features_ecx, 1, __Feature::avx512_vbmi);
+                    enable(
+                        extended_features_ecx,
+                        14,
+                        __Feature::avx512_vpopcntdq,
+                    );
+                }
+            }
+
+            // Processor Extended State Enumeration Sub-leaf (EAX = 0DH, ECX =
+            // 1)
+            if max_leaf >= 0xd {
+                let CpuidResult {
+                    eax: proc_extended_state1_eax,
+                    ..
+                } = unsafe { __cpuid_count(0xd_u32, 1) };
+                enable(proc_extended_state1_eax, 0, __Feature::xsaveopt);
+                enable(proc_extended_state1_eax, 1, __Feature::xsavec);
+                enable(proc_extended_state1_eax, 3, __Feature::xsaves);
+            }
+        }
+
+        // This detects ABM on AMD CPUs and LZCNT on Intel CPUs.
+        // On intel CPUs with popcnt, lzcnt implements the
+        // "missing part" of ABM, so we map both to the same
+        // internal feature.
+        //
+        // The `cfg_feature_enabled!("lzcnt")` macro then
+        // internally maps to __Feature::abm.
+        enable(extended_proc_info_ecx, 5, __Feature::abm);
+        if vendor_id == *b"AuthenticAMD" {
+            // These features are only available on AMD CPUs:
+            enable(extended_proc_info_ecx, 6, __Feature::sse4a);
+            enable(extended_proc_info_ecx, 21, __Feature::tbm);
+        }
+    }
+
+    value
+}
+
+#[cfg(test)]
+mod tests {
+    #[cfg(feature = "std")]
+    #[test]
+    fn runtime_detection_x86_nocapture() {
+        println!("sse: {:?}", cfg_feature_enabled!("sse"));
+        println!("sse2: {:?}", cfg_feature_enabled!("sse2"));
+        println!("sse3: {:?}", cfg_feature_enabled!("sse3"));
+        println!("ssse3: {:?}", cfg_feature_enabled!("ssse3"));
+        println!("sse4.1: {:?}", cfg_feature_enabled!("sse4.1"));
+        println!("sse4.2: {:?}", cfg_feature_enabled!("sse4.2"));
+        println!("avx: {:?}", cfg_feature_enabled!("avx"));
+        println!("avx2: {:?}", cfg_feature_enabled!("avx2"));
+        println!("avx512f {:?}", cfg_feature_enabled!("avx512f"));
+        println!("avx512cd {:?}", cfg_feature_enabled!("avx512cd"));
+        println!("avx512er {:?}", cfg_feature_enabled!("avx512er"));
+        println!("avx512pf {:?}", cfg_feature_enabled!("avx512pf"));
+        println!("avx512bw {:?}", cfg_feature_enabled!("avx512bw"));
+        println!("avx512dq {:?}", cfg_feature_enabled!("avx512dq"));
+        println!("avx512vl {:?}", cfg_feature_enabled!("avx512vl"));
+        println!("avx512ifma {:?}", cfg_feature_enabled!("avx512ifma"));
+        println!("avx512vbmi {:?}", cfg_feature_enabled!("avx512vbmi"));
+        println!(
+            "avx512vpopcntdq {:?}",
+            cfg_feature_enabled!("avx512vpopcntdq")
+        );
+        println!("fma: {:?}", cfg_feature_enabled!("fma"));
+        println!("abm: {:?}", cfg_feature_enabled!("abm"));
+        println!("bmi: {:?}", cfg_feature_enabled!("bmi"));
+        println!("bmi2: {:?}", cfg_feature_enabled!("bmi2"));
+        println!("tbm: {:?}", cfg_feature_enabled!("tbm"));
+        println!("popcnt: {:?}", cfg_feature_enabled!("popcnt"));
+        println!("lzcnt: {:?}", cfg_feature_enabled!("lzcnt"));
+        println!("xsave {:?}", cfg_feature_enabled!("xsave"));
+        println!("xsaveopt {:?}", cfg_feature_enabled!("xsaveopt"));
+        println!("xsaves {:?}", cfg_feature_enabled!("xsaves"));
+        println!("xsavec {:?}", cfg_feature_enabled!("xsavec"));
+    }
+}
diff --git a/src/x86/cpuid.rs b/src/x86/cpuid.rs
new file mode 100644
index 0000000000..9820f23f6e
--- /dev/null
+++ b/src/x86/cpuid.rs
@@ -0,0 +1,145 @@
+//! `cpuid` intrinsics
+
+#![cfg_attr(feature = "cargo-clippy", allow(stutter))]
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Result of the `cpuid` instruction.
+#[derive(Copy, Clone, Eq, Ord, PartialEq, PartialOrd)]
+#[cfg_attr(feature = "cargo-clippy", allow(stutter))]
+pub struct CpuidResult {
+    /// EAX register.
+    pub eax: u32,
+    /// EBX register.
+    pub ebx: u32,
+    /// ECX register.
+    pub ecx: u32,
+    /// EDX register.
+    pub edx: u32,
+}
+
+/// Returns the result of the `cpuid` instruction for a given `leaf` (`EAX`)
+/// and
+/// `sub_leaf` (`ECX`).
+///
+/// The highest-supported leaf value is returned by the first tuple argument of
+/// [`__get_cpuid_max(0)`](fn.__get_cpuid_max.html). For leaves containung
+/// sub-leaves, the second tuple argument returns the highest-supported
+/// sub-leaf
+/// value.
+///
+/// The [CPUID Wikipedia page][wiki_cpuid] contains how to query which
+/// information using the `EAX` and `ECX` registers, and the interpretation of
+/// the results returned in `EAX`, `EBX`, `ECX`, and `EDX`.
+///
+/// The references are:
+/// - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+///   Instruction Set Reference, A-Z][intel64_ref].
+/// - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+///   System Instructions][amd64_ref].
+///
+/// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID
+/// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+/// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+#[inline(always)]
+#[cfg_attr(test, assert_instr(cpuid))]
+pub unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> CpuidResult {
+    let mut r = ::std::mem::uninitialized::<CpuidResult>();
+    if cfg!(target_arch = "x86") {
+        asm!("cpuid"
+             : "={eax}"(r.eax), "={ebx}"(r.ebx), "={ecx}"(r.ecx), "={edx}"(r.edx)
+             : "{eax}"(leaf), "{ecx}"(sub_leaf)
+             : :);
+    } else {
+        // x86-64 uses %rbx as the base register, so preserve it.
+        asm!("cpuid\n"
+             : "={eax}"(r.eax), "={ebx}"(r.ebx), "={ecx}"(r.ecx), "={edx}"(r.edx)
+             : "{eax}"(leaf), "{ecx}"(sub_leaf)
+             : "rbx" :);
+    }
+    r
+}
+
+/// See [`__cpuid_count`](fn.__cpuid_count.html).
+#[inline(always)]
+#[cfg_attr(test, assert_instr(cpuid))]
+pub unsafe fn __cpuid(leaf: u32) -> CpuidResult {
+    __cpuid_count(leaf, 0)
+}
+
+/// Does the host support the `cpuid` instruction?
+#[inline(always)]
+pub fn has_cpuid() -> bool {
+    #[cfg(target_arch = "x86_64")]
+    {
+        true
+    }
+    #[cfg(target_arch = "x86")]
+    {
+        use super::ia32::{__readeflags, __writeeflags};
+
+        // On `x86` the `cpuid` instruction is not always available.
+        // This follows the approach indicated in:
+        // http://wiki.osdev.org/CPUID#Checking_CPUID_availability
+        unsafe {
+            // Read EFLAGS:
+            let eflags: u32 = __readeflags();
+
+            // Invert the ID bit in EFLAGS:
+            let eflags_mod: u32 = eflags | 0x0020_0000;
+
+            // Store the modified EFLAGS (ID bit may or may not be inverted)
+            __writeeflags(eflags_mod);
+
+            // Read EFLAGS again:
+            let eflags_after: u32 = __readeflags();
+
+            // Check if the ID bit changed:
+            eflags_after != eflags
+        }
+    }
+}
+
+/// Returns the highest-supported `leaf` (`EAX`) and sub-leaf (`ECX`) `cpuid`
+/// values.
+///
+/// If `cpuid` is supported, and `leaf` is zero, then the first tuple argument
+/// contains the highest `leaf` value that `cpuid` supports. For `leaf`s
+/// containing sub-leafs, the second tuple argument contains the
+/// highest-supported sub-leaf value.
+///
+/// See also [`__cpuid`](fn.__cpuid.html) and
+/// [`__cpuid_count`](fn.__cpuid_count.html).
+#[inline(always)]
+pub unsafe fn __get_cpuid_max(leaf: u32) -> (u32, u32) {
+    let CpuidResult { eax, ebx, .. } = __cpuid(leaf);
+    (eax, ebx)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_always_has_cpuid() {
+        // all currently-tested targets have the instruction
+        // FIXME: add targets without `cpuid` to CI
+        assert!(has_cpuid());
+    }
+
+    #[cfg(target_arch = "x86")]
+    #[test]
+    fn test_has_cpuid() {
+        use vendor::__readeflags;
+        unsafe {
+            let before = __readeflags();
+
+            if has_cpuid() {
+                assert!(before != __readeflags());
+            } else {
+                assert!(before == __readeflags());
+            }
+        }
+    }
+}
diff --git a/src/x86/ia32.rs b/src/x86/ia32.rs
new file mode 100644
index 0000000000..ac7ab8b080
--- /dev/null
+++ b/src/x86/ia32.rs
@@ -0,0 +1,50 @@
+//! `i386/ia32` intrinsics
+
+/// Reads EFLAGS.
+#[cfg(target_arch = "x86")]
+#[inline(always)]
+pub unsafe fn __readeflags() -> u32 {
+    let eflags: u32;
+    asm!("pushfd; popl $0" : "=r"(eflags) : : : "volatile");
+    eflags
+}
+
+/// Reads EFLAGS.
+#[cfg(target_arch = "x86_64")]
+#[inline(always)]
+pub unsafe fn __readeflags() -> u64 {
+    let eflags: u64;
+    asm!("pushfq; popq $0" : "=r"(eflags) : : : "volatile");
+    eflags
+}
+
+/// Write EFLAGS.
+#[cfg(target_arch = "x86")]
+#[inline(always)]
+pub unsafe fn __writeeflags(eflags: u32) {
+    asm!("pushl $0; popfd" : : "r"(eflags) : "cc", "flags" : "volatile");
+}
+
+/// Write EFLAGS.
+#[cfg(target_arch = "x86_64")]
+#[inline(always)]
+pub unsafe fn __writeeflags(eflags: u64) {
+    asm!("pushq $0; popfq" : : "r"(eflags) : "cc", "flags" : "volatile");
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_eflags() {
+        unsafe {
+            // reads eflags, writes them back, reads them again,
+            // and compare for equality:
+            let v = __readeflags();
+            __writeeflags(v);
+            let u = __readeflags();
+            assert_eq!(v, u);
+        }
+    }
+}
diff --git a/src/x86/mod.rs b/src/x86/mod.rs
index 96011521ab..3e740a8642 100644
--- a/src/x86/mod.rs
+++ b/src/x86/mod.rs
@@ -1,5 +1,9 @@
 //! `x86` and `x86_64` intrinsics.
 
+pub use self::ia32::*;
+pub use self::cpuid::*;
+pub use self::xsave::*;
+
 pub use self::sse::*;
 pub use self::sse2::*;
 pub use self::sse3::*;
@@ -12,9 +16,9 @@ pub use self::avx2::*;
 pub use self::abm::*;
 pub use self::bmi::*;
 pub use self::bmi2::*;
-pub use self::tbm::*;
 
-pub use self::runtime::{__unstable_detect_feature, __Feature};
+#[cfg(not(feature = "intel_sde"))]
+pub use self::tbm::*;
 
 /// 128-bit wide signed integer vector type
 #[allow(non_camel_case_types)]
@@ -25,8 +29,10 @@ pub type __m256i = ::v256::i8x32;
 
 #[macro_use]
 mod macros;
-#[macro_use]
-mod runtime;
+
+mod ia32;
+mod cpuid;
+mod xsave;
 
 mod sse;
 mod sse2;
@@ -40,10 +46,13 @@ mod avx2;
 mod abm;
 mod bmi;
 mod bmi2;
+
+#[cfg(not(feature = "intel_sde"))]
 mod tbm;
 
-#[allow(non_camel_case_types)]
+/// `C`'s `void` type.
 #[cfg(not(feature = "std"))]
+#[allow(non_camel_case_types)]
 #[repr(u8)]
 pub enum c_void {
     #[doc(hidden)] __variant1,
diff --git a/src/x86/runtime.rs b/src/x86/runtime.rs
deleted file mode 100644
index 1549c4f7a3..0000000000
--- a/src/x86/runtime.rs
+++ /dev/null
@@ -1,305 +0,0 @@
-//! This module implements minimal run-time feature detection for x86.
-//!
-//! The features are detected using the `detect_features` function below.
-//! This function uses the CPUID instruction to read the feature flags from the
-//! CPU and encodes them in an `usize` where each bit position represents
-//! whether a feature is available (bit is set) or unavaiable (bit is cleared).
-//!
-//! The enum `__Feature` is used to map bit positions to feature names, and the
-//! the `__unstable_detect_feature!` macro is used to map string literals (e.g.
-//! "avx") to these bit positions (e.g. `__Feature::avx`).
-//!
-//!
-//! The run-time feature detection is performed by the
-//! `__unstable_detect_feature(__Feature) -> bool` function. On its first call,
-//! this functions queries the CPU for the available features and stores them
-//! in a global `AtomicUsize` variable. The query is performed by just checking
-//! whether the feature bit in this global variable is set or cleared.
-use std::sync::atomic::{AtomicUsize, Ordering};
-
-/// This macro maps the string-literal feature names to values of the
-/// `__Feature` enum at compile-time. The feature names used are the same as
-/// those of rustc `target_feature` and `cfg_target_feature` features.
-///
-/// PLESE: do not use this, it is an implementation detail subjected to change.
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-#[macro_export]
-#[doc(hidden)]
-macro_rules! __unstable_detect_feature {
-    ("sse") => {
-        $crate::vendor::__unstable_detect_feature(
-            $crate::vendor::__Feature::sse{})  };
-    ("sse2") => {
-        $crate::vendor::__unstable_detect_feature(
-            $crate::vendor::__Feature::sse2{})
-    };
-    ("sse3") => {
-        $crate::vendor::__unstable_detect_feature(
-            $crate::vendor::__Feature::sse3{})
-    };
-    ("ssse3") => {
-        $crate::vendor::__unstable_detect_feature(
-            $crate::vendor::__Feature::ssse3{})
-    };
-    ("sse4.1") => {
-        $crate::vendor::__unstable_detect_feature(
-            $crate::vendor::__Feature::sse4_1{})
-    };
-    ("sse4.2") => {
-        $crate::vendor::__unstable_detect_feature(
-            $crate::vendor::__Feature::sse4_2{})
-    };
-    ("avx") => {
-        $crate::vendor::__unstable_detect_feature(
-            $crate::vendor::__Feature::avx{})
-    };
-    ("avx2") => {
-        $crate::vendor::__unstable_detect_feature(
-            $crate::vendor::__Feature::avx2{})
-    };
-    ("fma") => {
-        $crate::vendor::__unstable_detect_feature(
-            $crate::vendor::__Feature::fma{})
-    };
-    ("bmi") => {
-        $crate::vendor::__unstable_detect_feature(
-            $crate::vendor::__Feature::bmi{})
-    };
-    ("bmi2") => {
-        $crate::vendor::__unstable_detect_feature(
-            $crate::vendor::__Feature::bmi2{})
-    };
-    ("abm") => {
-        $crate::vendor::__unstable_detect_feature(
-            $crate::vendor::__Feature::abm{})
-    };
-    ("lzcnt") => {
-        $crate::vendor::__unstable_detect_feature(
-            $crate::vendor::__Feature::abm{})
-    };
-    ("tbm") => {
-        $crate::vendor::__unstable_detect_feature(
-            $crate::vendor::__Feature::tbm{})
-    };
-    ("popcnt") => {
-        $crate::vendor::__unstable_detect_feature(
-            $crate::vendor::__Feature::popcnt{})
-    };
-    ($t:tt) => {
-        compile_error!(concat!("unknown target feature: ", $t))
-    };
-}
-
-/// X86 CPU Feature enum. Each variant denotes a position in a bitset for a
-/// particular feature.
-///
-/// PLEASE: do not use this, it is an implementation detail subject to change.
-#[doc(hidden)]
-#[allow(non_camel_case_types)]
-#[repr(u8)]
-pub enum __Feature {
-    /// SSE (Streaming SIMD Extensions)
-    sse,
-    /// SSE2 (Streaming SIMD Extensions 2)
-    sse2,
-    /// SSE3 (Streaming SIMD Extensions 3)
-    sse3,
-    /// SSSE3 (Supplemental Streaming SIMD Extensions 3)
-    ssse3,
-    /// SSE4.1 (Streaming SIMD Extensions 4.1)
-    sse4_1,
-    /// SSE4.2 (Streaming SIMD Extensions 4.2)
-    sse4_2,
-    /// AVX (Advanced Vector Extensions)
-    avx,
-    /// AVX2 (Advanced Vector Extensions 2)
-    avx2,
-    /// FMA (Fused Multiply Add)
-    fma,
-    /// BMI1 (Bit Manipulation Instructions 1)
-    bmi,
-    /// BMI1 (Bit Manipulation Instructions 2)
-    bmi2,
-    /// ABM (Advanced Bit Manipulation) on AMD / LZCNT (Leading Zero
-    /// Count) on Intel
-    abm,
-    /// TBM (Trailing Bit Manipulation)
-    tbm,
-    /// POPCNT (Population Count)
-    popcnt,
-
-    #[doc(hidden)] __NonExhaustive,
-}
-
-/// Sets the `bit`-th bit of `x`.
-fn set_bit(x: usize, bit: u32) -> usize {
-    debug_assert!(32 > bit);
-    x | 1 << bit
-}
-
-/// Tests the `bit`-th bit of `x`.
-fn test_bit(x: usize, bit: u32) -> bool {
-    debug_assert!(32 > bit);
-    x & (1 << bit) != 0
-}
-
-/// Run-time feature detection on x86 works by using the CPUID instruction.
-///
-/// The [CPUID Wikipedia page][wiki_cpuid] contains
-/// all the information about which flags to set to query which values, and in
-/// which registers these are reported.
-///
-/// The definitive references are:
-/// - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
-///   Instruction Set Reference, A-Z][intel64_ref].
-/// - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
-///   System Instructions][amd64_ref].
-///
-/// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID
-/// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
-/// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
-fn detect_features() -> usize {
-    let extended_features_ebx;
-    let proc_info_ecx;
-    let proc_info_edx;
-
-    unsafe {
-        /// To obtain all feature flags we need two CPUID queries:
-
-        /// 1. EAX=1, ECX=0: Queries "Processor Info and Feature Bits"
-        /// This gives us most of the CPU features in ECX and EDX (see
-        /// below).
-        asm!("cpuid"
-             : "={ecx}"(proc_info_ecx), "={edx}"(proc_info_edx)
-             : "{eax}"(0x0000_0001_u32), "{ecx}"(0 as u32)
-             : :);
-
-        /// 2. EAX=7, ECX=0: Queries "Extended Features"
-        /// This gives us information about bmi,bmi2, and avx2 support
-        /// (see below); the result in ECX is not currently needed.
-        asm!("cpuid"
-             : "={ebx}"(extended_features_ebx)
-             : "{eax}"(0x0000_0007_u32), "{ecx}"(0 as u32)
-             : :);
-    }
-
-    let mut value: usize = 0;
-
-    if test_bit(extended_features_ebx, 3) {
-        value = set_bit(value, __Feature::bmi as u32);
-    }
-    if test_bit(extended_features_ebx, 8) {
-        value = set_bit(value, __Feature::bmi2 as u32);
-    }
-
-    if test_bit(proc_info_ecx, 0) {
-        value = set_bit(value, __Feature::sse3 as u32);
-    }
-    if test_bit(proc_info_ecx, 5) {
-        value = set_bit(value, __Feature::abm as u32);
-    }
-    if test_bit(proc_info_ecx, 9) {
-        value = set_bit(value, __Feature::ssse3 as u32);
-    }
-    if test_bit(proc_info_ecx, 12) {
-        value = set_bit(value, __Feature::fma as u32);
-    }
-    if test_bit(proc_info_ecx, 19) {
-        value = set_bit(value, __Feature::sse4_1 as u32);
-    }
-    if test_bit(proc_info_ecx, 20) {
-        value = set_bit(value, __Feature::sse4_2 as u32);
-    }
-    if test_bit(proc_info_ecx, 21) {
-        value = set_bit(value, __Feature::tbm as u32);
-    }
-    if test_bit(proc_info_ecx, 23) {
-        value = set_bit(value, __Feature::popcnt as u32);
-    }
-
-    if test_bit(proc_info_edx, 25) {
-        value = set_bit(value, __Feature::sse as u32);
-    }
-    if test_bit(proc_info_edx, 26) {
-        value = set_bit(value, __Feature::sse2 as u32);
-    }
-
-    // ECX[26] detects XSAVE and ECX[27] detects OSXSAVE, that is, whether the
-    // OS is AVX enabled and supports saving the state of the AVX/AVX2 vector
-    // registers on context-switches, see:
-    //
-    // - https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled
-    // - https://hg.mozilla.
-    // org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190
-    //
-    if test_bit(proc_info_ecx, 26) && test_bit(proc_info_ecx, 27) {
-        /// XGETBV: reads the contents of the extended control
-        /// register (XCR).
-        unsafe fn xgetbv(xcr_no: u32) -> u64 {
-            let eax: u32;
-            let edx: u32;
-            // xgetbv
-            asm!("xgetbv"
-                 : "={eax}"(eax),  "={edx}"(edx)
-                 : "{ecx}"(xcr_no)
-                 : :);
-            ((edx as u64) << 32) | (eax as u64)
-        }
-
-        // This is safe because on x86 `xgetbv` is always available.
-        if unsafe { xgetbv(0) } & 6 == 6 {
-            if test_bit(proc_info_ecx, 28) {
-                value = set_bit(value, __Feature::avx as u32);
-            }
-            if test_bit(extended_features_ebx, 5) {
-                value = set_bit(value, __Feature::avx2 as u32);
-            }
-        }
-    }
-
-    value
-}
-
-/// This global variable is a bitset used to cache the features supported by
-/// the CPU.
-static FEATURES: AtomicUsize = AtomicUsize::new(::std::usize::MAX);
-
-/// Performs run-time feature detection.
-///
-/// On its first invocation, it detects the CPU features and caches them
-/// in the `FEATURES` global variable as an `AtomicUsize`.
-///
-/// It uses the `__Feature` variant to index into this variable as a bitset. If
-/// the bit is set, the feature is enabled, and otherwise it is disabled.
-///
-/// PLEASE: do not use this, it is an implementation detail subject to change.
-#[doc(hidden)]
-pub fn __unstable_detect_feature(x: __Feature) -> bool {
-    if FEATURES.load(Ordering::Relaxed) == ::std::usize::MAX {
-        FEATURES.store(detect_features(), Ordering::Relaxed);
-    }
-    test_bit(FEATURES.load(Ordering::Relaxed), x as u32)
-}
-
-#[cfg(test)]
-mod tests {
-    #[cfg(feature = "std")]
-    #[test]
-    fn runtime_detection_x86_nocapture() {
-        println!("sse: {:?}", cfg_feature_enabled!("sse"));
-        println!("sse2: {:?}", cfg_feature_enabled!("sse2"));
-        println!("sse3: {:?}", cfg_feature_enabled!("sse3"));
-        println!("ssse3: {:?}", cfg_feature_enabled!("ssse3"));
-        println!("sse4.1: {:?}", cfg_feature_enabled!("sse4.1"));
-        println!("sse4.2: {:?}", cfg_feature_enabled!("sse4.2"));
-        println!("avx: {:?}", cfg_feature_enabled!("avx"));
-        println!("avx2: {:?}", cfg_feature_enabled!("avx2"));
-        println!("abm: {:?}", cfg_feature_enabled!("abm"));
-        println!("bmi: {:?}", cfg_feature_enabled!("bmi"));
-        println!("bmi2: {:?}", cfg_feature_enabled!("bmi2"));
-        println!("tbm: {:?}", cfg_feature_enabled!("tbm"));
-        println!("popcnt: {:?}", cfg_feature_enabled!("popcnt"));
-        println!("lzcnt: {:?}", cfg_feature_enabled!("lzcnt"));
-        println!("fma: {:?}", cfg_feature_enabled!("fma"));
-    }
-}
diff --git a/src/x86/xsave.rs b/src/x86/xsave.rs
new file mode 100644
index 0000000000..4b9f5b8f2b
--- /dev/null
+++ b/src/x86/xsave.rs
@@ -0,0 +1,405 @@
+//! `xsave` and `xsaveopt` target feature intrinsics
+
+#![cfg_attr(feature = "cargo-clippy", allow(stutter))]
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+use x86::c_void;
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.xsave"]
+    fn xsave(p: *mut i8, hi: i32, lo: i32) -> ();
+    #[link_name = "llvm.x86.xrstor"]
+    fn xrstor(p: *const c_void, hi: i32, lo: i32) -> ();
+    #[link_name = "llvm.x86.xsetbv"]
+    fn xsetbv(v: i32, hi: i32, lo: i32) -> ();
+    #[link_name = "llvm.x86.xgetbv"]
+    fn xgetbv(x: i32) -> i64;
+    #[link_name = "llvm.x86.xsave64"]
+    fn xsave64(p: *mut i8, hi: i32, lo: i32) -> ();
+    #[link_name = "llvm.x86.xrstor64"]
+    fn xrstor64(p: *const c_void, hi: i32, lo: i32) -> ();
+    #[link_name = "llvm.x86.xsaveopt"]
+    fn xsaveopt(p: *mut i8, hi: i32, lo: i32) -> ();
+    #[link_name = "llvm.x86.xsaveopt64"]
+    fn xsaveopt64(p: *mut i8, hi: i32, lo: i32) -> ();
+    #[link_name = "llvm.x86.xsavec"]
+    fn xsavec(p: *mut i8, hi: i32, lo: i32) -> ();
+    #[link_name = "llvm.x86.xsavec64"]
+    fn xsavec64(p: *mut i8, hi: i32, lo: i32) -> ();
+    #[link_name = "llvm.x86.xsaves"]
+    fn xsaves(p: *mut i8, hi: i32, lo: i32) -> ();
+    #[link_name = "llvm.x86.xsaves64"]
+    fn xsaves64(p: *mut i8, hi: i32, lo: i32) -> ();
+    #[link_name = "llvm.x86.xrstors"]
+    fn xrstors(p: *const c_void, hi: i32, lo: i32) -> ();
+    #[link_name = "llvm.x86.xrstors64"]
+    fn xrstors64(p: *const c_void, hi: i32, lo: i32) -> ();
+}
+
+/// Perform a full or partial save of the enabled processor states to memory at
+/// `mem_addr`.
+///
+/// State is saved based on bits [62:0] in `save_mask` and XCR0.
+/// `mem_addr` must be aligned on a 64-byte boundary.
+///
+/// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of
+/// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1.
+#[inline(always)]
+#[target_feature = "+xsave"]
+#[cfg_attr(test, assert_instr(xsave))]
+pub unsafe fn _xsave(mem_addr: *mut c_void, save_mask: u64) -> () {
+    xsave(mem_addr as *mut i8, (save_mask >> 32) as i32, save_mask as i32);
+}
+
+/// Perform a full or partial restore of the enabled processor states using
+/// the state information stored in memory at `mem_addr`.
+///
+/// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and
+/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte
+/// boundary.
+#[inline(always)]
+#[target_feature = "+xsave"]
+#[cfg_attr(test, assert_instr(xrstor))]
+pub unsafe fn _xrstor(mem_addr: *const c_void, rs_mask: u64) -> () {
+    xrstor(mem_addr, (rs_mask >> 32) as i32, rs_mask as i32);
+}
+
+/// `XFEATURE_ENABLED_MASK` for `XCR`
+///
+/// This intrinsic maps to `XSETBV` instruction.
+const _XCR_XFEATURE_ENABLED_MASK: u32 = 0;
+
+/// Copy 64-bits from `val` to the extended control register (`XCR`) specified
+/// by `a`.
+///
+/// Currently only `XFEATURE_ENABLED_MASK` `XCR` is supported.
+#[inline(always)]
+#[target_feature = "+xsave"]
+#[cfg_attr(test, assert_instr(xsetbv))]
+pub unsafe fn _xsetbv(a: u32, val: u64) -> () {
+    xsetbv(a as i32, (val >> 32) as i32, val as i32);
+}
+
+/// Reads the contents of the extended control register `XCR`
+/// specified in `xcr_no`.
+#[inline(always)]
+#[target_feature = "+xsave"]
+#[cfg_attr(test, assert_instr(xgetbv))]
+pub unsafe fn _xgetbv(xcr_no: u32) -> u64 {
+    xgetbv(xcr_no as i32) as u64
+}
+
+/// Perform a full or partial save of the enabled processor states to memory at
+/// `mem_addr`.
+///
+/// State is saved based on bits [62:0] in `save_mask` and XCR0.
+/// `mem_addr` must be aligned on a 64-byte boundary.
+///
+/// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of
+/// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1.
+#[inline(always)]
+#[target_feature = "+xsave"]
+#[cfg_attr(test, assert_instr(xsave64))]
+#[cfg(not(target_arch = "x86"))]
+pub unsafe fn _xsave64(mem_addr: *mut c_void, save_mask: u64) -> () {
+    xsave64(mem_addr as *mut i8, (save_mask >> 32) as i32, save_mask as i32);
+}
+
+/// Perform a full or partial restore of the enabled processor states using
+/// the state information stored in memory at `mem_addr`.
+///
+/// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and
+/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte
+/// boundary.
+#[inline(always)]
+#[target_feature = "+xsave"]
+#[cfg_attr(test, assert_instr(xrstor64))]
+#[cfg(not(target_arch = "x86"))]
+pub unsafe fn _xrstor64(mem_addr: *const c_void, rs_mask: u64) -> () {
+    xrstor64(mem_addr, (rs_mask >> 32) as i32, rs_mask as i32);
+}
+
+/// Perform a full or partial save of the enabled processor states to memory at
+/// `mem_addr`.
+///
+/// State is saved based on bits [62:0] in `save_mask` and `XCR0`.
+/// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize
+/// the manner in which data is saved. The performance of this instruction will
+/// be equal to or better than using the `XSAVE` instruction.
+#[inline(always)]
+#[target_feature = "+xsave,+xsaveopt"]
+#[cfg_attr(test, assert_instr(xsaveopt))]
+pub unsafe fn _xsaveopt(mem_addr: *mut c_void, save_mask: u64) -> () {
+    xsaveopt(mem_addr as *mut i8, (save_mask >> 32) as i32, save_mask as i32);
+}
+
+/// Perform a full or partial save of the enabled processor states to memory at
+/// `mem_addr`.
+///
+/// State is saved based on bits [62:0] in `save_mask` and `XCR0`.
+/// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize
+/// the manner in which data is saved. The performance of this instruction will
+/// be equal to or better than using the `XSAVE64` instruction.
+#[inline(always)]
+#[target_feature = "+xsave,+xsaveopt"]
+#[cfg_attr(test, assert_instr(xsaveopt64))]
+#[cfg(not(target_arch = "x86"))]
+pub unsafe fn _xsaveopt64(mem_addr: *mut c_void, save_mask: u64) -> () {
+    xsaveopt64(
+        mem_addr as *mut i8,
+        (save_mask >> 32) as i32,
+        save_mask as i32,
+    );
+}
+
+/// Perform a full or partial save of the enabled processor states to memory
+/// at `mem_addr`.
+///
+/// `xsavec` differs from `xsave` in that it uses compaction and that it may
+/// use init optimization. State is saved based on bits [62:0] in `save_mask`
+/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary.
+#[inline(always)]
+#[target_feature = "+xsave,+xsavec"]
+#[cfg_attr(test, assert_instr(xsavec))]
+pub unsafe fn _xsavec(mem_addr: *mut c_void, save_mask: u64) -> () {
+    xsavec(mem_addr as *mut i8, (save_mask >> 32) as i32, save_mask as i32);
+}
+
+/// Perform a full or partial save of the enabled processor states to memory
+/// at `mem_addr`.
+///
+/// `xsavec` differs from `xsave` in that it uses compaction and that it may
+/// use init optimization. State is saved based on bits [62:0] in `save_mask`
+/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary.
+#[inline(always)]
+#[target_feature = "+xsave,+xsavec"]
+#[cfg_attr(test, assert_instr(xsavec64))]
+#[cfg(not(target_arch = "x86"))]
+pub unsafe fn _xsavec64(mem_addr: *mut c_void, save_mask: u64) -> () {
+    xsavec64(mem_addr as *mut i8, (save_mask >> 32) as i32, save_mask as i32);
+}
+
+/// Perform a full or partial save of the enabled processor states to memory at
+/// `mem_addr`
+///
+/// `xsaves` differs from xsave in that it can save state components
+/// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the
+/// modified optimization. State is saved based on bits [62:0] in `save_mask`
+/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary.
+#[inline(always)]
+#[target_feature = "+xsave,+xsaves"]
+#[cfg_attr(test, assert_instr(xsaves))]
+pub unsafe fn _xsaves(mem_addr: *mut c_void, save_mask: u64) -> () {
+    xsaves(mem_addr as *mut i8, (save_mask >> 32) as i32, save_mask as i32);
+}
+
+/// Perform a full or partial save of the enabled processor states to memory at
+/// `mem_addr`
+///
+/// `xsaves` differs from xsave in that it can save state components
+/// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the
+/// modified optimization. State is saved based on bits [62:0] in `save_mask`
+/// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary.
+#[inline(always)]
+#[target_feature = "+xsave,+xsaves"]
+#[cfg_attr(test, assert_instr(xsaves64))]
+#[cfg(not(target_arch = "x86"))]
+pub unsafe fn _xsaves64(mem_addr: *mut c_void, save_mask: u64) -> () {
+    xsaves64(mem_addr as *mut i8, (save_mask >> 32) as i32, save_mask as i32);
+}
+
+/// Perform a full or partial restore of the enabled processor states using the
+/// state information stored in memory at `mem_addr`.
+///
+/// `xrstors` differs from `xrstor` in that it can restore state components
+/// corresponding to bits set in the `IA32_XSS` `MSR`; `xrstors` cannot restore
+/// from an `xsave` area in which the extended region is in the standard form.
+/// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and
+/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte
+/// boundary.
+#[inline(always)]
+#[target_feature = "+xsave,+xsaves"]
+#[cfg_attr(test, assert_instr(xrstors))]
+pub unsafe fn _xrstors(mem_addr: *const c_void, rs_mask: u64) -> () {
+    xrstors(mem_addr, (rs_mask >> 32) as i32, rs_mask as i32);
+}
+/// Perform a full or partial restore of the enabled processor states using the
+/// state information stored in memory at `mem_addr`.
+///
+/// `xrstors` differs from `xrstor` in that it can restore state components
+/// corresponding to bits set in the `IA32_XSS` `MSR`; `xrstors` cannot restore
+/// from an `xsave` area in which the extended region is in the standard form.
+/// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and
+/// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte
+/// boundary.
+#[inline(always)]
+#[target_feature = "+xsave,+xsaves"]
+#[cfg_attr(test, assert_instr(xrstors64))]
+#[cfg(not(target_arch = "x86"))]
+pub unsafe fn _xrstors64(mem_addr: *const c_void, rs_mask: u64) -> () {
+    xrstors64(mem_addr, (rs_mask >> 32) as i32, rs_mask as i32);
+}
+
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use stdsimd_test::simd_test;
+    use std::fmt;
+
+    #[repr(align(64))]
+    struct Buffer {
+        data: [u64; 1024], // 8192 bytes
+    }
+
+    impl Buffer {
+        fn new() -> Buffer {
+            Buffer { data: [0; 1024] }
+        }
+        fn ptr(&mut self) -> *mut c_void {
+            &mut self.data[0] as *mut _ as *mut c_void
+        }
+    }
+
+    impl PartialEq<Buffer> for Buffer {
+        fn eq(&self, other: &Buffer) -> bool {
+            for i in 0..1024 {
+                if self.data[i] != other.data[i] {
+                    return false;
+                }
+            }
+            true
+        }
+    }
+
+    impl fmt::Debug for Buffer {
+        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+            write!(f, "[")?;
+            for i in 0..1024 {
+                write!(f, "{}", self.data[i])?;
+                if i != 1023 {
+                    write!(f, ", ")?;
+                }
+            }
+            write!(f, "]")
+        }
+    }
+
+    #[simd_test = "xsave"]
+    unsafe fn xsave() {
+        let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+        let mut a = Buffer::new();
+        let mut b = Buffer::new();
+
+        _xsave(a.ptr(), m);
+        _xrstor(a.ptr(), m);
+        _xsave(b.ptr(), m);
+        assert_eq!(a, b);
+    }
+
+    #[cfg(not(target_arch = "x86"))]
+    #[simd_test = "xsave"]
+    unsafe fn xsave64() {
+        let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+        let mut a = Buffer::new();
+        let mut b = Buffer::new();
+
+        _xsave64(a.ptr(), m);
+        _xrstor64(a.ptr(), m);
+        _xsave64(b.ptr(), m);
+        assert_eq!(a, b);
+    }
+
+    #[simd_test = "xsave"]
+    unsafe fn xgetbv_xsetbv() {
+        let xcr_n: u32 = _XCR_XFEATURE_ENABLED_MASK;
+
+        let xcr: u64 = _xgetbv(xcr_n);
+        // FIXME: XSETBV is a privileged instruction we should only test this
+        // when running in privileged mode:
+        //
+        // _xsetbv(xcr_n, xcr);
+        let xcr_cpy: u64 = _xgetbv(xcr_n);
+        assert_eq!(xcr, xcr_cpy);
+    }
+
+    #[simd_test = "xsave,xsaveopt"]
+    unsafe fn xsaveopt() {
+        let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+        let mut a = Buffer::new();
+        let mut b = Buffer::new();
+
+        _xsaveopt(a.ptr(), m);
+        _xrstor(a.ptr(), m);
+        _xsaveopt(b.ptr(), m);
+        assert_eq!(a, b);
+    }
+
+    #[cfg(not(target_arch = "x86"))]
+    #[simd_test = "xsave,xsaveopt"]
+    unsafe fn xsaveopt64() {
+        let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+        let mut a = Buffer::new();
+        let mut b = Buffer::new();
+
+        _xsaveopt64(a.ptr(), m);
+        _xrstor64(a.ptr(), m);
+        _xsaveopt64(b.ptr(), m);
+        assert_eq!(a, b);
+    }
+
+
+    #[simd_test = "xsave,xsavec"]
+    unsafe fn xsavec() {
+        let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+        let mut a = Buffer::new();
+        let mut b = Buffer::new();
+
+        _xsavec(a.ptr(), m);
+        _xrstor(a.ptr(), m);
+        _xsavec(b.ptr(), m);
+        assert_eq!(a, b);
+    }
+
+    #[cfg(not(target_arch = "x86"))]
+    #[simd_test = "xsave,xsavec"]
+    unsafe fn xsavec64() {
+        let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+        let mut a = Buffer::new();
+        let mut b = Buffer::new();
+
+        _xsavec64(a.ptr(), m);
+        _xrstor64(a.ptr(), m);
+        _xsavec64(b.ptr(), m);
+        assert_eq!(a, b);
+    }
+
+    #[cfg(not(feature = "intel_sde"))]
+    #[simd_test = "xsaves"]
+    unsafe fn xsaves() {
+        let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+        let mut a = Buffer::new();
+        let mut b = Buffer::new();
+
+        _xsaves(a.ptr(), m);
+        _xrstors(a.ptr(), m);
+        _xsaves(b.ptr(), m);
+        assert_eq!(a, b);
+    }
+
+    #[cfg(not(any(target_arch = "x86", feature = "intel_sde")))]
+    #[simd_test = "xsaves"]
+    unsafe fn xsaves64() {
+        let m = 0xFFFFFFFFFFFFFFFF_u64; //< all registers
+        let mut a = Buffer::new();
+        let mut b = Buffer::new();
+
+        _xsaves64(a.ptr(), m);
+        _xrstors64(a.ptr(), m);
+        _xsaves64(b.ptr(), m);
+        assert_eq!(a, b);
+    }
+}
diff --git a/stdsimd-test/simd-test-macro/src/lib.rs b/stdsimd-test/simd-test-macro/src/lib.rs
index 3777feae79..9bf073d62f 100644
--- a/stdsimd-test/simd-test-macro/src/lib.rs
+++ b/stdsimd-test/simd-test-macro/src/lib.rs
@@ -33,7 +33,17 @@ pub fn simd_test(
         TokenNode::Op('=', _) => {}
         _ => panic!("expected #[simd_test = \"feature\"]"),
     }
-    let target_feature = &tokens[1];
+    let target_features = match tokens[1].kind {
+        TokenNode::Literal(ref l) => l.to_string(),
+        _ => panic!("expected #[simd_test = \"feature\"]"),
+    };
+    let target_features: Vec<String> = target_features
+        .replace('"', "")
+        .replace('+', "")
+        .split(',')
+        .map(|v| String::from(v))
+        .collect();
+
     let enable_feature = match tokens[1].kind {
         TokenNode::Literal(ref l) => l.to_string(),
         _ => panic!("expected #[simd_test = \"feature\"]"),
@@ -41,17 +51,29 @@ pub fn simd_test(
     let enable_feature = enable_feature
         .trim_left_matches('"')
         .trim_right_matches('"');
-    let enable_feature = string(&format!("+{}", enable_feature));
+    let enable_feature =
+        string(&(format!("+{}", enable_feature).replace(',', ",+")));
     let item = TokenStream::from(item);
     let name = find_name(item.clone());
 
     let name: TokenStream = name.as_str().parse().unwrap();
 
+    let mut cfg_target_features = quote::Tokens::new();
+    use quote::ToTokens;
+    for feature in target_features {
+        let q = quote! {
+            cfg_feature_enabled!(#feature) &&
+        };
+        q.to_tokens(&mut cfg_target_features);
+    }
+    let q = quote!{ true };
+    q.to_tokens(&mut cfg_target_features);
+
     let ret: TokenStream = quote! {
         #[allow(non_snake_case)]
         #[test]
         fn #name() {
-            if cfg_feature_enabled!(#target_feature) {
+            if #cfg_target_features {
                 return unsafe { #name() };
             } else {
                 ::stdsimd_test::assert_skip_test_ok(stringify!(#name));
diff --git a/tests/cpu-detection.rs b/tests/cpu-detection.rs
index 294fd8ca7c..c4b4c9627f 100644
--- a/tests/cpu-detection.rs
+++ b/tests/cpu-detection.rs
@@ -20,10 +20,35 @@ fn works() {
     assert_eq!(cfg_feature_enabled!("sse4.2"), information.sse4_2());
     assert_eq!(cfg_feature_enabled!("avx"), information.avx());
     assert_eq!(cfg_feature_enabled!("avx2"), information.avx2());
+    // assert_eq!(cfg_feature_enabled!("avx512f"), information.avx512f());
+    // assert_eq!(cfg_feature_enabled!("avx512cd"), information.avx512cd());
+    // assert_eq!(cfg_feature_enabled!("avx512er"), information.avx512er());
+    // assert_eq!(cfg_feature_enabled!("avx512pf"), information.avx512pf());
+    // assert_eq!(cfg_feature_enabled!("avx512bw"), information.avx512bw());
+    // assert_eq!(cfg_feature_enabled!("avx512dq"), information.avx512dq());
+    // assert_eq!(cfg_feature_enabled!("avx512vl"), information.avx512vl());
+    // assert_eq!(cfg_feature_enabled!("avx512ifma"),
+    // information.avx512_ifma());
+    // assert_eq!(cfg_feature_enabled!("avx512vbmi"),
+    // information.avx512_vbmi());
+    // assert_eq!(cfg_feature_enabled!("avx512vpopcntdq"),
+    // information.avx512_vpopcntdq());
     assert_eq!(cfg_feature_enabled!("fma"), information.fma());
     assert_eq!(cfg_feature_enabled!("bmi"), information.bmi1());
     assert_eq!(cfg_feature_enabled!("bmi2"), information.bmi2());
     assert_eq!(cfg_feature_enabled!("popcnt"), information.popcnt());
-
-    // TODO: tbm, abm, lzcnt
+    // assert_eq!(cfg_feature_enabled!("sse4a"), information.sse4a());
+    assert_eq!(cfg_feature_enabled!("abm"), information.lzcnt());
+    assert_eq!(cfg_feature_enabled!("tbm"), information.tbm());
+    assert_eq!(cfg_feature_enabled!("lzcnt"), information.lzcnt());
+    assert_eq!(cfg_feature_enabled!("xsave"), information.xsave());
+    assert_eq!(cfg_feature_enabled!("xsaveopt"), information.xsaveopt());
+    assert_eq!(
+        cfg_feature_enabled!("xsavec"),
+        information.xsavec_and_xrstor()
+    );
+    assert_eq!(
+        cfg_feature_enabled!("xsavec"),
+        information.xsaves_xrstors_and_ia32_xss()
+    );
 }