diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
index d441ee562a..392f9d7d9b 100644
--- a/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/crates/core_arch/src/aarch64/neon/generated.rs
@@ -13066,7 +13066,7 @@ mod test {
         let a: [f64; 3] = [0., 1., 2.];
         let e: [f64; 2] = [1., 2.];
         let mut r: [f64; 2] = [0f64; 2];
-        vst1_f64_x2(r.as_mut_ptr(), vld1_f64_x2(a[1..].as_ptr()));
+        vst1_f64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -13075,7 +13075,7 @@ mod test {
         let a: [f64; 5] = [0., 1., 2., 3., 4.];
         let e: [f64; 4] = [1., 2., 3., 4.];
         let mut r: [f64; 4] = [0f64; 4];
-        vst1q_f64_x2(r.as_mut_ptr(), vld1q_f64_x2(a[1..].as_ptr()));
+        vst1q_f64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -13084,7 +13084,7 @@ mod test {
         let a: [f64; 4] = [0., 1., 2., 3.];
         let e: [f64; 3] = [1., 2., 3.];
         let mut r: [f64; 3] = [0f64; 3];
-        vst1_f64_x3(r.as_mut_ptr(), vld1_f64_x3(a[1..].as_ptr()));
+        vst1_f64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -13093,7 +13093,7 @@ mod test {
         let a: [f64; 7] = [0., 1., 2., 3., 4., 5., 6.];
         let e: [f64; 6] = [1., 2., 3., 4., 5., 6.];
         let mut r: [f64; 6] = [0f64; 6];
-        vst1q_f64_x3(r.as_mut_ptr(), vld1q_f64_x3(a[1..].as_ptr()));
+        vst1q_f64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -13102,7 +13102,7 @@ mod test {
         let a: [f64; 5] = [0., 1., 2., 3., 4.];
         let e: [f64; 4] = [1., 2., 3., 4.];
         let mut r: [f64; 4] = [0f64; 4];
-        vst1_f64_x4(r.as_mut_ptr(), vld1_f64_x4(a[1..].as_ptr()));
+        vst1_f64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -13111,7 +13111,7 @@ mod test {
         let a: [f64; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.];
         let e: [f64; 8] = [1., 2., 3., 4., 5., 6., 7., 8.];
         let mut r: [f64; 8] = [0f64; 8];
-        vst1q_f64_x4(r.as_mut_ptr(), vld1q_f64_x4(a[1..].as_ptr()));
+        vst1q_f64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
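A note on the test rewrite above: each `vst1*_x*` test used to obtain its tuple argument from the matching `vld1*_x*` intrinsic, so a load bug could mask or masquerade as a store bug. The tests now build the tuple with a plain unaligned read, which exercises only the store under test and also sidesteps the fact that `a[1..].as_ptr()` is aligned for the element type but not necessarily for the whole tuple. A minimal sketch of the pattern, assuming an aarch64 target with NEON:

```rust
// Builds a float64x1x2_t from a pointer that is f64-aligned but possibly
// unaligned for the whole tuple; read_unaligned performs a byte-wise copy
// and assumes nothing beyond the cast itself.
use core::arch::aarch64::float64x1x2_t;

unsafe fn load_pair(p: *const f64) -> float64x1x2_t {
    core::ptr::read_unaligned(p.cast::<float64x1x2_t>())
}
```
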
diff --git a/crates/core_arch/src/aarch64/neon/mod.rs b/crates/core_arch/src/aarch64/neon/mod.rs
index f40ed5c338..2f8cd1283f 100644
--- a/crates/core_arch/src/aarch64/neon/mod.rs
+++ b/crates/core_arch/src/aarch64/neon/mod.rs
@@ -678,7 +678,7 @@ pub unsafe fn vld1_dup_f64(ptr: *const f64) -> float64x1_t {
 /// Load multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(ldr))]
+#[cfg_attr(test, assert_instr(ld1r))]
 pub unsafe fn vld1q_dup_f64(ptr: *const f64) -> float64x2_t {
     let x = vld1q_lane_f64::<0>(ptr, transmute(f64x2::splat(0.)));
     simd_shuffle2!(x, x, [0, 0])
@@ -698,7 +698,7 @@ pub unsafe fn vld1_lane_f64<const LANE: i32>(ptr: *const f64, src: float64x1_t)
 #[inline]
 #[target_feature(enable = "neon")]
 #[rustc_legacy_const_generics(2)]
-#[cfg_attr(test, assert_instr(ldr, LANE = 1))]
+#[cfg_attr(test, assert_instr(ld1, LANE = 1))]
 pub unsafe fn vld1q_lane_f64<const LANE: i32>(ptr: *const f64, src: float64x2_t) -> float64x2_t {
     static_assert_imm1!(LANE);
     simd_insert(src, LANE as u32, *ptr)
@@ -886,7 +886,7 @@ pub unsafe fn vst1q_p16(ptr: *mut p16, a: poly16x8_t) {
 
 // Store multiple single-element structures from one, two, three, or four registers.
 #[inline]
-#[target_feature(enable = "neon")]
+#[target_feature(enable = "neon,aes")]
 #[cfg_attr(test, assert_instr(str))]
 #[allow(clippy::cast_ptr_alignment)]
 pub unsafe fn vst1_p64(ptr: *mut p64, a: poly64x1_t) {
@@ -895,7 +895,7 @@ pub unsafe fn vst1_p64(ptr: *mut p64, a: poly64x1_t) {
 
 // Store multiple single-element structures from one, two, three, or four registers.
 #[inline]
-#[target_feature(enable = "neon")]
+#[target_feature(enable = "neon,aes")]
 #[cfg_attr(test, assert_instr(str))]
 #[allow(clippy::cast_ptr_alignment)]
 pub unsafe fn vst1q_p64(ptr: *mut p64, a: poly64x2_t) {
@@ -4803,29 +4803,6 @@ mod tests {
         assert_eq!(r, e)
     }
 
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vst1_p64() {
-        let mut vals = [0_u64; 2];
-        let a = u64x1::new(1);
-
-        vst1_p64(vals[1..].as_mut_ptr(), transmute(a));
-
-        assert_eq!(vals[0], 0);
-        assert_eq!(vals[1], 1);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vst1q_p64() {
-        let mut vals = [0_u64; 3];
-        let a = u64x2::new(1, 2);
-
-        vst1q_p64(vals[1..].as_mut_ptr(), transmute(a));
-
-        assert_eq!(vals[0], 0);
-        assert_eq!(vals[1], 1);
-        assert_eq!(vals[2], 2);
-    }
-
     #[simd_test(enable = "neon")]
     unsafe fn test_vst1_f64() {
         let mut vals = [0_f64; 2];
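
The two `assert_instr` updates above track the expected codegen for these f64 loads: the dup form broadcasts via `ld1r` and the lane form inserts via `ld1` into a single lane, rather than a scalar `ldr`. A hedged usage sketch (the expected-opcode comments are assumptions based on the assertions above, not guarantees):

```rust
use core::arch::aarch64::*;

#[target_feature(enable = "neon")]
unsafe fn dup_then_patch_lane(p: *const f64, q: *const f64) -> float64x2_t {
    let v = vld1q_dup_f64(p); // expected: ld1r { vN.2d }, [xM]
    vld1q_lane_f64::<1>(q, v) // expected: ld1 { vN.d }[1], [xM]
}
```
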
diff --git a/crates/core_arch/src/arm/neon.rs b/crates/core_arch/src/arm/neon.rs
index ebcff041a6..2903e46330 100644
--- a/crates/core_arch/src/arm/neon.rs
+++ b/crates/core_arch/src/arm/neon.rs
@@ -480,6 +480,22 @@ pub unsafe fn vst1q_p16(ptr: *mut p16, a: poly16x8_t) {
     vst1q_v8i16(ptr as *const i8, transmute(a), align_of::<p16>() as i32)
 }
 
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,aes,v8")]
+#[cfg_attr(test, assert_instr("vst1.64"))]
+pub unsafe fn vst1_p64(ptr: *mut p64, a: poly64x1_t) {
+    vst1_v1i64(ptr as *const i8, transmute(a), align_of::<p64>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,aes,v8")]
+#[cfg_attr(test, assert_instr("vst1.64"))]
+pub unsafe fn vst1q_p64(ptr: *mut p64, a: poly64x2_t) {
+    vst1q_v2i64(ptr as *const i8, transmute(a), align_of::<p64>() as i32)
+}
+
 // Store multiple single-element structures from one, two, three, or four registers.
 #[inline]
 #[target_feature(enable = "neon,v7")]
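
The two AArch32 wrappers added above follow the existing `vst1` family in this file; the extra `aes` and `v8` features are needed because the poly64 vector types are only available there. A call sketch, assuming an ARMv8 AArch32 target (`p64` is an alias for `u64`):

```rust
use core::arch::arm::*;

#[target_feature(enable = "neon,aes")]
unsafe fn store_one_p64(dst: *mut p64, v: poly64x1_t) {
    vst1_p64(dst, v); // asserted above to lower to a `vst1.64` store
}
```
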
diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs
index 69d8da1795..616aad8ac4 100644
--- a/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -6616,7 +6616,7 @@ pub unsafe fn vld1_p64_x2(a: *const p64) -> poly64x1x2_t {
 #[inline]
 #[target_feature(enable = "neon,aes")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(ldr))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
 pub unsafe fn vld1_p64_x3(a: *const p64) -> poly64x1x3_t {
     transmute(vld1_s64_x3(transmute(a)))
@@ -6626,7 +6626,7 @@ pub unsafe fn vld1_p64_x3(a: *const p64) -> poly64x1x3_t {
 #[inline]
 #[target_feature(enable = "neon,aes")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(ldr))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
 pub unsafe fn vld1_p64_x4(a: *const p64) -> poly64x1x4_t {
     transmute(vld1_s64_x4(transmute(a)))
@@ -6636,7 +6636,7 @@ pub unsafe fn vld1_p64_x4(a: *const p64) -> poly64x1x4_t {
 #[inline]
 #[target_feature(enable = "neon,aes")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(ldr))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
 pub unsafe fn vld1q_p64_x2(a: *const p64) -> poly64x2x2_t {
     transmute(vld1q_s64_x2(transmute(a)))
@@ -6646,7 +6646,7 @@ pub unsafe fn vld1q_p64_x2(a: *const p64) -> poly64x2x2_t {
 #[inline]
 #[target_feature(enable = "neon,aes")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(ldr))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
 pub unsafe fn vld1q_p64_x3(a: *const p64) -> poly64x2x3_t {
     transmute(vld1q_s64_x3(transmute(a)))
@@ -6656,7 +6656,7 @@ pub unsafe fn vld1q_p64_x3(a: *const p64) -> poly64x2x3_t {
 #[inline]
 #[target_feature(enable = "neon,aes")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(ldr))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))]
 pub unsafe fn vld1q_p64_x4(a: *const p64) -> poly64x2x4_t {
     transmute(vld1q_s64_x4(transmute(a)))
@@ -7790,6 +7790,66 @@ pub unsafe fn vst1q_p16_x4(a: *mut p16, b: poly16x8x4_t) {
     vst1q_s16_x4(transmute(a), transmute(b))
 }
 
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_p64_x2(a: *mut p64, b: poly64x1x2_t) {
+    vst1_s64_x2(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_p64_x3(a: *mut p64, b: poly64x1x3_t) {
+    vst1_s64_x3(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1_p64_x4(a: *mut p64, b: poly64x1x4_t) {
+    vst1_s64_x4(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_p64_x2(a: *mut p64, b: poly64x2x2_t) {
+    vst1q_s64_x2(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_p64_x3(a: *mut p64, b: poly64x2x3_t) {
+    vst1q_s64_x3(transmute(a), transmute(b))
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st1))]
+pub unsafe fn vst1q_p64_x4(a: *mut p64, b: poly64x2x4_t) {
+    vst1q_s64_x4(transmute(a), transmute(b))
+}
+
 /// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[cfg(target_arch = "arm")]
@@ -21642,7 +21702,7 @@ mod test {
         let a: [i8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let e: [i8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let mut r: [i8; 16] = [0i8; 16];
-        vst1_s8_x2(r.as_mut_ptr(), vld1_s8_x2(a[1..].as_ptr()));
+        vst1_s8_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21651,7 +21711,7 @@ mod test {
         let a: [i16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
         let e: [i16; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
         let mut r: [i16; 8] = [0i16; 8];
-        vst1_s16_x2(r.as_mut_ptr(), vld1_s16_x2(a[1..].as_ptr()));
+        vst1_s16_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21660,7 +21720,7 @@ mod test {
         let a: [i32; 5] = [0, 1, 2, 3, 4];
         let e: [i32; 4] = [1, 2, 3, 4];
         let mut r: [i32; 4] = [0i32; 4];
-        vst1_s32_x2(r.as_mut_ptr(), vld1_s32_x2(a[1..].as_ptr()));
+        vst1_s32_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21669,7 +21729,7 @@ mod test {
         let a: [i64; 3] = [0, 1, 2];
         let e: [i64; 2] = [1, 2];
         let mut r: [i64; 2] = [0i64; 2];
-        vst1_s64_x2(r.as_mut_ptr(), vld1_s64_x2(a[1..].as_ptr()));
+        vst1_s64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21678,7 +21738,7 @@ mod test {
         let a: [i8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let e: [i8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let mut r: [i8; 32] = [0i8; 32];
-        vst1q_s8_x2(r.as_mut_ptr(), vld1q_s8_x2(a[1..].as_ptr()));
+        vst1q_s8_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21687,7 +21747,7 @@ mod test {
         let a: [i16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let e: [i16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let mut r: [i16; 16] = [0i16; 16];
-        vst1q_s16_x2(r.as_mut_ptr(), vld1q_s16_x2(a[1..].as_ptr()));
+        vst1q_s16_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21696,7 +21756,7 @@ mod test {
         let a: [i32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
         let e: [i32; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
         let mut r: [i32; 8] = [0i32; 8];
-        vst1q_s32_x2(r.as_mut_ptr(), vld1q_s32_x2(a[1..].as_ptr()));
+        vst1q_s32_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21705,7 +21765,7 @@ mod test {
         let a: [i64; 5] = [0, 1, 2, 3, 4];
         let e: [i64; 4] = [1, 2, 3, 4];
         let mut r: [i64; 4] = [0i64; 4];
-        vst1q_s64_x2(r.as_mut_ptr(), vld1q_s64_x2(a[1..].as_ptr()));
+        vst1q_s64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21714,7 +21774,7 @@ mod test {
         let a: [i8; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
         let e: [i8; 24] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
         let mut r: [i8; 24] = [0i8; 24];
-        vst1_s8_x3(r.as_mut_ptr(), vld1_s8_x3(a[1..].as_ptr()));
+        vst1_s8_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21723,7 +21783,7 @@ mod test {
         let a: [i16; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
         let e: [i16; 12] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
         let mut r: [i16; 12] = [0i16; 12];
-        vst1_s16_x3(r.as_mut_ptr(), vld1_s16_x3(a[1..].as_ptr()));
+        vst1_s16_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21732,7 +21792,7 @@ mod test {
         let a: [i32; 7] = [0, 1, 2, 3, 4, 5, 6];
         let e: [i32; 6] = [1, 2, 3, 4, 5, 6];
         let mut r: [i32; 6] = [0i32; 6];
-        vst1_s32_x3(r.as_mut_ptr(), vld1_s32_x3(a[1..].as_ptr()));
+        vst1_s32_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21741,7 +21801,7 @@ mod test {
         let a: [i64; 4] = [0, 1, 2, 3];
         let e: [i64; 3] = [1, 2, 3];
         let mut r: [i64; 3] = [0i64; 3];
-        vst1_s64_x3(r.as_mut_ptr(), vld1_s64_x3(a[1..].as_ptr()));
+        vst1_s64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21750,7 +21810,7 @@ mod test {
         let a: [i8; 49] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let e: [i8; 48] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let mut r: [i8; 48] = [0i8; 48];
-        vst1q_s8_x3(r.as_mut_ptr(), vld1q_s8_x3(a[1..].as_ptr()));
+        vst1q_s8_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21759,7 +21819,7 @@ mod test {
         let a: [i16; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
         let e: [i16; 24] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
         let mut r: [i16; 24] = [0i16; 24];
-        vst1q_s16_x3(r.as_mut_ptr(), vld1q_s16_x3(a[1..].as_ptr()));
+        vst1q_s16_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21768,7 +21828,7 @@ mod test {
         let a: [i32; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
         let e: [i32; 12] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
         let mut r: [i32; 12] = [0i32; 12];
-        vst1q_s32_x3(r.as_mut_ptr(), vld1q_s32_x3(a[1..].as_ptr()));
+        vst1q_s32_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21777,7 +21837,7 @@ mod test {
         let a: [i64; 7] = [0, 1, 2, 3, 4, 5, 6];
         let e: [i64; 6] = [1, 2, 3, 4, 5, 6];
         let mut r: [i64; 6] = [0i64; 6];
-        vst1q_s64_x3(r.as_mut_ptr(), vld1q_s64_x3(a[1..].as_ptr()));
+        vst1q_s64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21786,7 +21846,7 @@ mod test {
         let a: [i8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let e: [i8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let mut r: [i8; 32] = [0i8; 32];
-        vst1_s8_x4(r.as_mut_ptr(), vld1_s8_x4(a[1..].as_ptr()));
+        vst1_s8_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21795,7 +21855,7 @@ mod test {
         let a: [i16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let e: [i16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let mut r: [i16; 16] = [0i16; 16];
-        vst1_s16_x4(r.as_mut_ptr(), vld1_s16_x4(a[1..].as_ptr()));
+        vst1_s16_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21804,7 +21864,7 @@ mod test {
         let a: [i32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
         let e: [i32; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
         let mut r: [i32; 8] = [0i32; 8];
-        vst1_s32_x4(r.as_mut_ptr(), vld1_s32_x4(a[1..].as_ptr()));
+        vst1_s32_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21813,7 +21873,7 @@ mod test {
         let a: [i64; 5] = [0, 1, 2, 3, 4];
         let e: [i64; 4] = [1, 2, 3, 4];
         let mut r: [i64; 4] = [0i64; 4];
-        vst1_s64_x4(r.as_mut_ptr(), vld1_s64_x4(a[1..].as_ptr()));
+        vst1_s64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21822,7 +21882,7 @@ mod test {
         let a: [i8; 65] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let e: [i8; 64] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let mut r: [i8; 64] = [0i8; 64];
-        vst1q_s8_x4(r.as_mut_ptr(), vld1q_s8_x4(a[1..].as_ptr()));
+        vst1q_s8_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21831,7 +21891,7 @@ mod test {
         let a: [i16; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let e: [i16; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let mut r: [i16; 32] = [0i16; 32];
-        vst1q_s16_x4(r.as_mut_ptr(), vld1q_s16_x4(a[1..].as_ptr()));
+        vst1q_s16_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21840,7 +21900,7 @@ mod test {
         let a: [i32; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let e: [i32; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let mut r: [i32; 16] = [0i32; 16];
-        vst1q_s32_x4(r.as_mut_ptr(), vld1q_s32_x4(a[1..].as_ptr()));
+        vst1q_s32_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21849,7 +21909,7 @@ mod test {
         let a: [i64; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
         let e: [i64; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
         let mut r: [i64; 8] = [0i64; 8];
-        vst1q_s64_x4(r.as_mut_ptr(), vld1q_s64_x4(a[1..].as_ptr()));
+        vst1q_s64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21858,7 +21918,7 @@ mod test {
         let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let e: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let mut r: [u8; 16] = [0u8; 16];
-        vst1_u8_x2(r.as_mut_ptr(), vld1_u8_x2(a[1..].as_ptr()));
+        vst1_u8_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21867,7 +21927,7 @@ mod test {
         let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
         let e: [u16; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
         let mut r: [u16; 8] = [0u16; 8];
-        vst1_u16_x2(r.as_mut_ptr(), vld1_u16_x2(a[1..].as_ptr()));
+        vst1_u16_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21876,7 +21936,7 @@ mod test {
         let a: [u32; 5] = [0, 1, 2, 3, 4];
         let e: [u32; 4] = [1, 2, 3, 4];
         let mut r: [u32; 4] = [0u32; 4];
-        vst1_u32_x2(r.as_mut_ptr(), vld1_u32_x2(a[1..].as_ptr()));
+        vst1_u32_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21885,7 +21945,7 @@ mod test {
         let a: [u64; 3] = [0, 1, 2];
         let e: [u64; 2] = [1, 2];
         let mut r: [u64; 2] = [0u64; 2];
-        vst1_u64_x2(r.as_mut_ptr(), vld1_u64_x2(a[1..].as_ptr()));
+        vst1_u64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21894,7 +21954,7 @@ mod test {
         let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let e: [u8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let mut r: [u8; 32] = [0u8; 32];
-        vst1q_u8_x2(r.as_mut_ptr(), vld1q_u8_x2(a[1..].as_ptr()));
+        vst1q_u8_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21903,7 +21963,7 @@ mod test {
         let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let e: [u16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let mut r: [u16; 16] = [0u16; 16];
-        vst1q_u16_x2(r.as_mut_ptr(), vld1q_u16_x2(a[1..].as_ptr()));
+        vst1q_u16_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21912,7 +21972,7 @@ mod test {
         let a: [u32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
         let e: [u32; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
         let mut r: [u32; 8] = [0u32; 8];
-        vst1q_u32_x2(r.as_mut_ptr(), vld1q_u32_x2(a[1..].as_ptr()));
+        vst1q_u32_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21921,7 +21981,7 @@ mod test {
         let a: [u64; 5] = [0, 1, 2, 3, 4];
         let e: [u64; 4] = [1, 2, 3, 4];
         let mut r: [u64; 4] = [0u64; 4];
-        vst1q_u64_x2(r.as_mut_ptr(), vld1q_u64_x2(a[1..].as_ptr()));
+        vst1q_u64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21930,7 +21990,7 @@ mod test {
         let a: [u8; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
         let e: [u8; 24] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
         let mut r: [u8; 24] = [0u8; 24];
-        vst1_u8_x3(r.as_mut_ptr(), vld1_u8_x3(a[1..].as_ptr()));
+        vst1_u8_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21939,7 +21999,7 @@ mod test {
         let a: [u16; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
         let e: [u16; 12] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
         let mut r: [u16; 12] = [0u16; 12];
-        vst1_u16_x3(r.as_mut_ptr(), vld1_u16_x3(a[1..].as_ptr()));
+        vst1_u16_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21948,7 +22008,7 @@ mod test {
         let a: [u32; 7] = [0, 1, 2, 3, 4, 5, 6];
         let e: [u32; 6] = [1, 2, 3, 4, 5, 6];
         let mut r: [u32; 6] = [0u32; 6];
-        vst1_u32_x3(r.as_mut_ptr(), vld1_u32_x3(a[1..].as_ptr()));
+        vst1_u32_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21957,7 +22017,7 @@ mod test {
         let a: [u64; 4] = [0, 1, 2, 3];
         let e: [u64; 3] = [1, 2, 3];
         let mut r: [u64; 3] = [0u64; 3];
-        vst1_u64_x3(r.as_mut_ptr(), vld1_u64_x3(a[1..].as_ptr()));
+        vst1_u64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21966,7 +22026,7 @@ mod test {
         let a: [u8; 49] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let e: [u8; 48] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let mut r: [u8; 48] = [0u8; 48];
-        vst1q_u8_x3(r.as_mut_ptr(), vld1q_u8_x3(a[1..].as_ptr()));
+        vst1q_u8_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21975,7 +22035,7 @@ mod test {
         let a: [u16; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
         let e: [u16; 24] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
         let mut r: [u16; 24] = [0u16; 24];
-        vst1q_u16_x3(r.as_mut_ptr(), vld1q_u16_x3(a[1..].as_ptr()));
+        vst1q_u16_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21984,7 +22044,7 @@ mod test {
         let a: [u32; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
         let e: [u32; 12] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
         let mut r: [u32; 12] = [0u32; 12];
-        vst1q_u32_x3(r.as_mut_ptr(), vld1q_u32_x3(a[1..].as_ptr()));
+        vst1q_u32_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -21993,7 +22053,7 @@ mod test {
         let a: [u64; 7] = [0, 1, 2, 3, 4, 5, 6];
         let e: [u64; 6] = [1, 2, 3, 4, 5, 6];
         let mut r: [u64; 6] = [0u64; 6];
-        vst1q_u64_x3(r.as_mut_ptr(), vld1q_u64_x3(a[1..].as_ptr()));
+        vst1q_u64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22002,7 +22062,7 @@ mod test {
         let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let e: [u8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let mut r: [u8; 32] = [0u8; 32];
-        vst1_u8_x4(r.as_mut_ptr(), vld1_u8_x4(a[1..].as_ptr()));
+        vst1_u8_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22011,7 +22071,7 @@ mod test {
         let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let e: [u16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let mut r: [u16; 16] = [0u16; 16];
-        vst1_u16_x4(r.as_mut_ptr(), vld1_u16_x4(a[1..].as_ptr()));
+        vst1_u16_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22020,7 +22080,7 @@ mod test {
         let a: [u32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
         let e: [u32; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
         let mut r: [u32; 8] = [0u32; 8];
-        vst1_u32_x4(r.as_mut_ptr(), vld1_u32_x4(a[1..].as_ptr()));
+        vst1_u32_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22029,7 +22089,7 @@ mod test {
         let a: [u64; 5] = [0, 1, 2, 3, 4];
         let e: [u64; 4] = [1, 2, 3, 4];
         let mut r: [u64; 4] = [0u64; 4];
-        vst1_u64_x4(r.as_mut_ptr(), vld1_u64_x4(a[1..].as_ptr()));
+        vst1_u64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22038,7 +22098,7 @@ mod test {
         let a: [u8; 65] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let e: [u8; 64] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let mut r: [u8; 64] = [0u8; 64];
-        vst1q_u8_x4(r.as_mut_ptr(), vld1q_u8_x4(a[1..].as_ptr()));
+        vst1q_u8_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22047,7 +22107,7 @@ mod test {
         let a: [u16; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let e: [u16; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let mut r: [u16; 32] = [0u16; 32];
-        vst1q_u16_x4(r.as_mut_ptr(), vld1q_u16_x4(a[1..].as_ptr()));
+        vst1q_u16_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22056,7 +22116,7 @@ mod test {
         let a: [u32; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let e: [u32; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let mut r: [u32; 16] = [0u32; 16];
-        vst1q_u32_x4(r.as_mut_ptr(), vld1q_u32_x4(a[1..].as_ptr()));
+        vst1q_u32_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22065,7 +22125,7 @@ mod test {
         let a: [u64; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
         let e: [u64; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
         let mut r: [u64; 8] = [0u64; 8];
-        vst1q_u64_x4(r.as_mut_ptr(), vld1q_u64_x4(a[1..].as_ptr()));
+        vst1q_u64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22074,7 +22134,7 @@ mod test {
         let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let e: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let mut r: [u8; 16] = [0u8; 16];
-        vst1_p8_x2(r.as_mut_ptr(), vld1_p8_x2(a[1..].as_ptr()));
+        vst1_p8_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22083,7 +22143,7 @@ mod test {
         let a: [u8; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
         let e: [u8; 24] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
         let mut r: [u8; 24] = [0u8; 24];
-        vst1_p8_x3(r.as_mut_ptr(), vld1_p8_x3(a[1..].as_ptr()));
+        vst1_p8_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22092,7 +22152,7 @@ mod test {
         let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let e: [u8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let mut r: [u8; 32] = [0u8; 32];
-        vst1_p8_x4(r.as_mut_ptr(), vld1_p8_x4(a[1..].as_ptr()));
+        vst1_p8_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22101,7 +22161,7 @@ mod test {
         let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let e: [u8; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let mut r: [u8; 32] = [0u8; 32];
-        vst1q_p8_x2(r.as_mut_ptr(), vld1q_p8_x2(a[1..].as_ptr()));
+        vst1q_p8_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22110,7 +22170,7 @@ mod test {
         let a: [u8; 49] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let e: [u8; 48] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let mut r: [u8; 48] = [0u8; 48];
-        vst1q_p8_x3(r.as_mut_ptr(), vld1q_p8_x3(a[1..].as_ptr()));
+        vst1q_p8_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22119,7 +22179,7 @@ mod test {
         let a: [u8; 65] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let e: [u8; 64] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let mut r: [u8; 64] = [0u8; 64];
-        vst1q_p8_x4(r.as_mut_ptr(), vld1q_p8_x4(a[1..].as_ptr()));
+        vst1q_p8_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22128,7 +22188,7 @@ mod test {
         let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
         let e: [u16; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
         let mut r: [u16; 8] = [0u16; 8];
-        vst1_p16_x2(r.as_mut_ptr(), vld1_p16_x2(a[1..].as_ptr()));
+        vst1_p16_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22137,7 +22197,7 @@ mod test {
         let a: [u16; 13] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
         let e: [u16; 12] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
         let mut r: [u16; 12] = [0u16; 12];
-        vst1_p16_x3(r.as_mut_ptr(), vld1_p16_x3(a[1..].as_ptr()));
+        vst1_p16_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22146,7 +22206,7 @@ mod test {
         let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let e: [u16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let mut r: [u16; 16] = [0u16; 16];
-        vst1_p16_x4(r.as_mut_ptr(), vld1_p16_x4(a[1..].as_ptr()));
+        vst1_p16_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22155,7 +22215,7 @@ mod test {
         let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let e: [u16; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let mut r: [u16; 16] = [0u16; 16];
-        vst1q_p16_x2(r.as_mut_ptr(), vld1q_p16_x2(a[1..].as_ptr()));
+        vst1q_p16_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22164,7 +22224,7 @@ mod test {
         let a: [u16; 25] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
         let e: [u16; 24] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
         let mut r: [u16; 24] = [0u16; 24];
-        vst1q_p16_x3(r.as_mut_ptr(), vld1q_p16_x3(a[1..].as_ptr()));
+        vst1q_p16_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22173,7 +22233,61 @@ mod test {
         let a: [u16; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let e: [u16; 32] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
         let mut r: [u16; 32] = [0u16; 32];
-        vst1q_p16_x4(r.as_mut_ptr(), vld1q_p16_x4(a[1..].as_ptr()));
+        vst1q_p16_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst1_p64_x2() {
+        let a: [u64; 3] = [0, 1, 2];
+        let e: [u64; 2] = [1, 2];
+        let mut r: [u64; 2] = [0u64; 2];
+        vst1_p64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst1_p64_x3() {
+        let a: [u64; 4] = [0, 1, 2, 3];
+        let e: [u64; 3] = [1, 2, 3];
+        let mut r: [u64; 3] = [0u64; 3];
+        vst1_p64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst1_p64_x4() {
+        let a: [u64; 5] = [0, 1, 2, 3, 4];
+        let e: [u64; 4] = [1, 2, 3, 4];
+        let mut r: [u64; 4] = [0u64; 4];
+        vst1_p64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst1q_p64_x2() {
+        let a: [u64; 5] = [0, 1, 2, 3, 4];
+        let e: [u64; 4] = [1, 2, 3, 4];
+        let mut r: [u64; 4] = [0u64; 4];
+        vst1q_p64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst1q_p64_x3() {
+        let a: [u64; 7] = [0, 1, 2, 3, 4, 5, 6];
+        let e: [u64; 6] = [1, 2, 3, 4, 5, 6];
+        let mut r: [u64; 6] = [0u64; 6];
+        vst1q_p64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst1q_p64_x4() {
+        let a: [u64; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8];
+        let e: [u64; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+        let mut r: [u64; 8] = [0u64; 8];
+        vst1q_p64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22182,7 +22296,7 @@ mod test {
         let a: [f32; 5] = [0., 1., 2., 3., 4.];
         let e: [f32; 4] = [1., 2., 3., 4.];
         let mut r: [f32; 4] = [0f32; 4];
-        vst1_f32_x2(r.as_mut_ptr(), vld1_f32_x2(a[1..].as_ptr()));
+        vst1_f32_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22191,7 +22305,7 @@ mod test {
         let a: [f32; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.];
         let e: [f32; 8] = [1., 2., 3., 4., 5., 6., 7., 8.];
         let mut r: [f32; 8] = [0f32; 8];
-        vst1q_f32_x2(r.as_mut_ptr(), vld1q_f32_x2(a[1..].as_ptr()));
+        vst1q_f32_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22200,7 +22314,7 @@ mod test {
         let a: [f32; 7] = [0., 1., 2., 3., 4., 5., 6.];
         let e: [f32; 6] = [1., 2., 3., 4., 5., 6.];
         let mut r: [f32; 6] = [0f32; 6];
-        vst1_f32_x3(r.as_mut_ptr(), vld1_f32_x3(a[1..].as_ptr()));
+        vst1_f32_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22209,7 +22323,7 @@ mod test {
         let a: [f32; 13] = [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.];
         let e: [f32; 12] = [1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.];
         let mut r: [f32; 12] = [0f32; 12];
-        vst1q_f32_x3(r.as_mut_ptr(), vld1q_f32_x3(a[1..].as_ptr()));
+        vst1q_f32_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22218,7 +22332,7 @@ mod test {
         let a: [f32; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.];
         let e: [f32; 8] = [1., 2., 3., 4., 5., 6., 7., 8.];
         let mut r: [f32; 8] = [0f32; 8];
-        vst1_f32_x4(r.as_mut_ptr(), vld1_f32_x4(a[1..].as_ptr()));
+        vst1_f32_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
@@ -22227,7 +22341,7 @@ mod test {
         let a: [f32; 17] = [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.];
         let e: [f32; 16] = [1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.];
         let mut r: [f32; 16] = [0f32; 16];
-        vst1q_f32_x4(r.as_mut_ptr(), vld1q_f32_x4(a[1..].as_ptr()));
+        vst1q_f32_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
     }
 
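The new `vst1{,q}_p64_x{2,3,4}` wrappers above are thin transmutes onto the existing signed-integer implementations. On AArch32 only the x2 d-register form is asserted against a concrete `vst1` opcode; the wider forms assert `nop` (no specific instruction), since their lowering there is not a single stable opcode. A round-trip sketch for the q-form pair, assuming aarch64 with neon+aes:

```rust
use core::arch::aarch64::*;

/// Copies four p64 values (two q registers' worth) from src to dst.
#[target_feature(enable = "neon,aes")]
unsafe fn copy_four_p64(src: *const p64, dst: *mut p64) {
    let v: poly64x2x2_t = vld1q_p64_x2(src);
    vst1q_p64_x2(dst, v);
}
```
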
diff --git a/crates/core_arch/src/arm_shared/neon/store_tests.rs b/crates/core_arch/src/arm_shared/neon/store_tests.rs
index c1e355fd9a..cad660e877 100644
--- a/crates/core_arch/src/arm_shared/neon/store_tests.rs
+++ b/crates/core_arch/src/arm_shared/neon/store_tests.rs
@@ -339,6 +339,29 @@ unsafe fn test_vst1q_p16() {
     assert_eq!(vals[8], 8);
 }
 
+#[simd_test(enable = "neon,aes")]
+unsafe fn test_vst1_p64() {
+    let mut vals = [0_u64; 2];
+    let a = u64x1::new(1);
+
+    vst1_p64(vals[1..].as_mut_ptr(), transmute(a));
+
+    assert_eq!(vals[0], 0);
+    assert_eq!(vals[1], 1);
+}
+
+#[simd_test(enable = "neon,aes")]
+unsafe fn test_vst1q_p64() {
+    let mut vals = [0_u64; 3];
+    let a = u64x2::new(1, 2);
+
+    vst1q_p64(vals[1..].as_mut_ptr(), transmute(a));
+
+    assert_eq!(vals[0], 0);
+    assert_eq!(vals[1], 1);
+    assert_eq!(vals[2], 2);
+}
+
 #[simd_test(enable = "neon")]
 unsafe fn test_vst1_f32() {
     let mut vals = [0_f32; 3];
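
The `vst1_p64`/`vst1q_p64` tests above are the ones removed from the aarch64-only module: this file is shared between the arm and aarch64 test runs, and the `neon,aes` gate matches the intrinsics' new feature requirements. An equivalent standalone check, avoiding the crate-internal `u64x1` helper (a hypothetical form, assuming aarch64 with neon+aes):

```rust
use core::arch::aarch64::*;

#[target_feature(enable = "neon,aes")]
unsafe fn check_vst1_p64() {
    let mut vals = [0u64; 2];
    // Splat 1 into a poly64x1_t and store it at offset 1, so vals[0]
    // must remain untouched, mirroring the moved test above.
    vst1_p64(vals[1..].as_mut_ptr(), vdup_n_p64(1));
    assert_eq!(vals, [0, 1]);
}
```
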
diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec
index a22dfe3dd4..e2d8f80dfb 100644
--- a/crates/stdarch-gen/neon.spec
+++ b/crates/stdarch-gen/neon.spec
@@ -2082,10 +2082,9 @@ generate *const p16:poly16x4x2_t, *const p16:poly16x4x3_t, *const p16:poly16x4x4
 generate *const p16:poly16x8x2_t, *const p16:poly16x8x3_t, *const p16:poly16x8x4_t
 target = aes
 generate *const p64:poly64x1x2_t
-arm = ldr
+arm = nop
 generate *const p64:poly64x1x3_t, *const p64:poly64x1x4_t
 generate *const p64:poly64x2x2_t, *const p64:poly64x2x3_t, *const p64:poly64x2x4_t
-
 /// Load multiple single-element structures to one, two, three, or four registers
 name = vld1
 out-suffix
@@ -2122,13 +2121,16 @@ out-nox
 a = 0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17
 validate 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
 load_fn
+arm-aarch64-separate
 
 aarch64 = ld2
 link-aarch64 = ld2._EXTv2_
+//generate *const i64:int64x2x2_t
+
 arm = vld2
 link-arm = vld2._EXTpi82_
 //generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t
-//generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t, *const i64:int64x2x2_t
+//generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t
 
 /// Load multiple 2-element structures to two registers
 name = vld2
@@ -2139,10 +2141,17 @@ validate 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9,
 load_fn
 
 aarch64 = ld2
+//generate *const u64:uint64x2x2_t
+target = aes
+//generate *const p64:poly64x2x2_t
+
+target = default
 arm = vld2
 //generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t
-//generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t, *const u64:uint64x2x2_t
+//generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t
 //generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t
+target = aes
+//generate *const p64:poly64x1x2_t
 
 /// Load multiple 2-element structures to two registers
 name = vld2
@@ -2150,6 +2159,7 @@ out-nox
 a = 0., 1., 2., 2., 3., 2., 4., 3., 5., 2., 6., 3., 7., 4., 8., 5., 9.
 validate 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9.
 load_fn
+arm-aarch64-separate
 
 aarch64 = ld2
 link-aarch64 = ld2._EXTv2_
@@ -2166,12 +2176,14 @@ a = 0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 1
 validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
 load_fn
 
-arm = vld2dup
-link-arm = vld2dup._EXTpi82_
 aarch64 = ld2r
 link-aarch64 = ld2r._EXT2_
+//generate *const i64:int64x2x2_t
+
+arm = vld2dup
+link-arm = vld2dup._EXTpi82_
 //generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t
-//generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t, *const i64:int64x2x2_t
+//generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t
 
 /// Load single 2-element structure and replicate to all lanes of two registers
 name = vld2
@@ -2181,11 +2193,18 @@ a = 0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 1
 validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
 load_fn
 
-arm = vld2dup
 aarch64 = ld2r
+//generate *const u64:uint64x2x2_t
+target = aes
+//generate *const p64:poly64x2x2_t
+
+target = default
+arm = vld2dup
 //generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t
-//generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t, *const u64:uint64x2x2_t
+//generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t
 //generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t
+target = aes
+//generate *const p64:poly64x1x2_t
 
 /// Load single 2-element structure and replicate to all lanes of two registers
 name = vld2
@@ -2217,13 +2236,13 @@ arm-aarch64-separate
 aarch64 = ld2lane
 const-aarch64 = LANE
 link-aarch64 = ld2lane._EXTpi82_
-//generate *const i64:int64x1x2_t:int64x1x2_t, *const i64:int64x2x2_t:int64x2x2_t
+//generate *const i8:int8x16x2_t:int8x16x2_t, *const i64:int64x1x2_t:int64x1x2_t, *const i64:int64x2x2_t:int64x2x2_t
 
 arm = vld2lane
 const-arm = LANE
 link-arm = vld2lane._EXTpi82_
 //generate *const i8:int8x8x2_t:int8x8x2_t, *const i16:int16x4x2_t:int16x4x2_t, *const i32:int32x2x2_t:int32x2x2_t
-//generate *const i8:int8x16x2_t:int8x16x2_t, *const i16:int16x8x2_t:int16x8x2_t, *const i32:int32x4x2_t:int32x4x2_t
+//generate *const i16:int16x8x2_t:int16x8x2_t, *const i32:int32x4x2_t:int32x4x2_t
 
 /// Load multiple 2-element structures to two registers
 name = vld2
@@ -2236,7 +2255,6 @@ b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 1
 n = 0
 validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
 load_fn
-arm-aarch64-separate
 
 aarch64 = ld2lane
 const-aarch64 = LANE
@@ -2245,14 +2263,15 @@ target = aes
 //generate *const p64:poly64x1x2_t:poly64x1x2_t, *const p64:poly64x2x2_t:poly64x2x2_t
 
 target = default
-//generate *const u64:uint64x1x2_t:uint64x1x2_t, *const u64:uint64x2x2_t:uint64x2x2_t
+//generate *const u8:uint8x16x2_t:uint8x16x2_t, *const u64:uint64x1x2_t:uint64x1x2_t, *const u64:uint64x2x2_t:uint64x2x2_t
+//generate *const p8:poly8x16x2_t:poly8x16x2_t
 
 arm = vld2lane
 const-arm = LANE
 //generate *const u8:uint8x8x2_t:uint8x8x2_t, *const u16:uint16x4x2_t:uint16x4x2_t, *const u32:uint32x2x2_t:uint32x2x2_t
-//generate *const u8:uint8x16x2_t:uint8x16x2_t, *const u16:uint16x8x2_t:uint16x8x2_t, *const u32:uint32x4x2_t:uint32x4x2_t
+//generate *const u16:uint16x8x2_t:uint16x8x2_t, *const u32:uint32x4x2_t:uint32x4x2_t
 //generate *const p8:poly8x8x2_t:poly8x8x2_t, *const p16:poly16x4x2_t:poly16x4x2_t
-//generate *const p8:poly8x16x2_t:poly8x16x2_t, *const p16:poly16x8x2_t:poly16x8x2_t
+//generate *const p16:poly16x8x2_t:poly16x8x2_t
 
 /// Load multiple 2-element structures to two registers
 name = vld2
@@ -2276,6 +2295,398 @@ const-arm = LANE
 link-arm = vld2lane._EXTpi82_
 //generate *const f32:float32x2x2_t:float32x2x2_t, *const f32:float32x4x2_t:float32x4x2_t
 
+/// Load multiple 3-element structures to three registers
+name = vld3
+out-nox
+a = 0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48
+validate 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld3
+link-aarch64 = ld3._EXTv2_
+//generate *const i64:int64x2x3_t
+
+arm = vld3
+link-arm = vld3._EXTpi82_
+//generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t, *const i64:int64x1x3_t
+//generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t
+
+/// Load multiple 3-element structures to three registers
+name = vld3
+out-nox
+multi_fn = transmute, {vld3-outsignednox-noext, transmute(a)}
+a = 0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48
+validate 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48
+load_fn
+
+aarch64 = ld3
+//generate *const u64:uint64x2x3_t
+target = aes
+//generate *const p64:poly64x2x3_t
+
+target = default
+arm = vld3
+//generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t, *const u64:uint64x1x3_t
+//generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t
+//generate *const p8:poly8x8x3_t, *const p16:poly16x4x3_t, *const p8:poly8x16x3_t, *const p16:poly16x8x3_t
+target = aes
+//generate *const p64:poly64x1x3_t
+
+/// Load multiple 3-element structures to three registers
+name = vld3
+out-nox
+a = 0., 1., 2., 2., 2., 4., 4., 2., 7., 7., 4., 8., 8.
+validate 1., 2., 2., 4., 2., 4., 7., 8., 2., 4., 7., 8.
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld3
+link-aarch64 = ld3._EXTv2_
+//generate *const f64:float64x1x3_t, *const f64:float64x2x3_t
+
+arm = vld3
+link-arm = vld3._EXTpi82_
+//generate *const f32:float32x2x3_t, *const f32:float32x4x3_t
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+name = vld3
+out-dup-nox
+a = 0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17
+validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+load_fn
+
+aarch64 = ld3r
+link-aarch64 = ld3r._EXT2_
+//generate *const i64:int64x2x3_t
+
+arm = vld3dup
+link-arm = vld3dup._EXTpi82_
+//generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t, *const i64:int64x1x3_t
+//generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+name = vld3
+out-dup-nox
+multi_fn = transmute, {vld3-outsigneddupnox-noext, transmute(a)}
+a = 0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17
+validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+load_fn
+
+aarch64 = ld3r
+//generate *const u64:uint64x2x3_t
+target = aes
+//generate *const p64:poly64x2x3_t
+
+target = default
+arm = vld3dup
+//generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t, *const u64:uint64x1x3_t
+//generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t
+//generate *const p8:poly8x8x3_t, *const p16:poly16x4x3_t, *const p8:poly8x16x3_t, *const p16:poly16x8x3_t
+target = aes
+//generate *const p64:poly64x1x3_t
+
+/// Load single 3-element structure and replicate to all lanes of three registers
+name = vld3
+out-dup-nox
+a = 0., 1., 1., 1., 3., 1., 4., 3., 5., 1., 4., 3., 5.
+validate 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.
+load_fn
+
+aarch64 = ld3r
+link-aarch64 = ld3r._EXT2_
+//generate *const f64:float64x1x3_t, *const f64:float64x2x3_t
+
+arm = vld3dup
+link-arm = vld3dup._EXTpi82_
+//generate *const f32:float32x2x3_t, *const f32:float32x4x3_t
+
+/// Load multiple 3-element structures to three registers
+name = vld3
+out-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+constn = LANE
+a = 0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+n = 0
+validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld3lane
+const-aarch64 = LANE
+link-aarch64 = ld3lane._EXTpi82_
+//generate *const i8:int8x16x3_t:int8x16x3_t, *const i64:int64x1x3_t:int64x1x3_t, *const i64:int64x2x3_t:int64x2x3_t
+
+arm = vld3lane
+const-arm = LANE
+link-arm = vld3lane._EXTpi82_
+//generate *const i8:int8x8x3_t:int8x8x3_t, *const i16:int16x4x3_t:int16x4x3_t, *const i32:int32x2x3_t:int32x2x3_t
+//generate *const i16:int16x8x3_t:int16x8x3_t, *const i32:int32x4x3_t:int32x4x3_t
+
+/// Load multiple 3-element structures to three registers
+name = vld3
+out-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = transmute, {vld3-outsignedlanenox-::<LANE>, transmute(a), transmute(b)}
+constn = LANE
+a = 0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+n = 0
+validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
+load_fn
+
+aarch64 = ld3lane
+const-aarch64 = LANE
+target = aes
+//generate *const p64:poly64x1x3_t:poly64x1x3_t, *const p64:poly64x2x3_t:poly64x2x3_t
+target = default
+//generate *const p8:poly8x16x3_t:poly8x16x3_t, *const u8:uint8x16x3_t:uint8x16x3_t, *const u64:uint64x1x3_t:uint64x1x3_t, *const u64:uint64x2x3_t:uint64x2x3_t
+
+arm = vld3lane
+const-arm = LANE
+//generate *const u8:uint8x8x3_t:uint8x8x3_t, *const u16:uint16x4x3_t:uint16x4x3_t, *const u32:uint32x2x3_t:uint32x2x3_t
+//generate *const u16:uint16x8x3_t:uint16x8x3_t, *const u32:uint32x4x3_t:uint32x4x3_t
+//generate *const p8:poly8x8x3_t:poly8x8x3_t, *const p16:poly16x4x3_t:poly16x4x3_t
+//generate *const p16:poly16x8x3_t:poly16x8x3_t
+
+/// Load multiple 3-element structures to three registers
+name = vld3
+out-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+constn = LANE
+a = 0., 1., 2., 2., 4., 5., 6., 7., 8., 5., 6., 7., 8.
+b = 0., 2., 2., 14., 9., 16., 17., 18., 5., 6., 7., 8.
+n = 0
+validate 1., 2., 2., 14., 2., 16., 17., 18., 2., 6., 7., 8.
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld3lane
+const-aarch64 = LANE
+link-aarch64 = ld3lane._EXTpi82_
+//generate *const f64:float64x1x3_t:float64x1x3_t, *const f64:float64x2x3_t:float64x2x3_t
+
+arm = vld3lane
+const-arm = LANE
+link-arm = vld3lane._EXTpi82_
+//generate *const f32:float32x2x3_t:float32x2x3_t, *const f32:float32x4x3_t:float32x4x3_t
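
The lane variants add `constn = LANE` (the const generic) and `n = 0` (the value the test instantiates it with). A sketch of the shape this produces for the q-form f32 case, assuming the usual lane-checking macro and const-generics attribute (test attributes omitted; link name reconstructed from `ld3lane._EXTpi82_`):

```rust
#[inline]
#[target_feature(enable = "neon")]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vld3q_lane_f32<const LANE: i32>(a: *const f32, b: float32x4x3_t) -> float32x4x3_t {
    static_assert_imm2!(LANE); // 4 lanes -> 2 immediate bits
    #[allow(improper_ctypes)]
    extern "unadjusted" {
        #[link_name = "llvm.aarch64.neon.ld3lane.v4f32.p0i8"]
        fn vld3q_lane_f32_(
            a: float32x4_t, b: float32x4_t, c: float32x4_t, n: i64, ptr: *const i8,
        ) -> float32x4x3_t;
    }
    vld3q_lane_f32_(b.0, b.1, b.2, LANE as i64, a.cast())
}
```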
+
+/// Load multiple 4-element structures to four registers
+name = vld4
+out-nox
+a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld4
+link-aarch64 = ld4._EXTv2_
+//generate *const i64:int64x2x4_t
+
+arm = vld4
+link-arm = vld4._EXTpi82_
+//generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t, *const i64:int64x1x4_t
+//generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t
+
+/// Load multiple 4-element structures to four registers
+name = vld4
+out-nox
+multi_fn = transmute, {vld4-outsignednox-noext, transmute(a)}
+a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+load_fn
+
+aarch64 = ld4
+//generate *const u64:uint64x2x4_t
+target = aes
+//generate *const p64:poly64x2x4_t
+
+target = default
+arm = vld4
+//generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t, *const u64:uint64x1x4_t
+//generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t
+//generate *const p8:poly8x8x4_t, *const p16:poly16x4x4_t, *const p8:poly8x16x4_t, *const p16:poly16x8x4_t
+target = aes
+//generate *const p64:poly64x1x4_t
+
+/// Load multiple 4-element structures to four registers
+name = vld4
+out-nox
+a = 0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 15., 16.
+validate 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 15., 6., 8., 8., 16.
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld4
+link-aarch64 = ld4._EXTv2_
+//generate *const f64:float64x1x4_t, *const f64:float64x2x4_t
+
+arm = vld4
+link-arm = vld4._EXTpi82_
+//generate *const f32:float32x2x4_t, *const f32:float32x4x4_t
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+name = vld4
+out-dup-nox
+a = 0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9
+validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+load_fn
+
+aarch64 = ld4r
+link-aarch64 = ld4r._EXT2_
+//generate *const i64:int64x2x4_t
+
+arm = vld4dup
+link-arm = vld4dup._EXTpi82_
+//generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t, *const i64:int64x1x4_t
+//generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+name = vld4
+out-dup-nox
+multi_fn = transmute, {vld4-outsigneddupnox-noext, transmute(a)}
+a = 0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9
+validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+load_fn
+
+aarch64 = ld4r
+//generate *const u64:uint64x2x4_t
+target = aes
+//generate *const p64:poly64x2x4_t
+
+target = default
+arm = vld4dup
+//generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t, *const u64:uint64x1x4_t
+//generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t
+//generate *const p8:poly8x8x4_t, *const p16:poly16x4x4_t, *const p8:poly8x16x4_t, *const p16:poly16x8x4_t
+target = aes
+//generate *const p64:poly64x1x4_t
+
+/// Load single 4-element structure and replicate to all lanes of four registers
+name = vld4
+out-dup-nox
+a = 0., 1., 1., 1., 1., 6., 4., 3., 5., 7., 4., 3., 5., 8., 4., 3., 5., 9., 4., 3., 5.
+validate 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.
+load_fn
+
+aarch64 = ld4r
+link-aarch64 = ld4r._EXT2_
+//generate *const f64:float64x1x4_t, *const f64:float64x2x4_t
+
+arm = vld4dup
+link-arm = vld4dup._EXTpi82_
+//generate *const f32:float32x2x4_t, *const f32:float32x4x4_t
+
+/// Load multiple 4-element structures to four registers
+name = vld4
+out-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+constn = LANE
+a = 0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16
+b = 0, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 11, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16
+n = 0
+validate 1, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 2, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld4lane
+const-aarch64 = LANE
+link-aarch64 = ld4lane._EXTpi82_
+//generate *const i8:int8x16x4_t:int8x16x4_t, *const i64:int64x1x4_t:int64x1x4_t, *const i64:int64x2x4_t:int64x2x4_t
+
+arm = vld4lane
+const-arm = LANE
+link-arm = vld4lane._EXTpi82_
+//generate *const i8:int8x8x4_t:int8x8x4_t, *const i16:int16x4x4_t:int16x4x4_t, *const i32:int32x2x4_t:int32x2x4_t
+//generate *const i16:int16x8x4_t:int16x8x4_t, *const i32:int32x4x4_t:int32x4x4_t
+
+/// Load multiple 4-element structures to four registers
+name = vld4
+out-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = transmute, {vld4-outsignedlanenox-::<LANE>, transmute(a), transmute(b)}
+constn = LANE
+a = 0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16
+b = 0, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 11, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16
+n = 0
+validate 1, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 2, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16
+load_fn
+
+aarch64 = ld4lane
+const-aarch64 = LANE
+target = aes
+//generate *const p64:poly64x1x4_t:poly64x1x4_t, *const p64:poly64x2x4_t:poly64x2x4_t
+target = default
+//generate *const p8:poly8x16x4_t:poly8x16x4_t, *const u8:uint8x16x4_t:uint8x16x4_t, *const u64:uint64x1x4_t:uint64x1x4_t, *const u64:uint64x2x4_t:uint64x2x4_t
+
+arm = vld4lane
+const-arm = LANE
+//generate *const u8:uint8x8x4_t:uint8x8x4_t, *const u16:uint16x4x4_t:uint16x4x4_t, *const u32:uint32x2x4_t:uint32x2x4_t
+//generate *const u16:uint16x8x4_t:uint16x8x4_t, *const u32:uint32x4x4_t:uint32x4x4_t
+//generate *const p8:poly8x8x4_t:poly8x8x4_t, *const p16:poly16x4x4_t:poly16x4x4_t
+//generate *const p16:poly16x8x4_t:poly16x8x4_t
+
+/// Load multiple 4-element structures to four registers
+name = vld4
+out-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+constn = LANE
+a = 0., 1., 2., 2., 2., 5., 6., 7., 8., 5., 6., 7., 8., 1., 4., 3., 5.
+b = 0., 2., 2., 2., 2., 16., 2., 18., 5., 6., 7., 8., 1., 4., 3., 5.
+n = 0
+validate 1., 2., 2., 2., 2., 16., 2., 18., 2., 6., 7., 8., 2., 4., 3., 5.
+load_fn
+arm-aarch64-separate
+
+aarch64 = ld4lane
+const-aarch64 = LANE
+link-aarch64 = ld4lane._EXTpi82_
+//generate *const f64:float64x1x4_t:float64x1x4_t, *const f64:float64x2x4_t:float64x2x4_t
+
+arm = vld4lane
+const-arm = LANE
+link-arm = vld4lane._EXTpi82_
+//generate *const f32:float32x2x4_t:float32x2x4_t, *const f32:float32x4x4_t:float32x4x4_t
+
+/// Store multiple single-element structures from one, two, three, or four registers
+name = vst1
+in1-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = *a, {simd_extract, b, LANE as u32}
+constn = LANE
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+n = 0
+validate 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+store_fn
+
+aarch64 = nop
+arm = nop
+//generate *mut i8:int8x8_t:void, *mut i16:int16x4_t:void, *mut i32:int32x2_t:void, *mut i64:int64x1_t:void
+//generate *mut i8:int8x16_t:void, *mut i16:int16x8_t:void, *mut i32:int32x4_t:void, *mut i64:int64x2_t:void
+//generate *mut u8:uint8x8_t:void, *mut u16:uint16x4_t:void, *mut u32:uint32x2_t:void, *mut u64:uint64x1_t:void
+//generate *mut u8:uint8x16_t:void, *mut u16:uint16x8_t:void, *mut u32:uint32x4_t:void, *mut u64:uint64x2_t:void
+//generate *mut p8:poly8x8_t:void, *mut p16:poly16x4_t:void, *mut p8:poly8x16_t:void, *mut p16:poly16x8_t:void
+target = aes
+//generate *mut p64:poly64x1_t:void, *mut p64:poly64x2_t:void
+
+/// Store multiple single-element structures from one, two, three, or four registers
+name = vst1
+in1-lane-nox
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = *a, {simd_extract, b, LANE as u32}
+constn = LANE
+a = 0., 1., 2., 3., 4., 5., 6., 7., 8.
+n = 0
+validate 1., 0., 0., 0., 0., 0., 0., 0.
+store_fn
+
+aarch64 = nop
+//generate *mut f64:float64x1_t:void, *mut f64:float64x2_t:void
+
+arm = nop
+//generate *mut f32:float32x2_t:void, *mut f32:float32x4_t:void
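
The `multi_fn = *a, {simd_extract, b, LANE as u32}` line relies on the new `get_call` case below that treats a leading `*` as an assignment target rather than a callee. Roughly, it expands to a body like this (attributes omitted; no LLVM intrinsic is involved, hence `aarch64 = nop`):

```rust
// Sketch: store one lane with a plain pointer write.
pub unsafe fn vst1_lane_f32<const LANE: i32>(a: *mut f32, b: float32x2_t) {
    static_assert_imm1!(LANE);
    *a = simd_extract(b, LANE as u32);
}
```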
+
 /// Store multiple single-element structures from one, two, three, or four registers
 name = vst1
 a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
@@ -2319,6 +2730,11 @@ generate *mut p8:poly8x8x2_t:void, *mut p8:poly8x8x3_t:void, *mut p8:poly8x8x4_t
 generate *mut p8:poly8x16x2_t:void, *mut p8:poly8x16x3_t:void, *mut p8:poly8x16x4_t:void
 generate *mut p16:poly16x4x2_t:void, *mut p16:poly16x4x3_t:void, *mut p16:poly16x4x4_t:void
 generate *mut p16:poly16x8x2_t:void, *mut p16:poly16x8x3_t:void, *mut p16:poly16x8x4_t:void
+target = aes
+generate *mut p64:poly64x1x2_t:void
+arm = nop
+generate *mut p64:poly64x1x3_t:void, *mut p64:poly64x1x4_t:void
+generate *mut p64:poly64x2x2_t:void, *mut p64:poly64x2x3_t:void, *mut p64:poly64x2x4_t:void
 
 /// Store multiple single-element structures to one, two, three, or four registers
 name = vst1
@@ -2350,6 +2766,363 @@ link-aarch64 = st1x4._EXT3_
 link-arm = vst1x4._EXTr3_
 generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void
 
+/// Store multiple 2-element structures from two registers
+name = vst2
+in1-nox
+a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+validate 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17
+store_fn
+arm-aarch64-separate
+
+aarch64 = st2
+link-aarch64 = st2._EXTpi8_
+//generate *mut i64:int64x2x2_t:void
+
+arm = vst2
+link-arm = vst2._EXTpi8r_
+//generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void, *mut i64:int64x1x2_t:void
+//generate *mut i8:int8x16x2_t:void, *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void
+
+/// Store multiple 2-element structures from two registers
+name = vst2
+multi_fn = transmute, {vst2-in1signednox-noext, transmute(a), transmute(b)}
+in1-nox
+a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+validate 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17
+store_fn
+
+aarch64 = st2
+//generate *mut u64:uint64x2x2_t:void
+target = aes
+//generate *mut p64:poly64x2x2_t:void
+
+target = default
+arm = vst2
+//generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void, *mut u64:uint64x1x2_t:void
+//generate *mut u8:uint8x16x2_t:void, *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void
+//generate *mut p8:poly8x8x2_t:void, *mut p16:poly16x4x2_t:void, *mut p8:poly8x16x2_t:void, *mut p16:poly16x8x2_t:void
+target = aes
+//generate *mut p64:poly64x1x2_t:void
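
Unsigned and poly stores have no LLVM intrinsic of their own; the `multi_fn = transmute, {vst2-in1signednox-noext, transmute(a), transmute(b)}` line forwards them to the signed twin, with `in1signednox` (added to `get_call` below) deriving the suffix from the signed counterpart of in_t[1]. A rough sketch of the resulting wrapper:

```rust
// Sketch: vst2_u8 delegates to vst2_s8; the transmutes are layout
// no-ops, including the outer one on the () result.
pub unsafe fn vst2_u8(a: *mut u8, b: uint8x8x2_t) {
    transmute(vst2_s8(transmute(a), transmute(b)))
}
```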
+
+/// Store multiple 2-element structures from two registers
+name = vst2
+in1-nox
+a = 0., 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9.
+validate 1., 2., 2., 3., 2., 4., 3., 5., 2., 6., 3., 7., 4., 8., 5., 9.
+store_fn
+arm-aarch64-separate
+
+aarch64 = st2
+link-aarch64 = st2._EXTpi8_
+//generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void
+
+arm = vst2
+link-arm = vst2._EXTpi8r_
+//generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void
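
For the direct store entries, the generator now unpacks the tuple and casts the destination pointer to `*mut i8` (the `is_vstx` rule below). A sketch of the aarch64 f32 q-form, with the link name reconstructed from `st2._EXTpi8_` and test attributes omitted:

```rust
pub unsafe fn vst2q_f32(a: *mut f32, b: float32x4x2_t) {
    #[allow(improper_ctypes)]
    extern "unadjusted" {
        #[link_name = "llvm.aarch64.neon.st2.v4f32.p0i8"]
        fn vst2q_f32_(a: float32x4_t, b: float32x4_t, ptr: *mut i8);
    }
    vst2q_f32_(b.0, b.1, a.cast())
}
```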
+
+/// Store multiple 2-element structures from two registers
+name = vst2
+in1-lane-nox
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+n = 0
+validate 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+store_fn
+arm-aarch64-separate
+
+aarch64 = st2lane
+link-aarch64 = st2lane._EXTpi8_
+const-aarch64 = LANE
+//generate *mut i8:int8x16x2_t:void, *mut i64:int64x1x2_t:void, *mut i64:int64x2x2_t:void
+
+arm = vst2lane
+link-arm = vst2lane._EXTpi8r_
+const-arm = LANE
+//generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void
+//generate *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void
+
+/// Store multiple 2-element structures from two registers
+name = vst2
+in1-lane-nox
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = transmute, {vst2-in1signedlanenox-::<LANE>, transmute(a), transmute(b)}
+a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+n = 0
+validate 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+store_fn
+
+aarch64 = st2lane
+//generate *mut u8:uint8x16x2_t:void, *mut u64:uint64x1x2_t:void, *mut u64:uint64x2x2_t:void, *mut p8:poly8x16x2_t:void
+target = aes
+//generate *mut p64:poly64x1x2_t:void, *mut p64:poly64x2x2_t:void
+
+target = default
+arm = vst2lane
+//generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void
+//generate *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void
+//generate *mut p8:poly8x8x2_t:void, *mut p16:poly16x4x2_t:void, *mut p16:poly16x8x2_t:void
+
+/// Store multiple 2-element structures from two registers
+name = vst2
+in1-lane-nox
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+a = 0., 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9.
+n = 0
+validate 1., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
+store_fn
+arm-aarch64-separate
+
+aarch64 = st2lane
+link-aarch64 = st2lane._EXTpi8_
+const-aarch64 = LANE
+//generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void
+
+arm = vst2lane
+link-arm = vst2lane._EXTpi8r_
+const-arm = LANE
+//generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void
+
+/// Store multiple 3-element structures from three registers
+name = vst3
+in1-nox
+a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48
+validate 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48
+store_fn
+arm-aarch64-separate
+
+aarch64 = st3
+link-aarch64 = st3._EXTpi8_
+//generate *mut i64:int64x2x3_t:void
+
+arm = vst3
+link-arm = vst3._EXTpi8r_
+//generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void, *mut i64:int64x1x3_t:void
+//generate *mut i8:int8x16x3_t:void, *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void
+
+/// Store multiple 3-element structures from three registers
+name = vst3
+multi_fn = transmute, {vst3-in1signednox-noext, transmute(a), transmute(b)}
+in1-nox
+a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48
+validate 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48
+store_fn
+
+aarch64 = st3
+//generate *mut u64:uint64x2x3_t:void
+target = aes
+//generate *mut p64:poly64x2x3_t:void
+
+target = default
+arm = vst3
+//generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void, *mut u64:uint64x1x3_t:void
+//generate *mut u8:uint8x16x3_t:void, *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void
+//generate *mut p8:poly8x8x3_t:void, *mut p16:poly16x4x3_t:void, *mut p8:poly8x16x3_t:void, *mut p16:poly16x8x3_t:void
+target = aes
+//generate *mut p64:poly64x1x3_t:void
+
+/// Store multiple 3-element structures from three registers
+name = vst3
+in1-nox
+a = 0., 1., 2., 2., 4., 2., 4., 7., 8., 2., 4., 7., 8., 13., 14., 15., 16.
+validate 1., 2., 2., 2., 4., 4., 2., 7., 7., 4., 8., 8., 2., 13., 13., 4.
+store_fn
+arm-aarch64-separate
+
+aarch64 = st3
+link-aarch64 = st3._EXTpi8_
+//generate *mut f64:float64x1x3_t:void, *mut f64:float64x2x3_t:void
+
+arm = vst3
+link-arm = vst3._EXTpi8r_
+//generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void
+
+/// Store multiple 3-element structures from three registers
+name = vst3
+in1-lane-nox
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48
+n = 0
+validate 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+store_fn
+arm-aarch64-separate
+
+aarch64 = st3lane
+link-aarch64 = st3lane._EXTpi8_
+const-aarch64 = LANE
+//generate *mut i8:int8x16x3_t:void, *mut i64:int64x1x3_t:void, *mut i64:int64x2x3_t:void
+
+arm = vst3lane
+link-arm = vst3lane._EXTpi8r_
+const-arm = LANE
+//generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void
+//generate *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void
+
+/// Store multiple 3-element structures from three registers
+name = vst3
+in1-lane-nox
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = transmute, {vst3-in1signedlanenox-::<LANE>, transmute(a), transmute(b)}
+a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48
+n = 0
+validate 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+store_fn
+
+aarch64 = st3lane
+//generate *mut u8:uint8x16x3_t:void, *mut u64:uint64x1x3_t:void, *mut u64:uint64x2x3_t:void, *mut p8:poly8x16x3_t:void
+target = aes
+//generate *mut p64:poly64x1x3_t:void, *mut p64:poly64x2x3_t:void
+
+target = default
+arm = vst3lane
+//generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void
+//generate *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void
+//generate *mut p8:poly8x8x3_t:void, *mut p16:poly16x4x3_t:void, *mut p16:poly16x8x3_t:void
+
+/// Store multiple 3-element structures from three registers
+name = vst3
+in1-lane-nox
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+a = 0., 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9.
+n = 0
+validate 1., 2., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
+store_fn
+arm-aarch64-separate
+
+aarch64 = st3lane
+link-aarch64 = st3lane._EXTpi8_
+const-aarch64 = LANE
+//generate *mut f64:float64x1x3_t:void, *mut f64:float64x2x3_t:void
+
+arm = vst3lane
+link-arm = vst3lane._EXTpi8r_
+const-arm = LANE
+//generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void
+
+/// Store multiple 4-element structures from four registers
+name = vst4
+in1-nox
+a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+store_fn
+arm-aarch64-separate
+
+aarch64 = st4
+link-aarch64 = st4._EXTpi8_
+//generate *mut i64:int64x2x4_t:void
+
+arm = vst4
+link-arm = vst4._EXTpi8r_
+//generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void, *mut i64:int64x1x4_t:void
+//generate *mut i8:int8x16x4_t:void, *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void
+
+/// Store multiple 4-element structures from four registers
+name = vst4
+multi_fn = transmute, {vst4-in1signednox-noext, transmute(a), transmute(b)}
+in1-nox
+a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+store_fn
+
+aarch64 = st4
+//generate *mut u64:uint64x2x4_t:void
+target = aes
+//generate *mut p64:poly64x2x4_t:void
+
+target = default
+arm = vst4
+//generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void, *mut u64:uint64x1x4_t:void
+//generate *mut u8:uint8x16x4_t:void, *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void
+//generate *mut p8:poly8x8x4_t:void, *mut p16:poly16x4x4_t:void, *mut p8:poly8x16x4_t:void, *mut p16:poly16x8x4_t:void
+target = aes
+//generate *mut p64:poly64x1x4_t:void
+
+/// Store multiple 4-element structures from four registers
+name = vst4
+in1-nox
+a = 0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16.
+validate 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16.
+store_fn
+arm-aarch64-separate
+
+aarch64 = st4
+link-aarch64 = st4._EXTpi8_
+//generate *mut f64:float64x1x4_t:void, *mut f64:float64x2x4_t:void
+
+arm = vst4
+link-arm = vst4._EXTpi8r_
+//generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void
+
+/// Store multiple 4-element structures from four registers
+name = vst4
+in1-lane-nox
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+n = 0
+validate 1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+store_fn
+arm-aarch64-separate
+
+aarch64 = st4lane
+link-aarch64 = st4lane._EXTpi8_
+const-aarch64 = LANE
+//generate *mut i8:int8x16x4_t:void, *mut i64:int64x1x4_t:void, *mut i64:int64x2x4_t:void
+
+arm = vst4lane
+link-arm = vst4lane._EXTpi8r_
+const-arm = LANE
+//generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void
+//generate *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void
+
+/// Store multiple 4-element structures from four registers
+name = vst4
+in1-lane-nox
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+multi_fn = transmute, {vst4-in1signedlanenox-::<LANE>, transmute(a), transmute(b)}
+a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64
+n = 0
+validate 1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+store_fn
+
+aarch64 = st4lane
+//generate *mut u8:uint8x16x4_t:void, *mut u64:uint64x1x4_t:void, *mut u64:uint64x2x4_t:void, *mut p8:poly8x16x4_t:void
+target = aes
+//generate *mut p64:poly64x1x4_t:void, *mut p64:poly64x2x4_t:void
+
+target = default
+arm = vst4lane
+//generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void
+//generate *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void
+//generate *mut p8:poly8x8x4_t:void, *mut p16:poly16x4x4_t:void, *mut p16:poly16x8x4_t:void
+
+/// Store multiple 4-element structures from four registers
+name = vst4
+in1-lane-nox
+constn = LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
+a = 0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16.
+n = 0
+validate 1., 2., 2., 6., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
+store_fn
+arm-aarch64-separate
+
+aarch64 = st4lane
+link-aarch64 = st4lane._EXTpi8_
+const-aarch64 = LANE
+//generate *mut f64:float64x1x4_t:void, *mut f64:float64x2x4_t:void
+
+arm = vst4lane
+link-arm = vst4lane._EXTpi8r_
+const-arm = LANE
+//generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void
+
 /// Multiply
 name = vmul
 a = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
diff --git a/crates/stdarch-gen/src/main.rs b/crates/stdarch-gen/src/main.rs
index 35ed3308dc..0f88daf111 100644
--- a/crates/stdarch-gen/src/main.rs
+++ b/crates/stdarch-gen/src/main.rs
@@ -427,8 +427,10 @@ enum Suffix {
     OutSuffix,
     OutNSuffix,
     OutNox,
+    In1Nox,
     OutDupNox,
     OutLaneNox,
+    In1LaneNox,
     Lane,
     In2,
     In2Lane,
@@ -909,7 +911,25 @@ fn ext(s: &str, in_t: &[&str; 3], out_t: &str) -> String {
         .replace("_EXT3_", &type_to_ext(in_t[1], false, false, false))
         .replace("_EXTr3_", &type_to_ext(in_t[1], false, true, false))
         .replace("_EXTv2_", &type_to_ext(out_t, true, false, false))
+        .replace("_EXTpi8_", &type_to_ext(in_t[1], false, false, true))
         .replace("_EXTpi82_", &type_to_ext(out_t, false, false, true))
+        .replace("_EXTpi8r_", &type_to_ext(in_t[1], false, true, true))
+}
+
+fn is_vldx(name: &str) -> bool {
+    let s: Vec<_> = name.split('_').collect();
+    s.len() == 2
+        && &name[0..3] == "vld"
+        && name[3..4].parse::<i32>().unwrap() > 1
+        && (s[1].starts_with("s") || s[1].starts_with("f"))
+}
+
+fn is_vstx(name: &str) -> bool {
+    let s: Vec<_> = name.split('_').collect();
+    s.len() == 2
+        && &name[0..3] == "vst"
+        && name[3..4].parse::<i32>().unwrap() > 1
+        && (s[1].starts_with("s") || s[1].starts_with("f"))
 }
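
These two predicates gate the new signature and cast rules purely on the intrinsic's name. Hypothetical checks of what they accept:

```rust
assert!(is_vldx("vld3q_f64"));
assert!(is_vstx("vst2_s8"));
assert!(!is_vstx("vst1_s8"));     // digit after vst must be > 1
assert!(!is_vstx("vst2_u8"));     // u/p types go through the signed twin
assert!(!is_vldx("vld1_f64_x2")); // _xN names split into three parts
```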
 
 #[allow(clippy::too_many_arguments)]
@@ -964,6 +984,11 @@ fn gen_aarch64(
             current_name,
             type_to_suffix(&type_to_sub_type(out_t))
         ),
+        In1Nox => format!(
+            "{}{}",
+            current_name,
+            type_to_suffix(&type_to_sub_type(in_t[1]))
+        ),
         OutDupNox => format!(
             "{}{}",
             current_name,
@@ -974,6 +999,11 @@ fn gen_aarch64(
             current_name,
             type_to_lane_suffix(&type_to_sub_type(out_t))
         ),
+        In1LaneNox => format!(
+            "{}{}",
+            current_name,
+            type_to_lane_suffix(&type_to_sub_type(in_t[1]))
+        ),
         Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[1])),
         In2 => format!("{}{}", current_name, type_to_suffix(in_t[2])),
         In2Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[2])),
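
`In1Nox`/`In1LaneNox` exist because stores return `void`: `OutNox` keys the suffix off `out_t`, which carries no type information here, so the new variants key off the first input instead. A sketch of the mapping (hypothetical values):

```rust
let current_name = "vst3";
let in1 = "int64x2x3_t";
// sub type "int64x2_t" -> suffix "q_s64"
let fn_name = format!("{}{}", current_name, type_to_suffix(&type_to_sub_type(in1)));
assert_eq!(fn_name, "vst3q_s64");
```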
@@ -1030,23 +1060,32 @@ fn gen_aarch64(
         };
         let (ext_inputs, ext_output) = {
             if const_aarch64.is_some() {
-                if matches!(fn_type, Fntype::Load) {
+                if !matches!(fn_type, Fntype::Normal) {
+                    let ptr_type = match fn_type {
+                        Fntype::Load => "*const i8",
+                        Fntype::Store => "*mut i8",
+                        _ => panic!("unsupported fn type"),
+                    };
                     let sub = type_to_sub_type(in_t[1]);
                     (
                         match type_sub_len(in_t[1]) {
-                            1 => format!("a: {}, n: i64, ptr: *const i8", sub),
-                            2 => format!("a: {}, b: {}, n: i64, ptr: *const i8", sub, sub),
+                            1 => format!("a: {}, n: i64, ptr: {}", sub, ptr_type),
+                            2 => format!("a: {}, b: {}, n: i64, ptr: {}", sub, sub, ptr_type),
                             3 => format!(
-                                "a: {}, b: {}, c: {}, n: i64, ptr: *const i8",
-                                sub, sub, sub
+                                "a: {}, b: {}, c: {}, n: i64, ptr: {}",
+                                sub, sub, sub, ptr_type
                             ),
                             4 => format!(
-                                "a: {}, b: {}, c: {}, d: {}, n: i64, ptr: *const i8",
-                                sub, sub, sub, sub
+                                "a: {}, b: {}, c: {}, d: {}, n: i64, ptr: {}",
+                                sub, sub, sub, sub, ptr_type
                             ),
                             _ => panic!("unsupported type: {}", in_t[1]),
                         },
-                        format!(" -> {}", out_t),
+                        if out_t != "void" {
+                            format!(" -> {}", out_t)
+                        } else {
+                            String::new()
+                        },
                     )
                 } else {
                     (
@@ -1061,19 +1100,23 @@ fn gen_aarch64(
                 }
             } else if matches!(fn_type, Fntype::Store) {
                 let sub = type_to_sub_type(in_t[1]);
-                let native = type_to_native_type(in_t[1]);
+                let ptr_type = if is_vstx(&name) {
+                    "i8".to_string()
+                } else {
+                    type_to_native_type(in_t[1])
+                };
+                let subs = match type_sub_len(in_t[1]) {
+                    1 => format!("a: {}", sub),
+                    2 => format!("a: {}, b: {}", sub, sub),
+                    3 => format!("a: {}, b: {}, c: {}", sub, sub, sub),
+                    4 => format!("a: {}, b: {}, c: {}, d: {}", sub, sub, sub, sub),
+                    _ => panic!("unsupported type: {}", in_t[1]),
+                };
+                (format!("{}, ptr: *mut {}", subs, ptr_type), String::new())
+            } else if is_vldx(&name) {
                 (
-                    match type_sub_len(in_t[1]) {
-                        1 => format!("a: {}, ptr: *mut {}", sub, native),
-                        2 => format!("a: {}, b: {}, ptr: *mut {}", sub, sub, native),
-                        3 => format!("a: {}, b: {}, c: {}, ptr: *mut {}", sub, sub, sub, native),
-                        4 => format!(
-                            "a: {}, b: {}, c: {}, d: {}, ptr: *mut {}",
-                            sub, sub, sub, sub, native
-                        ),
-                        _ => panic!("unsupported type: {}", in_t[1]),
-                    },
-                    String::new(),
+                    format!("ptr: *const {}", type_to_sub_type(out_t)),
+                    format!(" -> {}", out_t),
                 )
             } else {
                 (
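
With `out_t` allowed to be `void`, the store/const branch above now emits extern signatures like this (a hypothetical st3lane instance; link name reconstructed):

```rust
#[allow(improper_ctypes)]
extern "unadjusted" {
    // data lanes, the lane index as i64, and a raw *mut i8 -- no return value.
    #[link_name = "llvm.aarch64.neon.st3lane.v4f32.p0i8"]
    fn vst3q_lane_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t, n: i64, ptr: *mut i8);
}
```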
@@ -1185,7 +1228,7 @@ fn gen_aarch64(
     };
     let call_params = {
         if let (Some(const_aarch64), Some(_)) = (const_aarch64, link_aarch64) {
-            if matches!(fn_type, Fntype::Load) {
+            if !matches!(fn_type, Fntype::Normal) {
                 let subs = match type_sub_len(in_t[1]) {
                     1 => "b",
                     2 => "b.0, b.1",
@@ -1195,7 +1238,7 @@ fn gen_aarch64(
                 };
                 format!(
                     r#"{}
-    {}{}({}, {} as i64, a as *const i8)"#,
+    {}{}({}, {} as i64, a.cast())"#,
                     multi_calls,
                     ext_c,
                     current_fn,
@@ -1217,14 +1260,17 @@ fn gen_aarch64(
                     _ => String::new(),
                 }
             }
-        } else if matches!(fn_type, Fntype::Store) {
+        } else if link_aarch64.is_some() && matches!(fn_type, Fntype::Store) {
+            let cast = if is_vstx(&name) { ".cast()" } else { "" };
             match type_sub_len(in_t[1]) {
-                1 => format!(r#"{}{}(b, a)"#, ext_c, current_fn),
-                2 => format!(r#"{}{}(b.0, b.1, a)"#, ext_c, current_fn),
-                3 => format!(r#"{}{}(b.0, b.1, b.2, a)"#, ext_c, current_fn),
-                4 => format!(r#"{}{}(b.0, b.1, b.2, b.3, a)"#, ext_c, current_fn),
+                1 => format!(r#"{}{}(b, a{})"#, ext_c, current_fn, cast),
+                2 => format!(r#"{}{}(b.0, b.1, a{})"#, ext_c, current_fn, cast),
+                3 => format!(r#"{}{}(b.0, b.1, b.2, a{})"#, ext_c, current_fn, cast),
+                4 => format!(r#"{}{}(b.0, b.1, b.2, b.3, a{})"#, ext_c, current_fn, cast),
                 _ => panic!("unsupported type: {}", in_t[1]),
             }
+        } else if link_aarch64.is_some() && is_vldx(&name) {
+            format!(r#"{}{}(a.cast())"#, ext_c, current_fn,)
         } else {
             let trans: [&str; 2] = if link_t[3] != out_t {
                 ["transmute(", ")"]
@@ -1406,7 +1452,7 @@ fn gen_store_test(
     unsafe fn test_{}() {{"#,
         name,
     );
-    for (a, _, _, _, e) in current_tests {
+    for (a, _, _, constn, e) in current_tests {
         let a: Vec<String> = a.iter().take(type_len + 1).cloned().collect();
         let e: Vec<String> = e.iter().take(type_len).cloned().collect();
         let mut input = String::from("[");
@@ -1425,12 +1471,15 @@ fn gen_store_test(
             output.push_str(&e[i])
         }
         output.push_str("]");
+        let const_n = constn
+            .as_deref()
+            .map_or(String::new(), |n| format!("::<{}>", n));
         let t = format!(
             r#"
         let a: [{}; {}] = {};
         let e: [{}; {}] = {};
         let mut r: [{}; {}] = [0{}; {}];
-        {}(r.as_mut_ptr(), {}(a[1..].as_ptr()));
+        {}{}(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
         assert_eq!(r, e);
 "#,
             type_to_native_type(in_t[1]),
@@ -1444,7 +1493,7 @@ fn gen_store_test(
             type_to_native_type(in_t[1]),
             type_len,
             name,
-            name.replace("st", "ld"),
+            const_n,
         );
         test.push_str(&t);
     }
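
With `constn` threaded through, lane stores get their const generic instantiated via a turbofish, and the old `name.replace("st", "ld")` round-trip is replaced by an unaligned read of the tuple. For the f32 `vst2` lane entry above (n = 0) the template produces roughly:

```rust
#[simd_test(enable = "neon")]
unsafe fn test_vst2_lane_f32() {
    let a: [f32; 5] = [0., 1., 2., 2., 3.];
    let e: [f32; 4] = [1., 2., 0., 0.];
    let mut r: [f32; 4] = [0f32; 4];
    vst2_lane_f32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
    assert_eq!(r, e);
}
```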
@@ -1613,6 +1662,11 @@ fn gen_arm(
             current_name,
             type_to_suffix(&type_to_sub_type(out_t))
         ),
+        In1Nox => format!(
+            "{}{}",
+            current_name,
+            type_to_suffix(&type_to_sub_type(in_t[1]))
+        ),
         OutDupNox => format!(
             "{}{}",
             current_name,
@@ -1623,6 +1677,11 @@ fn gen_arm(
             current_name,
             type_to_lane_suffix(&type_to_sub_type(out_t))
         ),
+        In1LaneNox => format!(
+            "{}{}",
+            current_name,
+            type_to_lane_suffix(&type_to_sub_type(in_t[1]))
+        ),
         Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[1])),
         In2 => format!("{}{}", current_name, type_to_suffix(in_t[2])),
         In2Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[2])),
@@ -1752,7 +1811,12 @@ fn gen_arm(
         };
         let (arm_ext_inputs, arm_ext_output) = {
             if let Some(const_arm) = const_arm {
-                if matches!(fn_type, Fntype::Load) {
+                if !matches!(fn_type, Fntype::Normal) {
+                    let ptr_type = match fn_type {
+                        Fntype::Load => "*const i8",
+                        Fntype::Store => "*mut i8",
+                        _ => panic!("unsupported fn type"),
+                    };
                     let sub_type = type_to_sub_type(in_t[1]);
                     let inputs = match type_sub_len(in_t[1]) {
                         1 => format!("a: {}", sub_type),
@@ -1765,7 +1829,7 @@ fn gen_arm(
                         _ => panic!("unknown type: {}", in_t[1]),
                     };
                     (
-                        format!("ptr: *const i8, {}, n: i32, size: i32", inputs),
+                        format!("ptr: {}, {}, n: i32, size: i32", ptr_type, inputs),
                         String::new(),
                     )
                 } else {
@@ -1817,10 +1881,20 @@ fn gen_arm(
                     ),
                     _ => panic!("unknown type: {}", in_t[1]),
                 };
+                let (ptr_type, size) = if is_vstx(&name) {
+                    ("i8".to_string(), ", size: i32")
+                } else {
+                    (type_to_native_type(in_t[1]), "")
+                };
                 (
-                    format!("ptr: *mut {}, {}", type_to_native_type(in_t[1]), inputs),
+                    format!("ptr: *mut {}, {}{}", ptr_type, inputs, size),
                     String::new(),
                 )
+            } else if is_vldx(&name) {
+                (
+                    format!("ptr: *const i8, size: i32"),
+                    format!(" -> {}", out_t),
+                )
             } else {
                 (String::new(), String::new())
             }
@@ -1836,7 +1910,12 @@ fn gen_arm(
         ));
         let (aarch64_ext_inputs, aarch64_ext_output) = {
             if const_aarch64.is_some() {
-                if matches!(fn_type, Fntype::Load) {
+                if !matches!(fn_type, Fntype::Normal) {
+                    let ptr_type = match fn_type {
+                        Fntype::Load => "*const i8",
+                        Fntype::Store => "*mut i8",
+                        _ => panic!("unsupported fn type"),
+                    };
                     let sub_type = type_to_sub_type(in_t[1]);
                     let mut inputs = match type_sub_len(in_t[1]) {
                         1 => format!("a: {}", sub_type,),
@@ -1848,8 +1927,13 @@ fn gen_arm(
                         ),
                         _ => panic!("unknown type: {}", in_t[1]),
                     };
-                    inputs.push_str(&format!(", n: i64, ptr: *const i8"));
-                    (inputs, format!(" -> {}", out_t))
+                    inputs.push_str(&format!(", n: i64, ptr: {}", ptr_type));
+                    let out = if out_t == "void" {
+                        String::new()
+                    } else {
+                        format!(" -> {}", out_t)
+                    };
+                    (inputs, out)
                 } else {
                     (
                         match para_num {
@@ -1886,8 +1970,18 @@ fn gen_arm(
                     ),
                     _ => panic!("unknown type: {}", in_t[1]),
                 };
-                inputs.push_str(&format!(", ptr: *mut {}", type_to_native_type(in_t[0])));
+                let ptr_type = if is_vstx(&name) {
+                    "i8".to_string()
+                } else {
+                    type_to_native_type(in_t[1])
+                };
+                inputs.push_str(&format!(", ptr: *mut {}", ptr_type));
                 (inputs, String::new())
+            } else if is_vldx(&name) {
+                (
+                    format!("ptr: *const {}", type_to_sub_type(out_t)),
+                    format!(" -> {}", out_t),
+                )
             } else {
                 (String::new(), String::new())
             }
@@ -1962,7 +2056,7 @@ fn gen_arm(
     let function = if separate {
         let call_arm = {
             let arm_params = if let (Some(const_arm), Some(_)) = (const_arm, link_arm) {
-                if matches!(fn_type, Fntype::Load) {
+                if !matches!(fn_type, Fntype::Normal) {
                     let subs = match type_sub_len(in_t[1]) {
                         1 => "b",
                         2 => "b.0, b.1",
@@ -1971,7 +2065,7 @@ fn gen_arm(
                         _ => "",
                     };
                     format!(
-                        "{}(a as *const i8, {}, {}, {})",
+                        "{}(a.cast(), {}, {}, {})",
                         current_fn,
                         subs,
                         constn.as_deref().unwrap(),
@@ -2008,13 +2102,27 @@ fn gen_arm(
                     _ => String::new(),
                 }
             } else if matches!(fn_type, Fntype::Store) {
+                let (cast, size) = if is_vstx(&name) {
+                    (
+                        ".cast()",
+                        format!(", {}", type_bits(&type_to_sub_type(in_t[1])) / 8),
+                    )
+                } else {
+                    ("", String::new())
+                };
                 match type_sub_len(in_t[1]) {
-                    1 => format!("{}(a, b)", current_fn),
-                    2 => format!("{}(a, b.0, b.1)", current_fn),
-                    3 => format!("{}(a, b.0, b.1, b.2)", current_fn),
-                    4 => format!("{}(a, b.0, b.1, b.2, b.3)", current_fn),
+                    1 => format!("{}(a{}, b{})", current_fn, cast, size),
+                    2 => format!("{}(a{}, b.0, b.1{})", current_fn, cast, size),
+                    3 => format!("{}(a{}, b.0, b.1, b.2{})", current_fn, cast, size),
+                    4 => format!("{}(a{}, b.0, b.1, b.2, b.3{})", current_fn, cast, size),
                     _ => String::new(),
                 }
+            } else if link_arm.is_some() && is_vldx(&name) {
+                format!(
+                    "{}(a as *const i8, {})",
+                    current_fn,
+                    type_bits(&type_to_sub_type(out_t)) / 8
+                )
             } else {
                 String::new()
             };
@@ -2028,7 +2136,7 @@ fn gen_arm(
         let call_aarch64 = {
             let aarch64_params =
                 if let (Some(const_aarch64), Some(_)) = (const_aarch64, link_aarch64) {
-                    if matches!(fn_type, Fntype::Load) {
+                    if !matches!(fn_type, Fntype::Normal) {
                         let subs = match type_sub_len(in_t[1]) {
                             1 => "b",
                             2 => "b.0, b.1",
@@ -2037,7 +2145,7 @@ fn gen_arm(
                             _ => "",
                         };
                         format!(
-                            "{}({}, {} as i64, a as *const i8)",
+                            "{}({}, {} as i64, a.cast())",
                             current_fn,
                             subs,
                             constn.as_deref().unwrap()
@@ -2056,13 +2164,16 @@ fn gen_arm(
                         _ => String::new(),
                     }
                 } else if matches!(fn_type, Fntype::Store) {
+                    let cast = if is_vstx(&name) { ".cast()" } else { "" };
                     match type_sub_len(in_t[1]) {
-                        1 => format!("{}(b, a)", current_fn),
-                        2 => format!("{}(b.0, b.1, a)", current_fn),
-                        3 => format!("{}(b.0, b.1, b.2, a)", current_fn),
-                        4 => format!("{}(b.0, b.1, b.2, b.3, a)", current_fn),
+                        1 => format!("{}(b, a{})", current_fn, cast),
+                        2 => format!("{}(b.0, b.1, a{})", current_fn, cast),
+                        3 => format!("{}(b.0, b.1, b.2, a{})", current_fn, cast),
+                        4 => format!("{}(b.0, b.1, b.2, b.3, a{})", current_fn, cast),
                         _ => String::new(),
                     }
+                } else if link_aarch64.is_some() && is_vldx(&name) {
+                    format!("{}(a.cast())", current_fn)
                 } else {
                     String::new()
                 };
@@ -2599,6 +2710,10 @@ fn get_call(
             fn_name.push_str(&type_to_suffix(&type_to_sub_type(&type_to_signed(
                 &String::from(out_t),
             ))));
+        } else if fn_format[1] == "in1signednox" {
+            fn_name.push_str(&type_to_suffix(&type_to_sub_type(&type_to_signed(
+                &String::from(in_t[1]),
+            ))));
         } else if fn_format[1] == "outsigneddupnox" {
             fn_name.push_str(&type_to_dup_suffix(&type_to_sub_type(&type_to_signed(
                 &String::from(out_t),
@@ -2607,6 +2722,10 @@ fn get_call(
             fn_name.push_str(&type_to_lane_suffix(&type_to_sub_type(&type_to_signed(
                 &String::from(out_t),
             ))));
+        } else if fn_format[1] == "in1signedlanenox" {
+            fn_name.push_str(&type_to_lane_suffix(&type_to_sub_type(&type_to_signed(
+                &String::from(in_t[1]),
+            ))));
         } else if fn_format[1] == "unsigned" {
             fn_name.push_str(type_to_suffix(type_to_unsigned(in_t[1])));
         } else if fn_format[1] == "doubleself" {
@@ -2672,6 +2791,8 @@ fn get_call(
             r#"let {}: {} = {}({});"#,
             re_name, re_type, fn_name, param_str
         )
+    } else if fn_name.starts_with("*") {
+        format!(r#"{} = {};"#, fn_name, param_str)
     } else {
         format!(r#"{}({})"#, fn_name, param_str)
     };
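
The two new suffix resolvers mirror their `out`-based counterparts but start from in_t[1]. A sketch of how `vst2-in1signedlanenox-::<LANE>` resolves for an unsigned lane store (hypothetical values, using the helpers above):

```rust
let in1 = "uint16x8x2_t";
// signed twin "int16x8x2_t" -> sub type "int16x8_t" -> lane suffix "q_lane_s16"
let sub = type_to_sub_type(&type_to_signed(&in1.to_string()));
let call = format!("vst2{}::<LANE>", type_to_lane_suffix(&sub));
assert_eq!(call, "vst2q_lane_s16::<LANE>");
```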
@@ -2827,10 +2948,14 @@ mod test {
             suffix = OutSuffix;
         } else if line.starts_with("out-nox") {
             suffix = OutNox;
+        } else if line.starts_with("in1-nox") {
+            suffix = In1Nox;
         } else if line.starts_with("out-dup-nox") {
             suffix = OutDupNox;
         } else if line.starts_with("out-lane-nox") {
             suffix = OutLaneNox;
+        } else if line.starts_with("in1-lane-nox") {
+            suffix = In1LaneNox;
         } else if line.starts_with("lane-suffixes") {
             suffix = Lane;
         } else if line.starts_with("in2-suffix") {
diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs
index 15ff9595af..a62bddbad4 100644
--- a/crates/stdarch-test/src/lib.rs
+++ b/crates/stdarch-test/src/lib.rs
@@ -138,6 +138,10 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) {
                 // removed once it has been addressed in LLVM.
                 "fcvtzu" | "fcvtzs" | "vcvt" => 64,
 
+                // core_arch/src/arm_shared/simd32
+                // vst1q_p64_x4_nop : #instructions = 33 >= 22 (limit)
+                "nop" if fnname.contains("vst1q_p64") => 34,
+
                 // Original limit was 20 instructions, but ARM DSP Intrinsics
                 // are exactly 20 instructions long. So, bump the limit to 22
                 // instead of adding here a long list of exceptions.