diff --git a/cranelift/codegen/meta/src/gen_inst.rs b/cranelift/codegen/meta/src/gen_inst.rs
index 97829f666219..82e80c722fc8 100644
--- a/cranelift/codegen/meta/src/gen_inst.rs
+++ b/cranelift/codegen/meta/src/gen_inst.rs
@@ -1233,6 +1233,9 @@ fn gen_builder(

         There is also a method per instruction format. These methods all
         return an `Inst`.
+
+        When an address for a load or store is specified, its integer
+        width is required to be equal to the platform's pointer width.
     "#,
     );
     fmt.line("pub trait InstBuilder<'f>: InstBuilderBase<'f> {");
diff --git a/cranelift/codegen/src/verifier/mod.rs b/cranelift/codegen/src/verifier/mod.rs
index 33d9132f8e68..ac95cf0cc2bb 100644
--- a/cranelift/codegen/src/verifier/mod.rs
+++ b/cranelift/codegen/src/verifier/mod.rs
@@ -663,6 +663,40 @@ impl<'a> Verifier<'a> {
             } => {
                 self.verify_bitcast(inst, flags, arg, errors)?;
             }
+            LoadNoOffset { opcode, arg, .. } if opcode.can_load() => {
+                self.verify_is_address(inst, arg, errors)?;
+            }
+            Load { opcode, arg, .. } if opcode.can_load() => {
+                self.verify_is_address(inst, arg, errors)?;
+            }
+            AtomicCas {
+                opcode,
+                args: [p, _, _],
+                ..
+            } if opcode.can_load() || opcode.can_store() => {
+                self.verify_is_address(inst, p, errors)?;
+            }
+            AtomicRmw {
+                opcode,
+                args: [p, _],
+                ..
+            } if opcode.can_load() || opcode.can_store() => {
+                self.verify_is_address(inst, p, errors)?;
+            }
+            Store {
+                opcode,
+                args: [_, p],
+                ..
+            } if opcode.can_store() => {
+                self.verify_is_address(inst, p, errors)?;
+            }
+            StoreNoOffset {
+                opcode,
+                args: [_, p],
+                ..
+            } if opcode.can_store() => {
+                self.verify_is_address(inst, p, errors)?;
+            }
             UnaryConst {
                 opcode: opcode @ (Opcode::Vconst | Opcode::F128const),
                 constant_handle,
@@ -1046,6 +1080,31 @@ impl<'a> Verifier<'a> {
         }
     }

+    fn verify_is_address(
+        &self,
+        loc_inst: Inst,
+        v: Value,
+        errors: &mut VerifierErrors,
+    ) -> VerifierStepResult {
+        if let Some(isa) = self.isa {
+            let pointer_width = isa.triple().pointer_width()?;
+            let value_type = self.func.dfg.value_type(v);
+            let expected_width = pointer_width.bits() as u32;
+            let value_width = value_type.bits();
+            if expected_width != value_width {
+                errors.nonfatal((
+                    loc_inst,
+                    self.context(loc_inst),
+                    format!("invalid pointer width (got {value_width}, expected {expected_width}) encountered {v}"),
+                ))
+            } else {
+                Ok(())
+            }
+        } else {
+            Ok(())
+        }
+    }
+
     fn domtree_integrity(
         &self,
         domtree: &DominatorTree,
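The verifier hook above reduces to a single width comparison, so the rule is easy to state in isolation. Here is a minimal, dependency-free sketch of that rule as a reviewer's aid; `check_address_width` and the `main` driver are illustrative names, not code from this patch:

```rust
/// Mirrors the check in `verify_is_address`: the address operand of a
/// load, store, or atomic instruction is well-formed only if its integer
/// width equals the target's pointer width.
fn check_address_width(value_width: u32, pointer_width_bits: u32) -> Result<(), String> {
    if value_width != pointer_width_bits {
        // Same shape as the verifier message asserted by the tests below.
        Err(format!(
            "invalid pointer width (got {value_width}, expected {pointer_width_bits})"
        ))
    } else {
        Ok(())
    }
}

fn main() {
    // On a 32-bit target such as pulley32, an i64 address is rejected...
    assert!(check_address_width(64, 32).is_err());
    // ...while an i32 address is accepted.
    assert!(check_address_width(32, 32).is_ok());
}
```

Note that the patch reports a mismatch via `errors.nonfatal(...)` rather than a fatal error, so verification continues and every offending address in a function is reported in one pass; that is what lets the pointer_width_32.clif and pointer_width_64.clif tests below annotate several `; error:` lines per function. The remaining hunks split the affected runtests so that each file only takes addresses of the width its targets expect.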
diff --git a/cranelift/filetests/filetests/runtests/fdemote.clif b/cranelift/filetests/filetests/runtests/fdemote.clif
index 74bc4c9cb03a..916d43872364 100644
--- a/cranelift/filetests/filetests/runtests/fdemote.clif
+++ b/cranelift/filetests/filetests/runtests/fdemote.clif
@@ -73,21 +73,3 @@ block0(v0: f64):
 ; run: %fdemote_is_nan(-sNaN:0x1) == 1
 ; run: %fdemote_is_nan(+sNaN:0x4000000000001) == 1
 ; run: %fdemote_is_nan(-sNaN:0x4000000000001) == 1
-
-
-;; Tests a fdemote+load combo which some backends may optimize
-function %fdemote_load(i64, f64) -> f32 {
-    ss0 = explicit_slot 16
-
-block0(v1: i64, v2: f64):
-    v3 = stack_addr.i64 ss0
-    store.f64 v2, v3
-    v4 = load.f64 v3
-    v5 = fdemote.f32 v4
-    return v5
-}
-; run: %fdemote_load(0, 0x0.0) == 0x0.0
-; run: %fdemote_load(1, 0x0.1) == 0x0.1
-; run: %fdemote_load(2, 0x0.2) == 0x0.2
-; run: %fdemote_load(3, 0x3.2) == 0x3.2
-; run: %fdemote_load(0x8, 0x3.2) == 0x3.2
diff --git a/cranelift/filetests/filetests/runtests/fdemote_32.clif b/cranelift/filetests/filetests/runtests/fdemote_32.clif
new file mode 100644
index 000000000000..497bb9cc2b1f
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/fdemote_32.clif
@@ -0,0 +1,21 @@
+test interpret
+test run
+target pulley32
+target pulley32be
+
+;; Tests a fdemote+load combo which some backends may optimize
+function %fdemote_load(i32, f64) -> f32 {
+    ss0 = explicit_slot 16
+
+block0(v1: i32, v2: f64):
+    v3 = stack_addr.i32 ss0
+    store.f64 v2, v3
+    v4 = load.f64 v3
+    v5 = fdemote.f32 v4
+    return v5
+}
+; run: %fdemote_load(0, 0x0.0) == 0x0.0
+; run: %fdemote_load(1, 0x0.1) == 0x0.1
+; run: %fdemote_load(2, 0x0.2) == 0x0.2
+; run: %fdemote_load(3, 0x3.2) == 0x3.2
+; run: %fdemote_load(0x8, 0x3.2) == 0x3.2
diff --git a/cranelift/filetests/filetests/runtests/fdemote_64.clif b/cranelift/filetests/filetests/runtests/fdemote_64.clif
new file mode 100644
index 000000000000..37710bd149ac
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/fdemote_64.clif
@@ -0,0 +1,26 @@
+test interpret
+test run
+target x86_64
+target x86_64 has_avx
+target s390x
+target aarch64
+target riscv64
+target pulley64
+target pulley64be
+
+;; Tests a fdemote+load combo which some backends may optimize
+function %fdemote_load(i64, f64) -> f32 {
+    ss0 = explicit_slot 16
+
+block0(v1: i64, v2: f64):
+    v3 = stack_addr.i64 ss0
+    store.f64 v2, v3
+    v4 = load.f64 v3
+    v5 = fdemote.f32 v4
+    return v5
+}
+; run: %fdemote_load(0, 0x0.0) == 0x0.0
+; run: %fdemote_load(1, 0x0.1) == 0x0.1
+; run: %fdemote_load(2, 0x0.2) == 0x0.2
+; run: %fdemote_load(3, 0x3.2) == 0x3.2
+; run: %fdemote_load(0x8, 0x3.2) == 0x3.2
diff --git a/cranelift/filetests/filetests/runtests/fpromote.clif b/cranelift/filetests/filetests/runtests/fpromote.clif
index 37ba3970e8cb..7c13fcf4e919 100644
--- a/cranelift/filetests/filetests/runtests/fpromote.clif
+++ b/cranelift/filetests/filetests/runtests/fpromote.clif
@@ -83,20 +83,3 @@ block0(v0: f32):
 ; run: %fpromote_is_nan(+sNaN:0x200001) == 1
 ; run: %fpromote_is_nan(-sNaN:0x200001) == 1

-
-;; Tests a fpromote+load combo which some backends may optimize
-function %fpromote_load(i64, f32) -> f64 {
-    ss0 = explicit_slot 16
-
-block0(v1: i64, v2: f32):
-    v3 = stack_addr.i64 ss0
-    store.f32 v2, v3
-    v4 = load.f32 v3
-    v5 = fpromote.f64 v4
-    return v5
-}
-; run: %fpromote_load(0, 0x0.0) == 0x0.0
-; run: %fpromote_load(1, 0x0.1) == 0x0.1
-; run: %fpromote_load(2, 0x0.2) == 0x0.2
-; run: %fpromote_load(3, 0x3.2) == 0x3.2
-; run: %fpromote_load(0xC, 0x3.2) == 0x3.2
diff --git a/cranelift/filetests/filetests/runtests/fpromote_32.clif b/cranelift/filetests/filetests/runtests/fpromote_32.clif
new file mode 100644
index 000000000000..f5e3dcb6fbbd
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/fpromote_32.clif
@@ -0,0 +1,21 @@
+test interpret
+test run
+target pulley32
+target pulley32be
+
+;; Tests a fpromote+load combo which some backends may optimize
+function %fpromote_load(i64, f32) -> f64 {
+    ss0 = explicit_slot 16
+
+block0(v1: i64, v2: f32):
+    v3 = stack_addr.i32 ss0
+    store.f32 v2, v3
+    v4 = load.f32 v3
+    v5 = fpromote.f64 v4
+    return v5
+}
+; run: %fpromote_load(0, 0x0.0) == 0x0.0
+; run: %fpromote_load(1, 0x0.1) == 0x0.1
+; run: %fpromote_load(2, 0x0.2) == 0x0.2
+; run: %fpromote_load(3, 0x3.2) == 0x3.2
+; run: %fpromote_load(0xC, 0x3.2) == 0x3.2
diff --git a/cranelift/filetests/filetests/runtests/fpromote_64.clif b/cranelift/filetests/filetests/runtests/fpromote_64.clif
new file mode 100644
index 000000000000..6b0b3daab294
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/fpromote_64.clif
@@ -0,0 +1,27 @@
+test interpret
+test run
+target x86_64
+target x86_64 has_avx
+target s390x
+target aarch64
+target riscv64
+target riscv64 has_c has_zcb
+target pulley64
+target pulley64be
+
+;; Tests a fpromote+load combo which some backends may optimize
+function %fpromote_load(i64, f32) -> f64 {
+    ss0 = explicit_slot 16
+
+block0(v1: i64, v2: f32):
+    v3 = stack_addr.i64 ss0
+    store.f32 v2, v3
+    v4 = load.f32 v3
+    v5 = fpromote.f64 v4
+    return v5
+}
+; run: %fpromote_load(0, 0x0.0) == 0x0.0
+; run: %fpromote_load(1, 0x0.1) == 0x0.1
+; run: %fpromote_load(2, 0x0.2) == 0x0.2
+; run: %fpromote_load(3, 0x3.2) == 0x3.2
+; run: %fpromote_load(0xC, 0x3.2) == 0x3.2
diff --git a/cranelift/filetests/filetests/runtests/simd-extractlane.clif b/cranelift/filetests/filetests/runtests/simd-extractlane.clif
index 0d35960ac752..ce90b6897a67 100644
--- a/cranelift/filetests/filetests/runtests/simd-extractlane.clif
+++ b/cranelift/filetests/filetests/runtests/simd-extractlane.clif
@@ -43,72 +43,6 @@ block0(v0: i64x2):
 }
 ; run: %extractlane_1([0 4294967297]) == 4294967297

-function %extractlane_i8x16_through_stack(i8x16) -> i8 {
-    ss0 = explicit_slot 8
-block0(v0: i8x16):
-    v2 = stack_addr.i64 ss0
-    v3 = extractlane v0, 1
-    store v3, v2
-    v4 = load.i8 v2
-    return v4
-}
-; run: %extractlane_i8x16_through_stack([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == 2
-
-function %extractlane_i16x8_through_stack(i16x8) -> i16 {
-    ss0 = explicit_slot 8
-block0(v0: i16x8):
-    v2 = stack_addr.i64 ss0
-    v3 = extractlane v0, 2
-    store v3, v2
-    v4 = load.i16 v2
-    return v4
-}
-; run: %extractlane_i16x8_through_stack([1 2 3 4 5 6 7 8]) == 3
-
-function %extractlane_i32x4_through_stack(i32x4) -> i32 {
-    ss0 = explicit_slot 8
-block0(v0: i32x4):
-    v2 = stack_addr.i64 ss0
-    v3 = extractlane v0, 3
-    store v3, v2
-    v4 = load.i32 v2
-    return v4
-}
-; run: %extractlane_i32x4_through_stack([1 2 3 4]) == 4
-
-function %extractlane_i64x2_through_stack(i64x2) -> i64 {
-    ss0 = explicit_slot 8
-block0(v0: i64x2):
-    v2 = stack_addr.i64 ss0
-    v3 = extractlane v0, 0
-    store v3, v2
-    v4 = load.i64 v2
-    return v4
-}
-; run: %extractlane_i64x2_through_stack([1 2]) == 1
-
-function %extractlane_f32x4_through_stack(f32x4) -> f32 {
-    ss0 = explicit_slot 8
-block0(v0: f32x4):
-    v2 = stack_addr.i64 ss0
-    v3 = extractlane v0, 3
-    store v3, v2
-    v4 = load.f32 v2
-    return v4
-}
-; run: %extractlane_f32x4_through_stack([0x1.0 0x2.0 0x3.0 0x4.0]) == 0x4.0
-
-function %extractlane_f64x2_through_stack(f64x2) -> f64 {
-    ss0 = explicit_slot 8
-block0(v0: f64x2):
-    v2 = stack_addr.i64 ss0
-    v3 = extractlane v0, 0
-    store v3, v2
-    v4 = load.f64 v2
-    return v4
-}
-; run: %extractlane_f64x2_through_stack([0x1.0 0x2.0]) == 0x1.0
-
 function %unaligned_extractlane() -> f64 {
     ss0 = explicit_slot 24
diff --git a/cranelift/filetests/filetests/runtests/simd-extractlane_32.clif b/cranelift/filetests/filetests/runtests/simd-extractlane_32.clif
new file mode 100644
index 000000000000..32c7177781a5
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/simd-extractlane_32.clif
@@ -0,0 +1,70 @@
+test interpret
+test run
+target pulley32
+target pulley32be
+
+function %extractlane_i8x16_through_stack(i8x16) -> i8 {
+    ss0 = explicit_slot 8
+block0(v0: i8x16):
+    v2 = stack_addr.i32 ss0
+    v3 = extractlane v0, 1
+    store v3, v2
+    v4 = load.i8 v2
+    return v4
+}
+; run: %extractlane_i8x16_through_stack([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == 2
+
+function %extractlane_i16x8_through_stack(i16x8) -> i16 {
+    ss0 = explicit_slot 8
+block0(v0: i16x8):
+    v2 = stack_addr.i32 ss0
+    v3 = extractlane v0, 2
+    store v3, v2
+    v4 = load.i16 v2
+    return v4
+}
+; run: %extractlane_i16x8_through_stack([1 2 3 4 5 6 7 8]) == 3
+
+function %extractlane_i32x4_through_stack(i32x4) -> i32 {
+    ss0 = explicit_slot 8
+block0(v0: i32x4):
+    v2 = stack_addr.i32 ss0
+    v3 = extractlane v0, 3
+    store v3, v2
+    v4 = load.i32 v2
+    return v4
+}
+; run: %extractlane_i32x4_through_stack([1 2 3 4]) == 4
+
+function %extractlane_i64x2_through_stack(i64x2) -> i64 {
+    ss0 = explicit_slot 8
+block0(v0: i64x2):
+    v2 = stack_addr.i32 ss0
+    v3 = extractlane v0, 0
+    store v3, v2
+    v4 = load.i64 v2
+    return v4
+}
+; run: %extractlane_i64x2_through_stack([1 2]) == 1
+
+function %extractlane_f32x4_through_stack(f32x4) -> f32 {
+    ss0 = explicit_slot 8
+block0(v0: f32x4):
+    v2 = stack_addr.i32 ss0
+    v3 = extractlane v0, 3
+    store v3, v2
+    v4 = load.f32 v2
+    return v4
+}
+; run: %extractlane_f32x4_through_stack([0x1.0 0x2.0 0x3.0 0x4.0]) == 0x4.0
+
+function %extractlane_f64x2_through_stack(f64x2) -> f64 {
+    ss0 = explicit_slot 8
+block0(v0: f64x2):
+    v2 = stack_addr.i32 ss0
+    v3 = extractlane v0, 0
+    store v3, v2
+    v4 = load.f64 v2
+    return v4
+}
+; run: %extractlane_f64x2_through_stack([0x1.0 0x2.0]) == 0x1.0
diff --git a/cranelift/filetests/filetests/runtests/simd-extractlane_64.clif b/cranelift/filetests/filetests/runtests/simd-extractlane_64.clif
new file mode 100644
index 000000000000..4a7e542c430c
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/simd-extractlane_64.clif
@@ -0,0 +1,79 @@
+test interpret
+test run
+target aarch64
+target s390x
+target x86_64
+target x86_64 sse41
+target x86_64 sse42
+target x86_64 sse42 has_avx
+set enable_multi_ret_implicit_sret
+target riscv64 has_v
+target riscv64 has_v has_c has_zcb
+target pulley64
+target pulley64be
+
+function %extractlane_i8x16_through_stack(i8x16) -> i8 {
+    ss0 = explicit_slot 8
+block0(v0: i8x16):
+    v2 = stack_addr.i64 ss0
+    v3 = extractlane v0, 1
+    store v3, v2
+    v4 = load.i8 v2
+    return v4
+}
+; run: %extractlane_i8x16_through_stack([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == 2
+
+function %extractlane_i16x8_through_stack(i16x8) -> i16 {
+    ss0 = explicit_slot 8
+block0(v0: i16x8):
+    v2 = stack_addr.i64 ss0
+    v3 = extractlane v0, 2
+    store v3, v2
+    v4 = load.i16 v2
+    return v4
+}
+; run: %extractlane_i16x8_through_stack([1 2 3 4 5 6 7 8]) == 3
+
+function %extractlane_i32x4_through_stack(i32x4) -> i32 {
+    ss0 = explicit_slot 8
+block0(v0: i32x4):
+    v2 = stack_addr.i64 ss0
+    v3 = extractlane v0, 3
+    store v3, v2
+    v4 = load.i32 v2
+    return v4
+}
+; run: %extractlane_i32x4_through_stack([1 2 3 4]) == 4
+
+function %extractlane_i64x2_through_stack(i64x2) -> i64 {
+    ss0 = explicit_slot 8
+block0(v0: i64x2):
+    v2 = stack_addr.i64 ss0
+    v3 = extractlane v0, 0
+    store v3, v2
+    v4 = load.i64 v2
+    return v4
+}
+; run: %extractlane_i64x2_through_stack([1 2]) == 1
+
+function %extractlane_f32x4_through_stack(f32x4) -> f32 {
+    ss0 = explicit_slot 8
+block0(v0: f32x4):
+    v2 = stack_addr.i64 ss0
+    v3 = extractlane v0, 3
+    store v3, v2
+    v4 = load.f32 v2
+    return v4
+}
+; run: %extractlane_f32x4_through_stack([0x1.0 0x2.0 0x3.0 0x4.0]) == 0x4.0
+
+function %extractlane_f64x2_through_stack(f64x2) -> f64 {
+    ss0 = explicit_slot 8
+block0(v0: f64x2):
+    v2 = stack_addr.i64 ss0
+    v3 = extractlane v0, 0
+    store v3, v2
+    v4 = load.f64 v2
+    return v4
+}
+; run: %extractlane_f64x2_through_stack([0x1.0 0x2.0]) == 0x1.0
diff --git a/cranelift/filetests/filetests/runtests/simd-insertlane.clif b/cranelift/filetests/filetests/runtests/simd-insertlane.clif
index 7792f735f224..304f5e74f94d 100644
--- a/cranelift/filetests/filetests/runtests/simd-insertlane.clif
+++ b/cranelift/filetests/filetests/runtests/simd-insertlane.clif
@@ -84,95 +84,6 @@ block0(v0: f64x2, v1: f64):
 }
 ; run: %insertlane_1_in_f64x2([0x1.0 0x2.0], 0x3.0) == [0x1.0 0x3.0]

-function %insertlane_i8x16_through_stack(i8x16, i8) -> i8x16 {
-    ss0 = explicit_slot 8
-block0(v0: i8x16, v1: i8):
-    v2 = stack_addr.i64 ss0
-    store v1, v2
-    v3 = load.i8 v2
-    v4 = insertlane v0, v3, 1
-    return v4
-}
-; run: %insertlane_i8x16_through_stack([1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], 2) == [1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
-
-function %insertlane_i16x8_through_stack(i16x8, i16) -> i16x8 {
-    ss0 = explicit_slot 8
-block0(v0: i16x8, v1: i16):
-    v2 = stack_addr.i64 ss0
-    store v1, v2
-    v3 = load.i16 v2
-    v4 = insertlane v0, v3, 2
-    return v4
-}
-; run: %insertlane_i16x8_through_stack([1 1 1 1 1 1 1 1], 2) == [1 1 2 1 1 1 1 1]
-
-function %insertlane_i32x4_through_stack(i32x4, i32) -> i32x4 {
-    ss0 = explicit_slot 8
-block0(v0: i32x4, v1: i32):
-    v2 = stack_addr.i64 ss0
-    store v1, v2
-    v3 = load.i32 v2
-    v4 = insertlane v0, v3, 3
-    return v4
-}
-; run: %insertlane_i32x4_through_stack([1 1 1 1], 2) == [1 1 1 2]
-
-function %insertlane_i64x2_through_stack(i64x2, i64) -> i64x2 {
-    ss0 = explicit_slot 8
-block0(v0: i64x2, v1: i64):
-    v2 = stack_addr.i64 ss0
-    store v1, v2
-    v3 = load.i64 v2
-    v4 = insertlane v0, v3, 0
-    return v4
-}
-; run: %insertlane_i64x2_through_stack([1 1], 2) == [2 1]
-
-function %insertlane_f32x4_through_stack(f32x4, f32) -> f32x4 {
-    ss0 = explicit_slot 8
-block0(v0: f32x4, v1: f32):
-    v2 = stack_addr.i64 ss0
-    store v1, v2
-    v3 = load.f32 v2
-    v4 = insertlane v0, v3, 3
-    return v4
-}
-; run: %insertlane_f32x4_through_stack([0x1.0 0x1.0 0x1.0 0x1.0], 0x2.0) == [0x1.0 0x1.0 0x1.0 0x2.0]
-
-function %insertlane_f32x4_through_stack2(f32x4, f32) -> f32x4 {
-    ss0 = explicit_slot 8
-block0(v0: f32x4, v1: f32):
-    v2 = stack_addr.i64 ss0
-    store v1, v2
-    v3 = load.f32 v2
-    v4 = insertlane v0, v3, 0
-    return v4
-}
-; run: %insertlane_f32x4_through_stack2([0x1.0 0x1.0 0x1.0 0x1.0], 0x2.0) == [0x2.0 0x1.0 0x1.0 0x1.0]
-
-function %insertlane_f64x2_through_stack(f64x2, f64) -> f64x2 {
-    ss0 = explicit_slot 8
-block0(v0: f64x2, v1: f64):
-    v2 = stack_addr.i64 ss0
-    store v1, v2
-    v3 = load.f64 v2
-    v4 = insertlane v0, v3, 0
-    return v4
-}
-; run: %insertlane_f64x2_through_stack([0x1.0 0x1.0], 0x2.0) == [0x2.0 0x1.0]
-
-function %insertlane_f64x2_through_stack2(f64x2, f64) -> f64x2 {
-    ss0 = explicit_slot 8
-block0(v0: f64x2, v1: f64):
-    v2 = stack_addr.i64 ss0
-    store v1, v2
-    v3 = load.f64 v2
-    v4 = insertlane v0, v3, 1
-    return v4
-}
-; run: %insertlane_f64x2_through_stack2([0x1.0 0x1.0], 0x2.0) == [0x1.0 0x2.0]
-
-
 function %insertlane_const_15(i8x16) -> i8x16 {
 block0(v0: i8x16):
     v1 = iconst.i8 10
diff --git a/cranelift/filetests/filetests/runtests/simd-insertlane_32.clif b/cranelift/filetests/filetests/runtests/simd-insertlane_32.clif
new file mode 100644
index 000000000000..e54a00784cdb
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/simd-insertlane_32.clif
@@ -0,0 +1,93 @@
+test interpret
+test run
+target pulley32
+target pulley32be
+
+function %insertlane_i8x16_through_stack(i8x16, i8) -> i8x16 {
+    ss0 = explicit_slot 8
+block0(v0: i8x16, v1: i8):
+    v2 = stack_addr.i32 ss0
+    store v1, v2
+    v3 = load.i8 v2
+    v4 = insertlane v0, v3, 1
+    return v4
+}
+; run: %insertlane_i8x16_through_stack([1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], 2) == [1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
+
+function %insertlane_i16x8_through_stack(i16x8, i16) -> i16x8 {
+    ss0 = explicit_slot 8
+block0(v0: i16x8, v1: i16):
+    v2 = stack_addr.i32 ss0
+    store v1, v2
+    v3 = load.i16 v2
+    v4 = insertlane v0, v3, 2
+    return v4
+}
+; run: %insertlane_i16x8_through_stack([1 1 1 1 1 1 1 1], 2) == [1 1 2 1 1 1 1 1]
+
+function %insertlane_i32x4_through_stack(i32x4, i32) -> i32x4 {
+    ss0 = explicit_slot 8
+block0(v0: i32x4, v1: i32):
+    v2 = stack_addr.i32 ss0
+    store v1, v2
+    v3 = load.i32 v2
+    v4 = insertlane v0, v3, 3
+    return v4
+}
+; run: %insertlane_i32x4_through_stack([1 1 1 1], 2) == [1 1 1 2]
+
+function %insertlane_i64x2_through_stack(i64x2, i64) -> i64x2 {
+    ss0 = explicit_slot 8
+block0(v0: i64x2, v1: i64):
+    v2 = stack_addr.i32 ss0
+    store v1, v2
+    v3 = load.i64 v2
+    v4 = insertlane v0, v3, 0
+    return v4
+}
+; run: %insertlane_i64x2_through_stack([1 1], 2) == [2 1]
+
+function %insertlane_f32x4_through_stack(f32x4, f32) -> f32x4 {
+    ss0 = explicit_slot 8
+block0(v0: f32x4, v1: f32):
+    v2 = stack_addr.i32 ss0
+    store v1, v2
+    v3 = load.f32 v2
+    v4 = insertlane v0, v3, 3
+    return v4
+}
+; run: %insertlane_f32x4_through_stack([0x1.0 0x1.0 0x1.0 0x1.0], 0x2.0) == [0x1.0 0x1.0 0x1.0 0x2.0]
+
+function %insertlane_f32x4_through_stack2(f32x4, f32) -> f32x4 {
+    ss0 = explicit_slot 8
+block0(v0: f32x4, v1: f32):
+    v2 = stack_addr.i32 ss0
+    store v1, v2
+    v3 = load.f32 v2
+    v4 = insertlane v0, v3, 0
+    return v4
+}
+; run: %insertlane_f32x4_through_stack2([0x1.0 0x1.0 0x1.0 0x1.0], 0x2.0) == [0x2.0 0x1.0 0x1.0 0x1.0]
+
+function %insertlane_f64x2_through_stack(f64x2, f64) -> f64x2 {
+    ss0 = explicit_slot 8
+block0(v0: f64x2, v1: f64):
+    v2 = stack_addr.i32 ss0
+    store v1, v2
+    v3 = load.f64 v2
+    v4 = insertlane v0, v3, 0
+    return v4
+}
+; run: %insertlane_f64x2_through_stack([0x1.0 0x1.0], 0x2.0) == [0x2.0 0x1.0]
+
+function %insertlane_f64x2_through_stack2(f64x2, f64) -> f64x2 {
+    ss0 = explicit_slot 8
+block0(v0: f64x2, v1: f64):
+    v2 = stack_addr.i32 ss0
+    store v1, v2
+    v3 = load.f64 v2
+    v4 = insertlane v0, v3, 1
+    return v4
+}
+; run: %insertlane_f64x2_through_stack2([0x1.0 0x1.0], 0x2.0) == [0x1.0 0x2.0]
+
diff --git a/cranelift/filetests/filetests/runtests/simd-insertlane_64.clif b/cranelift/filetests/filetests/runtests/simd-insertlane_64.clif
new file mode 100644
index 000000000000..f299a015b9db
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/simd-insertlane_64.clif
@@ -0,0 +1,102 @@
+test interpret
+test run
+target aarch64
+target s390x
+target x86_64
+target x86_64 sse41
+target x86_64 sse42
+target x86_64 sse42 has_avx
+set enable_multi_ret_implicit_sret
+target riscv64 has_v
+target riscv64 has_v has_c has_zcb
+target pulley64
+target pulley64be
+
+function %insertlane_i8x16_through_stack(i8x16, i8) -> i8x16 {
+    ss0 = explicit_slot 8
+block0(v0: i8x16, v1: i8):
+    v2 = stack_addr.i64 ss0
+    store v1, v2
+    v3 = load.i8 v2
+    v4 = insertlane v0, v3, 1
+    return v4
+}
+; run: %insertlane_i8x16_through_stack([1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], 2) == [1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
+
+function %insertlane_i16x8_through_stack(i16x8, i16) -> i16x8 {
+    ss0 = explicit_slot 8
+block0(v0: i16x8, v1: i16):
+    v2 = stack_addr.i64 ss0
+    store v1, v2
+    v3 = load.i16 v2
+    v4 = insertlane v0, v3, 2
+    return v4
+}
+; run: %insertlane_i16x8_through_stack([1 1 1 1 1 1 1 1], 2) == [1 1 2 1 1 1 1 1]
+
+function %insertlane_i32x4_through_stack(i32x4, i32) -> i32x4 {
+    ss0 = explicit_slot 8
+block0(v0: i32x4, v1: i32):
+    v2 = stack_addr.i64 ss0
+    store v1, v2
+    v3 = load.i32 v2
+    v4 = insertlane v0, v3, 3
+    return v4
+}
+; run: %insertlane_i32x4_through_stack([1 1 1 1], 2) == [1 1 1 2]
+
+function %insertlane_i64x2_through_stack(i64x2, i64) -> i64x2 {
+    ss0 = explicit_slot 8
+block0(v0: i64x2, v1: i64):
+    v2 = stack_addr.i64 ss0
+    store v1, v2
+    v3 = load.i64 v2
+    v4 = insertlane v0, v3, 0
+    return v4
+}
+; run: %insertlane_i64x2_through_stack([1 1], 2) == [2 1]
+
+function %insertlane_f32x4_through_stack(f32x4, f32) -> f32x4 {
+    ss0 = explicit_slot 8
+block0(v0: f32x4, v1: f32):
+    v2 = stack_addr.i64 ss0
+    store v1, v2
+    v3 = load.f32 v2
+    v4 = insertlane v0, v3, 3
+    return v4
+}
+; run: %insertlane_f32x4_through_stack([0x1.0 0x1.0 0x1.0 0x1.0], 0x2.0) == [0x1.0 0x1.0 0x1.0 0x2.0]
+
+function %insertlane_f32x4_through_stack2(f32x4, f32) -> f32x4 {
+    ss0 = explicit_slot 8
+block0(v0: f32x4, v1: f32):
+    v2 = stack_addr.i64 ss0
+    store v1, v2
+    v3 = load.f32 v2
+    v4 = insertlane v0, v3, 0
+    return v4
+}
+; run: %insertlane_f32x4_through_stack2([0x1.0 0x1.0 0x1.0 0x1.0], 0x2.0) == [0x2.0 0x1.0 0x1.0 0x1.0]
+
+function %insertlane_f64x2_through_stack(f64x2, f64) -> f64x2 {
+    ss0 = explicit_slot 8
+block0(v0: f64x2, v1: f64):
+    v2 = stack_addr.i64 ss0
+    store v1, v2
+    v3 = load.f64 v2
+    v4 = insertlane v0, v3, 0
+    return v4
+}
+; run: %insertlane_f64x2_through_stack([0x1.0 0x1.0], 0x2.0) == [0x2.0 0x1.0]
+
+function %insertlane_f64x2_through_stack2(f64x2, f64) -> f64x2 {
+    ss0 = explicit_slot 8
+block0(v0: f64x2, v1: f64):
+    v2 = stack_addr.i64 ss0
+    store v1, v2
+    v3 = load.f64 v2
+    v4 = insertlane v0, v3, 1
+    return v4
+}
+; run: %insertlane_f64x2_through_stack2([0x1.0 0x1.0], 0x2.0) == [0x1.0 0x2.0]
+
diff --git a/cranelift/filetests/filetests/verifier/pointer_width_32.clif b/cranelift/filetests/filetests/verifier/pointer_width_32.clif
new file mode 100644
index 000000000000..251fda4bb503
--- /dev/null
+++ b/cranelift/filetests/filetests/verifier/pointer_width_32.clif
@@ -0,0 +1,61 @@
+test verifier
+target pulley32
+
+function %error_i8_load_store_i32(i64) -> i8 {
+block0(v0: i64):
+    v1 = load.i8 v0 ; error: invalid pointer width (got 64, expected 32) encountered v0
+    store.i8 v1, v0 ; error: invalid pointer width (got 64, expected 32) encountered v0
+    return v1
+}
+
+function %error_i8_load_offset_i32(i64) -> i8 {
+block0(v0: i64):
+    v1 = load.i8 v0+16 ; error: invalid pointer width (got 64, expected 32) encountered v0
+    store.i8 v1, v0+16 ; error: invalid pointer width (got 64, expected 32) encountered v0
+    return v1
+}
+
+function %error_i64_atomic_store_load(i64) -> i64 {
+    ss0 = explicit_slot 8
+
+block0(v0: i64):
+    v1 = stack_addr.i64 ss0
+    atomic_store.i64 v0, v1; error: invalid pointer width (got 64, expected 32) encountered v1
+    v2 = atomic_load.i64 v1; error: invalid pointer width (got 64, expected 32) encountered v1
+    return v2
+}
+
+function %error_atomic_cas(i128, i128, i128) -> i128, i128 {
+    ss0 = explicit_slot 16
+
+block0(v0: i128, v1: i128, v2: i128):
+    stack_store.i128 v0, ss0
+    v3 = stack_addr.i64 ss0
+    v4 = atomic_cas.i128 v3, v1, v2; error: invalid pointer width (got 64, expected 32) encountered v3
+    v5 = stack_load.i128 ss0
+    return v5, v4
+}
+
+function %error_atomic_rmw_add_i64(i64, i64) -> i64, i64 {
+    ss0 = explicit_slot 8
+
+block0(v0: i64, v1: i64):
+    v2 = stack_addr.i64 ss0
+    store.i64 little v0, v2 ; error: invalid pointer width (got 64, expected 32) encountered v2
+
+    v3 = atomic_rmw.i64 little add v2, v1; error: invalid pointer width (got 64, expected 32) encountered v2
+
+    v4 = load.i64 little v2 ; error: invalid pointer width (got 64, expected 32) encountered v2
+    return v3, v4
+}
+
+function %error_fmsub_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
+    ss0 = explicit_slot 16
+block0(v0: f32x4, v1: f32x4, v2: f32x4):
+    v3 = stack_addr.i64 ss0
+    store.f32x4 v0, v3 ; error: invalid pointer width (got 64, expected 32) encountered v3
+    v4 = load.f32x4 v3 ; error: invalid pointer width (got 64, expected 32) encountered v3
+    v5 = fneg v2
+    v6 = fma v4, v1, v5
+    return v6
+}
diff --git a/cranelift/filetests/filetests/verifier/pointer_width_64.clif b/cranelift/filetests/filetests/verifier/pointer_width_64.clif
new file mode 100644
index 000000000000..e0a9587f4e4c
--- /dev/null
+++ b/cranelift/filetests/filetests/verifier/pointer_width_64.clif
@@ -0,0 +1,61 @@
+test verifier
+target pulley64
+
+function %error_i8_load_store_i32(i32) -> i8 {
+block0(v0: i32):
+    v1 = load.i8 v0 ; error: invalid pointer width (got 32, expected 64) encountered v0
+    store.i8 v1, v0 ; error: invalid pointer width (got 32, expected 64) encountered v0
+    return v1
+}
+
+function %error_i8_load_offset_i32(i32) -> i8 {
+block0(v0: i32):
+    v1 = load.i8 v0+16 ; error: invalid pointer width (got 32, expected 64) encountered v0
+    store.i8 v1, v0+16 ; error: invalid pointer width (got 32, expected 64) encountered v0
+    return v1
+}
+
+function %error_i64_atomic_store_load(i64) -> i64 {
+    ss0 = explicit_slot 8
+
+block0(v0: i64):
+    v1 = stack_addr.i32 ss0
+    atomic_store.i64 v0, v1; error: invalid pointer width (got 32, expected 64) encountered v1
+    v2 = atomic_load.i64 v1; error: invalid pointer width (got 32, expected 64) encountered v1
+    return v2
+}
+
+function %error_atomic_cas(i128, i128, i128) -> i128, i128 {
+    ss0 = explicit_slot 16
+
+block0(v0: i128, v1: i128, v2: i128):
+    stack_store.i128 v0, ss0
+    v3 = stack_addr.i32 ss0
+    v4 = atomic_cas.i128 v3, v1, v2; error: invalid pointer width (got 32, expected 64) encountered v3
+    v5 = stack_load.i128 ss0
+    return v5, v4
+}
+
+function %error_atomic_rmw_add_i64(i64, i64) -> i64, i64 {
+    ss0 = explicit_slot 8
+
+block0(v0: i64, v1: i64):
+    v2 = stack_addr.i32 ss0
+    store.i64 little v0, v2 ; error: invalid pointer width (got 32, expected 64) encountered v2
+
+    v3 = atomic_rmw.i64 little add v2, v1; error: invalid pointer width (got 32, expected 64) encountered v2
+
+    v4 = load.i64 little v2 ; error: invalid pointer width (got 32, expected 64) encountered v2
+    return v3, v4
+}
+
+function %error_fmsub_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
+    ss0 = explicit_slot 16
+block0(v0: f32x4, v1: f32x4, v2: f32x4):
+    v3 = stack_addr.i32 ss0
+    store.f32x4 v0, v3 ; error: invalid pointer width (got 32, expected 64) encountered v3
+    v4 = load.f32x4 v3 ; error: invalid pointer width (got 32, expected 64) encountered v3
+    v5 = fneg v2
+    v6 = fma v4, v1, v5
+    return v6
+}
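For frontends, the practical consequence of this rule is that address values must be built with the target's pointer type rather than a hard-coded i64. Inside Cranelift that type is available as `TargetIsa::pointer_type()`; the sketch below derives the same answer directly from a target-lexicon `Triple`, the same way `verify_is_address` does via `isa.triple().pointer_width()`. The helper name `clif_address_type` is illustrative, not an API introduced by this patch:

```rust
use std::str::FromStr;

use target_lexicon::{PointerWidth, Triple};

/// Illustrative helper: the CLIF integer type a frontend should use for
/// address values (`stack_addr` results, load/store pointers) on a given
/// target, per the rule the verifier now enforces.
fn clif_address_type(triple: &Triple) -> Result<&'static str, ()> {
    Ok(match triple.pointer_width()? {
        PointerWidth::U16 => "i16",
        PointerWidth::U32 => "i32",
        PointerWidth::U64 => "i64",
    })
}

fn main() {
    let t64 = Triple::from_str("x86_64-unknown-linux-gnu").unwrap();
    let t32 = Triple::from_str("i686-unknown-linux-gnu").unwrap();
    // Matches the split encoded in the new tests: i64 addresses verify on
    // 64-bit targets (pointer_width_64.clif), i32 on 32-bit targets
    // (pointer_width_32.clif).
    assert_eq!(clif_address_type(&t64), Ok("i64"));
    assert_eq!(clif_address_type(&t32), Ok("i32"));
}
```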