diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index c337c26a2..003102d59 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -160,6 +160,21 @@ jobs:
           rm -rf /tmp/.buildx-cache
           mv /tmp/.buildx-cache-new /tmp/.buildx-cache
 
+  miri:
+    name: Miri
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Install Rust (rustup)
+        run: rustup update nightly --no-self-update && rustup default nightly
+        shell: bash
+      - run: rustup component add miri
+      - run: cargo miri setup
+      - uses: Swatinem/rust-cache@v2
+      - run: ./ci/miri.sh
+
   rustfmt:
     name: Rustfmt
     runs-on: ubuntu-latest
@@ -190,6 +205,7 @@ jobs:
       - test
       - rustfmt
       - clippy
+      - miri
     runs-on: ubuntu-latest
     # GitHub branch protection is exceedingly silly and treats "jobs skipped because a dependency
     # failed" as success. So we have to do some contortions to ensure the job fails if any of its
diff --git a/ci/miri.sh b/ci/miri.sh
new file mode 100755
index 000000000..f9a1240a4
--- /dev/null
+++ b/ci/miri.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -ex
+
+# We need Tree Borrows as some of our raw pointer patterns are not
+# compatible with Stacked Borrows.
+export MIRIFLAGS="-Zmiri-tree-borrows"
+
+# One target that sets `mem-unaligned` and one that does not,
+# and a big-endian target.
+TARGETS=(x86_64-unknown-linux-gnu
+    armv7-unknown-linux-gnueabihf
+    s390x-unknown-linux-gnu)
+for TARGET in "${TARGETS[@]}"; do
+    # Only run the `mem` tests to avoid this taking too long.
+    cargo miri test --manifest-path testcrate/Cargo.toml --features no-asm --target $TARGET -- mem
+done
diff --git a/compiler-builtins/src/mem/impls.rs b/compiler-builtins/src/mem/impls.rs
index c602a67db..dc12d6996 100644
--- a/compiler-builtins/src/mem/impls.rs
+++ b/compiler-builtins/src/mem/impls.rs
@@ -41,6 +41,72 @@ unsafe fn read_usize_unaligned(x: *const usize) -> usize {
     core::mem::transmute(x_read)
 }
 
+/// Loads a `T`-sized chunk from `src` into `dst` at offset `offset`, if that does not exceed
+/// `load_sz`. The offset pointers must both be `T`-aligned. Returns the new offset, advanced by the
+/// chunk size if a load happened.
+#[cfg(not(feature = "mem-unaligned"))]
+#[inline(always)]
+unsafe fn load_chunk_aligned<T: Copy>(
+    src: *const usize,
+    dst: *mut usize,
+    load_sz: usize,
+    offset: usize,
+) -> usize {
+    let chunk_sz = core::mem::size_of::<T>();
+    if (load_sz & chunk_sz) != 0 {
+        *dst.wrapping_byte_add(offset).cast::<T>() = *src.wrapping_byte_add(offset).cast::<T>();
+        offset | chunk_sz
+    } else {
+        offset
+    }
+}
+
+/// Load `load_sz` many bytes from `src`, which must be usize-aligned. Acts as if we did a `usize`
+/// read with the out-of-bounds part filled with 0s.
+/// `load_sz` must be strictly less than `WORD_SIZE`.
+#[cfg(not(feature = "mem-unaligned"))]
+#[inline(always)]
+unsafe fn load_aligned_partial(src: *const usize, load_sz: usize) -> usize {
+    debug_assert!(load_sz < WORD_SIZE);
+    // We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
+    // (since `load_sz < WORD_SIZE`).
+    const { assert!(WORD_SIZE <= 8) };
+
+    let mut i = 0;
+    let mut out = 0usize;
+    // We load in decreasing order, so the pointers remain sufficiently aligned for the next step.
+    i = load_chunk_aligned::<u32>(src, &raw mut out, load_sz, i);
+    i = load_chunk_aligned::<u16>(src, &raw mut out, load_sz, i);
+    i = load_chunk_aligned::<u8>(src, &raw mut out, load_sz, i);
+    debug_assert!(i == load_sz);
+    out
+}
+
+/// Load `load_sz` many bytes from `src.wrapping_byte_add(WORD_SIZE - load_sz)`. `src` must be
+/// `usize`-aligned. The bytes are returned as the *last* bytes of the return value, i.e., this acts
+/// as if we had done a `usize` read from `src`, with the out-of-bounds part filled with 0s.
+/// `load_sz` must be strictly less than `WORD_SIZE`.
+#[cfg(not(feature = "mem-unaligned"))]
+#[inline(always)]
+unsafe fn load_aligned_end_partial(src: *const usize, load_sz: usize) -> usize {
+    debug_assert!(load_sz < WORD_SIZE);
+    // We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
+    // (since `load_sz < WORD_SIZE`).
+    const { assert!(WORD_SIZE <= 8) };
+
+    let mut i = 0;
+    let mut out = 0usize;
+    // Obtain pointers pointing to the beginning of the range we want to load.
+    let src_shifted = src.wrapping_byte_add(WORD_SIZE - load_sz);
+    let out_shifted = (&raw mut out).wrapping_byte_add(WORD_SIZE - load_sz);
+    // We load in increasing order, so by the time we reach `u16` things are 2-aligned etc.
+    i = load_chunk_aligned::<u8>(src_shifted, out_shifted, load_sz, i);
+    i = load_chunk_aligned::<u16>(src_shifted, out_shifted, load_sz, i);
+    i = load_chunk_aligned::<u32>(src_shifted, out_shifted, load_sz, i);
+    debug_assert!(i == load_sz);
+    out
+}
+
 #[inline(always)]
 pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, mut n: usize) {
     #[inline(always)]
@@ -66,40 +132,57 @@ pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, mut n: usize)
         }
     }
 
+    /// `n` is in units of bytes, but must be a multiple of the word size and must not be 0.
+    /// `src` *must not* be `usize`-aligned.
     #[cfg(not(feature = "mem-unaligned"))]
     #[inline(always)]
     unsafe fn copy_forward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
+        debug_assert!(n > 0 && n % WORD_SIZE == 0);
+        debug_assert!(src.addr() % WORD_SIZE != 0);
+
         let mut dest_usize = dest as *mut usize;
         let dest_end = dest.wrapping_add(n) as *mut usize;
 
         // Calculate the misalignment offset and shift needed to reassemble value.
+        // Since `src` is definitely not aligned, `offset` is in the range 1..WORD_SIZE.
         let offset = src as usize & WORD_MASK;
         let shift = offset * 8;
 
         // Realign src
-        let mut src_aligned = (src as usize & !WORD_MASK) as *mut usize;
-        // This will read (but won't use) bytes out of bound.
-        // cfg needed because not all targets will have atomic loads that can be lowered
-        // (e.g. BPF, MSP430), or provided by an external library (e.g. RV32I)
-        #[cfg(target_has_atomic_load_store = "ptr")]
-        let mut prev_word = core::intrinsics::atomic_load_unordered(src_aligned);
-        #[cfg(not(target_has_atomic_load_store = "ptr"))]
-        let mut prev_word = core::ptr::read_volatile(src_aligned);
+        let mut src_aligned = src.wrapping_byte_sub(offset) as *mut usize;
+        let mut prev_word = load_aligned_end_partial(src_aligned, WORD_SIZE - offset);
 
-        while dest_usize < dest_end {
+        while dest_usize.wrapping_add(1) < dest_end {
             src_aligned = src_aligned.wrapping_add(1);
             let cur_word = *src_aligned;
-            #[cfg(target_endian = "little")]
-            let resembled = prev_word >> shift | cur_word << (WORD_SIZE * 8 - shift);
-            #[cfg(target_endian = "big")]
-            let resembled = prev_word << shift | cur_word >> (WORD_SIZE * 8 - shift);
+            let reassembled = if cfg!(target_endian = "little") {
+                prev_word >> shift | cur_word << (WORD_SIZE * 8 - shift)
+            } else {
+                prev_word << shift | cur_word >> (WORD_SIZE * 8 - shift)
+            };
             prev_word = cur_word;
 
-            *dest_usize = resembled;
+            *dest_usize = reassembled;
             dest_usize = dest_usize.wrapping_add(1);
         }
+
+        // There's one more element left to go, and we can't use the loop for that as on the `src` side,
+        // it is partially out-of-bounds.
+        src_aligned = src_aligned.wrapping_add(1);
+        let cur_word = load_aligned_partial(src_aligned, offset);
+        let reassembled = if cfg!(target_endian = "little") {
+            prev_word >> shift | cur_word << (WORD_SIZE * 8 - shift)
+        } else {
+            prev_word << shift | cur_word >> (WORD_SIZE * 8 - shift)
+        };
+        // prev_word does not matter any more
+
+        *dest_usize = reassembled;
+        // dest_usize does not matter any more
     }
 
+    /// `n` is in units of bytes, but must be a multiple of the word size and must not be 0.
+    /// `src` *must not* be `usize`-aligned.
     #[cfg(feature = "mem-unaligned")]
     #[inline(always)]
     unsafe fn copy_forward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
@@ -164,40 +247,57 @@ pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, mut n: usize) {
         }
     }
 
+    /// `n` is in units of bytes, but must be a multiple of the word size and must not be 0.
+    /// `src` *must not* be `usize`-aligned.
     #[cfg(not(feature = "mem-unaligned"))]
     #[inline(always)]
     unsafe fn copy_backward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
+        debug_assert!(n > 0 && n % WORD_SIZE == 0);
+        debug_assert!(src.addr() % WORD_SIZE != 0);
+
         let mut dest_usize = dest as *mut usize;
-        let dest_start = dest.wrapping_sub(n) as *mut usize;
+        let dest_start = dest.wrapping_sub(n) as *mut usize; // we're moving towards the start
 
         // Calculate the misalignment offset and shift needed to reassemble value.
+        // Since `src` is definitely not aligned, `offset` is in the range 1..WORD_SIZE.
         let offset = src as usize & WORD_MASK;
         let shift = offset * 8;
 
-        // Realign src_aligned
-        let mut src_aligned = (src as usize & !WORD_MASK) as *mut usize;
-        // This will read (but won't use) bytes out of bound.
-        // cfg needed because not all targets will have atomic loads that can be lowered
-        // (e.g. BPF, MSP430), or provided by an external library (e.g. RV32I)
-        #[cfg(target_has_atomic_load_store = "ptr")]
-        let mut prev_word = core::intrinsics::atomic_load_unordered(src_aligned);
-        #[cfg(not(target_has_atomic_load_store = "ptr"))]
-        let mut prev_word = core::ptr::read_volatile(src_aligned);
+        // Realign src
+        let mut src_aligned = src.wrapping_byte_sub(offset) as *mut usize;
+        let mut prev_word = load_aligned_partial(src_aligned, offset);
 
-        while dest_start < dest_usize {
+        while dest_start.wrapping_add(1) < dest_usize {
             src_aligned = src_aligned.wrapping_sub(1);
             let cur_word = *src_aligned;
-            #[cfg(target_endian = "little")]
-            let resembled = prev_word << (WORD_SIZE * 8 - shift) | cur_word >> shift;
-            #[cfg(target_endian = "big")]
-            let resembled = prev_word >> (WORD_SIZE * 8 - shift) | cur_word << shift;
+            let reassembled = if cfg!(target_endian = "little") {
+                prev_word << (WORD_SIZE * 8 - shift) | cur_word >> shift
+            } else {
+                prev_word >> (WORD_SIZE * 8 - shift) | cur_word << shift
+            };
            prev_word = cur_word;
 
             dest_usize = dest_usize.wrapping_sub(1);
-            *dest_usize = resembled;
+            *dest_usize = reassembled;
         }
+
+        // There's one more element left to go, and we can't use the loop for that as on the `src` side,
+        // it is partially out-of-bounds.
+        src_aligned = src_aligned.wrapping_sub(1);
+        let cur_word = load_aligned_end_partial(src_aligned, WORD_SIZE - offset);
+        let reassembled = if cfg!(target_endian = "little") {
+            prev_word << (WORD_SIZE * 8 - shift) | cur_word >> shift
+        } else {
+            prev_word >> (WORD_SIZE * 8 - shift) | cur_word << shift
+        };
+        // prev_word does not matter any more
+
+        dest_usize = dest_usize.wrapping_sub(1);
+        *dest_usize = reassembled;
     }
 
+    /// `n` is in units of bytes, but must be a multiple of the word size and must not be 0.
+    /// `src` *must not* be `usize`-aligned.
     #[cfg(feature = "mem-unaligned")]
     #[inline(always)]
     unsafe fn copy_backward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
diff --git a/testcrate/tests/mem.rs b/testcrate/tests/mem.rs
index 48ac95adc..d838ef159 100644
--- a/testcrate/tests/mem.rs
+++ b/testcrate/tests/mem.rs
@@ -128,11 +128,13 @@ fn memcmp_eq() {
 #[test]
 fn memcmp_ne() {
     let arr1 @ arr2 = gen_arr::<256>();
-    for i in 0..256 {
+    // Reduce iteration count in Miri as it is too slow otherwise.
+    let limit = if cfg!(miri) { 64 } else { 256 };
+    for i in 0..limit {
         let mut diff_arr = arr1;
         diff_arr.0[i] = 127;
         let expect = diff_arr.0[i].cmp(&arr2.0[i]);
-        for k in i + 1..256 {
+        for k in i + 1..limit {
             let result = unsafe { memcmp(diff_arr.0.as_ptr(), arr2.0.as_ptr(), k) };
             assert_eq!(expect, result.cmp(&0));
         }
@@ -230,6 +232,23 @@ fn memmove_backward_aligned() {
     }
 }
 
+#[test]
+fn memmove_misaligned_bounds() {
+    // The above tests have the downside that the addresses surrounding the range-to-copy are all
+    // still in-bounds, so Miri would not actually complain about OOB accesses. So we also test with
+    // an array that has just the right size. We test a few times to avoid it being accidentally
+    // aligned.
+    for _ in 0..8 {
+        let mut arr1 = [0u8; 17];
+        let mut arr2 = [0u8; 17];
+        unsafe {
+            // Copy both ways so we hit both the forward and backward cases.
+            memmove(arr1.as_mut_ptr(), arr2.as_mut_ptr(), 17);
+            memmove(arr2.as_mut_ptr(), arr1.as_mut_ptr(), 17);
+        }
+    }
+}
+
 #[test]
 fn memset_backward_misaligned_nonaligned_start() {
     let mut arr = gen_arr::<32>();
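
Illustrative sketch (not part of the patch): the misaligned copy loops above reassemble each destination word from two adjacent aligned loads via shifts. The standalone program below models the little-endian branch, `prev_word >> shift | cur_word << (WORD_SIZE * 8 - shift)`, using safe `from_le_bytes` reads in place of the raw aligned pointer loads, and checks it against a direct read of the same bytes. The buffer and variable names other than `prev_word`, `cur_word`, `shift`, and `WORD_SIZE` are invented for the demonstration; the big-endian branch mirrors this with the shift directions swapped.

// Standalone demo of the word-reassembly trick (little-endian formula).
// No unsafe is needed here because the "aligned loads" are modeled with
// `from_le_bytes` on in-bounds slices.
fn main() {
    const WORD_SIZE: usize = core::mem::size_of::<usize>();
    // Two aligned words worth of recognizable bytes: 0, 1, 2, ...
    let buf: [u8; 2 * WORD_SIZE] = core::array::from_fn(|i| i as u8);

    // `offset` plays the role of `src as usize & WORD_MASK`; it is never 0 on the
    // misaligned paths, so the shift amounts below stay strictly below the word width.
    for offset in 1..WORD_SIZE {
        let shift = offset * 8;
        // The two aligned loads that bracket the misaligned word.
        let prev_word = usize::from_le_bytes(buf[..WORD_SIZE].try_into().unwrap());
        let cur_word = usize::from_le_bytes(buf[WORD_SIZE..].try_into().unwrap());
        // Little-endian reassembly, as in `copy_forward_misaligned_words`.
        let reassembled = prev_word >> shift | cur_word << (WORD_SIZE * 8 - shift);

        // Reference value: what a (conceptually misaligned) read at `offset` would return.
        let expected = usize::from_le_bytes(buf[offset..offset + WORD_SIZE].try_into().unwrap());
        assert_eq!(reassembled, expected);
    }
    println!("reassembly matches direct reads for offsets 1..{}", WORD_SIZE);
}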