diff --git a/library/core/src/mem/mod.rs b/library/core/src/mem/mod.rs
index 7d2f297152365..af088b4cfb7de 100644
--- a/library/core/src/mem/mod.rs
+++ b/library/core/src/mem/mod.rs
@@ -734,14 +734,23 @@ pub const fn swap<T>(x: &mut T, y: &mut T) {
     // a backend can choose to implement using the block optimization, or not.
     #[cfg(not(any(target_arch = "spirv")))]
     {
+        // Types with alignment bigger than usize are almost always used for
+        // hand-tuned SIMD optimizations, so we don't get in the way.
+        //
         // For types that are larger multiples of their alignment, the simple way
         // tends to copy the whole thing to stack rather than doing it one part
-        // at a time, so instead treat them as one-element slices and piggy-back
-        // the slice optimizations that will split up the swaps.
-        if size_of::<T>() / align_of::<T>() > 4 {
-            // SAFETY: exclusive references always point to one non-overlapping
-            // element and are non-null and properly aligned.
-            return unsafe { ptr::swap_nonoverlapping(x, y, 1) };
+        // at a time, so instead try to split them into chunks that fit into registers
+        // and swap chunks.
+        if const {
+            let size = size_of::<T>();
+            let align = align_of::<T>();
+            // It is weird that LLVM sometimes optimizes `4 * align` fine while failing `3 * align`.
+            // Proof: https://godbolt.org/z/MhnqvjjPz
+            (align < align_of::<usize>() && (size == 3 * align || size > 4 * align))
+                // And usize fails even for `4 * align`.
+                || (align == align_of::<usize>() && size > 2 * align)
+        } {
+            return swap_chunked(x, y);
         }
     }

@@ -784,6 +793,236 @@ pub(crate) const fn swap_simple<T>(x: &mut T, y: &mut T) {
     }
 }

+// This version swaps two values in chunks of usize and smaller,
+// using unaligned reads and writes because they have been cheap
+// on modern x86_64 processors for at least 10 years now (as of 2022-07-04).
+// https://lemire.me/blog/2012/05/31/data-alignment-for-speed-myth-or-reality/
+// It generates fewer instructions and memory accesses as well: https://godbolt.org/z/Mr4rWfoad
+// Feel free to add other targets as well if they have fast unaligned accesses.
+//
+// This should be done by the backend optimizer, but it currently fails to do so.
+#[cfg(target_arch = "x86_64")]
+#[rustc_const_unstable(feature = "const_swap", issue = "83163")]
+#[inline]
+const fn swap_chunked<T>(x: &mut T, y: &mut T) {
+    // Algorithm:
+    // 1. Swap the first `n * ZMM_BYTES` bytes using `u64` chunks, which LLVM reliably autovectorizes.
+    // 2. Force the backend to use SIMD registers (YMM or XMM) via `force_swap_simd`.
+    // 3. Swap the remaining bytes using integers, each half the size of the previous one.
+
+    // Note: the current version of this function is optimized for x86_64.
+    // If you allow its use for another architecture, check the generated code first.
+    const XMM_BYTES: usize = 128 / 8;
+    const YMM_BYTES: usize = 256 / 8;
+    const ZMM_BYTES: usize = 512 / 8;
+
+    /// This function autovectorizes successfully if `size_bytes` is divisible
+    /// by the biggest available SIMD register size. If it is not, it generates
+    /// SIMD operations for the divisible prefix, then emits plain integer reads and writes.
+    /// # Safety
+    /// 1. Must not overlap. 2. Must have at least `size_bytes` valid bytes.
+    /// 3. `size_bytes` must be exactly divisible by `size_of::<u64>()`.
+    #[inline]
+    const unsafe fn swap_simple_u64_chunks(
+        size_bytes: usize,
+        x: *mut MaybeUninit<u8>,
+        y: *mut MaybeUninit<u8>,
+    ) {
+        let mut byte_offset = 0;
+        while byte_offset < size_bytes {
+            // SAFETY:
+            // The caller must ensure pointer validity and range.
+            // We use unaligned reads/writes.
+            unsafe {
+                let x = x.add(byte_offset).cast();
+                let y = y.add(byte_offset).cast();
+                // Same rationale for 2 reads and 2 writes
+                // as in `swap_simple`.
+                let tmp_x: MaybeUninit<u64> = ptr::read_unaligned(x);
+                let tmp_y: MaybeUninit<u64> = ptr::read_unaligned(y);
+                ptr::write_unaligned(x, tmp_y);
+                ptr::write_unaligned(y, tmp_x);
+            }
+            byte_offset += const { size_of::<u64>() };
+        }
+    }
+
+    /// This function generates a swap of the next `SIZE_BYTES` bytes using SIMD registers.
+    /// It has a drawback: it generates the swap using `SIZE_BYTES` only.
+    /// E.g. if the type has ZMM_BYTES size but its swap is implemented using only `force_swap_simd`,
+    /// it would use only XMM registers.
+    /// So this function should be used only once, because if it is called twice,
+    /// it is better to use a bigger register instead.
+    ///
+    /// It is OK to call it with YMM_BYTES even if only SSE is enabled because
+    /// it would just use 4 XMM registers.
+    /// # Safety
+    /// 1. Must not overlap. 2. Must have at least `SIZE_BYTES` valid bytes.
+    #[inline]
+    const unsafe fn force_swap_simd<const SIZE_BYTES: usize>(
+        x: *mut MaybeUninit<u8>,
+        y: *mut MaybeUninit<u8>,
+    ) {
+        const {
+            assert!(
+                SIZE_BYTES == XMM_BYTES || SIZE_BYTES == YMM_BYTES,
+                "Must have valid SIMD register size",
+            );
+        }
+        // SAFETY: We require valid non-overlapping pointers with SIZE_BYTES valid bytes.
+        // We checked that they are SIMD-register sized.
+        unsafe {
+            // Don't use an array for the temporary here because it ends up on the stack.
+            // E.g. it would copy from memory to a register,
+            // from the register to the stack, from the stack to a register,
+            // and from the register to the destination.
+
+            const {
+                assert!(XMM_BYTES == size_of::<u64>() * 2, "Check number of temporaries below.");
+                assert!(YMM_BYTES == size_of::<u64>() * 4, "Check number of temporaries below.");
+            }
+            let x: *mut MaybeUninit<u64> = x.cast();
+            let y: *mut MaybeUninit<u64> = y.cast();
+            // Use separate variables instead.
+            // They are successfully folded into a YMM register when compiled with AVX,
+            // a pair of XMM registers in `force_swap_simd::<YMM_BYTES>` without AVX,
+            // or a single XMM register in `force_swap_simd::<XMM_BYTES>`.
+            let x0: MaybeUninit<u64> = ptr::read_unaligned(x);
+            let x1: MaybeUninit<u64> = ptr::read_unaligned(x.add(1));
+            let x2: MaybeUninit<u64>;
+            let x3: MaybeUninit<u64>;
+            if const { SIZE_BYTES == YMM_BYTES } {
+                x2 = ptr::read_unaligned(x.add(2));
+                x3 = ptr::read_unaligned(x.add(3));
+            } else {
+                x2 = MaybeUninit::uninit();
+                x3 = MaybeUninit::uninit();
+            }
+
+            // Unlike the simple swap, we need to use a direct move here
+            // instead of a temporary value for `y` like in `swap_simple`,
+            // because that causes the temporaries for `x` to be copied to the stack.
+
+            // Cast to `MaybeUninit<u8>` because `copy_nonoverlapping` requires correct alignment.
+            ptr::copy_nonoverlapping::<MaybeUninit<u8>>(y.cast(), x.cast(), SIZE_BYTES);
+
+            ptr::write_unaligned(y, x0);
+            ptr::write_unaligned(y.add(1), x1);
+            if const { SIZE_BYTES == YMM_BYTES } {
+                ptr::write_unaligned(y.add(2), x2);
+                ptr::write_unaligned(y.add(3), x3);
+            }
+        }
+    }
+
+    /// Swaps the first `size_of::<ChunkTy>()` bytes of the tail.
+    /// SAFETY:
+    /// `x` and `y` must not overlap.
+    /// `x` and `y` must have at least `size_of::<ChunkTy>()` valid bytes.
+    #[inline]
+    const unsafe fn swap_tail<ChunkTy>(x: *mut MaybeUninit<u8>, y: *mut MaybeUninit<u8>) {
+        // SAFETY: The caller must ensure pointer validity.
+        // We use unaligned reads/writes.
+        unsafe {
+            // Same rationale for 2 reads and 2 writes
+            // as in `swap_simple`.
+            let tmp_x: MaybeUninit<ChunkTy> = ptr::read_unaligned(x.cast());
+            let tmp_y: MaybeUninit<ChunkTy> = ptr::read_unaligned(y.cast());
+            ptr::write_unaligned(x.cast(), tmp_y);
+            ptr::write_unaligned(y.cast(), tmp_x);
+        }
+    }
+
+    const {
+        assert!(size_of::<T>() <= usize::MAX / 4, "We assume that overflows cannot happen.");
+    }
+
+    let x: *mut MaybeUninit<u8> = (x as *mut T).cast();
+    let y: *mut MaybeUninit<u8> = (y as *mut T).cast();
+
+    // I would like to add detection of available SIMD features here,
+    // but since the standard library is distributed precompiled,
+    // `cfg!(target_feature = "xxx")` evaluates to false here
+    // even if the final binary is built with those features.
+
+    let size = const { size_of::<T>() };
+    let mut byte_offset = 0;
+
+    // This part autovectorizes to use the biggest SIMD register available.
+    // SAFETY: The pointers are valid because they come from references,
+    // and `exactly_divisible` <= `size` because we removed the remainder.
+    // The whole function doesn't contain places that can panic,
+    // so the function won't be interrupted before the swapping ends.
+    unsafe {
+        let exactly_divisible = const {
+            let size = size_of::<T>();
+            size - size % ZMM_BYTES
+        };
+        swap_simple_u64_chunks(exactly_divisible, x, y);
+        byte_offset += exactly_divisible;
+    }
+    if byte_offset + YMM_BYTES <= size {
+        // SAFETY: The pointers don't overlap because mutable references don't overlap.
+        // We just checked the range.
+        // The whole function doesn't contain places that can panic,
+        // so the function won't be interrupted before the swapping ends.
+        unsafe {
+            // We need to do this only once because the tail after the ZMM_BYTES chunks cannot contain more than one such chunk.
+            // It is OK to do this even if AVX is not enabled because it would just use 4 XMM registers.
+            force_swap_simd::<YMM_BYTES>(x.add(byte_offset), y.add(byte_offset));
+            byte_offset += YMM_BYTES;
+        }
+    }
+    if byte_offset + XMM_BYTES <= size {
+        // SAFETY: The pointers don't overlap because mutable references don't overlap.
+        // We just checked the range.
+        // The whole function doesn't contain places that can panic,
+        // so the function won't be interrupted before the swapping ends.
+        unsafe {
+            // We need to do this only once because the tail after the YMM_BYTES chunks cannot contain more than one such chunk.
+            force_swap_simd::<XMM_BYTES>(x.add(byte_offset), y.add(byte_offset));
+            byte_offset += XMM_BYTES;
+        }
+    }
+
+    macro_rules! swap_tail_by {
+        ($t:ty) => {
+            // SAFETY: The pointers don't overlap because mutable references don't overlap.
+            // We never access the pointers in a way that requires alignment,
+            // and the whole function doesn't contain places that can panic,
+            // so it is drop safe.
+            // We swapped the first `size_of::<T>() / 16 * 16` bytes already.
+            // We try the `swap_tail` sizes in order from bigger to smaller.
+            unsafe {
+                if byte_offset + const { size_of::<$t>() } <= size {
+                    swap_tail::<$t>(x.add(byte_offset), y.add(byte_offset));
+                    byte_offset += const { size_of::<$t>() };
+                }
+            }
+        };
+    }
+    swap_tail_by!(u64);
+    swap_tail_by!(u32);
+    swap_tail_by!(u16);
+    // Swapping by `u8` is guaranteed to finish all bytes
+    // because it has `size_of == 1`.
+    swap_tail_by!(u8);
+    let _ = byte_offset;
+}
+
+// For platforms with slow unaligned accesses
+// we can just delegate to pointer swaps.
+#[cfg(not(target_arch = "x86_64"))]
+#[rustc_const_unstable(feature = "const_swap", issue = "83163")]
+#[inline]
+pub(crate) const fn swap_chunked<T>(x: &mut T, y: &mut T) {
+    // SAFETY: exclusive references always point to one non-overlapping
+    // element and are non-null and properly aligned.
+    unsafe {
+        ptr::swap_nonoverlapping(x, y, 1);
+    }
+}
+
 /// Replaces `dest` with the default value of `T`, returning the previous `dest` value.
 ///
 /// * If you want to replace the values of two variables, see [`swap`].
diff --git a/library/core/src/ptr/mod.rs b/library/core/src/ptr/mod.rs
index 5f55f762ad555..46b377a9756a8 100644
--- a/library/core/src/ptr/mod.rs
+++ b/library/core/src/ptr/mod.rs
@@ -942,6 +942,8 @@ pub const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
     {
         attempt_swap_as_chunks!(usize);
         attempt_swap_as_chunks!(u8);
+        // Should not happen because everything can be split into u8 chunks.
+        unreachable!();
     }

     // SAFETY: Same preconditions as this function
diff --git a/tests/codegen/swap-small-types.rs b/tests/codegen/swap-small-types.rs
index 03e2a2327fc4c..42514cc246334 100644
--- a/tests/codegen/swap-small-types.rs
+++ b/tests/codegen/swap-small-types.rs
@@ -11,10 +11,27 @@ type RGB48 = [u16; 3];
 // CHECK-LABEL: @swap_rgb48
 #[no_mangle]
 pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
-    // FIXME MIR inlining messes up LLVM optimizations.
-// WOULD-CHECK-NOT: alloca
-// WOULD-CHECK: load i48
-// WOULD-CHECK: store i48
+    // CHECK-NOT: alloca
+    // Those can be supported only by the backend.
+    // WOULD-CHECK: load i48
+    // WOULD-CHECK: store i48
+    // CHECK: ret void
+    swap(x, y)
+}
+
+// CHECK-LABEL: @swap_vecs
+#[no_mangle]
+pub fn swap_vecs(x: &mut Vec<usize>, y: &mut Vec<usize>) {
+    // CHECK-NOT: alloca
+    // CHECK: ret void
+    swap(x, y)
+}
+
+// CHECK-LABEL: @swap_slices
+#[no_mangle]
+pub fn swap_slices<'a>(x: &mut &'a [u32], y: &mut &'a [u32]) {
+    // CHECK-NOT: alloca
+    // CHECK: ret void
     swap(x, y)
 }

@@ -25,9 +42,9 @@ type RGB24 = [u8; 3];
 // CHECK-LABEL: @swap_rgb24_slices
 #[no_mangle]
 pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) {
-// CHECK-NOT: alloca
-// CHECK: load <{{[0-9]+}} x i8>
-// CHECK: store <{{[0-9]+}} x i8>
+    // CHECK-NOT: alloca
+    // CHECK: load <{{[0-9]+}} x i8>
+    // CHECK: store <{{[0-9]+}} x i8>
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
@@ -39,9 +56,9 @@ type RGBA32 = [u8; 4];
 // CHECK-LABEL: @swap_rgba32_slices
 #[no_mangle]
 pub fn swap_rgba32_slices(x: &mut [RGBA32], y: &mut [RGBA32]) {
-// CHECK-NOT: alloca
-// CHECK: load <{{[0-9]+}} x i32>
-// CHECK: store <{{[0-9]+}} x i32>
+    // CHECK-NOT: alloca
+    // CHECK: load <{{[0-9]+}} x i32>
+    // CHECK: store <{{[0-9]+}} x i32>
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
@@ -54,10 +71,24 @@ const _: () = assert!(!std::mem::size_of::<String>().is_power_of_two());
 // CHECK-LABEL: @swap_string_slices
 #[no_mangle]
 pub fn swap_string_slices(x: &mut [String], y: &mut [String]) {
-// CHECK-NOT: alloca
-// CHECK: load <{{[0-9]+}} x i64>
-// CHECK: store <{{[0-9]+}} x i64>
+    // CHECK-NOT: alloca
+    // CHECK: load <{{[0-9]+}} x i64>
+    // CHECK: store <{{[0-9]+}} x i64>
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
 }
+
+#[repr(C, packed)]
+pub struct Packed {
+    pub first: bool,
+    pub second: usize,
+}
+
+// CHECK-LABEL: @swap_packed_structs
+#[no_mangle]
+pub fn swap_packed_structs(x: &mut Packed, y: &mut Packed) {
+    // CHECK-NOT: alloca
+    // CHECK: ret void
+    swap(x, y)
+}
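
The following standalone sketch is not part of the patch; it only mirrors, in plain Rust, the dispatch condition added to `swap` and the byte decomposition that the new x86_64 `swap_chunked` performs: a `u64` loop over the largest 64-byte-divisible prefix, at most one forced 32-byte and one forced 16-byte SIMD swap, and then a `u64`/`u32`/`u16`/`u8` tail. The helper names `uses_chunked_swap` and `chunk_plan` are invented here for illustration only.

// Illustration only: these helpers are not part of the patch.
use std::mem::align_of;

const XMM_BYTES: usize = 128 / 8;
const YMM_BYTES: usize = 256 / 8;
const ZMM_BYTES: usize = 512 / 8;

// Mirrors the `if const { ... }` condition added to `swap`.
fn uses_chunked_swap(size: usize, align: usize) -> bool {
    let usize_align = align_of::<usize>();
    (align < usize_align && (size == 3 * align || size > 4 * align))
        || (align == usize_align && size > 2 * align)
}

// Mirrors the byte decomposition done by the x86_64 `swap_chunked`.
fn chunk_plan(size: usize) -> Vec<(&'static str, usize)> {
    let mut plan = Vec::new();
    let mut offset = 0;

    // Step 1: the largest prefix divisible by ZMM_BYTES, swapped as a `u64` loop
    // that LLVM is expected to autovectorize.
    let exactly_divisible = size - size % ZMM_BYTES;
    if exactly_divisible > 0 {
        plan.push(("u64 loop prefix", exactly_divisible));
        offset += exactly_divisible;
    }
    // Step 2: at most one forced YMM-sized and one forced XMM-sized swap.
    if offset + YMM_BYTES <= size {
        plan.push(("forced 32-byte SIMD", YMM_BYTES));
        offset += YMM_BYTES;
    }
    if offset + XMM_BYTES <= size {
        plan.push(("forced 16-byte SIMD", XMM_BYTES));
        offset += XMM_BYTES;
    }
    // Step 3: the integer tail; each chunk fires at most once because the
    // remainder is already smaller than 16 bytes at this point.
    for (name, bytes) in [("u64", 8usize), ("u32", 4), ("u16", 2), ("u8", 1)] {
        if offset + bytes <= size {
            plan.push((name, bytes));
            offset += bytes;
        }
    }
    assert_eq!(offset, size, "every byte is covered exactly once");
    plan
}

fn main() {
    // A 30-byte type with alignment 2 (e.g. `[u16; 15]`) takes the chunked path:
    // one 16-byte SIMD chunk, then a u64 + u32 + u16 tail.
    assert!(uses_chunked_swap(30, 2));
    println!("{:?}", chunk_plan(30));
    // Prints: [("forced 16-byte SIMD", 16), ("u64", 8), ("u32", 4), ("u16", 2)]
}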