summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/tools/miri/.github/workflows/ci.yml6
-rw-r--r--src/tools/miri/Cargo.lock193
-rw-r--r--src/tools/miri/Cargo.toml2
-rw-r--r--src/tools/miri/README.md8
-rw-r--r--src/tools/miri/clippy.toml1
-rw-r--r--src/tools/miri/rust-version2
-rw-r--r--src/tools/miri/src/bin/miri.rs5
-rw-r--r--src/tools/miri/src/borrow_tracker/stacked_borrows/stack.rs2
-rw-r--r--src/tools/miri/src/concurrency/data_race.rs68
-rw-r--r--src/tools/miri/src/concurrency/thread.rs10
-rw-r--r--src/tools/miri/src/concurrency/vector_clock.rs12
-rw-r--r--src/tools/miri/src/concurrency/weak_memory.rs8
-rw-r--r--src/tools/miri/src/eval.rs10
-rw-r--r--src/tools/miri/src/shims/env.rs111
-rw-r--r--src/tools/miri/src/shims/os_str.rs68
-rw-r--r--src/tools/miri/src/shims/time.rs78
-rw-r--r--src/tools/miri/src/shims/unix/foreign_items.rs5
-rw-r--r--src/tools/miri/src/shims/unix/linux/mem.rs4
-rw-r--r--src/tools/miri/src/shims/unix/mem.rs16
-rw-r--r--src/tools/miri/src/shims/windows/foreign_items.rs34
-rw-r--r--src/tools/miri/src/shims/x86/avx.rs71
-rw-r--r--src/tools/miri/src/shims/x86/avx2.rs444
-rw-r--r--src/tools/miri/src/shims/x86/mod.rs405
-rw-r--r--src/tools/miri/src/shims/x86/sse.rs2
-rw-r--r--src/tools/miri/src/shims/x86/sse2.rs79
-rw-r--r--src/tools/miri/src/shims/x86/sse41.rs59
-rw-r--r--src/tools/miri/src/shims/x86/ssse3.rs65
-rw-r--r--src/tools/miri/tests/fail/both_borrows/aliasing_mut4.rs2
-rw-r--r--src/tools/miri/tests/fail/dangling_pointers/storage_dead_dangling.rs2
-rw-r--r--src/tools/miri/tests/fail/dangling_pointers/storage_dead_dangling.stderr4
-rw-r--r--src/tools/miri/tests/pass-dep/shims/libc-misc.rs45
-rw-r--r--src/tools/miri/tests/pass/adjacent-allocs.rs2
-rw-r--r--src/tools/miri/tests/pass/const-addrs.rs2
-rw-r--r--src/tools/miri/tests/pass/dyn-upcast.rs20
-rw-r--r--src/tools/miri/tests/pass/intrinsics-x86-avx2.rs1613
-rw-r--r--src/tools/miri/tests/pass/issues/issue-miri-1909.rs2
-rw-r--r--src/tools/miri/tests/pass/shims/env/home.rs2
-rw-r--r--src/tools/miri/tests/pass/shims/env/var-set.rs7
38 files changed, 3055 insertions, 414 deletions
diff --git a/src/tools/miri/.github/workflows/ci.yml b/src/tools/miri/.github/workflows/ci.yml
index b0dab9f509d..73afd2a12a9 100644
--- a/src/tools/miri/.github/workflows/ci.yml
+++ b/src/tools/miri/.github/workflows/ci.yml
@@ -32,7 +32,7 @@ jobs:
env:
HOST_TARGET: ${{ matrix.host_target }}
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- name: Show Rust version (stable toolchain)
run: |
@@ -85,7 +85,7 @@ jobs:
name: style checks
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
# This is exactly duplicated from above. GHA is pretty terrible when it comes
# to avoiding code duplication.
@@ -191,7 +191,7 @@ jobs:
The Miri Cronjobs Bot'
# Attempt to auto-sync with rustc
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
with:
fetch-depth: 256 # get a bit more of the history
- name: install josh-proxy
diff --git a/src/tools/miri/Cargo.lock b/src/tools/miri/Cargo.lock
index 4fb479e1c54..293b937a5e5 100644
--- a/src/tools/miri/Cargo.lock
+++ b/src/tools/miri/Cargo.lock
@@ -38,6 +38,21 @@ dependencies = [
]
[[package]]
+name = "android-tzdata"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
+
+[[package]]
+name = "android_system_properties"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
+dependencies = [
+ "libc",
+]
+
+[[package]]
name = "annotate-snippets"
version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -107,6 +122,12 @@ dependencies = [
]
[[package]]
+name = "bumpalo"
+version = "3.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
+
+[[package]]
name = "camino"
version = "1.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -151,6 +172,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
+name = "chrono"
+version = "0.4.38"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
+dependencies = [
+ "android-tzdata",
+ "iana-time-zone",
+ "num-traits",
+ "windows-targets 0.52.3",
+]
+
+[[package]]
name = "cipher"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -217,6 +250,12 @@ dependencies = [
]
[[package]]
+name = "core-foundation-sys"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f"
+
+[[package]]
name = "cpufeatures"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -261,6 +300,27 @@ dependencies = [
]
[[package]]
+name = "directories"
+version = "5.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a49173b84e034382284f27f1af4dcbbd231ffa358c0fe316541a7337f376a35"
+dependencies = [
+ "dirs-sys",
+]
+
+[[package]]
+name = "dirs-sys"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c"
+dependencies = [
+ "libc",
+ "option-ext",
+ "redox_users",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
name = "encode_unicode"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -320,6 +380,29 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"
[[package]]
+name = "iana-time-zone"
+version = "0.1.60"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141"
+dependencies = [
+ "android_system_properties",
+ "core-foundation-sys",
+ "iana-time-zone-haiku",
+ "js-sys",
+ "wasm-bindgen",
+ "windows-core",
+]
+
+[[package]]
+name = "iana-time-zone-haiku"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
+dependencies = [
+ "cc",
+]
+
+[[package]]
name = "indenter"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -373,6 +456,15 @@ dependencies = [
]
[[package]]
+name = "js-sys"
+version = "0.3.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d"
+dependencies = [
+ "wasm-bindgen",
+]
+
+[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -420,6 +512,16 @@ dependencies = [
]
[[package]]
+name = "libredox"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d"
+dependencies = [
+ "bitflags 2.4.2",
+ "libc",
+]
+
+[[package]]
name = "linux-raw-sys"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -484,8 +586,10 @@ name = "miri"
version = "0.1.0"
dependencies = [
"aes",
+ "chrono",
"colored",
"ctrlc",
+ "directories",
"getrandom",
"jemalloc-sys",
"lazy_static",
@@ -513,6 +617,15 @@ dependencies = [
]
[[package]]
+name = "num-traits"
+version = "0.2.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
name = "number_prefix"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -534,6 +647,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
[[package]]
+name = "option-ext"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
+
+[[package]]
name = "owo-colors"
version = "3.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -666,6 +785,17 @@ dependencies = [
]
[[package]]
+name = "redox_users"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891"
+dependencies = [
+ "getrandom",
+ "libredox",
+ "thiserror",
+]
+
+[[package]]
name = "regex"
version = "1.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -965,6 +1095,60 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
+name = "wasm-bindgen"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
+dependencies = [
+ "cfg-if",
+ "wasm-bindgen-macro",
+]
+
+[[package]]
+name = "wasm-bindgen-backend"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
+dependencies = [
+ "bumpalo",
+ "log",
+ "once_cell",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-backend",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
+
+[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -987,6 +1171,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
+name = "windows-core"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
+dependencies = [
+ "windows-targets 0.52.3",
+]
+
+[[package]]
name = "windows-sys"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/src/tools/miri/Cargo.toml b/src/tools/miri/Cargo.toml
index 9d24d3c6f47..b00dae784d2 100644
--- a/src/tools/miri/Cargo.toml
+++ b/src/tools/miri/Cargo.toml
@@ -24,6 +24,8 @@ smallvec = "1.7"
aes = { version = "0.8.3", features = ["hazmat"] }
measureme = "11"
ctrlc = "3.2.5"
+chrono = { version = "0.4.38", default-features = false, features = ["clock"] }
+directories = "5"
# Copied from `compiler/rustc/Cargo.toml`.
# But only for some targets, it fails for others. Rustc configures this in its CI, but we can't
diff --git a/src/tools/miri/README.md b/src/tools/miri/README.md
index 4254b9bb67d..ef01ca25fb0 100644
--- a/src/tools/miri/README.md
+++ b/src/tools/miri/README.md
@@ -321,6 +321,10 @@ environment variable. We first document the most relevant and most commonly used
* `-Zmiri-env-forward=<var>` forwards the `var` environment variable to the interpreted program. Can
be used multiple times to forward several variables. Execution will still be deterministic if the
value of forwarded variables stays the same. Has no effect if `-Zmiri-disable-isolation` is set.
+* `-Zmiri-env-set=<var>=<value>` sets the `var` environment variable to `value` in the interpreted program.
+ It can be used to pass environment variables without needing to alter the host environment. It can
+ be used multiple times to set several variables. If `-Zmiri-disable-isolation` or `-Zmiri-env-forward`
+ is set, values set with this option will have priority over values from the host environment.
* `-Zmiri-ignore-leaks` disables the memory leak checker, and also allows some
remaining threads to exist when the main thread exits.
* `-Zmiri-isolation-error=<action>` configures Miri's response to operations
@@ -560,7 +564,8 @@ used according to their aliasing restrictions.
## Bugs found by Miri
-Miri has already found a number of bugs in the Rust standard library and beyond, which we collect here.
+Miri has already found a number of bugs in the Rust standard library and beyond, some of which we collect here.
+If Miri helped you find a subtle UB bug in your code, we'd appreciate a PR adding it to the list!
Definite bugs found:
@@ -595,6 +600,7 @@ Definite bugs found:
* [Deallocating with the wrong layout in new specializations for in-place `Iterator::collect`](https://github.com/rust-lang/rust/pull/118460)
* [Incorrect offset computation for highly-aligned types in `portable-atomic-util`](https://github.com/taiki-e/portable-atomic/pull/138)
* [Occasional memory leak in `std::mpsc` channels](https://github.com/rust-lang/rust/issues/121582) (original code in [crossbeam](https://github.com/crossbeam-rs/crossbeam/pull/1084))
+* [Weak-memory-induced memory leak in Windows thread-local storage](https://github.com/rust-lang/rust/pull/124281)
Violations of [Stacked Borrows] found that are likely bugs (but Stacked Borrows is currently just an experiment):
diff --git a/src/tools/miri/clippy.toml b/src/tools/miri/clippy.toml
new file mode 100644
index 00000000000..284e18a45a3
--- /dev/null
+++ b/src/tools/miri/clippy.toml
@@ -0,0 +1 @@
+arithmetic-side-effects-allowed = ["rustc_target::abi::Size"]
diff --git a/src/tools/miri/rust-version b/src/tools/miri/rust-version
index a60acf44a40..a6433a8e286 100644
--- a/src/tools/miri/rust-version
+++ b/src/tools/miri/rust-version
@@ -1 +1 @@
-c8d19a92aa9022eb690899cf6d54fd23cb6877e5
+cb3752d20e0f5d24348062211102a08d46fbecff
diff --git a/src/tools/miri/src/bin/miri.rs b/src/tools/miri/src/bin/miri.rs
index 0070d1f3ebc..44201cb89ae 100644
--- a/src/tools/miri/src/bin/miri.rs
+++ b/src/tools/miri/src/bin/miri.rs
@@ -506,6 +506,11 @@ fn main() {
);
} else if let Some(param) = arg.strip_prefix("-Zmiri-env-forward=") {
miri_config.forwarded_env_vars.push(param.to_owned());
+ } else if let Some(param) = arg.strip_prefix("-Zmiri-env-set=") {
+ let Some((name, value)) = param.split_once('=') else {
+ show_error!("-Zmiri-env-set requires an argument of the form <name>=<value>");
+ };
+ miri_config.set_env_vars.insert(name.to_owned(), value.to_owned());
} else if let Some(param) = arg.strip_prefix("-Zmiri-track-pointer-tag=") {
let ids: Vec<u64> = parse_comma_list(param).unwrap_or_else(|err| {
show_error!("-Zmiri-track-pointer-tag requires a comma separated list of valid `u64` arguments: {err}")
diff --git a/src/tools/miri/src/borrow_tracker/stacked_borrows/stack.rs b/src/tools/miri/src/borrow_tracker/stacked_borrows/stack.rs
index bebd14d2f1e..55ff09c53fe 100644
--- a/src/tools/miri/src/borrow_tracker/stacked_borrows/stack.rs
+++ b/src/tools/miri/src/borrow_tracker/stacked_borrows/stack.rs
@@ -248,7 +248,7 @@ impl<'tcx> Stack {
#[cfg(feature = "stack-cache")]
fn find_granting_cache(&mut self, access: AccessKind, tag: BorTag) -> Option<usize> {
// This looks like a common-sense optimization; we're going to do a linear search of the
- // cache or the borrow stack to scan the shorter of the two. This optimization is miniscule
+ // cache or the borrow stack to scan the shorter of the two. This optimization is minuscule
// and this check actually ensures we do not access an invalid cache.
// When a stack is created and when items are removed from the top of the borrow stack, we
// need some valid value to populate the cache. In both cases, we try to use the bottom
diff --git a/src/tools/miri/src/concurrency/data_race.rs b/src/tools/miri/src/concurrency/data_race.rs
index 2281609a049..f2bec972b18 100644
--- a/src/tools/miri/src/concurrency/data_race.rs
+++ b/src/tools/miri/src/concurrency/data_race.rs
@@ -847,6 +847,7 @@ impl VClockAlloc {
kind: MemoryKind,
current_span: Span,
) -> VClockAlloc {
+ // Determine the thread that did the allocation, and when it did it.
let (alloc_timestamp, alloc_index) = match kind {
// User allocated and stack memory should track allocation.
MemoryKind::Machine(
@@ -858,13 +859,13 @@ impl VClockAlloc {
| MiriMemoryKind::Mmap,
)
| MemoryKind::Stack => {
- let (alloc_index, clocks) = global.current_thread_state(thread_mgr);
+ let (alloc_index, clocks) = global.active_thread_state(thread_mgr);
let mut alloc_timestamp = clocks.clock[alloc_index];
alloc_timestamp.span = current_span;
(alloc_timestamp, alloc_index)
}
// Other global memory should trace races but be allocated at the 0 timestamp
- // (conceptually they are allocated before everything).
+ // (conceptually they are allocated on the main thread before everything).
MemoryKind::Machine(
MiriMemoryKind::Global
| MiriMemoryKind::Machine
@@ -872,7 +873,8 @@ impl VClockAlloc {
| MiriMemoryKind::ExternStatic
| MiriMemoryKind::Tls,
)
- | MemoryKind::CallerLocation => (VTimestamp::ZERO, VectorIdx::MAX_INDEX),
+ | MemoryKind::CallerLocation =>
+ (VTimestamp::ZERO, global.thread_index(ThreadId::MAIN_THREAD)),
};
VClockAlloc {
alloc_ranges: RefCell::new(RangeMap::new(
@@ -930,7 +932,7 @@ impl VClockAlloc {
ptr_dbg: Pointer<AllocId>,
ty: Option<Ty<'_>>,
) -> InterpResult<'tcx> {
- let (current_index, current_clocks) = global.current_thread_state(thread_mgr);
+ let (active_index, active_clocks) = global.active_thread_state(thread_mgr);
let mut other_size = None; // if `Some`, this was a size-mismatch race
let write_clock;
let (other_access, other_thread, other_clock) =
@@ -939,30 +941,30 @@ impl VClockAlloc {
// we are reporting races between two non-atomic reads.
if !access.is_atomic() &&
let Some(atomic) = mem_clocks.atomic() &&
- let Some(idx) = Self::find_gt_index(&atomic.write_vector, &current_clocks.clock)
+ let Some(idx) = Self::find_gt_index(&atomic.write_vector, &active_clocks.clock)
{
(AccessType::AtomicStore, idx, &atomic.write_vector)
} else if !access.is_atomic() &&
let Some(atomic) = mem_clocks.atomic() &&
- let Some(idx) = Self::find_gt_index(&atomic.read_vector, &current_clocks.clock)
+ let Some(idx) = Self::find_gt_index(&atomic.read_vector, &active_clocks.clock)
{
(AccessType::AtomicLoad, idx, &atomic.read_vector)
// Then check races with non-atomic writes/reads.
- } else if mem_clocks.write.1 > current_clocks.clock[mem_clocks.write.0] {
+ } else if mem_clocks.write.1 > active_clocks.clock[mem_clocks.write.0] {
write_clock = mem_clocks.write();
(AccessType::NaWrite(mem_clocks.write_type), mem_clocks.write.0, &write_clock)
- } else if let Some(idx) = Self::find_gt_index(&mem_clocks.read, &current_clocks.clock) {
+ } else if let Some(idx) = Self::find_gt_index(&mem_clocks.read, &active_clocks.clock) {
(AccessType::NaRead(mem_clocks.read[idx].read_type()), idx, &mem_clocks.read)
// Finally, mixed-size races.
} else if access.is_atomic() && let Some(atomic) = mem_clocks.atomic() && atomic.size != access_size {
// This is only a race if we are not synchronized with all atomic accesses, so find
// the one we are not synchronized with.
other_size = Some(atomic.size);
- if let Some(idx) = Self::find_gt_index(&atomic.write_vector, &current_clocks.clock)
+ if let Some(idx) = Self::find_gt_index(&atomic.write_vector, &active_clocks.clock)
{
(AccessType::AtomicStore, idx, &atomic.write_vector)
} else if let Some(idx) =
- Self::find_gt_index(&atomic.read_vector, &current_clocks.clock)
+ Self::find_gt_index(&atomic.read_vector, &active_clocks.clock)
{
(AccessType::AtomicLoad, idx, &atomic.read_vector)
} else {
@@ -975,7 +977,7 @@ impl VClockAlloc {
};
// Load elaborated thread information about the racing thread actions.
- let current_thread_info = global.print_thread_metadata(thread_mgr, current_index);
+ let active_thread_info = global.print_thread_metadata(thread_mgr, active_index);
let other_thread_info = global.print_thread_metadata(thread_mgr, other_thread);
let involves_non_atomic = !access.is_atomic() || !other_access.is_atomic();
@@ -1003,8 +1005,8 @@ impl VClockAlloc {
},
op2: RacingOp {
action: access.description(ty, other_size.map(|_| access_size)),
- thread_info: current_thread_info,
- span: current_clocks.clock.as_slice()[current_index.index()].span_data(),
+ thread_info: active_thread_info,
+ span: active_clocks.clock.as_slice()[active_index.index()].span_data(),
},
}))?
}
@@ -1026,7 +1028,7 @@ impl VClockAlloc {
let current_span = machine.current_span();
let global = machine.data_race.as_ref().unwrap();
if global.race_detecting() {
- let (index, mut thread_clocks) = global.current_thread_state_mut(&machine.threads);
+ let (index, mut thread_clocks) = global.active_thread_state_mut(&machine.threads);
let mut alloc_ranges = self.alloc_ranges.borrow_mut();
for (mem_clocks_range, mem_clocks) in
alloc_ranges.iter_mut(access_range.start, access_range.size)
@@ -1069,7 +1071,7 @@ impl VClockAlloc {
let current_span = machine.current_span();
let global = machine.data_race.as_mut().unwrap();
if global.race_detecting() {
- let (index, mut thread_clocks) = global.current_thread_state_mut(&machine.threads);
+ let (index, mut thread_clocks) = global.active_thread_state_mut(&machine.threads);
for (mem_clocks_range, mem_clocks) in
self.alloc_ranges.get_mut().iter_mut(access_range.start, access_range.size)
{
@@ -1454,7 +1456,7 @@ impl GlobalState {
// Setup the main-thread since it is not explicitly created:
// uses vector index and thread-id 0.
let index = global_state.vector_clocks.get_mut().push(ThreadClockSet::default());
- global_state.vector_info.get_mut().push(ThreadId::new(0));
+ global_state.vector_info.get_mut().push(ThreadId::MAIN_THREAD);
global_state
.thread_info
.get_mut()
@@ -1518,7 +1520,7 @@ impl GlobalState {
thread: ThreadId,
current_span: Span,
) {
- let current_index = self.current_index(thread_mgr);
+ let current_index = self.active_thread_index(thread_mgr);
// Enable multi-threaded execution, there are now at least two threads
// so data-races are now possible.
@@ -1642,7 +1644,7 @@ impl GlobalState {
/// `thread_joined`.
#[inline]
pub fn thread_terminated(&mut self, thread_mgr: &ThreadManager<'_, '_>, current_span: Span) {
- let current_index = self.current_index(thread_mgr);
+ let current_index = self.active_thread_index(thread_mgr);
// Increment the clock to a unique termination timestamp.
let vector_clocks = self.vector_clocks.get_mut();
@@ -1680,9 +1682,9 @@ impl GlobalState {
op: impl FnOnce(VectorIdx, RefMut<'_, ThreadClockSet>) -> InterpResult<'tcx, bool>,
) -> InterpResult<'tcx> {
if self.multi_threaded.get() {
- let (index, clocks) = self.current_thread_state_mut(thread_mgr);
+ let (index, clocks) = self.active_thread_state_mut(thread_mgr);
if op(index, clocks)? {
- let (_, mut clocks) = self.current_thread_state_mut(thread_mgr);
+ let (_, mut clocks) = self.active_thread_state_mut(thread_mgr);
clocks.increment_clock(index, current_span);
}
}
@@ -1725,13 +1727,15 @@ impl GlobalState {
Ref::map(clocks, |c| &c.clock)
}
+ fn thread_index(&self, thread: ThreadId) -> VectorIdx {
+ self.thread_info.borrow()[thread].vector_index.expect("thread has no assigned vector")
+ }
+
/// Load the vector index used by the given thread as well as the set of vector clocks
/// used by the thread.
#[inline]
fn thread_state_mut(&self, thread: ThreadId) -> (VectorIdx, RefMut<'_, ThreadClockSet>) {
- let index = self.thread_info.borrow()[thread]
- .vector_index
- .expect("Loading thread state for thread with no assigned vector");
+ let index = self.thread_index(thread);
let ref_vector = self.vector_clocks.borrow_mut();
let clocks = RefMut::map(ref_vector, |vec| &mut vec[index]);
(index, clocks)
@@ -1741,9 +1745,7 @@ impl GlobalState {
/// used by the thread.
#[inline]
fn thread_state(&self, thread: ThreadId) -> (VectorIdx, Ref<'_, ThreadClockSet>) {
- let index = self.thread_info.borrow()[thread]
- .vector_index
- .expect("Loading thread state for thread with no assigned vector");
+ let index = self.thread_index(thread);
let ref_vector = self.vector_clocks.borrow();
let clocks = Ref::map(ref_vector, |vec| &vec[index]);
(index, clocks)
@@ -1752,7 +1754,7 @@ impl GlobalState {
/// Load the current vector clock in use and the current set of thread clocks
/// in use for the vector.
#[inline]
- pub(super) fn current_thread_state(
+ pub(super) fn active_thread_state(
&self,
thread_mgr: &ThreadManager<'_, '_>,
) -> (VectorIdx, Ref<'_, ThreadClockSet>) {
@@ -1762,7 +1764,7 @@ impl GlobalState {
/// Load the current vector clock in use and the current set of thread clocks
/// in use for the vector mutably for modification.
#[inline]
- pub(super) fn current_thread_state_mut(
+ pub(super) fn active_thread_state_mut(
&self,
thread_mgr: &ThreadManager<'_, '_>,
) -> (VectorIdx, RefMut<'_, ThreadClockSet>) {
@@ -1772,22 +1774,20 @@ impl GlobalState {
/// Return the current thread, should be the same
/// as the data-race active thread.
#[inline]
- fn current_index(&self, thread_mgr: &ThreadManager<'_, '_>) -> VectorIdx {
+ fn active_thread_index(&self, thread_mgr: &ThreadManager<'_, '_>) -> VectorIdx {
let active_thread_id = thread_mgr.get_active_thread_id();
- self.thread_info.borrow()[active_thread_id]
- .vector_index
- .expect("active thread has no assigned vector")
+ self.thread_index(active_thread_id)
}
// SC ATOMIC STORE rule in the paper.
pub(super) fn sc_write(&self, thread_mgr: &ThreadManager<'_, '_>) {
- let (index, clocks) = self.current_thread_state(thread_mgr);
+ let (index, clocks) = self.active_thread_state(thread_mgr);
self.last_sc_write.borrow_mut().set_at_index(&clocks.clock, index);
}
// SC ATOMIC READ rule in the paper.
pub(super) fn sc_read(&self, thread_mgr: &ThreadManager<'_, '_>) {
- let (.., mut clocks) = self.current_thread_state_mut(thread_mgr);
+ let (.., mut clocks) = self.active_thread_state_mut(thread_mgr);
clocks.read_seqcst.join(&self.last_sc_fence.borrow());
}
}
diff --git a/src/tools/miri/src/concurrency/thread.rs b/src/tools/miri/src/concurrency/thread.rs
index 2fabd39a744..0116bd0281a 100644
--- a/src/tools/miri/src/concurrency/thread.rs
+++ b/src/tools/miri/src/concurrency/thread.rs
@@ -57,6 +57,8 @@ impl ThreadId {
pub fn to_u32(self) -> u32 {
self.0
}
+
+ pub const MAIN_THREAD: ThreadId = ThreadId(0);
}
impl Idx for ThreadId {
@@ -401,7 +403,7 @@ impl<'mir, 'tcx> Default for ThreadManager<'mir, 'tcx> {
// Create the main thread and add it to the list of threads.
threads.push(Thread::new(Some("main"), None));
Self {
- active_thread: ThreadId::new(0),
+ active_thread: ThreadId::MAIN_THREAD,
threads,
sync: SynchronizationState::default(),
thread_local_alloc_ids: Default::default(),
@@ -416,10 +418,12 @@ impl<'mir, 'tcx: 'mir> ThreadManager<'mir, 'tcx> {
ecx: &mut MiriInterpCx<'mir, 'tcx>,
on_main_stack_empty: StackEmptyCallback<'mir, 'tcx>,
) {
- ecx.machine.threads.threads[ThreadId::new(0)].on_stack_empty = Some(on_main_stack_empty);
+ ecx.machine.threads.threads[ThreadId::MAIN_THREAD].on_stack_empty =
+ Some(on_main_stack_empty);
if ecx.tcx.sess.target.os.as_ref() != "windows" {
// The main thread can *not* be joined on except on windows.
- ecx.machine.threads.threads[ThreadId::new(0)].join_status = ThreadJoinStatus::Detached;
+ ecx.machine.threads.threads[ThreadId::MAIN_THREAD].join_status =
+ ThreadJoinStatus::Detached;
}
}
diff --git a/src/tools/miri/src/concurrency/vector_clock.rs b/src/tools/miri/src/concurrency/vector_clock.rs
index 2cd3d031b1e..c3496bc1a0c 100644
--- a/src/tools/miri/src/concurrency/vector_clock.rs
+++ b/src/tools/miri/src/concurrency/vector_clock.rs
@@ -13,15 +13,13 @@ use super::data_race::NaReadType;
/// but in some cases one vector index may be shared with
/// multiple thread ids if it's safe to do so.
#[derive(Clone, Copy, Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
-pub struct VectorIdx(u32);
+pub(super) struct VectorIdx(u32);
impl VectorIdx {
#[inline(always)]
- pub fn to_u32(self) -> u32 {
+ fn to_u32(self) -> u32 {
self.0
}
-
- pub const MAX_INDEX: VectorIdx = VectorIdx(u32::MAX);
}
impl Idx for VectorIdx {
@@ -51,7 +49,7 @@ const SMALL_VECTOR: usize = 4;
/// a 32-bit unsigned integer which is the actual timestamp, and a `Span`
/// so that diagnostics can report what code was responsible for an operation.
#[derive(Clone, Copy, Debug)]
-pub struct VTimestamp {
+pub(super) struct VTimestamp {
/// The lowest bit indicates read type, the rest is the time.
/// `1` indicates a retag read, `0` a regular read.
time_and_read_type: u32,
@@ -87,7 +85,7 @@ impl VTimestamp {
}
#[inline]
- pub fn read_type(&self) -> NaReadType {
+ pub(super) fn read_type(&self) -> NaReadType {
if self.time_and_read_type & 1 == 0 { NaReadType::Read } else { NaReadType::Retag }
}
@@ -97,7 +95,7 @@ impl VTimestamp {
}
#[inline]
- pub fn span_data(&self) -> SpanData {
+ pub(super) fn span_data(&self) -> SpanData {
self.span.data()
}
}
diff --git a/src/tools/miri/src/concurrency/weak_memory.rs b/src/tools/miri/src/concurrency/weak_memory.rs
index 9ebb64afd35..f544393cfe6 100644
--- a/src/tools/miri/src/concurrency/weak_memory.rs
+++ b/src/tools/miri/src/concurrency/weak_memory.rs
@@ -270,7 +270,7 @@ impl<'mir, 'tcx: 'mir> StoreBuffer {
) {
let store_elem = self.buffer.back();
if let Some(store_elem) = store_elem {
- let (index, clocks) = global.current_thread_state(thread_mgr);
+ let (index, clocks) = global.active_thread_state(thread_mgr);
store_elem.load_impl(index, &clocks, is_seqcst);
}
}
@@ -289,7 +289,7 @@ impl<'mir, 'tcx: 'mir> StoreBuffer {
let (store_elem, recency) = {
// The `clocks` we got here must be dropped before calling validate_atomic_load
// as the race detector will update it
- let (.., clocks) = global.current_thread_state(thread_mgr);
+ let (.., clocks) = global.active_thread_state(thread_mgr);
// Load from a valid entry in the store buffer
self.fetch_store(is_seqcst, &clocks, &mut *rng)
};
@@ -300,7 +300,7 @@ impl<'mir, 'tcx: 'mir> StoreBuffer {
// requires access to ThreadClockSet.clock, which is updated by the race detector
validate()?;
- let (index, clocks) = global.current_thread_state(thread_mgr);
+ let (index, clocks) = global.active_thread_state(thread_mgr);
let loaded = store_elem.load_impl(index, &clocks, is_seqcst);
Ok((loaded, recency))
}
@@ -312,7 +312,7 @@ impl<'mir, 'tcx: 'mir> StoreBuffer {
thread_mgr: &ThreadManager<'_, '_>,
is_seqcst: bool,
) -> InterpResult<'tcx> {
- let (index, clocks) = global.current_thread_state(thread_mgr);
+ let (index, clocks) = global.active_thread_state(thread_mgr);
self.store_impl(val, index, &clocks.clock, is_seqcst);
Ok(())
diff --git a/src/tools/miri/src/eval.rs b/src/tools/miri/src/eval.rs
index 45dadb50f4b..2242768a568 100644
--- a/src/tools/miri/src/eval.rs
+++ b/src/tools/miri/src/eval.rs
@@ -9,7 +9,7 @@ use std::thread;
use crate::concurrency::thread::TlsAllocAction;
use crate::diagnostics::report_leaks;
-use rustc_data_structures::fx::FxHashSet;
+use rustc_data_structures::fx::{FxHashMap, FxHashSet};
use rustc_hir::def::Namespace;
use rustc_hir::def_id::DefId;
use rustc_middle::ty::{
@@ -100,6 +100,8 @@ pub struct MiriConfig {
pub ignore_leaks: bool,
/// Environment variables that should always be forwarded from the host.
pub forwarded_env_vars: Vec<String>,
+ /// Additional environment variables that should be set in the interpreted program.
+ pub set_env_vars: FxHashMap<String, String>,
/// Command-line arguments passed to the interpreted program.
pub args: Vec<String>,
/// The seed to use when non-determinism or randomness are required (e.g. ptr-to-int cast, `getrandom()`).
@@ -167,6 +169,7 @@ impl Default for MiriConfig {
isolated_op: IsolatedOp::Reject(RejectOpWith::Abort),
ignore_leaks: false,
forwarded_env_vars: vec![],
+ set_env_vars: FxHashMap::default(),
args: vec![],
seed: None,
tracked_pointer_tags: FxHashSet::default(),
@@ -383,10 +386,9 @@ pub fn create_ecx<'mir, 'tcx: 'mir>(
let main_ptr = ecx.fn_ptr(FnVal::Instance(entry_instance));
- // Inlining of `DEFAULT` from
- // https://github.com/rust-lang/rust/blob/master/compiler/rustc_session/src/config/sigpipe.rs.
// Always using DEFAULT is okay since we don't support signals in Miri anyway.
- let sigpipe = 2;
+ // (This means we are effectively ignoring `#[unix_sigpipe]`.)
+ let sigpipe = rustc_session::config::sigpipe::DEFAULT;
ecx.call_function(
start_instance,
diff --git a/src/tools/miri/src/shims/env.rs b/src/tools/miri/src/shims/env.rs
index 1779189c9ce..298fefdb0f3 100644
--- a/src/tools/miri/src/shims/env.rs
+++ b/src/tools/miri/src/shims/env.rs
@@ -44,21 +44,15 @@ impl<'tcx> EnvVars<'tcx> {
let forward = ecx.machine.communicate()
|| config.forwarded_env_vars.iter().any(|v| **v == *name);
if forward {
- let var_ptr = match ecx.tcx.sess.target.os.as_ref() {
- _ if ecx.target_os_is_unix() =>
- alloc_env_var_as_c_str(name.as_ref(), value.as_ref(), ecx)?,
- "windows" => alloc_env_var_as_wide_str(name.as_ref(), value.as_ref(), ecx)?,
- unsupported =>
- throw_unsup_format!(
- "environment support for target OS `{}` not yet available",
- unsupported
- ),
- };
- ecx.machine.env_vars.map.insert(name.clone(), var_ptr);
+ add_env_var(ecx, name, value)?;
}
}
}
+ for (name, value) in &config.set_env_vars {
+ add_env_var(ecx, OsStr::new(name), OsStr::new(value))?;
+ }
+
// Initialize the `environ` pointer when needed.
if ecx.target_os_is_unix() {
// This is memory backing an extern static, hence `ExternStatic`, not `Env`.
@@ -89,6 +83,24 @@ impl<'tcx> EnvVars<'tcx> {
}
}
+fn add_env_var<'mir, 'tcx>(
+ ecx: &mut InterpCx<'mir, 'tcx, MiriMachine<'mir, 'tcx>>,
+ name: &OsStr,
+ value: &OsStr,
+) -> InterpResult<'tcx, ()> {
+ let var_ptr = match ecx.tcx.sess.target.os.as_ref() {
+ _ if ecx.target_os_is_unix() => alloc_env_var_as_c_str(name, value, ecx)?,
+ "windows" => alloc_env_var_as_wide_str(name, value, ecx)?,
+ unsupported =>
+ throw_unsup_format!(
+ "environment support for target OS `{}` not yet available",
+ unsupported
+ ),
+ };
+ ecx.machine.env_vars.map.insert(name.to_os_string(), var_ptr);
+ Ok(())
+}
+
fn alloc_env_var_as_c_str<'mir, 'tcx>(
name: &OsStr,
value: &OsStr,
@@ -148,10 +160,12 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
this.assert_target_os("windows", "GetEnvironmentVariableW");
let name_ptr = this.read_pointer(name_op)?;
+ let buf_ptr = this.read_pointer(buf_op)?;
+ let buf_size = this.read_scalar(size_op)?.to_u32()?; // in characters
+
let name = this.read_os_str_from_wide_str(name_ptr)?;
Ok(match this.machine.env_vars.map.get(&name) {
Some(&var_ptr) => {
- this.set_last_error(Scalar::from_u32(0))?; // make sure this is unambiguously not an error
// The offset is used to strip the "{name}=" part of the string.
#[rustfmt::skip]
let name_offset_bytes = u64::try_from(name.len()).unwrap()
@@ -160,14 +174,13 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
let var_ptr = var_ptr.offset(Size::from_bytes(name_offset_bytes), this)?;
let var = this.read_os_str_from_wide_str(var_ptr)?;
- let buf_ptr = this.read_pointer(buf_op)?;
- // `buf_size` represents the size in characters.
- let buf_size = u64::from(this.read_scalar(size_op)?.to_u32()?);
- Scalar::from_u32(windows_check_buffer_size(
- this.write_os_str_to_wide_str(
- &var, buf_ptr, buf_size, /*truncate*/ false,
- )?,
- ))
+ Scalar::from_u32(windows_check_buffer_size(this.write_os_str_to_wide_str(
+ &var,
+ buf_ptr,
+ buf_size.into(),
+ )?))
+ // This can in fact return 0. It is up to the caller to set last_error to 0
+ // beforehand and check it afterwards to exclude that case.
}
None => {
let envvar_not_found = this.eval_windows("c", "ERROR_ENVVAR_NOT_FOUND");
@@ -363,9 +376,10 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
// If we cannot get the current directory, we return 0
match env::current_dir() {
Ok(cwd) => {
- this.set_last_error(Scalar::from_u32(0))?; // make sure this is unambiguously not an error
+ // This can in fact return 0. It is up to the caller to set last_error to 0
+ // beforehand and check it afterwards to exclude that case.
return Ok(Scalar::from_u32(windows_check_buffer_size(
- this.write_path_to_wide_str(&cwd, buf, size, /*truncate*/ false)?,
+ this.write_path_to_wide_str(&cwd, buf, size)?,
)));
}
Err(e) => this.set_last_error_from_io_error(e.kind())?,
@@ -482,9 +496,60 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
fn GetCurrentProcessId(&mut self) -> InterpResult<'tcx, u32> {
let this = self.eval_context_mut();
this.assert_target_os("windows", "GetCurrentProcessId");
-
this.check_no_isolation("`GetCurrentProcessId`")?;
Ok(std::process::id())
}
+
+ #[allow(non_snake_case)]
+ fn GetUserProfileDirectoryW(
+ &mut self,
+ token: &OpTy<'tcx, Provenance>, // HANDLE
+ buf: &OpTy<'tcx, Provenance>, // LPWSTR
+ size: &OpTy<'tcx, Provenance>, // LPDWORD
+ ) -> InterpResult<'tcx, Scalar<Provenance>> // returns BOOL
+ {
+ let this = self.eval_context_mut();
+ this.assert_target_os("windows", "GetUserProfileDirectoryW");
+ this.check_no_isolation("`GetUserProfileDirectoryW`")?;
+
+ let token = this.read_target_isize(token)?;
+ let buf = this.read_pointer(buf)?;
+ let size = this.deref_pointer(size)?;
+
+ if token != -4 {
+ throw_unsup_format!(
+ "GetUserProfileDirectoryW: only CURRENT_PROCESS_TOKEN is supported"
+ );
+ }
+
+ // See <https://learn.microsoft.com/en-us/windows/win32/api/userenv/nf-userenv-getuserprofiledirectoryw> for docs.
+ Ok(match directories::UserDirs::new() {
+ Some(dirs) => {
+ let home = dirs.home_dir();
+ let size_avail = if this.ptr_is_null(size.ptr())? {
+ 0 // if the buf pointer is null, we can't write to it; `size` will be updated to the required length
+ } else {
+ this.read_scalar(&size)?.to_u32()?
+ };
+ // Of course we cannot use `windows_check_buffer_size` here since this uses
+ // a different method for dealing with a too-small buffer than the other functions...
+ let (success, len) = this.write_path_to_wide_str(home, buf, size_avail.into())?;
+ // The Windows docs just say that this is written on failure. But std
+ // seems to rely on it always being written.
+ this.write_scalar(Scalar::from_u32(len.try_into().unwrap()), &size)?;
+ if success {
+ Scalar::from_i32(1) // return TRUE
+ } else {
+ this.set_last_error(this.eval_windows("c", "ERROR_INSUFFICIENT_BUFFER"))?;
+ Scalar::from_i32(0) // return FALSE
+ }
+ }
+ None => {
+ // We have to pick some error code.
+ this.set_last_error(this.eval_windows("c", "ERROR_BAD_USER_PROFILE"))?;
+ Scalar::from_i32(0) // return FALSE
+ }
+ })
+ }
}
diff --git a/src/tools/miri/src/shims/os_str.rs b/src/tools/miri/src/shims/os_str.rs
index 3e8c35d48ae..5fcea9ced69 100644
--- a/src/tools/miri/src/shims/os_str.rs
+++ b/src/tools/miri/src/shims/os_str.rs
@@ -72,11 +72,9 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
u16vec_to_osstring(u16_vec)
}
- /// Helper function to write an OsStr as a null-terminated sequence of bytes, which is what
- /// the Unix APIs usually handle. This function returns `Ok((false, length))` without trying
- /// to write if `size` is not large enough to fit the contents of `os_string` plus a null
- /// terminator. It returns `Ok((true, length))` if the writing process was successful. The
- /// string length returned does include the null terminator.
+ /// Helper function to write an OsStr as a null-terminated sequence of bytes, which is what the
+ /// Unix APIs usually handle. Returns `(success, full_len)`, where length includes the null
+ /// terminator. On failure, nothing is written.
fn write_os_str_to_c_str(
&mut self,
os_str: &OsStr,
@@ -87,19 +85,9 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
self.eval_context_mut().write_c_str(bytes, ptr, size)
}
- /// Helper function to write an OsStr as a 0x0000-terminated u16-sequence, which is what the
- /// Windows APIs usually handle.
- ///
- /// If `truncate == false` (the usual mode of operation), this function returns `Ok((false,
- /// length))` without trying to write if `size` is not large enough to fit the contents of
- /// `os_string` plus a null terminator. It returns `Ok((true, length))` if the writing process
- /// was successful. The string length returned does include the null terminator. Length is
- /// measured in units of `u16.`
- ///
- /// If `truncate == true`, then in case `size` is not large enough it *will* write the first
- /// `size.saturating_sub(1)` many items, followed by a null terminator (if `size > 0`).
- /// The return value is still `(false, length)` in that case.
- fn write_os_str_to_wide_str(
+ /// Internal helper to share code between `write_os_str_to_wide_str` and
+ /// `write_os_str_to_wide_str_truncated`.
+ fn write_os_str_to_wide_str_helper(
&mut self,
os_str: &OsStr,
ptr: Pointer<Option<Provenance>>,
@@ -133,6 +121,29 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
Ok((written, size_needed))
}
+ /// Helper function to write an OsStr as a 0x0000-terminated u16-sequence, which is what the
+ /// Windows APIs usually handle. Returns `(success, full_len)`, where length is measured
+ /// in units of `u16` and includes the null terminator. On failure, nothing is written.
+ fn write_os_str_to_wide_str(
+ &mut self,
+ os_str: &OsStr,
+ ptr: Pointer<Option<Provenance>>,
+ size: u64,
+ ) -> InterpResult<'tcx, (bool, u64)> {
+ self.write_os_str_to_wide_str_helper(os_str, ptr, size, /*truncate*/ false)
+ }
+
+ /// Like `write_os_str_to_wide_str`, but on failure as much as possible is written into
+ /// the buffer (always with a null terminator).
+ fn write_os_str_to_wide_str_truncated(
+ &mut self,
+ os_str: &OsStr,
+ ptr: Pointer<Option<Provenance>>,
+ size: u64,
+ ) -> InterpResult<'tcx, (bool, u64)> {
+ self.write_os_str_to_wide_str_helper(os_str, ptr, size, /*truncate*/ true)
+ }
+
/// Allocate enough memory to store the given `OsStr` as a null-terminated sequence of bytes.
fn alloc_os_str_as_c_str(
&mut self,
@@ -160,9 +171,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
let arg_type = Ty::new_array(this.tcx.tcx, this.tcx.types.u16, size);
let arg_place = this.allocate(this.layout_of(arg_type).unwrap(), memkind)?;
- let (written, _) = self
- .write_os_str_to_wide_str(os_str, arg_place.ptr(), size, /*truncate*/ false)
- .unwrap();
+ let (written, _) = self.write_os_str_to_wide_str(os_str, arg_place.ptr(), size).unwrap();
assert!(written);
Ok(arg_place.ptr())
}
@@ -217,12 +226,25 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
path: &Path,
ptr: Pointer<Option<Provenance>>,
size: u64,
- truncate: bool,
) -> InterpResult<'tcx, (bool, u64)> {
let this = self.eval_context_mut();
let os_str =
this.convert_path(Cow::Borrowed(path.as_os_str()), PathConversion::HostToTarget);
- this.write_os_str_to_wide_str(&os_str, ptr, size, truncate)
+ this.write_os_str_to_wide_str(&os_str, ptr, size)
+ }
+
+ /// Write a Path to the machine memory (as a null-terminated sequence of `u16`s),
+ /// adjusting path separators if needed.
+ fn write_path_to_wide_str_truncated(
+ &mut self,
+ path: &Path,
+ ptr: Pointer<Option<Provenance>>,
+ size: u64,
+ ) -> InterpResult<'tcx, (bool, u64)> {
+ let this = self.eval_context_mut();
+ let os_str =
+ this.convert_path(Cow::Borrowed(path.as_os_str()), PathConversion::HostToTarget);
+ this.write_os_str_to_wide_str_truncated(&os_str, ptr, size)
}
/// Allocate enough memory to store a Path as a null-terminated sequence of bytes,
diff --git a/src/tools/miri/src/shims/time.rs b/src/tools/miri/src/shims/time.rs
index 1126c900226..dfdf58470d6 100644
--- a/src/tools/miri/src/shims/time.rs
+++ b/src/tools/miri/src/shims/time.rs
@@ -1,5 +1,9 @@
+use std::ffi::OsString;
+use std::fmt::Write;
use std::time::{Duration, SystemTime};
+use chrono::{DateTime, Datelike, Local, Timelike, Utc};
+
use crate::concurrency::thread::MachineCallback;
use crate::*;
@@ -107,6 +111,80 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
Ok(0)
}
+ // The localtime() function shall convert the time in seconds since the Epoch pointed to by
+ // timer into a broken-down time, expressed as a local time.
+ // https://linux.die.net/man/3/localtime_r
+ fn localtime_r(
+ &mut self,
+ timep: &OpTy<'tcx, Provenance>,
+ result_op: &OpTy<'tcx, Provenance>,
+ ) -> InterpResult<'tcx, Pointer<Option<Provenance>>> {
+ let this = self.eval_context_mut();
+
+ this.assert_target_os_is_unix("localtime_r");
+ this.check_no_isolation("`localtime_r`")?;
+
+ let timep = this.deref_pointer(timep)?;
+ let result = this.deref_pointer_as(result_op, this.libc_ty_layout("tm"))?;
+
+ // The input "represents the number of seconds elapsed since the Epoch,
+ // 1970-01-01 00:00:00 +0000 (UTC)".
+ let sec_since_epoch: i64 = this
+ .read_scalar(&timep)?
+ .to_int(this.libc_ty_layout("time_t").size)?
+ .try_into()
+ .unwrap();
+ let dt_utc: DateTime<Utc> =
+ DateTime::from_timestamp(sec_since_epoch, 0).expect("Invalid timestamp");
+ // Convert that to local time, then return the broken-down time value.
+ let dt: DateTime<Local> = DateTime::from(dt_utc);
+
+ // This value is always set to -1, because there is no way to know if dst is in effect with
+ // chrono crate yet.
+ // This may not be consistent with libc::localtime_r's result.
+ let tm_isdst = -1;
+
+ // tm_zone represents the timezone value in the form of: +0730, +08, -0730 or -08.
+ // This may not be consistent with libc::localtime_r's result.
+ let offset_in_second = Local::now().offset().local_minus_utc();
+ let tm_gmtoff = offset_in_second;
+ let mut tm_zone = String::new();
+ if offset_in_second < 0 {
+ tm_zone.push('-');
+ } else {
+ tm_zone.push('+');
+ }
+ let offset_hour = offset_in_second.abs() / 3600;
+ write!(tm_zone, "{:02}", offset_hour).unwrap();
+ let offset_min = (offset_in_second.abs() % 3600) / 60;
+ if offset_min != 0 {
+ write!(tm_zone, "{:02}", offset_min).unwrap();
+ }
+
+ // FIXME: String de-duplication is needed so that we only allocate this string only once
+ // even when there are multiple calls to this function.
+ let tm_zone_ptr =
+ this.alloc_os_str_as_c_str(&OsString::from(tm_zone), MiriMemoryKind::Machine.into())?;
+
+ this.write_pointer(tm_zone_ptr, &this.project_field_named(&result, "tm_zone")?)?;
+ this.write_int_fields_named(
+ &[
+ ("tm_sec", dt.second().into()),
+ ("tm_min", dt.minute().into()),
+ ("tm_hour", dt.hour().into()),
+ ("tm_mday", dt.day().into()),
+ ("tm_mon", dt.month0().into()),
+ ("tm_year", dt.year().checked_sub(1900).unwrap().into()),
+ ("tm_wday", dt.weekday().num_days_from_sunday().into()),
+ ("tm_yday", dt.ordinal0().into()),
+ ("tm_isdst", tm_isdst),
+ ("tm_gmtoff", tm_gmtoff.into()),
+ ],
+ &result,
+ )?;
+
+ Ok(result.ptr())
+ }
#[allow(non_snake_case, clippy::arithmetic_side_effects)]
fn GetSystemTimeAsFileTime(
&mut self,
diff --git a/src/tools/miri/src/shims/unix/foreign_items.rs b/src/tools/miri/src/shims/unix/foreign_items.rs
index c72d3bb3df4..bd299aaa125 100644
--- a/src/tools/miri/src/shims/unix/foreign_items.rs
+++ b/src/tools/miri/src/shims/unix/foreign_items.rs
@@ -234,6 +234,11 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
let result = this.gettimeofday(tv, tz)?;
this.write_scalar(Scalar::from_i32(result), dest)?;
}
+ "localtime_r" => {
+ let [timep, result_op] = this.check_shim(abi, Abi::C {unwind: false}, link_name, args)?;
+ let result = this.localtime_r(timep, result_op)?;
+ this.write_pointer(result, dest)?;
+ }
"clock_gettime" => {
let [clk_id, tp] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
diff --git a/src/tools/miri/src/shims/unix/linux/mem.rs b/src/tools/miri/src/shims/unix/linux/mem.rs
index ec2922d0275..3948216f729 100644
--- a/src/tools/miri/src/shims/unix/linux/mem.rs
+++ b/src/tools/miri/src/shims/unix/linux/mem.rs
@@ -23,7 +23,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
// old_address must be a multiple of the page size
#[allow(clippy::arithmetic_side_effects)] // PAGE_SIZE is nonzero
if old_address.addr().bytes() % this.machine.page_size != 0 || new_size == 0 {
- this.set_last_error(Scalar::from_i32(this.eval_libc_i32("EINVAL")))?;
+ this.set_last_error(this.eval_libc("EINVAL"))?;
return Ok(this.eval_libc("MAP_FAILED"));
}
@@ -37,7 +37,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
if flags & this.eval_libc_i32("MREMAP_MAYMOVE") == 0 {
// We only support MREMAP_MAYMOVE, so not passing the flag is just a failure
- this.set_last_error(Scalar::from_i32(this.eval_libc_i32("EINVAL")))?;
+ this.set_last_error(this.eval_libc("EINVAL"))?;
return Ok(this.eval_libc("MAP_FAILED"));
}
diff --git a/src/tools/miri/src/shims/unix/mem.rs b/src/tools/miri/src/shims/unix/mem.rs
index d3470893dbb..f52dc23656d 100644
--- a/src/tools/miri/src/shims/unix/mem.rs
+++ b/src/tools/miri/src/shims/unix/mem.rs
@@ -53,11 +53,11 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
// First, we do some basic argument validation as required by mmap
if (flags & (map_private | map_shared)).count_ones() != 1 {
- this.set_last_error(Scalar::from_i32(this.eval_libc_i32("EINVAL")))?;
+ this.set_last_error(this.eval_libc("EINVAL"))?;
return Ok(this.eval_libc("MAP_FAILED"));
}
if length == 0 {
- this.set_last_error(Scalar::from_i32(this.eval_libc_i32("EINVAL")))?;
+ this.set_last_error(this.eval_libc("EINVAL"))?;
return Ok(this.eval_libc("MAP_FAILED"));
}
@@ -77,7 +77,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
//
// Miri doesn't support MAP_FIXED or any any protections other than PROT_READ|PROT_WRITE.
if flags & map_fixed != 0 || prot != prot_read | prot_write {
- this.set_last_error(Scalar::from_i32(this.eval_libc_i32("ENOTSUP")))?;
+ this.set_last_error(this.eval_libc("ENOTSUP"))?;
return Ok(this.eval_libc("MAP_FAILED"));
}
@@ -96,11 +96,11 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
let align = this.machine.page_align();
let Some(map_length) = length.checked_next_multiple_of(this.machine.page_size) else {
- this.set_last_error(Scalar::from_i32(this.eval_libc_i32("EINVAL")))?;
+ this.set_last_error(this.eval_libc("EINVAL"))?;
return Ok(this.eval_libc("MAP_FAILED"));
};
if map_length > this.target_usize_max() {
- this.set_last_error(Scalar::from_i32(this.eval_libc_i32("EINVAL")))?;
+ this.set_last_error(this.eval_libc("EINVAL"))?;
return Ok(this.eval_libc("MAP_FAILED"));
}
@@ -131,16 +131,16 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
// as a dealloc.
#[allow(clippy::arithmetic_side_effects)] // PAGE_SIZE is nonzero
if addr.addr().bytes() % this.machine.page_size != 0 {
- this.set_last_error(Scalar::from_i32(this.eval_libc_i32("EINVAL")))?;
+ this.set_last_error(this.eval_libc("EINVAL"))?;
return Ok(Scalar::from_i32(-1));
}
let Some(length) = length.checked_next_multiple_of(this.machine.page_size) else {
- this.set_last_error(Scalar::from_i32(this.eval_libc_i32("EINVAL")))?;
+ this.set_last_error(this.eval_libc("EINVAL"))?;
return Ok(Scalar::from_i32(-1));
};
if length > this.target_usize_max() {
- this.set_last_error(Scalar::from_i32(this.eval_libc_i32("EINVAL")))?;
+ this.set_last_error(this.eval_libc("EINVAL"))?;
return Ok(this.eval_libc("MAP_FAILED"));
}
diff --git a/src/tools/miri/src/shims/windows/foreign_items.rs b/src/tools/miri/src/shims/windows/foreign_items.rs
index ec4c6101487..24f7cd18e7a 100644
--- a/src/tools/miri/src/shims/windows/foreign_items.rs
+++ b/src/tools/miri/src/shims/windows/foreign_items.rs
@@ -135,6 +135,12 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
let result = this.SetCurrentDirectoryW(path)?;
this.write_scalar(result, dest)?;
}
+ "GetUserProfileDirectoryW" => {
+ let [token, buf, size] =
+ this.check_shim(abi, Abi::System { unwind: false }, link_name, args)?;
+ let result = this.GetUserProfileDirectoryW(token, buf, size)?;
+ this.write_scalar(result, dest)?;
+ }
// File related shims
"NtWriteFile" => {
@@ -225,15 +231,11 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
Scalar::from_u32(0) // return zero upon failure
}
Ok(abs_filename) => {
- this.set_last_error(Scalar::from_u32(0))?; // make sure this is unambiguously not an error
Scalar::from_u32(helpers::windows_check_buffer_size(
- this.write_path_to_wide_str(
- &abs_filename,
- buffer,
- size.into(),
- /*truncate*/ false,
- )?,
+ this.write_path_to_wide_str(&abs_filename, buffer, size.into())?,
))
+ // This can in fact return 0. It is up to the caller to set last_error to 0
+ // beforehand and check it afterwards to exclude that case.
}
};
this.write_scalar(result, dest)?;
@@ -601,15 +603,9 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
// Using the host current_exe is a bit off, but consistent with Linux
// (where stdlib reads /proc/self/exe).
- // Unfortunately this Windows function has a crazy behavior so we can't just use
- // `write_path_to_wide_str`...
let path = std::env::current_exe().unwrap();
- let (all_written, size_needed) = this.write_path_to_wide_str(
- &path,
- filename,
- size.into(),
- /*truncate*/ true,
- )?;
+ let (all_written, size_needed) =
+ this.write_path_to_wide_str_truncated(&path, filename, size.into())?;
if all_written {
// If the function succeeds, the return value is the length of the string that
@@ -649,12 +645,8 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
Some(err) => format!("{err}"),
None => format!("<unknown error in FormatMessageW: {message_id}>"),
};
- let (complete, length) = this.write_os_str_to_wide_str(
- OsStr::new(&formatted),
- buffer,
- size.into(),
- /*trunacte*/ false,
- )?;
+ let (complete, length) =
+ this.write_os_str_to_wide_str(OsStr::new(&formatted), buffer, size.into())?;
if !complete {
// The API docs don't say what happens when the buffer is not big enough...
// Let's just bail.
diff --git a/src/tools/miri/src/shims/x86/avx.rs b/src/tools/miri/src/shims/x86/avx.rs
index 23c78647b9c..41c20d768f7 100644
--- a/src/tools/miri/src/shims/x86/avx.rs
+++ b/src/tools/miri/src/shims/x86/avx.rs
@@ -7,7 +7,8 @@ use rustc_target::spec::abi::Abi;
use super::{
bin_op_simd_float_all, conditional_dot_product, convert_float_to_int, horizontal_bin_op,
- round_all, test_bits_masked, test_high_bits_masked, unary_op_ps, FloatBinOp, FloatUnaryOp,
+ mask_load, mask_store, round_all, test_bits_masked, test_high_bits_masked, unary_op_ps,
+ FloatBinOp, FloatUnaryOp,
};
use crate::*;
use shims::foreign_items::EmulateForeignItemResult;
@@ -347,71 +348,3 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
Ok(EmulateForeignItemResult::NeedsJumping)
}
}
-
-/// Conditionally loads from `ptr` according the high bit of each
-/// element of `mask`. `ptr` does not need to be aligned.
-fn mask_load<'tcx>(
- this: &mut crate::MiriInterpCx<'_, 'tcx>,
- ptr: &OpTy<'tcx, Provenance>,
- mask: &OpTy<'tcx, Provenance>,
- dest: &MPlaceTy<'tcx, Provenance>,
-) -> InterpResult<'tcx, ()> {
- let (mask, mask_len) = this.operand_to_simd(mask)?;
- let (dest, dest_len) = this.mplace_to_simd(dest)?;
-
- assert_eq!(dest_len, mask_len);
-
- let mask_item_size = mask.layout.field(this, 0).size;
- let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap();
-
- let ptr = this.read_pointer(ptr)?;
- for i in 0..dest_len {
- let mask = this.project_index(&mask, i)?;
- let dest = this.project_index(&dest, i)?;
-
- if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 {
- // Size * u64 is implemented as always checked
- #[allow(clippy::arithmetic_side_effects)]
- let ptr = ptr.wrapping_offset(dest.layout.size * i, &this.tcx);
- // Unaligned copy, which is what we want.
- this.mem_copy(ptr, dest.ptr(), dest.layout.size, /*nonoverlapping*/ true)?;
- } else {
- this.write_scalar(Scalar::from_int(0, dest.layout.size), &dest)?;
- }
- }
-
- Ok(())
-}
-
-/// Conditionally stores into `ptr` according the high bit of each
-/// element of `mask`. `ptr` does not need to be aligned.
-fn mask_store<'tcx>(
- this: &mut crate::MiriInterpCx<'_, 'tcx>,
- ptr: &OpTy<'tcx, Provenance>,
- mask: &OpTy<'tcx, Provenance>,
- value: &OpTy<'tcx, Provenance>,
-) -> InterpResult<'tcx, ()> {
- let (mask, mask_len) = this.operand_to_simd(mask)?;
- let (value, value_len) = this.operand_to_simd(value)?;
-
- assert_eq!(value_len, mask_len);
-
- let mask_item_size = mask.layout.field(this, 0).size;
- let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap();
-
- let ptr = this.read_pointer(ptr)?;
- for i in 0..value_len {
- let mask = this.project_index(&mask, i)?;
- let value = this.project_index(&value, i)?;
-
- if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 {
- // Size * u64 is implemented as always checked
- #[allow(clippy::arithmetic_side_effects)]
- let ptr = ptr.wrapping_offset(value.layout.size * i, &this.tcx);
- // Unaligned copy, which is what we want.
- this.mem_copy(value.ptr(), ptr, value.layout.size, /*nonoverlapping*/ true)?;
- }
- }
-
- Ok(())
-}
diff --git a/src/tools/miri/src/shims/x86/avx2.rs b/src/tools/miri/src/shims/x86/avx2.rs
new file mode 100644
index 00000000000..bbf53f9f1e5
--- /dev/null
+++ b/src/tools/miri/src/shims/x86/avx2.rs
@@ -0,0 +1,444 @@
+use crate::rustc_middle::ty::layout::LayoutOf as _;
+use rustc_middle::mir;
+use rustc_middle::ty::Ty;
+use rustc_span::Symbol;
+use rustc_target::spec::abi::Abi;
+
+use super::{
+ horizontal_bin_op, int_abs, mask_load, mask_store, mpsadbw, packssdw, packsswb, packusdw,
+ packuswb, pmulhrsw, psign, shift_simd_by_scalar, shift_simd_by_simd, ShiftOp,
+};
+use crate::*;
+use shims::foreign_items::EmulateForeignItemResult;
+
+impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for crate::MiriInterpCx<'mir, 'tcx> {}
+pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
+ crate::MiriInterpCxExt<'mir, 'tcx>
+{
+ fn emulate_x86_avx2_intrinsic(
+ &mut self,
+ link_name: Symbol,
+ abi: Abi,
+ args: &[OpTy<'tcx, Provenance>],
+ dest: &MPlaceTy<'tcx, Provenance>,
+ ) -> InterpResult<'tcx, EmulateForeignItemResult> {
+ let this = self.eval_context_mut();
+ this.expect_target_feature_for_intrinsic(link_name, "avx2")?;
+ // Prefix should have already been checked.
+ let unprefixed_name = link_name.as_str().strip_prefix("llvm.x86.avx2.").unwrap();
+
+ match unprefixed_name {
+ // Used to implement the _mm256_abs_epi{8,16,32} functions.
+ // Calculates the absolute value of packed 8/16/32-bit integers.
+ "pabs.b" | "pabs.w" | "pabs.d" => {
+ let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ int_abs(this, op, dest)?;
+ }
+ // Used to implement the _mm256_h{add,adds,sub}_epi{16,32} functions.
+ // Horizontally add / add with saturation / subtract adjacent 16/32-bit
+ // integer values in `left` and `right`.
+ "phadd.w" | "phadd.sw" | "phadd.d" | "phsub.w" | "phsub.sw" | "phsub.d" => {
+ let [left, right] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ let (which, saturating) = match unprefixed_name {
+ "phadd.w" | "phadd.d" => (mir::BinOp::Add, false),
+ "phadd.sw" => (mir::BinOp::Add, true),
+ "phsub.w" | "phsub.d" => (mir::BinOp::Sub, false),
+ "phsub.sw" => (mir::BinOp::Sub, true),
+ _ => unreachable!(),
+ };
+
+ horizontal_bin_op(this, which, saturating, left, right, dest)?;
+ }
+ // Used to implement `_mm{,_mask}_{i32,i64}gather_{epi32,epi64,pd,ps}` functions
+ // Gathers elements from `slice` using `offsets * scale` as indices.
+ // When the highest bit of the corresponding element of `mask` is 0,
+ // the value is copied from `src` instead.
+ "gather.d.d" | "gather.d.d.256" | "gather.d.q" | "gather.d.q.256" | "gather.q.d"
+ | "gather.q.d.256" | "gather.q.q" | "gather.q.q.256" | "gather.d.pd"
+ | "gather.d.pd.256" | "gather.q.pd" | "gather.q.pd.256" | "gather.d.ps"
+ | "gather.d.ps.256" | "gather.q.ps" | "gather.q.ps.256" => {
+ let [src, slice, offsets, mask, scale] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ assert_eq!(dest.layout, src.layout);
+
+ let (src, _) = this.operand_to_simd(src)?;
+ let (offsets, offsets_len) = this.operand_to_simd(offsets)?;
+ let (mask, mask_len) = this.operand_to_simd(mask)?;
+ let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+ // There are cases like dest: i32x4, offsets: i64x2
+ let actual_len = dest_len.min(offsets_len);
+
+ assert_eq!(dest_len, mask_len);
+
+ let mask_item_size = mask.layout.field(this, 0).size;
+ let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap();
+
+ let scale = this.read_scalar(scale)?.to_i8()?;
+ if !matches!(scale, 1 | 2 | 4 | 8) {
+ throw_unsup_format!("invalid gather scale {scale}");
+ }
+ let scale = i64::from(scale);
+
+ let slice = this.read_pointer(slice)?;
+ for i in 0..actual_len {
+ let mask = this.project_index(&mask, i)?;
+ let dest = this.project_index(&dest, i)?;
+
+ if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 {
+ let offset = this.project_index(&offsets, i)?;
+ let offset =
+ i64::try_from(this.read_scalar(&offset)?.to_int(offset.layout.size)?)
+ .unwrap();
+ let ptr = slice
+ .wrapping_signed_offset(offset.checked_mul(scale).unwrap(), &this.tcx);
+ // Unaligned copy, which is what we want.
+ this.mem_copy(
+ ptr,
+ dest.ptr(),
+ dest.layout.size,
+ /*nonoverlapping*/ true,
+ )?;
+ } else {
+ this.copy_op(&this.project_index(&src, i)?, &dest)?;
+ }
+ }
+ for i in actual_len..dest_len {
+ let dest = this.project_index(&dest, i)?;
+ this.write_scalar(Scalar::from_int(0, dest.layout.size), &dest)?;
+ }
+ }
+ // Used to implement the _mm256_madd_epi16 function.
+ // Multiplies packed signed 16-bit integers in `left` and `right`, producing
+ // intermediate signed 32-bit integers. Horizontally add adjacent pairs of
+ // intermediate 32-bit integers, and pack the results in `dest`.
+ "pmadd.wd" => {
+ let [left, right] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ let (left, left_len) = this.operand_to_simd(left)?;
+ let (right, right_len) = this.operand_to_simd(right)?;
+ let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+ assert_eq!(left_len, right_len);
+ assert_eq!(dest_len.checked_mul(2).unwrap(), left_len);
+
+ for i in 0..dest_len {
+ let j1 = i.checked_mul(2).unwrap();
+ let left1 = this.read_scalar(&this.project_index(&left, j1)?)?.to_i16()?;
+ let right1 = this.read_scalar(&this.project_index(&right, j1)?)?.to_i16()?;
+
+ let j2 = j1.checked_add(1).unwrap();
+ let left2 = this.read_scalar(&this.project_index(&left, j2)?)?.to_i16()?;
+ let right2 = this.read_scalar(&this.project_index(&right, j2)?)?.to_i16()?;
+
+ let dest = this.project_index(&dest, i)?;
+
+ // Multiplications are i16*i16->i32, which will not overflow.
+ let mul1 = i32::from(left1).checked_mul(right1.into()).unwrap();
+ let mul2 = i32::from(left2).checked_mul(right2.into()).unwrap();
+ // However, this addition can overflow in the most extreme case
+ // (-0x8000)*(-0x8000)+(-0x8000)*(-0x8000) = 0x80000000
+ let res = mul1.wrapping_add(mul2);
+
+ this.write_scalar(Scalar::from_i32(res), &dest)?;
+ }
+ }
+ // Used to implement the _mm256_maddubs_epi16 function.
+ // Multiplies packed 8-bit unsigned integers from `left` and packed
+ // signed 8-bit integers from `right` into 16-bit signed integers. Then,
+ // the saturating sum of the products with indices `2*i` and `2*i+1`
+ // produces the output at index `i`.
+ "pmadd.ub.sw" => {
+ let [left, right] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ let (left, left_len) = this.operand_to_simd(left)?;
+ let (right, right_len) = this.operand_to_simd(right)?;
+ let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+ assert_eq!(left_len, right_len);
+ assert_eq!(dest_len.checked_mul(2).unwrap(), left_len);
+
+ for i in 0..dest_len {
+ let j1 = i.checked_mul(2).unwrap();
+ let left1 = this.read_scalar(&this.project_index(&left, j1)?)?.to_u8()?;
+ let right1 = this.read_scalar(&this.project_index(&right, j1)?)?.to_i8()?;
+
+ let j2 = j1.checked_add(1).unwrap();
+ let left2 = this.read_scalar(&this.project_index(&left, j2)?)?.to_u8()?;
+ let right2 = this.read_scalar(&this.project_index(&right, j2)?)?.to_i8()?;
+
+ let dest = this.project_index(&dest, i)?;
+
+ // Multiplication of a u8 and an i8 into an i16 cannot overflow.
+ let mul1 = i16::from(left1).checked_mul(right1.into()).unwrap();
+ let mul2 = i16::from(left2).checked_mul(right2.into()).unwrap();
+ let res = mul1.saturating_add(mul2);
+
+ this.write_scalar(Scalar::from_i16(res), &dest)?;
+ }
+ }
+ // Used to implement the _mm_maskload_epi32, _mm_maskload_epi64,
+ // _mm256_maskload_epi32 and _mm256_maskload_epi64 functions.
+ // For the element `i`, if the high bit of the `i`-th element of `mask`
+ // is one, it is loaded from `ptr.wrapping_add(i)`, otherwise zero is
+ // loaded.
+ "maskload.d" | "maskload.q" | "maskload.d.256" | "maskload.q.256" => {
+ let [ptr, mask] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ mask_load(this, ptr, mask, dest)?;
+ }
+ // Used to implement the _mm_maskstore_epi32, _mm_maskstore_epi64,
+ // _mm256_maskstore_epi32 and _mm256_maskstore_epi64 functions.
+ // For the element `i`, if the high bit of the `i`-th element of `mask`
+ // is one, it is stored into `ptr.wrapping_add(i)`.
+ // Unlike SSE2's _mm_maskmoveu_si128, these are not non-temporal stores.
+ "maskstore.d" | "maskstore.q" | "maskstore.d.256" | "maskstore.q.256" => {
+ let [ptr, mask, value] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ mask_store(this, ptr, mask, value)?;
+ }
+ // Used to implement the _mm256_mpsadbw_epu8 function.
+ // Compute the sum of absolute differences of quadruplets of unsigned
+ // 8-bit integers in `left` and `right`, and store the 16-bit results
+ // in `dest`. Quadruplets are selected from `left` and `right` with
+ // offsets specified in `imm`.
+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8
+ "mpsadbw" => {
+ let [left, right, imm] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ mpsadbw(this, left, right, imm, dest)?;
+ }
+ // Used to implement the _mm256_mulhrs_epi16 function.
+ // Multiplies packed 16-bit signed integer values, truncates the 32-bit
+ // product to the 18 most significant bits by right-shifting, and then
+ // divides the 18-bit value by 2 (rounding to nearest) by first adding
+ // 1 and then taking the bits `1..=16`.
+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16
+ "pmul.hr.sw" => {
+ let [left, right] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ pmulhrsw(this, left, right, dest)?;
+ }
+ // Used to implement the _mm256_packs_epi16 function.
+ // Converts two 16-bit integer vectors to a single 8-bit integer
+ // vector with signed saturation.
+ "packsswb" => {
+ let [left, right] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ packsswb(this, left, right, dest)?;
+ }
+ // Used to implement the _mm256_packs_epi32 function.
+ // Converts two 32-bit integer vectors to a single 16-bit integer
+ // vector with signed saturation.
+ "packssdw" => {
+ let [left, right] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ packssdw(this, left, right, dest)?;
+ }
+ // Used to implement the _mm256_packus_epi16 function.
+ // Converts two 16-bit signed integer vectors to a single 8-bit
+ // unsigned integer vector with saturation.
+ "packuswb" => {
+ let [left, right] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ packuswb(this, left, right, dest)?;
+ }
+ // Used to implement the _mm256_packus_epi32 function.
+ // Concatenates two 32-bit signed integer vectors and converts
+ // the result to a 16-bit unsigned integer vector with saturation.
+ "packusdw" => {
+ let [left, right] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ packusdw(this, left, right, dest)?;
+ }
+ // Used to implement the _mm256_permutevar8x32_epi32 and
+ // _mm256_permutevar8x32_ps functions.
+ // Shuffles `left` using the three low bits of each element of `right`
+ // as indices.
+ "permd" | "permps" => {
+ let [left, right] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ let (left, left_len) = this.operand_to_simd(left)?;
+ let (right, right_len) = this.operand_to_simd(right)?;
+ let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+ assert_eq!(dest_len, left_len);
+ assert_eq!(dest_len, right_len);
+
+ for i in 0..dest_len {
+ let dest = this.project_index(&dest, i)?;
+ let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u32()?;
+ let left = this.project_index(&left, (right & 0b111).into())?;
+
+ this.copy_op(&left, &dest)?;
+ }
+ }
+ // Used to implement the _mm256_permute2x128_si256 function.
+ // Shuffles 128-bit blocks of `a` and `b` using `imm` as pattern.
+ "vperm2i128" => {
+ let [left, right, imm] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ assert_eq!(left.layout.size.bits(), 256);
+ assert_eq!(right.layout.size.bits(), 256);
+ assert_eq!(dest.layout.size.bits(), 256);
+
+ // Transmute to `[i128; 2]`
+
+ let array_layout =
+ this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.i128, 2))?;
+ let left = left.transmute(array_layout, this)?;
+ let right = right.transmute(array_layout, this)?;
+ let dest = dest.transmute(array_layout, this)?;
+
+ let imm = this.read_scalar(imm)?.to_u8()?;
+
+ for i in 0..2 {
+ let dest = this.project_index(&dest, i)?;
+ let src = match (imm >> i.checked_mul(4).unwrap()) & 0b11 {
+ 0 => this.project_index(&left, 0)?,
+ 1 => this.project_index(&left, 1)?,
+ 2 => this.project_index(&right, 0)?,
+ 3 => this.project_index(&right, 1)?,
+ _ => unreachable!(),
+ };
+
+ this.copy_op(&src, &dest)?;
+ }
+ }
+ // Used to implement the _mm256_sad_epu8 function.
+ // Compute the absolute differences of packed unsigned 8-bit integers
+ // in `left` and `right`, then horizontally sum each consecutive 8
+ // differences to produce four unsigned 16-bit integers, and pack
+ // these unsigned 16-bit integers in the low 16 bits of 64-bit elements
+ // in `dest`.
+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8
+ "psad.bw" => {
+ let [left, right] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ let (left, left_len) = this.operand_to_simd(left)?;
+ let (right, right_len) = this.operand_to_simd(right)?;
+ let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+ assert_eq!(left_len, right_len);
+ assert_eq!(left_len, dest_len.checked_mul(8).unwrap());
+
+ for i in 0..dest_len {
+ let dest = this.project_index(&dest, i)?;
+
+ let mut acc: u16 = 0;
+ for j in 0..8 {
+ let src_index = i.checked_mul(8).unwrap().checked_add(j).unwrap();
+
+ let left = this.project_index(&left, src_index)?;
+ let left = this.read_scalar(&left)?.to_u8()?;
+
+ let right = this.project_index(&right, src_index)?;
+ let right = this.read_scalar(&right)?.to_u8()?;
+
+ acc = acc.checked_add(left.abs_diff(right).into()).unwrap();
+ }
+
+ this.write_scalar(Scalar::from_u64(acc.into()), &dest)?;
+ }
+ }
+ // Used to implement the _mm256_shuffle_epi8 intrinsic.
+ // Shuffles bytes from `left` using `right` as pattern.
+ // Each 128-bit block is shuffled independently.
+ "pshuf.b" => {
+ let [left, right] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ let (left, left_len) = this.operand_to_simd(left)?;
+ let (right, right_len) = this.operand_to_simd(right)?;
+ let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+ assert_eq!(dest_len, left_len);
+ assert_eq!(dest_len, right_len);
+
+ for i in 0..dest_len {
+ let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u8()?;
+ let dest = this.project_index(&dest, i)?;
+
+ let res = if right & 0x80 == 0 {
+ // Shuffle each 128-bit (16-byte) block independently.
+ let j = u64::from(right % 16).checked_add(i & !15).unwrap();
+ this.read_scalar(&this.project_index(&left, j)?)?
+ } else {
+ // If the highest bit in `right` is 1, write zero.
+ Scalar::from_u8(0)
+ };
+
+ this.write_scalar(res, &dest)?;
+ }
+ }
+ // Used to implement the _mm256_sign_epi{8,16,32} functions.
+ // Negates elements from `left` when the corresponding element in
+ // `right` is negative. If an element from `right` is zero, zero
+ // is written to the corresponding output element.
+ // Basically, we multiply `left` with `right.signum()`.
+ "psign.b" | "psign.w" | "psign.d" => {
+ let [left, right] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ psign(this, left, right, dest)?;
+ }
+ // Used to implement the _mm256_{sll,srl,sra}_epi{16,32,64} functions
+ // (except _mm256_sra_epi64, which is not available in AVX2).
+ // Shifts N-bit packed integers in left by the amount in right.
+ // `right` is a 128-bit vector, but it is interpreted as a single
+ // 64-bit integer (remaining bits are ignored).
+ // For logic shifts, when right is larger than N - 1, zero is produced.
+ // For arithmetic shifts, when right is larger than N - 1, the sign bit
+ // is copied to remaining bits.
+ "psll.w" | "psrl.w" | "psra.w" | "psll.d" | "psrl.d" | "psra.d" | "psll.q"
+ | "psrl.q" => {
+ let [left, right] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ let which = match unprefixed_name {
+ "psll.w" | "psll.d" | "psll.q" => ShiftOp::Left,
+ "psrl.w" | "psrl.d" | "psrl.q" => ShiftOp::RightLogic,
+ "psra.w" | "psra.d" => ShiftOp::RightArith,
+ _ => unreachable!(),
+ };
+
+ shift_simd_by_scalar(this, left, right, which, dest)?;
+ }
+ // Used to implement the _mm{,256}_{sllv,srlv,srav}_epi{32,64} functions
+ // (except _mm{,256}_srav_epi64, which are not available in AVX2).
+ "psllv.d" | "psllv.d.256" | "psllv.q" | "psllv.q.256" | "psrlv.d" | "psrlv.d.256"
+ | "psrlv.q" | "psrlv.q.256" | "psrav.d" | "psrav.d.256" => {
+ let [left, right] =
+ this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+ let which = match unprefixed_name {
+ "psllv.d" | "psllv.d.256" | "psllv.q" | "psllv.q.256" => ShiftOp::Left,
+ "psrlv.d" | "psrlv.d.256" | "psrlv.q" | "psrlv.q.256" => ShiftOp::RightLogic,
+ "psrav.d" | "psrav.d.256" => ShiftOp::RightArith,
+ _ => unreachable!(),
+ };
+
+ shift_simd_by_simd(this, left, right, which, dest)?;
+ }
+ _ => return Ok(EmulateForeignItemResult::NotSupported),
+ }
+ Ok(EmulateForeignItemResult::NeedsJumping)
+ }
+}
diff --git a/src/tools/miri/src/shims/x86/mod.rs b/src/tools/miri/src/shims/x86/mod.rs
index 615821b2e37..cf4d6a04bec 100644
--- a/src/tools/miri/src/shims/x86/mod.rs
+++ b/src/tools/miri/src/shims/x86/mod.rs
@@ -14,6 +14,7 @@ use shims::foreign_items::EmulateForeignItemResult;
mod aesni;
mod avx;
+mod avx2;
mod sse;
mod sse2;
mod sse3;
@@ -136,6 +137,11 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
this, link_name, abi, args, dest,
);
}
+ name if name.starts_with("avx2.") => {
+ return avx2::EvalContextExt::emulate_x86_avx2_intrinsic(
+ this, link_name, abi, args, dest,
+ );
+ }
_ => return Ok(EmulateForeignItemResult::NotSupported),
}
@@ -482,7 +488,7 @@ enum ShiftOp {
///
/// For logic shifts, when right is larger than BITS - 1, zero is produced.
/// For arithmetic right-shifts, when right is larger than BITS - 1, the sign
-/// bit is copied to remaining bits.
+/// bit is copied to all bits.
fn shift_simd_by_scalar<'tcx>(
this: &mut crate::MiriInterpCx<'_, 'tcx>,
left: &OpTy<'tcx, Provenance>,
@@ -534,6 +540,61 @@ fn shift_simd_by_scalar<'tcx>(
Ok(())
}
+/// Shifts each element of `left` by the corresponding element of `right`.
+///
+/// For logic shifts, when right is larger than BITS - 1, zero is produced.
+/// For arithmetic right-shifts, when right is larger than BITS - 1, the sign
+/// bit is copied to all bits.
+fn shift_simd_by_simd<'tcx>(
+ this: &mut crate::MiriInterpCx<'_, 'tcx>,
+ left: &OpTy<'tcx, Provenance>,
+ right: &OpTy<'tcx, Provenance>,
+ which: ShiftOp,
+ dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+ let (left, left_len) = this.operand_to_simd(left)?;
+ let (right, right_len) = this.operand_to_simd(right)?;
+ let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+ assert_eq!(dest_len, left_len);
+ assert_eq!(dest_len, right_len);
+
+ for i in 0..dest_len {
+ let left = this.read_scalar(&this.project_index(&left, i)?)?;
+ let right = this.read_scalar(&this.project_index(&right, i)?)?;
+ let dest = this.project_index(&dest, i)?;
+
+ // It is ok to saturate the value to u32::MAX because any value
+ // above BITS - 1 will produce the same result.
+ let shift = u32::try_from(right.to_uint(dest.layout.size)?).unwrap_or(u32::MAX);
+
+ let res = match which {
+ ShiftOp::Left => {
+ let left = left.to_uint(dest.layout.size)?;
+ let res = left.checked_shl(shift).unwrap_or(0);
+ // `truncate` is needed as left-shift can make the absolute value larger.
+ Scalar::from_uint(dest.layout.size.truncate(res), dest.layout.size)
+ }
+ ShiftOp::RightLogic => {
+ let left = left.to_uint(dest.layout.size)?;
+ let res = left.checked_shr(shift).unwrap_or(0);
+ // No `truncate` needed as right-shift can only make the absolute value smaller.
+ Scalar::from_uint(res, dest.layout.size)
+ }
+ ShiftOp::RightArith => {
+ let left = left.to_int(dest.layout.size)?;
+ // On overflow, copy the sign bit to the remaining bits
+ let res = left.checked_shr(shift).unwrap_or(left >> 127);
+ // No `truncate` needed as right-shift can only make the absolute value smaller.
+ Scalar::from_int(res, dest.layout.size)
+ }
+ };
+ this.write_scalar(res, &dest)?;
+ }
+
+ Ok(())
+}
+
/// Takes a 128-bit vector, transmutes it to `[u64; 2]` and extracts
/// the first value.
fn extract_first_u64<'tcx>(
@@ -650,7 +711,7 @@ fn convert_float_to_int<'tcx>(
let dest = this.project_index(&dest, i)?;
let res = this.float_to_int_checked(&op, dest.layout, rnd)?.unwrap_or_else(|| {
- // Fallback to minimum acording to SSE/AVX semantics.
+ // Fallback to minimum according to SSE/AVX semantics.
ImmTy::from_int(dest.layout.size.signed_int_min(), dest.layout)
});
this.write_immediate(*res, &dest)?;
@@ -664,6 +725,33 @@ fn convert_float_to_int<'tcx>(
Ok(())
}
+/// Calculates absolute value of integers in `op` and stores the result in `dest`.
+///
+/// In case of overflow (when the operand is the minimum value), the operation
+/// will wrap around.
+fn int_abs<'tcx>(
+ this: &mut crate::MiriInterpCx<'_, 'tcx>,
+ op: &OpTy<'tcx, Provenance>,
+ dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+ let (op, op_len) = this.operand_to_simd(op)?;
+ let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+ assert_eq!(op_len, dest_len);
+
+ for i in 0..dest_len {
+ let op = this.read_scalar(&this.project_index(&op, i)?)?;
+ let dest = this.project_index(&dest, i)?;
+
+ // Converting to a host "i128" works since the input is always signed.
+ let res = op.to_int(dest.layout.size)?.unsigned_abs();
+
+ this.write_scalar(Scalar::from_uint(res, dest.layout.size), &dest)?;
+ }
+
+ Ok(())
+}
+
/// Splits `op` (which must be a SIMD vector) into 128-bit chuncks.
///
/// Returns a tuple where:
@@ -874,3 +962,316 @@ fn test_high_bits_masked<'tcx>(
Ok((direct, negated))
}
+
+/// Conditionally loads from `ptr` according to the high bit of each
+/// element of `mask`. `ptr` does not need to be aligned.
+fn mask_load<'tcx>(
+ this: &mut crate::MiriInterpCx<'_, 'tcx>,
+ ptr: &OpTy<'tcx, Provenance>,
+ mask: &OpTy<'tcx, Provenance>,
+ dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+ let (mask, mask_len) = this.operand_to_simd(mask)?;
+ let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+ assert_eq!(dest_len, mask_len);
+
+ let mask_item_size = mask.layout.field(this, 0).size;
+ let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap();
+
+ let ptr = this.read_pointer(ptr)?;
+ for i in 0..dest_len {
+ let mask = this.project_index(&mask, i)?;
+ let dest = this.project_index(&dest, i)?;
+
+ if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 {
+ let ptr = ptr.wrapping_offset(dest.layout.size * i, &this.tcx);
+ // Unaligned copy, which is what we want.
+ this.mem_copy(ptr, dest.ptr(), dest.layout.size, /*nonoverlapping*/ true)?;
+ } else {
+ this.write_scalar(Scalar::from_int(0, dest.layout.size), &dest)?;
+ }
+ }
+
+ Ok(())
+}
+
+/// Conditionally stores into `ptr` according to the high bit of each
+/// element of `mask`. `ptr` does not need to be aligned.
+fn mask_store<'tcx>(
+ this: &mut crate::MiriInterpCx<'_, 'tcx>,
+ ptr: &OpTy<'tcx, Provenance>,
+ mask: &OpTy<'tcx, Provenance>,
+ value: &OpTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+ let (mask, mask_len) = this.operand_to_simd(mask)?;
+ let (value, value_len) = this.operand_to_simd(value)?;
+
+ assert_eq!(value_len, mask_len);
+
+ let mask_item_size = mask.layout.field(this, 0).size;
+ let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap();
+
+ let ptr = this.read_pointer(ptr)?;
+ for i in 0..value_len {
+ let mask = this.project_index(&mask, i)?;
+ let value = this.project_index(&value, i)?;
+
+ if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 {
+ let ptr = ptr.wrapping_offset(value.layout.size * i, &this.tcx);
+ // Unaligned copy, which is what we want.
+ this.mem_copy(value.ptr(), ptr, value.layout.size, /*nonoverlapping*/ true)?;
+ }
+ }
+
+ Ok(())
+}
+
+/// Compute the sum of absolute differences of quadruplets of unsigned
+/// 8-bit integers in `left` and `right`, and store the 16-bit results
+/// in `dest`. Quadruplets are selected from `left` and `right` with
+/// offsets specified in `imm`.
+///
+/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8>
+/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8>
+///
+/// Each 128-bit chunk is treated independently (i.e., the value for
+/// the i-th 128-bit chunk of `dest` is calculated with the i-th
+/// 128-bit chunks of `left` and `right`).
+fn mpsadbw<'tcx>(
+ this: &mut crate::MiriInterpCx<'_, 'tcx>,
+ left: &OpTy<'tcx, Provenance>,
+ right: &OpTy<'tcx, Provenance>,
+ imm: &OpTy<'tcx, Provenance>,
+ dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+ assert_eq!(left.layout, right.layout);
+ assert_eq!(left.layout.size, dest.layout.size);
+
+ let (num_chunks, op_items_per_chunk, left) = split_simd_to_128bit_chunks(this, left)?;
+ let (_, _, right) = split_simd_to_128bit_chunks(this, right)?;
+ let (_, dest_items_per_chunk, dest) = split_simd_to_128bit_chunks(this, dest)?;
+
+ assert_eq!(op_items_per_chunk, dest_items_per_chunk.checked_mul(2).unwrap());
+
+ let imm = this.read_scalar(imm)?.to_uint(imm.layout.size)?;
+ // Bit 2 of `imm` specifies the offset for indices of `left`.
+ // The offset is 0 when the bit is 0 or 4 when the bit is 1.
+ let left_offset = u64::try_from((imm >> 2) & 1).unwrap().checked_mul(4).unwrap();
+ // Bits 0..=1 of `imm` specify the offset for indices of
+ // `right` in blocks of 4 elements.
+ let right_offset = u64::try_from(imm & 0b11).unwrap().checked_mul(4).unwrap();
+
+ for i in 0..num_chunks {
+ let left = this.project_index(&left, i)?;
+ let right = this.project_index(&right, i)?;
+ let dest = this.project_index(&dest, i)?;
+
+ for j in 0..dest_items_per_chunk {
+ let left_offset = left_offset.checked_add(j).unwrap();
+ let mut res: u16 = 0;
+ for k in 0..4 {
+ let left = this
+ .read_scalar(&this.project_index(&left, left_offset.checked_add(k).unwrap())?)?
+ .to_u8()?;
+ let right = this
+ .read_scalar(
+ &this.project_index(&right, right_offset.checked_add(k).unwrap())?,
+ )?
+ .to_u8()?;
+ res = res.checked_add(left.abs_diff(right).into()).unwrap();
+ }
+ this.write_scalar(Scalar::from_u16(res), &this.project_index(&dest, j)?)?;
+ }
+ }
+
+ Ok(())
+}
+
+/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
+/// product to the 18 most significant bits by right-shifting, and then
+/// divides the 18-bit value by 2 (rounding to nearest) by first adding
+/// 1 and then taking the bits `1..=16`.
+///
+/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16>
+/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16>
+fn pmulhrsw<'tcx>(
+ this: &mut crate::MiriInterpCx<'_, 'tcx>,
+ left: &OpTy<'tcx, Provenance>,
+ right: &OpTy<'tcx, Provenance>,
+ dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+ let (left, left_len) = this.operand_to_simd(left)?;
+ let (right, right_len) = this.operand_to_simd(right)?;
+ let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+ assert_eq!(dest_len, left_len);
+ assert_eq!(dest_len, right_len);
+
+ for i in 0..dest_len {
+ let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?;
+ let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?;
+ let dest = this.project_index(&dest, i)?;
+
+ let res =
+ (i32::from(left).checked_mul(right.into()).unwrap() >> 14).checked_add(1).unwrap() >> 1;
+
+ // The result of this operation can overflow a signed 16-bit integer.
+ // When `left` and `right` are -0x8000, the result is 0x8000.
+ #[allow(clippy::cast_possible_truncation)]
+ let res = res as i16;
+
+ this.write_scalar(Scalar::from_i16(res), &dest)?;
+ }
+
+ Ok(())
+}
+
+fn pack_generic<'tcx>(
+ this: &mut crate::MiriInterpCx<'_, 'tcx>,
+ left: &OpTy<'tcx, Provenance>,
+ right: &OpTy<'tcx, Provenance>,
+ dest: &MPlaceTy<'tcx, Provenance>,
+ f: impl Fn(Scalar<Provenance>) -> InterpResult<'tcx, Scalar<Provenance>>,
+) -> InterpResult<'tcx, ()> {
+ assert_eq!(left.layout, right.layout);
+ assert_eq!(left.layout.size, dest.layout.size);
+
+ let (num_chunks, op_items_per_chunk, left) = split_simd_to_128bit_chunks(this, left)?;
+ let (_, _, right) = split_simd_to_128bit_chunks(this, right)?;
+ let (_, dest_items_per_chunk, dest) = split_simd_to_128bit_chunks(this, dest)?;
+
+ assert_eq!(dest_items_per_chunk, op_items_per_chunk.checked_mul(2).unwrap());
+
+ for i in 0..num_chunks {
+ let left = this.project_index(&left, i)?;
+ let right = this.project_index(&right, i)?;
+ let dest = this.project_index(&dest, i)?;
+
+ for j in 0..op_items_per_chunk {
+ let left = this.read_scalar(&this.project_index(&left, j)?)?;
+ let right = this.read_scalar(&this.project_index(&right, j)?)?;
+ let left_dest = this.project_index(&dest, j)?;
+ let right_dest =
+ this.project_index(&dest, j.checked_add(op_items_per_chunk).unwrap())?;
+
+ let left_res = f(left)?;
+ let right_res = f(right)?;
+
+ this.write_scalar(left_res, &left_dest)?;
+ this.write_scalar(right_res, &right_dest)?;
+ }
+ }
+
+ Ok(())
+}
+
+/// Converts two 16-bit integer vectors to a single 8-bit integer
+/// vector with signed saturation.
+///
+/// Each 128-bit chunk is treated independently (i.e., the value for
+/// the i-th 128-bit chunk of `dest` is calculated with the i-th
+/// 128-bit chunks of `left` and `right`).
+fn packsswb<'tcx>(
+ this: &mut crate::MiriInterpCx<'_, 'tcx>,
+ left: &OpTy<'tcx, Provenance>,
+ right: &OpTy<'tcx, Provenance>,
+ dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+ pack_generic(this, left, right, dest, |op| {
+ let op = op.to_i16()?;
+ let res = i8::try_from(op).unwrap_or(if op < 0 { i8::MIN } else { i8::MAX });
+ Ok(Scalar::from_i8(res))
+ })
+}
+
+/// Converts two 16-bit signed integer vectors to a single 8-bit
+/// unsigned integer vector with saturation.
+///
+/// Each 128-bit chunk is treated independently (i.e., the value for
+/// the i-th 128-bit chunk of `dest` is calculated with the i-th
+/// 128-bit chunks of `left` and `right`).
+fn packuswb<'tcx>(
+ this: &mut crate::MiriInterpCx<'_, 'tcx>,
+ left: &OpTy<'tcx, Provenance>,
+ right: &OpTy<'tcx, Provenance>,
+ dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+ pack_generic(this, left, right, dest, |op| {
+ let op = op.to_i16()?;
+ let res = u8::try_from(op).unwrap_or(if op < 0 { 0 } else { u8::MAX });
+ Ok(Scalar::from_u8(res))
+ })
+}
+
+/// Converts two 32-bit integer vectors to a single 16-bit integer
+/// vector with signed saturation.
+///
+/// Each 128-bit chunk is treated independently (i.e., the value for
+/// the i-th 128-bit chunk of `dest` is calculated with the i-th
+/// 128-bit chunks of `left` and `right`).
+fn packssdw<'tcx>(
+ this: &mut crate::MiriInterpCx<'_, 'tcx>,
+ left: &OpTy<'tcx, Provenance>,
+ right: &OpTy<'tcx, Provenance>,
+ dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+ pack_generic(this, left, right, dest, |op| {
+ let op = op.to_i32()?;
+ let res = i16::try_from(op).unwrap_or(if op < 0 { i16::MIN } else { i16::MAX });
+ Ok(Scalar::from_i16(res))
+ })
+}
+
+/// Converts two 32-bit integer vectors to a single 16-bit integer
+/// vector with unsigned saturation.
+///
+/// Each 128-bit chunk is treated independently (i.e., the value for
+/// the i-th 128-bit chunk of `dest` is calculated with the i-th
+/// 128-bit chunks of `left` and `right`).
+fn packusdw<'tcx>(
+ this: &mut crate::MiriInterpCx<'_, 'tcx>,
+ left: &OpTy<'tcx, Provenance>,
+ right: &OpTy<'tcx, Provenance>,
+ dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+ pack_generic(this, left, right, dest, |op| {
+ let op = op.to_i32()?;
+ let res = u16::try_from(op).unwrap_or(if op < 0 { 0 } else { u16::MAX });
+ Ok(Scalar::from_u16(res))
+ })
+}
+
+/// Negates elements from `left` when the corresponding element in
+/// `right` is negative. If an element from `right` is zero, zero
+/// is written to the corresponding output element.
+/// In other words, multiplies `left` with `right.signum()`.
+fn psign<'tcx>(
+ this: &mut crate::MiriInterpCx<'_, 'tcx>,
+ left: &OpTy<'tcx, Provenance>,
+ right: &OpTy<'tcx, Provenance>,
+ dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+ let (left, left_len) = this.operand_to_simd(left)?;
+ let (right, right_len) = this.operand_to_simd(right)?;
+ let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+ assert_eq!(dest_len, left_len);
+ assert_eq!(dest_len, right_len);
+
+ for i in 0..dest_len {
+ let dest = this.project_index(&dest, i)?;
+ let left = this.read_immediate(&this.project_index(&left, i)?)?;
+ let right = this.read_scalar(&this.project_index(&right, i)?)?.to_int(dest.layout.size)?;
+
+ let res = this.wrapping_binary_op(
+ mir::BinOp::Mul,
+ &left,
+ &ImmTy::from_int(right.signum(), dest.layout),
+ )?;
+
+ this.write_immediate(*res, &dest)?;
+ }
+
+ Ok(())
+}
diff --git a/src/tools/miri/src/shims/x86/sse.rs b/src/tools/miri/src/shims/x86/sse.rs
index b8c0dfb1c7f..17608837319 100644
--- a/src/tools/miri/src/shims/x86/sse.rs
+++ b/src/tools/miri/src/shims/x86/sse.rs
@@ -182,7 +182,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
};
let res = this.float_to_int_checked(&op, dest.layout, rnd)?.unwrap_or_else(|| {
- // Fallback to minimum acording to SSE semantics.
+ // Fallback to minimum according to SSE semantics.
ImmTy::from_int(dest.layout.size.signed_int_min(), dest.layout)
});
diff --git a/src/tools/miri/src/shims/x86/sse2.rs b/src/tools/miri/src/shims/x86/sse2.rs
index 9db30d7ddca..c9ed751d36c 100644
--- a/src/tools/miri/src/shims/x86/sse2.rs
+++ b/src/tools/miri/src/shims/x86/sse2.rs
@@ -3,8 +3,8 @@ use rustc_span::Symbol;
use rustc_target::spec::abi::Abi;
use super::{
- bin_op_simd_float_all, bin_op_simd_float_first, convert_float_to_int, shift_simd_by_scalar,
- FloatBinOp, ShiftOp,
+ bin_op_simd_float_all, bin_op_simd_float_first, convert_float_to_int, packssdw, packsswb,
+ packuswb, shift_simd_by_scalar, FloatBinOp, ShiftOp,
};
use crate::*;
use shims::foreign_items::EmulateForeignItemResult;
@@ -176,29 +176,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
- let (left, left_len) = this.operand_to_simd(left)?;
- let (right, right_len) = this.operand_to_simd(right)?;
- let (dest, dest_len) = this.mplace_to_simd(dest)?;
-
- // left and right are i16x8, dest is i8x16
- assert_eq!(left_len, 8);
- assert_eq!(right_len, 8);
- assert_eq!(dest_len, 16);
-
- for i in 0..left_len {
- let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?;
- let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?;
- let left_dest = this.project_index(&dest, i)?;
- let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?;
-
- let left_res =
- i8::try_from(left).unwrap_or(if left < 0 { i8::MIN } else { i8::MAX });
- let right_res =
- i8::try_from(right).unwrap_or(if right < 0 { i8::MIN } else { i8::MAX });
-
- this.write_scalar(Scalar::from_i8(left_res), &left_dest)?;
- this.write_scalar(Scalar::from_i8(right_res), &right_dest)?;
- }
+ packsswb(this, left, right, dest)?;
}
// Used to implement the _mm_packus_epi16 function.
// Converts two 16-bit signed integer vectors to a single 8-bit
@@ -207,28 +185,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
- let (left, left_len) = this.operand_to_simd(left)?;
- let (right, right_len) = this.operand_to_simd(right)?;
- let (dest, dest_len) = this.mplace_to_simd(dest)?;
-
- // left and right are i16x8, dest is u8x16
- assert_eq!(left_len, 8);
- assert_eq!(right_len, 8);
- assert_eq!(dest_len, 16);
-
- for i in 0..left_len {
- let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?;
- let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?;
- let left_dest = this.project_index(&dest, i)?;
- let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?;
-
- let left_res = u8::try_from(left).unwrap_or(if left < 0 { 0 } else { u8::MAX });
- let right_res =
- u8::try_from(right).unwrap_or(if right < 0 { 0 } else { u8::MAX });
-
- this.write_scalar(Scalar::from_u8(left_res), &left_dest)?;
- this.write_scalar(Scalar::from_u8(right_res), &right_dest)?;
- }
+ packuswb(this, left, right, dest)?;
}
// Used to implement the _mm_packs_epi32 function.
// Converts two 32-bit integer vectors to a single 16-bit integer
@@ -237,29 +194,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
- let (left, left_len) = this.operand_to_simd(left)?;
- let (right, right_len) = this.operand_to_simd(right)?;
- let (dest, dest_len) = this.mplace_to_simd(dest)?;
-
- // left and right are i32x4, dest is i16x8
- assert_eq!(left_len, 4);
- assert_eq!(right_len, 4);
- assert_eq!(dest_len, 8);
-
- for i in 0..left_len {
- let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i32()?;
- let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i32()?;
- let left_dest = this.project_index(&dest, i)?;
- let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?;
-
- let left_res =
- i16::try_from(left).unwrap_or(if left < 0 { i16::MIN } else { i16::MAX });
- let right_res =
- i16::try_from(right).unwrap_or(if right < 0 { i16::MIN } else { i16::MAX });
-
- this.write_scalar(Scalar::from_i16(left_res), &left_dest)?;
- this.write_scalar(Scalar::from_i16(right_res), &right_dest)?;
- }
+ packssdw(this, left, right, dest)?;
}
// Used to implement _mm_min_sd and _mm_max_sd functions.
// Note that the semantics are a bit different from Rust simd_min
@@ -420,7 +355,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
};
let res = this.float_to_int_checked(&op, dest.layout, rnd)?.unwrap_or_else(|| {
- // Fallback to minimum acording to SSE semantics.
+ // Fallback to minimum according to SSE semantics.
ImmTy::from_int(dest.layout.size.signed_int_min(), dest.layout)
});
@@ -447,7 +382,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
let res0 = this.float_to_float_or_int(&right0, dest0.layout)?;
this.write_immediate(*res0, &dest0)?;
- // Copy remianing from `left`
+ // Copy remaining from `left`
for i in 1..dest_len {
this.copy_op(&this.project_index(&left, i)?, &this.project_index(&dest, i)?)?;
}
diff --git a/src/tools/miri/src/shims/x86/sse41.rs b/src/tools/miri/src/shims/x86/sse41.rs
index 16a82eed99b..19bc27421d3 100644
--- a/src/tools/miri/src/shims/x86/sse41.rs
+++ b/src/tools/miri/src/shims/x86/sse41.rs
@@ -1,7 +1,7 @@
use rustc_span::Symbol;
use rustc_target::spec::abi::Abi;
-use super::{conditional_dot_product, round_all, round_first, test_bits_masked};
+use super::{conditional_dot_product, mpsadbw, packusdw, round_all, round_first, test_bits_masked};
use crate::*;
use shims::foreign_items::EmulateForeignItemResult;
@@ -68,27 +68,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
- let (left, left_len) = this.operand_to_simd(left)?;
- let (right, right_len) = this.operand_to_simd(right)?;
- let (dest, dest_len) = this.mplace_to_simd(dest)?;
-
- assert_eq!(left_len, right_len);
- assert_eq!(dest_len, left_len.checked_mul(2).unwrap());
-
- for i in 0..left_len {
- let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i32()?;
- let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i32()?;
- let left_dest = this.project_index(&dest, i)?;
- let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?;
-
- let left_res =
- u16::try_from(left).unwrap_or(if left < 0 { 0 } else { u16::MAX });
- let right_res =
- u16::try_from(right).unwrap_or(if right < 0 { 0 } else { u16::MAX });
-
- this.write_scalar(Scalar::from_u16(left_res), &left_dest)?;
- this.write_scalar(Scalar::from_u16(right_res), &right_dest)?;
- }
+ packusdw(this, left, right, dest)?;
}
// Used to implement the _mm_dp_ps and _mm_dp_pd functions.
// Conditionally multiplies the packed floating-point elements in
@@ -176,40 +156,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
let [left, right, imm] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
- let (left, left_len) = this.operand_to_simd(left)?;
- let (right, right_len) = this.operand_to_simd(right)?;
- let (dest, dest_len) = this.mplace_to_simd(dest)?;
-
- assert_eq!(left_len, right_len);
- assert_eq!(left_len, dest_len.checked_mul(2).unwrap());
-
- let imm = this.read_scalar(imm)?.to_u8()?;
- // Bit 2 of `imm` specifies the offset for indices of `left`.
- // The offset is 0 when the bit is 0 or 4 when the bit is 1.
- let left_offset = u64::from((imm >> 2) & 1).checked_mul(4).unwrap();
- // Bits 0..=1 of `imm` specify the offset for indices of
- // `right` in blocks of 4 elements.
- let right_offset = u64::from(imm & 0b11).checked_mul(4).unwrap();
-
- for i in 0..dest_len {
- let left_offset = left_offset.checked_add(i).unwrap();
- let mut res: u16 = 0;
- for j in 0..4 {
- let left = this
- .read_scalar(
- &this.project_index(&left, left_offset.checked_add(j).unwrap())?,
- )?
- .to_u8()?;
- let right = this
- .read_scalar(
- &this
- .project_index(&right, right_offset.checked_add(j).unwrap())?,
- )?
- .to_u8()?;
- res = res.checked_add(left.abs_diff(right).into()).unwrap();
- }
- this.write_scalar(Scalar::from_u16(res), &this.project_index(&dest, i)?)?;
- }
+ mpsadbw(this, left, right, imm, dest)?;
}
// Used to implement the _mm_testz_si128, _mm_testc_si128
// and _mm_testnzc_si128 functions.
diff --git a/src/tools/miri/src/shims/x86/ssse3.rs b/src/tools/miri/src/shims/x86/ssse3.rs
index dd5d064b20f..4f8e52dbb7d 100644
--- a/src/tools/miri/src/shims/x86/ssse3.rs
+++ b/src/tools/miri/src/shims/x86/ssse3.rs
@@ -2,7 +2,7 @@ use rustc_middle::mir;
use rustc_span::Symbol;
use rustc_target::spec::abi::Abi;
-use super::horizontal_bin_op;
+use super::{horizontal_bin_op, int_abs, pmulhrsw, psign};
use crate::*;
use shims::foreign_items::EmulateForeignItemResult;
@@ -28,20 +28,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
"pabs.b.128" | "pabs.w.128" | "pabs.d.128" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
- let (op, op_len) = this.operand_to_simd(op)?;
- let (dest, dest_len) = this.mplace_to_simd(dest)?;
-
- assert_eq!(op_len, dest_len);
-
- for i in 0..dest_len {
- let op = this.read_scalar(&this.project_index(&op, i)?)?;
- let dest = this.project_index(&dest, i)?;
-
- // Converting to a host "i128" works since the input is always signed.
- let res = op.to_int(dest.layout.size)?.unsigned_abs();
-
- this.write_scalar(Scalar::from_uint(res, dest.layout.size), &dest)?;
- }
+ int_abs(this, op, dest)?;
}
// Used to implement the _mm_shuffle_epi8 intrinsic.
// Shuffles bytes from `left` using `right` as pattern.
@@ -136,30 +123,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
- let (left, left_len) = this.operand_to_simd(left)?;
- let (right, right_len) = this.operand_to_simd(right)?;
- let (dest, dest_len) = this.mplace_to_simd(dest)?;
-
- assert_eq!(dest_len, left_len);
- assert_eq!(dest_len, right_len);
-
- for i in 0..dest_len {
- let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?;
- let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?;
- let dest = this.project_index(&dest, i)?;
-
- let res = (i32::from(left).checked_mul(right.into()).unwrap() >> 14)
- .checked_add(1)
- .unwrap()
- >> 1;
-
- // The result of this operation can overflow a signed 16-bit integer.
- // When `left` and `right` are -0x8000, the result is 0x8000.
- #[allow(clippy::cast_possible_truncation)]
- let res = res as i16;
-
- this.write_scalar(Scalar::from_i16(res), &dest)?;
- }
+ pmulhrsw(this, left, right, dest)?;
}
// Used to implement the _mm_sign_epi{8,16,32} functions.
// Negates elements from `left` when the corresponding element in
@@ -170,28 +134,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
- let (left, left_len) = this.operand_to_simd(left)?;
- let (right, right_len) = this.operand_to_simd(right)?;
- let (dest, dest_len) = this.mplace_to_simd(dest)?;
-
- assert_eq!(dest_len, left_len);
- assert_eq!(dest_len, right_len);
-
- for i in 0..dest_len {
- let dest = this.project_index(&dest, i)?;
- let left = this.read_immediate(&this.project_index(&left, i)?)?;
- let right = this
- .read_scalar(&this.project_index(&right, i)?)?
- .to_int(dest.layout.size)?;
-
- let res = this.wrapping_binary_op(
- mir::BinOp::Mul,
- &left,
- &ImmTy::from_int(right.signum(), dest.layout),
- )?;
-
- this.write_immediate(*res, &dest)?;
- }
+ psign(this, left, right, dest)?;
}
_ => return Ok(EmulateForeignItemResult::NotSupported),
}
diff --git a/src/tools/miri/tests/fail/both_borrows/aliasing_mut4.rs b/src/tools/miri/tests/fail/both_borrows/aliasing_mut4.rs
index e188a1f0c34..c656a509644 100644
--- a/src/tools/miri/tests/fail/both_borrows/aliasing_mut4.rs
+++ b/src/tools/miri/tests/fail/both_borrows/aliasing_mut4.rs
@@ -8,7 +8,7 @@ use std::mem;
pub fn safe(x: &i32, y: &mut Cell<i32>) {
//~[stack]^ ERROR: protect
y.set(1);
- let _ = *x;
+ let _load = *x;
}
fn main() {
diff --git a/src/tools/miri/tests/fail/dangling_pointers/storage_dead_dangling.rs b/src/tools/miri/tests/fail/dangling_pointers/storage_dead_dangling.rs
index f9983f48c61..f4349286801 100644
--- a/src/tools/miri/tests/fail/dangling_pointers/storage_dead_dangling.rs
+++ b/src/tools/miri/tests/fail/dangling_pointers/storage_dead_dangling.rs
@@ -10,7 +10,7 @@ fn fill(v: &mut i32) {
}
fn evil() {
- let _ = unsafe { &mut *(LEAK as *mut i32) }; //~ ERROR: is a dangling pointer
+ let _ref = unsafe { &mut *(LEAK as *mut i32) }; //~ ERROR: is a dangling pointer
}
fn main() {
diff --git a/src/tools/miri/tests/fail/dangling_pointers/storage_dead_dangling.stderr b/src/tools/miri/tests/fail/dangling_pointers/storage_dead_dangling.stderr
index 27e5a865069..73c3ff1ee05 100644
--- a/src/tools/miri/tests/fail/dangling_pointers/storage_dead_dangling.stderr
+++ b/src/tools/miri/tests/fail/dangling_pointers/storage_dead_dangling.stderr
@@ -1,8 +1,8 @@
error: Undefined Behavior: out-of-bounds pointer use: $HEX[noalloc] is a dangling pointer (it has no provenance)
--> $DIR/storage_dead_dangling.rs:LL:CC
|
-LL | let _ = unsafe { &mut *(LEAK as *mut i32) };
- | ^^^^^^^^^^^^^^^^^^^^^^^^ out-of-bounds pointer use: $HEX[noalloc] is a dangling pointer (it has no provenance)
+LL | let _ref = unsafe { &mut *(LEAK as *mut i32) };
+ | ^^^^^^^^^^^^^^^^^^^^^^^^ out-of-bounds pointer use: $HEX[noalloc] is a dangling pointer (it has no provenance)
|
= help: this indicates a bug in the program: it performed an invalid operation, and caused Undefined Behavior
= help: see https://doc.rust-lang.org/nightly/reference/behavior-considered-undefined.html for further information
diff --git a/src/tools/miri/tests/pass-dep/shims/libc-misc.rs b/src/tools/miri/tests/pass-dep/shims/libc-misc.rs
index abb384b0a85..f710daf5277 100644
--- a/src/tools/miri/tests/pass-dep/shims/libc-misc.rs
+++ b/src/tools/miri/tests/pass-dep/shims/libc-misc.rs
@@ -213,6 +213,50 @@ fn test_posix_gettimeofday() {
assert_eq!(is_error, -1);
}
+fn test_localtime_r() {
+ use std::ffi::CStr;
+ use std::{env, ptr};
+
+ // Set timezone to GMT.
+ let key = "TZ";
+ env::set_var(key, "GMT");
+
+ const TIME_SINCE_EPOCH: libc::time_t = 1712475836;
+ let custom_time_ptr = &TIME_SINCE_EPOCH;
+ let mut tm = libc::tm {
+ tm_sec: 0,
+ tm_min: 0,
+ tm_hour: 0,
+ tm_mday: 0,
+ tm_mon: 0,
+ tm_year: 0,
+ tm_wday: 0,
+ tm_yday: 0,
+ tm_isdst: 0,
+ tm_gmtoff: 0,
+ tm_zone: std::ptr::null_mut::<libc::c_char>(),
+ };
+ let res = unsafe { libc::localtime_r(custom_time_ptr, &mut tm) };
+
+ assert_eq!(tm.tm_sec, 56);
+ assert_eq!(tm.tm_min, 43);
+ assert_eq!(tm.tm_hour, 7);
+ assert_eq!(tm.tm_mday, 7);
+ assert_eq!(tm.tm_mon, 3);
+ assert_eq!(tm.tm_year, 124);
+ assert_eq!(tm.tm_wday, 0);
+ assert_eq!(tm.tm_yday, 97);
+ assert_eq!(tm.tm_isdst, -1);
+ assert_eq!(tm.tm_gmtoff, 0);
+ unsafe { assert_eq!(CStr::from_ptr(tm.tm_zone).to_str().unwrap(), "+00") };
+
+ // The returned value is the pointer passed in.
+ assert!(ptr::eq(res, &mut tm));
+
+    // Remove timezone setting.
+ env::remove_var(key);
+}
+
fn test_isatty() {
// Testing whether our isatty shim returns the right value would require controlling whether
// these streams are actually TTYs, which is hard.
@@ -365,6 +409,7 @@ fn main() {
test_posix_realpath_errors();
test_thread_local_errno();
+ test_localtime_r();
test_isatty();
diff --git a/src/tools/miri/tests/pass/adjacent-allocs.rs b/src/tools/miri/tests/pass/adjacent-allocs.rs
index cbf41d68b57..8be4bdac7e1 100644
--- a/src/tools/miri/tests/pass/adjacent-allocs.rs
+++ b/src/tools/miri/tests/pass/adjacent-allocs.rs
@@ -30,7 +30,7 @@ fn test1() {
// See https://github.com/rust-lang/miri/issues/1866#issuecomment-985770125
{
let m = 0u64;
- let _ = &m as *const u64;
+ let _ptr = &m as *const u64;
}
let iptr = ptr as usize;
diff --git a/src/tools/miri/tests/pass/const-addrs.rs b/src/tools/miri/tests/pass/const-addrs.rs
index 6c14f0b679c..727c67ebfb5 100644
--- a/src/tools/miri/tests/pass/const-addrs.rs
+++ b/src/tools/miri/tests/pass/const-addrs.rs
@@ -4,7 +4,7 @@
// deallocated.
// In Miri we explicitly store previously-assigned AllocIds for each const and ensure
// that we only hand out a finite number of AllocIds per const.
-// MIR inlining will put every evaluation of the const we're repeatedly evaluting into the same
+// MIR inlining will put every evaluation of the const we're repeatedly evaluating into the same
// stack frame, breaking this test.
//@compile-flags: -Zinline-mir=no
#![feature(strict_provenance)]
diff --git a/src/tools/miri/tests/pass/dyn-upcast.rs b/src/tools/miri/tests/pass/dyn-upcast.rs
index ddc4bdcf082..ff995f38196 100644
--- a/src/tools/miri/tests/pass/dyn-upcast.rs
+++ b/src/tools/miri/tests/pass/dyn-upcast.rs
@@ -69,7 +69,7 @@ fn basic() {
}
let baz: &dyn Baz = &1;
- let _: &dyn fmt::Debug = baz;
+ let _up: &dyn fmt::Debug = baz;
assert_eq!(*baz, 1);
assert_eq!(baz.a(), 100);
assert_eq!(baz.b(), 200);
@@ -79,7 +79,7 @@ fn basic() {
assert_eq!(baz.w(), 21);
let bar: &dyn Bar = baz;
- let _: &dyn fmt::Debug = bar;
+ let _up: &dyn fmt::Debug = bar;
assert_eq!(*bar, 1);
assert_eq!(bar.a(), 100);
assert_eq!(bar.b(), 200);
@@ -88,14 +88,14 @@ fn basic() {
assert_eq!(bar.w(), 21);
let foo: &dyn Foo = baz;
- let _: &dyn fmt::Debug = foo;
+ let _up: &dyn fmt::Debug = foo;
assert_eq!(*foo, 1);
assert_eq!(foo.a(), 100);
assert_eq!(foo.z(), 11);
assert_eq!(foo.y(), 12);
let foo: &dyn Foo = bar;
- let _: &dyn fmt::Debug = foo;
+ let _up: &dyn fmt::Debug = foo;
assert_eq!(*foo, 1);
assert_eq!(foo.a(), 100);
assert_eq!(foo.z(), 11);
@@ -168,7 +168,7 @@ fn diamond() {
}
let baz: &dyn Baz = &1;
- let _: &dyn fmt::Debug = baz;
+ let _up: &dyn fmt::Debug = baz;
assert_eq!(*baz, 1);
assert_eq!(baz.a(), 100);
assert_eq!(baz.b(), 200);
@@ -180,7 +180,7 @@ fn diamond() {
assert_eq!(baz.v(), 31);
let bar1: &dyn Bar1 = baz;
- let _: &dyn fmt::Debug = bar1;
+ let _up: &dyn fmt::Debug = bar1;
assert_eq!(*bar1, 1);
assert_eq!(bar1.a(), 100);
assert_eq!(bar1.b(), 200);
@@ -189,7 +189,7 @@ fn diamond() {
assert_eq!(bar1.w(), 21);
let bar2: &dyn Bar2 = baz;
- let _: &dyn fmt::Debug = bar2;
+ let _up: &dyn fmt::Debug = bar2;
assert_eq!(*bar2, 1);
assert_eq!(bar2.a(), 100);
assert_eq!(bar2.c(), 300);
@@ -198,17 +198,17 @@ fn diamond() {
assert_eq!(bar2.v(), 31);
let foo: &dyn Foo = baz;
- let _: &dyn fmt::Debug = foo;
+ let _up: &dyn fmt::Debug = foo;
assert_eq!(*foo, 1);
assert_eq!(foo.a(), 100);
let foo: &dyn Foo = bar1;
- let _: &dyn fmt::Debug = foo;
+ let _up: &dyn fmt::Debug = foo;
assert_eq!(*foo, 1);
assert_eq!(foo.a(), 100);
let foo: &dyn Foo = bar2;
- let _: &dyn fmt::Debug = foo;
+ let _up: &dyn fmt::Debug = foo;
assert_eq!(*foo, 1);
assert_eq!(foo.a(), 100);
}
diff --git a/src/tools/miri/tests/pass/intrinsics-x86-avx2.rs b/src/tools/miri/tests/pass/intrinsics-x86-avx2.rs
new file mode 100644
index 00000000000..80d125bb856
--- /dev/null
+++ b/src/tools/miri/tests/pass/intrinsics-x86-avx2.rs
@@ -0,0 +1,1613 @@
+// Ignore everything except x86 and x86_64
+// Any new targets that are added to CI should be ignored here.
+// (We cannot use `cfg`-based tricks here since the `target-feature` flags below only work on x86.)
+//@ignore-target-aarch64
+//@ignore-target-arm
+//@ignore-target-avr
+//@ignore-target-s390x
+//@ignore-target-thumbv7em
+//@ignore-target-wasm32
+//@compile-flags: -C target-feature=+avx2
+
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+use std::mem::transmute;
+
+fn main() {
+ assert!(is_x86_feature_detected!("avx2"));
+
+ unsafe {
+ test_avx2();
+ }
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn test_avx2() {
+ // Mostly copied from library/stdarch/crates/core_arch/src/x86/avx2.rs
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_abs_epi32() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi32(
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ let r = _mm256_abs_epi32(a);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi32(
+ 0, 1, 1, i32::MAX,
+ i32::MAX.wrapping_add(1), 100, 100, 32,
+ );
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_abs_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_abs_epi16() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi16(
+ 0, 1, -1, 2, -2, 3, -3, 4,
+ -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
+ );
+ let r = _mm256_abs_epi16(a);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi16(
+ 0, 1, 1, 2, 2, 3, 3, 4,
+ 4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
+ );
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_abs_epi16();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_abs_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 0, 1, -1, 2, -2, 3, -3, 4,
+ -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
+ 0, 1, -1, 2, -2, 3, -3, 4,
+ -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
+ );
+ let r = _mm256_abs_epi8(a);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 0, 1, 1, 2, 2, 3, 3, 4,
+ 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
+ 0, 1, 1, 2, 2, 3, 3, 4,
+ 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
+ );
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_abs_epi8();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_hadd_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_hadd_epi16(a, b);
+ let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
+ assert_eq_m256i(r, e);
+
+ // Test wrapping on overflow
+ let a = _mm256_setr_epi16(
+ i16::MAX,
+ 1,
+ i16::MAX,
+ 2,
+ i16::MAX,
+ 3,
+ i16::MAX,
+ 4,
+ i16::MAX,
+ 5,
+ i16::MAX,
+ 6,
+ i16::MAX,
+ 7,
+ i16::MAX,
+ 8,
+ );
+ let b = _mm256_setr_epi16(
+ i16::MIN,
+ -1,
+ i16::MIN,
+ -2,
+ i16::MIN,
+ -3,
+ i16::MIN,
+ -4,
+ i16::MIN,
+ -5,
+ i16::MIN,
+ -6,
+ i16::MIN,
+ -7,
+ i16::MIN,
+ -8,
+ );
+ let expected = _mm256_setr_epi16(
+ i16::MIN,
+ i16::MIN + 1,
+ i16::MIN + 2,
+ i16::MIN + 3,
+ i16::MAX,
+ i16::MAX - 1,
+ i16::MAX - 2,
+ i16::MAX - 3,
+ i16::MIN + 4,
+ i16::MIN + 5,
+ i16::MIN + 6,
+ i16::MIN + 7,
+ i16::MAX - 4,
+ i16::MAX - 5,
+ i16::MAX - 6,
+ i16::MAX - 7,
+ );
+ let r = _mm256_hadd_epi16(a, b);
+ assert_eq_m256i(r, expected);
+ }
+ test_mm256_hadd_epi16();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_hadd_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_hadd_epi32(a, b);
+ let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
+ assert_eq_m256i(r, e);
+
+ // Test wrapping on overflow
+ let a = _mm256_setr_epi32(i32::MAX, 1, i32::MAX, 2, i32::MAX, 3, i32::MAX, 4);
+ let b = _mm256_setr_epi32(i32::MIN, -1, i32::MIN, -2, i32::MIN, -3, i32::MIN, -4);
+ let expected = _mm256_setr_epi32(
+ i32::MIN,
+ i32::MIN + 1,
+ i32::MAX,
+ i32::MAX - 1,
+ i32::MIN + 2,
+ i32::MIN + 3,
+ i32::MAX - 2,
+ i32::MAX - 3,
+ );
+ let r = _mm256_hadd_epi32(a, b);
+ assert_eq_m256i(r, expected);
+ }
+ test_mm256_hadd_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_hadds_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let a = _mm256_insert_epi16::<0>(a, 0x7fff);
+ let a = _mm256_insert_epi16::<1>(a, 1);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_hadds_epi16(a, b);
+ let e = _mm256_setr_epi16(0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
+ assert_eq_m256i(r, e);
+
+ // Test saturating on overflow
+ let a = _mm256_setr_epi16(
+ i16::MAX,
+ 1,
+ i16::MAX,
+ 2,
+ i16::MAX,
+ 3,
+ i16::MAX,
+ 4,
+ i16::MAX,
+ 5,
+ i16::MAX,
+ 6,
+ i16::MAX,
+ 7,
+ i16::MAX,
+ 8,
+ );
+ let b = _mm256_setr_epi16(
+ i16::MIN,
+ -1,
+ i16::MIN,
+ -2,
+ i16::MIN,
+ -3,
+ i16::MIN,
+ -4,
+ i16::MIN,
+ -5,
+ i16::MIN,
+ -6,
+ i16::MIN,
+ -7,
+ i16::MIN,
+ -8,
+ );
+ let expected = _mm256_setr_epi16(
+ i16::MAX,
+ i16::MAX,
+ i16::MAX,
+ i16::MAX,
+ i16::MIN,
+ i16::MIN,
+ i16::MIN,
+ i16::MIN,
+ i16::MAX,
+ i16::MAX,
+ i16::MAX,
+ i16::MAX,
+ i16::MIN,
+ i16::MIN,
+ i16::MIN,
+ i16::MIN,
+ );
+ let r = _mm256_hadds_epi16(a, b);
+ assert_eq_m256i(r, expected);
+ }
+ test_mm256_hadds_epi16();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_hsub_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_hsub_epi16(a, b);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+
+ // Test wrapping on overflow
+ let a = _mm256_setr_epi16(
+ i16::MAX,
+ -1,
+ i16::MAX,
+ -2,
+ i16::MAX,
+ -3,
+ i16::MAX,
+ -4,
+ i16::MAX,
+ -5,
+ i16::MAX,
+ -6,
+ i16::MAX,
+ -7,
+ i16::MAX,
+ -8,
+ );
+ let b = _mm256_setr_epi16(
+ i16::MIN,
+ 1,
+ i16::MIN,
+ 2,
+ i16::MIN,
+ 3,
+ i16::MIN,
+ 4,
+ i16::MIN,
+ 5,
+ i16::MIN,
+ 6,
+ i16::MIN,
+ 7,
+ i16::MIN,
+ 8,
+ );
+ let expected = _mm256_setr_epi16(
+ i16::MIN,
+ i16::MIN + 1,
+ i16::MIN + 2,
+ i16::MIN + 3,
+ i16::MAX,
+ i16::MAX - 1,
+ i16::MAX - 2,
+ i16::MAX - 3,
+ i16::MIN + 4,
+ i16::MIN + 5,
+ i16::MIN + 6,
+ i16::MIN + 7,
+ i16::MAX - 4,
+ i16::MAX - 5,
+ i16::MAX - 6,
+ i16::MAX - 7,
+ );
+ let r = _mm256_hsub_epi16(a, b);
+ assert_eq_m256i(r, expected);
+ }
+ test_mm256_hsub_epi16();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_hsub_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_hsub_epi32(a, b);
+ let e = _mm256_set1_epi32(0);
+ assert_eq_m256i(r, e);
+
+ // Test wrapping on overflow
+ let a = _mm256_setr_epi32(i32::MAX, -1, i32::MAX, -2, i32::MAX, -3, i32::MAX, -4);
+ let b = _mm256_setr_epi32(i32::MIN, 1, i32::MIN, 2, i32::MIN, 3, i32::MIN, 4);
+ let expected = _mm256_setr_epi32(
+ i32::MIN,
+ i32::MIN + 1,
+ i32::MAX,
+ i32::MAX - 1,
+ i32::MIN + 2,
+ i32::MIN + 3,
+ i32::MAX - 2,
+ i32::MAX - 3,
+ );
+ let r = _mm256_hsub_epi32(a, b);
+ assert_eq_m256i(r, expected);
+ }
+ test_mm256_hsub_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_hsubs_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let a = _mm256_insert_epi16::<0>(a, 0x7fff);
+ let a = _mm256_insert_epi16::<1>(a, -1);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_hsubs_epi16(a, b);
+ let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF);
+ assert_eq_m256i(r, e);
+
+ // Test saturating on overflow
+ let a = _mm256_setr_epi16(
+ i16::MAX,
+ -1,
+ i16::MAX,
+ -2,
+ i16::MAX,
+ -3,
+ i16::MAX,
+ -4,
+ i16::MAX,
+ -5,
+ i16::MAX,
+ -6,
+ i16::MAX,
+ -7,
+ i16::MAX,
+ -8,
+ );
+ let b = _mm256_setr_epi16(
+ i16::MIN,
+ 1,
+ i16::MIN,
+ 2,
+ i16::MIN,
+ 3,
+ i16::MIN,
+ 4,
+ i16::MIN,
+ 5,
+ i16::MIN,
+ 6,
+ i16::MIN,
+ 7,
+ i16::MIN,
+ 8,
+ );
+ let expected = _mm256_setr_epi16(
+ i16::MAX,
+ i16::MAX,
+ i16::MAX,
+ i16::MAX,
+ i16::MIN,
+ i16::MIN,
+ i16::MIN,
+ i16::MIN,
+ i16::MAX,
+ i16::MAX,
+ i16::MAX,
+ i16::MAX,
+ i16::MIN,
+ i16::MIN,
+ i16::MIN,
+ i16::MIN,
+ );
+ let r = _mm256_hsubs_epi16(a, b);
+ assert_eq_m256i(r, expected);
+ }
+ test_mm256_hsubs_epi16();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_i32gather_epi32() {
+ let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
+ // A multiplier of 4 is word-addressing
+ let r = _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
+ assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
+ }
+ test_mm_i32gather_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_mask_i32gather_epi32() {
+ let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
+ // A multiplier of 4 is word-addressing
+ let r = _mm_mask_i32gather_epi32::<4>(
+ _mm_set1_epi32(256),
+ arr.as_ptr(),
+ _mm_setr_epi32(0, 16, 64, 96),
+ _mm_setr_epi32(-1, -1, -1, 0),
+ );
+ assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
+ }
+ test_mm_mask_i32gather_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_i32gather_epi32() {
+ let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
+ // A multiplier of 4 is word-addressing
+ let r =
+ _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
+ assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
+ }
+ test_mm256_i32gather_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_mask_i32gather_epi32() {
+ let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
+ // A multiplier of 4 is word-addressing
+ let r = _mm256_mask_i32gather_epi32::<4>(
+ _mm256_set1_epi32(256),
+ arr.as_ptr(),
+ _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
+ _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
+ );
+ assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
+ }
+ test_mm256_mask_i32gather_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_i32gather_ps() {
+ let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
+ assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
+ }
+ test_mm_i32gather_ps();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_mask_i32gather_ps() {
+ let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm_mask_i32gather_ps::<4>(
+ _mm_set1_ps(256.0),
+ arr.as_ptr(),
+ _mm_setr_epi32(0, 16, 64, 96),
+ _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
+ );
+ assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
+ }
+ test_mm_mask_i32gather_ps();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_i32gather_ps() {
+ let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
+ // A multiplier of 4 is word-addressing for f32s
+ let r =
+ _mm256_i32gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
+ assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
+ }
+ test_mm256_i32gather_ps();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_mask_i32gather_ps() {
+ let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm256_mask_i32gather_ps::<4>(
+ _mm256_set1_ps(256.0),
+ arr.as_ptr(),
+ _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
+ _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
+ );
+ assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0));
+ }
+ test_mm256_mask_i32gather_ps();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_i32gather_epi64() {
+ let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
+ assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
+ }
+ test_mm_i32gather_epi64();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_mask_i32gather_epi64() {
+ let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm_mask_i32gather_epi64::<8>(
+ _mm_set1_epi64x(256),
+ arr.as_ptr(),
+ _mm_setr_epi32(16, 16, 16, 16),
+ _mm_setr_epi64x(-1, 0),
+ );
+ assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
+ }
+ test_mm_mask_i32gather_epi64();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_i32gather_epi64() {
+ let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
+ assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
+ }
+ test_mm256_i32gather_epi64();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_mask_i32gather_epi64() {
+ let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm256_mask_i32gather_epi64::<8>(
+ _mm256_set1_epi64x(256),
+ arr.as_ptr(),
+ _mm_setr_epi32(0, 16, 64, 96),
+ _mm256_setr_epi64x(-1, -1, -1, 0),
+ );
+ assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
+ }
+ test_mm256_mask_i32gather_epi64();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_i32gather_pd() {
+ let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
+ assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
+ }
+ test_mm_i32gather_pd();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_mask_i32gather_pd() {
+ let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm_mask_i32gather_pd::<8>(
+ _mm_set1_pd(256.0),
+ arr.as_ptr(),
+ _mm_setr_epi32(16, 16, 16, 16),
+ _mm_setr_pd(-1.0, 0.0),
+ );
+ assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
+ }
+ test_mm_mask_i32gather_pd();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_i32gather_pd() {
+ let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
+ assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
+ }
+ test_mm256_i32gather_pd();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_mask_i32gather_pd() {
+ let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm256_mask_i32gather_pd::<8>(
+ _mm256_set1_pd(256.0),
+ arr.as_ptr(),
+ _mm_setr_epi32(0, 16, 64, 96),
+ _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
+ );
+ assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
+ }
+ test_mm256_mask_i32gather_pd();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_i64gather_epi32() {
+ let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
+ // A multiplier of 4 is word-addressing
+ let r = _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
+ assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
+ }
+ test_mm_i64gather_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_mask_i64gather_epi32() {
+ let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
+ // A multiplier of 4 is word-addressing
+ let r = _mm_mask_i64gather_epi32::<4>(
+ _mm_set1_epi32(256),
+ arr.as_ptr(),
+ _mm_setr_epi64x(0, 16),
+ _mm_setr_epi32(-1, 0, -1, 0),
+ );
+ assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
+ }
+ test_mm_mask_i64gather_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_i64gather_epi32() {
+ let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
+ // A multiplier of 4 is word-addressing
+ let r = _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
+ assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
+ }
+ test_mm256_i64gather_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_mask_i64gather_epi32() {
+ let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
+ // A multiplier of 4 is word-addressing
+ let r = _mm256_mask_i64gather_epi32::<4>(
+ _mm_set1_epi32(256),
+ arr.as_ptr(),
+ _mm256_setr_epi64x(0, 16, 64, 96),
+ _mm_setr_epi32(-1, -1, -1, 0),
+ );
+ assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
+ }
+ test_mm256_mask_i64gather_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_i64gather_ps() {
+ let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
+ assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
+ }
+ test_mm_i64gather_ps();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_mask_i64gather_ps() {
+ let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm_mask_i64gather_ps::<4>(
+ _mm_set1_ps(256.0),
+ arr.as_ptr(),
+ _mm_setr_epi64x(0, 16),
+ _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
+ );
+ assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
+ }
+ test_mm_mask_i64gather_ps();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_i64gather_ps() {
+ let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
+ assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
+ }
+ test_mm256_i64gather_ps();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_mask_i64gather_ps() {
+ let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm256_mask_i64gather_ps::<4>(
+ _mm_set1_ps(256.0),
+ arr.as_ptr(),
+ _mm256_setr_epi64x(0, 16, 64, 96),
+ _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
+ );
+ assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
+ }
+ test_mm256_mask_i64gather_ps();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_i64gather_epi64() {
+ let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
+ assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
+ }
+ test_mm_i64gather_epi64();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_mask_i64gather_epi64() {
+ let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm_mask_i64gather_epi64::<8>(
+ _mm_set1_epi64x(256),
+ arr.as_ptr(),
+ _mm_setr_epi64x(16, 16),
+ _mm_setr_epi64x(-1, 0),
+ );
+ assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
+ }
+ test_mm_mask_i64gather_epi64();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_i64gather_epi64() {
+ let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
+ assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
+ }
+ test_mm256_i64gather_epi64();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_mask_i64gather_epi64() {
+ let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm256_mask_i64gather_epi64::<8>(
+ _mm256_set1_epi64x(256),
+ arr.as_ptr(),
+ _mm256_setr_epi64x(0, 16, 64, 96),
+ _mm256_setr_epi64x(-1, -1, -1, 0),
+ );
+ assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
+ }
+ test_mm256_mask_i64gather_epi64();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_i64gather_pd() {
+ let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
+ assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
+ }
+ test_mm_i64gather_pd();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_mask_i64gather_pd() {
+ let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm_mask_i64gather_pd::<8>(
+ _mm_set1_pd(256.0),
+ arr.as_ptr(),
+ _mm_setr_epi64x(16, 16),
+ _mm_setr_pd(-1.0, 0.0),
+ );
+ assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
+ }
+ test_mm_mask_i64gather_pd();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_i64gather_pd() {
+ let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
+ assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
+ }
+ test_mm256_i64gather_pd();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_mask_i64gather_pd() {
+ let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm256_mask_i64gather_pd::<8>(
+ _mm256_set1_pd(256.0),
+ arr.as_ptr(),
+ _mm256_setr_epi64x(0, 16, 64, 96),
+ _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
+ );
+ assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
+ }
+ test_mm256_mask_i64gather_pd();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_madd_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_madd_epi16(a, b);
+ let e = _mm256_set1_epi32(16);
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_madd_epi16();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_maddubs_epi16() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
+ let r = _mm256_maddubs_epi16(a, b);
+ let e = _mm256_set1_epi16(16);
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_maddubs_epi16();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_maskload_epi32() {
+ let nums = [1, 2, 3, 4];
+ let a = &nums as *const i32;
+ let mask = _mm_setr_epi32(-1, 0, 0, -1);
+ let r = _mm_maskload_epi32(a, mask);
+ let e = _mm_setr_epi32(1, 0, 0, 4);
+ assert_eq_m128i(r, e);
+
+ // Unaligned pointer
+ let a = Unaligned::new([1i32, 2, 3, 4]);
+ let mask = _mm_setr_epi32(0, !0, 0, !0);
+ let r = _mm_maskload_epi32(a.as_ptr().cast(), mask);
+ let e = _mm_setr_epi32(0, 2, 0, 4);
+ assert_eq_m128i(r, e);
+
+ // Only loading first element, so slice can be short.
+ let a = &[2i32];
+ let mask = _mm_setr_epi32(!0, 0, 0, 0);
+ let r = _mm_maskload_epi32(a.as_ptr(), mask);
+ let e = _mm_setr_epi32(2, 0, 0, 0);
+ assert_eq_m128i(r, e);
+
+ // Only loading last element, so slice can be short.
+ let a = &[2i32];
+ let mask = _mm_setr_epi32(0, 0, 0, !0);
+ let r = _mm_maskload_epi32(a.as_ptr().wrapping_sub(3), mask);
+ let e = _mm_setr_epi32(0, 0, 0, 2);
+ assert_eq_m128i(r, e);
+ }
+ test_mm_maskload_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_maskload_epi32() {
+ let nums = [1, 2, 3, 4, 5, 6, 7, 8];
+ let a = &nums as *const i32;
+ let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
+ let r = _mm256_maskload_epi32(a, mask);
+ let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
+ assert_eq_m256i(r, e);
+
+ // Unaligned pointer
+ let a = Unaligned::new([1i32, 2, 3, 4, 5, 6, 7, 8]);
+ let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
+ let r = _mm256_maskload_epi32(a.as_ptr().cast(), mask);
+ let e = _mm256_setr_epi32(0, 2, 0, 4, 0, 6, 0, 8);
+ assert_eq_m256i(r, e);
+
+ // Only loading first element, so slice can be short.
+ let a = &[2i32];
+ let mask = _mm256_setr_epi32(!0, 0, 0, 0, 0, 0, 0, 0);
+ let r = _mm256_maskload_epi32(a.as_ptr(), mask);
+ let e = _mm256_setr_epi32(2, 0, 0, 0, 0, 0, 0, 0);
+ assert_eq_m256i(r, e);
+
+ // Only loading last element, so slice can be short.
+ let a = &[2i32];
+ let mask = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, !0);
+ let r = _mm256_maskload_epi32(a.as_ptr().wrapping_sub(7), mask);
+ let e = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 2);
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_maskload_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_maskload_epi64() {
+ let nums = [1_i64, 2_i64];
+ let a = &nums as *const i64;
+ let mask = _mm_setr_epi64x(0, -1);
+ let r = _mm_maskload_epi64(a, mask);
+ let e = _mm_setr_epi64x(0, 2);
+ assert_eq_m128i(r, e);
+
+ // Unaligned pointer
+ let a = Unaligned::new([1i64, 2]);
+ let mask = _mm_setr_epi64x(0, !0);
+ let r = _mm_maskload_epi64(a.as_ptr().cast(), mask);
+ let e = _mm_setr_epi64x(0, 2);
+ assert_eq_m128i(r, e);
+
+ // Only loading first element, so slice can be short.
+ let a = &[2i64];
+ let mask = _mm_setr_epi64x(!0, 0);
+ let r = _mm_maskload_epi64(a.as_ptr(), mask);
+ let e = _mm_setr_epi64x(2, 0);
+ assert_eq_m128i(r, e);
+
+ // Only loading last element, so slice can be short.
+ let a = &[2i64];
+ let mask = _mm_setr_epi64x(0, !0);
+ let r = _mm_maskload_epi64(a.as_ptr().wrapping_sub(1), mask);
+ let e = _mm_setr_epi64x(0, 2);
+ assert_eq_m128i(r, e);
+ }
+ test_mm_maskload_epi64();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_maskload_epi64() {
+ let nums = [1_i64, 2_i64, 3_i64, 4_i64];
+ let a = &nums as *const i64;
+ let mask = _mm256_setr_epi64x(0, -1, -1, 0);
+ let r = _mm256_maskload_epi64(a, mask);
+ let e = _mm256_setr_epi64x(0, 2, 3, 0);
+ assert_eq_m256i(r, e);
+
+ // Unaligned pointer
+ let a = Unaligned::new([1i64, 2, 3, 4]);
+ let mask = _mm256_setr_epi64x(0, !0, 0, !0);
+ let r = _mm256_maskload_epi64(a.as_ptr().cast(), mask);
+ let e = _mm256_setr_epi64x(0, 2, 0, 4);
+ assert_eq_m256i(r, e);
+
+ // Only loading first element, so slice can be short.
+ let a = &[2i64];
+ let mask = _mm256_setr_epi64x(!0, 0, 0, 0);
+ let r = _mm256_maskload_epi64(a.as_ptr(), mask);
+ let e = _mm256_setr_epi64x(2, 0, 0, 0);
+ assert_eq_m256i(r, e);
+
+ // Only loading last element, so slice can be short.
+ let a = &[2i64];
+ let mask = _mm256_setr_epi64x(0, 0, 0, !0);
+ let r = _mm256_maskload_epi64(a.as_ptr().wrapping_sub(3), mask);
+ let e = _mm256_setr_epi64x(0, 0, 0, 2);
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_maskload_epi64();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_maskstore_epi32() {
+ let a = _mm_setr_epi32(1, 2, 3, 4);
+ let mut arr = [-1, -1, -1, -1];
+ let mask = _mm_setr_epi32(-1, 0, 0, -1);
+ _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
+ let e = [1, -1, -1, 4];
+ assert_eq!(arr, e);
+
+ // Unaligned pointer
+ let mut r = Unaligned::new([0i32; 4]);
+ let mask = _mm_setr_epi32(0, !0, 0, !0);
+ let a = _mm_setr_epi32(1, 2, 3, 4);
+ _mm_maskstore_epi32(r.as_mut_ptr().cast(), mask, a);
+ let e = [0i32, 2, 0, 4];
+ assert_eq!(r.read(), e);
+
+ // Only storing first element, so slice can be short.
+ let mut r = [0i32];
+ let mask = _mm_setr_epi32(!0, 0, 0, 0);
+ let a = _mm_setr_epi32(1, 2, 3, 4);
+ _mm_maskstore_epi32(r.as_mut_ptr(), mask, a);
+ let e = [1i32];
+ assert_eq!(r, e);
+
+ // Only storing last element, so slice can be short.
+ let mut r = [0i32];
+ let mask = _mm_setr_epi32(0, 0, 0, !0);
+ let a = _mm_setr_epi32(1, 2, 3, 4);
+ _mm_maskstore_epi32(r.as_mut_ptr().wrapping_sub(3), mask, a);
+ let e = [4i32];
+ assert_eq!(r, e);
+ }
+ test_mm_maskstore_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_maskstore_epi32() {
+ let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
+ let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
+ let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
+ _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
+ let e = [1, -1, -1, 42, -1, 6, 7, -1];
+ assert_eq!(arr, e);
+
+ // Unaligned pointer
+ let mut r = Unaligned::new([0i32; 8]);
+ let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
+ let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ _mm256_maskstore_epi32(r.as_mut_ptr().cast(), mask, a);
+ let e = [0i32, 2, 0, 4, 0, 6, 0, 8];
+ assert_eq!(r.read(), e);
+
+ // Only storing first element, so slice can be short.
+ let mut r = [0i32];
+ let mask = _mm256_setr_epi32(!0, 0, 0, 0, 0, 0, 0, 0);
+ let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ _mm256_maskstore_epi32(r.as_mut_ptr(), mask, a);
+ let e = [1i32];
+ assert_eq!(r, e);
+
+ // Only storing last element, so slice can be short.
+ let mut r = [0i32];
+ let mask = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, !0);
+ let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ _mm256_maskstore_epi32(r.as_mut_ptr().wrapping_sub(7), mask, a);
+ let e = [8i32];
+ assert_eq!(r, e);
+ }
+ test_mm256_maskstore_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_maskstore_epi64() {
+ let a = _mm_setr_epi64x(1_i64, 2_i64);
+ let mut arr = [-1_i64, -1_i64];
+ let mask = _mm_setr_epi64x(0, -1);
+ _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
+ let e = [-1, 2];
+ assert_eq!(arr, e);
+
+ // Unaligned pointer
+ let mut r = Unaligned::new([0i64; 2]);
+ let mask = _mm_setr_epi64x(0, !0);
+ let a = _mm_setr_epi64x(1, 2);
+ _mm_maskstore_epi64(r.as_mut_ptr().cast(), mask, a);
+ let e = [0i64, 2];
+ assert_eq!(r.read(), e);
+
+ // Only storing first element, so slice can be short.
+ let mut r = [0i64];
+ let mask = _mm_setr_epi64x(!0, 0);
+ let a = _mm_setr_epi64x(1, 2);
+ _mm_maskstore_epi64(r.as_mut_ptr(), mask, a);
+ let e = [1i64];
+ assert_eq!(r, e);
+
+ // Only storing last element, so slice can be short.
+ let mut r = [0i64];
+ let mask = _mm_setr_epi64x(0, !0);
+ let a = _mm_setr_epi64x(1, 2);
+ _mm_maskstore_epi64(r.as_mut_ptr().wrapping_sub(1), mask, a);
+ let e = [2i64];
+ assert_eq!(r, e);
+ }
+ test_mm_maskstore_epi64();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_maskstore_epi64() {
+ let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
+ let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
+ let mask = _mm256_setr_epi64x(0, -1, -1, 0);
+ _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
+ let e = [-1, 2, 3, -1];
+ assert_eq!(arr, e);
+
+ // Unaligned pointer
+ let mut r = Unaligned::new([0i64; 4]);
+ let mask = _mm256_setr_epi64x(0, !0, 0, !0);
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ _mm256_maskstore_epi64(r.as_mut_ptr().cast(), mask, a);
+ let e = [0i64, 2, 0, 4];
+ assert_eq!(r.read(), e);
+
+ // Only storing first element, so slice can be short.
+ let mut r = [0i64];
+ let mask = _mm256_setr_epi64x(!0, 0, 0, 0);
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ _mm256_maskstore_epi64(r.as_mut_ptr(), mask, a);
+ let e = [1i64];
+ assert_eq!(r, e);
+
+ // Only storing last element, so slice can be short.
+ let mut r = [0i64];
+ let mask = _mm256_setr_epi64x(0, 0, 0, !0);
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ _mm256_maskstore_epi64(r.as_mut_ptr().wrapping_sub(3), mask, a);
+ let e = [4i64];
+ assert_eq!(r, e);
+ }
+ test_mm256_maskstore_epi64();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_mpsadbw_epu8() {
+ let a = _mm256_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 2, 4, 6, 8, 10, 12, 14, 16,
+ 18, 20, 22, 24, 26, 28, 30,
+ );
+
+ let r = _mm256_mpsadbw_epu8::<0b000>(a, a);
+ let e = _mm256_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28, 0, 8, 16, 24, 32, 40, 48, 56);
+ assert_eq_m256i(r, e);
+
+ let r = _mm256_mpsadbw_epu8::<0b001>(a, a);
+ let e = _mm256_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12, 32, 24, 16, 8, 0, 8, 16, 24);
+ assert_eq_m256i(r, e);
+
+ let r = _mm256_mpsadbw_epu8::<0b100>(a, a);
+ let e = _mm256_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44, 32, 40, 48, 56, 64, 72, 80, 88);
+ assert_eq_m256i(r, e);
+
+ let r = _mm256_mpsadbw_epu8::<0b101>(a, a);
+ let e = _mm256_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28, 0, 8, 16, 24, 32, 40, 48, 56);
+ assert_eq_m256i(r, e);
+
+ let r = _mm256_mpsadbw_epu8::<0b111>(a, a);
+ let e = _mm256_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4, 64, 56, 48, 40, 32, 24, 16, 8);
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_mpsadbw_epu8();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_mulhrs_epi16() {
+        let a = _mm256_set1_epi16(16384);
+        let b = _mm256_set1_epi16(8192);
+        let r = _mm256_mulhrs_epi16(a, b);
+        let e = _mm256_set1_epi16(4096);
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_mulhrs_epi16();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_packs_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_packs_epi16(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ );
+
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_packs_epi16();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_packs_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_packs_epi32(a, b);
+ let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
+
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_packs_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_packus_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_packus_epi16(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ );
+
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_packus_epi16();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_packus_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_packus_epi32(a, b);
+ let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
+
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_packus_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_permutevar8x32_epi32() {
+ let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
+ let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
+ let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
+ let r = _mm256_permutevar8x32_epi32(a, b);
+ assert_eq_m256i(r, expected);
+ }
+ test_mm256_permutevar8x32_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_permute2x128_si256() {
+ let a = _mm256_setr_epi64x(100, 200, 500, 600);
+ let b = _mm256_setr_epi64x(300, 400, 700, 800);
+ let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
+ let e = _mm256_setr_epi64x(700, 800, 500, 600);
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_permute2x128_si256();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_permutevar8x32_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
+ let r = _mm256_permutevar8x32_ps(a, b);
+ let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
+ assert_eq_m256(r, e);
+ }
+ test_mm256_permutevar8x32_ps();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_sad_epu8() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
+ let r = _mm256_sad_epu8(a, b);
+ let e = _mm256_set1_epi64x(16);
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_sad_epu8();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_shuffle_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi8(
+ 4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
+ 12, 5, 5, 10, 4, 1, 8, 0,
+ 4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
+ 12, 5, 5, 10, 4, 1, 8, 0,
+ );
+ #[rustfmt::skip]
+ let expected = _mm256_setr_epi8(
+ 5, 0, 5, 4, 9, 13, 7, 4,
+ 13, 6, 6, 11, 5, 2, 9, 1,
+ 21, 0, 21, 20, 25, 29, 23, 20,
+ 29, 22, 22, 27, 21, 18, 25, 17,
+ );
+ let r = _mm256_shuffle_epi8(a, b);
+ assert_eq_m256i(r, expected);
+ }
+ test_mm256_shuffle_epi8();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_sign_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(-1);
+ let r = _mm256_sign_epi16(a, b);
+ let e = _mm256_set1_epi16(-2);
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_sign_epi16();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_sign_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(-1);
+ let r = _mm256_sign_epi32(a, b);
+ let e = _mm256_set1_epi32(-2);
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_sign_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_sign_epi8() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(-1);
+ let r = _mm256_sign_epi8(a, b);
+ let e = _mm256_set1_epi8(-2);
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_sign_epi8();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_sll_epi16() {
+ let a = _mm256_setr_epi16(
+ 0x88, -0x88, 0x99, -0x99, 0xAA, -0xAA, 0xBB, -0xBB, 0xCC, -0xCC, 0xDD, -0xDD, 0xEE,
+ -0xEE, 0xFF, -0xFF,
+ );
+ let r = _mm256_sll_epi16(a, _mm_set_epi64x(0, 4));
+ assert_eq_m256i(
+ r,
+ _mm256_setr_epi16(
+ 0x880, -0x880, 0x990, -0x990, 0xAA0, -0xAA0, 0xBB0, -0xBB0, 0xCC0, -0xCC0, 0xDD0,
+ -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0,
+ ),
+ );
+ let r = _mm256_sll_epi16(a, _mm_set_epi64x(4, 0));
+ assert_eq_m256i(r, a);
+ let r = _mm256_sll_epi16(a, _mm_set_epi64x(0, 16));
+ assert_eq_m256i(r, _mm256_set1_epi16(0));
+ let r = _mm256_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m256i(r, _mm256_set1_epi16(0));
+ }
+ test_mm256_sll_epi16();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_sll_epi32() {
+ let a =
+ _mm256_setr_epi32(0xCCCC, -0xCCCC, 0xDDDD, -0xDDDD, 0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm256_sll_epi32(a, _mm_set_epi64x(0, 4));
+ assert_eq_m256i(
+ r,
+ _mm256_setr_epi32(
+ 0xCCCC0, -0xCCCC0, 0xDDDD0, -0xDDDD0, 0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0,
+ ),
+ );
+ let r = _mm256_sll_epi32(a, _mm_set_epi64x(4, 0));
+ assert_eq_m256i(r, a);
+ let r = _mm256_sll_epi32(a, _mm_set_epi64x(0, 32));
+ assert_eq_m256i(r, _mm256_set1_epi32(0));
+ let r = _mm256_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m256i(r, _mm256_set1_epi32(0));
+ }
+ test_mm256_sll_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_sll_epi64() {
+ let a = _mm256_set_epi64x(0xEEEEEEEE, -0xEEEEEEEE, 0xFFFFFFFF, -0xFFFFFFFF);
+ let r = _mm256_sll_epi64(a, _mm_set_epi64x(0, 4));
+ assert_eq_m256i(r, _mm256_set_epi64x(0xEEEEEEEE0, -0xEEEEEEEE0, 0xFFFFFFFF0, -0xFFFFFFFF0));
+ let r = _mm256_sll_epi64(a, _mm_set_epi64x(4, 0));
+ assert_eq_m256i(r, a);
+ let r = _mm256_sll_epi64(a, _mm_set_epi64x(0, 64));
+ assert_eq_m256i(r, _mm256_set1_epi64x(0));
+ let r = _mm256_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m256i(r, _mm256_set1_epi64x(0));
+ }
+ test_mm256_sll_epi64();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_sra_epi16() {
+ let a = _mm256_setr_epi16(
+ 0x88, -0x88, 0x99, -0x99, 0xAA, -0xAA, 0xBB, -0xBB, 0xCC, -0xCC, 0xDD, -0xDD, 0xEE,
+ -0xEE, 0xFF, -0xFF,
+ );
+ let r = _mm256_sra_epi16(a, _mm_set_epi64x(0, 4));
+ assert_eq_m256i(
+ r,
+ _mm256_setr_epi16(
+ 0x8, -0x9, 0x9, -0xA, 0xA, -0xB, 0xB, -0xC, 0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF,
+ -0x10,
+ ),
+ );
+ let r = _mm256_sra_epi16(a, _mm_set_epi64x(4, 0));
+ assert_eq_m256i(r, a);
+ let r = _mm256_sra_epi16(a, _mm_set_epi64x(0, 16));
+ assert_eq_m256i(
+ r,
+ _mm256_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1),
+ );
+ let r = _mm256_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m256i(
+ r,
+ _mm256_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1),
+ );
+ }
+ test_mm256_sra_epi16();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_sra_epi32() {
+ let a =
+ _mm256_setr_epi32(0xCCCC, -0xCCCC, 0xDDDD, -0xDDDD, 0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm256_sra_epi32(a, _mm_set_epi64x(0, 4));
+ assert_eq_m256i(
+ r,
+ _mm256_setr_epi32(0xCCC, -0xCCD, 0xDDD, -0xDDE, 0xEEE, -0xEEF, 0xFFF, -0x1000),
+ );
+ let r = _mm256_sra_epi32(a, _mm_set_epi64x(4, 0));
+ assert_eq_m256i(r, a);
+ let r = _mm256_sra_epi32(a, _mm_set_epi64x(0, 32));
+ assert_eq_m256i(r, _mm256_setr_epi32(0, -1, 0, -1, 0, -1, 0, -1));
+ let r = _mm256_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m256i(r, _mm256_setr_epi32(0, -1, 0, -1, 0, -1, 0, -1));
+ }
+ test_mm256_sra_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_srl_epi16() {
+ let a = _mm256_setr_epi16(
+ 0x88, -0x88, 0x99, -0x99, 0xAA, -0xAA, 0xBB, -0xBB, 0xCC, -0xCC, 0xDD, -0xDD, 0xEE,
+ -0xEE, 0xFF, -0xFF,
+ );
+ let r = _mm256_srl_epi16(a, _mm_set_epi64x(0, 4));
+ assert_eq_m256i(
+ r,
+ _mm256_setr_epi16(
+ 0x8, 0xFF7, 0x9, 0xFF6, 0xA, 0xFF5, 0xB, 0xFF4, 0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1,
+ 0xF, 0xFF0,
+ ),
+ );
+ let r = _mm256_srl_epi16(a, _mm_set_epi64x(4, 0));
+ assert_eq_m256i(r, a);
+ let r = _mm256_srl_epi16(a, _mm_set_epi64x(0, 16));
+ assert_eq_m256i(r, _mm256_set1_epi16(0));
+ let r = _mm256_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m256i(r, _mm256_set1_epi16(0));
+ }
+ test_mm256_srl_epi16();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_srl_epi32() {
+ let a =
+ _mm256_setr_epi32(0xCCCC, -0xCCCC, 0xDDDD, -0xDDDD, 0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm256_srl_epi32(a, _mm_set_epi64x(0, 4));
+ assert_eq_m256i(
+ r,
+ _mm256_setr_epi32(
+ 0xCCC, 0xFFFF333, 0xDDD, 0xFFFF222, 0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000,
+ ),
+ );
+ let r = _mm256_srl_epi32(a, _mm_set_epi64x(4, 0));
+ assert_eq_m256i(r, a);
+ let r = _mm256_srl_epi32(a, _mm_set_epi64x(0, 32));
+ assert_eq_m256i(r, _mm256_set1_epi32(0));
+ let r = _mm256_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m256i(r, _mm256_set1_epi32(0));
+ }
+ test_mm256_srl_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_srl_epi64() {
+ let a = _mm256_set_epi64x(0xEEEEEEEE, -0xEEEEEEEE, 0xFFFFFFFF, -0xFFFFFFFF);
+ let r = _mm256_srl_epi64(a, _mm_set_epi64x(0, 4));
+ assert_eq_m256i(
+ r,
+ _mm256_set_epi64x(0xEEEEEEE, 0xFFFFFFFF1111111, 0xFFFFFFF, 0xFFFFFFFF0000000),
+ );
+ let r = _mm256_srl_epi64(a, _mm_set_epi64x(4, 0));
+ assert_eq_m256i(r, a);
+ let r = _mm256_srl_epi64(a, _mm_set_epi64x(0, 64));
+ assert_eq_m256i(r, _mm256_set1_epi64x(0));
+ let r = _mm256_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m256i(r, _mm256_set1_epi64x(0));
+ }
+ test_mm256_srl_epi64();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_sllv_epi32() {
+ let a = _mm_set_epi32(1, 2, 3, 4);
+ let b = _mm_set_epi32(4, 3, 2, 1);
+ let r = _mm_sllv_epi32(a, b);
+ let e = _mm_set_epi32(16, 16, 12, 8);
+ assert_eq_m128i(r, e);
+ }
+ test_mm_sllv_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_sllv_epi32() {
+ let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let b = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
+ let r = _mm256_sllv_epi32(a, b);
+ let e = _mm256_set_epi32(256, 256, 192, 128, 80, 48, 28, 16);
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_sllv_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_sllv_epi64() {
+ let a = _mm_set_epi64x(2, 3);
+ let b = _mm_set_epi64x(1, 2);
+ let r = _mm_sllv_epi64(a, b);
+ let e = _mm_set_epi64x(4, 12);
+ assert_eq_m128i(r, e);
+ }
+ test_mm_sllv_epi64();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_sllv_epi64() {
+ let a = _mm256_set_epi64x(1, 2, 3, 4);
+ let b = _mm256_set_epi64x(4, 3, 2, 1);
+ let r = _mm256_sllv_epi64(a, b);
+ let e = _mm256_set_epi64x(16, 16, 12, 8);
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_sllv_epi64();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_srav_epi32() {
+ let a = _mm_set_epi32(16, -32, 64, -128);
+ let b = _mm_set_epi32(4, 3, 2, 1);
+ let r = _mm_srav_epi32(a, b);
+ let e = _mm_set_epi32(1, -4, 16, -64);
+ assert_eq_m128i(r, e);
+ }
+ test_mm_srav_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_srav_epi32() {
+ let a = _mm256_set_epi32(256, -512, 1024, -2048, 4096, -8192, 16384, -32768);
+ let b = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
+ let r = _mm256_srav_epi32(a, b);
+ let e = _mm256_set_epi32(1, -4, 16, -64, 256, -1024, 4096, -16384);
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_srav_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_srlv_epi32() {
+ let a = _mm_set_epi32(16, 32, 64, 128);
+ let b = _mm_set_epi32(4, 3, 2, 1);
+ let r = _mm_srlv_epi32(a, b);
+ let e = _mm_set_epi32(1, 4, 16, 64);
+ assert_eq_m128i(r, e);
+ }
+ test_mm_srlv_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_srlv_epi32() {
+ let a = _mm256_set_epi32(256, 512, 1024, 2048, 4096, 8192, 16384, 32768);
+ let b = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
+ let r = _mm256_srlv_epi32(a, b);
+ let e = _mm256_set_epi32(1, 4, 16, 64, 256, 1024, 4096, 16384);
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_srlv_epi32();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm_srlv_epi64() {
+ let a = _mm_set_epi64x(4, 8);
+ let b = _mm_set_epi64x(2, 1);
+ let r = _mm_srlv_epi64(a, b);
+ let e = _mm_set_epi64x(1, 4);
+ assert_eq_m128i(r, e);
+ }
+ test_mm_srlv_epi64();
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn test_mm256_srlv_epi64() {
+ let a = _mm256_set_epi64x(16, 32, 64, 128);
+ let b = _mm256_set_epi64x(4, 3, 2, 1);
+ let r = _mm256_srlv_epi64(a, b);
+ let e = _mm256_set_epi64x(1, 4, 16, 64);
+ assert_eq_m256i(r, e);
+ }
+ test_mm256_srlv_epi64();
+}
+
+#[target_feature(enable = "sse2")]
+unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i {
+ _mm_set_epi64x(b, a)
+}
+
+#[track_caller]
+#[target_feature(enable = "sse")]
+unsafe fn assert_eq_m128(a: __m128, b: __m128) {
+ let r = _mm_cmpeq_ps(a, b);
+ if _mm_movemask_ps(r) != 0b1111 {
+ panic!("{:?} != {:?}", a, b);
+ }
+}
+
+#[track_caller]
+#[target_feature(enable = "sse2")]
+unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
+ if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 {
+ panic!("{:?} != {:?}", a, b);
+ }
+}
+
+#[track_caller]
+#[target_feature(enable = "sse2")]
+unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) {
+ assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b))
+}
+
+#[track_caller]
+#[target_feature(enable = "avx")]
+unsafe fn assert_eq_m256(a: __m256, b: __m256) {
+ let cmp = _mm256_cmp_ps::<_CMP_EQ_OQ>(a, b);
+ if _mm256_movemask_ps(cmp) != 0b11111111 {
+ panic!("{:?} != {:?}", a, b);
+ }
+}
+
+#[track_caller]
+#[target_feature(enable = "avx")]
+unsafe fn assert_eq_m256d(a: __m256d, b: __m256d) {
+ let cmp = _mm256_cmp_pd::<_CMP_EQ_OQ>(a, b);
+ if _mm256_movemask_pd(cmp) != 0b1111 {
+ panic!("{:?} != {:?}", a, b);
+ }
+}
+
+#[track_caller]
+#[target_feature(enable = "avx")]
+unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) {
+ assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b))
+}
+
+/// Stores `T` in an unaligned address
+struct Unaligned<T: Copy> {
+ buf: Vec<u8>,
+ offset: bool,
+ _marker: std::marker::PhantomData<T>,
+}
+
+impl<T: Copy> Unaligned<T> {
+ fn new(value: T) -> Self {
+ // Allocate extra byte for unalignment headroom
+ let len = std::mem::size_of::<T>();
+ let mut buf = Vec::<u8>::with_capacity(len + 1);
+ // Force the address to be a non-multiple of 2, so it is as unaligned as it can get.
+ let offset = (buf.as_ptr() as usize % 2) == 0;
+ let value_ptr: *const T = &value;
+ unsafe {
+ buf.as_mut_ptr().add(offset.into()).copy_from_nonoverlapping(value_ptr.cast(), len);
+ }
+ Self { buf, offset, _marker: std::marker::PhantomData }
+ }
+
+ fn as_ptr(&self) -> *const T {
+ unsafe { self.buf.as_ptr().add(self.offset.into()).cast() }
+ }
+
+ fn as_mut_ptr(&mut self) -> *mut T {
+ unsafe { self.buf.as_mut_ptr().add(self.offset.into()).cast() }
+ }
+
+ fn read(&self) -> T {
+ unsafe { self.as_ptr().read_unaligned() }
+ }
+}
diff --git a/src/tools/miri/tests/pass/issues/issue-miri-1909.rs b/src/tools/miri/tests/pass/issues/issue-miri-1909.rs
index ce2114e760a..8a2e67cdd09 100644
--- a/src/tools/miri/tests/pass/issues/issue-miri-1909.rs
+++ b/src/tools/miri/tests/pass/issues/issue-miri-1909.rs
@@ -9,7 +9,7 @@ use std::alloc::System;
/// `ptr` must be valid for writes of `len` bytes
unsafe fn volatile_write_zeroize_mem(ptr: *mut u8, len: usize) {
for i in 0..len {
- // ptr as usize + i can't overlow because `ptr` is valid for writes of `len`
+ // ptr as usize + i can't overflow because `ptr` is valid for writes of `len`
let ptr_new: *mut u8 = ((ptr as usize) + i) as *mut u8;
// SAFETY: `ptr` is valid for writes of `len` bytes, so `ptr_new` is valid for a
// byte write
diff --git a/src/tools/miri/tests/pass/shims/env/home.rs b/src/tools/miri/tests/pass/shims/env/home.rs
index 9eb9c3af569..c237f9ed9ff 100644
--- a/src/tools/miri/tests/pass/shims/env/home.rs
+++ b/src/tools/miri/tests/pass/shims/env/home.rs
@@ -1,9 +1,9 @@
-//@ignore-target-windows: home_dir is not supported on Windows
//@compile-flags: -Zmiri-disable-isolation
use std::env;
fn main() {
env::remove_var("HOME"); // make sure we enter the interesting codepath
+    env::remove_var("USERPROFILE"); // Windows also looks at this env var
#[allow(deprecated)]
env::home_dir().unwrap();
}
diff --git a/src/tools/miri/tests/pass/shims/env/var-set.rs b/src/tools/miri/tests/pass/shims/env/var-set.rs
new file mode 100644
index 00000000000..2875b6c815a
--- /dev/null
+++ b/src/tools/miri/tests/pass/shims/env/var-set.rs
@@ -0,0 +1,7 @@
+// Test a value set on the host (MIRI_ENV_VAR_TEST) and one that is not.
+//@compile-flags: -Zmiri-env-set=MIRI_ENV_VAR_TEST=test_value_1 -Zmiri-env-set=TEST_VAR_2=test_value_2
+
+fn main() {
+ assert_eq!(std::env::var("MIRI_ENV_VAR_TEST"), Ok("test_value_1".to_owned()));
+ assert_eq!(std::env::var("TEST_VAR_2"), Ok("test_value_2".to_owned()));
+}