Merge branch 'llvm19'

Upgrades the LLVM, Clang, and LLD dependencies to LLVM 19.x Related to #16270 Big thanks to Alex Rønne Petersen for doing the bulk of the upgrade work in this branch.
2024-11-13 23:52:57 +00:00 · 2024-09-19 22:47:00 -07:00 · 2024-09-19 22:47:00 -07:00 · c6ad4521c7
commit c6ad4521c7
parent 8b82a0e0fc 075ec55552
796 changed files with 62849 additions and 55015 deletions
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@ -4,7 +4,7 @@ on:
  push:
    branches:
      - master
-      - llvm18
+      - llvm19
 concurrency:
  # Cancels pending runs when a PR gets updated.
  group: ${{ github.head_ref || github.run_id }}-${{ github.actor }}
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -137,9 +137,9 @@ else()
    set(ZIG_SYSTEM_LIBCXX "stdc++" CACHE STRING "system libcxx name for build.zig")
 endif()

-find_package(llvm 18)
-find_package(clang 18)
-find_package(lld 18)
+find_package(llvm 19)
+find_package(clang 19)
+find_package(lld 19)

 if(ZIG_STATIC_ZLIB)
    if (MSVC)
@ -834,6 +834,11 @@ else()
  endif()
 endif()

+option(ZIG2_NO_RTLIB "Build zig2 without linking to a compiler runtime library (for `zig cc` only)" OFF)
+if(ZIG2_NO_RTLIB)
+  set(ZIG2_LINK_FLAGS "${ZIG2_LINK_FLAGS} -rtlib=none")
+endif()
+
 set(ZIG1_WASM_MODULE "${PROJECT_SOURCE_DIR}/stage1/zig1.wasm")
 set(ZIG1_C_SOURCE "${PROJECT_BINARY_DIR}/zig1.c")
 set(ZIG2_C_SOURCE "${PROJECT_BINARY_DIR}/zig2.c")
@ -889,9 +894,7 @@ set(BUILD_COMPILER_RT_ARGS
  --name compiler_rt
  -femit-bin="${ZIG_COMPILER_RT_C_SOURCE}"
  -target "${ZIG_HOST_TARGET_TRIPLE}"
-  --dep "build_options"
  "-Mroot=lib/compiler_rt.zig"
-  "-Mbuild_options=${ZIG_CONFIG_ZIG_OUT}"
 )

 add_custom_command(
--- a/bootstrap.c
+++ b/bootstrap.c
@ -170,9 +170,7 @@ int main(int argc, char **argv) {
            "-ofmt=c", "-OReleaseSmall",
            "--name", "compiler_rt", "-femit-bin=compiler_rt.c",
            "-target", host_triple,
-            "--dep", "build_options",
            "-Mroot=lib/compiler_rt.zig",
-            "-Mbuild_options=config.zig",
            NULL,
        };
        print_and_run(child_argv);
--- a/build.zig
+++ b/build.zig
@ -1099,6 +1099,8 @@ const clang_libs = [_][]const u8{
    "clangToolingCore",
    "clangExtractAPI",
    "clangSupport",
+    "clangInstallAPI",
+    "clangAST",
 };
 const lld_libs = [_][]const u8{
    "lldMinGW",
@ -1120,6 +1122,7 @@ const llvm_libs = [_][]const u8{
    "LLVMTextAPIBinaryReader",
    "LLVMCoverage",
    "LLVMLineEditor",
+    "LLVMSandboxIR",
    "LLVMXCoreDisassembler",
    "LLVMXCoreCodeGen",
    "LLVMXCoreDesc",
@ -1255,6 +1258,7 @@ const llvm_libs = [_][]const u8{
    "LLVMDWARFLinkerParallel",
    "LLVMDWARFLinkerClassic",
    "LLVMDWARFLinker",
+    "LLVMCodeGenData",
    "LLVMGlobalISel",
    "LLVMMIRParser",
    "LLVMAsmPrinter",
--- a/ci/aarch64-linux-debug.sh
+++ b/ci/aarch64-linux-debug.sh
@ -8,7 +8,7 @@ set -e
 ARCH="$(uname -m)"
 TARGET="$ARCH-linux-musl"
 MCPU="baseline"
-CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.14.0-dev.418+ebd9efa85"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.14.0-dev.1622+2ac543388"
 PREFIX="$HOME/deps/$CACHE_BASENAME"
 ZIG="$PREFIX/bin/zig"

--- a/ci/aarch64-linux-release.sh
+++ b/ci/aarch64-linux-release.sh
@ -8,7 +8,7 @@ set -e
 ARCH="$(uname -m)"
 TARGET="$ARCH-linux-musl"
 MCPU="baseline"
-CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.14.0-dev.418+ebd9efa85"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.14.0-dev.1622+2ac543388"
 PREFIX="$HOME/deps/$CACHE_BASENAME"
 ZIG="$PREFIX/bin/zig"

--- a/ci/aarch64-macos-debug.sh
+++ b/ci/aarch64-macos-debug.sh
@ -9,7 +9,7 @@ set -e
 ZIGDIR="$PWD"
 TARGET="$ARCH-macos-none"
 MCPU="baseline"
-CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.13.0-dev.130+98a30acad"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.14.0-dev.1622+2ac543388"
 PREFIX="$HOME/$CACHE_BASENAME"
 ZIG="$PREFIX/bin/zig"

--- a/ci/aarch64-macos-release.sh
+++ b/ci/aarch64-macos-release.sh
@ -9,7 +9,7 @@ set -e
 ZIGDIR="$PWD"
 TARGET="$ARCH-macos-none"
 MCPU="baseline"
-CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.13.0-dev.130+98a30acad"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.14.0-dev.1622+2ac543388"
 PREFIX="$HOME/$CACHE_BASENAME"
 ZIG="$PREFIX/bin/zig"

--- a/ci/aarch64-windows.ps1
+++ b/ci/aarch64-windows.ps1
@ -1,5 +1,5 @@
 $TARGET = "$($Env:ARCH)-windows-gnu"
-$ZIG_LLVM_CLANG_LLD_NAME = "zig+llvm+lld+clang-$TARGET-0.13.0-dev.130+98a30acad"
+$ZIG_LLVM_CLANG_LLD_NAME = "zig+llvm+lld+clang-$TARGET-0.14.0-dev.1622+2ac543388"
 $MCPU = "baseline"
 $ZIG_LLVM_CLANG_LLD_URL = "https://ziglang.org/deps/$ZIG_LLVM_CLANG_LLD_NAME.zip"
 $PREFIX_PATH = "$(Get-Location)\..\$ZIG_LLVM_CLANG_LLD_NAME"
--- a/ci/x86_64-linux-debug.sh
+++ b/ci/x86_64-linux-debug.sh
@ -8,7 +8,7 @@ set -e
 ARCH="$(uname -m)"
 TARGET="$ARCH-linux-musl"
 MCPU="baseline"
-CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.14.0-dev.418+ebd9efa85"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.14.0-dev.1622+2ac543388"
 PREFIX="$HOME/deps/$CACHE_BASENAME"
 ZIG="$PREFIX/bin/zig"

--- a/ci/x86_64-linux-release.sh
+++ b/ci/x86_64-linux-release.sh
@ -8,7 +8,7 @@ set -e
 ARCH="$(uname -m)"
 TARGET="$ARCH-linux-musl"
 MCPU="baseline"
-CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.14.0-dev.418+ebd9efa85"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.14.0-dev.1622+2ac543388"
 PREFIX="$HOME/deps/$CACHE_BASENAME"
 ZIG="$PREFIX/bin/zig"

--- a/ci/x86_64-macos-release.sh
+++ b/ci/x86_64-macos-release.sh
@ -6,7 +6,7 @@ set -e
 ZIGDIR="$PWD"
 TARGET="$ARCH-macos-none"
 MCPU="baseline"
-CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.13.0-dev.130+98a30acad"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.14.0-dev.1622+2ac543388"
 PREFIX="$HOME/$CACHE_BASENAME"
 JOBS="-j3"
 ZIG="$PREFIX/bin/zig"
--- a/ci/x86_64-windows-debug.ps1
+++ b/ci/x86_64-windows-debug.ps1
@ -1,5 +1,5 @@
 $TARGET = "$($Env:ARCH)-windows-gnu"
-$ZIG_LLVM_CLANG_LLD_NAME = "zig+llvm+lld+clang-$TARGET-0.13.0-dev.130+98a30acad"
+$ZIG_LLVM_CLANG_LLD_NAME = "zig+llvm+lld+clang-$TARGET-0.14.0-dev.1622+2ac543388"
 $MCPU = "baseline"
 $ZIG_LLVM_CLANG_LLD_URL = "https://ziglang.org/deps/$ZIG_LLVM_CLANG_LLD_NAME.zip"
 $PREFIX_PATH = "$($Env:USERPROFILE)\$ZIG_LLVM_CLANG_LLD_NAME"
--- a/ci/x86_64-windows-release.ps1
+++ b/ci/x86_64-windows-release.ps1
@ -1,5 +1,5 @@
 $TARGET = "$($Env:ARCH)-windows-gnu"
-$ZIG_LLVM_CLANG_LLD_NAME = "zig+llvm+lld+clang-$TARGET-0.13.0-dev.130+98a30acad"
+$ZIG_LLVM_CLANG_LLD_NAME = "zig+llvm+lld+clang-$TARGET-0.14.0-dev.1622+2ac543388"
 $MCPU = "baseline"
 $ZIG_LLVM_CLANG_LLD_URL = "https://ziglang.org/deps/$ZIG_LLVM_CLANG_LLD_NAME.zip"
 $PREFIX_PATH = "$($Env:USERPROFILE)\$ZIG_LLVM_CLANG_LLD_NAME"
--- a/cmake/Findclang.cmake
+++ b/cmake/Findclang.cmake
@ -17,10 +17,10 @@ find_path(CLANG_INCLUDE_DIRS NAMES clang/Frontend/ASTUnit.h
 if(${LLVM_LINK_MODE} STREQUAL "shared")
  find_library(CLANG_LIBRARIES
    NAMES
-      libclang-cpp.so.18
-      libclang-cpp.so.18.1
-      clang-cpp-18.0
-      clang-cpp180
+      libclang-cpp.so.19
+      libclang-cpp.so.19.1
+      clang-cpp-19.0
+      clang-cpp190
      clang-cpp
    NAMES_PER_DIR
    HINTS "${LLVM_LIBDIRS}"
@ -68,6 +68,8 @@ else()
  FIND_AND_ADD_CLANG_LIB(clangToolingCore)
  FIND_AND_ADD_CLANG_LIB(clangExtractAPI)
  FIND_AND_ADD_CLANG_LIB(clangSupport)
+  FIND_AND_ADD_CLANG_LIB(clangInstallAPI)
+  FIND_AND_ADD_CLANG_LIB(clangAST)
 endif()

 if (MSVC)
--- a/cmake/Findlld.cmake
+++ b/cmake/Findlld.cmake
@ -9,21 +9,21 @@
 find_path(LLD_INCLUDE_DIRS NAMES lld/Common/Driver.h
    HINTS ${LLVM_INCLUDE_DIRS}
    PATHS
-        /usr/lib/llvm-18/include
-        /usr/local/llvm180/include
-        /usr/local/llvm18/include
-        /usr/local/opt/llvm@18/include
-        /opt/homebrew/opt/llvm@18/include
+        /usr/lib/llvm-19/include
+        /usr/local/llvm190/include
+        /usr/local/llvm19/include
+        /usr/local/opt/llvm@19/include
+        /opt/homebrew/opt/llvm@19/include
        /mingw64/include)

-find_library(LLD_LIBRARY NAMES lld-18.0 lld180 lld NAMES_PER_DIR
+find_library(LLD_LIBRARY NAMES lld-19.0 lld190 lld NAMES_PER_DIR
    HINTS ${LLVM_LIBDIRS}
    PATHS
-        /usr/lib/llvm-18/lib
-        /usr/local/llvm180/lib
-        /usr/local/llvm18/lib
-        /usr/local/opt/llvm@18/lib
-        /opt/homebrew/opt/llvm@18/lib
+        /usr/lib/llvm-19/lib
+        /usr/local/llvm190/lib
+        /usr/local/llvm19/lib
+        /usr/local/opt/llvm@19/lib
+        /opt/homebrew/opt/llvm@19/lib
 )
 if(EXISTS ${LLD_LIBRARY})
    set(LLD_LIBRARIES ${LLD_LIBRARY})
@ -34,11 +34,11 @@ else()
            HINTS ${LLVM_LIBDIRS}
            PATHS
                ${LLD_LIBDIRS}
-                /usr/lib/llvm-18/lib
-                /usr/local/llvm180/lib
-                /usr/local/llvm18/lib
-                /usr/local/opt/llvm@18/lib
-                /opt/homebrew/opt/llvm@18/lib
+                /usr/lib/llvm-19/lib
+                /usr/local/llvm190/lib
+                /usr/local/llvm19/lib
+                /usr/local/opt/llvm@19/lib
+                /opt/homebrew/opt/llvm@19/lib
                /mingw64/lib
                /c/msys64/mingw64/lib
                c:/msys64/mingw64/lib)
--- a/cmake/Findllvm.cmake
+++ b/cmake/Findllvm.cmake
@ -17,12 +17,12 @@ if(ZIG_USE_LLVM_CONFIG)
    # terminate when the right LLVM version is not found.
    unset(LLVM_CONFIG_EXE CACHE)
    find_program(LLVM_CONFIG_EXE
-        NAMES llvm-config-18 llvm-config-18.0 llvm-config180 llvm-config18 llvm-config NAMES_PER_DIR
+        NAMES llvm-config-19 llvm-config-19.0 llvm-config190 llvm-config19 llvm-config NAMES_PER_DIR
        PATHS
            "/mingw64/bin"
            "/c/msys64/mingw64/bin"
            "c:/msys64/mingw64/bin"
-            "C:/Libraries/llvm-18.0.0/bin")
+            "C:/Libraries/llvm-19.0.0/bin")

    if ("${LLVM_CONFIG_EXE}" STREQUAL "LLVM_CONFIG_EXE-NOTFOUND")
      if (NOT LLVM_CONFIG_ERROR_MESSAGES STREQUAL "")
@ -40,9 +40,9 @@ if(ZIG_USE_LLVM_CONFIG)
      OUTPUT_STRIP_TRAILING_WHITESPACE)

    get_filename_component(LLVM_CONFIG_DIR "${LLVM_CONFIG_EXE}" DIRECTORY)
-    if("${LLVM_CONFIG_VERSION}" VERSION_LESS 18 OR "${LLVM_CONFIG_VERSION}" VERSION_EQUAL 19 OR "${LLVM_CONFIG_VERSION}" VERSION_GREATER 19)
+    if("${LLVM_CONFIG_VERSION}" VERSION_LESS 19 OR "${LLVM_CONFIG_VERSION}" VERSION_EQUAL 20 OR "${LLVM_CONFIG_VERSION}" VERSION_GREATER 20)
      # Save the error message, in case this is the last llvm-config we find
-      list(APPEND LLVM_CONFIG_ERROR_MESSAGES "expected LLVM 18.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+      list(APPEND LLVM_CONFIG_ERROR_MESSAGES "expected LLVM 19.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")

      # Ignore this directory and try the search again
      list(APPEND CMAKE_IGNORE_PATH "${LLVM_CONFIG_DIR}")
@ -63,12 +63,12 @@ if(ZIG_USE_LLVM_CONFIG)
        ERROR_VARIABLE LLVM_CONFIG_ERROR
        ERROR_STRIP_TRAILING_WHITESPACE)

-      if (LLVM_CONFIG_ERROR) 
+      if (LLVM_CONFIG_ERROR)
        # Save the error message, in case this is the last llvm-config we find
        if (ZIG_SHARED_LLVM)
-          list(APPEND LLVM_CONFIG_ERROR_MESSAGES "LLVM 18.x found at ${LLVM_CONFIG_EXE} does not support linking as a shared library")
+          list(APPEND LLVM_CONFIG_ERROR_MESSAGES "LLVM 19.x found at ${LLVM_CONFIG_EXE} does not support linking as a shared library")
        else()
-          list(APPEND LLVM_CONFIG_ERROR_MESSAGES "LLVM 18.x found at ${LLVM_CONFIG_EXE} does not support linking as a static library")
+          list(APPEND LLVM_CONFIG_ERROR_MESSAGES "LLVM 19.x found at ${LLVM_CONFIG_EXE} does not support linking as a static library")
        endif()

        # Ignore this directory and try the search again
@ -200,6 +200,7 @@ else()
  FIND_AND_ADD_LLVM_LIB(LLVMTextAPIBinaryReader)
  FIND_AND_ADD_LLVM_LIB(LLVMCoverage)
  FIND_AND_ADD_LLVM_LIB(LLVMLineEditor)
+  FIND_AND_ADD_LLVM_LIB(LLVMSandboxIR)
  FIND_AND_ADD_LLVM_LIB(LLVMXCoreDisassembler)
  FIND_AND_ADD_LLVM_LIB(LLVMXCoreCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMXCoreDesc)
@ -335,6 +336,7 @@ else()
  FIND_AND_ADD_LLVM_LIB(LLVMDWARFLinkerParallel)
  FIND_AND_ADD_LLVM_LIB(LLVMDWARFLinkerClassic)
  FIND_AND_ADD_LLVM_LIB(LLVMDWARFLinker)
+  FIND_AND_ADD_LLVM_LIB(LLVMCodeGenData)
  FIND_AND_ADD_LLVM_LIB(LLVMGlobalISel)
  FIND_AND_ADD_LLVM_LIB(LLVMMIRParser)
  FIND_AND_ADD_LLVM_LIB(LLVMAsmPrinter)
--- a/lib/compiler/aro/aro/target.zig
+++ b/lib/compiler/aro/aro/target.zig
@ -658,6 +658,7 @@ pub fn toLLVMTriple(target: std.Target, buf: []u8) []const u8 {
        .shadermodel => "shadermodel",
        .visionos => "xros",
        .serenity => "serenity",
+        .bridgeos => "bridgeos",
        .opencl,
        .opengl,
        .vulkan,
--- a/lib/compiler_rt/common.zig
+++ b/lib/compiler_rt/common.zig
@ -1,8 +1,14 @@
 const std = @import("std");
 const builtin = @import("builtin");
 const native_endian = builtin.cpu.arch.endian();
+const ofmt_c = builtin.object_format == .c;

-pub const linkage: std.builtin.GlobalLinkage = if (builtin.is_test) .internal else .weak;
+pub const linkage: std.builtin.GlobalLinkage = if (builtin.is_test)
+    .internal
+else if (ofmt_c)
+    .strong
+else
+    .weak;
 /// Determines the symbol's visibility to other objects.
 /// For WebAssembly this allows the symbol to be resolved to other modules, but will not
 /// export it to the host runtime.
@ -28,7 +34,7 @@ pub const want_float_exceptions = !builtin.cpu.arch.isWasm();

 // Libcalls that involve u128 on Windows x86-64 are expected by LLVM to use the
 // calling convention of @Vector(2, u64), rather than what's standard.
-pub const want_windows_v2u64_abi = builtin.os.tag == .windows and builtin.cpu.arch == .x86_64 and @import("builtin").object_format != .c;
+pub const want_windows_v2u64_abi = builtin.os.tag == .windows and builtin.cpu.arch == .x86_64 and !ofmt_c;

 /// This governs whether to use these symbol names for f16/f32 conversions
 /// rather than the standard names:
--- a/lib/include/__clang_cuda_intrinsics.h
+++ b/lib/include/__clang_cuda_intrinsics.h
@ -215,9 +215,7 @@ inline __device__ unsigned int __activemask() {
 #if CUDA_VERSION < 9020
  return __nvvm_vote_ballot(1);
 #else
-  unsigned int mask;
-  asm volatile("activemask.b32 %0;" : "=r"(mask));
-  return mask;
+  return __nvvm_activemask();
 #endif
 }

--- a/lib/include/__stdarg_header_macro.h
+++ b/lib/include/__stdarg_header_macro.h
@ -0,0 +1,12 @@
+/*===---- __stdarg_header_macro.h ------------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDARG_H
+#define __STDARG_H
+#endif
--- a/lib/include/__stddef_header_macro.h
+++ b/lib/include/__stddef_header_macro.h
@ -0,0 +1,12 @@
+/*===---- __stddef_header_macro.h ------------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDDEF_H
+#define __STDDEF_H
+#endif
--- a/lib/include/arm_acle.h
+++ b/lib/include/arm_acle.h
@ -75,6 +75,14 @@ static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(v
 #define __dbg(t) __builtin_arm_dbg(t)
 #endif

+#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
+#define _CHKFEAT_GCS 1
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__chkfeat(uint64_t __features) {
+  return __builtin_arm_chkfeat(__features) ^ __features;
+}
+#endif
+
 /* 7.5 Swap */
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
 __swp(uint32_t __x, volatile uint32_t *__p) {
@ -109,7 +117,7 @@ __swp(uint32_t __x, volatile uint32_t *__p) {
 #endif

 /* 7.7 NOP */
-#if !defined(_MSC_VER) || !defined(__aarch64__)
+#if !defined(_MSC_VER) || (!defined(__aarch64__) && !defined(__arm64ec__))
 static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
  __builtin_arm_nop();
 }
@ -313,7 +321,7 @@ __qdbl(int32_t __t) {
 }
 #endif

-/* 8.4.3 Accumultating multiplications */
+/* 8.4.3 Accumulating multiplications */
 #if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
 __smlabb(int32_t __a, int32_t __b, int32_t __c) {
@ -545,7 +553,7 @@ __usub16(uint16x2_t __a, uint16x2_t __b) {
 }
 #endif

-/* 8.5.10 Parallel 16-bit multiplications */
+/* 8.5.10 Parallel 16-bit multiplication */
 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
 __smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
@ -748,7 +756,7 @@ __arm_st64bv0(void *__addr, data512_t __value) {
 #define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
 #define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))

-/* 10.3 Memory Tagging Extensions (MTE) Intrinsics */
+/* 10.3 MTE intrinsics */
 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
 #define __arm_mte_create_random_tag(__ptr, __mask)  __builtin_arm_irg(__ptr, __mask)
 #define __arm_mte_increment_tag(__ptr, __tag_offset)  __builtin_arm_addg(__ptr, __tag_offset)
@ -757,7 +765,7 @@ __arm_st64bv0(void *__addr, data512_t __value) {
 #define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
 #define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)

-/* 18 Memory Operations Intrinsics */
+/* 18 memcpy family of operations intrinsics - MOPS */
 #define __arm_mops_memset_tag(__tagged_address, __value, __size)    \
  __builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
 #endif
@ -855,6 +863,24 @@ __rndrrs(uint64_t *__p) {
 }
 #endif

+/* 11.2 Guarded Control Stack intrinsics */
+#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
+static __inline__ void * __attribute__((__always_inline__, __nodebug__))
+__gcspr() {
+  return (void *)__builtin_arm_rsr64("gcspr_el0");
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("gcs")))
+__gcspopm() {
+  return __builtin_arm_gcspopm(0);
+}
+
+static __inline__ const void * __attribute__((__always_inline__, __nodebug__, target("gcs")))
+__gcsss(const void *__stack) {
+  return __builtin_arm_gcsss(__stack);
+}
+#endif
+
 #if defined(__cplusplus)
 }
 #endif
--- a/lib/include/arm_fp16.h
+++ b/lib/include/arm_fp16.h
@ -29,7 +29,7 @@
 typedef __fp16 float16_t;
 #define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))

-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(__arm64ec__)
 #define vabdh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
--- a/lib/include/arm_neon.h
+++ b/lib/include/arm_neon.h
--- a/lib/include/arm_sme.h
+++ b/lib/include/arm_sme.h
@ -16,6 +16,8 @@
 #endif
 #include <arm_sve.h>

+#include <stddef.h>
+
 /* Function attributes */
 #define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))

@ -39,6 +41,11 @@ __ai bool __arm_in_streaming_mode(void) __arm_streaming_compatible {
  return x0 & 1;
 }

+void *__arm_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_compatible;
+void *__arm_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_compatible;
+void *__arm_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible;
+void *__arm_sc_memchr(void *s, int c, size_t n) __arm_streaming_compatible;
+
 __ai __attribute__((target("sme"))) void svundef_za(void) __arm_streaming_compatible __arm_out("za") { }

 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_u32_m)))
@ -368,7 +375,7 @@ void svwrite_ver_za8_s8_m(uint64_t, uint32_t, svbool_t, svint8_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_mask_za)))
 void svzero_mask_za(uint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za)))
-void svzero_za();
+void svzero_za(void);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_u32_m)))
 void svaddha_za32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_s32_m)))
@ -597,6 +604,78 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_u8_
 void svwrite_ver_za8_m(uint64_t, uint32_t, svbool_t, svuint8_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_s8_m)))
 void svwrite_ver_za8_m(uint64_t, uint32_t, svbool_t, svint8_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_f16_vg1x2)))
+void svmla_single_za16_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_f16_vg1x4)))
+void svmla_single_za16_f16_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_f16_vg1x2)))
+void svmla_lane_za16_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t, uint64_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_f16_vg1x4)))
+void svmla_lane_za16_f16_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t, uint64_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_f16_vg1x2)))
+void svmla_za16_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16x2_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_f16_vg1x4)))
+void svmla_za16_f16_vg1x4(uint32_t, svfloat16x4_t, svfloat16x4_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_f16_vg1x2)))
+void svmls_single_za16_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_f16_vg1x4)))
+void svmls_single_za16_f16_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_f16_vg1x2)))
+void svmls_lane_za16_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t, uint64_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_f16_vg1x4)))
+void svmls_lane_za16_f16_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t, uint64_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_f16_vg1x2)))
+void svmls_za16_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16x2_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_f16_vg1x4)))
+void svmls_za16_f16_vg1x4(uint32_t, svfloat16x4_t, svfloat16x4_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_f16_m)))
+void svmopa_za16_f16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_f16_m)))
+void svmops_za16_f16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_f16_vg1x2)))
+void svmla_za16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_f16_vg1x4)))
+void svmla_za16_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_f16_vg1x2)))
+void svmla_lane_za16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t, uint64_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_f16_vg1x4)))
+void svmla_lane_za16_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t, uint64_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_f16_vg1x2)))
+void svmla_za16_vg1x2(uint32_t, svfloat16x2_t, svfloat16x2_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_f16_vg1x4)))
+void svmla_za16_vg1x4(uint32_t, svfloat16x4_t, svfloat16x4_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_f16_vg1x2)))
+void svmls_za16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_f16_vg1x4)))
+void svmls_za16_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_f16_vg1x2)))
+void svmls_lane_za16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t, uint64_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_f16_vg1x4)))
+void svmls_lane_za16_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t, uint64_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_f16_vg1x2)))
+void svmls_za16_vg1x2(uint32_t, svfloat16x2_t, svfloat16x2_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_f16_vg1x4)))
+void svmls_za16_vg1x4(uint32_t, svfloat16x4_t, svfloat16x4_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_f16_m)))
+void svmopa_za16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_f16_m)))
+void svmops_za16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x2)))
+void svadd_za16_f16_vg1x2(uint32_t, svfloat16x2_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x4)))
+void svadd_za16_f16_vg1x4(uint32_t, svfloat16x4_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x2)))
+void svsub_za16_f16_vg1x2(uint32_t, svfloat16x2_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x4)))
+void svsub_za16_f16_vg1x4(uint32_t, svfloat16x4_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x2)))
+void svadd_za16_vg1x2(uint32_t, svfloat16x2_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x4)))
+void svadd_za16_vg1x4(uint32_t, svfloat16x4_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x2)))
+void svsub_za16_vg1x2(uint32_t, svfloat16x2_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x4)))
+void svsub_za16_vg1x4(uint32_t, svfloat16x4_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_f64_m)))
 void svmopa_za64_f64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_f64_m)))
@ -2059,6 +2138,78 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za8_u8_vg1x
 void svwrite_za8_vg1x4(uint32_t, svuint8x4_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za8_s8_vg1x4)))
 void svwrite_za8_vg1x4(uint32_t, svint8x4_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x2)))
+void svadd_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x4)))
+void svadd_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x2)))
+void svmla_single_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x4)))
+void svmla_single_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x2)))
+void svmla_lane_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x4)))
+void svmla_lane_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x2)))
+void svmla_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x4)))
+void svmla_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x2)))
+void svmls_single_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x4)))
+void svmls_single_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x2)))
+void svmls_lane_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x4)))
+void svmls_lane_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x2)))
+void svmls_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x4)))
+void svmls_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_bf16_m)))
+void svmopa_za16_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_bf16_m)))
+void svmops_za16_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x2)))
+void svsub_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x4)))
+void svsub_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x2)))
+void svadd_za16_vg1x2(uint32_t, svbfloat16x2_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x4)))
+void svadd_za16_vg1x4(uint32_t, svbfloat16x4_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x2)))
+void svmla_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x4)))
+void svmla_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x2)))
+void svmla_lane_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x4)))
+void svmla_lane_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x2)))
+void svmla_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x4)))
+void svmla_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x2)))
+void svmls_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x4)))
+void svmls_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x2)))
+void svmls_lane_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x4)))
+void svmls_lane_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x2)))
+void svmls_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x4)))
+void svmls_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_bf16_m)))
+void svmopa_za16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_bf16_m)))
+void svmops_za16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x2)))
+void svsub_za16_vg1x2(uint32_t, svbfloat16x2_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x4)))
+void svsub_za16_vg1x4(uint32_t, svbfloat16x4_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_f64_vg1x2)))
 void svadd_za64_f64_vg1x2(uint32_t, svfloat64x2_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_f64_vg1x4)))
@ -2403,6 +2554,262 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za64_s1
 void svvdot_lane_za64_vg1x4(uint32_t, svint16x4_t, svint16_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za64_u16_vg1x4)))
 void svvdot_lane_za64_vg1x4(uint32_t, svuint16x4_t, svuint16_t, uint64_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_u8)))
+svuint8_t svreadz_hor_za128_u8(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_u32)))
+svuint32_t svreadz_hor_za128_u32(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_u64)))
+svuint64_t svreadz_hor_za128_u64(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_u16)))
+svuint16_t svreadz_hor_za128_u16(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_bf16)))
+svbfloat16_t svreadz_hor_za128_bf16(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_s8)))
+svint8_t svreadz_hor_za128_s8(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_f64)))
+svfloat64_t svreadz_hor_za128_f64(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_f32)))
+svfloat32_t svreadz_hor_za128_f32(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_f16)))
+svfloat16_t svreadz_hor_za128_f16(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_s32)))
+svint32_t svreadz_hor_za128_s32(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_s64)))
+svint64_t svreadz_hor_za128_s64(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_s16)))
+svint16_t svreadz_hor_za128_s16(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_u16)))
+svuint16_t svreadz_hor_za16_u16(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_bf16)))
+svbfloat16_t svreadz_hor_za16_bf16(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_f16)))
+svfloat16_t svreadz_hor_za16_f16(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_s16)))
+svint16_t svreadz_hor_za16_s16(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_u16_vg2)))
+svuint16x2_t svreadz_hor_za16_u16_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_bf16_vg2)))
+svbfloat16x2_t svreadz_hor_za16_bf16_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_f16_vg2)))
+svfloat16x2_t svreadz_hor_za16_f16_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_s16_vg2)))
+svint16x2_t svreadz_hor_za16_s16_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_u16_vg4)))
+svuint16x4_t svreadz_hor_za16_u16_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_bf16_vg4)))
+svbfloat16x4_t svreadz_hor_za16_bf16_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_f16_vg4)))
+svfloat16x4_t svreadz_hor_za16_f16_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_s16_vg4)))
+svint16x4_t svreadz_hor_za16_s16_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za32_u32)))
+svuint32_t svreadz_hor_za32_u32(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za32_f32)))
+svfloat32_t svreadz_hor_za32_f32(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za32_s32)))
+svint32_t svreadz_hor_za32_s32(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za32_u32_vg2)))
+svuint32x2_t svreadz_hor_za32_u32_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za32_f32_vg2)))
+svfloat32x2_t svreadz_hor_za32_f32_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za32_s32_vg2)))
+svint32x2_t svreadz_hor_za32_s32_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za32_u32_vg4)))
+svuint32x4_t svreadz_hor_za32_u32_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za32_f32_vg4)))
+svfloat32x4_t svreadz_hor_za32_f32_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za32_s32_vg4)))
+svint32x4_t svreadz_hor_za32_s32_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za64_u64)))
+svuint64_t svreadz_hor_za64_u64(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za64_f64)))
+svfloat64_t svreadz_hor_za64_f64(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za64_s64)))
+svint64_t svreadz_hor_za64_s64(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za64_u64_vg2)))
+svuint64x2_t svreadz_hor_za64_u64_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za64_f64_vg2)))
+svfloat64x2_t svreadz_hor_za64_f64_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za64_s64_vg2)))
+svint64x2_t svreadz_hor_za64_s64_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za64_u64_vg4)))
+svuint64x4_t svreadz_hor_za64_u64_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za64_f64_vg4)))
+svfloat64x4_t svreadz_hor_za64_f64_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za64_s64_vg4)))
+svint64x4_t svreadz_hor_za64_s64_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za8_u8)))
+svuint8_t svreadz_hor_za8_u8(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za8_s8)))
+svint8_t svreadz_hor_za8_s8(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za8_u8_vg2)))
+svuint8x2_t svreadz_hor_za8_u8_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za8_s8_vg2)))
+svint8x2_t svreadz_hor_za8_s8_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za8_u8_vg4)))
+svuint8x4_t svreadz_hor_za8_u8_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za8_s8_vg4)))
+svint8x4_t svreadz_hor_za8_s8_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_u8)))
+svuint8_t svreadz_ver_za128_u8(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_u32)))
+svuint32_t svreadz_ver_za128_u32(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_u64)))
+svuint64_t svreadz_ver_za128_u64(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_u16)))
+svuint16_t svreadz_ver_za128_u16(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_bf16)))
+svbfloat16_t svreadz_ver_za128_bf16(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_s8)))
+svint8_t svreadz_ver_za128_s8(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_f64)))
+svfloat64_t svreadz_ver_za128_f64(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_f32)))
+svfloat32_t svreadz_ver_za128_f32(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_f16)))
+svfloat16_t svreadz_ver_za128_f16(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_s32)))
+svint32_t svreadz_ver_za128_s32(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_s64)))
+svint64_t svreadz_ver_za128_s64(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_s16)))
+svint16_t svreadz_ver_za128_s16(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_u16)))
+svuint16_t svreadz_ver_za16_u16(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_bf16)))
+svbfloat16_t svreadz_ver_za16_bf16(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_f16)))
+svfloat16_t svreadz_ver_za16_f16(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_s16)))
+svint16_t svreadz_ver_za16_s16(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_u16_vg2)))
+svuint16x2_t svreadz_ver_za16_u16_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_bf16_vg2)))
+svbfloat16x2_t svreadz_ver_za16_bf16_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_f16_vg2)))
+svfloat16x2_t svreadz_ver_za16_f16_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_s16_vg2)))
+svint16x2_t svreadz_ver_za16_s16_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_u16_vg4)))
+svuint16x4_t svreadz_ver_za16_u16_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_bf16_vg4)))
+svbfloat16x4_t svreadz_ver_za16_bf16_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_f16_vg4)))
+svfloat16x4_t svreadz_ver_za16_f16_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_s16_vg4)))
+svint16x4_t svreadz_ver_za16_s16_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za32_u32)))
+svuint32_t svreadz_ver_za32_u32(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za32_f32)))
+svfloat32_t svreadz_ver_za32_f32(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za32_s32)))
+svint32_t svreadz_ver_za32_s32(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za32_u32_vg2)))
+svuint32x2_t svreadz_ver_za32_u32_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za32_f32_vg2)))
+svfloat32x2_t svreadz_ver_za32_f32_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za32_s32_vg2)))
+svint32x2_t svreadz_ver_za32_s32_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za32_u32_vg4)))
+svuint32x4_t svreadz_ver_za32_u32_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za32_f32_vg4)))
+svfloat32x4_t svreadz_ver_za32_f32_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za32_s32_vg4)))
+svint32x4_t svreadz_ver_za32_s32_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za64_u64)))
+svuint64_t svreadz_ver_za64_u64(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za64_f64)))
+svfloat64_t svreadz_ver_za64_f64(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za64_s64)))
+svint64_t svreadz_ver_za64_s64(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za64_u64_vg2)))
+svuint64x2_t svreadz_ver_za64_u64_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za64_f64_vg2)))
+svfloat64x2_t svreadz_ver_za64_f64_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za64_s64_vg2)))
+svint64x2_t svreadz_ver_za64_s64_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za64_u64_vg4)))
+svuint64x4_t svreadz_ver_za64_u64_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za64_f64_vg4)))
+svfloat64x4_t svreadz_ver_za64_f64_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za64_s64_vg4)))
+svint64x4_t svreadz_ver_za64_s64_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za8_u8)))
+svuint8_t svreadz_ver_za8_u8(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za8_s8)))
+svint8_t svreadz_ver_za8_s8(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za8_u8_vg2)))
+svuint8x2_t svreadz_ver_za8_u8_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za8_s8_vg2)))
+svint8x2_t svreadz_ver_za8_s8_vg2(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za8_u8_vg4)))
+svuint8x4_t svreadz_ver_za8_u8_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za8_s8_vg4)))
+svint8x4_t svreadz_ver_za8_s8_vg4(uint64_t, uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za16_u16_vg1x2)))
+svuint16x2_t svreadz_za16_u16_vg1x2(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za16_bf16_vg1x2)))
+svbfloat16x2_t svreadz_za16_bf16_vg1x2(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za16_f16_vg1x2)))
+svfloat16x2_t svreadz_za16_f16_vg1x2(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za16_s16_vg1x2)))
+svint16x2_t svreadz_za16_s16_vg1x2(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za16_u16_vg1x4)))
+svuint16x4_t svreadz_za16_u16_vg1x4(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za16_bf16_vg1x4)))
+svbfloat16x4_t svreadz_za16_bf16_vg1x4(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za16_f16_vg1x4)))
+svfloat16x4_t svreadz_za16_f16_vg1x4(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za16_s16_vg1x4)))
+svint16x4_t svreadz_za16_s16_vg1x4(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za32_u32_vg1x2)))
+svuint32x2_t svreadz_za32_u32_vg1x2(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za32_f32_vg1x2)))
+svfloat32x2_t svreadz_za32_f32_vg1x2(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za32_s32_vg1x2)))
+svint32x2_t svreadz_za32_s32_vg1x2(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za32_u32_vg1x4)))
+svuint32x4_t svreadz_za32_u32_vg1x4(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za32_f32_vg1x4)))
+svfloat32x4_t svreadz_za32_f32_vg1x4(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za32_s32_vg1x4)))
+svint32x4_t svreadz_za32_s32_vg1x4(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za64_u64_vg1x2)))
+svuint64x2_t svreadz_za64_u64_vg1x2(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za64_f64_vg1x2)))
+svfloat64x2_t svreadz_za64_f64_vg1x2(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za64_s64_vg1x2)))
+svint64x2_t svreadz_za64_s64_vg1x2(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za64_u64_vg1x4)))
+svuint64x4_t svreadz_za64_u64_vg1x4(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za64_f64_vg1x4)))
+svfloat64x4_t svreadz_za64_f64_vg1x4(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za64_s64_vg1x4)))
+svint64x4_t svreadz_za64_s64_vg1x4(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za8_u8_vg1x2)))
+svuint8x2_t svreadz_za8_u8_vg1x2(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za8_s8_vg1x2)))
+svint8x2_t svreadz_za8_s8_vg1x2(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za8_u8_vg1x4)))
+svuint8x4_t svreadz_za8_u8_vg1x4(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za8_s8_vg1x4)))
+svint8x4_t svreadz_za8_s8_vg1x4(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za64_vg1x2)))
+void svzero_za64_vg1x2(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za64_vg1x4)))
+void svzero_za64_vg1x4(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za64_vg2x1)))
+void svzero_za64_vg2x1(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za64_vg2x2)))
+void svzero_za64_vg2x2(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za64_vg2x4)))
+void svzero_za64_vg2x4(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za64_vg4x1)))
+void svzero_za64_vg4x1(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za64_vg4x2)))
+void svzero_za64_vg4x2(uint32_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za64_vg4x4)))
+void svzero_za64_vg4x4(uint32_t);
 #ifdef __cplusplus
 } // extern "C"
 #endif
--- a/lib/include/arm_sve.h
+++ b/lib/include/arm_sve.h
--- a/lib/include/arm_vector_types.h
+++ b/lib/include/arm_vector_types.h
@ -16,7 +16,7 @@
 #define __ARM_NEON_TYPES_H
 typedef float float32_t;
 typedef __fp16 float16_t;
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__arm64ec__)
 typedef double float64_t;
 #endif

@ -40,7 +40,7 @@ typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t;
 typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
 typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
 typedef __attribute__((neon_vector_type(4))) float32_t float32x4_t;
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__arm64ec__)
 typedef __attribute__((neon_vector_type(1))) float64_t float64x1_t;
 typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t;
 #endif
@ -125,7 +125,7 @@ typedef struct float32x4x2_t {
  float32x4_t val[2];
 } float32x4x2_t;

-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__arm64ec__)
 typedef struct float64x1x2_t {
  float64x1_t val[2];
 } float64x1x2_t;
@ -215,7 +215,7 @@ typedef struct float32x4x3_t {
  float32x4_t val[3];
 } float32x4x3_t;

-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__arm64ec__)
 typedef struct float64x1x3_t {
  float64x1_t val[3];
 } float64x1x3_t;
@ -305,7 +305,7 @@ typedef struct float32x4x4_t {
  float32x4_t val[4];
 } float32x4x4_t;

-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__arm64ec__)
 typedef struct float64x1x4_t {
  float64x1_t val[4];
 } float64x1x4_t;
--- a/lib/include/avx512erintrin.h
+++ b/lib/include/avx512erintrin.h
@ -1,271 +0,0 @@
-/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512ERINTRIN_H
-#define __AVX512ERINTRIN_H
-
-/* exp2a23 */
-#define _mm512_exp2a23_round_pd(A, R) \
-  ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
-                                       (__v8df)_mm512_setzero_pd(), \
-                                       (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \
-  ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
-                                       (__v8df)(__m512d)(S), (__mmask8)(M), \
-                                       (int)(R)))
-
-#define _mm512_maskz_exp2a23_round_pd(M, A, R) \
-  ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
-                                       (__v8df)_mm512_setzero_pd(), \
-                                       (__mmask8)(M), (int)(R)))
-
-#define _mm512_exp2a23_pd(A) \
-  _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_exp2a23_pd(S, M, A) \
-  _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_exp2a23_pd(M, A) \
-  _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_exp2a23_round_ps(A, R) \
-  ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
-                                      (__v16sf)_mm512_setzero_ps(), \
-                                      (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \
-  ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
-                                      (__v16sf)(__m512)(S), (__mmask16)(M), \
-                                      (int)(R)))
-
-#define _mm512_maskz_exp2a23_round_ps(M, A, R) \
-  ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
-                                      (__v16sf)_mm512_setzero_ps(), \
-                                      (__mmask16)(M), (int)(R)))
-
-#define _mm512_exp2a23_ps(A) \
-  _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_exp2a23_ps(S, M, A) \
-  _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_exp2a23_ps(M, A) \
-  _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-/* rsqrt28 */
-#define _mm512_rsqrt28_round_pd(A, R) \
-  ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
-                                          (__v8df)_mm512_setzero_pd(), \
-                                          (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \
-  ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
-                                          (__v8df)(__m512d)(S), (__mmask8)(M), \
-                                          (int)(R)))
-
-#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \
-  ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
-                                          (__v8df)_mm512_setzero_pd(), \
-                                          (__mmask8)(M), (int)(R)))
-
-#define _mm512_rsqrt28_pd(A) \
-  _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rsqrt28_pd(S, M, A) \
-  _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rsqrt28_pd(M, A) \
-  _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_rsqrt28_round_ps(A, R) \
-  ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
-                                         (__v16sf)_mm512_setzero_ps(), \
-                                         (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \
-  ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
-                                         (__v16sf)(__m512)(S), (__mmask16)(M), \
-                                         (int)(R)))
-
-#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \
-  ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
-                                         (__v16sf)_mm512_setzero_ps(), \
-                                         (__mmask16)(M), (int)(R)))
-
-#define _mm512_rsqrt28_ps(A) \
-  _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rsqrt28_ps(S, M, A) \
-  _mm512_mask_rsqrt28_round_ps((S), (M), A, _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rsqrt28_ps(M, A) \
-  _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rsqrt28_round_ss(A, B, R) \
-  ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
-                                               (__v4sf)(__m128)(B), \
-                                               (__v4sf)_mm_setzero_ps(), \
-                                               (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \
-  ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
-                                               (__v4sf)(__m128)(B), \
-                                               (__v4sf)(__m128)(S), \
-                                               (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \
-  ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
-                                               (__v4sf)(__m128)(B), \
-                                               (__v4sf)_mm_setzero_ps(), \
-                                               (__mmask8)(M), (int)(R)))
-
-#define _mm_rsqrt28_ss(A, B) \
-  _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rsqrt28_ss(S, M, A, B) \
-  _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rsqrt28_ss(M, A, B) \
-  _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rsqrt28_round_sd(A, B, R) \
-  ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
-                                                (__v2df)(__m128d)(B), \
-                                                (__v2df)_mm_setzero_pd(), \
-                                                (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \
-  ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
-                                                (__v2df)(__m128d)(B), \
-                                                (__v2df)(__m128d)(S), \
-                                                (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \
-  ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
-                                                (__v2df)(__m128d)(B), \
-                                                (__v2df)_mm_setzero_pd(), \
-                                                (__mmask8)(M), (int)(R)))
-
-#define _mm_rsqrt28_sd(A, B) \
-  _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rsqrt28_sd(S, M, A, B) \
-  _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rsqrt28_sd(M, A, B) \
-  _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-/* rcp28 */
-#define _mm512_rcp28_round_pd(A, R) \
-  ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)_mm512_setzero_pd(), \
-                                        (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_rcp28_round_pd(S, M, A, R) \
-  ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(S), (__mmask8)(M), \
-                                        (int)(R)))
-
-#define _mm512_maskz_rcp28_round_pd(M, A, R) \
-  ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)_mm512_setzero_pd(), \
-                                        (__mmask8)(M), (int)(R)))
-
-#define _mm512_rcp28_pd(A) \
-  _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rcp28_pd(S, M, A) \
-  _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rcp28_pd(M, A) \
-  _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_rcp28_round_ps(A, R) \
-  ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)_mm512_setzero_ps(), \
-                                       (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_rcp28_round_ps(S, M, A, R) \
-  ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(S), (__mmask16)(M), \
-                                       (int)(R)))
-
-#define _mm512_maskz_rcp28_round_ps(M, A, R) \
-  ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)_mm512_setzero_ps(), \
-                                       (__mmask16)(M), (int)(R)))
-
-#define _mm512_rcp28_ps(A) \
-  _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rcp28_ps(S, M, A) \
-  _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rcp28_ps(M, A) \
-  _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rcp28_round_ss(A, B, R) \
-  ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
-                                             (__v4sf)(__m128)(B), \
-                                             (__v4sf)_mm_setzero_ps(), \
-                                             (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \
-  ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
-                                             (__v4sf)(__m128)(B), \
-                                             (__v4sf)(__m128)(S), \
-                                             (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rcp28_round_ss(M, A, B, R) \
-  ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
-                                             (__v4sf)(__m128)(B), \
-                                             (__v4sf)_mm_setzero_ps(), \
-                                             (__mmask8)(M), (int)(R)))
-
-#define _mm_rcp28_ss(A, B) \
-  _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rcp28_ss(S, M, A, B) \
-  _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rcp28_ss(M, A, B) \
-  _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rcp28_round_sd(A, B, R) \
-  ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
-                                              (__v2df)(__m128d)(B), \
-                                              (__v2df)_mm_setzero_pd(), \
-                                              (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \
-  ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
-                                              (__v2df)(__m128d)(B), \
-                                              (__v2df)(__m128d)(S), \
-                                              (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rcp28_round_sd(M, A, B, R) \
-  ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
-                                              (__v2df)(__m128d)(B), \
-                                              (__v2df)_mm_setzero_pd(), \
-                                              (__mmask8)(M), (int)(R)))
-
-#define _mm_rcp28_sd(A, B) \
-  _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rcp28_sd(S, M, A, B) \
-  _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rcp28_sd(M, A, B) \
-  _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#endif /* __AVX512ERINTRIN_H */
--- a/lib/include/avx512fp16intrin.h
+++ b/lib/include/avx512fp16intrin.h
@ -96,8 +96,8 @@ _mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
                (h5), (h4), (h3), (h2), (h1))

 static __inline __m512h __DEFAULT_FN_ATTRS512
-_mm512_set1_pch(_Float16 _Complex h) {
-  return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, h));
+_mm512_set1_pch(_Float16 _Complex __h) {
+  return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, __h));
 }

 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
@ -282,75 +282,75 @@ _mm512_zextph256_ph512(__m256h __a) {
 #define _mm_comi_sh(A, B, pred)                                                \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)

-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A,
-                                                          __m128h B) {
-  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS,
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h __A,
+                                                          __m128h __B) {
+  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OS,
                                _MM_FROUND_CUR_DIRECTION);
 }

-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A,
-                                                          __m128h B) {
-  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS,
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h __A,
+                                                          __m128h __B) {
+  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OS,
                                _MM_FROUND_CUR_DIRECTION);
 }

-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A,
-                                                          __m128h B) {
-  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS,
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h __A,
+                                                          __m128h __B) {
+  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OS,
                                _MM_FROUND_CUR_DIRECTION);
 }

-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A,
-                                                          __m128h B) {
-  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS,
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h __A,
+                                                          __m128h __B) {
+  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OS,
                                _MM_FROUND_CUR_DIRECTION);
 }

-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A,
-                                                          __m128h B) {
-  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS,
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h __A,
+                                                          __m128h __B) {
+  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OS,
                                _MM_FROUND_CUR_DIRECTION);
 }

-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A,
-                                                           __m128h B) {
-  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US,
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h __A,
+                                                           __m128h __B) {
+  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_US,
                                _MM_FROUND_CUR_DIRECTION);
 }

-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A,
-                                                           __m128h B) {
-  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ,
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h __A,
+                                                           __m128h __B) {
+  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OQ,
                                _MM_FROUND_CUR_DIRECTION);
 }

-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A,
-                                                           __m128h B) {
-  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ,
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h __A,
+                                                           __m128h __B) {
+  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
 }

-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A,
-                                                           __m128h B) {
-  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ,
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h __A,
+                                                           __m128h __B) {
+  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
 }

-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A,
-                                                           __m128h B) {
-  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ,
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h __A,
+                                                           __m128h __B) {
+  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
 }

-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A,
-                                                           __m128h B) {
-  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ,
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h __A,
+                                                           __m128h __B) {
+  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
 }

-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A,
-                                                            __m128h B) {
-  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ,
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h __A,
+                                                            __m128h __B) {
+  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_UQ,
                                _MM_FROUND_CUR_DIRECTION);
 }

--- a/lib/include/avx512pfintrin.h
+++ b/lib/include/avx512pfintrin.h
@ -1,92 +0,0 @@
-/*===------------- avx512pfintrin.h - PF intrinsics ------------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512PFINTRIN_H
-#define __AVX512PFINTRIN_H
-
-#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \
-  __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
-                             (void const *)(addr), (int)(scale), \
-                             (int)(hint))
-
-#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \
-  __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
-                             (void const *)(addr), (int)(scale), \
-                             (int)(hint))
-
-#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \
-  __builtin_ia32_gatherpfdps((__mmask16)(mask), \
-                             (__v16si)(__m512i)(index), (void const *)(addr), \
-                             (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \
-  __builtin_ia32_gatherpfdps((__mmask16) -1, \
-                             (__v16si)(__m512i)(index), (void const *)(addr), \
-                             (int)(scale), (int)(hint))
-
-#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \
-  __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
-                             (void const *)(addr), (int)(scale), \
-                             (int)(hint))
-
-#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \
-  __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
-                             (void const *)(addr), (int)(scale), \
-                             (int)(hint))
-
-#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \
-  __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
-                             (void const *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \
-  __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
-                             (void const *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \
-  __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
-                              (void *)(addr), (int)(scale), \
-                              (int)(hint))
-
-#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \
-  __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
-                              (void *)(addr), (int)(scale), \
-                              (int)(hint))
-
-#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \
-  __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
-                              (void *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \
-  __builtin_ia32_scatterpfdps((__mmask16)(mask), \
-                              (__v16si)(__m512i)(index), (void *)(addr), \
-                              (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \
-  __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
-                              (void *)(addr), (int)(scale), \
-                              (int)(hint))
-
-#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \
-  __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
-                              (void *)(addr), (int)(scale), \
-                              (int)(hint))
-
-#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \
-  __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
-                              (void *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \
-  __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
-                              (void *)(addr), (int)(scale), (int)(hint))
-
-#endif
--- a/lib/include/avxintrin.h
+++ b/lib/include/avxintrin.h
@ -207,6 +207,8 @@ _mm256_div_ps(__m256 __a, __m256 __b)
 /// Compares two 256-bit vectors of [4 x double] and returns the greater
 ///    of each pair of values.
 ///
+///    If either value in a comparison is NaN, returns the value from \a __b.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
@ -226,6 +228,8 @@ _mm256_max_pd(__m256d __a, __m256d __b)
 /// Compares two 256-bit vectors of [8 x float] and returns the greater
 ///    of each pair of values.
 ///
+///    If either value in a comparison is NaN, returns the value from \a __b.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
@ -245,6 +249,8 @@ _mm256_max_ps(__m256 __a, __m256 __b)
 /// Compares two 256-bit vectors of [4 x double] and returns the lesser
 ///    of each pair of values.
 ///
+///    If either value in a comparison is NaN, returns the value from \a __b.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VMINPD </c> instruction.
@ -264,6 +270,8 @@ _mm256_min_pd(__m256d __a, __m256d __b)
 /// Compares two 256-bit vectors of [8 x float] and returns the lesser
 ///    of each pair of values.
 ///
+///    If either value in a comparison is NaN, returns the value from \a __b.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VMINPS </c> instruction.
@ -832,6 +840,7 @@ _mm256_permutevar_pd(__m256d __a, __m256i __c)

 /// Copies the values stored in a 128-bit vector of [4 x float] as
 ///    specified by the 128-bit integer vector operand.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
@ -1574,14 +1583,6 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
                                     (__v4df)(__m256d)(b), (int)(mask)))

 /* Compare */
-#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
-#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
-#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
-#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
-#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
-#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
-#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
-#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
 #define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
 #define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
 #define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
@ -1607,13 +1608,14 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 #define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
 #define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

+/* Below intrinsic defined in emmintrin.h can be used for AVX */
 /// Compares each of the corresponding double-precision values of two
 ///    128-bit vectors of [2 x double], using the operation specified by the
 ///    immediate integer operand.
 ///
-///    Returns a [2 x double] vector consisting of two doubles corresponding to
-///    the two comparison results: zero if the comparison is false, and all 1's
-///    if the comparison is true.
+///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///    If either value in a comparison is NaN, comparisons that are ordered
+///    return false, and comparisons that are unordered return true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1663,17 +1665,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///    0x1E: Greater-than (ordered, non-signaling) \n
 ///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
-#define _mm_cmp_pd(a, b, c) \
-  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
-                                 (__v2df)(__m128d)(b), (c)))
+/// \fn __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c)

+/* Below intrinsic defined in xmmintrin.h can be used for AVX */
 /// Compares each of the corresponding values of two 128-bit vectors of
 ///    [4 x float], using the operation specified by the immediate integer
 ///    operand.
 ///
-///    Returns a [4 x float] vector consisting of four floats corresponding to
-///    the four comparison results: zero if the comparison is false, and all 1's
-///    if the comparison is true.
+///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
+///    If either value in a comparison is NaN, comparisons that are ordered
+///    return false, and comparisons that are unordered return true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1723,17 +1724,15 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///    0x1E: Greater-than (ordered, non-signaling) \n
 ///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
-#define _mm_cmp_ps(a, b, c) \
-  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
-                                (__v4sf)(__m128)(b), (c)))
+/// \fn __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c)

 /// Compares each of the corresponding double-precision values of two
 ///    256-bit vectors of [4 x double], using the operation specified by the
 ///    immediate integer operand.
 ///
-///    Returns a [4 x double] vector consisting of four doubles corresponding to
-///    the four comparison results: zero if the comparison is false, and all 1's
-///    if the comparison is true.
+///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///    If either value in a comparison is NaN, comparisons that are ordered
+///    return false, and comparisons that are unordered return true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1791,9 +1790,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///    [8 x float], using the operation specified by the immediate integer
 ///    operand.
 ///
-///    Returns a [8 x float] vector consisting of eight floats corresponding to
-///    the eight comparison results: zero if the comparison is false, and all
-///    1's if the comparison is true.
+///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
+///    If either value in a comparison is NaN, comparisons that are ordered
+///    return false, and comparisons that are unordered return true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1847,12 +1846,14 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))

+/* Below intrinsic defined in emmintrin.h can be used for AVX */
 /// Compares each of the corresponding scalar double-precision values of
 ///    two 128-bit vectors of [2 x double], using the operation specified by the
 ///    immediate integer operand.
 ///
-///    If the result is true, all 64 bits of the destination vector are set;
-///    otherwise they are cleared.
+///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///    If either value in a comparison is NaN, comparisons that are ordered
+///    return false, and comparisons that are unordered return true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1902,16 +1903,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///    0x1E: Greater-than (ordered, non-signaling) \n
 ///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
-#define _mm_cmp_sd(a, b, c) \
-  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
-                                 (__v2df)(__m128d)(b), (c)))
+/// \fn __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c)

+/* Below intrinsic defined in xmmintrin.h can be used for AVX */
 /// Compares each of the corresponding scalar values of two 128-bit
 ///    vectors of [4 x float], using the operation specified by the immediate
 ///    integer operand.
 ///
-///    If the result is true, all 32 bits of the destination vector are set;
-///    otherwise they are cleared.
+///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
+///    If either value in a comparison is NaN, comparisons that are ordered
+///    return false, and comparisons that are unordered return true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1961,9 +1962,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///    0x1E: Greater-than (ordered, non-signaling) \n
 ///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
-#define _mm_cmp_ss(a, b, c) \
-  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
-                                (__v4sf)(__m128)(b), (c)))
+/// \fn __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c)

 /// Takes a [8 x i32] vector and returns the vector element value
 ///    indexed by the immediate constant operand.
@ -2213,6 +2212,10 @@ _mm256_cvtpd_ps(__m256d __a)

 /// Converts a vector of [8 x float] into a vector of [8 x i32].
 ///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
@ -2242,9 +2245,13 @@ _mm256_cvtps_pd(__m128 __a)
  return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
 }

-/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
-///    x i32], truncating the result by rounding towards zero when it is
-///    inexact.
+/// Converts a 256-bit vector of [4 x double] into four signed truncated
+///    (rounded toward zero) 32-bit integers returned in a 128-bit vector of
+///    [4 x i32].
+///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -2259,9 +2266,12 @@ _mm256_cvttpd_epi32(__m256d __a)
  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
 }

-/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
-///    x i32]. When a conversion is inexact, the value returned is rounded
-///    according to the rounding control bits in the MXCSR register.
+/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
+///    [4 x i32].
+///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -2276,8 +2286,12 @@ _mm256_cvtpd_epi32(__m256d __a)
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
 }

-/// Converts a vector of [8 x float] into a vector of [8 x i32],
-///    truncating the result by rounding towards zero when it is inexact.
+/// Converts a vector of [8 x float] into eight signed truncated (rounded
+///    toward zero) 32-bit integers returned in a vector of [8 x i32].
+///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
--- a/lib/include/bmiintrin.h
+++ b/lib/include/bmiintrin.h
@ -161,8 +161,7 @@ _mm_tzcnt_64(unsigned long long __X)

 #undef __RELAXED_FN_ATTRS

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__BMI__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__BMI__)

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
@ -610,7 +609,6 @@ __blsr_u64(unsigned long long __X)

 #undef __DEFAULT_FN_ATTRS

-#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules)   \
-          || defined(__BMI__) */
+#endif /* !defined(__SCE__) || __has_feature(modules) || defined(__BMI__) */

 #endif /* __BMIINTRIN_H */
--- a/lib/include/builtins.h
+++ b/lib/include/builtins.h
@ -13,4 +13,7 @@
 #ifndef __BUILTINS_H
 #define __BUILTINS_H

+#if defined(__MVS__) && __has_include_next(<builtins.h>)
+#include_next <builtins.h>
+#endif /* __MVS__ */
 #endif /* __BUILTINS_H */
--- a/lib/include/cpuid.h
+++ b/lib/include/cpuid.h
@ -10,7 +10,7 @@
 #ifndef __CPUID_H
 #define __CPUID_H

-#if !(__x86_64__ || __i386__)
+#if !defined(__x86_64__) && !defined(__i386__)
 #error this header is for x86 only
 #endif

@ -200,6 +200,9 @@
 #define bit_AMXINT8       0x02000000

 /* Features in %eax for leaf 7 sub-leaf 1 */
+#define bit_SHA512        0x00000001
+#define bit_SM3           0x00000002
+#define bit_SM4           0x00000004
 #define bit_RAOINT        0x00000008
 #define bit_AVXVNNI       0x00000010
 #define bit_AVX512BF16    0x00000020
@ -211,7 +214,12 @@
 /* Features in %edx for leaf 7 sub-leaf 1 */
 #define bit_AVXVNNIINT8   0x00000010
 #define bit_AVXNECONVERT  0x00000020
+#define bit_AMXCOMPLEX    0x00000100
+#define bit_AVXVNNIINT16  0x00000400
 #define bit_PREFETCHI     0x00004000
+#define bit_USERMSR       0x00008000
+#define bit_AVX10         0x00080000
+#define bit_APXF          0x00200000

 /* Features in %eax for leaf 13 sub-leaf 1 */
 #define bit_XSAVEOPT    0x00000001
@ -244,8 +252,11 @@
 #define bit_RDPRU       0x00000010
 #define bit_WBNOINVD    0x00000200

+/* Features in %ebx for leaf 0x24 */
+#define bit_AVX10_256   0x00020000
+#define bit_AVX10_512   0x00040000

-#if __i386__
+#ifdef __i386__
 #define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
                  : "0"(__leaf))
@ -274,7 +285,7 @@ static __inline unsigned int __get_cpuid_max (unsigned int __leaf,
                                              unsigned int *__sig)
 {
    unsigned int __eax, __ebx, __ecx, __edx;
-#if __i386__
+#ifdef __i386__
    int __cpuid_supported;

    __asm("  pushfl\n"
@ -328,4 +339,13 @@ static __inline int __get_cpuid_count (unsigned int __leaf,
    return 1;
 }

+// In some configurations, __cpuidex is defined as a builtin (primarily
+// -fms-extensions) which will conflict with the __cpuidex definition below.
+#if !(__has_builtin(__cpuidex))
+static __inline void __cpuidex(int __cpu_info[4], int __leaf, int __subleaf) {
+  __cpuid_count(__leaf, __subleaf, __cpu_info[0], __cpu_info[1], __cpu_info[2],
+                __cpu_info[3]);
+}
+#endif
+
 #endif /* __CPUID_H */
--- a/lib/include/cuda_wrappers/algorithm
+++ b/lib/include/cuda_wrappers/algorithm
@ -99,7 +99,7 @@ template <class __T>
 __attribute__((enable_if(true, "")))
 inline _CPP14_CONSTEXPR __host__ __device__ const __T &
 min(const __T &__a, const __T &__b) {
-  return __a < __b ? __a : __b;
+  return __b < __a ? __b : __a;
 }

 #pragma pop_macro("_CPP14_CONSTEXPR")
--- a/lib/include/emmintrin.h
+++ b/lib/include/emmintrin.h
--- a/lib/include/float.h
+++ b/lib/include/float.h
@ -10,6 +10,10 @@
 #ifndef __CLANG_FLOAT_H
 #define __CLANG_FLOAT_H

+#if defined(__MVS__) && __has_include_next(<float.h>)
+#include_next <float.h>
+#else
+
 /* If we're on MinGW, fall back to the system's float.h, which might have
 * additional definitions provided for Windows.
 * For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
@ -82,6 +86,18 @@
 #    undef DBL_HAS_SUBNORM
 #    undef LDBL_HAS_SUBNORM
 #  endif
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) ||              \
+    !defined(__STRICT_ANSI__)
+#    undef FLT_NORM_MAX
+#    undef DBL_NORM_MAX
+#    undef LDBL_NORM_MAX
+#endif
+#endif
+
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) ||              \
+    !defined(__STRICT_ANSI__)
+#  undef INFINITY
+#  undef NAN
 #endif

 /* Characteristics of floating point types, C99 5.2.4.2.2 */
@ -151,6 +167,17 @@
 #  define LDBL_HAS_SUBNORM __LDBL_HAS_DENORM__
 #endif

+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) ||              \
+    !defined(__STRICT_ANSI__)
+   /* C23 5.2.5.3.3p29-30 */
+#  define INFINITY (__builtin_inff())
+#  define NAN (__builtin_nanf(""))
+   /* C23 5.2.5.3.3p32 */
+#  define FLT_NORM_MAX __FLT_NORM_MAX__
+#  define DBL_NORM_MAX __DBL_NORM_MAX__
+#  define LDBL_NORM_MAX __LDBL_NORM_MAX__
+#endif
+
 #ifdef __STDC_WANT_IEC_60559_TYPES_EXT__
 #  define FLT16_MANT_DIG    __FLT16_MANT_DIG__
 #  define FLT16_DECIMAL_DIG __FLT16_DECIMAL_DIG__
@ -165,4 +192,5 @@
 #  define FLT16_TRUE_MIN    __FLT16_TRUE_MIN__
 #endif /* __STDC_WANT_IEC_60559_TYPES_EXT__ */

+#endif /* __MVS__ */
 #endif /* __CLANG_FLOAT_H */
--- a/lib/include/fmaintrin.h
+++ b/lib/include/fmaintrin.h
@ -60,7 +60,8 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)

 /// Computes a scalar multiply-add of the single-precision values in the
 ///    low 32 bits of 128-bit vectors of [4 x float].
-/// \code
+///
+/// \code{.operation}
 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
 /// result[127:32] = __A[127:32]
 /// \endcode
@ -88,7 +89,8 @@ _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)

 /// Computes a scalar multiply-add of the double-precision values in the
 ///    low 64 bits of 128-bit vectors of [2 x double].
-/// \code
+///
+/// \code{.operation}
 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
 /// result[127:64] = __A[127:64]
 /// \endcode
@ -156,7 +158,8 @@ _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)

 /// Computes a scalar multiply-subtract of the single-precision values in
 ///    the low 32 bits of 128-bit vectors of [4 x float].
-/// \code
+///
+/// \code{.operation}
 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
 /// result[127:32] = __A[127:32]
 /// \endcode
@ -184,7 +187,8 @@ _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)

 /// Computes a scalar multiply-subtract of the double-precision values in
 ///    the low 64 bits of 128-bit vectors of [2 x double].
-/// \code
+///
+/// \code{.operation}
 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
 /// result[127:64] = __A[127:64]
 /// \endcode
@ -252,7 +256,8 @@ _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)

 /// Computes a scalar negated multiply-add of the single-precision values in
 ///    the low 32 bits of 128-bit vectors of [4 x float].
-/// \code
+///
+/// \code{.operation}
 /// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
 /// result[127:32] = __A[127:32]
 /// \endcode
@ -280,7 +285,8 @@ _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)

 /// Computes a scalar negated multiply-add of the double-precision values
 ///    in the low 64 bits of 128-bit vectors of [2 x double].
-/// \code
+///
+/// \code{.operation}
 /// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
 /// result[127:64] = __A[127:64]
 /// \endcode
@ -348,7 +354,8 @@ _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)

 /// Computes a scalar negated multiply-subtract of the single-precision
 ///    values in the low 32 bits of 128-bit vectors of [4 x float].
-/// \code
+///
+/// \code{.operation}
 /// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
 /// result[127:32] = __A[127:32]
 /// \endcode
@ -376,7 +383,8 @@ _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)

 /// Computes a scalar negated multiply-subtract of the double-precision
 ///    values in the low 64 bits of 128-bit vectors of [2 x double].
-/// \code
+///
+/// \code{.operation}
 /// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
 /// result[127:64] = __A[127:64]
 /// \endcode
@ -404,7 +412,8 @@ _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)

 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
 ///    [4 x float].
-/// \code
+///
+/// \code{.operation}
 /// result[31:0]  = (__A[31:0] * __B[31:0]) - __C[31:0]
 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
@ -430,7 +439,8 @@ _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)

 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
 ///    [2 x double].
-/// \code
+///
+/// \code{.operation}
 /// result[63:0]  = (__A[63:0] * __B[63:0]) - __C[63:0]
 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
 /// \endcode
@ -454,7 +464,8 @@ _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)

 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
 ///    [4 x float].
-/// \code
+///
+/// \code{.operation}
 /// result[31:0]  = (__A[31:0] * __B[31:0]) + __C[31:0]
 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
@ -480,7 +491,8 @@ _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)

 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
 ///    [2 x double].
-/// \code
+///
+/// \code{.operation}
 /// result[63:0]  = (__A[63:0] * __B[63:0]) + __C[63:0]
 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
 /// \endcode
@ -664,7 +676,8 @@ _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)

 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
 ///    [8 x float].
-/// \code
+///
+/// \code{.operation}
 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
@ -694,7 +707,8 @@ _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)

 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
 ///    [4 x double].
-/// \code
+///
+/// \code{.operation}
 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
 /// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
@ -720,7 +734,8 @@ _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)

 /// Computes a vector multiply with alternating add/subtract of 256-bit
 ///    vectors of [8 x float].
-/// \code
+///
+/// \code{.operation}
 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
@ -750,7 +765,8 @@ _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)

 /// Computes a vector multiply with alternating add/subtract of 256-bit
 ///    vectors of [4 x double].
-/// \code
+///
+/// \code{.operation}
 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
 /// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
--- a/lib/include/ia32intrin.h
+++ b/lib/include/ia32intrin.h
@ -26,8 +26,8 @@
 #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
 #endif

-/// Find the first set bit starting from the lsb. Result is undefined if
-///    input is 0.
+/// Finds the first set bit starting from the least significant bit. The result
+///    is undefined if the input is 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -43,8 +43,8 @@ __bsfd(int __A) {
  return __builtin_ctz((unsigned int)__A);
 }

-/// Find the first set bit starting from the msb. Result is undefined if
-///    input is 0.
+/// Finds the first set bit starting from the most significant bit. The result
+///    is undefined if the input is 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -90,8 +90,8 @@ _bswap(int __A) {
  return (int)__builtin_bswap32((unsigned int)__A);
 }

-/// Find the first set bit starting from the lsb. Result is undefined if
-///    input is 0.
+/// Finds the first set bit starting from the least significant bit. The result
+///    is undefined if the input is 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -108,8 +108,8 @@ _bswap(int __A) {
 /// \see __bsfd
 #define _bit_scan_forward(A) __bsfd((A))

-/// Find the first set bit starting from the msb. Result is undefined if
-///    input is 0.
+/// Finds the first set bit starting from the most significant bit. The result
+///    is undefined if the input is 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -127,8 +127,8 @@ _bswap(int __A) {
 #define _bit_scan_reverse(A) __bsrd((A))

 #ifdef __x86_64__
-/// Find the first set bit starting from the lsb. Result is undefined if
-///    input is 0.
+/// Finds the first set bit starting from the least significant bit. The result
+///    is undefined if the input is 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -143,8 +143,8 @@ __bsfq(long long __A) {
  return (long long)__builtin_ctzll((unsigned long long)__A);
 }

-/// Find the first set bit starting from the msb. Result is undefined if
-///    input is 0.
+/// Finds the first set bit starting from the most significant bit. The result
+///    is undefined if input is 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -159,7 +159,7 @@ __bsrq(long long __A) {
  return 63 - __builtin_clzll((unsigned long long)__A);
 }

-/// Swaps the bytes in the input. Converting little endian to big endian or
+/// Swaps the bytes in the input, converting little endian to big endian or
 ///    vice versa.
 ///
 /// \headerfile <x86intrin.h>
@ -175,7 +175,7 @@ __bswapq(long long __A) {
  return (long long)__builtin_bswap64((unsigned long long)__A);
 }

-/// Swaps the bytes in the input. Converting little endian to big endian or
+/// Swaps the bytes in the input, converting little endian to big endian or
 ///    vice versa.
 ///
 /// \headerfile <x86intrin.h>
@ -198,7 +198,7 @@ __bswapq(long long __A) {
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the \c POPCNT instruction or a
-///    a sequence of arithmetic and logic ops to calculate it.
+///    sequence of arithmetic and logic operations to calculate it.
 ///
 /// \param __A
 ///    An unsigned 32-bit integer operand.
@ -220,7 +220,7 @@ __popcntd(unsigned int __A)
 /// \endcode
 ///
 /// This intrinsic corresponds to the \c POPCNT instruction or a
-///    a sequence of arithmetic and logic ops to calculate it.
+///    sequence of arithmetic and logic operations to calculate it.
 ///
 /// \param A
 ///    An unsigned 32-bit integer operand.
@ -235,7 +235,7 @@ __popcntd(unsigned int __A)
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the \c POPCNT instruction or a
-///    a sequence of arithmetic and logic ops to calculate it.
+///    sequence of arithmetic and logic operations to calculate it.
 ///
 /// \param __A
 ///    An unsigned 64-bit integer operand.
@ -257,7 +257,7 @@ __popcntq(unsigned long long __A)
 /// \endcode
 ///
 /// This intrinsic corresponds to the \c POPCNT instruction or a
-///    a sequence of arithmetic and logic ops to calculate it.
+///    sequence of arithmetic and logic operations to calculate it.
 ///
 /// \param A
 ///    An unsigned 64-bit integer operand.
@ -268,7 +268,7 @@ __popcntq(unsigned long long __A)
 #endif /* __x86_64__ */

 #ifdef __x86_64__
-/// Returns the program status and control \c RFLAGS register with the \c VM
+/// Returns the program status-and-control \c RFLAGS register with the \c VM
 ///    and \c RF flags cleared.
 ///
 /// \headerfile <x86intrin.h>
@ -282,7 +282,7 @@ __readeflags(void)
  return __builtin_ia32_readeflags_u64();
 }

-/// Writes the specified value to the program status and control \c RFLAGS
+/// Writes the specified value to the program status-and-control \c RFLAGS
 ///    register. Reserved bits are not affected.
 ///
 /// \headerfile <x86intrin.h>
@ -298,7 +298,7 @@ __writeeflags(unsigned long long __f)
 }

 #else /* !__x86_64__ */
-/// Returns the program status and control \c EFLAGS register with the \c VM
+/// Returns the program status-and-control \c EFLAGS register with the \c VM
 ///    and \c RF flags cleared.
 ///
 /// \headerfile <x86intrin.h>
@ -312,7 +312,7 @@ __readeflags(void)
  return __builtin_ia32_readeflags_u32();
 }

-/// Writes the specified value to the program status and control \c EFLAGS
+/// Writes the specified value to the program status-and-control \c EFLAGS
 ///    register. Reserved bits are not affected.
 ///
 /// \headerfile <x86intrin.h>
@ -328,7 +328,7 @@ __writeeflags(unsigned int __f)
 }
 #endif /* !__x86_64__ */

-/// Cast a 32-bit float value to a 32-bit unsigned integer value.
+/// Casts a 32-bit float value to a 32-bit unsigned integer value.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -337,13 +337,13 @@ __writeeflags(unsigned int __f)
 ///
 /// \param __A
 ///    A 32-bit float value.
-/// \returns a 32-bit unsigned integer containing the converted value.
+/// \returns A 32-bit unsigned integer containing the converted value.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS_CAST
 _castf32_u32(float __A) {
  return __builtin_bit_cast(unsigned int, __A);
 }

-/// Cast a 64-bit float value to a 64-bit unsigned integer value.
+/// Casts a 64-bit float value to a 64-bit unsigned integer value.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -352,13 +352,13 @@ _castf32_u32(float __A) {
 ///
 /// \param __A
 ///    A 64-bit float value.
-/// \returns a 64-bit unsigned integer containing the converted value.
+/// \returns A 64-bit unsigned integer containing the converted value.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CAST
 _castf64_u64(double __A) {
  return __builtin_bit_cast(unsigned long long, __A);
 }

-/// Cast a 32-bit unsigned integer value to a 32-bit float value.
+/// Casts a 32-bit unsigned integer value to a 32-bit float value.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -367,13 +367,13 @@ _castf64_u64(double __A) {
 ///
 /// \param __A
 ///    A 32-bit unsigned integer value.
-/// \returns a 32-bit float value containing the converted value.
+/// \returns A 32-bit float value containing the converted value.
 static __inline__ float __DEFAULT_FN_ATTRS_CAST
 _castu32_f32(unsigned int __A) {
  return __builtin_bit_cast(float, __A);
 }

-/// Cast a 64-bit unsigned integer value to a 64-bit float value.
+/// Casts a 64-bit unsigned integer value to a 64-bit float value.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -382,7 +382,7 @@ _castu32_f32(unsigned int __A) {
 ///
 /// \param __A
 ///    A 64-bit unsigned integer value.
-/// \returns a 64-bit float value containing the converted value.
+/// \returns A 64-bit float value containing the converted value.
 static __inline__ double __DEFAULT_FN_ATTRS_CAST
 _castu64_f64(unsigned long long __A) {
  return __builtin_bit_cast(double, __A);
@ -470,7 +470,7 @@ __crc32q(unsigned long long __C, unsigned long long __D)
 }
 #endif /* __x86_64__ */

-/// Reads the specified performance monitoring counter. Refer to your
+/// Reads the specified performance-monitoring counter. Refer to your
 ///    processor's documentation to determine which performance counters are
 ///    supported.
 ///
@ -487,7 +487,7 @@ __rdpmc(int __A) {
  return __builtin_ia32_rdpmc(__A);
 }

-/// Reads the processor's time stamp counter and the \c IA32_TSC_AUX MSR
+/// Reads the processor's time-stamp counter and the \c IA32_TSC_AUX MSR
 ///    \c (0xc0000103).
 ///
 /// \headerfile <x86intrin.h>
@ -495,14 +495,14 @@ __rdpmc(int __A) {
 /// This intrinsic corresponds to the \c RDTSCP instruction.
 ///
 /// \param __A
-///    Address of where to store the 32-bit \c IA32_TSC_AUX value.
-/// \returns The 64-bit value of the time stamp counter.
+///    The address of where to store the 32-bit \c IA32_TSC_AUX value.
+/// \returns The 64-bit value of the time-stamp counter.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __rdtscp(unsigned int *__A) {
  return __builtin_ia32_rdtscp(__A);
 }

-/// Reads the processor's time stamp counter.
+/// Reads the processor's time-stamp counter.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -512,7 +512,7 @@ __rdtscp(unsigned int *__A) {
 ///
 /// This intrinsic corresponds to the \c RDTSC instruction.
 ///
-/// \returns The 64-bit value of the time stamp counter.
+/// \returns The 64-bit value of the time-stamp counter.
 #define _rdtsc() __rdtsc()

 /// Reads the specified performance monitoring counter. Refer to your
--- a/lib/include/immintrin.h
+++ b/lib/include/immintrin.h
@ -16,281 +16,231 @@

 #include <x86gprintrin.h>

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__MMX__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__MMX__)
 #include <mmintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SSE__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE__)
 #include <xmmintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SSE2__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE2__)
 #include <emmintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SSE3__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE3__)
 #include <pmmintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SSSE3__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SSSE3__)
 #include <tmmintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
    (defined(__SSE4_2__) || defined(__SSE4_1__))
 #include <smmintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
    (defined(__AES__) || defined(__PCLMUL__))
 #include <wmmintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__CLFLUSHOPT__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__CLFLUSHOPT__)
 #include <clflushoptintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__CLWB__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__CLWB__)
 #include <clwbintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX__)
 #include <avxintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX2__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX2__)
 #include <avx2intrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__F16C__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__F16C__)
 #include <f16cintrin.h>
 #endif

 /* No feature check desired due to internal checks */
 #include <bmiintrin.h>

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__BMI2__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__BMI2__)
 #include <bmi2intrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__LZCNT__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__LZCNT__)
 #include <lzcntintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__POPCNT__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__POPCNT__)
 #include <popcntintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__FMA__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__FMA__)
 #include <fmaintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512F__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512F__)
 #include <avx512fintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512VL__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VL__)
 #include <avx512vlintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512BW__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BW__)
 #include <avx512bwintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512BITALG__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BITALG__)
 #include <avx512bitalgintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512CD__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512CD__)
 #include <avx512cdintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512VPOPCNTDQ__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VPOPCNTDQ__)
 #include <avx512vpopcntdqintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
    (defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
 #include <avx512vpopcntdqvlintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512VNNI__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VNNI__)
 #include <avx512vnniintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
    (defined(__AVX512VL__) && defined(__AVX512VNNI__))
 #include <avx512vlvnniintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVXVNNI__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNI__)
 #include <avxvnniintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512DQ__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512DQ__)
 #include <avx512dqintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
    (defined(__AVX512VL__) && defined(__AVX512BITALG__))
 #include <avx512vlbitalgintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
    (defined(__AVX512VL__) && defined(__AVX512BW__))
 #include <avx512vlbwintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
    (defined(__AVX512VL__) && defined(__AVX512CD__))
 #include <avx512vlcdintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
    (defined(__AVX512VL__) && defined(__AVX512DQ__))
 #include <avx512vldqintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512ER__)
-#include <avx512erintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512IFMA__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512IFMA__)
 #include <avx512ifmaintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
    (defined(__AVX512IFMA__) && defined(__AVX512VL__))
 #include <avx512ifmavlintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVXIFMA__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXIFMA__)
 #include <avxifmaintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512VBMI__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VBMI__)
 #include <avx512vbmiintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
    (defined(__AVX512VBMI__) && defined(__AVX512VL__))
 #include <avx512vbmivlintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512VBMI2__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VBMI2__)
 #include <avx512vbmi2intrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
    (defined(__AVX512VBMI2__) && defined(__AVX512VL__))
 #include <avx512vlvbmi2intrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512PF__)
-#include <avx512pfintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512FP16__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512FP16__)
 #include <avx512fp16intrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
    (defined(__AVX512VL__) && defined(__AVX512FP16__))
 #include <avx512vlfp16intrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512BF16__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BF16__)
 #include <avx512bf16intrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
    (defined(__AVX512VL__) && defined(__AVX512BF16__))
 #include <avx512vlbf16intrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__PKU__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__PKU__)
 #include <pkuintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__VPCLMULQDQ__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__VPCLMULQDQ__)
 #include <vpclmulqdqintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__VAES__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__VAES__)
 #include <vaesintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__GFNI__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__GFNI__)
 #include <gfniintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVXVNNIINT8__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNIINT8__)
 #include <avxvnniint8intrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVXNECONVERT__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXNECONVERT__)
 #include <avxneconvertintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SHA512__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SHA512__)
 #include <sha512intrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SM3__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SM3__)
 #include <sm3intrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SM4__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SM4__)
 #include <sm4intrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVXVNNIINT16__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNIINT16__)
 #include <avxvnniint16intrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__RDPID__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__RDPID__)
 /// Reads the value of the IA32_TSC_AUX MSR (0xc0000103).
 ///
 /// \headerfile <immintrin.h>
@ -304,8 +254,7 @@ _rdpid_u32(void) {
 }
 #endif // __RDPID__

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__RDRND__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__RDRND__)
 /// Returns a 16-bit hardware-generated random value.
 ///
 /// \headerfile <immintrin.h>
@ -367,8 +316,7 @@ _rdrand64_step(unsigned long long *__p)
 }
 #endif /* __RDRND__ */

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__FSGSBASE__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__FSGSBASE__)
 #ifdef __x86_64__
 /// Reads the FS base register.
 ///
@ -481,8 +429,7 @@ _writegsbase_u64(unsigned long long __V)
 #endif
 #endif /* __FSGSBASE__ */

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__MOVBE__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVBE__)

 /* The structs used below are to force the load/store to be unaligned. This
 * is accomplished with the __packed__ attribute. The __may_alias__ prevents
@ -598,139 +545,118 @@ _storebe_i64(void * __P, long long __D) {
 #endif
 #endif /* __MOVBE */

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__RTM__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__RTM__)
 #include <rtmintrin.h>
 #include <xtestintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SHA__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SHA__)
 #include <shaintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__FXSR__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__FXSR__)
 #include <fxsrintrin.h>
 #endif

 /* No feature check desired due to internal MSC_VER checks */
 #include <xsaveintrin.h>

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__XSAVEOPT__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVEOPT__)
 #include <xsaveoptintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__XSAVEC__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVEC__)
 #include <xsavecintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__XSAVES__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVES__)
 #include <xsavesintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SHSTK__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SHSTK__)
 #include <cetintrin.h>
 #endif

 /* Intrinsics inside adcintrin.h are available at all times. */
 #include <adcintrin.h>

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__ADX__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__ADX__)
 #include <adxintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__RDSEED__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__RDSEED__)
 #include <rdseedintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__WBNOINVD__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__WBNOINVD__)
 #include <wbnoinvdintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__CLDEMOTE__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__CLDEMOTE__)
 #include <cldemoteintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__WAITPKG__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__WAITPKG__)
 #include <waitpkgintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__MOVDIRI__) || defined(__MOVDIR64B__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVDIRI__) ||     \
+    defined(__MOVDIR64B__)
 #include <movdirintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__PCONFIG__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__PCONFIG__)
 #include <pconfigintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SGX__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SGX__)
 #include <sgxintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__PTWRITE__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__PTWRITE__)
 #include <ptwriteintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__INVPCID__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__INVPCID__)
 #include <invpcidintrin.h>
 #endif
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AMX_FP16__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP16__)
 #include <amxfp16intrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__KL__) || defined(__WIDEKL__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__KL__) ||          \
+    defined(__WIDEKL__)
 #include <keylockerintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AMX_TILE__) || defined(__AMX_INT8__) || defined(__AMX_BF16__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TILE__) ||    \
+    defined(__AMX_INT8__) || defined(__AMX_BF16__)
 #include <amxintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AMX_COMPLEX__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_COMPLEX__)
 #include <amxcomplexintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
    defined(__AVX512VP2INTERSECT__)
 #include <avx512vp2intersectintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
    (defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__))
 #include <avx512vlvp2intersectintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__ENQCMD__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
 #include <enqcmdintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SERIALIZE__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SERIALIZE__)
 #include <serializeintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__TSXLDTRK__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__TSXLDTRK__)
 #include <tsxldtrkintrin.h>
 #endif

--- a/lib/include/intrin.h
+++ b/lib/include/intrin.h
@ -15,8 +15,10 @@
 #ifndef __INTRIN_H
 #define __INTRIN_H

+#include <intrin0.h>
+
 /* First include the standard intrinsics. */
-#if defined(__i386__) || defined(__x86_64__)
+#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
 #include <x86intrin.h>
 #endif

@ -24,7 +26,7 @@
 #include <armintr.h>
 #endif

-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(__arm64ec__)
 #include <arm64intr.h>
 #endif

@ -131,8 +133,6 @@ void __writefsqword(unsigned long, unsigned __int64);
 void __writefsword(unsigned long, unsigned short);
 void __writemsr(unsigned long, unsigned __int64);
 void *_AddressOfReturnAddress(void);
-unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
-unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
 unsigned char _bittest(long const *, long);
 unsigned char _bittestandcomplement(long *, long);
 unsigned char _bittestandreset(long *, long);
@ -151,7 +151,6 @@ long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
 __int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
 __int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
 void _ReadBarrier(void);
-void _ReadWriteBarrier(void);
 unsigned int _rorx_u32(unsigned int, const unsigned int);
 int _sarx_i32(int, unsigned int);
 #if __STDC_HOSTED__
@ -167,7 +166,7 @@ unsigned __int32 xbegin(void);
 void _xend(void);

 /* These additional intrinsics are turned on in x64/amd64/x86_64 mode. */
-#ifdef __x86_64__
+#if defined(__x86_64__) && !defined(__arm64ec__)
 void __addgsbyte(unsigned long, unsigned char);
 void __addgsdword(unsigned long, unsigned long);
 void __addgsqword(unsigned long, unsigned __int64);
@ -182,12 +181,6 @@ unsigned char __readgsbyte(unsigned long);
 unsigned long __readgsdword(unsigned long);
 unsigned __int64 __readgsqword(unsigned long);
 unsigned short __readgsword(unsigned long);
-unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
-                                unsigned __int64 _HighPart,
-                                unsigned char _Shift);
-unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
-                                 unsigned __int64 _HighPart,
-                                 unsigned char _Shift);
 void __stosq(unsigned __int64 *, unsigned __int64, size_t);
 unsigned char __vmx_on(unsigned __int64 *);
 unsigned char __vmx_vmclear(unsigned __int64 *);
@ -236,216 +229,15 @@ unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
 unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
 __int64 __mulh(__int64, __int64);
 unsigned __int64 __umulh(unsigned __int64, unsigned __int64);
-__int64 _mul128(__int64, __int64, __int64*);
-unsigned __int64 _umul128(unsigned __int64,
-                          unsigned __int64,
-                          unsigned __int64*);
+__int64 _mul128(__int64, __int64, __int64 *);

 #endif /* __x86_64__ */

-#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
-
-unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
-unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
-
-#endif
-
-#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
-__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
-__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
-__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
-__int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value);
-__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
-__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask);
-
-#endif
-
-/*----------------------------------------------------------------------------*\
-|* Interlocked Exchange Add
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-char _InterlockedExchangeAdd8_acq(char volatile *_Addend, char _Value);
-char _InterlockedExchangeAdd8_nf(char volatile *_Addend, char _Value);
-char _InterlockedExchangeAdd8_rel(char volatile *_Addend, char _Value);
-short _InterlockedExchangeAdd16_acq(short volatile *_Addend, short _Value);
-short _InterlockedExchangeAdd16_nf(short volatile *_Addend, short _Value);
-short _InterlockedExchangeAdd16_rel(short volatile *_Addend, short _Value);
-long _InterlockedExchangeAdd_acq(long volatile *_Addend, long _Value);
-long _InterlockedExchangeAdd_nf(long volatile *_Addend, long _Value);
-long _InterlockedExchangeAdd_rel(long volatile *_Addend, long _Value);
-__int64 _InterlockedExchangeAdd64_acq(__int64 volatile *_Addend, __int64 _Value);
-__int64 _InterlockedExchangeAdd64_nf(__int64 volatile *_Addend, __int64 _Value);
-__int64 _InterlockedExchangeAdd64_rel(__int64 volatile *_Addend, __int64 _Value);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Increment
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-short _InterlockedIncrement16_acq(short volatile *_Value);
-short _InterlockedIncrement16_nf(short volatile *_Value);
-short _InterlockedIncrement16_rel(short volatile *_Value);
-long _InterlockedIncrement_acq(long volatile *_Value);
-long _InterlockedIncrement_nf(long volatile *_Value);
-long _InterlockedIncrement_rel(long volatile *_Value);
-__int64 _InterlockedIncrement64_acq(__int64 volatile *_Value);
-__int64 _InterlockedIncrement64_nf(__int64 volatile *_Value);
-__int64 _InterlockedIncrement64_rel(__int64 volatile *_Value);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Decrement
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-short _InterlockedDecrement16_acq(short volatile *_Value);
-short _InterlockedDecrement16_nf(short volatile *_Value);
-short _InterlockedDecrement16_rel(short volatile *_Value);
-long _InterlockedDecrement_acq(long volatile *_Value);
-long _InterlockedDecrement_nf(long volatile *_Value);
-long _InterlockedDecrement_rel(long volatile *_Value);
-__int64 _InterlockedDecrement64_acq(__int64 volatile *_Value);
-__int64 _InterlockedDecrement64_nf(__int64 volatile *_Value);
-__int64 _InterlockedDecrement64_rel(__int64 volatile *_Value);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked And
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-char _InterlockedAnd8_acq(char volatile *_Value, char _Mask);
-char _InterlockedAnd8_nf(char volatile *_Value, char _Mask);
-char _InterlockedAnd8_rel(char volatile *_Value, char _Mask);
-short _InterlockedAnd16_acq(short volatile *_Value, short _Mask);
-short _InterlockedAnd16_nf(short volatile *_Value, short _Mask);
-short _InterlockedAnd16_rel(short volatile *_Value, short _Mask);
-long _InterlockedAnd_acq(long volatile *_Value, long _Mask);
-long _InterlockedAnd_nf(long volatile *_Value, long _Mask);
-long _InterlockedAnd_rel(long volatile *_Value, long _Mask);
-__int64 _InterlockedAnd64_acq(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedAnd64_nf(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Bit Counting and Testing
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-unsigned char _interlockedbittestandset_acq(long volatile *_BitBase,
-                                            long _BitPos);
-unsigned char _interlockedbittestandset_nf(long volatile *_BitBase,
-                                           long _BitPos);
-unsigned char _interlockedbittestandset_rel(long volatile *_BitBase,
-                                            long _BitPos);
-unsigned char _interlockedbittestandreset_acq(long volatile *_BitBase,
-                                              long _BitPos);
-unsigned char _interlockedbittestandreset_nf(long volatile *_BitBase,
-                                             long _BitPos);
-unsigned char _interlockedbittestandreset_rel(long volatile *_BitBase,
-                                              long _BitPos);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Or
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-char _InterlockedOr8_acq(char volatile *_Value, char _Mask);
-char _InterlockedOr8_nf(char volatile *_Value, char _Mask);
-char _InterlockedOr8_rel(char volatile *_Value, char _Mask);
-short _InterlockedOr16_acq(short volatile *_Value, short _Mask);
-short _InterlockedOr16_nf(short volatile *_Value, short _Mask);
-short _InterlockedOr16_rel(short volatile *_Value, short _Mask);
-long _InterlockedOr_acq(long volatile *_Value, long _Mask);
-long _InterlockedOr_nf(long volatile *_Value, long _Mask);
-long _InterlockedOr_rel(long volatile *_Value, long _Mask);
-__int64 _InterlockedOr64_acq(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedOr64_nf(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedOr64_rel(__int64 volatile *_Value, __int64 _Mask);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Xor
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-char _InterlockedXor8_acq(char volatile *_Value, char _Mask);
-char _InterlockedXor8_nf(char volatile *_Value, char _Mask);
-char _InterlockedXor8_rel(char volatile *_Value, char _Mask);
-short _InterlockedXor16_acq(short volatile *_Value, short _Mask);
-short _InterlockedXor16_nf(short volatile *_Value, short _Mask);
-short _InterlockedXor16_rel(short volatile *_Value, short _Mask);
-long _InterlockedXor_acq(long volatile *_Value, long _Mask);
-long _InterlockedXor_nf(long volatile *_Value, long _Mask);
-long _InterlockedXor_rel(long volatile *_Value, long _Mask);
-__int64 _InterlockedXor64_acq(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedXor64_nf(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedXor64_rel(__int64 volatile *_Value, __int64 _Mask);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Exchange
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-char _InterlockedExchange8_acq(char volatile *_Target, char _Value);
-char _InterlockedExchange8_nf(char volatile *_Target, char _Value);
-char _InterlockedExchange8_rel(char volatile *_Target, char _Value);
-short _InterlockedExchange16_acq(short volatile *_Target, short _Value);
-short _InterlockedExchange16_nf(short volatile *_Target, short _Value);
-short _InterlockedExchange16_rel(short volatile *_Target, short _Value);
-long _InterlockedExchange_acq(long volatile *_Target, long _Value);
-long _InterlockedExchange_nf(long volatile *_Target, long _Value);
-long _InterlockedExchange_rel(long volatile *_Target, long _Value);
-__int64 _InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value);
-__int64 _InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value);
-__int64 _InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Compare Exchange
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-char _InterlockedCompareExchange8_acq(char volatile *_Destination,
-                             char _Exchange, char _Comparand);
-char _InterlockedCompareExchange8_nf(char volatile *_Destination,
-                             char _Exchange, char _Comparand);
-char _InterlockedCompareExchange8_rel(char volatile *_Destination,
-                             char _Exchange, char _Comparand);
-short _InterlockedCompareExchange16_acq(short volatile *_Destination,
-                              short _Exchange, short _Comparand);
-short _InterlockedCompareExchange16_nf(short volatile *_Destination,
-                              short _Exchange, short _Comparand);
-short _InterlockedCompareExchange16_rel(short volatile *_Destination,
-                              short _Exchange, short _Comparand);
-long _InterlockedCompareExchange_acq(long volatile *_Destination,
-                              long _Exchange, long _Comparand);
-long _InterlockedCompareExchange_nf(long volatile *_Destination,
-                              long _Exchange, long _Comparand);
-long _InterlockedCompareExchange_rel(long volatile *_Destination,
-                              long _Exchange, long _Comparand);
-__int64 _InterlockedCompareExchange64_acq(__int64 volatile *_Destination,
-                              __int64 _Exchange, __int64 _Comparand);
-__int64 _InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
-                              __int64 _Exchange, __int64 _Comparand);
-__int64 _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
-                              __int64 _Exchange, __int64 _Comparand);
-#endif
-#if defined(__x86_64__) || defined(__aarch64__)
-unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
-                                             __int64 _ExchangeHigh,
-                                             __int64 _ExchangeLow,
-                                             __int64 *_ComparandResult);
-#endif
-#if defined(__aarch64__)
-unsigned char _InterlockedCompareExchange128_acq(__int64 volatile *_Destination,
-                                                 __int64 _ExchangeHigh,
-                                                 __int64 _ExchangeLow,
-                                                 __int64 *_ComparandResult);
-unsigned char _InterlockedCompareExchange128_nf(__int64 volatile *_Destination,
-                                                __int64 _ExchangeHigh,
-                                                __int64 _ExchangeLow,
-                                                __int64 *_ComparandResult);
-unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination,
-                                                 __int64 _ExchangeHigh,
-                                                 __int64 _ExchangeLow,
-                                                 __int64 *_ComparandResult);
-#endif
-
 /*----------------------------------------------------------------------------*\
 |* movs, stos
 \*----------------------------------------------------------------------------*/
-#if defined(__i386__) || defined(__x86_64__)
+
+#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
 static __inline__ void __DEFAULT_FN_ATTRS __movsb(unsigned char *__dst,
                                                  unsigned char const *__src,
                                                  size_t __n) {
@ -514,7 +306,7 @@ static __inline__ void __DEFAULT_FN_ATTRS __stosw(unsigned short *__dst,
                       : "memory");
 }
 #endif
-#ifdef __x86_64__
+#if defined(__x86_64__) && !defined(__arm64ec__)
 static __inline__ void __DEFAULT_FN_ATTRS __movsq(
    unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
  __asm__ __volatile__("rep movsq"
@ -533,10 +325,40 @@ static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst,
 /*----------------------------------------------------------------------------*\
 |* Misc
 \*----------------------------------------------------------------------------*/
-#if defined(__i386__) || defined(__x86_64__)
+#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
 static __inline__ void __DEFAULT_FN_ATTRS __halt(void) {
  __asm__ volatile("hlt");
 }
+
+static inline unsigned char __inbyte(unsigned short port) {
+  unsigned char ret;
+  __asm__ __volatile__("inb %w1, %b0" : "=a"(ret) : "Nd"(port));
+  return ret;
+}
+
+static inline unsigned short __inword(unsigned short port) {
+  unsigned short ret;
+  __asm__ __volatile__("inw %w1, %w0" : "=a"(ret) : "Nd"(port));
+  return ret;
+}
+
+static inline unsigned long __indword(unsigned short port) {
+  unsigned long ret;
+  __asm__ __volatile__("inl %w1, %k0" : "=a"(ret) : "Nd"(port));
+  return ret;
+}
+
+static inline void __outbyte(unsigned short port, unsigned char data) {
+  __asm__ __volatile__("outb %b0, %w1" : : "a"(data), "Nd"(port));
+}
+
+static inline void __outword(unsigned short port, unsigned short data) {
+  __asm__ __volatile__("outw %w0, %w1" : : "a"(data), "Nd"(port));
+}
+
+static inline void __outdword(unsigned short port, unsigned long data) {
+  __asm__ __volatile__("outl %k0, %w1" : : "a"(data), "Nd"(port));
+}
 #endif

 #if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__)
@ -548,9 +370,10 @@ static __inline__ void __DEFAULT_FN_ATTRS __nop(void) {
 /*----------------------------------------------------------------------------*\
 |* MS AArch64 specific
 \*----------------------------------------------------------------------------*/
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(__arm64ec__)
 unsigned __int64 __getReg(int);
 long _InterlockedAdd(long volatile *Addend, long Value);
+__int64 _InterlockedAdd64(__int64 volatile *Addend, __int64 Value);
 __int64 _ReadStatusReg(int);
 void _WriteStatusReg(int, __int64);

@ -582,18 +405,19 @@ unsigned int _CountLeadingOnes(unsigned long);
 unsigned int _CountLeadingOnes64(unsigned __int64);
 unsigned int _CountLeadingSigns(long);
 unsigned int _CountLeadingSigns64(__int64);
-unsigned int _CountLeadingZeros(unsigned long);
-unsigned int _CountLeadingZeros64(unsigned _int64);
 unsigned int _CountOneBits(unsigned long);
 unsigned int _CountOneBits64(unsigned __int64);

-void __cdecl __prefetch(void *);
+unsigned int __hlt(unsigned int, ...);
+
+void __cdecl __prefetch(const void *);
+
 #endif

 /*----------------------------------------------------------------------------*\
 |* Privileged intrinsics
 \*----------------------------------------------------------------------------*/
-#if defined(__i386__) || defined(__x86_64__)
+#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
 static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
 __readmsr(unsigned long __register) {
  // Loads the contents of a 64-bit model specific register (MSR) specified in
@ -607,7 +431,6 @@ __readmsr(unsigned long __register) {
  __asm__ ("rdmsr" : "=d"(__edx), "=a"(__eax) : "c"(__register));
  return (((unsigned __int64)__edx) << 32) | (unsigned __int64)__eax;
 }
-#endif

 static __inline__ unsigned __LPTRINT_TYPE__ __DEFAULT_FN_ATTRS __readcr3(void) {
  unsigned __LPTRINT_TYPE__ __cr3_val;
@ -623,6 +446,7 @@ static __inline__ void __DEFAULT_FN_ATTRS
 __writecr3(unsigned __INTPTR_TYPE__ __cr3_val) {
  __asm__ ("mov {%0, %%cr3|cr3, %0}" : : "r"(__cr3_val) : "memory");
 }
+#endif

 #ifdef __cplusplus
 }
--- a/lib/include/intrin0.h
+++ b/lib/include/intrin0.h
@ -0,0 +1,247 @@
+/* ===-------- intrin.h ---------------------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the windows platform. */
+#ifndef _MSC_VER
+#include_next <intrin0.h>
+#else
+
+#ifndef __INTRIN0_H
+#define __INTRIN0_H
+
+#if defined(__x86_64__) && !defined(__arm64ec__)
+#include <adcintrin.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
+unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
+void _ReadWriteBarrier(void);
+
+#if defined(__aarch64__) || defined(__arm64ec__)
+unsigned int _CountLeadingZeros(unsigned long);
+unsigned int _CountLeadingZeros64(unsigned _int64);
+unsigned char _InterlockedCompareExchange128_acq(__int64 volatile *_Destination,
+                                                 __int64 _ExchangeHigh,
+                                                 __int64 _ExchangeLow,
+                                                 __int64 *_ComparandResult);
+unsigned char _InterlockedCompareExchange128_nf(__int64 volatile *_Destination,
+                                                __int64 _ExchangeHigh,
+                                                __int64 _ExchangeLow,
+                                                __int64 *_ComparandResult);
+unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination,
+                                                 __int64 _ExchangeHigh,
+                                                 __int64 _ExchangeLow,
+                                                 __int64 *_ComparandResult);
+#endif
+
+#ifdef __x86_64__ && !defined(__arm64ec__)
+unsigned __int64 _umul128(unsigned __int64, unsigned __int64,
+                          unsigned __int64 *);
+unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
+                                unsigned __int64 _HighPart,
+                                unsigned char _Shift);
+unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
+                                 unsigned __int64 _HighPart,
+                                 unsigned char _Shift);
+#endif
+
+#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
+void _mm_pause(void);
+#endif
+
+#if defined(__x86_64__) || defined(__aarch64__)
+unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
+                                             __int64 _ExchangeHigh,
+                                             __int64 _ExchangeLow,
+                                             __int64 *_ComparandResult);
+#endif
+
+#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
+unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
+unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
+#endif
+
+#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) ||            \
+    defined(__aarch64__)
+__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
+__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
+__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
+__int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value);
+__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
+__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask);
+#endif
+
+#if defined(__arm__) || defined(__aarch64__) || defined(__arm64ec__)
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Add
+\*----------------------------------------------------------------------------*/
+char _InterlockedExchangeAdd8_acq(char volatile *_Addend, char _Value);
+char _InterlockedExchangeAdd8_nf(char volatile *_Addend, char _Value);
+char _InterlockedExchangeAdd8_rel(char volatile *_Addend, char _Value);
+short _InterlockedExchangeAdd16_acq(short volatile *_Addend, short _Value);
+short _InterlockedExchangeAdd16_nf(short volatile *_Addend, short _Value);
+short _InterlockedExchangeAdd16_rel(short volatile *_Addend, short _Value);
+long _InterlockedExchangeAdd_acq(long volatile *_Addend, long _Value);
+long _InterlockedExchangeAdd_nf(long volatile *_Addend, long _Value);
+long _InterlockedExchangeAdd_rel(long volatile *_Addend, long _Value);
+__int64 _InterlockedExchangeAdd64_acq(__int64 volatile *_Addend,
+                                      __int64 _Value);
+__int64 _InterlockedExchangeAdd64_nf(__int64 volatile *_Addend, __int64 _Value);
+__int64 _InterlockedExchangeAdd64_rel(__int64 volatile *_Addend,
+                                      __int64 _Value);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked Increment
+\*----------------------------------------------------------------------------*/
+short _InterlockedIncrement16_acq(short volatile *_Value);
+short _InterlockedIncrement16_nf(short volatile *_Value);
+short _InterlockedIncrement16_rel(short volatile *_Value);
+long _InterlockedIncrement_acq(long volatile *_Value);
+long _InterlockedIncrement_nf(long volatile *_Value);
+long _InterlockedIncrement_rel(long volatile *_Value);
+__int64 _InterlockedIncrement64_acq(__int64 volatile *_Value);
+__int64 _InterlockedIncrement64_nf(__int64 volatile *_Value);
+__int64 _InterlockedIncrement64_rel(__int64 volatile *_Value);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked Decrement
+\*----------------------------------------------------------------------------*/
+short _InterlockedDecrement16_acq(short volatile *_Value);
+short _InterlockedDecrement16_nf(short volatile *_Value);
+short _InterlockedDecrement16_rel(short volatile *_Value);
+long _InterlockedDecrement_acq(long volatile *_Value);
+long _InterlockedDecrement_nf(long volatile *_Value);
+long _InterlockedDecrement_rel(long volatile *_Value);
+__int64 _InterlockedDecrement64_acq(__int64 volatile *_Value);
+__int64 _InterlockedDecrement64_nf(__int64 volatile *_Value);
+__int64 _InterlockedDecrement64_rel(__int64 volatile *_Value);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked And
+\*----------------------------------------------------------------------------*/
+char _InterlockedAnd8_acq(char volatile *_Value, char _Mask);
+char _InterlockedAnd8_nf(char volatile *_Value, char _Mask);
+char _InterlockedAnd8_rel(char volatile *_Value, char _Mask);
+short _InterlockedAnd16_acq(short volatile *_Value, short _Mask);
+short _InterlockedAnd16_nf(short volatile *_Value, short _Mask);
+short _InterlockedAnd16_rel(short volatile *_Value, short _Mask);
+long _InterlockedAnd_acq(long volatile *_Value, long _Mask);
+long _InterlockedAnd_nf(long volatile *_Value, long _Mask);
+long _InterlockedAnd_rel(long volatile *_Value, long _Mask);
+__int64 _InterlockedAnd64_acq(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedAnd64_nf(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask);
+
+/*----------------------------------------------------------------------------*\
+|* Bit Counting and Testing
+\*----------------------------------------------------------------------------*/
+unsigned char _interlockedbittestandset_acq(long volatile *_BitBase,
+                                            long _BitPos);
+unsigned char _interlockedbittestandset_nf(long volatile *_BitBase,
+                                           long _BitPos);
+unsigned char _interlockedbittestandset_rel(long volatile *_BitBase,
+                                            long _BitPos);
+unsigned char _interlockedbittestandreset_acq(long volatile *_BitBase,
+                                              long _BitPos);
+unsigned char _interlockedbittestandreset_nf(long volatile *_BitBase,
+                                             long _BitPos);
+unsigned char _interlockedbittestandreset_rel(long volatile *_BitBase,
+                                              long _BitPos);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked Or
+\*----------------------------------------------------------------------------*/
+char _InterlockedOr8_acq(char volatile *_Value, char _Mask);
+char _InterlockedOr8_nf(char volatile *_Value, char _Mask);
+char _InterlockedOr8_rel(char volatile *_Value, char _Mask);
+short _InterlockedOr16_acq(short volatile *_Value, short _Mask);
+short _InterlockedOr16_nf(short volatile *_Value, short _Mask);
+short _InterlockedOr16_rel(short volatile *_Value, short _Mask);
+long _InterlockedOr_acq(long volatile *_Value, long _Mask);
+long _InterlockedOr_nf(long volatile *_Value, long _Mask);
+long _InterlockedOr_rel(long volatile *_Value, long _Mask);
+__int64 _InterlockedOr64_acq(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedOr64_nf(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedOr64_rel(__int64 volatile *_Value, __int64 _Mask);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked Xor
+\*----------------------------------------------------------------------------*/
+char _InterlockedXor8_acq(char volatile *_Value, char _Mask);
+char _InterlockedXor8_nf(char volatile *_Value, char _Mask);
+char _InterlockedXor8_rel(char volatile *_Value, char _Mask);
+short _InterlockedXor16_acq(short volatile *_Value, short _Mask);
+short _InterlockedXor16_nf(short volatile *_Value, short _Mask);
+short _InterlockedXor16_rel(short volatile *_Value, short _Mask);
+long _InterlockedXor_acq(long volatile *_Value, long _Mask);
+long _InterlockedXor_nf(long volatile *_Value, long _Mask);
+long _InterlockedXor_rel(long volatile *_Value, long _Mask);
+__int64 _InterlockedXor64_acq(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedXor64_nf(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedXor64_rel(__int64 volatile *_Value, __int64 _Mask);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange
+\*----------------------------------------------------------------------------*/
+char _InterlockedExchange8_acq(char volatile *_Target, char _Value);
+char _InterlockedExchange8_nf(char volatile *_Target, char _Value);
+char _InterlockedExchange8_rel(char volatile *_Target, char _Value);
+short _InterlockedExchange16_acq(short volatile *_Target, short _Value);
+short _InterlockedExchange16_nf(short volatile *_Target, short _Value);
+short _InterlockedExchange16_rel(short volatile *_Target, short _Value);
+long _InterlockedExchange_acq(long volatile *_Target, long _Value);
+long _InterlockedExchange_nf(long volatile *_Target, long _Value);
+long _InterlockedExchange_rel(long volatile *_Target, long _Value);
+__int64 _InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value);
+__int64 _InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value);
+__int64 _InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked Compare Exchange
+\*----------------------------------------------------------------------------*/
+char _InterlockedCompareExchange8_acq(char volatile *_Destination,
+                                      char _Exchange, char _Comparand);
+char _InterlockedCompareExchange8_nf(char volatile *_Destination,
+                                     char _Exchange, char _Comparand);
+char _InterlockedCompareExchange8_rel(char volatile *_Destination,
+                                      char _Exchange, char _Comparand);
+short _InterlockedCompareExchange16_acq(short volatile *_Destination,
+                                        short _Exchange, short _Comparand);
+short _InterlockedCompareExchange16_nf(short volatile *_Destination,
+                                       short _Exchange, short _Comparand);
+short _InterlockedCompareExchange16_rel(short volatile *_Destination,
+                                        short _Exchange, short _Comparand);
+long _InterlockedCompareExchange_acq(long volatile *_Destination,
+                                     long _Exchange, long _Comparand);
+long _InterlockedCompareExchange_nf(long volatile *_Destination, long _Exchange,
+                                    long _Comparand);
+long _InterlockedCompareExchange_rel(long volatile *_Destination,
+                                     long _Exchange, long _Comparand);
+__int64 _InterlockedCompareExchange64_acq(__int64 volatile *_Destination,
+                                          __int64 _Exchange,
+                                          __int64 _Comparand);
+__int64 _InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
+                                         __int64 _Exchange, __int64 _Comparand);
+__int64 _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
+                                          __int64 _Exchange,
+                                          __int64 _Comparand);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __INTRIN0_H */
+#endif /* _MSC_VER */
--- a/lib/include/inttypes.h
+++ b/lib/include/inttypes.h
@ -13,6 +13,9 @@
 #if !defined(_AIX) || !defined(_STD_TYPES_T)
 #define __CLANG_INTTYPES_H
 #endif
+#if defined(__MVS__) && __has_include_next(<inttypes.h>)
+#include_next <inttypes.h>
+#else

 #if defined(_MSC_VER) && _MSC_VER < 1800
 #error MSVC does not have inttypes.h prior to Visual Studio 2013
@ -94,4 +97,5 @@
 #define SCNxFAST32 "x"
 #endif

+#endif /* __MVS__ */
 #endif /* __CLANG_INTTYPES_H */
--- a/lib/include/iso646.h
+++ b/lib/include/iso646.h
@ -9,6 +9,9 @@

 #ifndef __ISO646_H
 #define __ISO646_H
+#if defined(__MVS__) && __has_include_next(<iso646.h>)
+#include_next <iso646.h>
+#else

 #ifndef __cplusplus
 #define and    &&
@ -24,4 +27,5 @@
 #define xor_eq ^=
 #endif

+#endif /* __MVS__ */
 #endif /* __ISO646_H */
--- a/lib/include/keylockerintrin.h
+++ b/lib/include/keylockerintrin.h
@ -28,8 +28,7 @@
 #ifndef _KEYLOCKERINTRIN_H
 #define _KEYLOCKERINTRIN_H

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__KL__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__KL__)

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS \
@ -327,11 +326,9 @@ _mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {

 #undef __DEFAULT_FN_ATTRS

-#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \
-          || defined(__KL__) */
+#endif /* !defined(__SCE__ || __has_feature(modules) || defined(__KL__) */

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__WIDEKL__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__WIDEKL__)

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS \
@ -524,7 +521,7 @@ _mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*

 #undef __DEFAULT_FN_ATTRS

-#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \
-          || defined(__WIDEKL__) */
+#endif /* !defined(__SCE__) || __has_feature(modules) || defined(__WIDEKL__)   \
+        */

 #endif /* _KEYLOCKERINTRIN_H */
--- a/lib/include/limits.h
+++ b/lib/include/limits.h
@ -9,6 +9,10 @@
 #ifndef __CLANG_LIMITS_H
 #define __CLANG_LIMITS_H

+#if defined(__MVS__) && __has_include_next(<limits.h>)
+#include_next <limits.h>
+#else
+
 /* The system's limits.h may, in turn, try to #include_next GCC's limits.h.
   Avert this #include_next madness. */
 #if defined __GNUC__ && !defined _GCC_LIMITS_H_
@ -122,4 +126,5 @@
 #define ULONG_LONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
 #endif

+#endif /* __MVS__ */
 #endif /* __CLANG_LIMITS_H */
--- a/lib/include/llvm_libc_wrappers/assert.h
+++ b/lib/include/llvm_libc_wrappers/assert.h
@ -1,4 +1,4 @@
-//===-- Wrapper for C standard assert.h declarations on the GPU ------------===//
+//===-- Wrapper for C standard assert.h declarations on the GPU -*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
--- a/lib/include/mm3dnow.h
+++ b/lib/include/mm3dnow.h
@ -7,151 +7,16 @@
 *===-----------------------------------------------------------------------===
 */

+// 3dNow intrinsics are no longer supported.
+
 #ifndef _MM3DNOW_H_INCLUDED
 #define _MM3DNOW_H_INCLUDED

+#ifndef _CLANG_DISABLE_CRT_DEPRECATION_WARNINGS
+#warning "The <mm3dnow.h> header is deprecated, and 3dNow! intrinsics are unsupported. For other intrinsics, include <x86intrin.h>, instead."
+#endif
+
 #include <mmintrin.h>
 #include <prfchwintrin.h>

-typedef float __v2sf __attribute__((__vector_size__(8)));
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow"), __min_vector_width__(64)))
-
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
-_m_femms(void) {
-  __builtin_ia32_femms();
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pavgusb(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pavgusb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pf2id(__m64 __m) {
-  return (__m64)__builtin_ia32_pf2id((__v2sf)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfacc(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfacc((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfadd(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfadd((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfcmpeq(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfcmpge(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfcmpge((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfcmpgt(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfmax(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfmax((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfmin(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfmin((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfmul(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfmul((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfrcp(__m64 __m) {
-  return (__m64)__builtin_ia32_pfrcp((__v2sf)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfrcpit1(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfrcpit2(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfrsqrt(__m64 __m) {
-  return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfrsqrtit1(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfsub(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfsub((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfsubr(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfsubr((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pi2fd(__m64 __m) {
-  return (__m64)__builtin_ia32_pi2fd((__v2si)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pmulhrw(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/* Handle the 3dnowa instructions here. */
-#undef __DEFAULT_FN_ATTRS
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa"), __min_vector_width__(64)))
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pf2iw(__m64 __m) {
-  return (__m64)__builtin_ia32_pf2iw((__v2sf)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfnacc(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfnacc((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfpnacc(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfpnacc((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pi2fw(__m64 __m) {
-  return (__m64)__builtin_ia32_pi2fw((__v2si)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pswapdsf(__m64 __m) {
-  return (__m64)__builtin_ia32_pswapdsf((__v2sf)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pswapdsi(__m64 __m) {
-  return (__m64)__builtin_ia32_pswapdsi((__v2si)__m);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
 #endif
--- a/lib/include/mmintrin.h
+++ b/lib/include/mmintrin.h
@ -105,28 +105,23 @@ _mm_cvtm64_si64(__m64 __m)
    return (long long)__m;
 }

-/// Converts 16-bit signed integers from both 64-bit integer vector
-///    parameters of [4 x i16] into 8-bit signed integer values, and constructs
-///    a 64-bit integer vector of [8 x i8] as the result. Positive values
-///    greater than 0x7F are saturated to 0x7F. Negative values less than 0x80
-///    are saturated to 0x80.
+/// Converts, with saturation, 16-bit signed integers from both 64-bit integer
+///    vector parameters of [4 x i16] into 8-bit signed integer values, and
+///    constructs a 64-bit integer vector of [8 x i8] as the result.
+///
+///    Positive values greater than 0x7F are saturated to 0x7F. Negative values
+///    less than 0x80 are saturated to 0x80.
 ///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
 ///
 /// \param __m1
-///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
-///    16-bit signed integer and is converted to an 8-bit signed integer with
-///    saturation. Positive values greater than 0x7F are saturated to 0x7F.
-///    Negative values less than 0x80 are saturated to 0x80. The converted
-///    [4 x i8] values are written to the lower 32 bits of the result.
+///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
+///    written to the lower 32 bits of the result.
 /// \param __m2
-///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
-///    16-bit signed integer and is converted to an 8-bit signed integer with
-///    saturation. Positive values greater than 0x7F are saturated to 0x7F.
-///    Negative values less than 0x80 are saturated to 0x80. The converted
-///    [4 x i8] values are written to the upper 32 bits of the result.
+///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
+///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
 ///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
@ -135,28 +130,23 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
 }

-/// Converts 32-bit signed integers from both 64-bit integer vector
-///    parameters of [2 x i32] into 16-bit signed integer values, and constructs
-///    a 64-bit integer vector of [4 x i16] as the result. Positive values
-///    greater than 0x7FFF are saturated to 0x7FFF. Negative values less than
-///    0x8000 are saturated to 0x8000.
+/// Converts, with saturation, 32-bit signed integers from both 64-bit integer
+///    vector parameters of [2 x i32] into 16-bit signed integer values, and
+///    constructs a 64-bit integer vector of [4 x i16] as the result.
+///
+///    Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
+///    values less than 0x8000 are saturated to 0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
 ///
 /// \param __m1
-///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
-///    32-bit signed integer and is converted to a 16-bit signed integer with
-///    saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
-///    Negative values less than 0x8000 are saturated to 0x8000. The converted
-///    [2 x i16] values are written to the lower 32 bits of the result.
+///    A 64-bit integer vector of [2 x i32]. The converted [2 x i16] values are
+///    written to the lower 32 bits of the result.
 /// \param __m2
-///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
-///    32-bit signed integer and is converted to a 16-bit signed integer with
-///    saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
-///    Negative values less than 0x8000 are saturated to 0x8000. The converted
-///    [2 x i16] values are written to the upper 32 bits of the result.
+///    A 64-bit integer vector of [2 x i32]. The converted [2 x i16] values are
+///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
 ///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
@ -165,28 +155,23 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
 }

-/// Converts 16-bit signed integers from both 64-bit integer vector
-///    parameters of [4 x i16] into 8-bit unsigned integer values, and
-///    constructs a 64-bit integer vector of [8 x i8] as the result. Values
-///    greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated
-///    to 0.
+/// Converts, with saturation, 16-bit signed integers from both 64-bit integer
+///    vector parameters of [4 x i16] into 8-bit unsigned integer values, and
+///    constructs a 64-bit integer vector of [8 x i8] as the result.
+///
+///    Values greater than 0xFF are saturated to 0xFF. Values less than 0 are
+///    saturated to 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
 ///
 /// \param __m1
-///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
-///    16-bit signed integer and is converted to an 8-bit unsigned integer with
-///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
-///    than 0 are saturated to 0. The converted [4 x i8] values are written to
-///    the lower 32 bits of the result.
+///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
+///    written to the lower 32 bits of the result.
 /// \param __m2
-///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
-///    16-bit signed integer and is converted to an 8-bit unsigned integer with
-///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
-///    than 0 are saturated to 0. The converted [4 x i8] values are written to
-///    the upper 32 bits of the result.
+///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
+///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
 ///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
@ -400,11 +385,13 @@ _mm_add_pi32(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
 }

-/// Adds each 8-bit signed integer element of the first 64-bit integer
-///    vector of [8 x i8] to the corresponding 8-bit signed integer element of
-///    the second 64-bit integer vector of [8 x i8]. Positive sums greater than
-///    0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to
-///    0x80. The results are packed into a 64-bit integer vector of [8 x i8].
+/// Adds, with saturation, each 8-bit signed integer element of the first
+///    64-bit integer vector of [8 x i8] to the corresponding 8-bit signed
+///    integer element of the second 64-bit integer vector of [8 x i8].
+///
+///    Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
+///    less than 0x80 are saturated to 0x80. The results are packed into a
+///    64-bit integer vector of [8 x i8].
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -422,12 +409,13 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
 }

-/// Adds each 16-bit signed integer element of the first 64-bit integer
-///    vector of [4 x i16] to the corresponding 16-bit signed integer element of
-///    the second 64-bit integer vector of [4 x i16]. Positive sums greater than
-///    0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are
-///    saturated to 0x8000. The results are packed into a 64-bit integer vector
-///    of [4 x i16].
+/// Adds, with saturation, each 16-bit signed integer element of the first
+///    64-bit integer vector of [4 x i16] to the corresponding 16-bit signed
+///    integer element of the second 64-bit integer vector of [4 x i16].
+///
+///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
+///    less than 0x8000 are saturated to 0x8000. The results are packed into a
+///    64-bit integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -445,11 +433,12 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
 }

-/// Adds each 8-bit unsigned integer element of the first 64-bit integer
-///    vector of [8 x i8] to the corresponding 8-bit unsigned integer element of
-///    the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are
-///    saturated to 0xFF. The results are packed into a 64-bit integer vector of
-///    [8 x i8].
+/// Adds, with saturation, each 8-bit unsigned integer element of the first
+///    64-bit integer vector of [8 x i8] to the corresponding 8-bit unsigned
+///    integer element of the second 64-bit integer vector of [8 x i8].
+///
+///    Sums greater than 0xFF are saturated to 0xFF. The results are packed
+///    into a 64-bit integer vector of [8 x i8].
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -467,11 +456,12 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
 }

-/// Adds each 16-bit unsigned integer element of the first 64-bit integer
-///    vector of [4 x i16] to the corresponding 16-bit unsigned integer element
-///    of the second 64-bit integer vector of [4 x i16]. Sums greater than
-///    0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit
-///    integer vector of [4 x i16].
+/// Adds, with saturation, each 16-bit unsigned integer element of the first
+///    64-bit integer vector of [4 x i16] to the corresponding 16-bit unsigned
+///    integer element of the second 64-bit integer vector of [4 x i16].
+///
+///    Sums greater than 0xFFFF are saturated to 0xFFFF. The results are packed
+///    into a 64-bit integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -552,12 +542,13 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
 }

-/// Subtracts each 8-bit signed integer element of the second 64-bit
-///    integer vector of [8 x i8] from the corresponding 8-bit signed integer
-///    element of the first 64-bit integer vector of [8 x i8]. Positive results
-///    greater than 0x7F are saturated to 0x7F. Negative results less than 0x80
-///    are saturated to 0x80. The results are packed into a 64-bit integer
-///    vector of [8 x i8].
+/// Subtracts, with saturation, each 8-bit signed integer element of the second
+///    64-bit integer vector of [8 x i8] from the corresponding 8-bit signed
+///    integer element of the first 64-bit integer vector of [8 x i8].
+///
+///    Positive results greater than 0x7F are saturated to 0x7F. Negative
+///    results less than 0x80 are saturated to 0x80. The results are packed
+///    into a 64-bit integer vector of [8 x i8].
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -575,12 +566,13 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
 }

-/// Subtracts each 16-bit signed integer element of the second 64-bit
-///    integer vector of [4 x i16] from the corresponding 16-bit signed integer
-///    element of the first 64-bit integer vector of [4 x i16]. Positive results
-///    greater than 0x7FFF are saturated to 0x7FFF. Negative results less than
-///    0x8000 are saturated to 0x8000. The results are packed into a 64-bit
-///    integer vector of [4 x i16].
+/// Subtracts, with saturation, each 16-bit signed integer element of the
+///    second 64-bit integer vector of [4 x i16] from the corresponding 16-bit
+///    signed integer element of the first 64-bit integer vector of [4 x i16].
+///
+///    Positive results greater than 0x7FFF are saturated to 0x7FFF. Negative
+///    results less than 0x8000 are saturated to 0x8000. The results are packed
+///    into a 64-bit integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1149,7 +1141,7 @@ _mm_xor_si64(__m64 __m1, __m64 __m2)
 ///    [8 x i8] to determine if the element of the first vector is equal to the
 ///    corresponding element of the second vector.
 ///
-///    The comparison yields 0 for false, 0xFF for true.
+///    Each comparison returns 0 for false, 0xFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1171,7 +1163,7 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
 ///    [4 x i16] to determine if the element of the first vector is equal to the
 ///    corresponding element of the second vector.
 ///
-///    The comparison yields 0 for false, 0xFFFF for true.
+///    Each comparison returns 0 for false, 0xFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1193,7 +1185,7 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
 ///    [2 x i32] to determine if the element of the first vector is equal to the
 ///    corresponding element of the second vector.
 ///
-///    The comparison yields 0 for false, 0xFFFFFFFF for true.
+///    Each comparison returns 0 for false, 0xFFFFFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1215,7 +1207,7 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
 ///    [8 x i8] to determine if the element of the first vector is greater than
 ///    the corresponding element of the second vector.
 ///
-///    The comparison yields 0 for false, 0xFF for true.
+///    Each comparison returns 0 for false, 0xFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1237,7 +1229,7 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
 ///    [4 x i16] to determine if the element of the first vector is greater than
 ///    the corresponding element of the second vector.
 ///
-///    The comparison yields 0 for false, 0xFFFF for true.
+///    Each comparison returns 0 for false, 0xFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1259,7 +1251,7 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
 ///    [2 x i32] to determine if the element of the first vector is greater than
 ///    the corresponding element of the second vector.
 ///
-///    The comparison yields 0 for false, 0xFFFFFFFF for true.
+///    Each comparison returns 0 for false, 0xFFFFFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
--- a/lib/include/module.modulemap
+++ b/lib/include/module.modulemap
@ -44,7 +44,6 @@ module _Builtin_intrinsics [system] [extern_c] {
    textual header "avxintrin.h"
    textual header "avx2intrin.h"
    textual header "avx512fintrin.h"
-    textual header "avx512erintrin.h"
    textual header "fmaintrin.h"

    header "x86intrin.h"
@ -203,6 +202,11 @@ module _Builtin_stdarg [system] {
    export *
  }

+  explicit module header_macro {
+    header "__stdarg_header_macro.h"
+    export *
+  }
+
  explicit module va_arg {
    header "__stdarg_va_arg.h"
    export *
@ -232,6 +236,10 @@ module _Builtin_stdbool [system] {
 module _Builtin_stddef [system] {
  textual header "stddef.h"

+  explicit module header_macro {
+    header "__stddef_header_macro.h"
+    export *
+  }
  // __stddef_max_align_t.h is always in this module, even if
  // -fbuiltin-headers-in-system-modules is passed.
  explicit module max_align_t {
@ -315,3 +323,8 @@ module opencl_c {
  header "opencl-c.h"
  header "opencl-c-base.h"
 }
+
+module ptrauth {
+  header "ptrauth.h"
+  export *
+}
--- a/lib/include/opencl-c-base.h
+++ b/lib/include/opencl-c-base.h
@ -46,6 +46,10 @@
 #define __opencl_c_ext_fp32_global_atomic_min_max 1
 #define __opencl_c_ext_fp32_local_atomic_min_max 1
 #define __opencl_c_ext_image_raw10_raw12 1
+#define cl_khr_kernel_clock 1
+#define __opencl_c_kernel_clock_scope_device 1
+#define __opencl_c_kernel_clock_scope_work_group 1
+#define __opencl_c_kernel_clock_scope_sub_group 1

 #endif // defined(__SPIR__) || defined(__SPIRV__)
 #endif // (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
--- a/lib/include/opencl-c.h
+++ b/lib/include/opencl-c.h
@ -17314,6 +17314,21 @@ half __ovld __conv sub_group_clustered_rotate(half, int, uint);
 #endif // cl_khr_fp16
 #endif // cl_khr_subgroup_rotate

+#if defined(cl_khr_kernel_clock)
+#if defined(__opencl_c_kernel_clock_scope_device)
+ulong __ovld clock_read_device();
+uint2 __ovld clock_read_hilo_device();
+#endif // __opencl_c_kernel_clock_scope_device
+#if defined(__opencl_c_kernel_clock_scope_work_group)
+ulong __ovld clock_read_work_group();
+uint2 __ovld clock_read_hilo_work_group();
+#endif // __opencl_c_kernel_clock_scope_work_group
+#if defined(__opencl_c_kernel_clock_scope_sub_group)
+ulong __ovld clock_read_sub_group();
+uint2 __ovld clock_read_hilo_sub_group();
+#endif // __opencl_c_kernel_clock_scope_sub_group
+#endif // cl_khr_kernel_clock
+
 #if defined(cl_intel_subgroups)
 // Intel-Specific Sub Group Functions
 float   __ovld __conv intel_sub_group_shuffle( float , uint );
--- a/lib/include/prfchwintrin.h
+++ b/lib/include/prfchwintrin.h
@ -8,16 +8,17 @@
 */

 #if !defined(__X86INTRIN_H) && !defined(_MM3DNOW_H_INCLUDED)
-#error "Never use <prfchwintrin.h> directly; include <x86intrin.h> or <mm3dnow.h> instead."
+#error "Never use <prfchwintrin.h> directly; include <x86intrin.h> instead."
 #endif

 #ifndef __PRFCHWINTRIN_H
 #define __PRFCHWINTRIN_H

 /// Loads a memory sequence containing the specified memory address into
-///    all data cache levels. The cache-coherency state is set to exclusive.
-///    Data can be read from and written to the cache line without additional
-///    delay.
+///    all data cache levels.
+///
+///    The cache-coherency state is set to exclusive. Data can be read from
+///    and written to the cache line without additional delay.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -32,10 +33,11 @@ _m_prefetch(void *__P)
 }

 /// Loads a memory sequence containing the specified memory address into
-///    the L1 data cache and sets the cache-coherency to modified. This
-///    provides a hint to the processor that the cache line will be modified.
-///    It is intended for use when the cache line will be written to shortly
-///    after the prefetch is performed.
+///    the L1 data cache and sets the cache-coherency state to modified.
+///
+///    This provides a hint to the processor that the cache line will be
+///    modified. It is intended for use when the cache line will be written to
+///    shortly after the prefetch is performed.
 ///
 ///    Note that the effect of this intrinsic is dependent on the processor
 ///    implementation.
--- a/lib/include/ptrauth.h
+++ b/lib/include/ptrauth.h
@ -0,0 +1,330 @@
+/*===---- ptrauth.h - Pointer authentication -------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __PTRAUTH_H
+#define __PTRAUTH_H
+
+typedef enum {
+  ptrauth_key_asia = 0,
+  ptrauth_key_asib = 1,
+  ptrauth_key_asda = 2,
+  ptrauth_key_asdb = 3,
+
+  /* A process-independent key which can be used to sign code pointers. */
+  ptrauth_key_process_independent_code = ptrauth_key_asia,
+
+  /* A process-specific key which can be used to sign code pointers. */
+  ptrauth_key_process_dependent_code = ptrauth_key_asib,
+
+  /* A process-independent key which can be used to sign data pointers. */
+  ptrauth_key_process_independent_data = ptrauth_key_asda,
+
+  /* A process-specific key which can be used to sign data pointers. */
+  ptrauth_key_process_dependent_data = ptrauth_key_asdb,
+
+  /* The key used to sign return addresses on the stack.
+     The extra data is based on the storage address of the return address.
+     On AArch64, that is always the storage address of the return address + 8
+     (or, in other words, the value of the stack pointer on function entry) */
+  ptrauth_key_return_address = ptrauth_key_process_dependent_code,
+
+  /* The key used to sign C function pointers.
+     The extra data is always 0. */
+  ptrauth_key_function_pointer = ptrauth_key_process_independent_code,
+
+  /* The key used to sign C++ v-table pointers.
+     The extra data is always 0. */
+  ptrauth_key_cxx_vtable_pointer = ptrauth_key_process_independent_data,
+
+  /* Other pointers signed under the ABI use private ABI rules. */
+
+} ptrauth_key;
+
+/* An integer type of the appropriate size for a discriminator argument. */
+typedef __UINTPTR_TYPE__ ptrauth_extra_data_t;
+
+/* An integer type of the appropriate size for a generic signature. */
+typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t;
+
+/* A signed pointer value embeds the original pointer together with
+   a signature that attests to the validity of that pointer.  Because
+   this signature must use only "spare" bits of the pointer, a
+   signature's validity is probabilistic in practice: it is unlikely
+   but still plausible that an invalidly-derived signature will
+   somehow equal the correct signature and therefore successfully
+   authenticate.  Nonetheless, this scheme provides a strong degree
+   of protection against certain kinds of attacks. */
+
+/* Authenticating a pointer that was not signed with the given key
+   and extra-data value will (likely) fail by trapping. */
+
+/* The null function pointer is always the all-zero bit pattern.
+   Signing an all-zero bit pattern will embed a (likely) non-zero
+   signature in the result, and so the result will not seem to be
+   a null function pointer.  Authenticating this value will yield
+   a null function pointer back.  However, authenticating an
+   all-zero bit pattern will probably fail, because the
+   authentication will expect a (likely) non-zero signature to
+   embedded in the value.
+
+   Because of this, if a pointer may validly be null, you should
+   check for null before attempting to authenticate it with one
+   of these intrinsics.  This is not necessary when using the
+   __ptrauth qualifier; the compiler will perform this check
+   automatically. */
+
+#if __has_feature(ptrauth_intrinsics)
+
+/* Strip the signature from a value without authenticating it.
+
+   If the value is a function pointer, the result will not be a
+   legal function pointer because of the missing signature, and
+   attempting to call it will result in an authentication failure.
+
+   The value must be an expression of pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The result will have the same type as the original value. */
+#define ptrauth_strip(__value, __key) __builtin_ptrauth_strip(__value, __key)
+
+/* Blend a constant discriminator into the given pointer-like value
+   to form a new discriminator.  Not all bits of the inputs are
+   guaranteed to contribute to the result.
+
+   On arm64e, the integer must fall within the range of a uint16_t;
+   other bits may be ignored.
+
+   For the purposes of ptrauth_sign_constant, the result of calling
+   this function is considered a constant expression if the arguments
+   are constant.  Some restrictions may be imposed on the pointer.
+
+   The first argument must be an expression of pointer type.
+   The second argument must be an expression of integer type.
+   The result will have type uintptr_t. */
+#define ptrauth_blend_discriminator(__pointer, __integer)                      \
+  __builtin_ptrauth_blend_discriminator(__pointer, __integer)
+
+/* Return a signed pointer for a constant address in a manner which guarantees
+   a non-attackable sequence.
+
+   The value must be a constant expression of pointer type which evaluates to
+   a non-null pointer.
+   The key must be a constant expression of type ptrauth_key.
+   The extra data must be a constant expression of pointer or integer type;
+   if an integer, it will be coerced to ptrauth_extra_data_t.
+   The result will have the same type as the original value.
+
+   This can be used in constant expressions.  */
+#define ptrauth_sign_constant(__value, __key, __data)                          \
+  __builtin_ptrauth_sign_constant(__value, __key, __data)
+
+/* Add a signature to the given pointer value using a specific key,
+   using the given extra data as a salt to the signing process.
+
+   This operation does not authenticate the original value and is
+   therefore potentially insecure if an attacker could possibly
+   control that value.
+
+   The value must be an expression of pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The extra data must be an expression of pointer or integer type;
+   if an integer, it will be coerced to ptrauth_extra_data_t.
+   The result will have the same type as the original value. */
+#define ptrauth_sign_unauthenticated(__value, __key, __data)                   \
+  __builtin_ptrauth_sign_unauthenticated(__value, __key, __data)
+
+/* Authenticate a pointer using one scheme and resign it using another.
+
+   If the result is subsequently authenticated using the new scheme, that
+   authentication is guaranteed to fail if and only if the initial
+   authentication failed.
+
+   The value must be an expression of pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The extra data must be an expression of pointer or integer type;
+   if an integer, it will be coerced to ptrauth_extra_data_t.
+   The result will have the same type as the original value.
+
+   This operation is guaranteed to not leave the intermediate value
+   available for attack before it is re-signed.
+
+   Do not pass a null pointer to this function. A null pointer
+   will not successfully authenticate.
+
+   This operation traps if the authentication fails. */
+#define ptrauth_auth_and_resign(__value, __old_key, __old_data, __new_key,     \
+                                __new_data)                                    \
+  __builtin_ptrauth_auth_and_resign(__value, __old_key, __old_data, __new_key, \
+                                    __new_data)
+
+/* Authenticate a pointer using one scheme and resign it as a C
+   function pointer.
+
+   If the result is subsequently authenticated using the new scheme, that
+   authentication is guaranteed to fail if and only if the initial
+   authentication failed.
+
+   The value must be an expression of function pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The extra data must be an expression of pointer or integer type;
+   if an integer, it will be coerced to ptrauth_extra_data_t.
+   The result will have the same type as the original value.
+
+   This operation is guaranteed to not leave the intermediate value
+   available for attack before it is re-signed. Additionally, if this
+   expression is used syntactically as the function expression in a
+   call, only a single authentication will be performed. */
+#define ptrauth_auth_function(__value, __old_key, __old_data)                  \
+  ptrauth_auth_and_resign(__value, __old_key, __old_data,                      \
+                          ptrauth_key_function_pointer, 0)
+
+/* Authenticate a data pointer.
+
+   The value must be an expression of non-function pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The extra data must be an expression of pointer or integer type;
+   if an integer, it will be coerced to ptrauth_extra_data_t.
+   The result will have the same type as the original value.
+
+   This operation traps if the authentication fails. */
+#define ptrauth_auth_data(__value, __old_key, __old_data)                      \
+  __builtin_ptrauth_auth(__value, __old_key, __old_data)
+
+/* Compute a constant discriminator from the given string.
+
+   The argument must be a string literal of char character type.  The result
+   has type ptrauth_extra_data_t.
+
+   The result value is never zero and always within range for both the
+   __ptrauth qualifier and ptrauth_blend_discriminator.
+
+   This can be used in constant expressions.
+*/
+#define ptrauth_string_discriminator(__string)                                 \
+  __builtin_ptrauth_string_discriminator(__string)
+
+/* Compute a constant discriminator from the given type.
+
+   The result can be used as the second argument to
+   ptrauth_blend_discriminator or the third argument to the
+   __ptrauth qualifier.  It has type size_t.
+
+   If the type is a C++ member function pointer type, the result is
+   the discriminator used to signed member function pointers of that
+   type.  If the type is a function, function pointer, or function
+   reference type, the result is the discriminator used to sign
+   functions of that type.  It is ill-formed to use this macro with any
+   other type.
+
+   A call to this function is an integer constant expression. */
+#define ptrauth_type_discriminator(__type)                                     \
+  __builtin_ptrauth_type_discriminator(__type)
+
+/* Compute a signature for the given pair of pointer-sized values.
+   The order of the arguments is significant.
+
+   Like a pointer signature, the resulting signature depends on
+   private key data and therefore should not be reliably reproducible
+   by attackers.  That means that this can be used to validate the
+   integrity of arbitrary data by storing a signature for that data
+   alongside it, then checking that the signature is still valid later.
+   Data which exceeds two pointers in size can be signed by either
+   computing a tree of generic signatures or just signing an ordinary
+   cryptographic hash of the data.
+
+   The result has type ptrauth_generic_signature_t.  However, it may
+   not have as many bits of entropy as that type's width would suggest;
+   some implementations are known to compute a compressed signature as
+   if the arguments were a pointer and a discriminator.
+
+   The arguments must be either pointers or integers; if integers, they
+   will be coerce to uintptr_t. */
+#define ptrauth_sign_generic_data(__value, __data)                             \
+  __builtin_ptrauth_sign_generic_data(__value, __data)
+
+/* C++ vtable pointer signing class attribute */
+#define ptrauth_cxx_vtable_pointer(key, address_discrimination,                \
+                                   extra_discrimination...)                    \
+  [[clang::ptrauth_vtable_pointer(key, address_discrimination,                 \
+                                  extra_discrimination)]]
+
+#else
+
+#define ptrauth_strip(__value, __key)                                          \
+  ({                                                                           \
+    (void)__key;                                                               \
+    __value;                                                                   \
+  })
+
+#define ptrauth_blend_discriminator(__pointer, __integer)                      \
+  ({                                                                           \
+    (void)__pointer;                                                           \
+    (void)__integer;                                                           \
+    ((ptrauth_extra_data_t)0);                                                 \
+  })
+
+#define ptrauth_sign_constant(__value, __key, __data)                          \
+  ({                                                                           \
+    (void)__key;                                                               \
+    (void)__data;                                                              \
+    __value;                                                                   \
+  })
+
+#define ptrauth_sign_unauthenticated(__value, __key, __data)                   \
+  ({                                                                           \
+    (void)__key;                                                               \
+    (void)__data;                                                              \
+    __value;                                                                   \
+  })
+
+#define ptrauth_auth_and_resign(__value, __old_key, __old_data, __new_key,     \
+                                __new_data)                                    \
+  ({                                                                           \
+    (void)__old_key;                                                           \
+    (void)__old_data;                                                          \
+    (void)__new_key;                                                           \
+    (void)__new_data;                                                          \
+    __value;                                                                   \
+  })
+
+#define ptrauth_auth_function(__value, __old_key, __old_data)                  \
+  ({                                                                           \
+    (void)__old_key;                                                           \
+    (void)__old_data;                                                          \
+    __value;                                                                   \
+  })
+
+#define ptrauth_auth_data(__value, __old_key, __old_data)                      \
+  ({                                                                           \
+    (void)__old_key;                                                           \
+    (void)__old_data;                                                          \
+    __value;                                                                   \
+  })
+
+#define ptrauth_string_discriminator(__string)                                 \
+  ({                                                                           \
+    (void)__string;                                                            \
+    ((ptrauth_extra_data_t)0);                                                 \
+  })
+
+#define ptrauth_type_discriminator(__type) ((ptrauth_extra_data_t)0)
+
+#define ptrauth_sign_generic_data(__value, __data)                             \
+  ({                                                                           \
+    (void)__value;                                                             \
+    (void)__data;                                                              \
+    ((ptrauth_generic_signature_t)0);                                          \
+  })
+
+
+#define ptrauth_cxx_vtable_pointer(key, address_discrimination,                \
+                                   extra_discrimination...)
+
+#endif /* __has_feature(ptrauth_intrinsics) */
+
+#endif /* __PTRAUTH_H */
--- a/lib/include/riscv_vector.h
+++ b/lib/include/riscv_vector.h
@ -14,10 +14,6 @@
 #include <stdint.h>
 #include <stddef.h>

-#ifndef __riscv_vector
-#error "Vector intrinsics require the vector extension."
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
--- a/lib/include/sifive_vector.h
+++ b/lib/include/sifive_vector.h
@ -13,4 +13,106 @@

 #pragma clang riscv intrinsic sifive_vector

+#define __riscv_sf_vc_x_se_u8mf4(p27_26, p24_20, p11_7, rs1, vl)               \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint8_t)rs1, 8, 6, vl)
+#define __riscv_sf_vc_x_se_u8mf2(p27_26, p24_20, p11_7, rs1, vl)               \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint8_t)rs1, 8, 7, vl)
+#define __riscv_sf_vc_x_se_u8m1(p27_26, p24_20, p11_7, rs1, vl)                \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint8_t)rs1, 8, 0, vl)
+#define __riscv_sf_vc_x_se_u8m2(p27_26, p24_20, p11_7, rs1, vl)                \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint8_t)rs1, 8, 1, vl)
+#define __riscv_sf_vc_x_se_u8m4(p27_26, p24_20, p11_7, rs1, vl)                \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint8_t)rs1, 8, 2, vl)
+#define __riscv_sf_vc_x_se_u8m8(p27_26, p24_20, p11_7, rs1, vl)                \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint8_t)rs1, 8, 3, vl)
+
+#define __riscv_sf_vc_x_se_u16mf2(p27_26, p24_20, p11_7, rs1, vl)              \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint16_t)rs1, 16, 7, vl)
+#define __riscv_sf_vc_x_se_u16m1(p27_26, p24_20, p11_7, rs1, vl)               \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint16_t)rs1, 16, 0, vl)
+#define __riscv_sf_vc_x_se_u16m2(p27_26, p24_20, p11_7, rs1, vl)               \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint16_t)rs1, 16, 1, vl)
+#define __riscv_sf_vc_x_se_u16m4(p27_26, p24_20, p11_7, rs1, vl)               \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint16_t)rs1, 16, 2, vl)
+#define __riscv_sf_vc_x_se_u16m8(p27_26, p24_20, p11_7, rs1, vl)               \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint16_t)rs1, 16, 3, vl)
+
+#define __riscv_sf_vc_x_se_u32m1(p27_26, p24_20, p11_7, rs1, vl)               \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint32_t)rs1, 32, 0, vl)
+#define __riscv_sf_vc_x_se_u32m2(p27_26, p24_20, p11_7, rs1, vl)               \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint32_t)rs1, 32, 1, vl)
+#define __riscv_sf_vc_x_se_u32m4(p27_26, p24_20, p11_7, rs1, vl)               \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint32_t)rs1, 32, 2, vl)
+#define __riscv_sf_vc_x_se_u32m8(p27_26, p24_20, p11_7, rs1, vl)               \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint32_t)rs1, 32, 3, vl)
+
+#define __riscv_sf_vc_i_se_u8mf4(p27_26, p24_20, p11_7, simm5, vl)             \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 8, 7, vl)
+#define __riscv_sf_vc_i_se_u8mf2(p27_26, p24_20, p11_7, simm5, vl)             \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 8, 6, vl)
+#define __riscv_sf_vc_i_se_u8m1(p27_26, p24_20, p11_7, simm5, vl)              \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 8, 0, vl)
+#define __riscv_sf_vc_i_se_u8m2(p27_26, p24_20, p11_7, simm5, vl)              \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 8, 1, vl)
+#define __riscv_sf_vc_i_se_u8m4(p27_26, p24_20, p11_7, simm5, vl)              \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 8, 2, vl)
+#define __riscv_sf_vc_i_se_u8m8(p27_26, p24_20, p11_7, simm5, vl)              \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 8, 3, vl)
+
+#define __riscv_sf_vc_i_se_u16mf2(p27_26, p24_20, p11_7, simm5, vl)            \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 16, 7, vl)
+#define __riscv_sf_vc_i_se_u16m1(p27_26, p24_20, p11_7, simm5, vl)             \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 16, 0, vl)
+#define __riscv_sf_vc_i_se_u16m2(p27_26, p24_20, p11_7, simm5, vl)             \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 16, 1, vl)
+#define __riscv_sf_vc_i_se_u16m4(p27_26, p24_20, p11_7, simm5, vl)             \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 16, 2, vl)
+#define __riscv_sf_vc_i_se_u16m8(p27_26, p24_20, p11_7, simm5, vl)             \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 16, 3, vl)
+
+#define __riscv_sf_vc_i_se_u32m1(p27_26, p24_20, p11_7, simm5, vl)             \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 32, 0, vl)
+#define __riscv_sf_vc_i_se_u32m2(p27_26, p24_20, p11_7, simm5, vl)             \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 32, 1, vl)
+#define __riscv_sf_vc_i_se_u32m4(p27_26, p24_20, p11_7, simm5, vl)             \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 32, 2, vl)
+#define __riscv_sf_vc_i_se_u32m8(p27_26, p24_20, p11_7, simm5, vl)             \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 32, 3, vl)
+
+#if __riscv_v_elen >= 64
+#define __riscv_sf_vc_x_se_u8mf8(p27_26, p24_20, p11_7, rs1, vl)               \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint8_t)rs1, 8, 5, vl)
+#define __riscv_sf_vc_x_se_u16mf4(p27_26, p24_20, p11_7, rs1, vl)              \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint16_t)rs1, 16, 6, vl)
+#define __riscv_sf_vc_x_se_u32mf2(p27_26, p24_20, p11_7, rs1, vl)              \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint32_t)rs1, 32, 7, vl)
+
+#define __riscv_sf_vc_i_se_u8mf8(p27_26, p24_20, p11_7, simm5, vl)             \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 8, 5, vl)
+#define __riscv_sf_vc_i_se_u16mf4(p27_26, p24_20, p11_7, simm5, vl)            \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 16, 6, vl)
+#define __riscv_sf_vc_i_se_u32mf2(p27_26, p24_20, p11_7, simm5, vl)            \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 32, 7, vl)
+
+#define __riscv_sf_vc_i_se_u64m1(p27_26, p24_20, p11_7, simm5, vl)             \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 64, 0, vl)
+#define __riscv_sf_vc_i_se_u64m2(p27_26, p24_20, p11_7, simm5, vl)             \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 64, 1, vl)
+#define __riscv_sf_vc_i_se_u64m4(p27_26, p24_20, p11_7, simm5, vl)             \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 64, 2, vl)
+#define __riscv_sf_vc_i_se_u64m8(p27_26, p24_20, p11_7, simm5, vl)             \
+  __riscv_sf_vc_i_se(p27_26, p24_20, p11_7, simm5, 64, 3, vl)
+
+#if __riscv_xlen >= 64
+#define __riscv_sf_vc_x_se_u64m1(p27_26, p24_20, p11_7, rs1, vl)               \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint64_t)rs1, 64, 0, vl)
+#define __riscv_sf_vc_x_se_u64m2(p27_26, p24_20, p11_7, rs1, vl)               \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint64_t)rs1, 64, 1, vl)
+#define __riscv_sf_vc_x_se_u64m4(p27_26, p24_20, p11_7, rs1, vl)               \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint64_t)rs1, 64, 2, vl)
+#define __riscv_sf_vc_x_se_u64m8(p27_26, p24_20, p11_7, rs1, vl)               \
+  __riscv_sf_vc_x_se(p27_26, p24_20, p11_7, (uint64_t)rs1, 64, 3, vl)
+#endif
+#endif
+
 #endif //_SIFIVE_VECTOR_H_
--- a/lib/include/smmintrin.h
+++ b/lib/include/smmintrin.h
@ -1188,6 +1188,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
 /// Compares each of the corresponding 64-bit values of the 128-bit
 ///    integer vectors for equality.
 ///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
@ -1431,8 +1433,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) {
 }

 /* SSE4 Pack with Unsigned Saturation.  */
-/// Converts 32-bit signed integers from both 128-bit integer vector
-///    operands into 16-bit unsigned integers, and returns the packed result.
+/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
+///    vector operands into 16-bit unsigned integers, and returns the packed
+///    result.
+///
 ///    Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
 ///    0x0000 are saturated to 0x0000.
 ///
@ -1441,17 +1445,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) {
 /// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
 ///
 /// \param __V1
-///    A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
-///    signed integer and is converted to a 16-bit unsigned integer with
-///    saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
-///    less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
-///    are written to the lower 64 bits of the result.
+///    A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
+///    written to the lower 64 bits of the result.
 /// \param __V2
-///    A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
-///    signed integer and is converted to a 16-bit unsigned integer with
-///    saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
-///    less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
-///    are written to the higher 64 bits of the result.
+///    A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
+///    written to the higher 64 bits of the result.
 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
                                                              __m128i __V2) {
@ -2305,6 +2303,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
 ///    integer vectors to determine if the values in the first operand are
 ///    greater than those in the second operand.
 ///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
--- a/lib/include/stdalign.h
+++ b/lib/include/stdalign.h
@ -10,6 +10,10 @@
 #ifndef __STDALIGN_H
 #define __STDALIGN_H

+#if defined(__MVS__) && __has_include_next(<stdalign.h>)
+#include_next <stdalign.h>
+#else
+
 #if defined(__cplusplus) ||                                                    \
    (defined(__STDC_VERSION__) && __STDC_VERSION__ < 202311L)
 #ifndef __cplusplus
@ -21,4 +25,5 @@
 #define __alignof_is_defined 1
 #endif /* __STDC_VERSION__ */

+#endif /* __MVS__ */
 #endif /* __STDALIGN_H */
--- a/lib/include/stdarg.h
+++ b/lib/include/stdarg.h
@ -14,29 +14,24 @@
 * need to use some of its interfaces. Otherwise this header provides all of
 * the expected interfaces.
 *
- * When clang modules are enabled, this header is a textual header. It ignores
- * its header guard so that multiple submodules can export its interfaces.
- * Take module SM with submodules A and B, whose headers both include stdarg.h
- * When SM.A builds, __STDARG_H will be defined. When SM.B builds, the
- * definition from SM.A will leak when building without local submodule
- * visibility. stdarg.h wouldn't include any of its implementation headers, and
- * SM.B wouldn't import any of the stdarg modules, and SM.B's `export *`
- * wouldn't export any stdarg interfaces as expected. However, since stdarg.h
- * ignores its header guard when building with modules, it all works as
- * expected.
- *
- * When clang modules are not enabled, the header guards can function in the
- * normal simple fashion.
+ * When clang modules are enabled, this header is a textual header to support
+ * the multiple include behavior. As such, it doesn't directly declare anything
+ * so that it doesn't add duplicate declarations to all of its includers'
+ * modules.
 */
-#if !defined(__STDARG_H) || __has_feature(modules) ||                          \
-    defined(__need___va_list) || defined(__need_va_list) ||                    \
-    defined(__need_va_arg) || defined(__need___va_copy) ||                     \
-    defined(__need_va_copy)
+#if defined(__MVS__) && __has_include_next(<stdarg.h>)
+#undef __need___va_list
+#undef __need_va_list
+#undef __need_va_arg
+#undef __need___va_copy
+#undef __need_va_copy
+#include <__stdarg_header_macro.h>
+#include_next <stdarg.h>

+#else
 #if !defined(__need___va_list) && !defined(__need_va_list) &&                  \
    !defined(__need_va_arg) && !defined(__need___va_copy) &&                   \
    !defined(__need_va_copy)
-#define __STDARG_H
 #define __need___va_list
 #define __need_va_list
 #define __need_va_arg
@ -49,6 +44,7 @@
    !defined(__STRICT_ANSI__)
 #define __need_va_copy
 #endif
+#include <__stdarg_header_macro.h>
 #endif

 #ifdef __need___va_list
@ -76,4 +72,4 @@
 #undef __need_va_copy
 #endif /* defined(__need_va_copy) */

-#endif
+#endif /* __MVS__ */
--- a/lib/include/stdatomic.h
+++ b/lib/include/stdatomic.h
@ -16,7 +16,7 @@
 * Exclude the MSVC path as well as the MSVC header as of the 14.31.30818
 * explicitly disallows `stdatomic.h` in the C mode via an `#error`.  Fallback
 * to the clang resource header until that is fully supported.  The
- * `stdatomic.h` header requires C++ 23 or newer.
+ * `stdatomic.h` header requires C++23 or newer.
 */
 #if __STDC_HOSTED__ &&                                                         \
    __has_include_next(<stdatomic.h>) &&                                       \
@ -35,6 +35,9 @@ extern "C" {

 #define ATOMIC_BOOL_LOCK_FREE       __CLANG_ATOMIC_BOOL_LOCK_FREE
 #define ATOMIC_CHAR_LOCK_FREE       __CLANG_ATOMIC_CHAR_LOCK_FREE
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
+#define ATOMIC_CHAR8_T_LOCK_FREE    __CLANG_ATOMIC_CHAR8_T_LOCK_FREE
+#endif
 #define ATOMIC_CHAR16_T_LOCK_FREE   __CLANG_ATOMIC_CHAR16_T_LOCK_FREE
 #define ATOMIC_CHAR32_T_LOCK_FREE   __CLANG_ATOMIC_CHAR32_T_LOCK_FREE
 #define ATOMIC_WCHAR_T_LOCK_FREE    __CLANG_ATOMIC_WCHAR_T_LOCK_FREE
@ -104,6 +107,9 @@ typedef _Atomic(long)               atomic_long;
 typedef _Atomic(unsigned long)      atomic_ulong;
 typedef _Atomic(long long)          atomic_llong;
 typedef _Atomic(unsigned long long) atomic_ullong;
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
+typedef _Atomic(unsigned char)      atomic_char8_t;
+#endif
 typedef _Atomic(uint_least16_t)     atomic_char16_t;
 typedef _Atomic(uint_least32_t)     atomic_char32_t;
 typedef _Atomic(wchar_t)            atomic_wchar_t;
@ -166,7 +172,11 @@ typedef _Atomic(uintmax_t)          atomic_uintmax_t;

 typedef struct atomic_flag { atomic_bool _Value; } atomic_flag;

+#ifdef __cplusplus
+#define ATOMIC_FLAG_INIT {false}
+#else
 #define ATOMIC_FLAG_INIT { 0 }
+#endif

 /* These should be provided by the libc implementation. */
 #ifdef __cplusplus
--- a/lib/include/stdbool.h
+++ b/lib/include/stdbool.h
@ -12,6 +12,10 @@

 #define __bool_true_false_are_defined 1

+#if defined(__MVS__) && __has_include_next(<stdbool.h>)
+#include_next <stdbool.h>
+#else
+
 #if defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L
 /* FIXME: We should be issuing a deprecation warning here, but cannot yet due
 * to system headers which include this header file unconditionally.
@ -31,4 +35,5 @@
 #endif
 #endif

+#endif /* __MVS__ */
 #endif /* __STDBOOL_H */
--- a/lib/include/stddef.h
+++ b/lib/include/stddef.h
@ -14,34 +14,32 @@
 * need to use some of its interfaces. Otherwise this header provides all of
 * the expected interfaces.
 *
- * When clang modules are enabled, this header is a textual header. It ignores
- * its header guard so that multiple submodules can export its interfaces.
- * Take module SM with submodules A and B, whose headers both include stddef.h
- * When SM.A builds, __STDDEF_H will be defined. When SM.B builds, the
- * definition from SM.A will leak when building without local submodule
- * visibility. stddef.h wouldn't include any of its implementation headers, and
- * SM.B wouldn't import any of the stddef modules, and SM.B's `export *`
- * wouldn't export any stddef interfaces as expected. However, since stddef.h
- * ignores its header guard when building with modules, it all works as
- * expected.
- *
- * When clang modules are not enabled, the header guards can function in the
- * normal simple fashion.
+ * When clang modules are enabled, this header is a textual header to support
+ * the multiple include behavior. As such, it doesn't directly declare anything
+ * so that it doesn't add duplicate declarations to all of its includers'
+ * modules.
 */
-#if !defined(__STDDEF_H) || __has_feature(modules) ||                          \
-    (defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1) ||        \
-    defined(__need_ptrdiff_t) || defined(__need_size_t) ||                     \
-    defined(__need_rsize_t) || defined(__need_wchar_t) ||                      \
-    defined(__need_NULL) || defined(__need_nullptr_t) ||                       \
-    defined(__need_unreachable) || defined(__need_max_align_t) ||              \
-    defined(__need_offsetof) || defined(__need_wint_t)
+#if defined(__MVS__) && __has_include_next(<stddef.h>)
+#undef __need_ptrdiff_t
+#undef __need_size_t
+#undef __need_rsize_t
+#undef __need_wchar_t
+#undef __need_NULL
+#undef __need_nullptr_t
+#undef __need_unreachable
+#undef __need_max_align_t
+#undef __need_offsetof
+#undef __need_wint_t
+#include <__stddef_header_macro.h>
+#include_next <stddef.h>
+
+#else

 #if !defined(__need_ptrdiff_t) && !defined(__need_size_t) &&                   \
    !defined(__need_rsize_t) && !defined(__need_wchar_t) &&                    \
    !defined(__need_NULL) && !defined(__need_nullptr_t) &&                     \
    !defined(__need_unreachable) && !defined(__need_max_align_t) &&            \
    !defined(__need_offsetof) && !defined(__need_wint_t)
-#define __STDDEF_H
 #define __need_ptrdiff_t
 #define __need_size_t
 /* ISO9899:2011 7.20 (C11 Annex K): Define rsize_t if __STDC_WANT_LIB_EXT1__ is
@ -50,7 +48,24 @@
 #define __need_rsize_t
 #endif
 #define __need_wchar_t
+#if !defined(__STDDEF_H) || __has_feature(modules)
+/*
+ * __stddef_null.h is special when building without modules: if __need_NULL is
+ * set, then it will unconditionally redefine NULL. To avoid stepping on client
+ * definitions of NULL, __need_NULL should only be set the first time this
+ * header is included, that is when __STDDEF_H is not defined. However, when
+ * building with modules, this header is a textual header and needs to
+ * unconditionally include __stdef_null.h to support multiple submodules
+ * exporting _Builtin_stddef.null. Take module SM with submodules A and B, whose
+ * headers both include stddef.h When SM.A builds, __STDDEF_H will be defined.
+ * When SM.B builds, the definition from SM.A will leak when building without
+ * local submodule visibility. stddef.h wouldn't include __stddef_null.h, and
+ * SM.B wouldn't import _Builtin_stddef.null, and SM.B's `export *` wouldn't
+ * export NULL as expected. When building with modules, always include
+ * __stddef_null.h so that everything works as expected.
+ */
 #define __need_NULL
+#endif
 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) ||              \
    defined(__cplusplus)
 #define __need_nullptr_t
@ -66,6 +81,7 @@
 /* wint_t is provided by <wchar.h> and not <stddef.h>. It's here
 * for compatibility, but must be explicitly requested. Therefore
 * __need_wint_t is intentionally not defined here. */
+#include <__stddef_header_macro.h>
 #endif

 #if defined(__need_ptrdiff_t)
@ -120,4 +136,4 @@ __WINT_TYPE__ directly; accommodate both by requiring __need_wint_t */
 #undef __need_wint_t
 #endif /* __need_wint_t */

-#endif
+#endif /* __MVS__ */
--- a/lib/include/stdint.h
+++ b/lib/include/stdint.h
@ -14,6 +14,10 @@
 #define __CLANG_STDINT_H
 #endif

+#if defined(__MVS__) && __has_include_next(<stdint.h>)
+#include_next <stdint.h>
+#else
+
 /* If we're hosted, fall back to the system's stdint.h, which might have
 * additional definitions.
 */
@ -947,4 +951,5 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 #endif

 #endif /* __STDC_HOSTED__ */
+#endif /* __MVS__ */
 #endif /* __CLANG_STDINT_H */
--- a/lib/include/stdnoreturn.h
+++ b/lib/include/stdnoreturn.h
@ -10,9 +10,15 @@
 #ifndef __STDNORETURN_H
 #define __STDNORETURN_H

+#if defined(__MVS__) && __has_include_next(<stdnoreturn.h>)
+#include_next <stdnoreturn.h>
+#else
+
 #define noreturn _Noreturn
 #define __noreturn_is_defined 1

+#endif /* __MVS__ */
+
 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L) &&               \
    !defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
 /* The noreturn macro is deprecated in C23. We do not mark it as such because
--- a/lib/include/tmmintrin.h
+++ b/lib/include/tmmintrin.h
@ -271,10 +271,11 @@ _mm_hadd_pi32(__m64 __a, __m64 __b)
    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
 }

-/// Horizontally adds the adjacent pairs of values contained in 2 packed
-///    128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are
-///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
-///    0x8000.
+/// Horizontally adds, with saturation, the adjacent pairs of values contained
+///    in two packed 128-bit vectors of [8 x i16].
+///
+///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
+///    less than 0x8000 are saturated to 0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -296,10 +297,11 @@ _mm_hadds_epi16(__m128i __a, __m128i __b)
    return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
 }

-/// Horizontally adds the adjacent pairs of values contained in 2 packed
-///    64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are
-///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
-///    0x8000.
+/// Horizontally adds, with saturation, the adjacent pairs of values contained
+///    in two packed 64-bit vectors of [4 x i16].
+///
+///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
+///    less than 0x8000 are saturated to 0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -413,10 +415,11 @@ _mm_hsub_pi32(__m64 __a, __m64 __b)
    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
 }

-/// Horizontally subtracts the adjacent pairs of values contained in 2
-///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
-///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
-///    saturated to 0x8000.
+/// Horizontally subtracts, with saturation, the adjacent pairs of values
+///    contained in two packed 128-bit vectors of [8 x i16].
+///
+///    Positive differences greater than 0x7FFF are saturated to 0x7FFF.
+///    Negative differences less than 0x8000 are saturated to 0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -438,10 +441,11 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b)
    return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
 }

-/// Horizontally subtracts the adjacent pairs of values contained in 2
-///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
-///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
-///    saturated to 0x8000.
+/// Horizontally subtracts, with saturation, the adjacent pairs of values
+///    contained in two packed 64-bit vectors of [4 x i16].
+///
+///    Positive differences greater than 0x7FFF are saturated to 0x7FFF.
+///    Negative differences less than 0x8000 are saturated to 0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
--- a/lib/include/varargs.h
+++ b/lib/include/varargs.h
@ -8,5 +8,9 @@
 */
 #ifndef __VARARGS_H
 #define __VARARGS_H
-  #error "Please use <stdarg.h> instead of <varargs.h>"
+#if defined(__MVS__) && __has_include_next(<varargs.h>)
+#include_next <varargs.h>
+#else
+#error "Please use <stdarg.h> instead of <varargs.h>"
+#endif /* __MVS__ */
 #endif
--- a/lib/include/x86gprintrin.h
+++ b/lib/include/x86gprintrin.h
@ -10,38 +10,31 @@
 #ifndef __X86GPRINTRIN_H
 #define __X86GPRINTRIN_H

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__HRESET__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__HRESET__)
 #include <hresetintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__UINTR__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__UINTR__)
 #include <uintrintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__USERMSR__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__USERMSR__)
 #include <usermsrintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__CRC32__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__CRC32__)
 #include <crc32intrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__PRFCHI__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__PRFCHI__)
 #include <prfchiintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__RAOINT__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__RAOINT__)
 #include <raointintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__CMPCCXADD__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__CMPCCXADD__)
 #include <cmpccxaddintrin.h>
 #endif

--- a/lib/include/x86intrin.h
+++ b/lib/include/x86intrin.h
@ -14,53 +14,39 @@

 #include <immintrin.h>

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__3dNOW__)
-#include <mm3dnow.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__PRFCHW__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__PRFCHW__)
 #include <prfchwintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SSE4A__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE4A__)
 #include <ammintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__FMA4__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__FMA4__)
 #include <fma4intrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__XOP__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__XOP__)
 #include <xopintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__TBM__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__TBM__)
 #include <tbmintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__LWP__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__LWP__)
 #include <lwpintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__MWAITX__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__MWAITX__)
 #include <mwaitxintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__CLZERO__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__CLZERO__)
 #include <clzerointrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__RDPRU__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__RDPRU__)
 #include <rdpruintrin.h>
 #endif

--- a/lib/include/xmmintrin.h
+++ b/lib/include/xmmintrin.h
@ -316,6 +316,8 @@ _mm_rsqrt_ps(__m128 __a)
 ///    operands and returns the lesser value in the low-order bits of the
 ///    vector of [4 x float].
 ///
+///    If either value in a comparison is NaN, returns the value from \a __b.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
@ -338,6 +340,8 @@ _mm_min_ss(__m128 __a, __m128 __b)
 /// Compares two 128-bit vectors of [4 x float] and returns the lesser
 ///    of each pair of values.
 ///
+///    If either value in a comparison is NaN, returns the value from \a __b.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
@ -358,6 +362,8 @@ _mm_min_ps(__m128 __a, __m128 __b)
 ///    operands and returns the greater value in the low-order bits of a 128-bit
 ///    vector of [4 x float].
 ///
+///    If either value in a comparison is NaN, returns the value from \a __b.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
@ -380,6 +386,8 @@ _mm_max_ss(__m128 __a, __m128 __b)
 /// Compares two 128-bit vectors of [4 x float] and returns the greater
 ///    of each pair of values.
 ///
+///    If either value in a comparison is NaN, returns the value from \a __b.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
@ -474,8 +482,11 @@ _mm_xor_ps(__m128 __a, __m128 __b)
 }

 /// Compares two 32-bit float values in the low-order bits of both
-///    operands for equality and returns the result of the comparison in the
+///    operands for equality.
+///
+///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
 ///    low-order bits of a vector [4 x float].
+///    If either value in a comparison is NaN, returns false.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -498,6 +509,9 @@ _mm_cmpeq_ss(__m128 __a, __m128 __b)
 /// Compares each of the corresponding 32-bit float values of the
 ///    128-bit vectors of [4 x float] for equality.
 ///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
+///    If either value in a comparison is NaN, returns false.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
@ -515,8 +529,11 @@ _mm_cmpeq_ps(__m128 __a, __m128 __b)

 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is less than the
-///    corresponding value in the second operand and returns the result of the
-///    comparison in the low-order bits of a vector of [4 x float].
+///    corresponding value in the second operand.
+///
+///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
+///    If either value in a comparison is NaN, returns false.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -540,6 +557,9 @@ _mm_cmplt_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are less than those in the second operand.
 ///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///    If either value in a comparison is NaN, returns false.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
@ -557,9 +577,11 @@ _mm_cmplt_ps(__m128 __a, __m128 __b)

 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is less than or
-///    equal to the corresponding value in the second operand and returns the
-///    result of the comparison in the low-order bits of a vector of
-///    [4 x float].
+///    equal to the corresponding value in the second operand.
+///
+///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true, in
+///    the low-order bits of a vector of [4 x float].
+///    If either value in a comparison is NaN, returns false.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -583,6 +605,9 @@ _mm_cmple_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are less than or equal to those in the second operand.
 ///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
+///    If either value in a comparison is NaN, returns false.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
@ -600,8 +625,11 @@ _mm_cmple_ps(__m128 __a, __m128 __b)

 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is greater than
-///    the corresponding value in the second operand and returns the result of
-///    the comparison in the low-order bits of a vector of [4 x float].
+///    the corresponding value in the second operand.
+///
+///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
+///    If either value in a comparison is NaN, returns false.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -627,6 +655,9 @@ _mm_cmpgt_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are greater than those in the second operand.
 ///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
+///    If either value in a comparison is NaN, returns false.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
@ -644,9 +675,11 @@ _mm_cmpgt_ps(__m128 __a, __m128 __b)

 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is greater than
-///    or equal to the corresponding value in the second operand and returns
-///    the result of the comparison in the low-order bits of a vector of
-///    [4 x float].
+///    or equal to the corresponding value in the second operand.
+///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
+///    If either value in a comparison is NaN, returns false.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -672,6 +705,9 @@ _mm_cmpge_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are greater than or equal to those in the second operand.
 ///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///    If either value in a comparison is NaN, returns false.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
@ -687,9 +723,12 @@ _mm_cmpge_ps(__m128 __a, __m128 __b)
  return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
 }

-/// Compares two 32-bit float values in the low-order bits of both
-///    operands for inequality and returns the result of the comparison in the
+/// Compares two 32-bit float values in the low-order bits of both operands
+///    for inequality.
+///
+///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
 ///    low-order bits of a vector of [4 x float].
+///    If either value in a comparison is NaN, returns true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -713,6 +752,9 @@ _mm_cmpneq_ss(__m128 __a, __m128 __b)
 /// Compares each of the corresponding 32-bit float values of the
 ///    128-bit vectors of [4 x float] for inequality.
 ///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
+///    If either value in a comparison is NaN, returns true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
@ -731,8 +773,11 @@ _mm_cmpneq_ps(__m128 __a, __m128 __b)

 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is not less than
-///    the corresponding value in the second operand and returns the result of
-///    the comparison in the low-order bits of a vector of [4 x float].
+///    the corresponding value in the second operand.
+///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
+///    If either value in a comparison is NaN, returns true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -757,6 +802,9 @@ _mm_cmpnlt_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are not less than those in the second operand.
 ///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
+///    If either value in a comparison is NaN, returns true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
@ -775,9 +823,11 @@ _mm_cmpnlt_ps(__m128 __a, __m128 __b)

 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is not less than
-///    or equal to the corresponding value in the second operand and returns
-///    the result of the comparison in the low-order bits of a vector of
-///    [4 x float].
+///    or equal to the corresponding value in the second operand.
+///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
+///    If either value in a comparison is NaN, returns true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -802,6 +852,9 @@ _mm_cmpnle_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are not less than or equal to those in the second operand.
 ///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
+///    If either value in a comparison is NaN, returns true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
@ -820,9 +873,11 @@ _mm_cmpnle_ps(__m128 __a, __m128 __b)

 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is not greater
-///    than the corresponding value in the second operand and returns the
-///    result of the comparison in the low-order bits of a vector of
-///    [4 x float].
+///    than the corresponding value in the second operand.
+///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
+///    If either value in a comparison is NaN, returns true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -849,6 +904,9 @@ _mm_cmpngt_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are not greater than those in the second operand.
 ///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
+///    If either value in a comparison is NaN, returns true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
@ -867,9 +925,11 @@ _mm_cmpngt_ps(__m128 __a, __m128 __b)

 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is not greater
-///    than or equal to the corresponding value in the second operand and
-///    returns the result of the comparison in the low-order bits of a vector
-///    of [4 x float].
+///    than or equal to the corresponding value in the second operand.
+///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
+///    If either value in a comparison is NaN, returns true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -896,6 +956,9 @@ _mm_cmpnge_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are not greater than or equal to those in the second operand.
 ///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
+///    If either value in a comparison is NaN, returns true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
@ -914,9 +977,11 @@ _mm_cmpnge_ps(__m128 __a, __m128 __b)

 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is ordered with
-///    respect to the corresponding value in the second operand and returns the
-///    result of the comparison in the low-order bits of a vector of
-///    [4 x float].
+///    respect to the corresponding value in the second operand.
+///
+///    A pair of floating-point values are ordered with respect to each
+///    other if neither value is a NaN. Each comparison returns 0x0 for false,
+///    0xFFFFFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -941,6 +1006,10 @@ _mm_cmpord_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are ordered with respect to those in the second operand.
 ///
+///    A pair of floating-point values are ordered with respect to each
+///    other if neither value is a NaN. Each comparison returns 0x0 for false,
+///    0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
@ -959,9 +1028,11 @@ _mm_cmpord_ps(__m128 __a, __m128 __b)

 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is unordered
-///    with respect to the corresponding value in the second operand and
-///    returns the result of the comparison in the low-order bits of a vector
-///    of [4 x float].
+///    with respect to the corresponding value in the second operand.
+///
+///    A pair of double-precision values are unordered with respect to each
+///    other if one or both values are NaN. Each comparison returns 0x0 for
+///    false, 0xFFFFFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -986,6 +1057,10 @@ _mm_cmpunord_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are unordered with respect to those in the second operand.
 ///
+///    A pair of double-precision values are unordered with respect to each
+///    other if one or both values are NaN. Each comparison returns 0x0 for
+///    false, 0xFFFFFFFFFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
@ -1003,9 +1078,10 @@ _mm_cmpunord_ps(__m128 __a, __m128 __b)
 }

 /// Compares two 32-bit float values in the low-order bits of both
-///    operands for equality and returns the result of the comparison.
+///    operands for equality.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either value in a
+///    comparison is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1018,8 +1094,7 @@ _mm_cmpunord_ps(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the
-///    two lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comieq_ss(__m128 __a, __m128 __b)
 {
@ -1028,9 +1103,10 @@ _mm_comieq_ss(__m128 __a, __m128 __b)

 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is less than the second
-///    operand and returns the result of the comparison.
+///    operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either value in a
+///    comparison is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1043,8 +1119,7 @@ _mm_comieq_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comilt_ss(__m128 __a, __m128 __b)
 {
@ -1053,9 +1128,10 @@ _mm_comilt_ss(__m128 __a, __m128 __b)

 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is less than or equal to the
-///    second operand and returns the result of the comparison.
+///    second operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either value in a
+///    comparison is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1067,8 +1143,7 @@ _mm_comilt_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comile_ss(__m128 __a, __m128 __b)
 {
@ -1077,9 +1152,10 @@ _mm_comile_ss(__m128 __a, __m128 __b)

 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is greater than the second
-///    operand and returns the result of the comparison.
+///    operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either value in a
+///    comparison is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1091,8 +1167,7 @@ _mm_comile_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the
-///     two lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comigt_ss(__m128 __a, __m128 __b)
 {
@ -1101,9 +1176,10 @@ _mm_comigt_ss(__m128 __a, __m128 __b)

 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is greater than or equal to
-///    the second operand and returns the result of the comparison.
+///    the second operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either value in a
+///    comparison is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1115,8 +1191,7 @@ _mm_comigt_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comige_ss(__m128 __a, __m128 __b)
 {
@ -1125,9 +1200,10 @@ _mm_comige_ss(__m128 __a, __m128 __b)

 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is not equal to the second
-///    operand and returns the result of the comparison.
+///    operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 1 is returned.
+///    The comparison returns 0 for false, 1 for true. If either value in a
+///    comparison is NaN, returns 1.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1139,8 +1215,7 @@ _mm_comige_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the
-///     two lower 32-bit values is NaN, 1 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comineq_ss(__m128 __a, __m128 __b)
 {
@ -1148,10 +1223,10 @@ _mm_comineq_ss(__m128 __a, __m128 __b)
 }

 /// Performs an unordered comparison of two 32-bit float values using
-///    the low-order bits of both operands to determine equality and returns
-///    the result of the comparison.
+///    the low-order bits of both operands to determine equality.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either value in a
+///    comparison is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1163,8 +1238,7 @@ _mm_comineq_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomieq_ss(__m128 __a, __m128 __b)
 {
@ -1173,9 +1247,10 @@ _mm_ucomieq_ss(__m128 __a, __m128 __b)

 /// Performs an unordered comparison of two 32-bit float values using
 ///    the low-order bits of both operands to determine if the first operand is
-///    less than the second operand and returns the result of the comparison.
+///    less than the second operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either value in a
+///    comparison is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1187,8 +1262,7 @@ _mm_ucomieq_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomilt_ss(__m128 __a, __m128 __b)
 {
@ -1197,10 +1271,10 @@ _mm_ucomilt_ss(__m128 __a, __m128 __b)

 /// Performs an unordered comparison of two 32-bit float values using
 ///    the low-order bits of both operands to determine if the first operand is
-///    less than or equal to the second operand and returns the result of the
-///    comparison.
+///    less than or equal to the second operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either value in a
+///    comparison is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1212,8 +1286,7 @@ _mm_ucomilt_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomile_ss(__m128 __a, __m128 __b)
 {
@ -1222,10 +1295,10 @@ _mm_ucomile_ss(__m128 __a, __m128 __b)

 /// Performs an unordered comparison of two 32-bit float values using
 ///    the low-order bits of both operands to determine if the first operand is
-///    greater than the second operand and returns the result of the
-///    comparison.
+///    greater than the second operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either value in a
+///    comparison is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1237,8 +1310,7 @@ _mm_ucomile_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomigt_ss(__m128 __a, __m128 __b)
 {
@ -1247,10 +1319,10 @@ _mm_ucomigt_ss(__m128 __a, __m128 __b)

 /// Performs an unordered comparison of two 32-bit float values using
 ///    the low-order bits of both operands to determine if the first operand is
-///    greater than or equal to the second operand and returns the result of
-///    the comparison.
+///    greater than or equal to the second operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either value in a
+///    comparison is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1262,8 +1334,7 @@ _mm_ucomigt_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomige_ss(__m128 __a, __m128 __b)
 {
@ -1271,10 +1342,10 @@ _mm_ucomige_ss(__m128 __a, __m128 __b)
 }

 /// Performs an unordered comparison of two 32-bit float values using
-///    the low-order bits of both operands to determine inequality and returns
-///    the result of the comparison.
+///    the low-order bits of both operands to determine inequality.
 ///
-///    If either of the two lower 32-bit values is NaN, 1 is returned.
+///    The comparison returns 0 for false, 1 for true. If either value in a
+///    comparison is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1286,8 +1357,7 @@ _mm_ucomige_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower 32-bit values is NaN, 1 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomineq_ss(__m128 __a, __m128 __b)
 {
@ -1297,6 +1367,10 @@ _mm_ucomineq_ss(__m128 __a, __m128 __b)
 /// Converts a float value contained in the lower 32 bits of a vector of
 ///    [4 x float] into a 32-bit integer.
 ///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
@ -1315,6 +1389,10 @@ _mm_cvtss_si32(__m128 __a)
 /// Converts a float value contained in the lower 32 bits of a vector of
 ///    [4 x float] into a 32-bit integer.
 ///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
@ -1335,6 +1413,10 @@ _mm_cvt_ss2si(__m128 __a)
 /// Converts a float value contained in the lower 32 bits of a vector of
 ///    [4 x float] into a 64-bit integer.
 ///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
@ -1355,6 +1437,10 @@ _mm_cvtss_si64(__m128 __a)
 /// Converts two low-order float values in a 128-bit vector of
 ///    [4 x float] into a 64-bit vector of [2 x i32].
 ///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
@ -1371,6 +1457,10 @@ _mm_cvtps_pi32(__m128 __a)
 /// Converts two low-order float values in a 128-bit vector of
 ///    [4 x float] into a 64-bit vector of [2 x i32].
 ///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
@ -1384,9 +1474,12 @@ _mm_cvt_ps2pi(__m128 __a)
  return _mm_cvtps_pi32(__a);
 }

-/// Converts a float value contained in the lower 32 bits of a vector of
-///    [4 x float] into a 32-bit integer, truncating the result when it is
-///    inexact.
+/// Converts the lower (first) element of a vector of [4 x float] into a signed
+///    truncated (rounded toward zero) 32-bit integer.
+///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1403,9 +1496,12 @@ _mm_cvttss_si32(__m128 __a)
  return __builtin_ia32_cvttss2si((__v4sf)__a);
 }

-/// Converts a float value contained in the lower 32 bits of a vector of
-///    [4 x float] into a 32-bit integer, truncating the result when it is
-///    inexact.
+/// Converts the lower (first) element of a vector of [4 x float] into a signed
+///    truncated (rounded toward zero) 32-bit integer.
+///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1423,9 +1519,12 @@ _mm_cvtt_ss2si(__m128 __a)
 }

 #ifdef __x86_64__
-/// Converts a float value contained in the lower 32 bits of a vector of
-///    [4 x float] into a 64-bit integer, truncating the result when it is
-///    inexact.
+/// Converts the lower (first) element of a vector of [4 x float] into a signed
+///    truncated (rounded toward zero) 64-bit integer.
+///
+///    If the converted value does not fit in a 64-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1443,9 +1542,13 @@ _mm_cvttss_si64(__m128 __a)
 }
 #endif

-/// Converts two low-order float values in a 128-bit vector of
-///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
-///    when it is inexact.
+/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
+///    into two signed truncated (rounded toward zero) 32-bit integers,
+///    returned in a 64-bit vector of [2 x i32].
+///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1461,9 +1564,13 @@ _mm_cvttps_pi32(__m128 __a)
  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
 }

-/// Converts two low-order float values in a 128-bit vector of [4 x
-///    float] into a 64-bit vector of [2 x i32], truncating the result when it
-///    is inexact.
+/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
+///    into two signed truncated (rounded toward zero) 64-bit integers,
+///    returned in a 64-bit vector of [2 x i32].
+///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1803,7 +1910,7 @@ _mm_undefined_ps(void)
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_set_ss(float __w)
 {
-  return __extension__ (__m128){ __w, 0, 0, 0 };
+  return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
 }

 /// Constructs a 128-bit floating-point vector of [4 x float], with each
@ -2940,6 +3047,85 @@ _mm_movemask_ps(__m128 __a)
  return __builtin_ia32_movmskps((__v4sf)__a);
 }

+/* Compare */
+#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
+#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
+#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
+#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
+#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
+#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
+#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
+#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
+
+/// Compares each of the corresponding values of two 128-bit vectors of
+///    [4 x float], using the operation specified by the immediate integer
+///    operand.
+///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
+///    If either value in a comparison is NaN, comparisons that are ordered
+///    return false, and comparisons that are unordered return true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> (V)CMPPS </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float].
+/// \param b
+///    A 128-bit vector of [4 x float].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use: \n
+///    0x00: Equal (ordered, non-signaling) \n
+///    0x01: Less-than (ordered, signaling) \n
+///    0x02: Less-than-or-equal (ordered, signaling) \n
+///    0x03: Unordered (non-signaling) \n
+///    0x04: Not-equal (unordered, non-signaling) \n
+///    0x05: Not-less-than (unordered, signaling) \n
+///    0x06: Not-less-than-or-equal (unordered, signaling) \n
+///    0x07: Ordered (non-signaling) \n
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+#define _mm_cmp_ps(a, b, c)                                                    \
+  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
+
+/// Compares each of the corresponding scalar values of two 128-bit
+///    vectors of [4 x float], using the operation specified by the immediate
+///    integer operand.
+///
+///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
+///    If either value in a comparison is NaN, comparisons that are ordered
+///    return false, and comparisons that are unordered return true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> (V)CMPSS </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float].
+/// \param b
+///    A 128-bit vector of [4 x float].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use: \n
+///    0x00: Equal (ordered, non-signaling) \n
+///    0x01: Less-than (ordered, signaling) \n
+///    0x02: Less-than-or-equal (ordered, signaling) \n
+///    0x03: Unordered (non-signaling) \n
+///    0x04: Not-equal (unordered, non-signaling) \n
+///    0x05: Not-less-than (unordered, signaling) \n
+///    0x06: Not-less-than-or-equal (unordered, signaling) \n
+///    0x07: Ordered (non-signaling) \n
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+#define _mm_cmp_ss(a, b, c)                                                    \
+  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))

 #define _MM_ALIGN16 __attribute__((aligned(16)))

--- a/lib/include/yvals_core.h
+++ b/lib/include/yvals_core.h
@ -0,0 +1,25 @@
+//===----- yvals_core.h - Internal MSVC STL core header -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Only include this if we are aiming for MSVC compatibility.
+#ifndef _MSC_VER
+#include_next <yvals_core.h>
+#else
+
+#ifndef __clang_yvals_core_h
+#define __clang_yvals_core_h
+
+#include_next <yvals_core.h>
+
+#ifdef _STL_INTRIN_HEADER
+#undef _STL_INTRIN_HEADER
+#define _STL_INTRIN_HEADER <intrin0.h>
+#endif
+
+#endif
+#endif
--- a/lib/include/zos_wrappers/builtins.h
+++ b/lib/include/zos_wrappers/builtins.h
@ -0,0 +1,18 @@
+/*===---- builtins.h - z/Architecture Builtin Functions --------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ZOS_WRAPPERS_BUILTINS_H
+#define __ZOS_WRAPPERS_BUILTINS_H
+#if defined(__MVS__)
+#include_next <builtins.h>
+#if defined(__VEC__)
+#include <vecintrin.h>
+#endif
+#endif /* defined(__MVS__) */
+#endif /* __ZOS_WRAPPERS_BUILTINS_H */
--- a/lib/libcxx/include/__algorithm/adjacent_find.h
+++ b/lib/libcxx/include/__algorithm/adjacent_find.h
@ -26,7 +26,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD

 template <class _Iter, class _Sent, class _BinaryPredicate>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
 __adjacent_find(_Iter __first, _Sent __last, _BinaryPredicate&& __pred) {
  if (__first == __last)
    return __first;
@ -40,13 +40,13 @@ __adjacent_find(_Iter __first, _Sent __last, _BinaryPredicate&& __pred) {
 }

 template <class _ForwardIterator, class _BinaryPredicate>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
 adjacent_find(_ForwardIterator __first, _ForwardIterator __last, _BinaryPredicate __pred) {
  return std::__adjacent_find(std::move(__first), std::move(__last), __pred);
 }

 template <class _ForwardIterator>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
 adjacent_find(_ForwardIterator __first, _ForwardIterator __last) {
  return std::adjacent_find(std::move(__first), std::move(__last), __equal_to());
 }
--- a/lib/libcxx/include/__algorithm/all_of.h
+++ b/lib/libcxx/include/__algorithm/all_of.h
@ -19,7 +19,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD

 template <class _InputIterator, class _Predicate>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
 all_of(_InputIterator __first, _InputIterator __last, _Predicate __pred) {
  for (; __first != __last; ++__first)
    if (!__pred(*__first))
--- a/lib/libcxx/include/__algorithm/any_of.h
+++ b/lib/libcxx/include/__algorithm/any_of.h
@ -19,7 +19,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD

 template <class _InputIterator, class _Predicate>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
 any_of(_InputIterator __first, _InputIterator __last, _Predicate __pred) {
  for (; __first != __last; ++__first)
    if (__pred(*__first))
--- a/lib/libcxx/include/__algorithm/binary_search.h
+++ b/lib/libcxx/include/__algorithm/binary_search.h
@ -22,14 +22,14 @@
 _LIBCPP_BEGIN_NAMESPACE_STD

 template <class _ForwardIterator, class _Tp, class _Compare>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
 binary_search(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, _Compare __comp) {
  __first = std::lower_bound<_ForwardIterator, _Tp, __comp_ref_type<_Compare> >(__first, __last, __value, __comp);
  return __first != __last && !__comp(__value, *__first);
 }

 template <class _ForwardIterator, class _Tp>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
 binary_search(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
  return std::binary_search(__first, __last, __value, __less<>());
 }
--- a/lib/libcxx/include/__algorithm/clamp.h
+++ b/lib/libcxx/include/__algorithm/clamp.h
@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD

 #if _LIBCPP_STD_VER >= 17
 template <class _Tp, class _Compare>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI constexpr const _Tp&
+[[nodiscard]] inline _LIBCPP_HIDE_FROM_ABI constexpr const _Tp&
 clamp(_LIBCPP_LIFETIMEBOUND const _Tp& __v,
      _LIBCPP_LIFETIMEBOUND const _Tp& __lo,
      _LIBCPP_LIFETIMEBOUND const _Tp& __hi,
@ -31,7 +31,7 @@ clamp(_LIBCPP_LIFETIMEBOUND const _Tp& __v,
 }

 template <class _Tp>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI constexpr const _Tp&
+[[nodiscard]] inline _LIBCPP_HIDE_FROM_ABI constexpr const _Tp&
 clamp(_LIBCPP_LIFETIMEBOUND const _Tp& __v,
      _LIBCPP_LIFETIMEBOUND const _Tp& __lo,
      _LIBCPP_LIFETIMEBOUND const _Tp& __hi) {
--- a/lib/libcxx/include/__algorithm/comp.h
+++ b/lib/libcxx/include/__algorithm/comp.h
@ -10,8 +10,7 @@
 #define _LIBCPP___ALGORITHM_COMP_H

 #include <__config>
-#include <__type_traits/integral_constant.h>
-#include <__type_traits/operation_traits.h>
+#include <__type_traits/desugars_to.h>

 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@ -27,7 +26,7 @@ struct __equal_to {
 };

 template <class _Tp, class _Up>
-struct __desugars_to<__equal_tag, __equal_to, _Tp, _Up> : true_type {};
+inline const bool __desugars_to_v<__equal_tag, __equal_to, _Tp, _Up> = true;

 // The definition is required because __less is part of the ABI, but it's empty
 // because all comparisons should be transparent.
@ -42,6 +41,9 @@ struct __less<void, void> {
  }
 };

+template <class _Tp>
+inline const bool __desugars_to_v<__less_tag, __less<>, _Tp, _Tp> = true;
+
 _LIBCPP_END_NAMESPACE_STD

 #endif // _LIBCPP___ALGORITHM_COMP_H
--- a/lib/libcxx/include/__algorithm/comp_ref_type.h
+++ b/lib/libcxx/include/__algorithm/comp_ref_type.h
@ -41,9 +41,9 @@ struct __debug_less {
  }

  template <class _LHS, class _RHS>
-  _LIBCPP_CONSTEXPR_SINCE_CXX14 inline _LIBCPP_HIDE_FROM_ABI decltype((void)std::declval<_Compare&>()(
-      std::declval<_LHS&>(), std::declval<_RHS&>()))
-  __do_compare_assert(int, _LHS& __l, _RHS& __r) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX14 inline
+      _LIBCPP_HIDE_FROM_ABI decltype((void)std::declval<_Compare&>()(std::declval<_LHS&>(), std::declval<_RHS&>()))
+      __do_compare_assert(int, _LHS& __l, _RHS& __r) {
    _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(!__comp_(__l, __r), "Comparator does not induce a strict weak ordering");
    (void)__l;
    (void)__r;
--- a/lib/libcxx/include/__algorithm/copy.h
+++ b/lib/libcxx/include/__algorithm/copy.h
@ -32,7 +32,7 @@ template <class, class _InIter, class _Sent, class _OutIter>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> __copy(_InIter, _Sent, _OutIter);

 template <class _AlgPolicy>
-struct __copy_loop {
+struct __copy_impl {
  template <class _InIter, class _Sent, class _OutIter>
  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter>
  operator()(_InIter __first, _Sent __last, _OutIter __result) const {
@ -94,9 +94,7 @@ struct __copy_loop {
      __local_first = _Traits::__begin(++__segment_iterator);
    }
  }
-};

-struct __copy_trivial {
  // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
  template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>
@ -108,7 +106,7 @@ struct __copy_trivial {
 template <class _AlgPolicy, class _InIter, class _Sent, class _OutIter>
 pair<_InIter, _OutIter> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14
 __copy(_InIter __first, _Sent __last, _OutIter __result) {
-  return std::__dispatch_copy_or_move<_AlgPolicy, __copy_loop<_AlgPolicy>, __copy_trivial>(
+  return std::__copy_move_unwrap_iters<__copy_impl<_AlgPolicy> >(
      std::move(__first), std::move(__last), std::move(__result));
 }

--- a/lib/libcxx/include/__algorithm/copy_backward.h
+++ b/lib/libcxx/include/__algorithm/copy_backward.h
@ -15,7 +15,7 @@
 #include <__config>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/common_type.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>

@ -33,7 +33,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter, _OutIter>
 __copy_backward(_InIter __first, _Sent __last, _OutIter __result);

 template <class _AlgPolicy>
-struct __copy_backward_loop {
+struct __copy_backward_impl {
  template <class _InIter, class _Sent, class _OutIter>
  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter>
  operator()(_InIter __first, _Sent __last, _OutIter __result) const {
@ -104,9 +104,7 @@ struct __copy_backward_loop {
      __local_last = _Traits::__end(__segment_iterator);
    }
  }
-};

-struct __copy_backward_trivial {
  // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
  template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>
@ -118,7 +116,7 @@ struct __copy_backward_trivial {
 template <class _AlgPolicy, class _BidirectionalIterator1, class _Sentinel, class _BidirectionalIterator2>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_BidirectionalIterator1, _BidirectionalIterator2>
 __copy_backward(_BidirectionalIterator1 __first, _Sentinel __last, _BidirectionalIterator2 __result) {
-  return std::__dispatch_copy_or_move<_AlgPolicy, __copy_backward_loop<_AlgPolicy>, __copy_backward_trivial>(
+  return std::__copy_move_unwrap_iters<__copy_backward_impl<_AlgPolicy> >(
      std::move(__first), std::move(__last), std::move(__result));
 }

--- a/lib/libcxx/include/__algorithm/copy_move_common.h
+++ b/lib/libcxx/include/__algorithm/copy_move_common.h
@ -19,9 +19,8 @@
 #include <__type_traits/enable_if.h>
 #include <__type_traits/is_always_bitcastable.h>
 #include <__type_traits/is_constant_evaluated.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__type_traits/is_trivially_assignable.h>
-#include <__type_traits/is_trivially_copyable.h>
 #include <__type_traits/is_volatile.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
@ -81,30 +80,17 @@ __copy_backward_trivial_impl(_In* __first, _In* __last, _Out* __result) {

 // Iterator unwrapping and dispatching to the correct overload.

-template <class _F1, class _F2>
-struct __overload : _F1, _F2 {
-  using _F1::operator();
-  using _F2::operator();
-};
-
-template <class _InIter, class _Sent, class _OutIter, class = void>
-struct __can_rewrap : false_type {};
-
-template <class _InIter, class _Sent, class _OutIter>
-struct __can_rewrap<_InIter,
-                    _Sent,
-                    _OutIter,
-                    // Note that sentinels are always copy-constructible.
-                    __enable_if_t< is_copy_constructible<_InIter>::value && is_copy_constructible<_OutIter>::value > >
-    : true_type {};
+template <class _InIter, class _OutIter>
+struct __can_rewrap
+    : integral_constant<bool, is_copy_constructible<_InIter>::value && is_copy_constructible<_OutIter>::value> {};

 template <class _Algorithm,
          class _InIter,
          class _Sent,
          class _OutIter,
-          __enable_if_t<__can_rewrap<_InIter, _Sent, _OutIter>::value, int> = 0>
+          __enable_if_t<__can_rewrap<_InIter, _OutIter>::value, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 pair<_InIter, _OutIter>
-__unwrap_and_dispatch(_InIter __first, _Sent __last, _OutIter __out_first) {
+__copy_move_unwrap_iters(_InIter __first, _Sent __last, _OutIter __out_first) {
  auto __range  = std::__unwrap_range(__first, std::move(__last));
  auto __result = _Algorithm()(std::move(__range.first), std::move(__range.second), std::__unwrap_iter(__out_first));
  return std::make_pair(std::__rewrap_range<_Sent>(std::move(__first), std::move(__result.first)),
@ -115,24 +101,12 @@ template <class _Algorithm,
          class _InIter,
          class _Sent,
          class _OutIter,
-          __enable_if_t<!__can_rewrap<_InIter, _Sent, _OutIter>::value, int> = 0>
+          __enable_if_t<!__can_rewrap<_InIter, _OutIter>::value, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 pair<_InIter, _OutIter>
-__unwrap_and_dispatch(_InIter __first, _Sent __last, _OutIter __out_first) {
+__copy_move_unwrap_iters(_InIter __first, _Sent __last, _OutIter __out_first) {
  return _Algorithm()(std::move(__first), std::move(__last), std::move(__out_first));
 }

-template <class _AlgPolicy,
-          class _NaiveAlgorithm,
-          class _OptimizedAlgorithm,
-          class _InIter,
-          class _Sent,
-          class _OutIter>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 pair<_InIter, _OutIter>
-__dispatch_copy_or_move(_InIter __first, _Sent __last, _OutIter __out_first) {
-  using _Algorithm = __overload<_NaiveAlgorithm, _OptimizedAlgorithm>;
-  return std::__unwrap_and_dispatch<_Algorithm>(std::move(__first), std::move(__last), std::move(__out_first));
-}
-
 _LIBCPP_END_NAMESPACE_STD

 _LIBCPP_POP_MACROS
--- a/lib/libcxx/include/__algorithm/count.h
+++ b/lib/libcxx/include/__algorithm/count.h
@ -79,7 +79,7 @@ __count(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __l
 }

 template <class _InputIterator, class _Tp>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __iter_diff_t<_InputIterator>
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __iter_diff_t<_InputIterator>
 count(_InputIterator __first, _InputIterator __last, const _Tp& __value) {
  __identity __proj;
  return std::__count<_ClassicAlgPolicy>(__first, __last, __value, __proj);
--- a/lib/libcxx/include/__algorithm/count_if.h
+++ b/lib/libcxx/include/__algorithm/count_if.h
@ -20,9 +20,9 @@
 _LIBCPP_BEGIN_NAMESPACE_STD

 template <class _InputIterator, class _Predicate>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
-    typename iterator_traits<_InputIterator>::difference_type
-    count_if(_InputIterator __first, _InputIterator __last, _Predicate __pred) {
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+typename iterator_traits<_InputIterator>::difference_type
+count_if(_InputIterator __first, _InputIterator __last, _Predicate __pred) {
  typename iterator_traits<_InputIterator>::difference_type __r(0);
  for (; __first != __last; ++__first)
    if (__pred(*__first))
--- a/lib/libcxx/include/__algorithm/equal.h
+++ b/lib/libcxx/include/__algorithm/equal.h
@ -18,12 +18,11 @@
 #include <__iterator/distance.h>
 #include <__iterator/iterator_traits.h>
 #include <__string/constexpr_c_functions.h>
+#include <__type_traits/desugars_to.h>
 #include <__type_traits/enable_if.h>
-#include <__type_traits/integral_constant.h>
 #include <__type_traits/is_constant_evaluated.h>
 #include <__type_traits/is_equality_comparable.h>
 #include <__type_traits/is_volatile.h>
-#include <__type_traits/operation_traits.h>
 #include <__utility/move.h>

 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@ -47,7 +46,7 @@ _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 boo
 template <class _Tp,
          class _Up,
          class _BinaryPredicate,
-          __enable_if_t<__desugars_to<__equal_tag, _BinaryPredicate, _Tp, _Up>::value && !is_volatile<_Tp>::value &&
+          __enable_if_t<__desugars_to_v<__equal_tag, _BinaryPredicate, _Tp, _Up> && !is_volatile<_Tp>::value &&
                            !is_volatile<_Up>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value,
                        int> = 0>
 _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
@ -56,33 +55,19 @@ __equal_iter_impl(_Tp* __first1, _Tp* __last1, _Up* __first2, _BinaryPredicate&)
 }

 template <class _InputIterator1, class _InputIterator2, class _BinaryPredicate>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
 equal(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _BinaryPredicate __pred) {
  return std::__equal_iter_impl(
      std::__unwrap_iter(__first1), std::__unwrap_iter(__last1), std::__unwrap_iter(__first2), __pred);
 }

 template <class _InputIterator1, class _InputIterator2>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
 equal(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2) {
  return std::equal(__first1, __last1, __first2, __equal_to());
 }

 #if _LIBCPP_STD_VER >= 14
-template <class _BinaryPredicate, class _InputIterator1, class _InputIterator2>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
-__equal(_InputIterator1 __first1,
-        _InputIterator1 __last1,
-        _InputIterator2 __first2,
-        _InputIterator2 __last2,
-        _BinaryPredicate __pred,
-        input_iterator_tag,
-        input_iterator_tag) {
-  for (; __first1 != __last1 && __first2 != __last2; ++__first1, (void)++__first2)
-    if (!__pred(*__first1, *__first2))
-      return false;
-  return __first1 == __last1 && __first2 == __last2;
-}

 template <class _Iter1, class _Sent1, class _Iter2, class _Sent2, class _Pred, class _Proj1, class _Proj2>
 _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __equal_impl(
@ -101,7 +86,7 @@ template <class _Tp,
          class _Pred,
          class _Proj1,
          class _Proj2,
-          __enable_if_t<__desugars_to<__equal_tag, _Pred, _Tp, _Up>::value && __is_identity<_Proj1>::value &&
+          __enable_if_t<__desugars_to_v<__equal_tag, _Pred, _Tp, _Up> && __is_identity<_Proj1>::value &&
                            __is_identity<_Proj2>::value && !is_volatile<_Tp>::value && !is_volatile<_Up>::value &&
                            __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value,
                        int> = 0>
@ -110,17 +95,18 @@ __equal_impl(_Tp* __first1, _Tp* __last1, _Up* __first2, _Up*, _Pred&, _Proj1&,
  return std::__constexpr_memcmp_equal(__first1, __first2, __element_count(__last1 - __first1));
 }

-template <class _BinaryPredicate, class _RandomAccessIterator1, class _RandomAccessIterator2>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
-__equal(_RandomAccessIterator1 __first1,
-        _RandomAccessIterator1 __last1,
-        _RandomAccessIterator2 __first2,
-        _RandomAccessIterator2 __last2,
-        _BinaryPredicate __pred,
-        random_access_iterator_tag,
-        random_access_iterator_tag) {
-  if (std::distance(__first1, __last1) != std::distance(__first2, __last2))
-    return false;
+template <class _InputIterator1, class _InputIterator2, class _BinaryPredicate>
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
+equal(_InputIterator1 __first1,
+      _InputIterator1 __last1,
+      _InputIterator2 __first2,
+      _InputIterator2 __last2,
+      _BinaryPredicate __pred) {
+  if constexpr (__has_random_access_iterator_category<_InputIterator1>::value &&
+                __has_random_access_iterator_category<_InputIterator2>::value) {
+    if (std::distance(__first1, __last1) != std::distance(__first2, __last2))
+      return false;
+  }
  __identity __proj;
  return std::__equal_impl(
      std::__unwrap_iter(__first1),
@ -132,36 +118,13 @@ __equal(_RandomAccessIterator1 __first1,
      __proj);
 }

-template <class _InputIterator1, class _InputIterator2, class _BinaryPredicate>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
-equal(_InputIterator1 __first1,
-      _InputIterator1 __last1,
-      _InputIterator2 __first2,
-      _InputIterator2 __last2,
-      _BinaryPredicate __pred) {
-  return std::__equal<_BinaryPredicate&>(
-      __first1,
-      __last1,
-      __first2,
-      __last2,
-      __pred,
-      typename iterator_traits<_InputIterator1>::iterator_category(),
-      typename iterator_traits<_InputIterator2>::iterator_category());
+template <class _InputIterator1, class _InputIterator2>
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
+equal(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _InputIterator2 __last2) {
+  return std::equal(__first1, __last1, __first2, __last2, __equal_to());
 }

-template <class _InputIterator1, class _InputIterator2>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
-equal(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _InputIterator2 __last2) {
-  return std::__equal(
-      __first1,
-      __last1,
-      __first2,
-      __last2,
-      __equal_to(),
-      typename iterator_traits<_InputIterator1>::iterator_category(),
-      typename iterator_traits<_InputIterator2>::iterator_category());
-}
-#endif
+#endif // _LIBCPP_STD_VER >= 14

 _LIBCPP_END_NAMESPACE_STD

--- a/lib/libcxx/include/__algorithm/equal_range.h
+++ b/lib/libcxx/include/__algorithm/equal_range.h
@ -23,7 +23,7 @@
 #include <__iterator/iterator_traits.h>
 #include <__iterator/next.h>
 #include <__type_traits/is_callable.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>

@ -60,7 +60,7 @@ __equal_range(_Iter __first, _Sent __last, const _Tp& __value, _Compare&& __comp
 }

 template <class _ForwardIterator, class _Tp, class _Compare>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_ForwardIterator, _ForwardIterator>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_ForwardIterator, _ForwardIterator>
 equal_range(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, _Compare __comp) {
  static_assert(__is_callable<_Compare, decltype(*__first), const _Tp&>::value, "The comparator has to be callable");
  static_assert(is_copy_constructible<_ForwardIterator>::value, "Iterator has to be copy constructible");
@ -73,7 +73,7 @@ equal_range(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __valu
 }

 template <class _ForwardIterator, class _Tp>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_ForwardIterator, _ForwardIterator>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_ForwardIterator, _ForwardIterator>
 equal_range(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
  return std::equal_range(std::move(__first), std::move(__last), __value, __less<>());
 }
--- a/lib/libcxx/include/__algorithm/fill_n.h
+++ b/lib/libcxx/include/__algorithm/fill_n.h
@ -9,18 +9,74 @@
 #ifndef _LIBCPP___ALGORITHM_FILL_N_H
 #define _LIBCPP___ALGORITHM_FILL_N_H

+#include <__algorithm/min.h>
 #include <__config>
+#include <__fwd/bit_reference.h>
 #include <__iterator/iterator_traits.h>
+#include <__memory/pointer_traits.h>
 #include <__utility/convert_to_integral.h>

 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif

+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
 _LIBCPP_BEGIN_NAMESPACE_STD

 // fill_n isn't specialized for std::memset, because the compiler already optimizes the loop to a call to std::memset.

+template <class _OutputIterator, class _Size, class _Tp>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
+__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value);
+
+template <bool _FillVal, class _Cp>
+_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
+__fill_n_bool(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) {
+  using _It            = __bit_iterator<_Cp, false>;
+  using __storage_type = typename _It::__storage_type;
+
+  const int __bits_per_word = _It::__bits_per_word;
+  // do first partial word
+  if (__first.__ctz_ != 0) {
+    __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_);
+    __storage_type __dn    = std::min(__clz_f, __n);
+    __storage_type __m     = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
+    if (_FillVal)
+      *__first.__seg_ |= __m;
+    else
+      *__first.__seg_ &= ~__m;
+    __n -= __dn;
+    ++__first.__seg_;
+  }
+  // do middle whole words
+  __storage_type __nw = __n / __bits_per_word;
+  std::__fill_n(std::__to_address(__first.__seg_), __nw, _FillVal ? static_cast<__storage_type>(-1) : 0);
+  __n -= __nw * __bits_per_word;
+  // do last partial word
+  if (__n > 0) {
+    __first.__seg_ += __nw;
+    __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
+    if (_FillVal)
+      *__first.__seg_ |= __m;
+    else
+      *__first.__seg_ &= ~__m;
+  }
+}
+
+template <class _Cp, class _Size>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false>
+__fill_n(__bit_iterator<_Cp, false> __first, _Size __n, const bool& __value) {
+  if (__n > 0) {
+    if (__value)
+      std::__fill_n_bool<true>(__first, __n);
+    else
+      std::__fill_n_bool<false>(__first, __n);
+  }
+  return __first + __n;
+}
+
 template <class _OutputIterator, class _Size, class _Tp>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
 __fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
@ -37,4 +93,6 @@ fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {

 _LIBCPP_END_NAMESPACE_STD

+_LIBCPP_POP_MACROS
+
 #endif // _LIBCPP___ALGORITHM_FILL_N_H
--- a/lib/libcxx/include/__algorithm/find.h
+++ b/lib/libcxx/include/__algorithm/find.h
@ -43,7 +43,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // generic implementation
 template <class _Iter, class _Sent, class _Tp, class _Proj>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter
-__find_impl(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
+__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
  for (; __first != __last; ++__first)
    if (std::__invoke(__proj, *__first) == __value)
      break;
@ -57,8 +57,7 @@ template <class _Tp,
          __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
                            sizeof(_Tp) == 1,
                        int> = 0>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp*
-__find_impl(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
  if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first))
    return __ret;
  return __last;
@ -71,8 +70,7 @@ template <class _Tp,
          __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
                            sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t),
                        int> = 0>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp*
-__find_impl(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
  if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first))
    return __ret;
  return __last;
@ -89,10 +87,10 @@ template <class _Tp,
                            is_signed<_Tp>::value == is_signed<_Up>::value,
                        int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp*
-__find_impl(_Tp* __first, _Tp* __last, const _Up& __value, _Proj& __proj) {
+__find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj& __proj) {
  if (__value < numeric_limits<_Tp>::min() || __value > numeric_limits<_Tp>::max())
    return __last;
-  return std::__find_impl(__first, __last, _Tp(__value), __proj);
+  return std::__find(__first, __last, _Tp(__value), __proj);
 }

 // __bit_iterator implementation
@ -134,7 +132,7 @@ __find_bool(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n)

 template <class _Cp, bool _IsConst, class _Tp, class _Proj, __enable_if_t<__is_identity<_Proj>::value, int> = 0>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, _IsConst>
-__find_impl(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, const _Tp& __value, _Proj&) {
+__find(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, const _Tp& __value, _Proj&) {
  if (static_cast<bool>(__value))
    return std::__find_bool<true>(__first, static_cast<typename _Cp::size_type>(__last - __first));
  return std::__find_bool<false>(__first, static_cast<typename _Cp::size_type>(__last - __first));
@ -150,7 +148,7 @@ template <class _SegmentedIterator,
          class _Proj,
          __enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _SegmentedIterator
-__find_impl(_SegmentedIterator __first, _SegmentedIterator __last, const _Tp& __value, _Proj& __proj) {
+__find(_SegmentedIterator __first, _SegmentedIterator __last, const _Tp& __value, _Proj& __proj) {
  return std::__find_segment_if(std::move(__first), std::move(__last), __find_segment<_Tp>(__value), __proj);
 }

@ -163,17 +161,17 @@ struct __find_segment {
  template <class _InputIterator, class _Proj>
  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _InputIterator
  operator()(_InputIterator __first, _InputIterator __last, _Proj& __proj) const {
-    return std::__find_impl(__first, __last, __value_, __proj);
+    return std::__find(__first, __last, __value_, __proj);
  }
 };

 // public API
 template <class _InputIterator, class _Tp>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
 find(_InputIterator __first, _InputIterator __last, const _Tp& __value) {
  __identity __proj;
  return std::__rewrap_iter(
-      __first, std::__find_impl(std::__unwrap_iter(__first), std::__unwrap_iter(__last), __value, __proj));
+      __first, std::__find(std::__unwrap_iter(__first), std::__unwrap_iter(__last), __value, __proj));
 }

 _LIBCPP_END_NAMESPACE_STD
--- a/lib/libcxx/include/__algorithm/find_end.h
+++ b/lib/libcxx/include/__algorithm/find_end.h
@ -205,7 +205,7 @@ _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Fo
 }

 template <class _ForwardIterator1, class _ForwardIterator2, class _BinaryPredicate>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator1 find_end(
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator1 find_end(
    _ForwardIterator1 __first1,
    _ForwardIterator1 __last1,
    _ForwardIterator2 __first2,
@ -215,7 +215,7 @@ _LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
 }

 template <class _ForwardIterator1, class _ForwardIterator2>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator1
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator1
 find_end(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _ForwardIterator2 __last2) {
  return std::find_end(__first1, __last1, __first2, __last2, __equal_to());
 }
--- a/lib/libcxx/include/__algorithm/find_first_of.h
+++ b/lib/libcxx/include/__algorithm/find_first_of.h
@ -35,7 +35,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _ForwardIterator1 __find_fir
 }

 template <class _ForwardIterator1, class _ForwardIterator2, class _BinaryPredicate>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator1 find_first_of(
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator1 find_first_of(
    _ForwardIterator1 __first1,
    _ForwardIterator1 __last1,
    _ForwardIterator2 __first2,
@ -45,7 +45,7 @@ _LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
 }

 template <class _ForwardIterator1, class _ForwardIterator2>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator1 find_first_of(
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator1 find_first_of(
    _ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _ForwardIterator2 __last2) {
  return std::__find_first_of_ce(__first1, __last1, __first2, __last2, __equal_to());
 }
--- a/lib/libcxx/include/__algorithm/find_if.h
+++ b/lib/libcxx/include/__algorithm/find_if.h
@ -19,7 +19,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD

 template <class _InputIterator, class _Predicate>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
 find_if(_InputIterator __first, _InputIterator __last, _Predicate __pred) {
  for (; __first != __last; ++__first)
    if (__pred(*__first))
--- a/lib/libcxx/include/__algorithm/find_if_not.h
+++ b/lib/libcxx/include/__algorithm/find_if_not.h
@ -19,7 +19,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD

 template <class _InputIterator, class _Predicate>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
 find_if_not(_InputIterator __first, _InputIterator __last, _Predicate __pred) {
  for (; __first != __last; ++__first)
    if (!__pred(*__first))
--- a/lib/libcxx/include/__algorithm/fold.h
+++ b/lib/libcxx/include/__algorithm/fold.h
@ -78,8 +78,7 @@ concept __indirectly_binary_left_foldable =

 struct __fold_left_with_iter {
  template <input_iterator _Ip, sentinel_for<_Ip> _Sp, class _Tp, __indirectly_binary_left_foldable<_Tp, _Ip> _Fp>
-  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static constexpr auto
-  operator()(_Ip __first, _Sp __last, _Tp __init, _Fp __f) {
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI static constexpr auto operator()(_Ip __first, _Sp __last, _Tp __init, _Fp __f) {
    using _Up = decay_t<invoke_result_t<_Fp&, _Tp, iter_reference_t<_Ip>>>;

    if (__first == __last) {
@ -95,7 +94,7 @@ struct __fold_left_with_iter {
  }

  template <input_range _Rp, class _Tp, __indirectly_binary_left_foldable<_Tp, iterator_t<_Rp>> _Fp>
-  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static constexpr auto operator()(_Rp&& __r, _Tp __init, _Fp __f) {
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI static constexpr auto operator()(_Rp&& __r, _Tp __init, _Fp __f) {
    auto __result = operator()(ranges::begin(__r), ranges::end(__r), std::move(__init), std::ref(__f));

    using _Up = decay_t<invoke_result_t<_Fp&, _Tp, range_reference_t<_Rp>>>;
@ -107,13 +106,12 @@ inline constexpr auto fold_left_with_iter = __fold_left_with_iter();

 struct __fold_left {
  template <input_iterator _Ip, sentinel_for<_Ip> _Sp, class _Tp, __indirectly_binary_left_foldable<_Tp, _Ip> _Fp>
-  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static constexpr auto
-  operator()(_Ip __first, _Sp __last, _Tp __init, _Fp __f) {
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI static constexpr auto operator()(_Ip __first, _Sp __last, _Tp __init, _Fp __f) {
    return fold_left_with_iter(std::move(__first), std::move(__last), std::move(__init), std::ref(__f)).value;
  }

  template <input_range _Rp, class _Tp, __indirectly_binary_left_foldable<_Tp, iterator_t<_Rp>> _Fp>
-  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static constexpr auto operator()(_Rp&& __r, _Tp __init, _Fp __f) {
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI static constexpr auto operator()(_Rp&& __r, _Tp __init, _Fp __f) {
    return fold_left_with_iter(ranges::begin(__r), ranges::end(__r), std::move(__init), std::ref(__f)).value;
  }
 };
--- a/lib/libcxx/include/__algorithm/includes.h
+++ b/lib/libcxx/include/__algorithm/includes.h
@ -47,7 +47,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __includes(
 }

 template <class _InputIterator1, class _InputIterator2, class _Compare>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
 includes(_InputIterator1 __first1,
         _InputIterator1 __last1,
         _InputIterator2 __first2,
@ -67,7 +67,7 @@ includes(_InputIterator1 __first1,
 }

 template <class _InputIterator1, class _InputIterator2>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
 includes(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _InputIterator2 __last2) {
  return std::includes(std::move(__first1), std::move(__last1), std::move(__first2), std::move(__last2), __less<>());
 }
--- a/lib/libcxx/include/__algorithm/inplace_merge.h
+++ b/lib/libcxx/include/__algorithm/inplace_merge.h
@ -114,8 +114,8 @@ _LIBCPP_HIDE_FROM_ABI void __buffered_inplace_merge(
    for (_BidirectionalIterator __i = __middle; __i != __last;
         __d.template __incr<value_type>(), (void)++__i, (void)++__p)
      ::new ((void*)__p) value_type(_IterOps<_AlgPolicy>::__iter_move(__i));
-    typedef __unconstrained_reverse_iterator<_BidirectionalIterator> _RBi;
-    typedef __unconstrained_reverse_iterator<value_type*> _Rv;
+    typedef reverse_iterator<_BidirectionalIterator> _RBi;
+    typedef reverse_iterator<value_type*> _Rv;
    typedef __invert<_Compare> _Inverted;
    std::__half_inplace_merge<_AlgPolicy>(
        _Rv(__p), _Rv(__buff), _RBi(__middle), _RBi(__first), _RBi(__last), _Inverted(__comp));
--- a/lib/libcxx/include/__algorithm/is_heap.h
+++ b/lib/libcxx/include/__algorithm/is_heap.h
@ -22,13 +22,13 @@
 _LIBCPP_BEGIN_NAMESPACE_STD

 template <class _RandomAccessIterator, class _Compare>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
 is_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
  return std::__is_heap_until(__first, __last, static_cast<__comp_ref_type<_Compare> >(__comp)) == __last;
 }

 template <class _RandomAccessIterator>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
 is_heap(_RandomAccessIterator __first, _RandomAccessIterator __last) {
  return std::is_heap(__first, __last, __less<>());
 }
--- a/lib/libcxx/include/__algorithm/is_heap_until.h
+++ b/lib/libcxx/include/__algorithm/is_heap_until.h
@ -46,13 +46,13 @@ __is_heap_until(_RandomAccessIterator __first, _RandomAccessIterator __last, _Co
 }

 template <class _RandomAccessIterator, class _Compare>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandomAccessIterator
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandomAccessIterator
 is_heap_until(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
  return std::__is_heap_until(__first, __last, static_cast<__comp_ref_type<_Compare> >(__comp));
 }

 template <class _RandomAccessIterator>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandomAccessIterator
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandomAccessIterator
 is_heap_until(_RandomAccessIterator __first, _RandomAccessIterator __last) {
  return std::__is_heap_until(__first, __last, __less<>());
 }
--- a/lib/libcxx/include/__algorithm/is_partitioned.h
+++ b/lib/libcxx/include/__algorithm/is_partitioned.h
@ -18,7 +18,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD

 template <class _InputIterator, class _Predicate>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
 is_partitioned(_InputIterator __first, _InputIterator __last, _Predicate __pred) {
  for (; __first != __last; ++__first)
    if (!__pred(*__first))
--- a/lib/libcxx/include/__algorithm/is_permutation.h
+++ b/lib/libcxx/include/__algorithm/is_permutation.h
@ -113,7 +113,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __is_permutation_impl(

 // 2+1 iterators, predicate. Not used by range algorithms.
 template <class _AlgPolicy, class _ForwardIterator1, class _Sentinel1, class _ForwardIterator2, class _BinaryPredicate>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __is_permutation(
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __is_permutation(
    _ForwardIterator1 __first1, _Sentinel1 __last1, _ForwardIterator2 __first2, _BinaryPredicate&& __pred) {
  // Shorten sequences as much as possible by lopping of any equal prefix.
  for (; __first1 != __last1; ++__first1, (void)++__first2) {
@ -247,7 +247,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __is_permutation(

 // 2+1 iterators, predicate
 template <class _ForwardIterator1, class _ForwardIterator2, class _BinaryPredicate>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool is_permutation(
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool is_permutation(
    _ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _BinaryPredicate __pred) {
  static_assert(__is_callable<_BinaryPredicate, decltype(*__first1), decltype(*__first2)>::value,
                "The predicate has to be callable");
@ -257,7 +257,7 @@ _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool i

 // 2+1 iterators
 template <class _ForwardIterator1, class _ForwardIterator2>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
 is_permutation(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2) {
  return std::is_permutation(__first1, __last1, __first2, __equal_to());
 }
@ -266,7 +266,7 @@ is_permutation(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIt

 // 2+2 iterators
 template <class _ForwardIterator1, class _ForwardIterator2>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool is_permutation(
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool is_permutation(
    _ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _ForwardIterator2 __last2) {
  return std::__is_permutation<_ClassicAlgPolicy>(
      std::move(__first1),
@ -280,7 +280,7 @@ _LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20

 // 2+2 iterators, predicate
 template <class _ForwardIterator1, class _ForwardIterator2, class _BinaryPredicate>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool is_permutation(
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool is_permutation(
    _ForwardIterator1 __first1,
    _ForwardIterator1 __last1,
    _ForwardIterator2 __first2,
--- a/lib/libcxx/include/__algorithm/is_sorted.h
+++ b/lib/libcxx/include/__algorithm/is_sorted.h
@ -22,13 +22,13 @@
 _LIBCPP_BEGIN_NAMESPACE_STD

 template <class _ForwardIterator, class _Compare>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
 is_sorted(_ForwardIterator __first, _ForwardIterator __last, _Compare __comp) {
  return std::__is_sorted_until<__comp_ref_type<_Compare> >(__first, __last, __comp) == __last;
 }

 template <class _ForwardIterator>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
 is_sorted(_ForwardIterator __first, _ForwardIterator __last) {
  return std::is_sorted(__first, __last, __less<>());
 }
--- a/Show More
+++ b/Show More