ported from UXP:

- Issue #1769 - Part 1: Add vendored libjxl and highway sources. (7983f5d8) - Issue #1769 - Part 1 Follow-up: Tidy up moz.build for highway and libjxl (00a5d640) - Issue #1769 - Part 1 Follow-up: Use standard [[deprecated]] for JXL_DEPRECATED. (40f27cd3) - Issue #1769 - Follow-up: Fix typo in MOZ_ARG_ENABLE_BOOL (4d78f53d) - Issue #1769: Update symbols for libxul linkage (1ef390db) - PR #2050 follow-up: add symbols to build shared on Windows. (514ed142) - Issue #2061 Follow-up: Export jxl/version.h. (b1811451) - Issue #2061 Follow-up: Fix moz.build to compile on all platforms. (743d1f66) - Issue #2061 - Follow-up: Silence compiler warnings for libjxl (84dc161d) - Issue #2061 - Follow-up: Silence compiler warnings for libjxl (MSVC) (80c20628)
2026-05-26 05:02:25 +00:00 · 2023-06-26 11:40:31 +08:00
parent f7c8a895e8
commit 3c3852c268
955 changed files with 231619 additions and 0 deletions
@@ -159,6 +159,7 @@ def old_configure_options(*options):
    '--enable-alsa',
    '--enable-android-omx',
    '--enable-av1',
+    '--enable-jxl',
    '--enable-b2g-bt',
    '--enable-b2g-camera',
    '--enable-b2g-ril',
@@ -53,6 +53,9 @@ if CONFIG['MOZ_WEBSPEECH_POCKETSPHINX']:
 if CONFIG['MOZ_FFVPX']:
    external_dirs += ['media/ffvpx']

+if CONFIG["MOZ_JXL"]:
+    external_dirs += ["media/libjxl", "media/highway"]
+
 external_dirs += [
    'media/kiss_fft',
    'media/libcubeb',
@@ -80,6 +80,24 @@ aom_codec_peek_stream_info
 aom_img_alloc
 aom_img_free
 #endif
+#ifdef MOZ_JXL
+JxlDecoderCreate
+JxlDecoderDestroy
+JxlDecoderSetParallelRunner
+JxlDecoderSubscribeEvents
+JxlDecoderProcessInput
+JxlDecoderSetInput
+JxlDecoderReleaseInput
+JxlDecoderGetBasicInfo
+JxlDecoderImageOutBufferSize
+JxlDecoderSetImageOutBuffer
+JxlDecoderGetFrameHeader
+JxlDecoderFlushImage
+JxlThreadParallelRunner
+JxlThreadParallelRunnerCreate
+JxlThreadParallelRunnerDestroy
+JxlThreadParallelRunnerDefaultNumWorkerThreads
+#endif
 #ifdef MOZ_VORBIS
 ogg_page_bos
 ogg_page_granulepos
@@ -0,0 +1,12 @@
+This directory contains build files for the Highway C++
+SIMD library.
+
+Any patches or additional configuration to be applied to the
+upstream source should be kept here in the media/highway
+directory.
+
+The upstream highway git repository is:
+
+    https://github.com/google/highway
+
+The version used was tagged 1.0.2.
@@ -0,0 +1,48 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+LOCAL_INCLUDES += [
+    "/media/highway/src/",
+]
+
+SOURCES += [
+    "/media/highway/src/hwy/aligned_allocator.cc",
+    "/media/highway/src/hwy/contrib/image/image.cc",
+    "/media/highway/src/hwy/per_target.cc",
+    "/media/highway/src/hwy/targets.cc",
+]
+
+EXPORTS.hwy += [
+    "/media/highway/src/hwy/aligned_allocator.h",
+    "/media/highway/src/hwy/base.h",
+    "/media/highway/src/hwy/cache_control.h",
+    "/media/highway/src/hwy/detect_compiler_arch.h",
+    "/media/highway/src/hwy/detect_targets.h",
+    "/media/highway/src/hwy/foreach_target.h",
+    "/media/highway/src/hwy/highway.h",
+    "/media/highway/src/hwy/highway_export.h",
+    "/media/highway/src/hwy/targets.h",
+]
+
+EXPORTS.hwy.ops += [
+    "/media/highway/src/hwy/ops/arm_neon-inl.h",
+    "/media/highway/src/hwy/ops/arm_sve-inl.h",
+    "/media/highway/src/hwy/ops/emu128-inl.h",
+    "/media/highway/src/hwy/ops/generic_ops-inl.h",
+    "/media/highway/src/hwy/ops/rvv-inl.h",
+    "/media/highway/src/hwy/ops/scalar-inl.h",
+    "/media/highway/src/hwy/ops/set_macros-inl.h",
+    "/media/highway/src/hwy/ops/shared-inl.h",
+    "/media/highway/src/hwy/ops/wasm_128-inl.h",
+    "/media/highway/src/hwy/ops/x86_128-inl.h",
+    "/media/highway/src/hwy/ops/x86_256-inl.h",
+    "/media/highway/src/hwy/ops/x86_512-inl.h",
+]
+
+FINAL_LIBRARY = "gkmedias"
+
+# We allow warnings for third-party code that can be updated from upstream.
+ALLOW_COMPILER_WARNINGS = True
@@ -0,0 +1,413 @@
+load("@bazel_skylib//lib:selects.bzl", "selects")
+
+load("@rules_cc//cc:defs.bzl", "cc_test")
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+# Detect compiler:
+config_setting(
+    name = "compiler_clang",
+    flag_values = {"@bazel_tools//tools/cpp:compiler": "clang"},
+)
+
+config_setting(
+    name = "compiler_clangcl",
+    flag_values = {"@bazel_tools//tools/cpp:compiler": "lexan"},
+)
+
+config_setting(
+    name = "compiler_msvc_actual",
+    flag_values = {"@bazel_tools//tools/cpp:compiler": "msvc"},
+)
+
+# The above is insufficient for Bazel on Windows, which does not seem to
+# detect/set a compiler flag. This workaround prevents compile errors due to
+# passing clang-only warning flags to MSVC.
+config_setting(
+    name = "compiler_msvc_cpu",
+    values = {
+        "cpu": "x64_windows",
+    },
+)
+
+selects.config_setting_group(
+    name = "compiler_msvc",
+    match_any = [
+        ":compiler_msvc_actual",
+        ":compiler_msvc_cpu",
+    ],
+)
+
+config_setting(
+    name = "compiler_emscripten",
+    values = {"cpu": "wasm32"},
+)
+
+# See https://github.com/bazelbuild/bazel/issues/12707
+config_setting(
+    name = "compiler_gcc_bug",
+    flag_values = {
+        "@bazel_tools//tools/cpp:compiler": "compiler",
+    },
+)
+
+config_setting(
+    name = "compiler_gcc_actual",
+    flag_values = {
+        "@bazel_tools//tools/cpp:compiler": "gcc",
+    },
+)
+
+selects.config_setting_group(
+    name = "compiler_gcc",
+    match_any = [
+        ":compiler_gcc_bug",
+        ":compiler_gcc_actual",
+    ],
+)
+
+# Additional warnings for Clang OR GCC (skip for MSVC)
+CLANG_GCC_COPTS = [
+    "-Wunused-parameter",
+    "-Wunused-variable",
+    "-Wextra-semi",
+    "-Wunreachable-code",
+]
+
+# Warnings supported by Clang and Clang-cl
+CLANG_OR_CLANGCL_OPTS = CLANG_GCC_COPTS + [
+    "-Wfloat-overflow-conversion",
+    "-Wfloat-zero-conversion",
+    "-Wfor-loop-analysis",
+    "-Wgnu-redeclared-enum",
+    "-Winfinite-recursion",
+    "-Wliteral-conversion",
+    "-Wno-c++98-compat",
+    "-Wno-unused-command-line-argument",
+    "-Wprivate-header",
+    "-Wself-assign",
+    "-Wstring-conversion",
+    "-Wtautological-overlap-compare",
+    "-Wthread-safety-analysis",
+    "-Wundefined-func-template",
+    "-Wunused-comparison",
+]
+
+# Warnings only supported by Clang, but not Clang-cl
+CLANG_ONLY_COPTS = CLANG_OR_CLANGCL_OPTS + [
+    # Do not treat the third_party headers as system headers when building
+    # highway - the errors are pertinent.
+    "--no-system-header-prefix=third_party/highway",
+]
+
+COPTS = select({
+    ":compiler_msvc": [],
+    ":compiler_gcc": CLANG_GCC_COPTS,
+    ":compiler_clangcl": CLANG_OR_CLANGCL_OPTS,
+    # Default to clang because compiler detection only works in Bazel
+    "//conditions:default": CLANG_ONLY_COPTS,
+}) + select({
+    "@platforms//cpu:riscv64": [
+        "-march=rv64gcv1p0",
+        "-menable-experimental-extensions",
+    ],
+    "//conditions:default": [
+    ],
+})
+
+DEFINES = select({
+    ":compiler_msvc": ["HWY_SHARED_DEFINE"],
+    ":compiler_clangcl": ["HWY_SHARED_DEFINE"],
+    "//conditions:default": [],
+})
+
+# Unused on Bazel builds, where this is not defined/known; Copybara replaces
+# usages with an empty list.
+COMPAT = [
+    "//buildenv/target:non_prod",  # includes mobile/vendor.
+]
+
+# WARNING: changing flags such as HWY_DISABLED_TARGETS may break users without
+# failing integration tests, if the machine running tests does not support the
+# newly enabled instruction set, or the failure is only caught by sanitizers
+# which do not run in CI.
+
+cc_library(
+    name = "hwy",
+    srcs = [
+        "hwy/aligned_allocator.cc",
+        "hwy/per_target.cc",
+        "hwy/print.cc",
+        "hwy/targets.cc",
+    ],
+    # Normal headers with include guards
+    hdrs = [
+        "hwy/aligned_allocator.h",
+        "hwy/base.h",
+        "hwy/cache_control.h",
+        "hwy/detect_compiler_arch.h",  # private
+        "hwy/print.h",
+    ],
+    compatible_with = [],
+    copts = COPTS,
+    defines = DEFINES,
+    local_defines = ["hwy_EXPORTS"],
+    textual_hdrs = [
+        # These are textual because config macros influence them:
+        "hwy/detect_targets.h",  # private
+        "hwy/targets.h",
+        # This .cc file #includes itself through foreach_target.h
+        "hwy/per_target.cc",
+        # End of list
+        "hwy/highway.h",  # public
+        "hwy/foreach_target.h",  # public
+        "hwy/per_target.h",  # public
+        "hwy/print-inl.h",  # public
+        "hwy/highway_export.h",  # public
+        "hwy/ops/arm_neon-inl.h",
+        "hwy/ops/arm_sve-inl.h",
+        "hwy/ops/emu128-inl.h",
+        "hwy/ops/generic_ops-inl.h",
+        "hwy/ops/scalar-inl.h",
+        "hwy/ops/set_macros-inl.h",
+        "hwy/ops/shared-inl.h",
+        "hwy/ops/x86_128-inl.h",
+        "hwy/ops/x86_256-inl.h",
+        "hwy/ops/x86_512-inl.h",
+        # Select avoids recompiling native arch if only non-native changed
+    ] + select({
+        ":compiler_emscripten": ["hwy/ops/wasm_128-inl.h"],
+        "//conditions:default": [],
+    }) + select({
+        "@platforms//cpu:riscv64": ["hwy/ops/rvv-inl.h"],
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "algo",
+    compatible_with = [],
+    copts = COPTS,
+    textual_hdrs = [
+        "hwy/contrib/algo/copy-inl.h",
+        "hwy/contrib/algo/find-inl.h",
+        "hwy/contrib/algo/transform-inl.h",
+    ],
+    deps = [
+        ":hwy",
+    ],
+)
+
+cc_library(
+    name = "dot",
+    compatible_with = [],
+    copts = COPTS,
+    textual_hdrs = [
+        "hwy/contrib/dot/dot-inl.h",
+    ],
+    deps = [
+        ":hwy",
+    ],
+)
+
+cc_library(
+    name = "image",
+    srcs = [
+        "hwy/contrib/image/image.cc",
+    ],
+    hdrs = [
+        "hwy/contrib/image/image.h",
+    ],
+    compatible_with = [],
+    copts = COPTS,
+    local_defines = ["hwy_contrib_EXPORTS"],
+    deps = [
+        ":hwy",
+    ],
+)
+
+cc_library(
+    name = "math",
+    compatible_with = [],
+    copts = COPTS,
+    textual_hdrs = [
+        "hwy/contrib/math/math-inl.h",
+    ],
+    deps = [
+        ":hwy",
+    ],
+)
+
+# Everything required for tests that use Highway.
+cc_library(
+    name = "hwy_test_util",
+    srcs = ["hwy/tests/test_util.cc"],
+    hdrs = ["hwy/tests/test_util.h"],
+    compatible_with = [],
+    copts = COPTS,
+    local_defines = ["hwy_test_EXPORTS"],
+    textual_hdrs = [
+        "hwy/tests/test_util-inl.h",
+        "hwy/tests/hwy_gtest.h",
+    ],
+    # Must not depend on a gtest variant, which can conflict with the
+    # GUNIT_INTERNAL_BUILD_MODE defined by the test.
+    deps = [
+        ":hwy",
+    ],
+)
+
+cc_library(
+    name = "nanobenchmark",
+    srcs = ["hwy/nanobenchmark.cc"],
+    hdrs = ["hwy/nanobenchmark.h"],
+    compatible_with = [],
+    copts = COPTS,
+    local_defines = ["hwy_EXPORTS"],
+    deps = [":hwy"],
+)
+
+cc_binary(
+    name = "benchmark",
+    srcs = ["hwy/examples/benchmark.cc"],
+    copts = COPTS,
+    deps = [
+        ":hwy",
+        ":nanobenchmark",
+    ],
+)
+
+cc_library(
+    name = "skeleton",
+    srcs = ["hwy/examples/skeleton.cc"],
+    hdrs = ["hwy/examples/skeleton.h"],
+    copts = COPTS,
+    local_defines = ["hwy_EXPORTS"],
+    textual_hdrs = ["hwy/examples/skeleton-inl.h"],
+    deps = [
+        ":hwy",
+    ],
+)
+
+cc_binary(
+    name = "list_targets",
+    srcs = ["hwy/tests/list_targets.cc"],
+    deps = [":hwy"],
+)
+
+# path, name
+HWY_TESTS = [
+    ("hwy/contrib/algo/", "copy_test"),
+    ("hwy/contrib/algo/", "find_test"),
+    ("hwy/contrib/algo/", "transform_test"),
+    ("hwy/contrib/dot/", "dot_test"),
+    ("hwy/contrib/image/", "image_test"),
+    ("hwy/contrib/math/", "math_test"),
+    # contrib/sort has its own BUILD, we add it to GUITAR_TESTS.
+    ("hwy/examples/", "skeleton_test"),
+    ("hwy/", "nanobenchmark_test"),
+    ("hwy/", "aligned_allocator_test"),
+    ("hwy/", "base_test"),
+    ("hwy/", "highway_test"),
+    ("hwy/", "targets_test"),
+    ("hwy/tests/", "arithmetic_test"),
+    ("hwy/tests/", "blockwise_test"),
+    ("hwy/tests/", "blockwise_shift_test"),
+    ("hwy/tests/", "combine_test"),
+    ("hwy/tests/", "compare_test"),
+    ("hwy/tests/", "compress_test"),
+    ("hwy/tests/", "convert_test"),
+    ("hwy/tests/", "crypto_test"),
+    ("hwy/tests/", "demote_test"),
+    ("hwy/tests/", "float_test"),
+    ("hwy/tests/", "if_test"),
+    ("hwy/tests/", "interleaved_test"),
+    ("hwy/tests/", "logical_test"),
+    ("hwy/tests/", "mask_test"),
+    ("hwy/tests/", "mask_mem_test"),
+    ("hwy/tests/", "memory_test"),
+    ("hwy/tests/", "mul_test"),
+    ("hwy/tests/", "reduction_test"),
+    ("hwy/tests/", "reverse_test"),
+    ("hwy/tests/", "shift_test"),
+    ("hwy/tests/", "swizzle_test"),
+    ("hwy/tests/", "test_util_test"),
+]
+
+HWY_TEST_COPTS = select({
+    ":compiler_msvc": [],
+    "//conditions:default": [
+        # gTest triggers this warning (which is enabled by the
+        # extra-semi in COPTS), so we need to disable it here,
+        # but it's still enabled for :hwy.
+        "-Wno-c++98-compat-extra-semi",
+    ],
+})
+
+HWY_TEST_DEPS = [
+    ":algo",
+    ":dot",
+    ":hwy",
+    ":hwy_test_util",
+    ":image",
+    ":math",
+    ":nanobenchmark",
+    ":skeleton",
+    "//hwy/contrib/sort:vqsort",
+    "@com_google_googletest//:gtest_main",
+]
+
+[
+    [
+        cc_test(
+            name = test,
+            size = "medium",
+            timeout = "long",  # default moderate is not enough for math_test
+            srcs = [
+                subdir + test + ".cc",
+            ],
+            copts = COPTS + HWY_TEST_COPTS,
+            features = select({
+                "@platforms//cpu:riscv64": ["fully_static_link"],
+                "//conditions:default": [],
+            }),
+            linkopts = select({
+                ":compiler_emscripten": [
+                    "-s ASSERTIONS=2",
+                    "-s ENVIRONMENT=node,shell,web",
+                    "-s ERROR_ON_UNDEFINED_SYMBOLS=1",
+                    "-s DEMANGLE_SUPPORT=1",
+                    "-s EXIT_RUNTIME=1",
+                    "-s ALLOW_MEMORY_GROWTH=1",
+                    "--pre-js $(location :preamble.js.lds)",
+                ],
+                "//conditions:default": [],
+            }),
+            linkstatic = select({
+                "@platforms//cpu:riscv64": True,
+                "//conditions:default": False,
+            }),
+            local_defines = ["HWY_IS_TEST"],
+            # for test_suite.
+            tags = ["hwy_ops_test"],
+            deps = HWY_TEST_DEPS + select({
+                ":compiler_emscripten": [":preamble.js.lds"],
+                "//conditions:default": [],
+            }),
+        ),
+    ]
+    for subdir, test in HWY_TESTS
+]
+
+# For manually building the tests we define here (:all does not work in --config=msvc)
+test_suite(
+    name = "hwy_ops_tests",
+    tags = ["hwy_ops_test"],
+)
+
+# Placeholder for integration test, do not remove
@@ -0,0 +1,580 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cmake_minimum_required(VERSION 3.10)
+
+# Set PIE flags for POSITION_INDEPENDENT_CODE targets, added in 3.14.
+if(POLICY CMP0083)
+  cmake_policy(SET CMP0083 NEW)
+endif()
+
+# Workaround for 3.19 raising error 'IMPORTED_LOCATION not set for imported
+# target "GTest::gtest_main"'.
+if(POLICY CMP0111)
+  cmake_policy(SET CMP0111 OLD)
+endif()
+
+project(hwy VERSION 1.0.2)  # Keep in sync with highway.h version
+
+# Directly define the ABI version from the cmake project() version values:
+set(LIBRARY_VERSION "${hwy_VERSION}")
+set(LIBRARY_SOVERSION ${hwy_VERSION_MAJOR})
+
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+# Search for Atomics implementation:
+find_package(Atomics REQUIRED)
+
+# Enable PIE binaries by default if supported.
+include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED)
+if(CHECK_PIE_SUPPORTED)
+  check_pie_supported(LANGUAGES CXX)
+  if(CMAKE_CXX_LINK_PIE_SUPPORTED)
+    set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
+  endif()
+endif()
+
+include(GNUInstallDirs)
+
+if (NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE RelWithDebInfo)
+endif()
+
+set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON (requires vfpv4)?")
+
+# Unconditionally adding -Werror risks breaking the build when new warnings
+# arise due to compiler/platform changes. Enable this in CI/tests.
+set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?")
+
+set(HWY_ENABLE_CONTRIB ON CACHE BOOL "Include contrib/")
+set(HWY_ENABLE_EXAMPLES ON CACHE BOOL "Build examples")
+set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install library")
+set(HWY_ENABLE_TESTS ON CACHE BOOL "Enable HWY tests")
+
+include(CheckCXXSourceCompiles)
+check_cxx_source_compiles(
+   "int main() {
+      #if !defined(__EMSCRIPTEN__)
+      static_assert(false, \"__EMSCRIPTEN__ is not defined\");
+      #endif
+      return 0;
+    }"
+  HWY_EMSCRIPTEN
+)
+
+check_cxx_source_compiles(
+   "int main() {
+      #if !defined(__riscv)
+      static_assert(false, \"__riscv is not defined\");
+      #endif
+      return 0;
+    }"
+  HWY_RISCV
+)
+
+if (HWY_ENABLE_CONTRIB)
+# Glob all the traits so we don't need to modify this file when adding
+# additional special cases.
+file(GLOB HWY_CONTRIB_SOURCES "hwy/contrib/sort/vqsort_*.cc")
+list(APPEND HWY_CONTRIB_SOURCES
+    hwy/contrib/dot/dot-inl.h
+    hwy/contrib/image/image.cc
+    hwy/contrib/image/image.h
+    hwy/contrib/math/math-inl.h
+    hwy/contrib/sort/shared-inl.h
+    hwy/contrib/sort/sorting_networks-inl.h
+    hwy/contrib/sort/traits-inl.h
+    hwy/contrib/sort/traits128-inl.h
+    hwy/contrib/sort/vqsort-inl.h
+    hwy/contrib/sort/vqsort.cc
+    hwy/contrib/sort/vqsort.h
+    hwy/contrib/algo/copy-inl.h
+    hwy/contrib/algo/find-inl.h
+    hwy/contrib/algo/transform-inl.h
+)
+endif()  # HWY_ENABLE_CONTRIB
+
+set(HWY_SOURCES
+    hwy/aligned_allocator.cc
+    hwy/aligned_allocator.h
+    hwy/base.h
+    hwy/cache_control.h
+    hwy/detect_compiler_arch.h  # private
+    hwy/detect_targets.h  # private
+    hwy/foreach_target.h
+    hwy/highway.h
+    hwy/highway_export.h
+    hwy/nanobenchmark.cc
+    hwy/nanobenchmark.h
+    hwy/ops/arm_neon-inl.h
+    hwy/ops/arm_sve-inl.h
+    hwy/ops/emu128-inl.h
+    hwy/ops/generic_ops-inl.h
+    hwy/ops/rvv-inl.h
+    hwy/ops/scalar-inl.h
+    hwy/ops/set_macros-inl.h
+    hwy/ops/shared-inl.h
+    hwy/ops/wasm_128-inl.h
+    hwy/ops/x86_128-inl.h
+    hwy/ops/x86_256-inl.h
+    hwy/ops/x86_512-inl.h
+    hwy/per_target.cc
+    hwy/per_target.h
+    hwy/print-inl.h
+    hwy/print.cc
+    hwy/print.h
+    hwy/targets.cc
+    hwy/targets.h
+)
+
+set(HWY_TEST_SOURCES
+    hwy/tests/hwy_gtest.h
+    hwy/tests/test_util-inl.h
+    hwy/tests/test_util.cc
+    hwy/tests/test_util.h
+)
+
+if (MSVC)
+  set(HWY_FLAGS
+    # fix build error C1128 in blockwise*_test & arithmetic_test
+    /bigobj
+  )
+else()
+  set(HWY_FLAGS
+    # Avoid changing binaries based on the current time and date.
+    -Wno-builtin-macro-redefined
+    -D__DATE__="redacted"
+    -D__TIMESTAMP__="redacted"
+    -D__TIME__="redacted"
+
+    # Optimizations
+    -fmerge-all-constants
+
+    # Warnings
+    -Wall
+    -Wextra
+    # These are not included in Wall nor Wextra:
+    -Wconversion
+    -Wsign-conversion
+    -Wvla
+    -Wnon-virtual-dtor
+  )
+
+  if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+    list(APPEND HWY_FLAGS
+      -Wfloat-overflow-conversion
+      -Wfloat-zero-conversion
+      -Wfor-loop-analysis
+      -Wgnu-redeclared-enum
+      -Winfinite-recursion
+      -Wself-assign
+      -Wstring-conversion
+      -Wtautological-overlap-compare
+      -Wthread-safety-analysis
+      -Wundefined-func-template
+
+      -fno-cxx-exceptions
+      -fno-slp-vectorize
+      -fno-vectorize
+
+      # Use color in messages
+      -fdiagnostics-show-option -fcolor-diagnostics
+    )
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 6.0)
+      list(APPEND HWY_FLAGS -Wc++2a-extensions)
+    endif()
+  endif()
+
+  if (WIN32)
+    if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+      # Clang-only warning suppressions for Windows builds. Each flag is listed
+      # exactly once (-Wno-used-but-marked-unused was previously duplicated).
+      list(APPEND HWY_FLAGS
+        -Wno-global-constructors
+        -Wno-language-extension-token
+        -Wno-used-but-marked-unused
+        -Wno-shadow-field-in-constructor
+        -Wno-unused-member-function
+        -Wno-unused-template
+        -Wno-c++98-compat-pedantic
+        -Wno-zero-as-null-pointer-constant
+      )
+    endif()
+
+    list(APPEND HWY_FLAGS
+      -Wno-cast-align
+      -Wno-double-promotion
+      -Wno-float-equal
+      -Wno-format-nonliteral
+      -Wno-shadow
+      -Wno-sign-conversion
+    )
+  else()
+    list(APPEND HWY_FLAGS
+      -fmath-errno
+      -fno-exceptions
+    )
+  endif()  # WIN32
+
+  if (HWY_CMAKE_ARM7)
+    list(APPEND HWY_FLAGS
+      -march=armv7-a
+      -mfpu=neon-vfpv4
+      -mfloat-abi=hard  # must match the toolchain specified as CXX=
+      -mfp16-format=ieee  # required for vcvt_f32_f16
+    )
+  endif()  # HWY_CMAKE_ARM7
+
+  if(HWY_RISCV)
+    if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+      # Not yet supported by GCC. When runtime dispatch is supported and
+      # implemented, we will remove v from the required flags. Until then, using
+      # clang for RISC-V will require the CPU to support the V extension (1.0).
+      list(APPEND HWY_FLAGS -march=rv64gcv1p0)
+      list(APPEND HWY_FLAGS -menable-experimental-extensions)
+    endif()
+  endif()
+
+  if (HWY_WARNINGS_ARE_ERRORS)
+    list(APPEND HWY_FLAGS -Werror)
+  endif()
+
+  # Prevent "wasm-ld: error: --shared-memory is disallowed by targets.cc.o
+  # because it was not compiled with 'atomics' or 'bulk-memory' features."
+  if (HWY_EMSCRIPTEN)
+    list(APPEND HWY_FLAGS -matomics)
+  endif()
+
+endif()  # !MSVC
+
+# By default prefer STATIC build (legacy behavior)
+option(BUILD_SHARED_LIBS "Build shared libraries" OFF)
+option(HWY_FORCE_STATIC_LIBS "Ignore BUILD_SHARED_LIBS" OFF)
+# only expose shared/static options to advanced users:
+mark_as_advanced(BUILD_SHARED_LIBS)
+mark_as_advanced(HWY_FORCE_STATIC_LIBS)
+# Define visibility settings globally:
+set(CMAKE_CXX_VISIBILITY_PRESET hidden)
+set(CMAKE_VISIBILITY_INLINES_HIDDEN 1)
+
+# Copy-cat "add_library" logic + add override.
+set(HWY_LIBRARY_TYPE "SHARED")
+if (NOT BUILD_SHARED_LIBS OR HWY_FORCE_STATIC_LIBS)
+  set(HWY_LIBRARY_TYPE "STATIC")
+endif()
+
+# This preprocessor define will drive the build, also used in the *.pc files:
+if("${HWY_LIBRARY_TYPE}" STREQUAL "SHARED")
+  set(DLLEXPORT_TO_DEFINE "HWY_SHARED_DEFINE")
+else()
+  set(DLLEXPORT_TO_DEFINE "HWY_STATIC_DEFINE")
+endif()
+
+add_library(hwy ${HWY_LIBRARY_TYPE} ${HWY_SOURCES})
+target_compile_definitions(hwy PUBLIC "${DLLEXPORT_TO_DEFINE}")
+target_compile_options(hwy PRIVATE ${HWY_FLAGS})
+set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON)
+set_target_properties(hwy PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
+target_include_directories(hwy PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+target_compile_features(hwy PUBLIC cxx_std_11)
+set_target_properties(hwy PROPERTIES
+  LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
+# For GCC __atomic_store_8, see #887
+target_link_libraries(hwy PRIVATE ${ATOMICS_LIBRARIES})
+if(UNIX AND NOT APPLE)
+  # not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
+  set_property(TARGET hwy APPEND_STRING PROPERTY
+    LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
+endif()
+
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
+  # uname -p is broken on this system.  Try uname -m
+  EXECUTE_PROCESS( COMMAND uname -m
+		   OUTPUT_STRIP_TRAILING_WHITESPACE
+		   ERROR_QUIET
+		   OUTPUT_VARIABLE HWY_ARCH)
+else (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
+  set(HWY_ARCH ${CMAKE_SYSTEM_PROCESSOR})
+endif (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
+message(STATUS "Architecture: " ${HWY_ARCH})
+if (HWY_ARCH MATCHES "mips")
+  target_link_options(hwy PUBLIC "LINKER:-z,noexecstack")
+endif (HWY_ARCH MATCHES "mips")
+
+
+if (HWY_ENABLE_CONTRIB)
+add_library(hwy_contrib ${HWY_LIBRARY_TYPE} ${HWY_CONTRIB_SOURCES})
+target_link_libraries(hwy_contrib hwy)
+target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS})
+set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON)
+set_target_properties(hwy_contrib PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
+target_include_directories(hwy_contrib PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+target_compile_features(hwy_contrib PUBLIC cxx_std_11)
+set_target_properties(hwy_contrib PROPERTIES
+  LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
+# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
+if(UNIX AND NOT APPLE)
+  set_property(TARGET hwy_contrib APPEND_STRING PROPERTY
+    LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
+endif()
+endif()  # HWY_ENABLE_CONTRIB
+
+add_library(hwy_test ${HWY_LIBRARY_TYPE} ${HWY_TEST_SOURCES})
+target_link_libraries(hwy_test hwy)
+target_compile_options(hwy_test PRIVATE ${HWY_FLAGS})
+set_property(TARGET hwy_test PROPERTY POSITION_INDEPENDENT_CODE ON)
+set_target_properties(hwy_test PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
+target_include_directories(hwy_test PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+target_compile_features(hwy_test PUBLIC cxx_std_11)
+set_target_properties(hwy_test PROPERTIES
+  LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
+# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
+if(UNIX AND NOT APPLE)
+  set_property(TARGET hwy_test APPEND_STRING PROPERTY
+    LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
+endif()
+
+# -------------------------------------------------------- hwy_list_targets
+# Generate a tool to print the compiled-in targets as defined by the current
+# flags. This tool will print to stderr at build time, after building hwy.
+add_executable(hwy_list_targets hwy/tests/list_targets.cc)
+target_compile_options(hwy_list_targets PRIVATE ${HWY_FLAGS})
+target_link_libraries(hwy_list_targets hwy)
+target_include_directories(hwy_list_targets PRIVATE
+  $<TARGET_PROPERTY:hwy,INCLUDE_DIRECTORIES>)
+# $<TARGET_FILE:...> always expands to the full path of the executable.
+# The bare target name cannot always be run directly (e.g. because it lacks
+# the '.\' prefix), so the command to run must contain the full path
+# and the emulator prefix (if any).
+if (NOT CMAKE_CROSSCOMPILING OR CMAKE_CROSSCOMPILING_EMULATOR)
+add_custom_command(TARGET hwy_list_targets POST_BUILD
+    COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:hwy_list_targets> || (exit 0))
+endif()
+
+# --------------------------------------------------------
+# Allow skipping the following sections for projects that do not need them:
+# tests, examples, benchmarks and installation.
+
+# -------------------------------------------------------- install library
+if (HWY_ENABLE_INSTALL)
+
+install(TARGETS hwy
+  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+  RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
+# Install all the headers keeping the relative path to the current directory
+# when installing them.
+foreach (source ${HWY_SOURCES})
+  # Only headers are installed. The backslash is doubled because CMake
+  # consumes one level of escaping inside a quoted argument; a single "\."
+  # would reach the regex engine as ".", which matches any character.
+  if ("${source}" MATCHES "\\.h$")
+    get_filename_component(dirname "${source}" DIRECTORY)
+    install(FILES "${source}"
+        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
+  endif()
+endforeach()
+
+if (HWY_ENABLE_CONTRIB)
+install(TARGETS hwy_contrib
+  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+  RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
+# Install all the headers keeping the relative path to the current directory
+# when installing them.
+foreach (source ${HWY_CONTRIB_SOURCES})
+  # Only headers are installed. The backslash is doubled because CMake
+  # consumes one level of escaping inside a quoted argument; a single "\."
+  # would reach the regex engine as ".", which matches any character.
+  if ("${source}" MATCHES "\\.h$")
+    get_filename_component(dirname "${source}" DIRECTORY)
+    install(FILES "${source}"
+        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
+  endif()
+endforeach()
+endif()  # HWY_ENABLE_CONTRIB
+
+install(TARGETS hwy_test
+  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+  RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
+# Install all the headers keeping the relative path to the current directory
+# when installing them.
+foreach (source ${HWY_TEST_SOURCES})
+  # Only headers are installed. The backslash is doubled because CMake
+  # consumes one level of escaping inside a quoted argument; a single "\."
+  # would reach the regex engine as ".", which matches any character.
+  if ("${source}" MATCHES "\\.h$")
+    get_filename_component(dirname "${source}" DIRECTORY)
+    install(FILES "${source}"
+        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
+  endif()
+endforeach()
+
+# Add a pkg-config file for libhwy and the contrib/test libraries.
+# Use this project's own version (set by project(hwy VERSION ...)) instead of
+# CMAKE_PROJECT_VERSION: the latter is the *top-level* project's version, so it
+# is wrong when highway is included via add_subdirectory(), and it does not
+# exist before CMake 3.12 although this file only requires 3.10.
+set(HWY_LIBRARY_VERSION "${hwy_VERSION}")
+set(HWY_PC_FILES libhwy.pc libhwy-test.pc)
+if (HWY_ENABLE_CONTRIB)
+list(APPEND HWY_PC_FILES libhwy-contrib.pc)
+endif()  # HWY_ENABLE_CONTRIB
+# Expand each <name>.pc.in template (only @VAR@ references are substituted)
+# into the build directory and install the result under <libdir>/pkgconfig.
+foreach (pc ${HWY_PC_FILES})
+  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${pc}.in" "${pc}" @ONLY)
+  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${pc}"
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+endforeach()
+
+endif()  # HWY_ENABLE_INSTALL
+# -------------------------------------------------------- Examples
+if (HWY_ENABLE_EXAMPLES)
+
+# Avoids mismatch between GTest's static CRT and our dynamic.
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+
+# Programming exercise with integrated benchmark
+add_executable(hwy_benchmark hwy/examples/benchmark.cc)
+target_sources(hwy_benchmark PRIVATE
+    hwy/nanobenchmark.h)
+# Try adding one of -DHWY_COMPILE_ONLY_SCALAR, -DHWY_COMPILE_ONLY_EMU128 or
+# -DHWY_COMPILE_ONLY_STATIC to observe the difference in targets printed.
+target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS})
+target_link_libraries(hwy_benchmark hwy)
+set_target_properties(hwy_benchmark
+    PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/")
+
+endif()  # HWY_ENABLE_EXAMPLES
+# -------------------------------------------------------- Tests
+
+include(CTest)
+
+if(BUILD_TESTING AND HWY_ENABLE_TESTS)
+enable_testing()
+include(GoogleTest)
+
+set(HWY_SYSTEM_GTEST OFF CACHE BOOL "Use pre-installed googletest?")
+if(HWY_SYSTEM_GTEST)
+find_package(GTest REQUIRED)
+else()
+# Download and unpack googletest at configure time
+configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
+execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
+  RESULT_VARIABLE result
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
+if(result)
+  message(FATAL_ERROR "CMake step for googletest failed: ${result}")
+endif()
+execute_process(COMMAND ${CMAKE_COMMAND} --build .
+  RESULT_VARIABLE result
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
+if(result)
+  message(FATAL_ERROR "Build step for googletest failed: ${result}")
+endif()
+
+# Prevent overriding the parent project's compiler/linker
+# settings on Windows
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+
+# Add googletest directly to our build. This defines
+# the gtest and gtest_main targets.
+add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
+                 ${CMAKE_CURRENT_BINARY_DIR}/googletest-build
+                 EXCLUDE_FROM_ALL)
+endif()  # HWY_SYSTEM_GTEST
+
+set(HWY_TEST_FILES
+  hwy/contrib/algo/copy_test.cc
+  hwy/contrib/algo/find_test.cc
+  hwy/contrib/algo/transform_test.cc
+  hwy/aligned_allocator_test.cc
+  hwy/base_test.cc
+  hwy/highway_test.cc
+  hwy/nanobenchmark_test.cc
+  hwy/targets_test.cc
+  hwy/examples/skeleton_test.cc
+  hwy/tests/arithmetic_test.cc
+  hwy/tests/blockwise_test.cc
+  hwy/tests/blockwise_shift_test.cc
+  hwy/tests/combine_test.cc
+  hwy/tests/compare_test.cc
+  hwy/tests/compress_test.cc
+  hwy/tests/convert_test.cc
+  hwy/tests/crypto_test.cc
+  hwy/tests/demote_test.cc
+  hwy/tests/float_test.cc
+  hwy/tests/if_test.cc
+  hwy/tests/interleaved_test.cc
+  hwy/tests/logical_test.cc
+  hwy/tests/mask_test.cc
+  hwy/tests/mask_mem_test.cc
+  hwy/tests/memory_test.cc
+  hwy/tests/mul_test.cc
+  hwy/tests/reduction_test.cc
+  hwy/tests/reverse_test.cc
+  hwy/tests/shift_test.cc
+  hwy/tests/swizzle_test.cc
+  hwy/tests/test_util_test.cc
+)
+
+set(HWY_TEST_LIBS hwy hwy_test)
+
+if (HWY_ENABLE_CONTRIB)
+list(APPEND HWY_TEST_LIBS hwy_contrib)
+
+list(APPEND HWY_TEST_FILES
+  hwy/contrib/dot/dot_test.cc
+  hwy/contrib/image/image_test.cc
+  # Disabled due to SIGILL in clang7 debug build during gtest discovery phase,
+  # not reproducible locally. Still tested via bazel build.
+  # hwy/contrib/math/math_test.cc
+  hwy/contrib/sort/sort_test.cc
+)
+endif()  # HWY_ENABLE_CONTRIB
+
+if(HWY_SYSTEM_GTEST)
+  if (CMAKE_VERSION VERSION_LESS 3.20)
+    set(HWY_GTEST_LIBS GTest::GTest GTest::Main)
+  else()
+    set(HWY_GTEST_LIBS GTest::gtest GTest::gtest_main)
+  endif()
+else()
+  set(HWY_GTEST_LIBS gtest gtest_main)
+endif()
+
+file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests)
+foreach (TESTFILE IN LISTS HWY_TEST_FILES)
+  # The TESTNAME is the name without the extension or directory.
+  get_filename_component(TESTNAME ${TESTFILE} NAME_WE)
+  add_executable(${TESTNAME} ${TESTFILE})
+  target_compile_options(${TESTNAME} PRIVATE ${HWY_FLAGS})
+  # Test all targets, not just the best/baseline. This changes the default
+  # policy to all-attainable; note that setting -DHWY_COMPILE_* directly can
+  # cause compile errors because only one may be set, and other CMakeLists.txt
+  # that include us may set them.
+  target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1)
+
+  target_link_libraries(${TESTNAME} ${HWY_TEST_LIBS} ${HWY_GTEST_LIBS})
+  # Output test targets in the test directory.
+  set_target_properties(${TESTNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "tests")
+
+  if (HWY_EMSCRIPTEN)
+    set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "-s SINGLE_FILE=1")
+  endif()
+
+  if(${CMAKE_VERSION} VERSION_LESS "3.10.3")
+    gtest_discover_tests(${TESTNAME} TIMEOUT 60)
+  else ()
+    gtest_discover_tests(${TESTNAME} DISCOVERY_TIMEOUT 60)
+  endif ()
+endforeach ()
+
+# The skeleton test uses the skeleton library code.
+target_sources(skeleton_test PRIVATE hwy/examples/skeleton.cc)
+
+endif()  # BUILD_TESTING
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 2.8.12...3.10)  # Policy max avoids the "< 3.5" compatibility error in CMake 4.
+
+project(googletest-download NONE)  # NONE: no languages are enabled for this helper project.
+
+include(ExternalProject)
+ExternalProject_Add(googletest  # Fetches a pinned commit; every configure/build/install/test step is empty.
+  GIT_REPOSITORY    https://github.com/google/googletest.git
+  GIT_TAG           43efa0a4efd40c78b9210d15373112081899a97c
+  SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-src"
+  BINARY_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-build"
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND     ""
+  INSTALL_COMMAND   ""
+  TEST_COMMAND      ""
+)
@@ -0,0 +1,33 @@
+# How to Contribute
+
+We'd love to accept your patches and contributions to this project. There are
+just a few small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use GitHub pull requests for this purpose. Consult
+[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+information on using pull requests.
+
+## Testing
+
+This repository is used by JPEG XL, so major API changes will require
+coordination. Please get in touch with us beforehand, e.g. by raising an issue.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
@@ -0,0 +1,322 @@
+# Efficient and performance-portable vector software
+
+[//]: # (placeholder, do not remove)
+
+Highway is a C++ library that provides portable SIMD/vector intrinsics.
+
+## Why
+
+We are passionate about high-performance software. We see major untapped
+potential in CPUs (servers, mobile, desktops). Highway is for engineers who want
+to reliably and economically push the boundaries of what is possible in
+software.
+
+## How
+
+CPUs provide SIMD/vector instructions that apply the same operation to multiple
+data items. This can reduce energy usage e.g. *fivefold* because fewer
+instructions are executed. We also often see *5-10x* speedups.
+
+Highway makes SIMD/vector programming practical and workable according to these
+guiding principles:
+
+**Does what you expect**: Highway is a C++ library with carefully-chosen
+functions that map well to CPU instructions without extensive compiler
+transformations. The resulting code is more predictable and robust to code
+changes/compiler updates than autovectorization.
+
+**Works on widely-used platforms**: Highway supports four architectures; the
+same application code can target eight instruction sets, including those with
+'scalable' vectors (size unknown at compile time). Highway only requires C++11
+and supports four families of compilers. If you would like to use Highway on
+other platforms, please raise an issue.
+
+**Flexible to deploy**: Applications using Highway can run on heterogeneous
+clouds or client devices, choosing the best available instruction set at
+runtime. Alternatively, developers may choose to target a single instruction set
+without any runtime overhead. In both cases, the application code is the same
+except for swapping `HWY_STATIC_DISPATCH` with `HWY_DYNAMIC_DISPATCH` plus one
+line of code.
+
+**Suitable for a variety of domains**: Highway provides an extensive set of
+operations, used for image processing (floating-point), compression, video
+analysis, linear algebra, cryptography, sorting and random generation. We
+recognise that new use-cases may require additional ops and are happy to add
+them where it makes sense (e.g. no performance cliffs on some architectures). If
+you would like to discuss, please file an issue.
+
+**Rewards data-parallel design**: Highway provides tools such as Gather,
+MaskedLoad, and FixedTag to enable speedups for legacy data structures. However,
+the biggest gains are unlocked by designing algorithms and data structures for
+scalable vectors. Helpful techniques include batching, structure-of-array
+layouts, and aligned/padded allocations.
+
+## Examples
+
+Online demos using Compiler Explorer:
+
+-   [multiple targets with dynamic dispatch](https://gcc.godbolt.org/z/zP7MYe9Yf)
+    (recommended)
+-   [single target using -m flags](https://gcc.godbolt.org/z/rGnjMevKG)
+
+Projects using Highway: (to add yours, feel free to raise an issue or contact us
+via the below email)
+
+*   [iresearch database index](https://github.com/iresearch-toolkit/iresearch/blob/e7638e7a4b99136ca41f82be6edccf01351a7223/core/utils/simd_utils.hpp)
+*   [JPEG XL image codec](https://github.com/libjxl/libjxl)
+*   [Grok JPEG 2000 image codec](https://github.com/GrokImageCompression/grok)
+*   [vectorized Quicksort](https://github.com/google/highway/tree/master/hwy/contrib/sort) ([paper](https://arxiv.org/abs/2205.05982))
+
+## Current status
+
+### Targets
+
+Supported targets: scalar, SSSE3, SSE4, AVX2, AVX-512, AVX3_DL (~Icelake,
+requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE, SVE2,
+WASM SIMD, RISC-V V.
+
+SVE was initially tested using farm_sve (see acknowledgments).
+
+### Versioning
+
+Highway releases aim to follow the semver.org system (MAJOR.MINOR.PATCH),
+incrementing MINOR after backward-compatible additions and PATCH after
+backward-compatible fixes. We recommend using releases (rather than the Git tip)
+because they are tested more extensively, see below.
+
+The current version 1.0 signals an increased focus on backwards compatibility.
+Applications using documented functionality will remain compatible with future
+updates that have the same major version number.
+
+### Testing
+
+Continuous integration tests build with a recent version of Clang (running on
+native x86, or QEMU for RVV and ARM) and MSVC 2019 (v19.28, running on native
+x86).
+
+Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via GCC
+cross-compile. See the [testing process](g3doc/release_testing_process.md) for
+details.
+
+### Related modules
+
+The `contrib` directory contains SIMD-related utilities: an image class with
+aligned rows, a math library (16 functions already implemented, mostly
+trigonometry), and functions for computing dot products and sorting.
+
+## Installation
+
+This project uses CMake to generate and build. In a Debian-based system you can
+install it via:
+
+```bash
+sudo apt install cmake
+```
+
+Highway's unit tests use [googletest](https://github.com/google/googletest).
+By default, Highway's CMake downloads this dependency at configuration time.
+You can disable this by setting the `HWY_SYSTEM_GTEST` CMake variable to ON and
+installing gtest separately:
+
+```bash
+sudo apt install libgtest-dev
+```
+
+To build Highway as a shared or static library (depending on BUILD_SHARED_LIBS),
+the standard CMake workflow can be used:
+
+```bash
+mkdir -p build && cd build
+cmake ..
+make -j && make test
+```
+
+Or you can run `run_tests.sh` (`run_tests.bat` on Windows).
+
+Bazel is also supported for building, but it is not as widely used/tested.
+
+## Quick start
+
+You can use the `benchmark` inside examples/ as a starting point.
+
+A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations
+and their parameters, and the [instruction_matrix](g3doc/instruction_matrix.pdf)
+indicates the number of instructions per operation.
+
+The [FAQ](g3doc/faq.md) answers questions about portability, API design and
+where to find more information.
+
+We recommend using full SIMD vectors whenever possible for maximum performance
+portability. To obtain them, pass a `ScalableTag<float>` (or equivalently
+`HWY_FULL(float)`) tag to functions such as `Zero/Set/Load`. There are two
+alternatives for use-cases requiring an upper bound on the lanes:
+
+-   For up to `N` lanes, specify `CappedTag<T, N>` or the equivalent
+    `HWY_CAPPED(T, N)`. The actual number of lanes will be `N` rounded down to
+    the nearest power of two, such as 4 if `N` is 5, or 8 if `N` is 8. This is
+    useful for data structures such as a narrow matrix. A loop is still required
+    because vectors may actually have fewer than `N` lanes.
+
+-   For exactly a power of two `N` lanes, specify `FixedTag<T, N>`. The largest
+    supported `N` depends on the target, but is guaranteed to be at least
+    `16/sizeof(T)`.
+
+Due to ADL restrictions, user code calling Highway ops must either:
+*   Reside inside `namespace hwy { namespace HWY_NAMESPACE {`; or
+*   prefix each op with an alias such as `namespace hn = hwy::HWY_NAMESPACE;
+    hn::Add()`; or
+*   add using-declarations for each op used: `using hwy::HWY_NAMESPACE::Add;`.
+
+Additionally, each function that calls Highway ops (such as `Load`) must either
+be prefixed with `HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
+`HWY_AFTER_NAMESPACE()`. Lambda functions currently require `HWY_ATTR` before
+their opening brace.
+
+The entry points into code using Highway differ slightly depending on whether
+they use static or dynamic dispatch.
+
+*   For static dispatch, `HWY_TARGET` will be the best available target among
+    `HWY_BASELINE_TARGETS`, i.e. those allowed for use by the compiler (see
+    [quick-reference](g3doc/quick_reference.md)). Functions inside
+    `HWY_NAMESPACE` can be called using `HWY_STATIC_DISPATCH(func)(args)` within
+    the same module they are defined in. You can call the function from other
+    modules by wrapping it in a regular function and declaring the regular
+    function in a header.
+
+*   For dynamic dispatch, a table of function pointers is generated via the
+    `HWY_EXPORT` macro that is used by `HWY_DYNAMIC_DISPATCH(func)(args)` to
+    call the best function pointer for the current CPU's supported targets. A
+    module is automatically compiled for each target in `HWY_TARGETS` (see
+    [quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is
+    defined and `foreach_target.h` is included.
+
+When using dynamic dispatch, `foreach_target.h` is included from translation
+units (.cc files), not headers. Headers containing vector code shared between
+several translation units require a special include guard, for example the
+following taken from `examples/skeleton-inl.h`:
+
+```
+#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#else
+#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#endif
+
+#include "hwy/highway.h"
+// Your vector code
+#endif
+```
+
+By convention, we name such headers `-inl.h` because their contents (often
+function templates) are usually inlined.
+
+## Compiler flags
+
+Applications should be compiled with optimizations enabled - without inlining,
+SIMD code may slow down by factors of 10 to 100. For clang and GCC, `-O2` is
+generally sufficient.
+
+For MSVC, we recommend compiling with `/Gv` to allow non-inlined functions to
+pass vector arguments in registers. If intending to use the AVX2 target together
+with half-width vectors (e.g. for `PromoteTo`), it is also important to compile
+with `/arch:AVX2`. This seems to be the only way to generate VEX-encoded SSE4
+instructions on MSVC. Otherwise, mixing VEX-encoded AVX2 instructions and
+non-VEX SSE4 may cause severe performance degradation. Unfortunately, the
+resulting binary will then require AVX2. Note that no such flag is needed for
+clang and GCC because they support target-specific attributes, which we use to
+ensure proper VEX code generation for AVX2 targets.
+
+## Strip-mining loops
+
+To vectorize a loop, "strip-mining" transforms it into an outer loop and inner
+loop with number of iterations matching the preferred vector width.
+
+In this section, let `T` denote the element type, `d = ScalableTag<T>`, `count`
+the number of elements to process, and `N = Lanes(d)` the number of lanes in a
+full vector. Assume the loop body is given as a function `template<bool partial,
+class D> void LoopBody(D d, size_t index, size_t max_n)`.
+
+Highway offers several ways to express loops where `N` need not divide `count`:
+
+*   Ensure all inputs/outputs are padded. Then the loop is simply
+
+    ```
+    for (size_t i = 0; i < count; i += N) LoopBody<false>(d, i, 0);
+    ```
+    Here, the template parameter and third function argument are not needed.
+
+    This is the preferred option, unless `N` is in the thousands and vector
+    operations are pipelined with long latencies. This was the case for
+    supercomputers in the 90s, but nowadays ALUs are cheap and we see most
+    implementations split vectors into 1, 2 or 4 parts, so there is little cost
+    to processing entire vectors even if we do not need all their lanes. Indeed
+    this avoids the (potentially large) cost of predication or partial
+    loads/stores on older targets, and does not duplicate code.
+
+*   Use the `Transform*` functions in hwy/contrib/algo/transform-inl.h. This
+    takes care of the loop and remainder handling and you simply define a
+    generic lambda function (C++14) or functor which receives the current vector
+    from the input/output array, plus optionally vectors from up to two extra
+    input arrays, and returns the value to write to the input/output array.
+
+    Here is an example implementing the BLAS function SAXPY (`alpha * x + y`):
+
+    ```
+    Transform1(d, x, n, y, [](auto d, const auto v, const auto v1) HWY_ATTR {
+      return MulAdd(Set(d, alpha), v, v1);
+    });
+    ```
+
+*   Process whole vectors as above, followed by a scalar loop:
+
+    ```
+    size_t i = 0;
+    for (; i + N <= count; i += N) LoopBody<false>(d, i, 0);
+    for (; i < count; ++i) LoopBody<false>(CappedTag<T, 1>(), i, 0);
+    ```
+    The template parameter and third function argument are again not needed.
+
+    This avoids duplicating code, and is reasonable if `count` is large.
+    If `count` is small, the second loop may be slower than the next option.
+
+*   Process whole vectors as above, followed by a single call to a modified
+    `LoopBody` with masking:
+
+    ```
+    size_t i = 0;
+    for (; i + N <= count; i += N) {
+      LoopBody<false>(d, i, 0);
+    }
+    if (i < count) {
+      LoopBody<true>(d, i, count - i);
+    }
+    ```
+    Now the template parameter and third function argument can be used inside
+    `LoopBody` to non-atomically 'blend' the first `max_n` lanes of `v`
+    with the previous contents of memory at subsequent locations:
+    `BlendedStore(v, FirstN(d, max_n), d, pointer);`. Similarly,
+    `MaskedLoad(FirstN(d, max_n), d, pointer)` loads the first
+    `max_n` elements and returns zero in other lanes.
+
+    This is a good default when it is infeasible to ensure vectors are padded,
+    but is only safe `#if !HWY_MEM_OPS_MIGHT_FAULT`!
+    In contrast to the scalar loop, only a single final iteration is needed.
+    The increased code size from two loop bodies is expected to be worthwhile
+    because it avoids the cost of masking in all but the final iteration.
+
+## Additional resources
+
+*   [Highway introduction (slides)](g3doc/highway_intro.pdf)
+*   [Overview of instructions per operation on different architectures](g3doc/instruction_matrix.pdf)
+*   [Design philosophy and comparison](g3doc/design_philosophy.md)
+*   [Implementation details](g3doc/impl_details.md)
+
+## Acknowledgments
+
+We have used [farm-sve](https://gitlab.inria.fr/bramas/farm-sve) by Berenger
+Bramas; it has proved useful for checking the SVE port on an x86 development
+machine.
+
+This is not an officially supported Google product.
+Contact: janwas@google.com
@@ -0,0 +1,24 @@
+workspace(name = "highway")
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+http_archive(  # googletest, pinned to a single commit archive and its sha256.
+  name = "com_google_googletest",
+  urls = ["https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip"],
+  sha256 = "5cf189eb6847b4f8fc603a3ffff3b0771c08eec7dd4bd961bfd45477dd13eb73",
+  strip_prefix = "googletest-609281088cfefc76f9d0ce82e1ff6c30cc3591e5",
+)
+
+# See https://google.github.io/googletest/quickstart-bazel.html
+http_archive(  # rules_cc, pinned to a single commit archive and its sha256.
+  name = "rules_cc",
+  urls = ["https://github.com/bazelbuild/rules_cc/archive/40548a2974f1aea06215272d9c2b47a14a24e556.zip"],
+  sha256 = "56ac9633c13d74cb71e0546f103ce1c58810e4a76aa8325da593ca4277908d72",
+  strip_prefix = "rules_cc-40548a2974f1aea06215272d9c2b47a14a24e556",
+)
+
+# Need recent version for config_setting_group
+http_archive(  # NOTE(review): no sha256 here, unlike the two archives above; consider pinning one.
+    name = "bazel_skylib",
+    urls = ["https://github.com/bazelbuild/bazel-skylib/releases/download/0.9.0/bazel_skylib-0.9.0.tar.gz"],
+)
@@ -0,0 +1,157 @@
+highway (1.0.2-1) UNRELEASED; urgency=medium
+
+  * Add ExclusiveNeither, FindKnownFirstTrue, Ne128
+  * Add 16-bit SumOfLanes/ReorderWidenMulAccumulate/ReorderDemote2To
+  * Faster sort for low-entropy input, improved pivot selection
+  * Add GN build system, Highway FAQ, k32v32 type to vqsort
+  * CMake: Support find_package(GTest), add rvv-inl.h, add HWY_ENABLE_TESTS
+  * Fix MIPS and C++20 build, Apple LLVM 10.3 detection, EMU128 AllTrue on RVV
+  * Fix missing exec_prefix, RVV build, warnings, libatomic linking
+  * Work around GCC 10.4 issue, disabled RDCYCLE, arm7 with vfpv3
+  * Documentation/example improvements
+  * Support static dispatch to SVE2_128 and SVE_256
+
+ -- Jan Wassenberg <janwas@google.com>  Thu, 27 Oct 2022 17:00:00 +0200
+
+highway (1.0.1-1) UNRELEASED; urgency=medium
+
+  * Add Eq128, i64 Mul, unsigned->float ConvertTo
+  * Faster sort for few unique keys, more robust pivot selection
+  * Fix: floating-point generator for sort tests, Min/MaxOfLanes for i16
+  * Fix: avoid always_inline in debug, link atomic
+  * GCC warnings: string.h, maybe-uninitialized, ignored-attributes
+  * GCC warnings: preprocessor int overflow, spurious use-after-free/overflow
+  * Doc: <=HWY_AVX3, Full32/64/128, how to use generic-inl
+
+ -- Jan Wassenberg <janwas@google.com>  Tue, 23 Aug 2022 10:00:00 +0200
+
+highway (1.0.0-1) UNRELEASED; urgency=medium
+
+  * ABI change: 64-bit target values, more room for expansion
+  * Add CompressBlocksNot, CompressNot, Lt128Upper, Min/Max128Upper, TruncateTo
+  * Add HWY_SVE2_128 target
+  * Sort speedups especially for 128-bit
+  * Documentation clarifications
+  * Faster NEON CountTrue/FindFirstTrue/AllFalse/AllTrue
+  * Improved SVE codegen
+  * Fix u16x8 ConcatEven/Odd, SSSE3 i64 Lt
+  * MSVC 2017 workarounds
+  * Support for runtime dispatch on Arm/GCC/Linux
+
+ -- Jan Wassenberg <janwas@google.com>  Wed, 27 Jul 2022 10:00:00 +0200
+
+highway (0.17.0-1) UNRELEASED; urgency=medium
+
+  * Add ExtractLane, InsertLane, IsInf, IsFinite, IsNaN
+  * Add StoreInterleaved2, LoadInterleaved2/3/4, BlendedStore, SafeFillN
+  * Add MulFixedPoint15, Or3
+  * Add Copy[If], Find[If], Generate, Replace[If] algos
+  * Add HWY_EMU128 target (replaces HWY_SCALAR)
+  * HWY_RVV is feature-complete
+  * Add HWY_ENABLE_CONTRIB build flag, HWY_NATIVE_FMA, HWY_WANT_SSSE3/SSE4 macros
+  * Extend ConcatOdd/Even and StoreInterleaved* to all types
+  * Allow CappedTag<T, nonPowerOfTwo>
+  * Sort speedups: 2x for AVX2, 1.09x for AVX3; avoid x86 malloc
+  * Expand documentation
+  * Fix RDTSCP crash in nanobenchmark
+  * Fix XCR0 check (was ignoring AVX3 on ICL)
+  * Support Arm/RISC-V timers
+
+ -- Jan Wassenberg <janwas@google.com>  Fri, 20 May 2022 10:00:00 +0200
+
+highway (0.16.0-1) UNRELEASED; urgency=medium
+
+  * Add contrib/sort (vectorized quicksort)
+  * Add IfNegativeThenElse, IfVecThenElse
+  * Add Reverse2,4,8, ReverseBlocks, DupEven/Odd, AESLastRound
+  * Add OrAnd, Min128, Max128, Lt128, SumsOf8
+  * Support capped/partial vectors on RVV/SVE, int64 in WASM
+  * Support SVE2, shared library build
+  * Remove deprecated overloads without the required d arg (UpperHalf etc.)
+
+ -- Jan Wassenberg <janwas@google.com>  Thu, 03 Feb 2022 11:00:00 +0100
+
+highway (0.15.0-1) UNRELEASED; urgency=medium
+
+  * New ops: CompressBlendedStore, ConcatOdd/Even, IndicesFromVec
+  * New ops: OddEvenBlocks, SwapAdjacentBlocks, Reverse, RotateRight
+  * Add bf16, unsigned comparisons, more lane types for Reverse/TableLookupLanes
+  * Contrib: add sort(ing network) and dot(product)
+  * Targets: update RVV for LLVM, add experimental WASM2
+  * Separate library hwy_test for test utils
+  * Add non-macro Simd<> aliases
+  * Fixes: const V& for GCC, AVX3 BZHI, POPCNT with AVX on MSVC, avoid %zu
+
+ -- Jan Wassenberg <janwas@google.com>  Wed, 10 Nov 2021 10:00:00 +0100
+
+highway (0.14.2-1) UNRELEASED; urgency=medium
+
+  * Add MaskedLoad
+  * Fix non-glibc PPC, Windows GCC, MSVC 19.14
+  * Opt-in for -Werror; separate design_philosophy.md
+
+ -- Jan Wassenberg <janwas@google.com>  Tue, 24 Aug 2021 15:00:00 +0200
+
+highway (0.14.1-1) UNRELEASED; urgency=medium
+
+  * Add LoadMaskBits, CompressBits[Store]
+  * Fix CPU feature check (AES/F16C) and warnings
+  * Improved DASSERT - disabled in optimized builds
+
+ -- Jan Wassenberg <janwas@google.com>  Tue, 17 Aug 2021 14:00:00 +0200
+
+highway (0.14.0-1) UNRELEASED; urgency=medium
+
+  * Add SVE, S-SSE3, AVX3_DL targets
+  * Support partial vectors in all ops
+  * Add PopulationCount, FindFirstTrue, Ne, TableLookupBytesOr0
+  * Add AESRound, CLMul, MulOdd, HWY_CAP_FLOAT16
+
+ -- Jan Wassenberg <janwas@google.com>  Thu, 29 Jul 2021 15:00:00 +0200
+
+highway (0.12.2-1) UNRELEASED; urgency=medium
+
+  * fix scalar-only test and Windows macro conflict with Load/StoreFence
+  * replace deprecated wasm intrinsics
+
+ -- Jan Wassenberg <janwas@google.com>  Mon, 31 May 2021 16:00:00 +0200
+
+highway (0.12.1-1) UNRELEASED; urgency=medium
+
+  * doc updates, ARM GCC support, fix s390/ppc, complete partial vectors
+  * fix warnings, faster ARM div/sqrt, separate hwy_contrib library
+  * add Abs(i64)/FirstN/Pause, enable AVX2 on MSVC
+
+ -- Jan Wassenberg <janwas@google.com>  Wed, 19 May 2021 15:00:00 +0200
+
+highway (0.12.0-1) UNRELEASED; urgency=medium
+
+  * Add Shift*8, Compress16, emulated Scatter/Gather, StoreInterleaved3/4
+  * Remove deprecated HWY_*_LANES, deprecate HWY_GATHER_LANES
+  * Proper IEEE rounding, reduce libstdc++ usage, inlined math
+
+ -- Jan Wassenberg <janwas@google.com>  Thu, 15 Apr 2021 20:00:00 +0200
+
+highway (0.11.1-1) UNRELEASED; urgency=medium
+
+  * Fix clang7 asan error, finish f16 conversions and add test
+
+ -- Jan Wassenberg <janwas@google.com>  Thu, 25 Feb 2021 16:00:00 +0200
+
+highway (0.11.0-1) UNRELEASED; urgency=medium
+
+  * Add RVV+mask logical ops, allow Shl/ShiftLeftSame on all targets, more math
+
+ -- Jan Wassenberg <janwas@google.com>  Thu, 18 Feb 2021 20:00:00 +0200
+
+highway (0.7.0-1) UNRELEASED; urgency=medium
+
+  * Added API stability notice, Compress[Store], contrib/, SignBit, CopySign
+
+ -- Jan Wassenberg <janwas@google.com>  Tue, 5 Jan 2021 17:00:00 +0200
+
+highway (0.1-1) UNRELEASED; urgency=medium
+
+  * Initial debian package.
+
+ -- Alex Deymo <deymo@google.com>  Mon, 19 Oct 2020 16:48:07 +0200
@@ -0,0 +1 @@
+10
@@ -0,0 +1,23 @@
+Source: highway
+Maintainer: JPEG XL Maintainers <jpegxl@google.com>
+Section: misc
+Priority: optional
+Standards-Version: 3.9.8
+# The packaging declares debhelper compat level 10, which needs debhelper 10+.
+Build-Depends: cmake,
+               debhelper (>= 10),
+               libgtest-dev
+Homepage: https://github.com/google/highway
+
+Package: libhwy-dev
+Architecture: any
+Section: libdevel
+Depends: ${misc:Depends}
+Description: Efficient and performance-portable SIMD wrapper (developer files)
+ This library provides type-safe and source-code portable wrappers over
+ existing platform-specific intrinsics. Its design aims for simplicity,
+ reliable efficiency across platforms, and immediate usability with current
+ compilers.
+ .
+ This package installs the development files. There's no runtime library
+ since most of Highway is implemented in headers and only a very small
+ static library is needed.
@@ -0,0 +1,20 @@
+Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Upstream-Name: highway
+
+Files: *
+Copyright: 2020 Google LLC
+License: Apache-2.0
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ .
+      http://www.apache.org/licenses/LICENSE-2.0
+ .
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ .
+ On Debian systems, the complete text of the Apache License, Version 2
+ can be found in "/usr/share/common-licenses/Apache-2.0".
@@ -0,0 +1,6 @@
+#!/usr/bin/make -f
+# Catch-all target: every packaging step is delegated to dh, using the CMake
+# build system.
+%:
+	dh $@ --buildsystem=cmake
+
+# Configure step override: passes -DHWY_SYSTEM_GTEST=ON through to CMake.
+# NOTE(review): presumably this selects the system GoogleTest (libgtest-dev in
+# Build-Depends) - confirm against the project's CMakeLists.txt.
+override_dh_auto_configure:
+	dh_auto_configure -- -DHWY_SYSTEM_GTEST=ON
@@ -0,0 +1 @@
+3.0 (quilt)
@@ -0,0 +1,152 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/aligned_allocator.h"
+
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>  // malloc
+
+#include <atomic>
+#include <limits>
+
+#include "hwy/base.h"
+
+namespace hwy {
+namespace {
+
+// kAlignment: alignment, in bytes, of every pointer returned by
+// AllocateAlignedBytes.
+#if HWY_ARCH_RVV && defined(__riscv_vector)
+// Not actually an upper bound on the size, but this value prevents crossing a
+// 4K boundary (relevant on Andes).
+constexpr size_t kAlignment = HWY_MAX(HWY_ALIGNMENT, 4096);
+#else
+constexpr size_t kAlignment = HWY_ALIGNMENT;
+#endif
+
+// kAlias: the raw allocation is rounded up to a multiple of kAlias, and the
+// payload is then placed at a cyclic multiple of kAlignment past that point
+// (see NextAlignedOffset).
+#if HWY_ARCH_X86
+// On x86, aliasing can only occur at multiples of 2K, but that's too wasteful
+// if this is used for single-vector allocations. 256 (four times kAlignment
+// when HWY_ALIGNMENT is 64) is more reasonable.
+constexpr size_t kAlias = kAlignment * 4;
+#else
+constexpr size_t kAlias = kAlignment;
+#endif
+
+// Bookkeeping record that AllocateAlignedBytes writes immediately before each
+// payload and that the free/delete functions read back. Packed so that it has
+// no padding.
+#pragma pack(push, 1)
+struct AllocationHeader {
+  // Pointer returned by malloc() or the caller-supplied AllocPtr; this is what
+  // is eventually passed to free() or the FreePtr.
+  void* allocated;
+  // Number of payload bytes requested by the caller.
+  size_t payload_size;
+};
+#pragma pack(pop)
+
+// Returns a 'random' (cyclical) offset for AllocateAlignedBytes. Successive
+// calls step through the multiples of kAlignment below kAlias, then wrap.
+size_t NextAlignedOffset() {
+  constexpr uint32_t kNumSlots = kAlias / kAlignment;
+  // Shared call counter; its value is only used to pick the next slot.
+  static std::atomic<uint32_t> call_counter{0};
+  const uint32_t ticket = call_counter.fetch_add(1, std::memory_order_relaxed);
+  const size_t offset = kAlignment * (ticket % kNumSlots);
+  HWY_DASSERT((offset % kAlignment == 0) && offset <= kAlias);
+  return offset;
+}
+
+}  // namespace
+
+// Allocates a block large enough for `payload_size` bytes plus alignment slack
+// and an AllocationHeader, and returns the kAlignment-aligned payload pointer.
+// Memory comes from `alloc_ptr` (called with `opaque_ptr`), or from malloc()
+// if `alloc_ptr` is null. Returns nullptr if `payload_size` is at least half
+// the maximum size_t or if the underlying allocation fails.
+HWY_DLLEXPORT void* AllocateAlignedBytes(const size_t payload_size,
+                                         AllocPtr alloc_ptr, void* opaque_ptr) {
+  HWY_ASSERT(payload_size != 0);  // likely a bug in caller
+  if (payload_size >= std::numeric_limits<size_t>::max() / 2) {
+    HWY_DASSERT(false && "payload_size too large");
+    return nullptr;
+  }
+
+  // What: | misalign | unused | AllocationHeader |payload
+  // Size: |<= kAlias | offset                    |payload_size
+  //       ^allocated.^aligned.^header............^payload
+  // The header must immediately precede payload, which must remain aligned.
+  // To avoid wasting space, the header resides at the end of `unused`,
+  // which therefore cannot be empty: a zero cyclic offset is replaced by
+  // kAlignment (= RoundUpTo(sizeof(AllocationHeader), kAlignment)).
+  static_assert(sizeof(AllocationHeader) <= kAlignment, "Else: round up");
+  const size_t cyclic_offset = NextAlignedOffset();
+  const size_t offset = (cyclic_offset != 0) ? cyclic_offset : kAlignment;
+
+  const size_t allocated_size = kAlias + offset + payload_size;
+  void* const allocated = (alloc_ptr != nullptr)
+                              ? (*alloc_ptr)(opaque_ptr, allocated_size)
+                              : malloc(allocated_size);
+  if (allocated == nullptr) return nullptr;
+
+  static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2");
+  static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias");
+  // Always round up even if already aligned - we already asked for kAlias
+  // extra bytes and there's no way to give them back.
+  const uintptr_t aligned =
+      (reinterpret_cast<uintptr_t>(allocated) + kAlias) & ~(kAlias - 1);
+  const uintptr_t payload = aligned + offset;  // still aligned
+
+  // Stash `allocated` and payload_size inside header for FreeAlignedBytes().
+  // The allocated_size can be reconstructed from the payload_size.
+  AllocationHeader* const header =
+      reinterpret_cast<AllocationHeader*>(payload) - 1;
+  header->allocated = allocated;
+  header->payload_size = payload_size;
+
+  return HWY_ASSUME_ALIGNED(reinterpret_cast<void*>(payload), kAlignment);
+}
+
+// Releases memory obtained from AllocateAlignedBytes. Does nothing for a null
+// `aligned_pointer`. The original allocation is looked up in the header that
+// precedes the payload and passed to `free_ptr` (with `opaque_ptr`), or to
+// free() if `free_ptr` is null.
+HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer,
+                                    FreePtr free_ptr, void* opaque_ptr) {
+  if (aligned_pointer == nullptr) return;
+
+  const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
+  HWY_DASSERT(payload % kAlignment == 0);
+  // The header sits directly in front of the payload.
+  const AllocationHeader& header =
+      reinterpret_cast<const AllocationHeader*>(payload)[-1];
+
+  if (free_ptr != nullptr) {
+    (*free_ptr)(opaque_ptr, header.allocated);
+    return;
+  }
+  free(header.allocated);
+}
+
+// static
+// Runs `deleter` (if non-null) on the payload, passing the payload size stored
+// in the header, then releases the original allocation via `free_ptr` (with
+// `opaque_ptr`), or via free() if `free_ptr` is null. Does nothing for a null
+// `aligned_pointer`.
+HWY_DLLEXPORT void AlignedDeleter::DeleteAlignedArray(void* aligned_pointer,
+                                                      FreePtr free_ptr,
+                                                      void* opaque_ptr,
+                                                      ArrayDeleter deleter) {
+  if (aligned_pointer == nullptr) return;
+
+  const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
+  HWY_DASSERT(payload % kAlignment == 0);
+  // The header sits directly in front of the payload.
+  const AllocationHeader& header =
+      reinterpret_cast<const AllocationHeader*>(payload)[-1];
+
+  // Destroy the elements first; the memory is released only afterwards.
+  if (deleter != nullptr) {
+    (*deleter)(aligned_pointer, header.payload_size);
+  }
+
+  if (free_ptr != nullptr) {
+    (*free_ptr)(opaque_ptr, header.allocated);
+  } else {
+    free(header.allocated);
+  }
+}
+
+}  // namespace hwy
@@ -0,0 +1,212 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
+#define HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
+
+// Memory allocator with support for alignment and offsets.
+
+#include <stddef.h>
+
+#include <memory>
+
+#include "hwy/highway_export.h"
+
+namespace hwy {
+
+// Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which
+// requires a literal. This matches typical L1 cache line sizes, which prevents
+// false sharing.
+#define HWY_ALIGNMENT 64
+
+// Pointers to functions equivalent to malloc/free with an opaque void* passed
+// to them.
+using AllocPtr = void* (*)(void* opaque, size_t bytes);
+using FreePtr = void (*)(void* opaque, void* memory);
+
+// Returns null or a pointer to at least `payload_size` bytes of newly
+// allocated memory, aligned to the larger of HWY_ALIGNMENT and the vector
+// size. `payload_size` must be nonzero (the implementation asserts this).
+// Calls `alloc_ptr` with the passed `opaque_ptr` pointer to obtain memory, or
+// malloc() if `alloc_ptr` is null.
+HWY_DLLEXPORT void* AllocateAlignedBytes(size_t payload_size,
+                                         AllocPtr alloc_ptr, void* opaque_ptr);
+
+// Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it
+// must have been returned from a previous call to `AllocateAlignedBytes`.
+// Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if
+// `free_ptr` function is null, uses the default free().
+HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer,
+                                    FreePtr free_ptr, void* opaque_ptr);
+
+// Deleter that destroys the object(s) behind an aligned pointer and then frees
+// the memory. This is the counterpart of std::default_delete for aligned
+// objects. For a deleter that only frees (like free()), see AlignedFreer.
+class AlignedDeleter {
+ public:
+  // Uses the default free().
+  AlignedDeleter() {}
+  // Frees via `free_ptr`, which is called with `opaque_ptr`.
+  AlignedDeleter(FreePtr free_ptr, void* opaque_ptr)
+      : free_(free_ptr), opaque_ptr_(opaque_ptr) {}
+
+  // Destroys every T in the allocation and releases it.
+  template <typename T>
+  void operator()(T* aligned_pointer) const {
+    DeleteAlignedArray(aligned_pointer, free_, opaque_ptr_,
+                       TypedArrayDeleter<T>);
+  }
+
+ private:
+  // Explicitly calls the destructor of each T stored in the `size_in_bytes`
+  // bytes starting at `ptr`.
+  template <typename T>
+  static void TypedArrayDeleter(void* ptr, size_t size_in_bytes) {
+    T* const first = static_cast<T*>(ptr);
+    const size_t count = size_in_bytes / sizeof(T);
+    for (size_t idx = 0; idx != count; ++idx) {
+      first[idx].~T();
+    }
+  }
+
+  // Function prototype that calls the destructor for each element in a typed
+  // array. TypedArrayDeleter<T> matches this prototype.
+  using ArrayDeleter = void (*)(void* t_ptr, size_t t_size);
+
+  HWY_DLLEXPORT static void DeleteAlignedArray(void* aligned_pointer,
+                                               FreePtr free_ptr,
+                                               void* opaque_ptr,
+                                               ArrayDeleter deleter);
+
+  FreePtr free_ = nullptr;
+  void* opaque_ptr_ = nullptr;
+};
+
+// Unique pointer to T with custom aligned deleter. This can be a single
+// element U or an array of element if T is a U[]. The custom aligned deleter
+// will call the destructor on U or each element of a U[] in the array case.
+template <typename T>
+using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>;
+
+// Aligned memory equivalent of make_unique<T> using the custom allocators
+// alloc/free with the passed `opaque` pointer. This function calls the
+// constructor with the passed Args... and calls the destructor of the object
+// when the AlignedUniquePtr is destroyed.
+// Returns an empty AlignedUniquePtr, without constructing a T, if the
+// allocation fails.
+template <typename T, typename... Args>
+AlignedUniquePtr<T> MakeUniqueAlignedWithAlloc(AllocPtr alloc, FreePtr free,
+                                               void* opaque, Args&&... args) {
+  T* ptr = static_cast<T*>(AllocateAlignedBytes(sizeof(T), alloc, opaque));
+  // Placement new must not be given a null pointer, so bail out before
+  // constructing (same check as MakeUniqueAlignedArrayWithAlloc).
+  if (ptr == nullptr) {
+    return AlignedUniquePtr<T>(nullptr, AlignedDeleter(free, opaque));
+  }
+  return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...),
+                             AlignedDeleter(free, opaque));
+}
+
+// Similar to MakeUniqueAlignedWithAlloc but using the default alloc/free
+// functions. Returns an empty AlignedUniquePtr, without constructing a T, if
+// the allocation fails.
+template <typename T, typename... Args>
+AlignedUniquePtr<T> MakeUniqueAligned(Args&&... args) {
+  T* ptr = static_cast<T*>(AllocateAlignedBytes(
+      sizeof(T), /*alloc_ptr=*/nullptr, /*opaque_ptr=*/nullptr));
+  // Placement new must not be given a null pointer, so bail out before
+  // constructing.
+  if (ptr == nullptr) {
+    return AlignedUniquePtr<T>(nullptr, AlignedDeleter());
+  }
+  return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...),
+                             AlignedDeleter());
+}
+
+// Helpers for array allocators (avoids overflow)
+namespace detail {
+
+// Returns x such that 1u << x == n (if n is a power of two). Computed by
+// counting how many times n can be halved before reaching 1; yields 0 for
+// n <= 1.
+static inline constexpr size_t ShiftCount(size_t n) {
+  return (n > 1) ? ShiftCount(n >> 1) + 1 : 0;
+}
+
+// Allocates aligned, uninitialized storage for `items` objects of type T via
+// AllocateAlignedBytes. Returns nullptr if items * sizeof(T) does not fit in
+// size_t.
+template <typename T>
+T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) {
+  constexpr size_t kItemSize = sizeof(T);
+  constexpr bool kIsPow2 = (kItemSize & (kItemSize - 1)) == 0;
+  constexpr size_t kShift = ShiftCount(kItemSize);
+  static_assert(!kIsPow2 || (1ull << kShift) == kItemSize,
+                "ShiftCount is incorrect");
+
+  // Compute the byte count, then undo the computation: if that does not give
+  // back `items`, the multiplication (or shift) overflowed.
+  const size_t bytes = kIsPow2 ? (items << kShift) : (items * kItemSize);
+  const size_t roundtrip = kIsPow2 ? (bytes >> kShift) : (bytes / kItemSize);
+  const bool overflowed = (roundtrip != items);
+  return overflowed ? nullptr
+                    : static_cast<T*>(
+                          AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr));
+}
+
+}  // namespace detail
+
+// Aligned memory equivalent of make_unique<T[]> for array types using the
+// custom allocators alloc/free. This function calls the constructor with the
+// passed Args... on every created item. The destructor of each element will be
+// called when the AlignedUniquePtr is destroyed.
+// Returns an empty AlignedUniquePtr if the allocation fails or the total size
+// overflows.
+template <typename T, typename... Args>
+AlignedUniquePtr<T[]> MakeUniqueAlignedArrayWithAlloc(
+    size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) {
+  T* ptr = detail::AllocateAlignedItems<T>(items, alloc, opaque);
+  if (ptr != nullptr) {
+    for (size_t i = 0; i < items; i++) {
+      // The same arguments are reused for every element, so they are passed as
+      // lvalues. Forwarding an rvalue here would let the first element's
+      // constructor move from it, leaving later elements with a moved-from
+      // value.
+      new (ptr + i) T(args...);
+    }
+  }
+  return AlignedUniquePtr<T[]>(ptr, AlignedDeleter(free, opaque));
+}
+
+// Similar to MakeUniqueAlignedArrayWithAlloc but using the default alloc/free
+// functions (null alloc, free and opaque pointers are passed on).
+template <typename T, typename... Args>
+AlignedUniquePtr<T[]> MakeUniqueAlignedArray(size_t items, Args&&... args) {
+  return MakeUniqueAlignedArrayWithAlloc<T, Args...>(
+      items, nullptr, nullptr, nullptr, std::forward<Args>(args)...);
+}
+
+// Deleter for std::unique_ptr that releases aligned memory without running any
+// destructor, i.e. the aligned counterpart of using free() as a deleter.
+class AlignedFreer {
+ public:
+  // No-op FreePtr. Pass its address to the constructor to skip deleting
+  // externally-owned memory.
+  static void DoNothing(void* /*opaque*/, void* /*aligned_pointer*/) {}
+
+  // Uses the default free().
+  AlignedFreer() {}
+  // Frees via `free_ptr`, which is called with `opaque_ptr`.
+  AlignedFreer(FreePtr free_ptr, void* opaque_ptr)
+      : free_(free_ptr), opaque_ptr_(opaque_ptr) {}
+
+  // Releases the allocation behind `aligned_pointer`; no destructor is run.
+  template <typename T>
+  void operator()(T* aligned_pointer) const {
+    // TODO(deymo): assert that we are using a POD type T.
+    FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_);
+  }
+
+ private:
+  FreePtr free_ = nullptr;
+  void* opaque_ptr_ = nullptr;
+};
+
+// Unique pointer to single POD, or (if T is U[]) an array of POD. For non POD
+// data use AlignedUniquePtr.
+template <typename T>
+using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>;
+
+// Allocates an aligned, uninitialized array of `items` POD values through the
+// given allocator and wraps it in a unique_ptr. The array is freed (without
+// running destructors) when the unique_ptr is destroyed. The returned pointer
+// is empty if the allocation fails.
+template <typename T>
+AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items, AllocPtr alloc,
+                                          FreePtr free, void* opaque) {
+  T* const storage = detail::AllocateAlignedItems<T>(items, alloc, opaque);
+  return AlignedFreeUniquePtr<T[]>(storage, AlignedFreer(free, opaque));
+}
+
+// Same as previous AllocateAligned(), using default allocate/free functions:
+// null alloc, free and opaque pointers are passed on.
+template <typename T>
+AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) {
+  return AllocateAligned<T>(items, nullptr, nullptr, nullptr);
+}
+
+}  // namespace hwy
+#endif  // HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
@@ -0,0 +1,278 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/aligned_allocator.h"
+
+#include <stddef.h>
+
+#include <array>
+#include <new>
+#include <random>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+namespace {
+
+// Sample object of size N that tracks construction and destruction through an
+// external counter: the explicit constructor increments it and the destructor
+// decrements it (whenever counter_ is set). data_[0] records which constructor
+// ran: 'a' for the default one, 'b' for the explicit one.
+template <size_t N>
+class SampleObject {
+ public:
+  SampleObject() { data_[0] = 'a'; }
+  explicit SampleObject(int* counter) : counter_(counter) {
+    if (counter != nullptr) {
+      ++(*counter);
+    }
+    data_[0] = 'b';
+  }
+
+  ~SampleObject() {
+    if (counter_ != nullptr) {
+      --(*counter_);
+    }
+  }
+
+  static_assert(N > sizeof(int*), "SampleObject size too small.");
+  int* counter_ = nullptr;
+  char data_[N - sizeof(int*)];
+};
+
+// Test allocator that wraps malloc/free and remembers every block it handed
+// out, so tests can check how many allocations are still outstanding.
+class FakeAllocator {
+ public:
+  // Static AllocPtr and FreePtr functions to be used with the aligned
+  // allocator. `opaque` must point to a FakeAllocator; the calls are passed on
+  // to its private non-static members.
+  static void* StaticAlloc(void* opaque, size_t bytes) {
+    FakeAllocator* const self = reinterpret_cast<FakeAllocator*>(opaque);
+    return self->Alloc(bytes);
+  }
+  static void StaticFree(void* opaque, void* memory) {
+    FakeAllocator* const self = reinterpret_cast<FakeAllocator*>(opaque);
+    self->Free(memory);
+  }
+
+  // Returns the number of pending allocations to be freed.
+  size_t PendingAllocs() { return allocs_.size(); }
+
+ private:
+  // Allocates with malloc() and records the returned pointer.
+  void* Alloc(size_t bytes) {
+    void* const block = malloc(bytes);
+    allocs_.insert(block);
+    return block;
+  }
+  // Frees a block previously returned by Alloc(); null is ignored. Expects the
+  // block to be one that is still recorded.
+  void Free(void* memory) {
+    if (memory == nullptr) return;
+    EXPECT_NE(allocs_.end(), allocs_.find(memory));
+    allocs_.erase(memory);
+    free(memory);
+  }
+
+  std::set<void*> allocs_;
+};
+
+}  // namespace
+
+namespace hwy {
+
+class AlignedAllocatorTest : public testing::Test {};
+
+TEST(AlignedAllocatorTest, FreeNullptr) {
+  // Calling free with a nullptr is always ok.
+  FreeAlignedBytes(/*aligned_pointer=*/nullptr, /*free_ptr=*/nullptr,
+                   /*opaque_ptr=*/nullptr);
+}
+
+TEST(AlignedAllocatorTest, Log2) {
+  EXPECT_EQ(0u, detail::ShiftCount(1));
+  EXPECT_EQ(1u, detail::ShiftCount(2));
+  EXPECT_EQ(3u, detail::ShiftCount(8));
+}
+
+// Allocator returns null when it detects overflow of items * sizeof(T).
+// Every item count below is chosen so that items * sizeof(T) exceeds the range
+// of size_t, for both power-of-two element sizes (shift path) and
+// non-power-of-two element sizes (multiply path).
+TEST(AlignedAllocatorTest, Overflow) {
+  constexpr size_t max = ~size_t(0);
+  // Only the most significant bit set.
+  constexpr size_t msb = (max >> 1) + 1;
+  // Element types whose sizes (5 and 10 bytes) are not powers of two.
+  using Size5 = std::array<uint8_t, 5>;
+  using Size10 = std::array<uint8_t, 10>;
+  EXPECT_EQ(nullptr,
+            detail::AllocateAlignedItems<uint32_t>(max / 2, nullptr, nullptr));
+  EXPECT_EQ(nullptr,
+            detail::AllocateAlignedItems<uint32_t>(max / 3, nullptr, nullptr));
+  EXPECT_EQ(nullptr,
+            detail::AllocateAlignedItems<Size5>(max / 4, nullptr, nullptr));
+  EXPECT_EQ(nullptr,
+            detail::AllocateAlignedItems<uint16_t>(msb, nullptr, nullptr));
+  EXPECT_EQ(nullptr,
+            detail::AllocateAlignedItems<double>(msb + 1, nullptr, nullptr));
+  EXPECT_EQ(nullptr,
+            detail::AllocateAlignedItems<Size10>(msb / 4, nullptr, nullptr));
+}
+
+TEST(AlignedAllocatorTest, AllocDefaultPointers) {
+  const size_t kSize = 7777;
+  void* ptr = AllocateAlignedBytes(kSize, /*alloc_ptr=*/nullptr,
+                                   /*opaque_ptr=*/nullptr);
+  ASSERT_NE(nullptr, ptr);
+  // Make sure the pointer is actually aligned.
+  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % HWY_ALIGNMENT);
+  char* p = static_cast<char*>(ptr);
+  size_t ret = 0;
+  for (size_t i = 0; i < kSize; i++) {
+    // Performs a computation using p[] to prevent it being optimized away.
+    p[i] = static_cast<char>(i & 0x7F);
+    if (i) ret += static_cast<size_t>(p[i] * p[i - 1]);
+  }
+  EXPECT_NE(0U, ret);
+  FreeAlignedBytes(ptr, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr);
+}
+
+TEST(AlignedAllocatorTest, EmptyAlignedUniquePtr) {
+  AlignedUniquePtr<SampleObject<32>> ptr(nullptr, AlignedDeleter());
+  AlignedUniquePtr<SampleObject<32>[]> arr(nullptr, AlignedDeleter());
+}
+
+TEST(AlignedAllocatorTest, EmptyAlignedFreeUniquePtr) {
+  AlignedFreeUniquePtr<SampleObject<32>> ptr(nullptr, AlignedFreer());
+  AlignedFreeUniquePtr<SampleObject<32>[]> arr(nullptr, AlignedFreer());
+}
+
+TEST(AlignedAllocatorTest, CustomAlloc) {
+  FakeAllocator fake_alloc;
+
+  const size_t kSize = 7777;
+  void* ptr =
+      AllocateAlignedBytes(kSize, &FakeAllocator::StaticAlloc, &fake_alloc);
+  ASSERT_NE(nullptr, ptr);
+  // We should have only requested one alloc from the allocator.
+  EXPECT_EQ(1U, fake_alloc.PendingAllocs());
+  // Make sure the pointer is actually aligned.
+  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % HWY_ALIGNMENT);
+  FreeAlignedBytes(ptr, &FakeAllocator::StaticFree, &fake_alloc);
+  EXPECT_EQ(0U, fake_alloc.PendingAllocs());
+}
+
+TEST(AlignedAllocatorTest, MakeUniqueAlignedDefaultConstructor) {
+  {
+    auto ptr = MakeUniqueAligned<SampleObject<24>>();
+    // Default constructor sets the data_[0] to 'a'.
+    EXPECT_EQ('a', ptr->data_[0]);
+    EXPECT_EQ(nullptr, ptr->counter_);
+  }
+}
+
+TEST(AlignedAllocatorTest, MakeUniqueAligned) {
+  int counter = 0;
+  {
+    // Creates the object, initializes it with the explicit constructor and
+    // returns an unique_ptr to it.
+    auto ptr = MakeUniqueAligned<SampleObject<24>>(&counter);
+    EXPECT_EQ(1, counter);
+    // Custom constructor sets the data_[0] to 'b'.
+    EXPECT_EQ('b', ptr->data_[0]);
+  }
+  EXPECT_EQ(0, counter);
+}
+
+TEST(AlignedAllocatorTest, MakeUniqueAlignedArray) {
+  int counter = 0;
+  {
+    // Creates the array of objects and initializes them with the explicit
+    // constructor.
+    auto arr = MakeUniqueAlignedArray<SampleObject<24>>(7, &counter);
+    EXPECT_EQ(7, counter);
+    for (size_t i = 0; i < 7; i++) {
+      // Custom constructor sets the data_[0] to 'b'.
+      EXPECT_EQ('b', arr[i].data_[0]) << "Where i = " << i;
+    }
+  }
+  EXPECT_EQ(0, counter);
+}
+
+TEST(AlignedAllocatorTest, AllocSingleInt) {
+  auto ptr = AllocateAligned<uint32_t>(1);
+  ASSERT_NE(nullptr, ptr.get());
+  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % HWY_ALIGNMENT);
+  // Force delete of the unique_ptr now to check that it doesn't crash.
+  ptr.reset(nullptr);
+  EXPECT_EQ(nullptr, ptr.get());
+}
+
+TEST(AlignedAllocatorTest, AllocMultipleInt) {
+  const size_t kSize = 7777;
+  auto ptr = AllocateAligned<uint32_t>(kSize);
+  ASSERT_NE(nullptr, ptr.get());
+  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % HWY_ALIGNMENT);
+  // ptr[i] is actually (*ptr.get())[i] which will use the operator[] of the
+  // underlying type chosen by AllocateAligned() for the std::unique_ptr.
+  EXPECT_EQ(&(ptr[0]) + 1, &(ptr[1]));
+
+  size_t ret = 0;
+  for (size_t i = 0; i < kSize; i++) {
+    // Performs a computation using ptr[] to prevent it being optimized away.
+    ptr[i] = static_cast<uint32_t>(i);
+    if (i) ret += ptr[i] * ptr[i - 1];
+  }
+  EXPECT_NE(0U, ret);
+}
+
+// Checks that AllocateAligned() memory is released without running element
+// destructors.
+TEST(AlignedAllocatorTest, AllocateAlignedObjectWithoutDestructor) {
+  int counter = 0;
+  {
+    // This doesn't call the constructor.
+    auto obj = AllocateAligned<SampleObject<24>>(1);
+    // Set the counter by hand; a destructor call would now decrement it.
+    obj[0].counter_ = &counter;
+  }
+  // Destroying the unique_ptr shouldn't have called the destructor of the
+  // SampleObject<24>.
+  EXPECT_EQ(0, counter);
+}
+
+TEST(AlignedAllocatorTest, MakeUniqueAlignedArrayWithCustomAlloc) {
+  FakeAllocator fake_alloc;
+  int counter = 0;
+  {
+    // Creates the array of objects and initializes them with the explicit
+    // constructor.
+    auto arr = MakeUniqueAlignedArrayWithAlloc<SampleObject<24>>(
+        7, FakeAllocator::StaticAlloc, FakeAllocator::StaticFree, &fake_alloc,
+        &counter);
+    ASSERT_NE(nullptr, arr.get());
+    // An array should still only call a single allocation.
+    EXPECT_EQ(1u, fake_alloc.PendingAllocs());
+    EXPECT_EQ(7, counter);
+    for (size_t i = 0; i < 7; i++) {
+      // Custom constructor sets the data_[0] to 'b'.
+      EXPECT_EQ('b', arr[i].data_[0]) << "Where i = " << i;
+    }
+  }
+  EXPECT_EQ(0, counter);
+  EXPECT_EQ(0u, fake_alloc.PendingAllocs());
+}
+
+// Checks that both unique_ptr aliases can be default-initialized, as required
+// by std::vector::resize.
+TEST(AlignedAllocatorTest, DefaultInit) {
+  // The test is whether this compiles. Default-init is useful for output params
+  // and per-thread storage.
+  std::vector<AlignedUniquePtr<int[]>> ptrs;
+  std::vector<AlignedFreeUniquePtr<double[]>> free_ptrs;
+  ptrs.resize(128);
+  free_ptrs.resize(128);
+  // The following is to prevent elision of the pointers.
+  std::mt19937 rng(129);  // Emscripten lacks random_device.
+  std::uniform_int_distribution<size_t> dist(0, 127);
+  ptrs[dist(rng)] = MakeUniqueAlignedArray<int>(123);
+  free_ptrs[dist(rng)] = AllocateAligned<double>(456);
+  // "Use" pointer without resorting to printf. 0 == 0. Can't shift by 64.
+  // Shifting twice by (kBits - 1) leaves zero for any address, so the
+  // comparison below holds for every input.
+  const auto addr1 = reinterpret_cast<uintptr_t>(ptrs[dist(rng)].get());
+  const auto addr2 = reinterpret_cast<uintptr_t>(free_ptrs[dist(rng)].get());
+  constexpr size_t kBits = sizeof(uintptr_t) * 8;
+  EXPECT_EQ((addr1 >> (kBits - 1)) >> (kBits - 1),
+            (addr2 >> (kBits - 1)) >> (kBits - 1));
+}
+
+}  // namespace hwy
@@ -0,0 +1,946 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_BASE_H_
+#define HIGHWAY_HWY_BASE_H_
+
+// For SIMD module implementations and their callers, target-independent.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "hwy/detect_compiler_arch.h"
+#include "hwy/highway_export.h"
+
+#if HWY_COMPILER_MSVC
+#include <string.h>  // memcpy
+#endif
+#if HWY_ARCH_X86
+#include <atomic>
+#endif
+
+//------------------------------------------------------------------------------
+// Compiler-specific definitions
+
+#define HWY_STR_IMPL(macro) #macro
+#define HWY_STR(macro) HWY_STR_IMPL(macro)
+
+#if HWY_COMPILER_MSVC
+
+#include <intrin.h>
+
+#define HWY_RESTRICT __restrict
+#define HWY_INLINE __forceinline
+#define HWY_NOINLINE __declspec(noinline)
+#define HWY_FLATTEN
+#define HWY_NORETURN __declspec(noreturn)
+#define HWY_LIKELY(expr) (expr)
+#define HWY_UNLIKELY(expr) (expr)
+#define HWY_PRAGMA(tokens) __pragma(tokens)
+#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
+#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
+#define HWY_MAYBE_UNUSED
+#define HWY_HAS_ASSUME_ALIGNED 0
+#if (_MSC_VER >= 1700)
+#define HWY_MUST_USE_RESULT _Check_return_
+#else
+#define HWY_MUST_USE_RESULT
+#endif
+
+#else
+
+#define HWY_RESTRICT __restrict__
+// Forcing inlining when optimization is disabled creates very inefficient
+// code that can cause compiler timeouts, hence only do so if __OPTIMIZE__.
+#ifdef __OPTIMIZE__
+#define HWY_INLINE inline __attribute__((always_inline))
+#else
+#define HWY_INLINE inline
+#endif
+#define HWY_NOINLINE __attribute__((noinline))
+#define HWY_FLATTEN __attribute__((flatten))
+#define HWY_NORETURN __attribute__((noreturn))
+#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
+#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
+#define HWY_PRAGMA(tokens) _Pragma(#tokens)
+#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
+#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
+// Encountered "attribute list cannot appear here" when using the C++17
+// [[maybe_unused]], so only use the old style attribute for now.
+#define HWY_MAYBE_UNUSED __attribute__((unused))
+#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))
+
+#endif  // !HWY_COMPILER_MSVC
+
+//------------------------------------------------------------------------------
+// Builtin/attributes
+
+// Enables error-checking of format strings.
+#if HWY_HAS_ATTRIBUTE(__format__)
+#define HWY_FORMAT(idx_fmt, idx_arg) \
+  __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
+#else
+#define HWY_FORMAT(idx_fmt, idx_arg)
+#endif
+
+// Returns a void* pointer which the compiler then assumes is N-byte aligned.
+// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
+//
+// The assignment semantics are required by GCC/Clang. ICC provides an in-place
+// __assume_aligned, whereas MSVC's __assume appears unsuitable.
+#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
+#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
+#else
+#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
+#endif
+
+// Clang and GCC require attributes on each function into which SIMD intrinsics
+// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
+// automatic annotation via pragmas.
+#if HWY_COMPILER_CLANG
+#define HWY_PUSH_ATTRIBUTES(targets_str)                                \
+  HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
+                                  apply_to = function))
+#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
+#elif HWY_COMPILER_GCC
+#define HWY_PUSH_ATTRIBUTES(targets_str) \
+  HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
+#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
+#else
+#define HWY_PUSH_ATTRIBUTES(targets_str)
+#define HWY_POP_ATTRIBUTES
+#endif
+
+//------------------------------------------------------------------------------
+// Macros
+
+#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED
+
+#define HWY_CONCAT_IMPL(a, b) a##b
+#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)
+
+#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
+#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
+
+#if HWY_COMPILER_GCC_ACTUAL
+// nielskm: GCC does not support '#pragma GCC unroll' without the factor.
+#define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
+#define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
+#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
+#define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
+#define HWY_DEFAULT_UNROLL HWY_UNROLL()
+#else
+#define HWY_UNROLL(factor)
+#define HWY_DEFAULT_UNROLL
+#endif
+
+
+// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
+// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
+// does, without generating code.
+#if HWY_ARCH_X86
+#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
+#else
+// TODO(janwas): investigate alternatives. On ARM, the above generates barriers.
+#define HWY_FENCE
+#endif
+
+// 4 instances of a given literal value, useful as input to LoadDup128.
+#define HWY_REP4(literal) literal, literal, literal, literal
+
+#define HWY_ABORT(format, ...) \
+  ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)
+
+// Always enabled.
+#define HWY_ASSERT(condition)             \
+  do {                                    \
+    if (!(condition)) {                   \
+      HWY_ABORT("Assert %s", #condition); \
+    }                                     \
+  } while (0)
+
+#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
+#define HWY_IS_MSAN 1
+#else
+#define HWY_IS_MSAN 0
+#endif
+
+#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
+#define HWY_IS_ASAN 1
+#else
+#define HWY_IS_ASAN 0
+#endif
+
+#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
+#define HWY_IS_TSAN 1
+#else
+#define HWY_IS_TSAN 0
+#endif
+
+// MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
+// You can disable MSAN by adding this attribute to the function that fails.
+#if HWY_IS_MSAN
+#define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
+#else
+#define HWY_ATTR_NO_MSAN
+#endif
+
+// For enabling HWY_DASSERT and shortening tests in slower debug builds
+#if !defined(HWY_IS_DEBUG_BUILD)
+// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
+// MSVC defines NDEBUG (if not, could instead check _DEBUG).
+#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
+    HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
+#define HWY_IS_DEBUG_BUILD 1
+#else
+#define HWY_IS_DEBUG_BUILD 0
+#endif
+#endif  // HWY_IS_DEBUG_BUILD
+
+#if HWY_IS_DEBUG_BUILD
+#define HWY_DASSERT(condition) HWY_ASSERT(condition)
+#else
+#define HWY_DASSERT(condition) \
+  do {                         \
+  } while (0)
+#endif
+
+namespace hwy {
+
+//------------------------------------------------------------------------------
+// kMaxVectorSize (undocumented, pending removal)
+
+#if HWY_ARCH_X86
+static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64;  // AVX-512
+#elif HWY_ARCH_RVV && defined(__riscv_vector)
+// Not actually an upper bound on the size.
+static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
+#else
+static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
+#endif
+
+//------------------------------------------------------------------------------
+// Alignment
+
+// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
+// should be allocated dynamically via aligned_allocator.h because Lanes() may
+// exceed the stack size.
+#if HWY_ARCH_X86
+#define HWY_ALIGN_MAX alignas(64)
+#elif HWY_ARCH_RVV && defined(__riscv_vector)
+#define HWY_ALIGN_MAX alignas(8)  // only elements need be aligned
+#else
+#define HWY_ALIGN_MAX alignas(16)
+#endif
+
+//------------------------------------------------------------------------------
+// Lane types
+
+// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
+// by concatenating base type and bits.
+
+#pragma pack(push, 1)
+
+// ACLE (https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html): __fp16 is
+// always supported on aarch64, for v7 only if -mfp16-format is given.
+#if ((HWY_ARCH_ARM_A64 || (__ARM_FP & 2)) && HWY_COMPILER_GCC)
+using float16_t = __fp16;
+// C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
+// Required for Clang RVV if the float16 extension is used.
+#elif HWY_ARCH_RVV && HWY_COMPILER_CLANG && defined(__riscv_zvfh)
+using float16_t = _Float16;
+// Otherwise emulate with a struct that only stores the 16 raw bits.
+#else
+struct float16_t {
+  uint16_t bits;  // raw representation; no arithmetic is provided here
+};
+#endif
+
+struct bfloat16_t {
+  uint16_t bits;  // upper 16 bits of the float representation (BF16FromF32)
+};
+
+#pragma pack(pop)
+
+using float32_t = float;
+using float64_t = double;
+
+#pragma pack(push, 1)
+
+// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
+// https://reviews.llvm.org/D86310
+struct alignas(16) uint128_t {
+  uint64_t lo;  // little-endian layout: lower 64 bits are stored first
+  uint64_t hi;  // upper 64 bits; compared first by operator<
+};
+
+// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
+// field is to be compared (Lt128Upper instead of Lt128).
+struct alignas(16) K64V64 {
+  uint64_t value;  // little-endian layout: value occupies the lower half
+  uint64_t key;    // the only field consulted by operator< and operator>
+};
+
+// 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier
+// than when considering both to be a 64-bit key.
+struct alignas(8) K32V32 {
+  uint32_t value;  // little-endian layout: value occupies the lower half
+  uint32_t key;    // the only field consulted by operator< and operator>
+};
+
+#pragma pack(pop)
+
+// Orders by the upper half first; the lower half only breaks ties.
+static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
+                                              const uint128_t& b) {
+  if (a.hi != b.hi) {
+    return a.hi < b.hi;
+  }
+  return a.lo < b.lo;
+}
+// Required for std::greater. Defined via operator< with swapped arguments.
+static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
+                                              const uint128_t& b) {
+  return b < a;
+}
+// Equal only if both halves match.
+static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
+                                               const uint128_t& b) {
+  return !(a.lo != b.lo || a.hi != b.hi);
+}
+
+// Compares only the key; the value field does not participate.
+static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
+                                              const K64V64& b) {
+  return b.key > a.key;
+}
+// Required for std::greater. Also compares only the key.
+static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
+                                              const K64V64& b) {
+  return b.key < a.key;
+}
+
+// Compares only the key; the value field does not participate.
+static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
+                                              const K32V32& b) {
+  return b.key > a.key;
+}
+// Required for std::greater. Also compares only the key.
+static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
+                                              const K32V32& b) {
+  return b.key < a.key;
+}
+
+//------------------------------------------------------------------------------
+// Controlling overload resolution (SFINAE)
+
+// Primary template: no nested `type`, so EnableIf<false> fails to substitute.
+template <bool kEnable>
+struct EnableIfT {};
+// Only the `true` specialization provides `type` (always void).
+template <>
+struct EnableIfT<true> {
+  typedef void type;
+};
+
+// Names void if kEnable is true; otherwise the alias is ill-formed.
+template <bool kEnable>
+using EnableIf = typename EnableIfT<kEnable>::type;
+
+// Primary template: chosen when the two types differ, hence value = 0.
+template <typename A, typename B>
+struct IsSameT {
+  enum { value = 0 };
+};
+
+// Partial specialization for identical types, hence value = 1.
+template <typename A>
+struct IsSameT<A, A> {
+  enum { value = 1 };
+};
+
+// Returns whether T and U are exactly the same type (cv-qualifiers included).
+template <typename T, typename U>
+HWY_API constexpr bool IsSame() {
+  return IsSameT<T, U>::value != 0;
+}
+
+// Insert into template/function arguments to enable this overload only for
+// vectors of AT MOST this many bits.
+//
+// Note that enabling for exactly 128 bits is unnecessary because a function can
+// simply be overloaded with Vec128<T> and/or Full128<T> tag. Enabling for other
+// sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T), 0>.
+#define HWY_IF_LE128(T, N) hwy::EnableIf<N * sizeof(T) <= 16>* = nullptr
+#define HWY_IF_LE64(T, N) hwy::EnableIf<N * sizeof(T) <= 8>* = nullptr
+#define HWY_IF_LE32(T, N) hwy::EnableIf<N * sizeof(T) <= 4>* = nullptr
+#define HWY_IF_GE32(T, N) hwy::EnableIf<N * sizeof(T) >= 4>* = nullptr
+#define HWY_IF_GE64(T, N) hwy::EnableIf<N * sizeof(T) >= 8>* = nullptr
+#define HWY_IF_GE128(T, N) hwy::EnableIf<N * sizeof(T) >= 16>* = nullptr
+#define HWY_IF_GT128(T, N) hwy::EnableIf<(N * sizeof(T) > 16)>* = nullptr
+
+// Insert into template/function arguments to enable this overload only for
+// lane types T that are unsigned / signed but not IsFloat / IsFloat / not
+// IsFloat, respectively. All traits are qualified with hwy:: so the macros
+// also expand correctly outside of namespace hwy.
+#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
+#define HWY_IF_SIGNED(T) \
+  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>()>* = nullptr
+#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
+#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
+
+#define HWY_IF_LANE_SIZE(T, bytes) \
+  hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
+#define HWY_IF_NOT_LANE_SIZE(T, bytes) \
+  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
+#define HWY_IF_LANE_SIZE_LT(T, bytes) \
+  hwy::EnableIf<sizeof(T) < (bytes)>* = nullptr
+
+#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
+  hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
+
+// Empty tag type; distinct values of kValue yield distinct types, which
+// allows overload selection on a compile-time integer.
+template <size_t kValue>
+struct SizeTag {};
+
+// Primary template: the type is not const-qualified, so pass it through.
+template <class Type>
+struct RemoveConstT {
+  typedef Type type;
+};
+// Specialization that strips a top-level const.
+template <class Type>
+struct RemoveConstT<const Type> {
+  typedef Type type;
+};
+
+// Type without its top-level const qualifier (if any).
+template <class Type>
+using RemoveConst = typename RemoveConstT<Type>::type;
+
+//------------------------------------------------------------------------------
+// Type relations
+
+namespace detail {
+
+template <typename T>
+struct Relations;  // primary template is only declared; see specializations
+template <>
+struct Relations<uint8_t> {  // no Narrow and no Float member
+  using Unsigned = uint8_t;
+  using Signed = int8_t;
+  using Wide = uint16_t;
+  enum { is_signed = 0, is_float = 0 };
+};
+template <>
+struct Relations<int8_t> {  // no Narrow and no Float member
+  using Unsigned = uint8_t;
+  using Signed = int8_t;
+  using Wide = int16_t;
+  enum { is_signed = 1, is_float = 0 };
+};
+template <>
+struct Relations<uint16_t> {  // no Float member
+  using Unsigned = uint16_t;
+  using Signed = int16_t;
+  using Wide = uint32_t;
+  using Narrow = uint8_t;
+  enum { is_signed = 0, is_float = 0 };
+};
+template <>
+struct Relations<int16_t> {  // no Float member
+  using Unsigned = uint16_t;
+  using Signed = int16_t;
+  using Wide = int32_t;
+  using Narrow = int8_t;
+  enum { is_signed = 1, is_float = 0 };
+};
+template <>
+struct Relations<uint32_t> {
+  using Unsigned = uint32_t;
+  using Signed = int32_t;
+  using Float = float;
+  using Wide = uint64_t;
+  using Narrow = uint16_t;
+  enum { is_signed = 0, is_float = 0 };
+};
+template <>
+struct Relations<int32_t> {
+  using Unsigned = uint32_t;
+  using Signed = int32_t;
+  using Float = float;
+  using Wide = int64_t;
+  using Narrow = int16_t;
+  enum { is_signed = 1, is_float = 0 };
+};
+template <>
+struct Relations<uint64_t> {
+  using Unsigned = uint64_t;
+  using Signed = int64_t;
+  using Float = double;
+  using Wide = uint128_t;  // the struct declared in this header, not __int128
+  using Narrow = uint32_t;
+  enum { is_signed = 0, is_float = 0 };
+};
+template <>
+struct Relations<int64_t> {  // no Wide member
+  using Unsigned = uint64_t;
+  using Signed = int64_t;
+  using Float = double;
+  using Narrow = int32_t;
+  enum { is_signed = 1, is_float = 0 };
+};
+template <>
+struct Relations<uint128_t> {  // only Unsigned and Narrow are provided
+  using Unsigned = uint128_t;
+  using Narrow = uint64_t;
+  enum { is_signed = 0, is_float = 0 };
+};
+template <>
+struct Relations<float16_t> {  // no Narrow member
+  using Unsigned = uint16_t;
+  using Signed = int16_t;
+  using Float = float16_t;
+  using Wide = float;
+  enum { is_signed = 1, is_float = 1 };
+};
+template <>
+struct Relations<bfloat16_t> {  // no Float and no Narrow member
+  using Unsigned = uint16_t;
+  using Signed = int16_t;
+  using Wide = float;
+  enum { is_signed = 1, is_float = 1 };
+};
+template <>
+struct Relations<float> {
+  using Unsigned = uint32_t;
+  using Signed = int32_t;
+  using Float = float;
+  using Wide = double;
+  using Narrow = float16_t;
+  enum { is_signed = 1, is_float = 1 };
+};
+template <>
+struct Relations<double> {  // no Wide member
+  using Unsigned = uint64_t;
+  using Signed = int64_t;
+  using Float = double;
+  using Narrow = float;
+  enum { is_signed = 1, is_float = 1 };
+};
+
+template <size_t N>
+struct TypeFromSize;  // primary template is only declared; N is in bytes
+template <>
+struct TypeFromSize<1> {  // no Float member
+  using Unsigned = uint8_t;
+  using Signed = int8_t;
+};
+template <>
+struct TypeFromSize<2> {  // no Float member
+  using Unsigned = uint16_t;
+  using Signed = int16_t;
+};
+template <>
+struct TypeFromSize<4> {
+  using Unsigned = uint32_t;
+  using Signed = int32_t;
+  using Float = float;
+};
+template <>
+struct TypeFromSize<8> {
+  using Unsigned = uint64_t;
+  using Signed = int64_t;
+  using Float = double;
+};
+template <>
+struct TypeFromSize<16> {  // only Unsigned is provided
+  using Unsigned = uint128_t;
+};
+
+}  // namespace detail
+
+// Same-size type of a different category. Each alias is only usable if
+// detail::Relations<Lane> defines the corresponding member.
+template <typename Lane>
+using MakeUnsigned = typename detail::Relations<Lane>::Unsigned;
+template <typename Lane>
+using MakeSigned = typename detail::Relations<Lane>::Signed;
+template <typename Lane>
+using MakeFloat = typename detail::Relations<Lane>::Float;
+
+// Same-category type of a different size, as listed in detail::Relations.
+template <typename Lane>
+using MakeWide = typename detail::Relations<Lane>::Wide;
+template <typename Lane>
+using MakeNarrow = typename detail::Relations<Lane>::Narrow;
+
+// Type of the requested category whose size is kBytes [bytes].
+template <size_t kBytes>
+using UnsignedFromSize = typename detail::TypeFromSize<kBytes>::Unsigned;
+template <size_t kBytes>
+using SignedFromSize = typename detail::TypeFromSize<kBytes>::Signed;
+template <size_t kBytes>
+using FloatFromSize = typename detail::TypeFromSize<kBytes>::Float;
+
+// Category tags. Their values (multiples of 0x100) avoid confusion with
+// SizeTag where the parameter is a lane size.
+using UnsignedTag = SizeTag<0>;
+using SignedTag = SizeTag<0x100>;  // integer
+using FloatTag = SizeTag<0x200>;
+
+// Returns UnsignedTag, SignedTag or FloatTag according to the sum of
+// Rel::is_signed and Rel::is_float (0, 1 or 2), shifted into bits 8 and up.
+template <typename T, class Rel = detail::Relations<T>>
+constexpr auto TypeTag()
+    -> hwy::SizeTag<((Rel::is_signed + Rel::is_float) << 8)> {
+  return hwy::SizeTag<((Rel::is_signed + Rel::is_float) << 8)>{};
+}
+
+// For when we only want to distinguish FloatTag from everything else.
+using NonFloatTag = SizeTag<0x400>;
+
+// Returns FloatTag if Rel::is_float is set, otherwise NonFloatTag.
+template <typename T, class Rel = detail::Relations<T>>
+constexpr auto IsFloatTag() -> hwy::SizeTag<(Rel::is_float ? 0x200 : 0x400)> {
+  return hwy::SizeTag<(Rel::is_float ? 0x200 : 0x400)>{};
+}
+
+//------------------------------------------------------------------------------
+// Type traits
+
+// Returns true only for float and double (not float16_t/bfloat16_t).
+template <typename T>
+HWY_API constexpr bool IsFloat() {
+  // Uses an explicit type list because T(1.25) != T(1) does not work for
+  // float16_t, which can only be converted to or from a float, not compared.
+  return IsSame<T, double>() || IsSame<T, float>();
+}
+
+template <typename T>
+HWY_API constexpr bool IsSigned() {
+  return T(0) > T(-1);  // for unsigned T, T(-1) is the largest value => false
+}
+template <>
+constexpr bool IsSigned<float16_t>() {  // specialized instead of T(0) > T(-1)
+  return true;
+}
+template <>
+constexpr bool IsSigned<bfloat16_t>() {  // specialized: struct, see above
+  return true;
+}
+
+// Largest/smallest representable integer values (rejects float and double).
+template <typename T>
+HWY_API constexpr T LimitsMax() {
+  static_assert(!IsFloat<T>(), "Only for integer types");
+  using TU = MakeUnsigned<T>;  // same-size unsigned type
+  return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)  // 0x7F..
+                                      : static_cast<TU>(~0ull));  // all ones
+}
+template <typename T>
+HWY_API constexpr T LimitsMin() {
+  static_assert(!IsFloat<T>(), "Only for integer types");
+  return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);  // signed: -max - 1
+}
+
+// Largest/smallest representable value (integer or float). This naming avoids
+// confusion with numeric_limits<float>::min() (the smallest positive value).
+template <typename T>
+HWY_API constexpr T LowestValue() {
+  return LimitsMin<T>();  // generic case: integer types
+}
+template <>
+constexpr float LowestValue<float>() {
+  return -3.402823466e+38F;  // negated HighestValue<float>()
+}
+template <>
+constexpr double LowestValue<double>() {
+  return -1.7976931348623158e+308;  // negated HighestValue<double>()
+}
+
+template <typename T>
+HWY_API constexpr T HighestValue() {
+  return LimitsMax<T>();  // generic case: integer types
+}
+template <>
+constexpr float HighestValue<float>() {
+  return 3.402823466e+38F;  // same value as FLT_MAX
+}
+template <>
+constexpr double HighestValue<double>() {
+  return 1.7976931348623158e+308;  // same value as DBL_MAX
+}
+
+// Difference between 1.0 and the next representable value (1 for integers).
+template <typename T>
+HWY_API constexpr T Epsilon() {
+  return 1;  // generic case: adjacent integers differ by one
+}
+template <>
+constexpr float Epsilon<float>() {
+  return 1.192092896e-7f;  // 2^-23, same value as FLT_EPSILON
+}
+template <>
+constexpr double Epsilon<double>() {
+  return 2.2204460492503131e-16;  // 2^-52, same value as DBL_EPSILON
+}
+
+// Returns width in bits of the mantissa field in IEEE binary32/64.
+template <typename T>
+constexpr int MantissaBits() {
+  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
+  return 0;  // never reached: the static_assert fails on instantiation
+}
+template <>
+constexpr int MantissaBits<float>() {
+  return 23;
+}
+template <>
+constexpr int MantissaBits<double>() {
+  return 52;
+}
+
+// Returns the (left-shifted by one bit) IEEE binary32/64 representation with
+// the largest possible (biased) exponent field. Used by IsInf.
+template <typename T>
+constexpr MakeSigned<T> MaxExponentTimes2() {
+  return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));  // float: 0xFF000000
+}
+
+// Returns bitmask of the sign bit in IEEE binary32/64.
+template <typename T>
+constexpr MakeUnsigned<T> SignMask() {
+  return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);  // float: 0x80000000
+}
+
+// Returns bitmask of the exponent field in IEEE binary32/64.
+template <typename T>
+constexpr MakeUnsigned<T> ExponentMask() {
+  return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
+}
+
+// Returns bitmask of the mantissa field in IEEE binary32/64.
+template <typename T>
+constexpr MakeUnsigned<T> MantissaMask() {
+  return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;  // float: 0x007FFFFF
+}
+
+// Returns 1 << mantissa_bits as a floating-point number. All integers whose
+// absolute value is less than this can be represented exactly.
+template <typename T>
+constexpr T MantissaEnd() {
+  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
+  return 0;  // never reached: the static_assert fails on instantiation
+}
+template <>
+constexpr float MantissaEnd<float>() {
+  return 8388608.0f;  // 1 << 23
+}
+template <>
+constexpr double MantissaEnd<double>() {
+  // A hexadecimal floating point literal with p52 would require C++17.
+  return 4503599627370496.0;  // 1 << 52
+}
+
+// Returns width in bits of the exponent field in IEEE binary32/64.
+template <typename T>
+constexpr int ExponentBits() {
+  // Exponent := remaining bits after deducting sign and mantissa.
+  return 8 * sizeof(T) - 1 - MantissaBits<T>();  // float: 8, double: 11
+}
+
+// Returns largest value of the biased exponent field in IEEE binary32/64,
+// right-shifted so that the LSB is bit zero. Example: 0xFF for float.
+// This is expressed as a signed integer for more efficient comparison.
+template <typename T>
+constexpr MakeSigned<T> MaxExponentField() {
+  return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;  // double: 0x7FF
+}
+
+//------------------------------------------------------------------------------
+// Helper functions
+
+// Returns a / b rounded up, computed as (a + b - 1) / b. The intermediate sum
+// is not checked for overflow.
+template <typename T1, typename T2>
+inline constexpr T1 DivCeil(T1 a, T2 b) {
+  return (a + b - 1) / b;
+}
+
+// Returns the smallest multiple of `align` that is >= `what`. Works for any
+// `align`; if a power of two, compiler emits ADD+AND.
+inline constexpr size_t RoundUpTo(size_t what, size_t align) {
+  return align * DivCeil(what, align);
+}
+
+// Returns the number of zero bits below the least-significant 1-bit of x.
+// Undefined results for x == 0.
+HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
+#if !HWY_COMPILER_MSVC
+  return static_cast<size_t>(__builtin_ctz(x));
+#else   // HWY_COMPILER_MSVC
+  unsigned long bit_pos;  // NOLINT
+  _BitScanForward(&bit_pos, x);
+  return bit_pos;
+#endif  // !HWY_COMPILER_MSVC
+}
+
+// Returns the number of zero bits below the least-significant 1-bit of the
+// 64-bit x. Undefined results for x == 0.
+HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
+#if !HWY_COMPILER_MSVC
+  return static_cast<size_t>(__builtin_ctzll(x));
+#elif HWY_ARCH_X86_64
+  unsigned long bit_pos;  // NOLINT
+  _BitScanForward64(&bit_pos, x);
+  return bit_pos;
+#else
+  // _BitScanForward64 not available: scan the 32-bit halves, lower one first.
+  unsigned long bit_pos;  // NOLINT
+  const uint32_t lower = static_cast<uint32_t>(x & 0xFFFFFFFF);
+  if (lower != 0) {
+    _BitScanForward(&bit_pos, lower);
+    return bit_pos;
+  }
+  const uint32_t upper = static_cast<uint32_t>(x >> 32u);
+  _BitScanForward(&bit_pos, upper);
+  return 32 + bit_pos;
+#endif
+}
+
+// Returns the number of zero bits above the most-significant 1-bit of x.
+// Undefined results for x == 0.
+HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
+#if !HWY_COMPILER_MSVC
+  return static_cast<size_t>(__builtin_clz(x));
+#else   // HWY_COMPILER_MSVC
+  // _BitScanReverse yields the index of the highest set bit.
+  unsigned long msb_pos;  // NOLINT
+  _BitScanReverse(&msb_pos, x);
+  return 31 - msb_pos;
+#endif  // !HWY_COMPILER_MSVC
+}
+
+// Returns the number of zero bits above the most-significant 1-bit of the
+// 64-bit x. Undefined results for x == 0.
+HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
+#if !HWY_COMPILER_MSVC
+  return static_cast<size_t>(__builtin_clzll(x));
+#elif HWY_ARCH_X86_64
+  unsigned long msb_pos;  // NOLINT
+  _BitScanReverse64(&msb_pos, x);
+  return 63 - msb_pos;
+#else
+  // _BitScanReverse64 not available: scan the 32-bit halves, upper one first.
+  unsigned long msb_pos;  // NOLINT
+  const uint32_t upper = static_cast<uint32_t>(x >> 32u);
+  if (upper != 0) {
+    _BitScanReverse(&msb_pos, upper);
+    return 31 - msb_pos;
+  }
+  const uint32_t lower = static_cast<uint32_t>(x & 0xFFFFFFFF);
+  _BitScanReverse(&msb_pos, lower);
+  return 63 - msb_pos;
+#endif
+}
+
+// Returns the number of 1-bits in x.
+HWY_API size_t PopCount(uint64_t x) {
+#if HWY_COMPILER_GCC  // includes clang
+  return static_cast<size_t>(__builtin_popcountll(x));
+  // The popcnt instruction has its own feature flag, but this function is
+  // often called from non-SIMD code, so we don't want to require dynamic
+  // dispatch. Intel first supported it in Nehalem (SSE4.2), but MSVC only
+  // predefines a macro for AVX, so check for that.
+#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
+  return _mm_popcnt_u64(x);
+#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
+  return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
+         _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
+#else
+  // Portable fallback: add up neighboring bit fields of doubling width.
+  constexpr uint64_t kPairs = 0x5555555555555555ULL;
+  constexpr uint64_t kNibbles = 0x3333333333333333ULL;
+  constexpr uint64_t kBytes = 0x0F0F0F0F0F0F0F0FULL;
+  x = x - ((x >> 1) & kPairs);
+  x = ((x >> 2) & kNibbles) + (x & kNibbles);
+  x = ((x >> 4) + x) & kBytes;
+  x += (x >> 8);
+  x += (x >> 16);
+  x += (x >> 32);
+  // The total (at most 64) is in the lowest byte; discard the rest.
+  return static_cast<size_t>(x & 0x7Fu);
+#endif
+}
+
+// HWY_API is intentionally omitted: GCC reported "function not considered for
+// inlining". Previously such errors were caused by underlying type mismatches,
+// but it's not clear what is still mismatched despite all the casts.
+//
+// Returns floor(log2(x)) by recursively halving x until it reaches 1. There is
+// no terminating case for x == 0, so only call with x >= 1.
+template <typename TI>
+/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
+  return (x != TI{1})
+             ? static_cast<size_t>(1 + FloorLog2(static_cast<TI>(x >> 1)))
+             : 0;
+}
+
+// Returns ceil(log2(x)), i.e. 0 for x == 1, else FloorLog2(x - 1) + 1.
+template <typename TI>
+/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
+  return (x != TI{1})
+             ? static_cast<size_t>(1 + FloorLog2(static_cast<TI>(x - 1)))
+             : 0;
+}
+
+#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
+#pragma intrinsic(_umul128)
+#endif
+
+// 64 x 64 = 128 bit multiplication: returns the lower 64 bits of a * b and
+// stores the upper 64 bits to *upper.
+HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
+#if defined(__SIZEOF_INT128__)
+  const __uint128_t wide = static_cast<__uint128_t>(a) * b;
+  *upper = static_cast<uint64_t>(wide >> 64);
+  return static_cast<uint64_t>(wide);  // truncates to the lower 64 bits
+#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
+  return _umul128(a, b, upper);
+#else
+  // Schoolbook multiplication of the four 32-bit halves.
+  constexpr uint64_t kLo32 = 0xFFFFFFFFU;
+  const uint64_t a_lo = a & kLo32;
+  const uint64_t a_hi = a >> 32;
+  const uint64_t b_lo = b & kLo32;
+  const uint64_t b_hi = b >> 32;
+  const uint64_t lo_lo = a_lo * b_lo;
+  const uint64_t hi_lo = a_hi * b_lo;
+  const uint64_t lo_hi = a_lo * b_hi;
+  const uint64_t hi_hi = a_hi * b_hi;
+  // Middle column including the carry out of the lowest 32 bits.
+  const uint64_t mid = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
+  *upper = (hi_lo >> 32) + (mid >> 32) + hi_hi;
+  return (mid << 32) | (lo_lo & kLo32);
+#endif
+}
+
+#if HWY_COMPILER_MSVC
+#pragma intrinsic(memcpy)
+#pragma intrinsic(memset)
+#endif
+
+// Copies kBytes bytes from `from` to `to`. The source/destination must not
+// overlap/alias.
+template <size_t kBytes, typename From, typename To>
+HWY_API void CopyBytes(const From* from, To* to) {
+#if !HWY_COMPILER_MSVC
+  __builtin_memcpy(static_cast<void*>(to), static_cast<const void*>(from),
+                   kBytes);
+#else
+  memcpy(to, from, kBytes);
+#endif
+}
+
+// Same as CopyBytes, but for same-sized objects; avoids a size argument.
+// Fails to compile if From and To differ in size.
+template <typename From, typename To>
+HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
+  static_assert(sizeof(From) == sizeof(To), "");
+  // Both sizes are equal here, so either one may serve as the byte count.
+  CopyBytes<sizeof(To)>(from, to);
+}
+
+// Sets the kBytes bytes starting at `to` to zero.
+template <size_t kBytes, typename To>
+HWY_API void ZeroBytes(To* to) {
+#if !HWY_COMPILER_MSVC
+  __builtin_memset(to, 0, kBytes);
+#else
+  memset(to, 0, kBytes);
+#endif
+}
+
+// Converts bfloat16 to float: the 16 stored bits become the upper half of the
+// 32-bit float representation and the lower half is zero.
+HWY_API float F32FromBF16(bfloat16_t bf) {
+  const uint32_t f32_bits = static_cast<uint32_t>(bf.bits) << 16;
+  float result;
+  CopySameSize(&f32_bits, &result);
+  return result;
+}
+
+// Converts float to bfloat16 by keeping only the upper 16 bits of its 32-bit
+// representation; the lower 16 bits are discarded (truncation, no rounding).
+HWY_API bfloat16_t BF16FromF32(float f) {
+  uint32_t f32_bits;
+  CopySameSize(&f, &f32_bits);
+  bfloat16_t result;
+  result.bits = static_cast<uint16_t>(f32_bits >> 16);
+  return result;
+}
+
+HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)  // printf-style: fmt=3, args=4
+    Abort(const char* file, int line, const char* format, ...);  // see HWY_ABORT
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_BASE_H_
@@ -0,0 +1,178 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <limits>
+
+#include "hwy/base.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "base_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Checks LimitsMin<T>() and LimitsMax<T>() against literal expected values for
+// the 8/16/32/64-bit unsigned and signed integer types.
+HWY_NOINLINE void TestAllLimits() {
+  // Minimum of every unsigned type is zero.
+  HWY_ASSERT_EQ(uint8_t{0}, LimitsMin<uint8_t>());
+  HWY_ASSERT_EQ(uint16_t{0}, LimitsMin<uint16_t>());
+  HWY_ASSERT_EQ(uint32_t{0}, LimitsMin<uint32_t>());
+  HWY_ASSERT_EQ(uint64_t{0}, LimitsMin<uint64_t>());
+
+  // Signed minima; the 32/64-bit values are written as unsigned bit patterns
+  // and cast to the signed type.
+  HWY_ASSERT_EQ(int8_t{-128}, LimitsMin<int8_t>());
+  HWY_ASSERT_EQ(int16_t{-32768}, LimitsMin<int16_t>());
+  HWY_ASSERT_EQ(static_cast<int32_t>(0x80000000u), LimitsMin<int32_t>());
+  HWY_ASSERT_EQ(static_cast<int64_t>(0x8000000000000000ull),
+                LimitsMin<int64_t>());
+
+  // Unsigned maxima: all bits set.
+  HWY_ASSERT_EQ(uint8_t{0xFF}, LimitsMax<uint8_t>());
+  HWY_ASSERT_EQ(uint16_t{0xFFFF}, LimitsMax<uint16_t>());
+  HWY_ASSERT_EQ(uint32_t{0xFFFFFFFFu}, LimitsMax<uint32_t>());
+  HWY_ASSERT_EQ(uint64_t{0xFFFFFFFFFFFFFFFFull}, LimitsMax<uint64_t>());
+
+  // Signed maxima: all bits set except the most significant one.
+  HWY_ASSERT_EQ(int8_t{0x7F}, LimitsMax<int8_t>());
+  HWY_ASSERT_EQ(int16_t{0x7FFF}, LimitsMax<int16_t>());
+  HWY_ASSERT_EQ(int32_t{0x7FFFFFFFu}, LimitsMax<int32_t>());
+  HWY_ASSERT_EQ(int64_t{0x7FFFFFFFFFFFFFFFull}, LimitsMax<int64_t>());
+}
+
+// Functor invoked with a (value-unused) instance of each type T: checks that
+// LowestValue<T>() and HighestValue<T>() agree with
+// std::numeric_limits<T>::lowest() and ::max().
+struct TestLowestHighest {
+  template <class T>
+  HWY_NOINLINE void operator()(T /*unused*/) const {
+    HWY_ASSERT_EQ(std::numeric_limits<T>::lowest(), LowestValue<T>());
+    HWY_ASSERT_EQ(std::numeric_limits<T>::max(), HighestValue<T>());
+  }
+};
+
+// Runs TestLowestHighest via ForAllTypes.
+HWY_NOINLINE void TestAllLowestHighest() { ForAllTypes(TestLowestHighest()); }
+// Compile-time check for types expected to be unsigned integers: neither
+// IsFloat<T>() nor IsSigned<T>() may hold.
+struct TestIsUnsigned {
+  template <class T>
+  HWY_NOINLINE void operator()(T /*unused*/) const {
+    static_assert(!IsFloat<T>(), "Expected !IsFloat");
+    static_assert(!IsSigned<T>(), "Expected !IsSigned");
+  }
+};
+
+// Compile-time check for types expected to be signed integers: IsSigned<T>()
+// must hold and IsFloat<T>() must not.
+struct TestIsSigned {
+  template <class T>
+  HWY_NOINLINE void operator()(T /*unused*/) const {
+    static_assert(!IsFloat<T>(), "Expected !IsFloat");
+    static_assert(IsSigned<T>(), "Expected IsSigned");
+  }
+};
+
+// Compile-time check for floating-point types: both IsFloat<T>() and
+// IsSigned<T>() must hold.
+struct TestIsFloat {
+  template <class T>
+  HWY_NOINLINE void operator()(T /*unused*/) const {
+    static_assert(IsFloat<T>(), "Expected IsFloat");
+    static_assert(IsSigned<T>(), "Floats are also considered signed");
+  }
+};
+
+// Applies the three type-category checks to the unsigned, signed and float
+// type lists, then verifies the sizes produced by the 128-bit type mappings.
+HWY_NOINLINE void TestAllType() {
+  ForUnsignedTypes(TestIsUnsigned());
+  ForSignedTypes(TestIsSigned());
+  ForFloatTypes(TestIsFloat());
+
+  // Only sizes are compared here, not the exact resulting types.
+  static_assert(sizeof(MakeUnsigned<hwy::uint128_t>) == 16, "");
+  static_assert(sizeof(MakeWide<uint64_t>) == 16, "Expected uint128_t");
+  static_assert(sizeof(MakeNarrow<hwy::uint128_t>) == 8, "Expected uint64_t");
+}
+
+// Compile-time check of IsSame: a type equals itself, and the signed and
+// unsigned counterparts of T are distinct (checked in both argument orders).
+struct TestIsSame {
+  template <class T>
+  HWY_NOINLINE void operator()(T /*unused*/) const {
+    static_assert(IsSame<T, T>(), "T == T");
+    static_assert(!IsSame<MakeSigned<T>, MakeUnsigned<T>>(), "S != U");
+    static_assert(!IsSame<MakeUnsigned<T>, MakeSigned<T>>(), "U != S");
+  }
+};
+
+// Runs TestIsSame via ForAllTypes.
+HWY_NOINLINE void TestAllIsSame() { ForAllTypes(TestIsSame()); }
+
+// Checks the four bit-scan helpers against hand-computed results for nonzero
+// inputs, including the extremes (only the top bit set, only the bottom bit
+// set, all bits set).
+HWY_NOINLINE void TestAllBitScan() {
+  // 32-bit: number of zero bits above the most-significant set bit.
+  HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0x80000000u));
+  HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0xFFFFFFFFu));
+  HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40000000u));
+  HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40108210u));
+  HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(2u));
+  HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(3u));
+  HWY_ASSERT_EQ(size_t{31}, Num0BitsAboveMS1Bit_Nonzero32(1u));
+
+  // 64-bit: number of zero bits above the most-significant set bit.
+  HWY_ASSERT_EQ(size_t{0},
+                Num0BitsAboveMS1Bit_Nonzero64(0x8000000000000000ull));
+  HWY_ASSERT_EQ(size_t{0},
+                Num0BitsAboveMS1Bit_Nonzero64(0xFFFFFFFFFFFFFFFFull));
+  HWY_ASSERT_EQ(size_t{1},
+                Num0BitsAboveMS1Bit_Nonzero64(0x4000000000000000ull));
+  HWY_ASSERT_EQ(size_t{1},
+                Num0BitsAboveMS1Bit_Nonzero64(0x4010821004200011ull));
+  HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(2ull));
+  HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(3ull));
+  HWY_ASSERT_EQ(size_t{63}, Num0BitsAboveMS1Bit_Nonzero64(1ull));
+
+  // 32-bit: number of zero bits below the least-significant set bit.
+  HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero32(1u));
+  HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero32(2u));
+  HWY_ASSERT_EQ(size_t{30}, Num0BitsBelowLS1Bit_Nonzero32(0xC0000000u));
+  HWY_ASSERT_EQ(size_t{31}, Num0BitsBelowLS1Bit_Nonzero32(0x80000000u));
+
+  // 64-bit: number of zero bits below the least-significant set bit.
+  HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero64(1ull));
+  HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero64(2ull));
+  HWY_ASSERT_EQ(size_t{62},
+                Num0BitsBelowLS1Bit_Nonzero64(0xC000000000000000ull));
+  HWY_ASSERT_EQ(size_t{63},
+                Num0BitsBelowLS1Bit_Nonzero64(0x8000000000000000ull));
+}
+
+// Checks PopCount (number of set bits) for 32-bit (`u` suffix) and 64-bit
+// (`ull` suffix) arguments, from zero up to all bits set.
+HWY_NOINLINE void TestAllPopCount() {
+  // 32-bit arguments.
+  HWY_ASSERT_EQ(size_t{0}, PopCount(0u));
+  HWY_ASSERT_EQ(size_t{1}, PopCount(1u));
+  HWY_ASSERT_EQ(size_t{1}, PopCount(2u));
+  HWY_ASSERT_EQ(size_t{2}, PopCount(3u));
+  HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000u));
+  HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFu));
+  HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFu));
+
+  // 64-bit arguments, including values that use bits above bit 31.
+  HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000ull));
+  HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFull));
+  HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFull));
+  HWY_ASSERT_EQ(size_t{33}, PopCount(0x10FFFFFFFFull));
+  HWY_ASSERT_EQ(size_t{63}, PopCount(0xFFFEFFFFFFFFFFFFull));
+  HWY_ASSERT_EQ(size_t{64}, PopCount(0xFFFFFFFFFFFFFFFFull));
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+// Test registration for the BaseTest suite, one entry per TestAll* function
+// defined above.
+// NOTE(review): presumably the HWY_ONCE guard keeps this from being repeated
+// for every target that foreach_target.h compiles - confirm.
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(BaseTest);
+HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLimits);
+HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLowestHighest);
+HWY_EXPORT_AND_TEST_P(BaseTest, TestAllType);
+HWY_EXPORT_AND_TEST_P(BaseTest, TestAllIsSame);
+HWY_EXPORT_AND_TEST_P(BaseTest, TestAllBitScan);
+HWY_EXPORT_AND_TEST_P(BaseTest, TestAllPopCount);
+}  // namespace hwy
+
+#endif
@@ -0,0 +1,110 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_CACHE_CONTROL_H_
+#define HIGHWAY_HWY_CACHE_CONTROL_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "hwy/base.h"
+
+// Requires SSE2; fails to compile on 32-bit Clang 7 (see
+// https://github.com/gperftools/gperftools/issues/946).
+#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32)
+#undef HWY_DISABLE_CACHE_CONTROL
+#define HWY_DISABLE_CACHE_CONTROL
+#endif
+
+// intrin.h is sufficient on MSVC and already included by base.h.
+#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
+#include <emmintrin.h>  // SSE2
+#endif
+
+// Windows.h #defines these, which causes infinite recursion. Temporarily
+// undefine them in this header; these functions are anyway deprecated.
+// TODO(janwas): remove when these functions are removed.
+#pragma push_macro("LoadFence")
+#undef LoadFence
+
+namespace hwy {
+
+// Even if N*sizeof(T) is smaller, Stream may write a multiple of this size.
+#define HWY_STREAM_MULTIPLE 16
+
+// The following functions may also require an attribute.
+#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
+#define HWY_ATTR_CACHE __attribute__((target("sse2")))
+#else
+#define HWY_ATTR_CACHE
+#endif
+
+// Delays subsequent loads until prior loads are visible. Beware of potentially
+// differing behavior across architectures and vendors: on Intel but not
+// AMD CPUs, also serves as a full fence (waits for all prior instructions to
+// complete).
+// Has an empty body unless HWY_ARCH_X86 is set and HWY_DISABLE_CACHE_CONTROL is
+// not defined, in which case it issues `_mm_lfence`.
+HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
+#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
+  _mm_lfence();
+#endif
+}
+
+// Ensures values written by previous `Stream` calls are visible on the current
+// core. This is NOT sufficient for synchronizing across cores; when `Stream`
+// outputs are to be consumed by other core(s), the producer must publish
+// availability (e.g. via mutex or atomic_flag) after `FlushStream`.
+// Has an empty body unless HWY_ARCH_X86 is set and HWY_DISABLE_CACHE_CONTROL is
+// not defined, in which case it issues `_mm_sfence`.
+HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
+#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
+  _mm_sfence();
+#endif
+}
+
+// Optionally begins loading the cache line containing "p" to reduce latency of
+// subsequent actual loads.
+// Three compile-time variants: `_mm_prefetch` with _MM_HINT_T0 on x86 when
+// cache control is enabled, `__builtin_prefetch` on other GCC-compatible
+// builds, and otherwise nothing (the argument is only marked as used).
+template <typename T>
+HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
+#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
+  _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
+#elif HWY_COMPILER_GCC  // includes clang
+  // Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
+  // desirable, so use the default 3 (keep in caches).
+  __builtin_prefetch(p, /*write=*/0, /*hint=*/3);
+#else
+  (void)p;
+#endif
+}
+
+// Invalidates and flushes the cache line containing "p", if possible.
+// "If possible" means x86 with cache control enabled (`_mm_clflush`); every
+// other configuration ignores `p` and does nothing.
+HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
+#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
+  _mm_clflush(p);
+#else
+  (void)p;
+#endif
+}
+
+// When called inside a spin-loop, may reduce power consumption.
+// Issues `_mm_pause` on x86 with cache control enabled; otherwise the body is
+// empty.
+HWY_INLINE HWY_ATTR_CACHE void Pause() {
+#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
+  _mm_pause();
+#endif
+}
+
+}  // namespace hwy
+
+// TODO(janwas): remove when these functions are removed. (See above.)
+#pragma pop_macro("LoadFence")
+
+#endif  // HIGHWAY_HWY_CACHE_CONTROL_H_
@@ -0,0 +1,136 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Per-target include guard
+#if defined(HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
+#undef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
+#else
+#define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
+#endif
+
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// These functions avoid having to write a loop plus remainder handling in the
+// (unfortunately still common) case where arrays are not aligned/padded. If the
+// inputs are known to be aligned/padded, it is more efficient to write a single
+// loop using Load(). We do not provide a CopyAlignedPadded because it
+// would be more verbose than such a loop.
+
+// Fills `to`[0, `count`) with `value`. Whole vectors are written with
+// unaligned stores; a final partial vector, if any, is delegated to SafeFillN.
+template <class D, typename T = TFromD<D>>
+void Fill(D d, T value, size_t count, T* HWY_RESTRICT to) {
+  const size_t N = Lanes(d);
+  const Vec<D> splat = Set(d, value);
+
+  size_t pos = 0;
+  while (pos + N <= count) {
+    StoreU(splat, d, to + pos);
+    pos += N;
+  }
+
+  // Nothing left over: `count` was a multiple of the vector length `N`.
+  if (HWY_UNLIKELY(pos == count)) return;
+
+  const size_t remaining = count - pos;
+  HWY_DASSERT(0 != remaining && remaining < N);
+  SafeFillN(remaining, value, d, to + pos);
+}
+
+// Copies `from`[0, `count`) to `to`, which must not overlap `from`. Whole
+// vectors are moved with unaligned loads/stores; a final partial vector, if
+// any, is delegated to SafeCopyN.
+template <class D, typename T = TFromD<D>>
+void Copy(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to) {
+  const size_t N = Lanes(d);
+
+  size_t pos = 0;
+  while (pos + N <= count) {
+    const Vec<D> chunk = LoadU(d, from + pos);
+    StoreU(chunk, d, to + pos);
+    pos += N;
+  }
+
+  // Nothing left over: `count` was a multiple of the vector length `N`.
+  if (HWY_UNLIKELY(pos == count)) return;
+
+  const size_t remaining = count - pos;
+  HWY_DASSERT(0 != remaining && remaining < N);
+  SafeCopyN(remaining, d, from + pos, to + pos);
+}
+
+// For idx in [0, count) in ascending order, appends `from[idx]` to `to` if the
+// corresponding mask element of `func(d, v)` is true. Returns the STL-style end
+// of the newly written elements in `to`.
+//
+// `func` is either a functor with a templated operator()(d, v) returning a
+// mask, or a generic lambda if using C++14. Due to apparent limitations of
+// Clang on Windows, it is currently necessary to add HWY_ATTR before the
+// opening { of the lambda to avoid errors about "function .. requires target".
+//
+// NOTE: this is only supported for 16-, 32- or 64-bit types.
+// NOTE: Func may be called a second time for elements it has already seen, but
+// these elements will not be written to `to` again.
+template <class D, class Func, typename T = TFromD<D>>
+T* CopyIf(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to,
+          const Func& func) {
+  const size_t N = Lanes(d);
+
+  // Whole vectors: store the selected lanes and advance `to` by the value
+  // CompressBlendedStore returns.
+  size_t idx = 0;
+  for (; idx + N <= count; idx += N) {
+    const Vec<D> v = LoadU(d, from + idx);
+    to += CompressBlendedStore(v, func(d, v), d, to);
+  }
+
+  // `count` was a multiple of the vector length `N`: already done.
+  if (HWY_UNLIKELY(idx == count)) return to;
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+  // Proceed one by one.
+  const CappedTag<T, 1> d1;
+  for (; idx < count; ++idx) {
+    using V1 = Vec<decltype(d1)>;
+    // Workaround for -Waggressive-loop-optimizations on GCC 8
+    // (iteration 2305843009213693951 invokes undefined behavior for T=i64)
+    const uintptr_t addr = reinterpret_cast<uintptr_t>(from);
+    const T* HWY_RESTRICT from_idx =
+        reinterpret_cast<const T * HWY_RESTRICT>(addr + (idx * sizeof(T)));
+    const V1 v = LoadU(d1, from_idx);
+    // Avoid storing to `to` unless we know it should be kept - otherwise, we
+    // might overrun the end if it was allocated for the exact count.
+    if (CountTrue(d1, func(d1, v)) == 0) continue;
+    StoreU(v, d1, to);
+    to += 1;
+  }
+#else
+  // Start index of the last unaligned whole vector, ending at the array end.
+  // If count < N this unsigned subtraction wraps, so `from + last` lies before
+  // `from`; the mask below disables exactly those leading lanes.
+  const size_t last = count - N;
+  // Number of elements before `from` or already written.
+  const size_t invalid = idx - last;
+  HWY_DASSERT(0 != invalid && invalid < N);
+  // Only lanes at index >= `invalid` are loaded and eligible for storing.
+  const Mask<D> mask = Not(FirstN(d, invalid));
+  const Vec<D> v = MaskedLoad(mask, d, from + last);
+  to += CompressBlendedStore(v, And(mask, func(d, v)), d, to);
+#endif
+  return to;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif  // HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
@@ -0,0 +1,199 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/aligned_allocator.h"
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/algo/copy_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+#include "hwy/contrib/algo/copy-inl.h"
+#include "hwy/tests/test_util-inl.h"
+// clang-format on
+
+// If your project requires C++14 or later, you can ignore this and pass lambdas
+// directly to CopyIf, without requiring an lvalue as we do here for C++11.
+#if __cplusplus < 201402L
+#define HWY_GENERIC_LAMBDA 0
+#else
+#define HWY_GENERIC_LAMBDA 1
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Returns a random integer in [0, 128) converted to T; seven bits fit in any
+// lane type.
+template <typename T>
+T Random7Bit(RandomState& rng) {
+  const auto low7 = Random32(&rng) & 127;
+  return static_cast<T>(low7);
+}
+
+// In C++14, we can instead define these as generic lambdas next to where they
+// are invoked.
+#if !HWY_GENERIC_LAMBDA
+
+// Predicate functor for CopyIf (used when generic lambdas are unavailable):
+// returns a mask that is true for lanes of `v` whose lowest bit is set.
+struct IsOdd {
+  template <class D, class V>
+  Mask<D> operator()(D d, V v) const {
+    return TestBit(v, Set(d, TFromD<D>{1}));
+  }
+};
+
+#endif  // !HWY_GENERIC_LAMBDA
+
+// Invokes Test (e.g. TestCopyIf) with all arg combinations: every count in
+// [0, 2 * N) paired with each of three misalignments for the first buffer and
+// each of three for the second. T comes from the type list chosen by the
+// caller (ForAllTypes or ForUI163264 in this file).
+template <class Test>
+struct ForeachCountAndMisalign {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) const {
+    // One generator is shared by all invocations below.
+    RandomState rng;
+    const size_t N = Lanes(d);
+    // Offsets (in elements) applied to the start of each allocation.
+    const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
+
+    for (size_t count = 0; count < 2 * N; ++count) {
+      for (size_t ma : misalignments) {
+        for (size_t mb : misalignments) {
+          Test()(d, count, ma, mb, rng);
+        }
+      }
+    }
+  }
+};
+
+// Test functor for Fill(): builds a reference array with a scalar loop, runs
+// Fill() on a second (independently misaligned) array, and checks both that
+// the contents match and that the element just past the end was not touched.
+struct TestFill {
+  template <class D>
+  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
+                  RandomState& rng) {
+    using T = TFromD<D>;
+    const T value = Random7Bit<T>(rng);
+
+    // Reference data. HWY_MAX avoids requesting zero elements when
+    // misalign == count == 0.
+    AlignedFreeUniquePtr<T[]> expected_mem =
+        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
+    T* const expected = expected_mem.get() + misalign_a;
+    for (size_t i = 0; i != count; ++i) expected[i] = value;
+
+    // Output buffer with one extra element acting as a sentinel.
+    AlignedFreeUniquePtr<T[]> actual_mem =
+        AllocateAligned<T>(misalign_b + count + 1);
+    T* const actual = actual_mem.get() + misalign_b;
+    actual[count] = T{0};
+
+    Fill(d, value, count, actual);
+
+    // The sentinel must be intact, i.e. Fill did not write past the end.
+    HWY_ASSERT_EQ(T{0}, actual[count]);
+
+    const char* target_name = hwy::TargetName(HWY_TARGET);
+    const auto info = hwy::detail::MakeTypeInfo<T>();
+    hwy::detail::AssertArrayEqual(info, expected, actual, count, target_name,
+                                  __FILE__, __LINE__);
+  }
+};
+
+// Runs TestFill for every count/misalignment combination, via ForAllTypes and
+// ForPartialVectors.
+void TestAllFill() {
+  ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFill>>());
+}
+
+// Test functor for Copy(): fills a (misaligned) source array with random
+// values, copies it into a separately misaligned destination, and checks that
+// the first `count` elements of both arrays are equal.
+struct TestCopy {
+  template <class D>
+  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
+                  RandomState& rng) {
+    using T = TFromD<D>;
+    // Always request at least one element so the allocation size is nonzero.
+    const size_t num_a = HWY_MAX(1, misalign_a + count);
+    const size_t num_b = HWY_MAX(1, misalign_b + count);
+
+    AlignedFreeUniquePtr<T[]> src_mem = AllocateAligned<T>(num_a);
+    T* const a = src_mem.get() + misalign_a;
+    for (size_t i = 0; i != count; ++i) a[i] = Random7Bit<T>(rng);
+
+    AlignedFreeUniquePtr<T[]> dst_mem = AllocateAligned<T>(num_b);
+    T* const b = dst_mem.get() + misalign_b;
+
+    Copy(d, a, count, b);
+
+    const char* target_name = hwy::TargetName(HWY_TARGET);
+    const auto info = hwy::detail::MakeTypeInfo<T>();
+    hwy::detail::AssertArrayEqual(info, a, b, count, target_name, __FILE__,
+                                  __LINE__);
+  }
+};
+
+// Runs TestCopy for every count/misalignment combination, via ForAllTypes and
+// ForPartialVectors.
+void TestAllCopy() {
+  ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestCopy>>());
+}
+
+// Test functor for CopyIf() with an "is odd" predicate: the expected output is
+// built by a scalar loop that keeps the odd elements of the input, and both
+// the number of elements written and their values are compared.
+struct TestCopyIf {
+  template <class D>
+  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
+                  RandomState& rng) {
+    using T = TFromD<D>;
+    // Prevents error if size to allocate is zero.
+    AlignedFreeUniquePtr<T[]> pa =
+        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
+    T* a = pa.get() + misalign_a;
+    for (size_t i = 0; i < count; ++i) {
+      a[i] = Random7Bit<T>(rng);
+    }
+    // The output gets one full vector of extra room.
+    // NOTE(review): presumably so that vector-sized stores inside CopyIf stay
+    // within the allocation - confirm.
+    const size_t padding = Lanes(ScalableTag<T>());
+    AlignedFreeUniquePtr<T[]> pb =
+        AllocateAligned<T>(HWY_MAX(1, misalign_b + count + padding));
+    T* b = pb.get() + misalign_b;
+
+    // Scalar reference: the odd inputs, in their original order.
+    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
+    size_t num_odd = 0;
+    for (size_t i = 0; i < count; ++i) {
+      if (a[i] & 1) {
+        expected[num_odd++] = a[i];
+      }
+    }
+
+    // Same predicate as the scalar loop, as a generic lambda when available
+    // and as the IsOdd functor otherwise.
+#if HWY_GENERIC_LAMBDA
+    const auto is_odd = [](const auto d, const auto v) HWY_ATTR {
+      return TestBit(v, Set(d, TFromD<decltype(d)>{1}));
+    };
+#else
+    const IsOdd is_odd;
+#endif
+    T* end = CopyIf(d, a, count, b, is_odd);
+    // CopyIf returns the end of the written range; convert it to a count.
+    const size_t num_written = static_cast<size_t>(end - b);
+    HWY_ASSERT_EQ(num_odd, num_written);
+
+    const auto info = hwy::detail::MakeTypeInfo<T>();
+    const char* target_name = hwy::TargetName(HWY_TARGET);
+    hwy::detail::AssertArrayEqual(info, expected.get(), b, num_odd, target_name,
+                                  __FILE__, __LINE__);
+  }
+};
+
+// Runs TestCopyIf for every count/misalignment combination. Uses ForUI163264
+// rather than ForAllTypes; CopyIf is documented as supporting only 16-, 32-
+// or 64-bit types.
+void TestAllCopyIf() {
+  ForUI163264(ForPartialVectors<ForeachCountAndMisalign<TestCopyIf>>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+// Test registration for the CopyTest suite, one entry per TestAll* function
+// defined above.
+// NOTE(review): presumably the HWY_ONCE guard keeps this from being repeated
+// for every target that foreach_target.h compiles - confirm.
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(CopyTest);
+HWY_EXPORT_AND_TEST_P(CopyTest, TestAllFill);
+HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopy);
+HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopyIf);
+}  // namespace hwy
+
+#endif
@@ -0,0 +1,109 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Per-target include guard
+#if defined(HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
+#undef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
+#else
+#define HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
+#endif
+
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Returns the index of the first element of `in[0, count)` that compares equal
+// to `value`, or `count` if there is none.
+template <class D, typename T = TFromD<D>>
+size_t Find(D d, T value, const T* HWY_RESTRICT in, size_t count) {
+  const size_t N = Lanes(d);
+  const Vec<D> needle = Set(d, value);
+
+  // Whole vectors.
+  size_t offset = 0;
+  while (offset + N <= count) {
+    const intptr_t hit = FindFirstTrue(d, Eq(needle, LoadU(d, in + offset)));
+    if (hit >= 0) return offset + static_cast<size_t>(hit);
+    offset += N;
+  }
+
+  // No partial vector left: not found.
+  if (offset == count) return count;
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+  // Check the remaining elements one at a time.
+  const CappedTag<T, 1> d1;
+  const Vec<decltype(d1)> needle1 = Set(d1, GetLane(needle));
+  for (; offset < count; ++offset) {
+    if (AllTrue(d1, Eq(needle1, LoadU(d1, in + offset)))) return offset;
+  }
+#else
+  const size_t remaining = count - offset;
+  HWY_DASSERT(0 != remaining && remaining < N);
+  const Mask<D> valid = FirstN(d, remaining);
+  const Vec<D> tail = MaskedLoad(valid, d, in + offset);
+  // Combine with `valid` so that the zero-padding from MaskedLoad can never
+  // be reported as a match.
+  const intptr_t hit = FindFirstTrue(d, And(Eq(needle, tail), valid));
+  if (hit >= 0) return offset + static_cast<size_t>(hit);
+#endif  // HWY_MEM_OPS_MIGHT_FAULT
+
+  return count;  // not found
+}
+
+// Returns the index of the first element of `in[0, count)` for which the mask
+// returned by `func(d, vec)` is true, or `count` if there is none.
+template <class D, class Func, typename T = TFromD<D>>
+size_t FindIf(D d, const T* HWY_RESTRICT in, size_t count, const Func& func) {
+  const size_t N = Lanes(d);
+
+  // Whole vectors.
+  size_t offset = 0;
+  while (offset + N <= count) {
+    const intptr_t hit = FindFirstTrue(d, func(d, LoadU(d, in + offset)));
+    if (hit >= 0) return offset + static_cast<size_t>(hit);
+    offset += N;
+  }
+
+  // No partial vector left: not found.
+  if (offset == count) return count;
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+  // Check the remaining elements one at a time.
+  const CappedTag<T, 1> d1;
+  for (; offset < count; ++offset) {
+    if (AllTrue(d1, func(d1, LoadU(d1, in + offset)))) return offset;
+  }
+#else
+  const size_t remaining = count - offset;
+  HWY_DASSERT(0 != remaining && remaining < N);
+  const Mask<D> valid = FirstN(d, remaining);
+  const Vec<D> tail = MaskedLoad(valid, d, in + offset);
+  // Combine with `valid` so that the zero-padding from MaskedLoad can never
+  // be reported as a match.
+  const intptr_t hit = FindFirstTrue(d, And(func(d, tail), valid));
+  if (hit >= 0) return offset + static_cast<size_t>(hit);
+#endif  // HWY_MEM_OPS_MIGHT_FAULT
+
+  return count;  // not found
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif  // HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
@@ -0,0 +1,219 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <vector>
+
+#include "hwy/aligned_allocator.h"
+#include "hwy/base.h"
+#include "hwy/print.h"
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/algo/find_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+#include "hwy/contrib/algo/find-inl.h"
+#include "hwy/tests/test_util-inl.h"
+// clang-format on
+
+// If your project requires C++14 or later, you can ignore this and pass lambdas
+// directly to FindIf, without requiring an lvalue as we do here for C++11.
+#if __cplusplus < 201402L
+#define HWY_GENERIC_LAMBDA 0
+#else
+#define HWY_GENERIC_LAMBDA 1
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Returns a random number in [-8, 8), converted to T. The tests rely on this
+// range to Find() values that are known to be absent.
+template <typename T>
+T Random(RandomState& rng) {
+  // Ten random bits, i.e. an integer in [0, 1024).
+  const uint32_t raw = Random32(&rng);
+  const int32_t bits = static_cast<int32_t>(raw) & 1023;
+  // Center around zero and scale to steps of 1/64.
+  const double val = (bits - 512) / 64.0;
+  // For unsigned types the lowest value is zero, so negatives are clamped.
+  const double clamped = HWY_MAX(hwy::LowestValue<T>(), val);
+  return static_cast<T>(clamped);
+}
+
+// In C++14, we can instead define these as generic lambdas next to where they
+// are invoked.
+#if !HWY_GENERIC_LAMBDA
+
+// Predicate functor for FindIf (used when generic lambdas are unavailable):
+// returns a mask that is true for lanes of `v` greater than the stored
+// threshold, after converting the threshold to the lane type.
+class GreaterThan {
+ public:
+  GreaterThan(int val) : val_(val) {}
+  template <class D, class V>
+  Mask<D> operator()(D d, V v) const {
+    return Gt(v, Set(d, static_cast<TFromD<D>>(val_)));
+  }
+
+ private:
+  int val_;  // threshold to compare against
+};
+
+#endif  // !HWY_GENERIC_LAMBDA
+
+// Invokes Test (e.g. TestFind) with all arg combinations: a set of randomly
+// chosen counts in [0, 16 * N] (always including zero), each combined with
+// three misalignments.
+template <class Test>
+struct ForeachCountAndMisalign {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) const {
+    RandomState rng;
+    const size_t N = Lanes(d);
+    // Offsets (in elements) applied to the start of the allocation.
+    const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
+
+    // Find() checks 8 vectors at a time, so we want to cover a fairly large
+    // range without oversampling (checking every possible count).
+    // NOTE(review): the Find() in find-inl.h as added by this diff handles one
+    // vector per loop iteration; confirm whether "8 vectors" is still accurate.
+    std::vector<size_t> counts(AdjustedReps(512));
+    for (size_t& count : counts) {
+      count = static_cast<size_t>(rng()) % (16 * N + 1);
+    }
+    counts[0] = 0;  // ensure we test count=0.
+
+    for (size_t count : counts) {
+      for (size_t m : misalignments) {
+        Test()(d, count, m, rng);
+      }
+    }
+  }
+};
+
+// Test functor for Find(): fills `in` with `count` random values starting
+// `misalign` elements into an aligned allocation, then checks that Find()
+// returns the first occurrence of every value that is present, and `count` for
+// values that cannot be present.
+struct TestFind {
+  template <class D>
+  void operator()(D d, size_t count, size_t misalign, RandomState& rng) {
+    using T = TFromD<D>;
+    // Must allocate at least one even if count is zero.
+    AlignedFreeUniquePtr<T[]> storage =
+        AllocateAligned<T>(HWY_MAX(1, misalign + count));
+    T* in = storage.get() + misalign;
+    for (size_t i = 0; i < count; ++i) {
+      in[i] = Random<T>(rng);
+    }
+
+    // For each position, search for that element (which we know is there)
+    for (size_t pos = 0; pos < count; ++pos) {
+      const size_t actual = Find(d, in[pos], in, count);
+
+      // `in[pos]` is present, so a "not found" result (== count) is already a
+      // failure. Check before indexing so that a wrong result cannot lead to
+      // an out-of-bounds read of in[count] below.
+      HWY_ASSERT(actual < count);
+
+      // We may have found an earlier occurrence of the same value; ensure the
+      // value is the same, and that it is the first.
+      if (!IsEqual(in[pos], in[actual])) {
+        fprintf(stderr, "%s count %d, found %.15f at %d but wanted %.15f\n",
+                hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
+                static_cast<double>(in[actual]), static_cast<int>(actual),
+                static_cast<double>(in[pos]));
+        HWY_ASSERT(false);
+      }
+      for (size_t i = 0; i < actual; ++i) {
+        if (IsEqual(in[i], in[pos])) {
+          fprintf(stderr, "%s count %d, found %f at %d but Find returned %d\n",
+                  hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
+                  static_cast<double>(in[i]), static_cast<int>(i),
+                  static_cast<int>(actual));
+          HWY_ASSERT(false);
+        }
+      }
+    }
+
+    // Also search for values we know not to be present (out of range)
+    HWY_ASSERT_EQ(count, Find(d, T{9}, in, count));
+    HWY_ASSERT_EQ(count, Find(d, static_cast<T>(-9), in, count));
+  }
+};
+
+// Runs TestFind for the sampled counts and misalignments, via ForAllTypes and
+// ForPartialVectors.
+void TestAllFind() {
+  ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFind>>());
+}
+
+// Test functor for FindIf() with a "greater than val" predicate: for each
+// threshold, the index returned by FindIf() must equal the one found by
+// std::find_if over the same data.
+struct TestFindIf {
+  template <class D>
+  void operator()(D d, size_t count, size_t misalign, RandomState& rng) {
+    using T = TFromD<D>;
+    using TI = MakeSigned<T>;
+    // Must allocate at least one even if count is zero.
+    AlignedFreeUniquePtr<T[]> storage =
+        AllocateAligned<T>(HWY_MAX(1, misalign + count));
+    T* in = storage.get() + misalign;
+    for (size_t i = 0; i < count; ++i) {
+      in[i] = Random<T>(rng);
+      // Sanity-check the generator's range, which the thresholds below rely on.
+      HWY_ASSERT(in[i] < 8);
+      HWY_ASSERT(!hwy::IsSigned<T>() || static_cast<TI>(in[i]) >= -8);
+    }
+
+    // Track that both outcomes (found / not found) were exercised.
+    bool found_any = false;
+    bool not_found_any = false;
+
+    // unsigned T would be promoted to signed and compare greater than any
+    // negative val, whereas Set() would just cast to an unsigned value and the
+    // comparison remains unsigned, so avoid negative numbers there.
+    const int min_val = IsSigned<T>() ? -9 : 0;
+    // Includes out-of-range value 9 to test the not-found path.
+    for (int val = min_val; val <= 9; ++val) {
+#if HWY_GENERIC_LAMBDA
+      const auto greater = [val](const auto d, const auto v) HWY_ATTR {
+        return Gt(v, Set(d, static_cast<T>(val)));
+      };
+#else
+      const GreaterThan greater(val);
+#endif
+      const size_t actual = FindIf(d, in, count, greater);
+      found_any |= actual < count;
+      not_found_any |= actual == count;
+
+      // Scalar reference using the same comparison.
+      const auto pos = std::find_if(
+          in, in + count, [val](T x) { return x > static_cast<T>(val); });
+      // Convert returned iterator to index.
+      const size_t expected = static_cast<size_t>(pos - in);
+      if (expected != actual) {
+        fprintf(stderr, "%s count %d val %d, expected %d actual %d\n",
+                hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
+                val, static_cast<int>(expected), static_cast<int>(actual));
+        hwy::detail::PrintArray(hwy::detail::MakeTypeInfo<T>(), "in", in, count,
+                                0, count);
+        HWY_ASSERT(false);
+      }
+    }
+
+    // We will always not-find something due to val=9.
+    HWY_ASSERT(not_found_any);
+    // We'll find something unless the input is empty or {0} - because 0 > i
+    // is false for all i=[0,9].
+    if (count != 0 && in[0] != 0) {
+      HWY_ASSERT(found_any);
+    }
+  }
+};
+
+// Runs TestFindIf for the sampled counts and misalignments, via ForAllTypes
+// and ForPartialVectors.
+void TestAllFindIf() {
+  ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFindIf>>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+// Test registration for the FindTest suite, one entry per TestAll* function
+// defined above.
+// NOTE(review): presumably the HWY_ONCE guard keeps this from being repeated
+// for every target that foreach_target.h compiles - confirm.
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(FindTest);
+HWY_EXPORT_AND_TEST_P(FindTest, TestAllFind);
+HWY_EXPORT_AND_TEST_P(FindTest, TestAllFindIf);
+}  // namespace hwy
+
+#endif
@@ -0,0 +1,262 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Per-target include guard
+#if defined(HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
+#undef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
+#else
+#define HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
+#endif
+
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// These functions avoid having to write a loop plus remainder handling in the
+// (unfortunately still common) case where arrays are not aligned/padded. If the
+// inputs are known to be aligned/padded, it is more efficient to write a single
+// loop using Load(). We do not provide a TransformAlignedPadded because it
+// would be more verbose than such a loop.
+//
+// Func is either a functor with a templated operator()(d, v[, v1[, v2]]), or a
+// generic lambda if using C++14. Due to apparent limitations of Clang on
+// Windows, it is currently necessary to add HWY_ATTR before the opening { of
+// the lambda to avoid errors about "always_inline function .. requires target".
+//
+// If HWY_MEM_OPS_MIGHT_FAULT, we use scalar code instead of masking. Otherwise,
+// we use `MaskedLoad` and `BlendedStore` to read/write the final partial
+// vector.
+
+// Writes `out[0, count)` using the vectors produced by `func(d, index_vec)`,
+// where `index_vec` has type `Vec<RebindToUnsigned<D>>`. On the first call,
+// lane i of `index_vec` holds i; every lane then grows by `Lanes(d)` per call.
+// Indices `>= count` may still be passed to `func`, but whatever it returns in
+// those lanes is never stored to `out`.
+template <class D, class Func, typename T = TFromD<D>>
+void Generate(D d, T* HWY_RESTRICT out, size_t count, const Func& func) {
+  const RebindToUnsigned<D> du;
+  using TU = TFromD<decltype(du)>;
+  using VU = Vec<decltype(du)>;
+  const size_t lanes = Lanes(d);
+
+  // Amount added to every index lane after each whole-vector store.
+  const VU step = Set(du, static_cast<TU>(lanes));
+  VU indices = Iota(du, 0);
+  size_t pos = 0;
+  while (pos + lanes <= count) {
+    StoreU(func(d, indices), d, out + pos);
+    indices = Add(indices, step);
+    pos += lanes;
+  }
+
+  // Nothing is left over when `count` is a multiple of the vector length.
+  if (HWY_UNLIKELY(pos == count)) return;
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+  // Handle the leftover elements one at a time with single-lane vectors.
+  const CappedTag<T, 1> d1;
+  const RebindToUnsigned<decltype(d1)> du1;
+  while (pos < count) {
+    StoreU(func(d1, Set(du1, static_cast<TU>(pos))), d1, out + pos);
+    ++pos;
+  }
+#else
+  // Store only the first `tail` lanes of one final vector.
+  const size_t tail = count - pos;
+  HWY_DASSERT(tail != 0 && tail < lanes);
+  const Mask<D> tail_mask = FirstN(d, tail);
+  BlendedStore(func(d, indices), tail_mask, d, out + pos);
+#endif
+}
+
+// Overwrites every `inout[i]`, i in `[0, count)`, with the corresponding lane
+// of `func(d, v)`, where `v` holds the current array contents. Example usage:
+// multiplying array elements by a constant.
+template <class D, class Func, typename T = TFromD<D>>
+void Transform(D d, T* HWY_RESTRICT inout, size_t count, const Func& func) {
+  const size_t lanes = Lanes(d);
+
+  // Whole vectors.
+  size_t pos = 0;
+  while (pos + lanes <= count) {
+    const Vec<D> in = LoadU(d, inout + pos);
+    StoreU(func(d, in), d, inout + pos);
+    pos += lanes;
+  }
+
+  // Nothing is left over when `count` is a multiple of the vector length.
+  if (HWY_UNLIKELY(pos == count)) return;
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+  // Handle the leftover elements one at a time with single-lane vectors.
+  using D1 = CappedTag<T, 1>;
+  const D1 d1;
+  while (pos < count) {
+    const Vec<D1> in = LoadU(d1, inout + pos);
+    StoreU(func(d1, in), d1, inout + pos);
+    ++pos;
+  }
+#else
+  // Load and store only the first `tail` lanes of one final vector.
+  const size_t tail = count - pos;
+  HWY_DASSERT(tail != 0 && tail < lanes);
+  const Mask<D> tail_mask = FirstN(d, tail);
+  const Vec<D> in = MaskedLoad(tail_mask, d, inout + pos);
+  BlendedStore(func(d, in), tail_mask, d, inout + pos);
+#endif
+}
+
+// Overwrites every `inout[i]`, i in `[0, count)`, with the corresponding lane
+// of `func(d, v, v1)`, where `v` is loaded from `inout` and `v1` from `in1`.
+// Example usage: multiplying array elements by those of another array.
+template <class D, class Func, typename T = TFromD<D>>
+void Transform1(D d, T* HWY_RESTRICT inout, size_t count,
+                const T* HWY_RESTRICT in1, const Func& func) {
+  const size_t lanes = Lanes(d);
+
+  // Whole vectors.
+  size_t pos = 0;
+  while (pos + lanes <= count) {
+    const Vec<D> va = LoadU(d, inout + pos);
+    const Vec<D> vb = LoadU(d, in1 + pos);
+    StoreU(func(d, va, vb), d, inout + pos);
+    pos += lanes;
+  }
+
+  // Nothing is left over when `count` is a multiple of the vector length.
+  if (HWY_UNLIKELY(pos == count)) return;
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+  // Handle the leftover elements one at a time with single-lane vectors.
+  using D1 = CappedTag<T, 1>;
+  const D1 d1;
+  while (pos < count) {
+    const Vec<D1> va = LoadU(d1, inout + pos);
+    const Vec<D1> vb = LoadU(d1, in1 + pos);
+    StoreU(func(d1, va, vb), d1, inout + pos);
+    ++pos;
+  }
+#else
+  // Load and store only the first `tail` lanes of one final vector.
+  const size_t tail = count - pos;
+  HWY_DASSERT(tail != 0 && tail < lanes);
+  const Mask<D> tail_mask = FirstN(d, tail);
+  const Vec<D> va = MaskedLoad(tail_mask, d, inout + pos);
+  const Vec<D> vb = MaskedLoad(tail_mask, d, in1 + pos);
+  BlendedStore(func(d, va, vb), tail_mask, d, inout + pos);
+#endif
+}
+
+// Overwrites every `inout[i]`, i in `[0, count)`, with the corresponding lane
+// of `func(d, v, v1, v2)`, where `v`, `v1` and `v2` are loaded from `inout`,
+// `in1` and `in2` respectively. Example usage: FMA of elements from three
+// arrays, stored into the first array.
+template <class D, class Func, typename T = TFromD<D>>
+void Transform2(D d, T* HWY_RESTRICT inout, size_t count,
+                const T* HWY_RESTRICT in1, const T* HWY_RESTRICT in2,
+                const Func& func) {
+  const size_t lanes = Lanes(d);
+
+  // Whole vectors.
+  size_t pos = 0;
+  while (pos + lanes <= count) {
+    const Vec<D> va = LoadU(d, inout + pos);
+    const Vec<D> vb = LoadU(d, in1 + pos);
+    const Vec<D> vc = LoadU(d, in2 + pos);
+    StoreU(func(d, va, vb, vc), d, inout + pos);
+    pos += lanes;
+  }
+
+  // Nothing is left over when `count` is a multiple of the vector length.
+  if (HWY_UNLIKELY(pos == count)) return;
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+  // Handle the leftover elements one at a time with single-lane vectors.
+  using D1 = CappedTag<T, 1>;
+  const D1 d1;
+  while (pos < count) {
+    const Vec<D1> va = LoadU(d1, inout + pos);
+    const Vec<D1> vb = LoadU(d1, in1 + pos);
+    const Vec<D1> vc = LoadU(d1, in2 + pos);
+    StoreU(func(d1, va, vb, vc), d1, inout + pos);
+    ++pos;
+  }
+#else
+  // Load and store only the first `tail` lanes of one final vector.
+  const size_t tail = count - pos;
+  HWY_DASSERT(tail != 0 && tail < lanes);
+  const Mask<D> tail_mask = FirstN(d, tail);
+  const Vec<D> va = MaskedLoad(tail_mask, d, inout + pos);
+  const Vec<D> vb = MaskedLoad(tail_mask, d, in1 + pos);
+  const Vec<D> vc = MaskedLoad(tail_mask, d, in2 + pos);
+  BlendedStore(func(d, va, vb, vc), tail_mask, d, inout + pos);
+#endif
+}
+
+// Overwrites with `new_t` every element of `inout[0, count)` that compares
+// equal to `old_t`; other elements are stored back unchanged. Note the
+// argument order: the new value comes before the old one.
+template <class D, typename T = TFromD<D>>
+void Replace(D d, T* HWY_RESTRICT inout, size_t count, T new_t, T old_t) {
+  const size_t lanes = Lanes(d);
+  const Vec<D> replacement = Set(d, new_t);
+  const Vec<D> target = Set(d, old_t);
+
+  // Whole vectors.
+  size_t pos = 0;
+  while (pos + lanes <= count) {
+    const Vec<D> in = LoadU(d, inout + pos);
+    StoreU(IfThenElse(Eq(in, target), replacement, in), d, inout + pos);
+    pos += lanes;
+  }
+
+  // Nothing is left over when `count` is a multiple of the vector length.
+  if (HWY_UNLIKELY(pos == count)) return;
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+  // Handle the leftover elements one at a time with single-lane vectors.
+  using D1 = CappedTag<T, 1>;
+  const D1 d1;
+  const Vec<D1> target1 = Set(d1, old_t);
+  const Vec<D1> replacement1 = Set(d1, new_t);
+  while (pos < count) {
+    const Vec<D1> in = LoadU(d1, inout + pos);
+    StoreU(IfThenElse(Eq(in, target1), replacement1, in), d1, inout + pos);
+    ++pos;
+  }
+#else
+  // Load and store only the first `tail` lanes of one final vector.
+  const size_t tail = count - pos;
+  HWY_DASSERT(tail != 0 && tail < lanes);
+  const Mask<D> tail_mask = FirstN(d, tail);
+  const Vec<D> in = MaskedLoad(tail_mask, d, inout + pos);
+  BlendedStore(IfThenElse(Eq(in, target), replacement, in), tail_mask, d,
+               inout + pos);
+#endif
+}
+
+// Overwrites with `new_t` every element of `inout[0, count)` whose lane is set
+// in the mask returned by `func(d, v)`, where `v` holds the current array
+// contents; other elements are stored back unchanged.
+template <class D, class Func, typename T = TFromD<D>>
+void ReplaceIf(D d, T* HWY_RESTRICT inout, size_t count, T new_t,
+               const Func& func) {
+  const size_t lanes = Lanes(d);
+  const Vec<D> replacement = Set(d, new_t);
+
+  // Whole vectors.
+  size_t pos = 0;
+  while (pos + lanes <= count) {
+    const Vec<D> in = LoadU(d, inout + pos);
+    StoreU(IfThenElse(func(d, in), replacement, in), d, inout + pos);
+    pos += lanes;
+  }
+
+  // Nothing is left over when `count` is a multiple of the vector length.
+  if (HWY_UNLIKELY(pos == count)) return;
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+  // Handle the leftover elements one at a time with single-lane vectors.
+  using D1 = CappedTag<T, 1>;
+  const D1 d1;
+  const Vec<D1> replacement1 = Set(d1, new_t);
+  while (pos < count) {
+    const Vec<D1> in = LoadU(d1, inout + pos);
+    StoreU(IfThenElse(func(d1, in), replacement1, in), d1, inout + pos);
+    ++pos;
+  }
+#else
+  // Load and store only the first `tail` lanes of one final vector.
+  const size_t tail = count - pos;
+  HWY_DASSERT(tail != 0 && tail < lanes);
+  const Mask<D> tail_mask = FirstN(d, tail);
+  const Vec<D> in = MaskedLoad(tail_mask, d, inout + pos);
+  BlendedStore(IfThenElse(func(d, in), replacement, in), tail_mask, d,
+               inout + pos);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif  // HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
@@ -0,0 +1,372 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>  // memcpy
+
+#include "hwy/aligned_allocator.h"
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/algo/transform_test.cc"  //NOLINT
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+#include "hwy/contrib/algo/transform-inl.h"
+#include "hwy/tests/test_util-inl.h"
+// clang-format on
+
+// If your project requires C++14 or later, you can ignore this and pass lambdas
+// directly to Transform, without requiring an lvalue as we do here for C++11.
+#if __cplusplus < 201402L
+#define HWY_GENERIC_LAMBDA 0
+#else
+#define HWY_GENERIC_LAMBDA 1
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Returns the scalar multiplier used by the SCAL/AXPY tests, converted to T.
+template <typename T>
+T Alpha() {
+  constexpr double kAlpha = 1.5;  // arbitrary scalar
+  return static_cast<T>(kAlpha);
+}
+
+// Returns a random multiple of 1/64 in [-8, 8), a range chosen to ensure
+// computations do not exceed float32 precision.
+template <typename T>
+T Random(RandomState& rng) {
+  // Keep the low 10 bits: an integer in [0, 1023].
+  const int32_t low_bits = static_cast<int32_t>(Random32(&rng)) & 0x3FF;
+  // Re-center around zero and scale by 1/64.
+  const double centered = (low_bits - 512) / 64.0;
+  // Values below the lowest T are raised to it; for unsigned types this clamps
+  // negative values to zero.
+  return static_cast<T>(HWY_MAX(hwy::LowestValue<T>(), centered));
+}
+
+// Scalar reference for SCAL (the names SCAL and AXPY are from BLAS):
+// out[i] = Alpha * x[i] for i in [0, count).
+template <typename T>
+HWY_NOINLINE void SimpleSCAL(const T* x, T* out, size_t count) {
+  const T alpha = Alpha<T>();
+  for (size_t idx = 0; idx != count; ++idx) {
+    out[idx] = alpha * x[idx];
+  }
+}
+
+// Scalar reference for AXPY (name from BLAS):
+// out[i] = Alpha * x[i] + y[i] for i in [0, count).
+template <typename T>
+HWY_NOINLINE void SimpleAXPY(const T* x, const T* y, T* out, size_t count) {
+  const T alpha = Alpha<T>();
+  for (size_t idx = 0; idx != count; ++idx) {
+    // Kept as a single expression, like the vector code's MulAdd.
+    out[idx] = alpha * x[idx] + y[idx];
+  }
+}
+
+// Scalar reference for the three-input test:
+// out[i] = x[i] * y[i] + z[i] for i in [0, count).
+template <typename T>
+HWY_NOINLINE void SimpleFMA4(const T* x, const T* y, const T* z, T* out,
+                             size_t count) {
+  size_t idx = 0;
+  while (idx != count) {
+    out[idx] = x[idx] * y[idx] + z[idx];
+    ++idx;
+  }
+}
+
+// In C++14, we can instead define these as generic lambdas next to where they
+// are invoked.
+#if !HWY_GENERIC_LAMBDA
+
+// Generator functor for Generate(): doubles each output index, which yields
+// even numbers.
+struct Gen2 {
+  template <class D, class VU>
+  Vec<D> operator()(D d, VU vidx) const {
+    const VU doubled = Add(vidx, vidx);
+    return BitCast(d, doubled);
+  }
+};
+
+// Functor for Transform(): multiplies every lane of `v` by Alpha.
+struct SCAL {
+  template <class D, class V>
+  Vec<D> operator()(D d, V v) const {
+    const Vec<D> alpha = Set(d, Alpha<TFromD<D>>());
+    return Mul(alpha, v);
+  }
+};
+
+// Functor for Transform1(): returns MulAdd(Alpha, v, v1).
+struct AXPY {
+  template <class D, class V>
+  Vec<D> operator()(D d, V v, V v1) const {
+    const Vec<D> alpha = Set(d, Alpha<TFromD<D>>());
+    return MulAdd(alpha, v, v1);
+  }
+};
+
+// Functor for Transform2(): returns MulAdd(v, v1, v2); the tag is unused.
+struct FMA4 {
+  template <class D, class V>
+  Vec<D> operator()(D /*d*/, V v, V v1, V v2) const {
+    const V fused = MulAdd(v, v1, v2);
+    return fused;
+  }
+};
+
+#endif  // !HWY_GENERIC_LAMBDA
+
+// Invokes Test (e.g. TestTransform1) for every element count in [0, 2 * N)
+// and every pair of misalignments, sharing one RandomState across all calls.
+// T comes from whichever type-iteration helper invokes this functor
+// (ForFloatTypes or ForIntegerTypes in this file).
+template <class Test>
+struct ForeachCountAndMisalign {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) const {
+    RandomState rng;
+    const size_t N = Lanes(d);
+    constexpr size_t kNumMisalign = 3;
+    const size_t misalignments[kNumMisalign] = {0, N / 4, 3 * N / 5};
+
+    for (size_t count = 0; count < 2 * N; ++count) {
+      for (size_t ia = 0; ia < kNumMisalign; ++ia) {
+        for (size_t ib = 0; ib < kNumMisalign; ++ib) {
+          Test()(d, count, misalignments[ia], misalignments[ib], rng);
+        }
+      }
+    }
+  }
+};
+
+// Checks Generate(), which only writes output and performs no loads.
+struct TestGenerate {
+  template <class D>
+  void operator()(D d, size_t count, size_t misalign_a, size_t /*misalign_b*/,
+                  RandomState& /*rng*/) {
+    using T = TFromD<D>;
+    // One extra element holds a sentinel for detecting writes past the end.
+    AlignedFreeUniquePtr<T[]> storage =
+        AllocateAligned<T>(misalign_a + count + 1);
+    T* const actual = storage.get() + misalign_a;
+    actual[count] = T{0};  // sentinel
+
+    // Reference output: twice the index.
+    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
+    for (size_t idx = 0; idx != count; ++idx) {
+      expected[idx] = static_cast<T>(2 * idx);
+    }
+
+    // TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that
+    // the attribute also applies to lambdas? If so, remove HWY_ATTR.
+#if HWY_GENERIC_LAMBDA
+    const auto gen2 = [](const auto d, const auto vidx)
+                          HWY_ATTR { return BitCast(d, Add(vidx, vidx)); };
+#else
+    const Gen2 gen2;
+#endif
+    Generate(d, actual, count, gen2);
+    HWY_ASSERT_EQ(T{0}, actual[count]);  // did not write past end
+
+    hwy::detail::AssertArrayEqual(hwy::detail::MakeTypeInfo<T>(),
+                                  expected.get(), actual, count,
+                                  hwy::TargetName(HWY_TARGET), __FILE__,
+                                  __LINE__);
+  }
+};
+
+// Checks Transform() (zero extra input arrays) against SimpleSCAL.
+struct TestTransform {
+  template <class D>
+  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
+                  RandomState& rng) {
+    // Only one array is used, so skip the calls with nonzero misalign_b.
+    if (misalign_b != 0) return;
+    using T = TFromD<D>;
+    // HWY_MAX prevents an error if the size to allocate is zero.
+    AlignedFreeUniquePtr<T[]> storage =
+        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
+    T* const inout = storage.get() + misalign_a;
+    for (size_t idx = 0; idx != count; ++idx) {
+      inout[idx] = Random<T>(rng);
+    }
+
+    // Scalar reference, computed before `inout` is overwritten in place.
+    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
+    SimpleSCAL(inout, expected.get(), count);
+
+    // TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that
+    // the attribute also applies to lambdas? If so, remove HWY_ATTR.
+#if HWY_GENERIC_LAMBDA
+    const auto scal = [](const auto d, const auto v)
+                          HWY_ATTR { return Mul(Set(d, Alpha<T>()), v); };
+#else
+    const SCAL scal;
+#endif
+    Transform(d, inout, count, scal);
+
+    hwy::detail::AssertArrayEqual(hwy::detail::MakeTypeInfo<T>(),
+                                  expected.get(), inout, count,
+                                  hwy::TargetName(HWY_TARGET), __FILE__,
+                                  __LINE__);
+  }
+};
+
+// Checks Transform1() (one extra input array) against SimpleAXPY.
+struct TestTransform1 {
+  template <class D>
+  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
+                  RandomState& rng) {
+    using T = TFromD<D>;
+    // HWY_MAX prevents an error if the size to allocate is zero.
+    AlignedFreeUniquePtr<T[]> storage_a =
+        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
+    AlignedFreeUniquePtr<T[]> storage_b =
+        AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
+    T* const inout = storage_a.get() + misalign_a;
+    T* const in1 = storage_b.get() + misalign_b;
+    for (size_t idx = 0; idx != count; ++idx) {
+      inout[idx] = Random<T>(rng);
+      in1[idx] = Random<T>(rng);
+    }
+
+    // Scalar reference, computed before `inout` is overwritten in place.
+    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
+    SimpleAXPY(inout, in1, expected.get(), count);
+
+#if HWY_GENERIC_LAMBDA
+    const auto axpy = [](const auto d, const auto v, const auto v1) HWY_ATTR {
+      return MulAdd(Set(d, Alpha<T>()), v, v1);
+    };
+#else
+    const AXPY axpy;
+#endif
+    Transform1(d, inout, count, in1, axpy);
+
+    hwy::detail::AssertArrayEqual(hwy::detail::MakeTypeInfo<T>(),
+                                  expected.get(), inout, count,
+                                  hwy::TargetName(HWY_TARGET), __FILE__,
+                                  __LINE__);
+  }
+};
+
+// Checks Transform2() (two extra input arrays) against SimpleFMA4.
+struct TestTransform2 {
+  template <class D>
+  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
+                  RandomState& rng) {
+    using T = TFromD<D>;
+    // HWY_MAX prevents an error if the size to allocate is zero.
+    AlignedFreeUniquePtr<T[]> storage_a =
+        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
+    AlignedFreeUniquePtr<T[]> storage_b =
+        AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
+    // The third array reuses misalign_a.
+    AlignedFreeUniquePtr<T[]> storage_c =
+        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
+    T* const inout = storage_a.get() + misalign_a;
+    T* const in1 = storage_b.get() + misalign_b;
+    T* const in2 = storage_c.get() + misalign_a;
+    for (size_t idx = 0; idx != count; ++idx) {
+      inout[idx] = Random<T>(rng);
+      in1[idx] = Random<T>(rng);
+      in2[idx] = Random<T>(rng);
+    }
+
+    // Scalar reference, computed before `inout` is overwritten in place.
+    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
+    SimpleFMA4(inout, in1, in2, expected.get(), count);
+
+#if HWY_GENERIC_LAMBDA
+    const auto fma4 = [](auto /*d*/, auto v, auto v1, auto v2)
+                          HWY_ATTR { return MulAdd(v, v1, v2); };
+#else
+    const FMA4 fma4;
+#endif
+    Transform2(d, inout, count, in1, in2, fma4);
+
+    hwy::detail::AssertArrayEqual(hwy::detail::MakeTypeInfo<T>(),
+                                  expected.get(), inout, count,
+                                  hwy::TargetName(HWY_TARGET), __FILE__,
+                                  __LINE__);
+  }
+};
+
+// Predicate functor for ReplaceIf(): returns a mask of the lanes of `v` that
+// are equal to the value given at construction.
+template <typename T>
+class IfEq {
+ public:
+  // `explicit` prevents unintended implicit conversion from a bare T.
+  explicit IfEq(T val) : val_(val) {}
+
+  // `d` is the tag used to broadcast the stored value for the comparison.
+  template <class D, class V>
+  Mask<D> operator()(D d, V v) const {
+    return Eq(v, Set(d, val_));
+  }
+
+ private:
+  T val_;  // value to compare against
+};
+
+// Checks Replace() and ReplaceIf() against a scalar reference, using each
+// element of the input in turn as the value to be replaced.
+struct TestReplace {
+  template <class D>
+  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
+                  RandomState& rng) {
+    // Only one misalignment is used, so skip the calls with nonzero misalign_b.
+    if (misalign_b != 0) return;
+    // Nothing to replace in an empty array; this also avoids zero-sized
+    // allocations below.
+    if (count == 0) return;
+    using T = TFromD<D>;
+    AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + count);
+    T* a = pa.get() + misalign_a;
+    for (size_t i = 0; i < count; ++i) {
+      a[i] = Random<T>(rng);
+    }
+    // Second, aligned buffer that receives a copy of `a` for ReplaceIf.
+    AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(count);
+
+    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(count);
+
+    // Every position is visited. A previously built list of random positions
+    // was never read and has been removed.
+    for (size_t pos = 0; pos < count; ++pos) {
+      const T old_t = a[pos];
+      const T new_t = Random<T>(rng);
+      for (size_t i = 0; i < count; ++i) {
+        expected[i] = IsEqual(a[i], old_t) ? new_t : a[i];
+      }
+
+      // Copy so ReplaceIf gets the same input (and thus also outputs expected)
+      memcpy(pb.get(), a, count * sizeof(T));
+
+      // `a` is modified in place, so later iterations see earlier results.
+      Replace(d, a, count, new_t, old_t);
+      HWY_ASSERT_ARRAY_EQ(expected.get(), a, count);
+
+      ReplaceIf(d, pb.get(), count, new_t, IfEq<T>(old_t));
+      HWY_ASSERT_ARRAY_EQ(expected.get(), pb.get(), count);
+    }
+  }
+};
+
+// Runs TestGenerate for integer types only: the test BitCast-s the indices,
+// which does not work for floats.
+void TestAllGenerate() {
+  using PerVector = ForPartialVectors<ForeachCountAndMisalign<TestGenerate>>;
+  const PerVector test_func = PerVector();
+  ForIntegerTypes(test_func);
+}
+
+// Runs TestTransform for the types covered by ForFloatTypes.
+void TestAllTransform() {
+  using PerVector = ForPartialVectors<ForeachCountAndMisalign<TestTransform>>;
+  const PerVector test_func = PerVector();
+  ForFloatTypes(test_func);
+}
+
+// Runs TestTransform1 for the types covered by ForFloatTypes.
+void TestAllTransform1() {
+  using PerVector = ForPartialVectors<ForeachCountAndMisalign<TestTransform1>>;
+  const PerVector test_func = PerVector();
+  ForFloatTypes(test_func);
+}
+
+// Runs TestTransform2 for the types covered by ForFloatTypes.
+void TestAllTransform2() {
+  using PerVector = ForPartialVectors<ForeachCountAndMisalign<TestTransform2>>;
+  const PerVector test_func = PerVector();
+  ForFloatTypes(test_func);
+}
+
+// Runs TestReplace for the types covered by ForFloatTypes.
+void TestAllReplace() {
+  using PerVector = ForPartialVectors<ForeachCountAndMisalign<TestReplace>>;
+  const PerVector test_func = PerVector();
+  ForFloatTypes(test_func);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+// Test registration. Guarded by HWY_ONCE; presumably this keeps the
+// registration from being repeated for every target this file is compiled
+// for (see hwy/foreach_target.h) - TODO confirm.
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(TransformTest);
+// One line per TestAll* function defined above.
+HWY_EXPORT_AND_TEST_P(TransformTest, TestAllGenerate);
+HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform);
+HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform1);
+HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform2);
+HWY_EXPORT_AND_TEST_P(TransformTest, TestAllReplace);
+}  // namespace hwy
+
+#endif
@@ -0,0 +1,252 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Include guard (still compiled once per target)
+#include <cmath>
+
+#if defined(HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
+#undef HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
+#else
+#define HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
+#endif
+
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Dot product of two arrays. `Compute` has one overload for float/double and
+// one for bfloat16 inputs (the latter accumulates and returns float). The
+// kAssumptions template argument lets callers promise properties of the
+// arguments, which removes the corresponding fallback code paths.
+struct Dot {
+  // Specify zero or more of these, ORed together, as the kAssumptions template
+  // argument to Compute. Each one may improve performance or reduce code size,
+  // at the cost of additional requirements on the arguments.
+  enum Assumptions {
+    // num_elements is at least N, which may be up to HWY_MAX_BYTES / sizeof(T).
+    kAtLeastOneVector = 1,
+    // num_elements is divisible by N (a power of two, so this can be used if
+    // the problem size is known to be a power of two >= HWY_MAX_BYTES /
+    // sizeof(T)).
+    kMultipleOfVector = 2,
+    // RoundUpTo(num_elements, N) elements are accessible; their value does not
+    // matter (will be treated as if they were zero).
+    kPaddedToVector = 4,
+  };
+
+  // Returns sum{pa[i] * pb[i]} for float or double inputs. Aligning the
+  // pointers to a multiple of N elements is helpful but not required.
+  template <int kAssumptions, class D, typename T = TFromD<D>,
+            HWY_IF_NOT_LANE_SIZE_D(D, 2)>
+  static HWY_INLINE T Compute(const D d, const T* const HWY_RESTRICT pa,
+                              const T* const HWY_RESTRICT pb,
+                              const size_t num_elements) {
+    static_assert(IsFloat<T>(), "MulAdd requires float type");
+    using V = decltype(Zero(d));
+
+    const size_t N = Lanes(d);
+    size_t i = 0;
+
+    // Decode the caller's promises into compile-time flags.
+    constexpr bool kIsAtLeastOneVector =
+        (kAssumptions & kAtLeastOneVector) != 0;
+    constexpr bool kIsMultipleOfVector =
+        (kAssumptions & kMultipleOfVector) != 0;
+    constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
+
+    // Won't be able to do a full vector load without padding => scalar loop.
+    // Only reachable when the caller made none of the three promises.
+    if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
+        HWY_UNLIKELY(num_elements < N)) {
+      // Only 2x unroll to avoid excessive code size.
+      T sum0 = T(0);
+      T sum1 = T(0);
+      for (; i + 2 <= num_elements; i += 2) {
+        sum0 += pa[i + 0] * pb[i + 0];
+        sum1 += pa[i + 1] * pb[i + 1];
+      }
+      // At most one element is left when num_elements is odd.
+      if (i < num_elements) {
+        sum1 += pa[i] * pb[i];
+      }
+      return sum0 + sum1;
+    }
+
+    // Compiler doesn't make independent sum* accumulators, so unroll manually.
+    // 2 FMA ports * 4 cycle latency = up to 8 in-flight, but that is excessive
+    // for unaligned inputs (each unaligned pointer halves the throughput
+    // because it occupies both L1 load ports for a cycle). We cannot have
+    // arrays of vectors on RVV/SVE, so always unroll 4x.
+    V sum0 = Zero(d);
+    V sum1 = Zero(d);
+    V sum2 = Zero(d);
+    V sum3 = Zero(d);
+
+    // Main loop: unrolled
+    for (; i + 4 * N <= num_elements; /* i += 4 * N */) {  // incr in loop
+      const auto a0 = LoadU(d, pa + i);
+      const auto b0 = LoadU(d, pb + i);
+      i += N;
+      sum0 = MulAdd(a0, b0, sum0);
+      const auto a1 = LoadU(d, pa + i);
+      const auto b1 = LoadU(d, pb + i);
+      i += N;
+      sum1 = MulAdd(a1, b1, sum1);
+      const auto a2 = LoadU(d, pa + i);
+      const auto b2 = LoadU(d, pb + i);
+      i += N;
+      sum2 = MulAdd(a2, b2, sum2);
+      const auto a3 = LoadU(d, pa + i);
+      const auto b3 = LoadU(d, pb + i);
+      i += N;
+      sum3 = MulAdd(a3, b3, sum3);
+    }
+
+    // Up to 3 iterations of whole vectors
+    for (; i + N <= num_elements; i += N) {
+      const auto a = LoadU(d, pa + i);
+      const auto b = LoadU(d, pb + i);
+      sum0 = MulAdd(a, b, sum0);
+    }
+
+    if (!kIsMultipleOfVector) {
+      const size_t remaining = num_elements - i;
+      if (remaining != 0) {
+        if (kIsPaddedToVector) {
+          // Full load (the caller promised padding); lanes at or beyond
+          // `remaining` are zeroed so they do not contribute to the sum.
+          const auto mask = FirstN(d, remaining);
+          const auto a = LoadU(d, pa + i);
+          const auto b = LoadU(d, pb + i);
+          sum1 = MulAdd(IfThenElseZero(mask, a), IfThenElseZero(mask, b), sum1);
+        } else {
+          // Unaligned load such that the last element is in the highest lane -
+          // ensures we do not touch any elements outside the valid range.
+          // If we get here, then num_elements >= N.
+          HWY_DASSERT(i >= N);
+          // Net effect: i moves back by N - remaining (the unsigned
+          // intermediate `remaining - N` wraps, which is harmless here).
+          i += remaining - N;
+          // The first N - remaining lanes were already accumulated by the
+          // loops above, so zero them.
+          const auto skip = FirstN(d, N - remaining);
+          const auto a = LoadU(d, pa + i);  // always unaligned
+          const auto b = LoadU(d, pb + i);
+          sum1 = MulAdd(IfThenZeroElse(skip, a), IfThenZeroElse(skip, b), sum1);
+        }
+      }
+    }  // !kIsMultipleOfVector
+
+    // Reduction tree: sum of all accumulators by pairs, then across lanes.
+    sum0 = Add(sum0, sum1);
+    sum2 = Add(sum2, sum3);
+    sum0 = Add(sum0, sum2);
+    return GetLane(SumOfLanes(d, sum0));
+  }
+
+  // Returns sum{pa[i] * pb[i]} for bfloat16 inputs. Aligning the pointers to a
+  // multiple of N elements is helpful but not required.
+  template <int kAssumptions, class D>
+  static HWY_INLINE float Compute(const D d,
+                                  const bfloat16_t* const HWY_RESTRICT pa,
+                                  const bfloat16_t* const HWY_RESTRICT pb,
+                                  const size_t num_elements) {
+    // du16 is used for masking the bf16 lanes via BitCast; df32 is the tag of
+    // the float accumulators.
+    const RebindToUnsigned<D> du16;
+    const Repartition<float, D> df32;
+
+    using V = decltype(Zero(df32));
+    const size_t N = Lanes(d);
+    size_t i = 0;
+
+    // Decode the caller's promises into compile-time flags.
+    constexpr bool kIsAtLeastOneVector =
+        (kAssumptions & kAtLeastOneVector) != 0;
+    constexpr bool kIsMultipleOfVector =
+        (kAssumptions & kMultipleOfVector) != 0;
+    constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
+
+    // Won't be able to do a full vector load without padding => scalar loop.
+    // Only reachable when the caller made none of the three promises.
+    if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
+        HWY_UNLIKELY(num_elements < N)) {
+      float sum0 = 0.0f;  // Only 2x unroll to avoid excessive code size for..
+      float sum1 = 0.0f;  // this unlikely(?) case.
+      for (; i + 2 <= num_elements; i += 2) {
+        sum0 += F32FromBF16(pa[i + 0]) * F32FromBF16(pb[i + 0]);
+        sum1 += F32FromBF16(pa[i + 1]) * F32FromBF16(pb[i + 1]);
+      }
+      // At most one element is left when num_elements is odd.
+      if (i < num_elements) {
+        sum1 += F32FromBF16(pa[i]) * F32FromBF16(pb[i]);
+      }
+      return sum0 + sum1;
+    }
+
+    // See comment in the other Compute() overload. Unroll 2x, but we need
+    // twice as many sums for ReorderWidenMulAccumulate.
+    V sum0 = Zero(df32);
+    V sum1 = Zero(df32);
+    V sum2 = Zero(df32);
+    V sum3 = Zero(df32);
+
+    // Main loop: unrolled
+    for (; i + 2 * N <= num_elements; /* i += 2 * N */) {  // incr in loop
+      const auto a0 = LoadU(d, pa + i);
+      const auto b0 = LoadU(d, pb + i);
+      i += N;
+      sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1);
+      const auto a1 = LoadU(d, pa + i);
+      const auto b1 = LoadU(d, pb + i);
+      i += N;
+      sum2 = ReorderWidenMulAccumulate(df32, a1, b1, sum2, sum3);
+    }
+
+    // Possibly one more iteration of whole vectors
+    if (i + N <= num_elements) {
+      const auto a0 = LoadU(d, pa + i);
+      const auto b0 = LoadU(d, pb + i);
+      i += N;
+      sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1);
+    }
+
+    if (!kIsMultipleOfVector) {
+      const size_t remaining = num_elements - i;
+      if (remaining != 0) {
+        if (kIsPaddedToVector) {
+          // Full load (the caller promised padding); lanes at or beyond
+          // `remaining` are zeroed so they do not contribute to the sum.
+          const auto mask = FirstN(du16, remaining);
+          const auto va = LoadU(d, pa + i);
+          const auto vb = LoadU(d, pb + i);
+          const auto a16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, va)));
+          const auto b16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, vb)));
+          sum2 = ReorderWidenMulAccumulate(df32, a16, b16, sum2, sum3);
+
+        } else {
+          // Unaligned load such that the last element is in the highest lane -
+          // ensures we do not touch any elements outside the valid range.
+          // If we get here, then num_elements >= N.
+          HWY_DASSERT(i >= N);
+          // Net effect: i moves back by N - remaining (the unsigned
+          // intermediate `remaining - N` wraps, which is harmless here).
+          i += remaining - N;
+          // The first N - remaining lanes were already accumulated by the
+          // loops above, so zero them.
+          const auto skip = FirstN(du16, N - remaining);
+          const auto va = LoadU(d, pa + i);  // always unaligned
+          const auto vb = LoadU(d, pb + i);
+          const auto a16 = BitCast(d, IfThenZeroElse(skip, BitCast(du16, va)));
+          const auto b16 = BitCast(d, IfThenZeroElse(skip, BitCast(du16, vb)));
+          sum2 = ReorderWidenMulAccumulate(df32, a16, b16, sum2, sum3);
+        }
+      }
+    }  // !kIsMultipleOfVector
+
+    // Reduction tree: sum of all accumulators by pairs, then across lanes.
+    sum0 = Add(sum0, sum1);
+    sum2 = Add(sum2, sum3);
+    sum0 = Add(sum0, sum2);
+    return GetLane(SumOfLanes(df32, sum0));
+  }
+};
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif  // HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
@@ -0,0 +1,167 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "hwy/aligned_allocator.h"
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/dot/dot_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+#include "hwy/contrib/dot/dot-inl.h"
+#include "hwy/tests/test_util-inl.h"
+// clang-format on
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Scalar reference dot product. Each product is computed in T and summed in a
+// double accumulator before converting back to T.
+template <typename T>
+HWY_NOINLINE T SimpleDot(const T* pa, const T* pb, size_t num) {
+  double total = 0.0;
+  for (size_t k = 0; k != num; ++k) total += pa[k] * pb[k];
+  return static_cast<T>(total);
+}
+
+HWY_NOINLINE float SimpleDot(const bfloat16_t* pa, const bfloat16_t* pb,
+                             size_t num) {
+  // Widen each bfloat16 pair to float; accumulate in float, in index order.
+  float total = 0.0f;
+  for (size_t k = 0; k != num; ++k)
+    total += F32FromBF16(pa[k]) * F32FromBF16(pb[k]);
+  return total;
+}
+
+template <typename T>  // Generic lanes: plain static_cast from float.
+void SetValue(const float value, T* HWY_RESTRICT ptr) {
+  ptr[0] = static_cast<T>(value);
+}
+void SetValue(const float value, bfloat16_t* HWY_RESTRICT ptr) {
+  ptr[0] = BF16FromF32(value);  // bfloat16_t lanes: convert via BF16FromF32.
+}
+
+class TestDot {  // Test functor; the public entry point is operator()(T, D).
+  // Computes one dot product and verifies it against the SimpleDot reference.
+  template <int kAssumptions, class D>
+  void Test(D d, size_t num, size_t misalign_a, size_t misalign_b,
+            RandomState& rng) {
+    using T = TFromD<D>;
+    const size_t N = Lanes(d);
+    const auto random_t = [&rng]() {  // Multiples of 1/64 in [-8, 8)
+      const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
+      return static_cast<float>(bits - 512) * (1.0f / 64);
+    };
+    // With kPaddedToVector, the arrays are padded to a multiple of N elements.
+    const size_t padded =
+        (kAssumptions & Dot::kPaddedToVector) ? RoundUpTo(num, N) : num;
+    AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + padded);
+    AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(misalign_b + padded);
+    T* a = pa.get() + misalign_a;
+    T* b = pb.get() + misalign_b;
+    size_t i = 0;
+    for (; i < num; ++i) {
+      SetValue(random_t(), a + i);
+      SetValue(random_t(), b + i);
+    }
+    // Fill padding with NaN - the values are not used, but avoids MSAN errors.
+    for (; i < padded; ++i) {
+      ScalableTag<float> df1;
+      SetValue(GetLane(NaN(df1)), a + i);
+      SetValue(GetLane(NaN(df1)), b + i);
+    }
+    // Inputs lie in [-8, 8), hence |dot| cannot exceed 8 * 8 * num.
+    const auto expected = SimpleDot(a, b, num);
+    const auto actual = Dot::Compute<kAssumptions>(d, a, b, num);
+    const auto max = static_cast<decltype(actual)>(8 * 8 * num);
+    HWY_ASSERT(-max <= actual && actual <= max);
+    HWY_ASSERT(expected - 1E-4 <= actual && actual <= expected + 1E-4);
+  }
+
+  // Runs Test for all 3x3 combinations of misalignments of the two inputs.
+  template <int kAssumptions, class D>
+  void ForeachMisalign(D d, size_t num, RandomState& rng) {
+    const size_t N = Lanes(d);
+    const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
+    for (size_t ma : misalignments) {
+      for (size_t mb : misalignments) {
+        Test<kAssumptions>(d, num, ma, mb, rng);
+      }
+    }
+  }
+
+  // Runs tests with various lengths compatible with the given assumptions.
+  template <int kAssumptions, class D>
+  void ForeachCount(D d, RandomState& rng) {
+    const size_t N = Lanes(d);
+    const size_t counts[] = {1,
+                             3,
+                             7,
+                             16,
+                             HWY_MAX(N / 2, 1),
+                             HWY_MAX(2 * N / 3, 1),
+                             N,
+                             N + 1,
+                             4 * N / 3,
+                             3 * N,
+                             8 * N,
+                             8 * N + 2};
+    for (size_t num : counts) {
+      if ((kAssumptions & Dot::kAtLeastOneVector) && num < N) continue;
+      if ((kAssumptions & Dot::kMultipleOfVector) && (num % N) != 0) continue;
+      ForeachMisalign<kAssumptions>(d, num, rng);
+    }
+  }
+
+ public:
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    // All 8 combinations of the three length-related flags:
+    ForeachCount<0>(d, rng);
+    ForeachCount<Dot::kAtLeastOneVector>(d, rng);
+    ForeachCount<Dot::kMultipleOfVector>(d, rng);
+    ForeachCount<Dot::kMultipleOfVector | Dot::kAtLeastOneVector>(d, rng);
+    ForeachCount<Dot::kPaddedToVector>(d, rng);
+    ForeachCount<Dot::kPaddedToVector | Dot::kAtLeastOneVector>(d, rng);
+    ForeachCount<Dot::kPaddedToVector | Dot::kMultipleOfVector>(d, rng);
+    ForeachCount<Dot::kPaddedToVector | Dot::kMultipleOfVector |
+                 Dot::kAtLeastOneVector>(d, rng);
+  }
+};
+// Entry points, registered via HWY_EXPORT_AND_TEST_P at the end of this file.
+void TestAllDot() { ForFloatTypes(ForPartialVectors<TestDot>()); }
+void TestAllDotBF16() { ForShrinkableVectors<TestDot>()(bfloat16_t()); }
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(DotTest);
+HWY_EXPORT_AND_TEST_P(DotTest, TestAllDot);
+HWY_EXPORT_AND_TEST_P(DotTest, TestAllDotBF16);
+}  // namespace hwy
+
+#endif  // HWY_ONCE
@@ -0,0 +1,145 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/image/image.h"
+
+#include <algorithm>  // swap
+#include <cstddef>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/image/image.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+size_t GetVectorSize() { return Lanes(ScalableTag<uint8_t>()); }  // In bytes
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(GetVectorSize);  // Local function.
+}  // namespace
+// Returns the result of GetVectorSize() as selected by HWY_DYNAMIC_DISPATCH.
+size_t ImageBase::VectorSize() {
+  // Do not cache result - must return the current value, which may be greater
+  // than the first call if it was subject to DisableTargets!
+  return HWY_DYNAMIC_DISPATCH(GetVectorSize)();
+}
+
+size_t ImageBase::BytesPerRow(const size_t xsize, const size_t sizeof_t) {
+  // Returns the row stride in bytes for `xsize` elements of `sizeof_t` bytes.
+  constexpr size_t kAlias = 2048;  // Strides must not be a multiple of this.
+  const size_t vec_size = VectorSize();
+  size_t valid_bytes = xsize * sizeof_t;
+
+  // Allow unaligned accesses starting at the last valid value - this may raise
+  // msan errors unless the user calls InitializePaddingForUnalignedAccesses.
+  // Skip for the scalar case because no extra lanes will be loaded.
+  if (vec_size != 1) {
+    HWY_DASSERT(vec_size >= sizeof_t);
+    valid_bytes += vec_size - sizeof_t;
+  }
+
+  // Round up to vector and cache line size.
+  const size_t align = HWY_MAX(vec_size, HWY_ALIGNMENT);
+  size_t bytes_per_row = RoundUpTo(valid_bytes, align);
+
+  // CPUs guard against read-after-write hazards by comparing only the lower
+  // 11 address bits, so a stride that is a multiple of 2 KiB creates false
+  // dependencies between writes to consecutive rows. Test against kAlias, not
+  // HWY_ALIGNMENT: after rounding to `align`, the latter matched every stride,
+  // padding each row and possibly yielding a 2 KiB multiple (1984 + 64).
+  if (bytes_per_row % kAlias == 0) bytes_per_row += align;
+
+  HWY_DASSERT(bytes_per_row % align == 0);
+  return bytes_per_row;
+}
+
+ImageBase::ImageBase(const size_t xsize, const size_t ysize,
+                     const size_t sizeof_t)
+    : xsize_(static_cast<uint32_t>(xsize)),
+      ysize_(static_cast<uint32_t>(ysize)),
+      bytes_per_row_(0),
+      bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {
+  HWY_ASSERT(sizeof_t == 1 || sizeof_t == 2 || sizeof_t == 4 || sizeof_t == 8);
+
+  // Zero-sized (e.g. lazily-allocated) images stay unallocated, because even
+  // "zero" bytes would still have padding/bookkeeping overhead.
+  if (xsize == 0 || ysize == 0) return;
+
+  bytes_per_row_ = BytesPerRow(xsize, sizeof_t);
+  bytes_ = AllocateAligned<uint8_t>(bytes_per_row_ * ysize);
+  HWY_ASSERT(bytes_.get() != nullptr);
+  InitializePadding(sizeof_t, Padding::kRoundUp);
+}
+
+ImageBase::ImageBase(const size_t xsize, const size_t ysize,
+                     const size_t bytes_per_row, void* const aligned)
+    : xsize_(static_cast<uint32_t>(xsize)),
+      ysize_(static_cast<uint32_t>(ysize)),
+      bytes_per_row_(bytes_per_row),
+      bytes_(static_cast<uint8_t*>(aligned),
+             AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {  // Non-owning
+  const size_t vec_size = VectorSize();  // Required alignment in bytes
+  HWY_ASSERT(bytes_per_row % vec_size == 0);
+  HWY_ASSERT(reinterpret_cast<uintptr_t>(aligned) % vec_size == 0);
+}
+
+void ImageBase::InitializePadding(const size_t sizeof_t, Padding padding) {
+#if HWY_IS_MSAN || HWY_IDE
+  // Only compiled for MSAN (or IDE parsing); other builds take the #else path.
+  if (xsize_ == 0 || ysize_ == 0) return;
+  const size_t vec_bytes = VectorSize();  // Bytes, independent of sizeof_t!
+  if (vec_bytes == 1) return;             // Scalar mode: no padding needed
+
+  // [begin, end) is the byte range past the valid samples to be zeroed.
+  const size_t begin = xsize_ * sizeof_t;
+  size_t end = begin + vec_bytes - sizeof_t;  // Padding::kUnaligned
+  if (padding == Padding::kRoundUp) end = RoundUpTo(begin, vec_bytes);
+  if (begin == end) return;
+
+  for (size_t y = 0; y != ysize_; ++y) {
+    uint8_t* HWY_RESTRICT row = static_cast<uint8_t*>(VoidRow(y));
+#if defined(__clang__) && (__clang_major__ <= 6)
+    // There's a bug in msan in clang-6 when handling AVX2 operations. This
+    // workaround allows tests to pass on msan, although it is slower and
+    // prevents msan warnings from uninitialized images.
+    memset(row, 0, end);
+#else
+    memset(row + begin, 0, end - begin);
+#endif  // clang6
+  }
+#else
+  (void)sizeof_t;
+  (void)padding;
+#endif  // HWY_IS_MSAN
+}
+
+void ImageBase::Swap(ImageBase& other) {
+  std::swap(bytes_, other.bytes_);  // Buffer pointer and its deleter
+  std::swap(bytes_per_row_, other.bytes_per_row_);
+  std::swap(ysize_, other.ysize_);
+  std::swap(xsize_, other.xsize_);
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,471 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_
+#define HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_
+
+// SIMD/multicore-friendly planar image representation with row accessors.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <cstddef>
+#include <utility>  // std::move
+
+#include "hwy/aligned_allocator.h"
+#include "hwy/base.h"
+#include "hwy/highway_export.h"
+
+namespace hwy {
+
+// Type-independent parts of Image<> - reduces code duplication and facilitates
+// moving member function implementations to cc file.
+struct HWY_CONTRIB_DLLEXPORT ImageBase {
+  // Returns required alignment in bytes for externally allocated memory.
+  static size_t VectorSize();
+
+  // Returns distance [bytes] between the start of two consecutive rows, a
+  // multiple of VectorSize but NOT kAlias (see implementation).
+  static size_t BytesPerRow(const size_t xsize, const size_t sizeof_t);
+
+  // No allocation (for output params or unused images)
+  ImageBase()
+      : xsize_(0),
+        ysize_(0),
+        bytes_per_row_(0),
+        bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {}
+
+  // Allocates memory (this is the common case)
+  ImageBase(size_t xsize, size_t ysize, size_t sizeof_t);
+
+  // References but does not take ownership of external memory. Useful for
+  // interoperability with other libraries. `aligned` must be aligned to a
+  // multiple of VectorSize() and `bytes_per_row` must also be a multiple of
+  // VectorSize() or preferably equal to BytesPerRow().
+  ImageBase(size_t xsize, size_t ysize, size_t bytes_per_row, void* aligned);
+
+  // Copy construction/assignment is forbidden to avoid inadvertent copies,
+  // which can be very expensive. Use CopyImageTo() instead.
+  ImageBase(const ImageBase& other) = delete;
+  ImageBase& operator=(const ImageBase& other) = delete;
+
+  // Move constructor (required for returning Image from function)
+  ImageBase(ImageBase&& other) noexcept = default;
+
+  // Move assignment (required for std::vector)
+  ImageBase& operator=(ImageBase&& other) noexcept = default;
+
+  void Swap(ImageBase& other);  // Exchanges all members with `other`.
+
+  // Useful for pre-allocating image with some padding for alignment purposes
+  // and later reporting the actual valid dimensions. Caller is responsible
+  // for ensuring xsize/ysize are <= the original dimensions.
+  void ShrinkTo(const size_t xsize, const size_t ysize) {
+    xsize_ = static_cast<uint32_t>(xsize);
+    ysize_ = static_cast<uint32_t>(ysize);
+    // NOTE: we can't recompute bytes_per_row for more compact storage and
+    // better locality because that would invalidate the image contents.
+  }
+
+  // Valid dimensions in pixels, not including any padding.
+  HWY_INLINE size_t xsize() const { return xsize_; }
+  HWY_INLINE size_t ysize() const { return ysize_; }
+
+  // NOTE: do not use this for copying rows - the valid xsize may be much less.
+  HWY_INLINE size_t bytes_per_row() const { return bytes_per_row_; }
+
+  // Raw access to byte contents, for interfacing with other libraries.
+  // Unsigned char instead of char to avoid surprises (sign extension).
+  HWY_INLINE uint8_t* bytes() {
+    void* p = bytes_.get();
+    return static_cast<uint8_t * HWY_RESTRICT>(HWY_ASSUME_ALIGNED(p, 64));
+  }
+  HWY_INLINE const uint8_t* bytes() const {
+    const void* p = bytes_.get();
+    return static_cast<const uint8_t * HWY_RESTRICT>(HWY_ASSUME_ALIGNED(p, 64));
+  }
+
+ protected:
+  // Returns pointer to the start of row `y` (bounds-checked under sanitizers).
+  HWY_INLINE void* VoidRow(const size_t y) const {
+#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
+    if (y >= ysize_) {
+      HWY_ABORT("Row(%d) >= %u\n", static_cast<int>(y), ysize_);
+    }
+#endif
+
+    void* row = bytes_.get() + y * bytes_per_row_;
+    return HWY_ASSUME_ALIGNED(row, 64);
+  }
+
+  enum class Padding {
+    // Allow Load(d, row + x) for x = 0; x < xsize(); x += Lanes(d). Default.
+    kRoundUp,
+    // Allow LoadU(d, row + x) for x <= xsize() - 1. This requires an extra
+    // vector to be initialized. If done by default, this would suppress
+    // legitimate msan warnings. We therefore require users to explicitly call
+    // InitializePadding before using unaligned loads (e.g. convolution).
+    kUnaligned
+  };
+
+  // Initializes the minimum bytes required to suppress msan warnings from
+  // legitimate (according to Padding mode) vector loads/stores on the right
+  // border, where some lanes are uninitialized and assumed to be unused.
+  void InitializePadding(size_t sizeof_t, Padding padding);
+
+  // (Members are non-const to enable assignment during move-assignment.)
+  uint32_t xsize_;  // In valid pixels, not including any padding.
+  uint32_t ysize_;
+  size_t bytes_per_row_;  // Includes padding.
+  AlignedFreeUniquePtr<uint8_t[]> bytes_;
+};
+
+// Single channel, aligned rows separated by padding. T must be POD.
+//
+// 'Single channel' (one 2D array per channel) simplifies vectorization
+// (repeating the same operation on multiple adjacent components) without the
+// complexity of a hybrid layout (8 R, 8 G, 8 B, ...). In particular, clients
+// can easily iterate over all components in a row and Image requires no
+// knowledge of the pixel format beyond the component type "T".
+//
+// 'Aligned' means each row is aligned to the L1 cache line size. This prevents
+// false sharing between two threads operating on adjacent rows.
+//
+// 'Padding' is still relevant because vectors could potentially be larger than
+// a cache line. By rounding up row sizes to the vector size, we allow
+// reading/writing ALIGNED vectors whose first lane is a valid sample. This
+// avoids needing a separate loop to handle remaining unaligned lanes.
+//
+// This image layout could also be achieved with a vector and a row accessor
+// function, but a class wrapper with support for "deleter" allows wrapping
+// existing memory allocated by clients without copying the pixels. It also
+// provides convenient accessors for xsize/ysize, which shortens function
+// argument lists. Supports move-construction so it can be stored in containers.
+template <typename ComponentType>
+class Image : public ImageBase {
+ public:
+  using T = ComponentType;
+
+  Image() = default;  // Empty image, no allocation.
+  Image(const size_t xsize, const size_t ysize)
+      : ImageBase(xsize, ysize, sizeof(T)) {}  // Allocates.
+  Image(const size_t xsize, const size_t ysize, size_t bytes_per_row,
+        void* aligned)
+      : ImageBase(xsize, ysize, bytes_per_row, aligned) {}  // Non-owning.
+  // Call before unaligned loads near the right border (Padding::kUnaligned).
+  void InitializePaddingForUnalignedAccesses() {
+    InitializePadding(sizeof(T), Padding::kUnaligned);
+  }
+  // Row accessors; `y` is only range-checked in sanitizer builds (VoidRow).
+  HWY_INLINE const T* ConstRow(const size_t y) const {
+    return static_cast<const T*>(VoidRow(y));
+  }
+  HWY_INLINE const T* ConstRow(const size_t y) {
+    return static_cast<const T*>(VoidRow(y));
+  }
+
+  // Returns pointer to non-const. This allows passing const Image* parameters
+  // when the callee is only supposed to fill the pixels, as opposed to
+  // allocating or resizing the image.
+  HWY_INLINE T* MutableRow(const size_t y) const {
+    return static_cast<T*>(VoidRow(y));
+  }
+  HWY_INLINE T* MutableRow(const size_t y) {
+    return static_cast<T*>(VoidRow(y));
+  }
+
+  // Returns number of pixels (some of which are padding) per row. Useful for
+  // computing other rows via pointer arithmetic. WARNING: this must
+  // NOT be used to determine xsize.
+  HWY_INLINE intptr_t PixelsPerRow() const {
+    return static_cast<intptr_t>(bytes_per_row_ / sizeof(T));
+  }
+};
+
+using ImageF = Image<float>;
+
+// A bundle of 3 same-sized images. To fill an existing Image3 using
+// single-channel producers, we also need access to each const Image*. Const
+// prevents breaking the same-size invariant, while still allowing pixels to be
+// changed via MutableRow.
+template <typename ComponentType>
+class Image3 {
+ public:
+  using T = ComponentType;
+  using ImageT = Image<T>;
+  static constexpr size_t kNumPlanes = 3;
+
+  Image3() : planes_{ImageT(), ImageT(), ImageT()} {}
+
+  Image3(const size_t xsize, const size_t ysize)
+      : planes_{ImageT(xsize, ysize), ImageT(xsize, ysize),
+                ImageT(xsize, ysize)} {}
+
+  Image3(Image3&& other) noexcept {
+    for (size_t i = 0; i < kNumPlanes; i++) {
+      planes_[i] = std::move(other.planes_[i]);
+    }
+  }
+
+  Image3(ImageT&& plane0, ImageT&& plane1, ImageT&& plane2) {
+    if (!SameSize(plane0, plane1) || !SameSize(plane0, plane2)) {
+      HWY_ABORT(
+          "Not same size: %d x %d, %d x %d, %d x %d\n",
+          static_cast<int>(plane0.xsize()), static_cast<int>(plane0.ysize()),
+          static_cast<int>(plane1.xsize()), static_cast<int>(plane1.ysize()),
+          static_cast<int>(plane2.xsize()), static_cast<int>(plane2.ysize()));
+    }
+    planes_[0] = std::move(plane0);
+    planes_[1] = std::move(plane1);
+    planes_[2] = std::move(plane2);
+  }
+
+  // Copy construction/assignment is forbidden to avoid inadvertent copies,
+  // which can be very expensive. Use CopyImageTo instead.
+  Image3(const Image3& other) = delete;
+  Image3& operator=(const Image3& other) = delete;
+
+  Image3& operator=(Image3&& other) noexcept {
+    for (size_t i = 0; i < kNumPlanes; i++) {
+      planes_[i] = std::move(other.planes_[i]);
+    }
+    return *this;
+  }
+
+  HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) const {
+    return static_cast<const T*>(VoidPlaneRow(c, y));
+  }
+  HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) {
+    return static_cast<const T*>(VoidPlaneRow(c, y));
+  }
+
+  HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) const {
+    return static_cast<T*>(VoidPlaneRow(c, y));
+  }
+  HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) {
+    return static_cast<T*>(VoidPlaneRow(c, y));
+  }
+
+  HWY_INLINE const ImageT& Plane(size_t idx) const { return planes_[idx]; }
+
+  void Swap(Image3& other) {
+    for (size_t c = 0; c < kNumPlanes; ++c) {
+      other.planes_[c].Swap(planes_[c]);
+    }
+  }
+
+  void ShrinkTo(const size_t xsize, const size_t ysize) {
+    for (ImageT& plane : planes_) {
+      plane.ShrinkTo(xsize, ysize);
+    }
+  }
+
+  // Sizes of all three images are guaranteed to be equal.
+  HWY_INLINE size_t xsize() const { return planes_[0].xsize(); }
+  HWY_INLINE size_t ysize() const { return planes_[0].ysize(); }
+  // Returns offset [bytes] from one row to the next row of the same plane.
+  // WARNING: this must NOT be used to determine xsize, nor for copying rows -
+  // the valid xsize may be much less.
+  HWY_INLINE size_t bytes_per_row() const { return planes_[0].bytes_per_row(); }
+  // Returns number of pixels (some of which are padding) per row. Useful for
+  // computing other rows via pointer arithmetic. WARNING: this must NOT be used
+  // to determine xsize.
+  HWY_INLINE intptr_t PixelsPerRow() const { return planes_[0].PixelsPerRow(); }
+
+ private:
+  // Returns pointer to the start of row `y` of plane `c`, as non-const void*
+  // (like ImageBase::VoidRow) so the Const*/Mutable* accessors can share it.
+  HWY_INLINE void* VoidPlaneRow(const size_t c, const size_t y) const {
+#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
+    if (c >= kNumPlanes || y >= ysize()) {
+      HWY_ABORT("PlaneRow(%d, %d) >= %d\n", static_cast<int>(c),
+                static_cast<int>(y), static_cast<int>(ysize()));
+    }
+#endif
+    // Use the first plane's stride because the compiler might not realize they
+    // are all equal. Thus we only need a single multiplication for all planes.
+    const size_t row_offset = y * planes_[0].bytes_per_row();
+    // Drop constness here; returning a const T* as void* would not compile.
+    void* row = const_cast<uint8_t*>(planes_[c].bytes()) + row_offset;
+    return HWY_ASSUME_ALIGNED(row, HWY_ALIGNMENT);
+  }
+
+  ImageT planes_[kNumPlanes];
+};
+
+using Image3F = Image3<float>;
+
+// Rectangular region in image(s). Factoring this out of Image instead of
+// shifting the pointer by x0/y0 allows this to apply to multiple images with
+// different resolutions. Can compare size via SameSize(rect1, rect2).
+class Rect {
+ public:
+  // Most windows are xsize_max * ysize_max, except those on the borders where
+  // begin + size_max > end.
+  constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize_max,
+                 size_t ysize_max, size_t xend, size_t yend)
+      : x0_(xbegin),
+        y0_(ybegin),
+        xsize_(ClampedSize(xbegin, xsize_max, xend)),
+        ysize_(ClampedSize(ybegin, ysize_max, yend)) {}
+
+  // Construct with origin and known size (typically from another Rect).
+  constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize, size_t ysize)
+      : x0_(xbegin), y0_(ybegin), xsize_(xsize), ysize_(ysize) {}
+
+  // Construct a rect that covers a whole image.
+  template <typename Image>
+  explicit Rect(const Image& image)
+      : Rect(0, 0, image.xsize(), image.ysize()) {}
+
+  Rect() : Rect(0, 0, 0, 0) {}
+  Rect(const Rect&) = default;
+  Rect& operator=(const Rect&) = default;
+
+  // Returns a window relative to this Rect's origin, clamped to this Rect.
+  Rect Subrect(size_t xbegin, size_t ybegin, size_t xsize_max,
+               size_t ysize_max) const {
+    return Rect(x0_ + xbegin, y0_ + ybegin, xsize_max, ysize_max, x0_ + xsize_,
+                y0_ + ysize_);
+  }
+
+  template <typename T>
+  const T* ConstRow(const Image<T>* image, size_t y) const {
+    return image->ConstRow(y + y0_) + x0_;
+  }
+
+  template <typename T>
+  T* MutableRow(const Image<T>* image, size_t y) const {
+    return image->MutableRow(y + y0_) + x0_;
+  }
+
+  template <typename T>
+  const T* ConstPlaneRow(const Image3<T>& image, size_t c, size_t y) const {
+    return image.ConstPlaneRow(c, y + y0_) + x0_;
+  }
+
+  template <typename T>
+  T* MutablePlaneRow(Image3<T>* image, const size_t c, size_t y) const {
+    return image->MutablePlaneRow(c, y + y0_) + x0_;
+  }
+
+  // Returns true if this Rect fully resides in the given image. ImageT could be
+  // Image<T> or Image3<T>; however if ImageT is Rect, results are nonsensical.
+  template <class ImageT>
+  bool IsInside(const ImageT& image) const {
+    return (x0_ + xsize_ <= image.xsize()) && (y0_ + ysize_ <= image.ysize());
+  }
+
+  size_t x0() const { return x0_; }
+  size_t y0() const { return y0_; }
+  size_t xsize() const { return xsize_; }
+  size_t ysize() const { return ysize_; }
+
+ private:
+  // Returns size_max, or whatever is left in [begin, end). Compares against
+  // end - begin so that a huge size_max cannot overflow begin + size_max.
+  static constexpr size_t ClampedSize(size_t begin, size_t size_max,
+                                      size_t end) {
+    return (end <= begin) ? 0
+                          : (size_max < end - begin ? size_max : end - begin);
+  }
+
+  size_t x0_;
+  size_t y0_;
+  size_t xsize_;
+  size_t ysize_;
+};
+
+template <class Image1, class Image2>  // Works for any image-like type(s).
+HWY_MAYBE_UNUSED bool SameSize(const Image1& image1, const Image2& image2) {
+  if (!(image1.xsize() == image2.xsize())) return false;
+  return image1.ysize() == image2.ysize();
+}
+
+// Mirrors out of bounds coordinates and returns valid coordinates unchanged.
+// We assume the radius (distance outside the image) is small compared to the
+// image size, otherwise this might not terminate.
+// The mirror is outside the last column (border pixel is also replicated).
+static HWY_INLINE HWY_MAYBE_UNUSED size_t Mirror(int64_t x,
+                                                 const int64_t xsize) {
+  HWY_DASSERT(xsize != 0);
+  // TODO(janwas): replace with branchless version
+  for (;;) {
+    if (x < 0) {
+      x = -x - 1;  // Left border: -1 => 0, -2 => 1, ...
+    } else if (x >= xsize) {
+      x = 2 * xsize - 1 - x;  // Right border: xsize => xsize - 1, ...
+    } else {
+      return static_cast<size_t>(x);  // Inside [0, xsize)
+    }
+  }
+}
+
+// Wrap modes for ensuring X/Y coordinates are in the valid range [0, size):
+
+// Mirrors via Mirror() (repeats the edge pixel once). Useful for convolutions.
+struct WrapMirror {
+  HWY_INLINE size_t operator()(const int64_t coord, const size_t size) const {
+    return Mirror(coord, static_cast<int64_t>(size));
+  }
+};
+
+// Returns the same coordinate, for when we know "coord" is already valid (e.g.
+// interior of an image). No range check is performed; `size` is ignored.
+struct WrapUnchanged {
+  HWY_INLINE size_t operator()(const int64_t coord, size_t /*size*/) const {
+    return static_cast<size_t>(coord);
+  }
+};
+
+// Similar to Wrap* but for row pointers (reduces Row() multiplications).
+
+class WrapRowMirror {
+ public:
+  // Caches the first and last row pointers of `image` (`ysize` rows).
+  template <class View>
+  WrapRowMirror(const View& image, size_t ysize)
+      : first_row_(image.ConstRow(0)), last_row_(image.ConstRow(ysize - 1)) {}
+
+  // Returns `row` if in range, else its mirror; `stride` = floats per row.
+  const float* operator()(const float* const HWY_RESTRICT row,
+                          const int64_t stride) const {
+    if (row < first_row_) {
+      // One row before => row 0, two before => row 1, ...
+      return first_row_ + ((first_row_ - row) - stride);
+    }
+    if (row > last_row_) {
+      // One row after => last row, two after => last - 1, ...
+      return last_row_ - ((row - last_row_) - stride);
+    }
+    return row;
+  }
+
+ private:
+  const float* const HWY_RESTRICT first_row_;
+  const float* const HWY_RESTRICT last_row_;
+};
+
+struct WrapRowUnchanged {  // Identity counterpart of WrapRowMirror.
+  HWY_INLINE const float* operator()(const float* const HWY_RESTRICT row,
+                                     int64_t /*stride*/) const {
+    return row;
+  }
+};
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_
@@ -0,0 +1,152 @@
+// Copyright (c) the JPEG XL Project
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/image/image.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <random>
+#include <utility>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/image/image_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target:
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Ensure we can always write full aligned vectors.
+struct TestAlignedT {
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    std::mt19937 rng(129);  // Fixed seed
+    std::uniform_int_distribution<int> dist(0, 16);  // First Iota value
+    const ScalableTag<T> d;
+
+    for (size_t ysize = 1; ysize < 4; ++ysize) {
+      for (size_t xsize = 1; xsize < 64; ++xsize) {
+        Image<T> img(xsize, ysize);
+        // Full aligned stores; the last may extend past xsize into padding.
+        for (size_t y = 0; y < ysize; ++y) {
+          T* HWY_RESTRICT row = img.MutableRow(y);
+          for (size_t x = 0; x < xsize; x += Lanes(d)) {
+            const auto values = Iota(d, static_cast<T>(dist(rng)));
+            Store(values, d, row + x);
+          }
+        }
+
+        // Sanity check to prevent optimizing out the writes
+        const auto x = std::uniform_int_distribution<size_t>(0, xsize - 1)(rng);
+        const auto y = std::uniform_int_distribution<size_t>(0, ysize - 1)(rng);
+        HWY_ASSERT(img.ConstRow(y)[x] < 16 + Lanes(d));
+      }
+    }
+  }
+};
+
+void TestAligned() { ForUnsignedTypes(TestAlignedT()); }  // Entry point.
+
+// Ensure we can write an unaligned vector starting at the last valid value.
+struct TestUnalignedT {
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    std::mt19937 rng(129);  // Fixed seed
+    std::uniform_int_distribution<int> dist(0, 3);  // Shift counts => 1..8
+    const ScalableTag<T> d;
+
+    for (size_t ysize = 1; ysize < 4; ++ysize) {
+      for (size_t xsize = 1; xsize < 128; ++xsize) {
+        Image<T> img(xsize, ysize);
+        img.InitializePaddingForUnalignedAccesses();
+
+// This test reads padding, which only works if it was initialized,
+// which only happens in MSAN builds.
+#if HWY_IS_MSAN || HWY_IDE
+        // Initialize only the valid samples
+        for (size_t y = 0; y < ysize; ++y) {
+          T* HWY_RESTRICT row = img.MutableRow(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            row[x] = static_cast<T>(1u << dist(rng));
+          }
+        }
+
+        // Read padding bits
+        auto accum = Zero(d);
+        for (size_t y = 0; y < ysize; ++y) {
+          T* HWY_RESTRICT row = img.MutableRow(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            accum = Or(accum, LoadU(d, row + x));
+          }
+        }
+
+        // Ensure padding was zero
+        const size_t N = Lanes(d);
+        auto lanes = AllocateAligned<T>(N);
+        Store(accum, d, lanes.get());
+        for (size_t i = 0; i < N; ++i) {
+          HWY_ASSERT(lanes[i] < 16);  // OR of values in {1, 2, 4, 8} and zeros
+        }
+#else  // Check that writing padding does not overwrite valid samples
+       // Initialize only the valid samples
+        for (size_t y = 0; y < ysize; ++y) {
+          T* HWY_RESTRICT row = img.MutableRow(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            row[x] = static_cast<T>(x);
+          }
+        }
+
+        // Zero padding and rightmost sample
+        for (size_t y = 0; y < ysize; ++y) {
+          T* HWY_RESTRICT row = img.MutableRow(y);
+          StoreU(Zero(d), d, row + xsize - 1);
+        }
+
+        // Ensure no samples except the rightmost were overwritten
+        for (size_t y = 0; y < ysize; ++y) {
+          T* HWY_RESTRICT row = img.MutableRow(y);
+          for (size_t x = 0; x < xsize - 1; ++x) {
+            HWY_ASSERT_EQ(static_cast<T>(x), row[x]);
+          }
+        }
+#endif
+      }
+    }
+  }
+};
+
+void TestUnaligned() { ForUnsignedTypes(TestUnalignedT()); }  // Entry point.
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(ImageTest);
+HWY_EXPORT_AND_TEST_P(ImageTest, TestAligned);
+HWY_EXPORT_AND_TEST_P(ImageTest, TestUnaligned);
+}  // namespace hwy
+
+#endif  // HWY_ONCE
@@ -0,0 +1,227 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS  // before inttypes.h
+#endif
+#include <inttypes.h>
+#include <stdio.h>
+
+#include <cfloat>  // FLT_MAX
+#include <type_traits>
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/math/math_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+#include "hwy/contrib/math/math-inl.h"
+#include "hwy/tests/test_util-inl.h"
+// clang-format on
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+template <class Out, class In>
+inline Out BitCast(const In& in) {  // copies the bytes of `in` into an Out
+  static_assert(sizeof(In) == sizeof(Out), "");
+  Out reinterpreted;
+  CopyBytes<sizeof(Out)>(&in, &reinterpreted);
+  return reinterpreted;
+}
+
+template <class T, class D>
+HWY_NOINLINE void TestMath(const std::string& name, T (*fx1)(T),
+                           Vec<D> (*fxN)(D, VecArg<Vec<D>>), D d, T min, T max,
+                           uint64_t max_error_ulp) {
+  // Compares the vector function fxN (first lane, via GetLane) with the scalar
+  // reference fx1 on inputs sampled from [min, max]; asserts at the end that
+  using UintT = MakeUnsigned<T>;  // ... the largest ULP delta <= max_error_ulp.
+  const UintT min_bits = BitCast<UintT>(min);
+  const UintT max_bits = BitCast<UintT>(max);
+  // If min is negative and max is positive, the range needs to be broken into
+  // two pieces, [+0, max] and [-0, min], otherwise [min, max].
+  int range_count = 1;
+  UintT ranges[2][2] = {{min_bits, max_bits}, {0, 0}};
+  if ((min < 0.0) && (max > 0.0)) {
+    ranges[0][0] = BitCast<UintT>(static_cast<T>(+0.0));
+    ranges[0][1] = max_bits;
+    ranges[1][0] = BitCast<UintT>(static_cast<T>(-0.0));
+    ranges[1][1] = min_bits;
+    range_count = 2;
+  }
+
+  uint64_t max_ulp = 0;
+  // Emulation is slower, so cannot afford as many.
+  constexpr UintT kSamplesPerRange = static_cast<UintT>(AdjustedReps(4000));
+  for (int range_index = 0; range_index < range_count; ++range_index) {
+    const UintT start = ranges[range_index][0];
+    const UintT stop = ranges[range_index][1];
+    const UintT step = HWY_MAX(1, ((stop - start) / kSamplesPerRange));
+    for (UintT value_bits = start; value_bits <= stop; value_bits += step) {
+      // For reasons unknown, the HWY_MAX is necessary on RVV, otherwise
+      // value_bits can be less than start, and thus possibly NaN.
+      const T value = BitCast<T>(HWY_MIN(HWY_MAX(start, value_bits), stop));
+      const T actual = GetLane(fxN(d, Set(d, value)));
+      const T expected = fx1(value);
+
+      // Skip small inputs and outputs on armv7, it flushes subnormals to zero.
+#if HWY_TARGET == HWY_NEON && HWY_ARCH_ARM_V7
+      if ((std::abs(value) < 1e-37f) || (std::abs(expected) < 1e-37f)) {
+        continue;
+      }
+#endif
+
+      const auto ulp = hwy::detail::ComputeUlpDelta(actual, expected);
+      max_ulp = HWY_MAX(max_ulp, ulp);
+      if (ulp > max_error_ulp) {  // report every offender, at full 64-bit width
+        fprintf(stderr,
+                "%s: %s(%f) expected %f actual %f ulp %" PRIu64
+                " max ulp %" PRIu64 "\n",
+                hwy::TypeName(T(), Lanes(d)).c_str(), name.c_str(), value,
+                expected, actual, static_cast<uint64_t>(ulp), max_error_ulp);
+      }
+    }
+  }
+  fprintf(stderr, "%s: %s max_ulp %" PRIu64 "\n",
+          hwy::TypeName(T(), Lanes(d)).c_str(), name.c_str(), max_ulp);
+  HWY_ASSERT(max_ulp <= max_error_ulp);
+}
+
+#define DEFINE_MATH_TEST_FUNC(NAME)                 \
+  HWY_NOINLINE void TestAll##NAME() {               \
+    ForFloatTypes(ForPartialVectors<Test##NAME>()); \
+  }  // TestAll<NAME>() runs Test<NAME> via ForFloatTypes/ForPartialVectors
+// Defines Test<NAME>: picks the F32_* or F64_* arguments based on sizeof(T).
+#undef DEFINE_MATH_TEST
+#define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \
+                         F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR)       \
+  struct Test##NAME {                                                     \
+    template <class T, class D>                                           \
+    HWY_NOINLINE void operator()(T, D d) {                                \
+      if (sizeof(T) == 4) {                                               \
+        TestMath<T, D>(HWY_STR(NAME), F32x1, F32xN, d, F32_MIN, F32_MAX,  \
+                       F32_ERROR);                                        \
+      } else {                                                            \
+        TestMath<T, D>(HWY_STR(NAME), F64x1, F64xN, d,                    \
+                       static_cast<T>(F64_MIN), static_cast<T>(F64_MAX),  \
+                       F64_ERROR);                                        \
+      }                                                                   \
+    }                                                                     \
+  };                                                                      \
+  DEFINE_MATH_TEST_FUNC(NAME)
+
+// Largest float/double values strictly less than 1.0 (bounds of the Atanh test)
+const float kNearOneF = BitCast<float>(0x3F7FFFFF);
+const double kNearOneD = BitCast<double>(0x3FEFFFFFFFFFFFFFULL);
+
+// MSYS2 shows an unacceptably large discrepancy (less accurate libm?), so the
+// tolerance is raised only when __MINGW32__ is defined.
+constexpr uint64_t Cos64ULP() {
+#if !defined(__MINGW32__)
+  return 3;
+#else
+  return 23;
+#endif
+}
+
+constexpr uint64_t ACosh32ULP() {  // float Acosh tolerance, looser on MinGW
+#if !defined(__MINGW32__)
+  return 3;
+#else
+  return 8;
+#endif
+}
+
+// clang-format off
+DEFINE_MATH_TEST(Acos,
+  std::acos,  CallAcos,  -1.0f,      +1.0f,       3,  // NEON is 3 instead of 2
+  std::acos,  CallAcos,  -1.0,       +1.0,        2)
+DEFINE_MATH_TEST(Acosh,
+  std::acosh, CallAcosh, +1.0f,      +FLT_MAX,    ACosh32ULP(),  // 8 on MinGW
+  std::acosh, CallAcosh, +1.0,       +DBL_MAX,    3)
+DEFINE_MATH_TEST(Asin,
+  std::asin,  CallAsin,  -1.0f,      +1.0f,       4,  // ARMv7 is 4 instead of 2
+  std::asin,  CallAsin,  -1.0,       +1.0,        2)
+DEFINE_MATH_TEST(Asinh,
+  std::asinh, CallAsinh, -FLT_MAX,   +FLT_MAX,    3,
+  std::asinh, CallAsinh, -DBL_MAX,   +DBL_MAX,    3)
+DEFINE_MATH_TEST(Atan,
+  std::atan,  CallAtan,  -FLT_MAX,   +FLT_MAX,    3,
+  std::atan,  CallAtan,  -DBL_MAX,   +DBL_MAX,    3)
+DEFINE_MATH_TEST(Atanh,
+  std::atanh, CallAtanh, -kNearOneF, +kNearOneF,  4,  // NEON is 4 instead of 3
+  std::atanh, CallAtanh, -kNearOneD, +kNearOneD,  3)
+DEFINE_MATH_TEST(Cos,
+  std::cos,   CallCos,   -39000.0f,  +39000.0f,   3,
+  std::cos,   CallCos,   -39000.0,   +39000.0,    Cos64ULP())  // 23 on MinGW
+DEFINE_MATH_TEST(Exp,
+  std::exp,   CallExp,   -FLT_MAX,   +104.0f,     1,
+  std::exp,   CallExp,   -DBL_MAX,   +104.0,      1)
+DEFINE_MATH_TEST(Expm1,
+  std::expm1, CallExpm1, -FLT_MAX,   +104.0f,     4,
+  std::expm1, CallExpm1, -DBL_MAX,   +104.0,      4)
+DEFINE_MATH_TEST(Log,
+  std::log,   CallLog,   +FLT_MIN,   +FLT_MAX,    1,
+  std::log,   CallLog,   +DBL_MIN,   +DBL_MAX,    1)
+DEFINE_MATH_TEST(Log10,
+  std::log10, CallLog10, +FLT_MIN,   +FLT_MAX,    2,
+  std::log10, CallLog10, +DBL_MIN,   +DBL_MAX,    2)
+DEFINE_MATH_TEST(Log1p,
+  std::log1p, CallLog1p, +0.0f,      +1e37f,      3,  // NEON is 3 instead of 2
+  std::log1p, CallLog1p, +0.0,       +DBL_MAX,    2)
+DEFINE_MATH_TEST(Log2,
+  std::log2,  CallLog2,  +FLT_MIN,   +FLT_MAX,    2,
+  std::log2,  CallLog2,  +DBL_MIN,   +DBL_MAX,    2)
+DEFINE_MATH_TEST(Sin,
+  std::sin,   CallSin,   -39000.0f,  +39000.0f,   3,
+  std::sin,   CallSin,   -39000.0,   +39000.0,    4)  // MSYS is 4 instead of 3
+DEFINE_MATH_TEST(Sinh,
+  std::sinh,  CallSinh,  -80.0f,     +80.0f,      4,
+  std::sinh,  CallSinh,  -709.0,     +709.0,      4)
+DEFINE_MATH_TEST(Tanh,
+  std::tanh,  CallTanh,  -FLT_MAX,   +FLT_MAX,    4,
+  std::tanh,  CallTanh,  -DBL_MAX,   +DBL_MAX,    4)
+// clang-format on
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyMathTest);  // one TestAll<NAME> entry per DEFINE_MATH_TEST
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcos);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcosh);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsin);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsinh);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtan);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtanh);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllCos);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExp);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExpm1);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog10);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog1p);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog2);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSin);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinh);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllTanh);
+}  // namespace hwy
+
+#endif
@@ -0,0 +1,190 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])
+
+# Unused on Bazel builds, where this is not defined/known; Copybara replaces
+# usages with an empty list.
+COMPAT = [
+    "//buildenv/target:non_prod",  # includes mobile/vendor.
+]
+
+# cc_library(
+#     name = "vxsort",
+#     srcs = [
+#         "vxsort/isa_detection.cpp",
+#         "vxsort/isa_detection_msvc.cpp",
+#         "vxsort/isa_detection_sane.cpp",
+#         "vxsort/machine_traits.avx2.cpp",
+#         "vxsort/smallsort/avx2_load_mask_tables.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX2.double.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX2.float.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX512.double.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX512.float.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.cpp",
+#         "vxsort/vxsort_stats.cpp",
+#     ],
+#     hdrs = [
+#         "vxsort/alignment.h",
+#         "vxsort/defs.h",
+#         "vxsort/isa_detection.h",
+#         "vxsort/machine_traits.avx2.h",
+#         "vxsort/machine_traits.avx512.h",
+#         "vxsort/machine_traits.h",
+#         "vxsort/packer.h",
+#         "vxsort/smallsort/bitonic_sort.AVX2.double.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX2.float.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX512.double.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX512.float.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.h",
+#         "vxsort/smallsort/bitonic_sort.h",
+#         "vxsort/vxsort.h",
+#         "vxsort/vxsort_stats.h",
+#     ],
+#     compatible_with = [],
+#     textual_hdrs = [
+#         "vxsort/vxsort_targets_disable.h",
+#         "vxsort/vxsort_targets_enable_avx2.h",
+#         "vxsort/vxsort_targets_enable_avx512.h",
+#     ],
+# )
+
+cc_library(
+    name = "vqsort",
+    srcs = [
+        # Split into separate files to reduce MSVC build time.
+        "vqsort.cc",
+        "vqsort_128a.cc",
+        "vqsort_128d.cc",
+        "vqsort_f32a.cc",
+        "vqsort_f32d.cc",
+        "vqsort_f64a.cc",
+        "vqsort_f64d.cc",
+        "vqsort_i16a.cc",
+        "vqsort_i16d.cc",
+        "vqsort_i32a.cc",
+        "vqsort_i32d.cc",
+        "vqsort_i64a.cc",
+        "vqsort_i64d.cc",
+        "vqsort_kv64a.cc",
+        "vqsort_kv64d.cc",
+        "vqsort_kv128a.cc",
+        "vqsort_kv128d.cc",
+        "vqsort_u16a.cc",
+        "vqsort_u16d.cc",
+        "vqsort_u32a.cc",
+        "vqsort_u32d.cc",
+        "vqsort_u64a.cc",
+        "vqsort_u64d.cc",
+    ],
+    hdrs = [
+        "vqsort.h",  # public interface
+    ],
+    compatible_with = [],
+    local_defines = ["hwy_contrib_EXPORTS"],
+    textual_hdrs = [
+        "shared-inl.h",
+        "sorting_networks-inl.h",
+        "traits-inl.h",
+        "traits128-inl.h",
+        "vqsort-inl.h",
+        # Placeholder for internal instrumentation. Do not remove.
+    ],
+    deps = [
+        # Only if VQSORT_SECURE_RNG is set.
+        # "//third_party/absl/random",
+        "//:hwy",
+        # ":vxsort",  # required if HAVE_VXSORT
+    ],
+)
+
+# -----------------------------------------------------------------------------
+# Internal-only targets
+
+cc_library(
+    name = "helpers",
+    testonly = 1,
+    textual_hdrs = [
+        "algo-inl.h",
+        "result-inl.h",
+    ],
+    deps = [
+        ":vqsort",
+        "//:nanobenchmark",
+        # Required for HAVE_PDQSORT, but that is unused and this is
+        # unavailable to Bazel builds, hence commented out.
+        # "//third_party/boost/allowed",
+        # Avoid ips4o and thus TBB to work around hwloc build failure.
+    ],
+)
+
+cc_binary(
+    name = "print_network",
+    testonly = 1,
+    srcs = ["print_network.cc"],
+    deps = [
+        ":helpers",
+        ":vqsort",
+        "//:hwy",
+    ],
+)
+
+cc_test(
+    name = "sort_test",
+    size = "medium",
+    srcs = ["sort_test.cc"],
+    # Do not enable fully_static_link (pthread crash on bazel)
+    local_defines = ["HWY_IS_TEST"],
+    # for test_suite.
+    tags = ["hwy_ops_test"],
+    deps = [
+        ":helpers",
+        ":vqsort",
+        "@com_google_googletest//:gtest_main",
+        "//:hwy",
+        "//:hwy_test_util",
+    ],
+)
+
+cc_binary(
+    name = "bench_sort",
+    testonly = 1,
+    srcs = ["bench_sort.cc"],
+    # Do not enable fully_static_link (pthread crash on bazel)
+    local_defines = ["HWY_IS_TEST"],
+    deps = [
+        ":helpers",
+        ":vqsort",
+        "@com_google_googletest//:gtest_main",
+        "//:hwy",
+        "//:hwy_test_util",
+    ],
+)
+
+cc_binary(
+    name = "bench_parallel",
+    testonly = 1,
+    srcs = ["bench_parallel.cc"],
+    # Do not enable fully_static_link (pthread crash on bazel)
+    local_defines = ["HWY_IS_TEST"],
+    deps = [
+        ":helpers",
+        ":vqsort",
+        "@com_google_googletest//:gtest_main",
+        "//:hwy",
+        "//:hwy_test_util",
+    ],
+)
@@ -0,0 +1,87 @@
+# Vectorized and performance-portable Quicksort
+
+## Introduction
+
+As of 2022-06-07 this sorts large arrays of built-in types about ten times as
+fast as `std::sort`. See also our
+[blog post](https://opensource.googleblog.com/2022/06/Vectorized%20and%20performance%20portable%20Quicksort.html)
+and [paper](https://arxiv.org/abs/2205.05982).
+
+## Instructions
+
+Here are instructions for reproducing our results on Linux and AWS (SVE, NEON).
+
+### Linux
+
+Please first ensure that Go and Clang (tested with 13.0.1) are installed via
+your system's package manager.
+
+```
+go install github.com/bazelbuild/bazelisk@latest
+git clone https://github.com/google/highway
+cd highway
+CC=clang CXX=clang++ ~/go/bin/bazelisk build -c opt hwy/contrib/sort:all
+bazel-bin/hwy/contrib/sort/sort_test
+bazel-bin/hwy/contrib/sort/bench_sort
+```
+
+### AWS Graviton3
+
+Instance config: Amazon Linux 5.10 arm64, c7g.8xlarge (largest allowed config is
+32 vCPU). Initial launch will fail. Wait a few minutes for an email saying the
+config is verified, then re-launch. See IPv4 hostname in list of instances.
+
+`ssh -i /path/key.pem ec2-user@hostname`
+
+Note that the AWS CMake package is too old for LLVM, so we build CMake first:
+```
+wget https://cmake.org/files/v3.23/cmake-3.23.2.tar.gz
+tar -xvzf cmake-3.23.2.tar.gz && cd cmake-3.23.2/
+./bootstrap -- -DCMAKE_USE_OPENSSL=OFF
+make -j8 && sudo make install
+cd ..
+```
+
+AWS clang is at version 11.1, which generates unnecessary `AND` instructions
+that slow down the sort by 1.15x. We tested with clang trunk as of June 13
+(which reports Git hash 8f6512fea000c3a0d394864bb94e524bee375069). To build:
+
+```
+git clone --depth 1 https://github.com/llvm/llvm-project.git
+cd llvm-project
+mkdir -p build && cd build
+/usr/local/bin/cmake ../llvm -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" -DCMAKE_BUILD_TYPE=Release
+make -j32 && sudo make install
+```
+
+```
+sudo yum install go
+go install github.com/bazelbuild/bazelisk@latest
+git clone https://github.com/google/highway
+cd highway
+CC=/usr/local/bin/clang CXX=/usr/local/bin/clang++ ~/go/bin/bazelisk build -c opt --copt=-march=armv8.2-a+sve hwy/contrib/sort:all
+bazel-bin/hwy/contrib/sort/sort_test
+bazel-bin/hwy/contrib/sort/bench_sort
+```
+
+The above command line enables SVE, which is currently only available on
+Graviton3. You can also test NEON on the same processor, or other Arm CPUs, by
+changing the `--copt=-march=` value to `armv8.2-a+crypto`. Note that
+such flags will be unnecessary once Clang supports `#pragma target` for NEON and
+SVE intrinsics, as it does for x86.
+
+## Results
+
+`bench_sort` outputs the instruction set (AVX3 refers to AVX-512), the sort
+algorithm (std for `std::sort`, vq for our vqsort), the type of keys being
+sorted (f32 is float), the distribution of keys (uniform32 for uniform random
+with range 0-2^32), the number of keys, then the throughput of sorted keys (i.e.
+number of key bytes output per second).
+
+Example excerpt from Xeon 6154 (Skylake-X) CPU clocked at 3 GHz:
+
+```
+[ RUN      ] BenchSortGroup/BenchSort.BenchAllSort/AVX3
+      AVX3:          std:     f32: uniform32: 1.00E+06   54 MB/s ( 1 threads)
+      AVX3:           vq:     f32: uniform32: 1.00E+06 1143 MB/s ( 1 threads)
+```
@@ -0,0 +1,512 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Normal include guard for target-independent parts
+#ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
+#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
+
+#include <stdint.h>
+#include <string.h>  // memcpy
+
+#include <algorithm>
+#include <cmath>  // std::abs
+#include <vector>
+
+#include "hwy/base.h"
+#include "hwy/contrib/sort/vqsort.h"
+
+// Third-party algorithms (each nonzero HAVE_* adds an Algo value and an include)
+#define HAVE_AVX2SORT 0  // avx2sort.h, Algo::kSEA
+#define HAVE_IPS4O 0     // ips4o.hpp, Algo::kIPS4O
+// When enabling, consider changing max_threads (required for Table 1a)
+#define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1)
+#define HAVE_PDQSORT 0   // boost sort.hpp, Algo::kPDQ
+#define HAVE_SORT512 0   // sort512.h, Algo::kSort512
+#define HAVE_VXSORT 0    // vxsort/vxsort.h, Algo::kVXSort
+
+#if HAVE_AVX2SORT
+HWY_PUSH_ATTRIBUTES("avx2,avx")
+#include "avx2sort.h"  //NOLINT
+HWY_POP_ATTRIBUTES
+#endif
+#if HAVE_IPS4O || HAVE_PARALLEL_IPS4O
+#include "third_party/ips4o/include/ips4o.hpp"
+#include "third_party/ips4o/include/ips4o/thread_pool.hpp"
+#endif
+#if HAVE_PDQSORT
+#include "third_party/boost/allowed/sort/sort.hpp"
+#endif
+#if HAVE_SORT512
+#include "sort512.h"  //NOLINT
+#endif
+
+// vxsort is difficult to compile for multiple targets because it also uses
+// .cpp files, and we'd also have to #undef its include guards. Instead, compile
+// only for AVX2 or AVX3 depending on this macro.
+#define VXSORT_AVX3 1
+#if HAVE_VXSORT
+// inlined from vxsort_targets_enable_avx512 (must close before end of header)
+#ifdef __GNUC__
+#ifdef __clang__
+#if VXSORT_AVX3
+#pragma clang attribute push(__attribute__((target("avx512f,avx512dq"))), \
+                             apply_to = any(function))
+#else
+#pragma clang attribute push(__attribute__((target("avx2"))), \
+                             apply_to = any(function))
+#endif  // VXSORT_AVX3
+
+#else
+#pragma GCC push_options
+#if VXSORT_AVX3
+#pragma GCC target("avx512f,avx512dq")
+#else
+#pragma GCC target("avx2")
+#endif  // VXSORT_AVX3
+#endif
+#endif
+
+#if VXSORT_AVX3
+#include "vxsort/machine_traits.avx512.h"
+#else
+#include "vxsort/machine_traits.avx2.h"
+#endif  // VXSORT_AVX3
+#include "vxsort/vxsort.h"
+#ifdef __GNUC__
+#ifdef __clang__
+#pragma clang attribute pop
+#else
+#pragma GCC pop_options
+#endif
+#endif
+#endif  // HAVE_VXSORT
+
+namespace hwy {
+
+enum class Dist { kUniform8, kUniform16, kUniform32 };  // see MaskForDist
+
+static inline std::vector<Dist> AllDist() {  // kUniform8/16 currently disabled
+  return std::vector<Dist>(1, Dist::kUniform32);
+}
+
+static inline const char* DistName(Dist dist) {
+  // Returns the printable name of `dist`. The initial value is only returned
+  // if `dist` holds something other than the three enumerators.
+  const char* label = "unreachable";
+  switch (dist) {
+    case Dist::kUniform8: label = "uniform8"; break;
+    case Dist::kUniform16: label = "uniform16"; break;
+    case Dist::kUniform32: label = "uniform32"; break;
+  }
+  return label;
+}
+
+template <typename T>
+class InputStats {  // Running min, max, count and checksum of notified values.
+ public:
+  void Notify(T value) {
+    min_ = std::min(min_, value);
+    max_ = std::max(max_, value);
+    // Converting to integer would truncate floats, multiplying to save digits
+    // risks overflow especially when casting, so instead take the sum of the
+    // bit representations as the checksum.
+    uint64_t bits = 0;
+    static_assert(sizeof(T) <= 8, "Expected a built-in type");
+    CopyBytes<sizeof(T)>(&value, &bits);  // not same size
+    sum_ += bits;
+    count_ += 1;
+  }
+
+  // Aborts with a diagnostic on the first mismatch, otherwise returns true.
+  bool operator==(const InputStats& other) const {
+    using ULL = unsigned long long;  // full-width integer printing via %llu
+    if (count_ != other.count_) {
+      HWY_ABORT("count %llu vs %llu\n", static_cast<ULL>(count_),
+                static_cast<ULL>(other.count_));
+    }
+    if (min_ != other.min_ || max_ != other.max_) {
+      HWY_ABORT("minmax %f/%f vs %f/%f\n", static_cast<double>(min_),
+                static_cast<double>(max_), static_cast<double>(other.min_),
+                static_cast<double>(other.max_));
+    }
+    // Sum helps detect duplicated/lost values. It is printed as an integer
+    // because %g keeps only ~6 digits and could show unequal sums as equal.
+    if (sum_ != other.sum_) {
+      HWY_ABORT("Sum mismatch %llu %llu; min %g max %g\n",
+                static_cast<ULL>(sum_), static_cast<ULL>(other.sum_),
+                static_cast<double>(min_), static_cast<double>(max_));
+    }
+    return true;
+  }
+
+ private:
+  T min_ = hwy::HighestValue<T>();
+  T max_ = hwy::LowestValue<T>();
+  uint64_t sum_ = 0;
+  size_t count_ = 0;
+};
+
+enum class Algo {  // algorithms selectable in Run; optional ones need HAVE_*
+#if HAVE_AVX2SORT
+  kSEA,
+#endif
+#if HAVE_IPS4O
+  kIPS4O,
+#endif
+#if HAVE_PARALLEL_IPS4O
+  kParallelIPS4O,
+#endif
+#if HAVE_PDQSORT
+  kPDQ,
+#endif
+#if HAVE_SORT512
+  kSort512,
+#endif
+#if HAVE_VXSORT
+  kVXSort,
+#endif
+  kStd,     // std::sort
+  kVQSort,  // Sorter held in SharedState::tls
+  kHeap,    // detail::HeapSort, reached through CallHeapSort
+};
+
+static inline const char* AlgoName(Algo algo) {  // printable name of algo
+  switch (algo) {
+#if HAVE_AVX2SORT
+    case Algo::kSEA:
+      return "sea";
+#endif
+#if HAVE_IPS4O
+    case Algo::kIPS4O:
+      return "ips4o";
+#endif
+#if HAVE_PARALLEL_IPS4O
+    case Algo::kParallelIPS4O:
+      return "par_ips4o";
+#endif
+#if HAVE_PDQSORT
+    case Algo::kPDQ:
+      return "pdq";
+#endif
+#if HAVE_SORT512
+    case Algo::kSort512:
+      return "sort512";
+#endif
+#if HAVE_VXSORT
+    case Algo::kVXSort:
+      return "vxsort";
+#endif
+    case Algo::kStd:
+      return "std";
+    case Algo::kVQSort:
+      return "vq";
+    case Algo::kHeap:
+      return "heap";
+  }
+  return "unreachable";  // only for values that match no case above
+}
+
+}  // namespace hwy
+#endif  // HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
+
+// Per-target
+#if defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
+#undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
+#else
+#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
+#endif
+
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/traits128-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"  // HeapSort
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+class Xorshift128Plus {  // random-bit generator; the caller owns the state
+  static HWY_INLINE uint64_t SplitMix64(uint64_t z) {  // scalar; seeding only
+    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
+    z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
+    return z ^ (z >> 31);
+  }
+
+ public:
+  // Generates two vectors of 64-bit seeds via SplitMix64 and stores into
+  // `seeds`. Generating these afresh in each ChoosePivot is too expensive.
+  template <class DU64>
+  static void GenerateSeeds(DU64 du64, TFromD<DU64>* HWY_RESTRICT seeds) {
+    seeds[0] = SplitMix64(0x9E3779B97F4A7C15ull);  // fixed start: reproducible
+    for (size_t i = 1; i < 2 * Lanes(du64); ++i) {  // writes 2 * Lanes entries
+      seeds[i] = SplitMix64(seeds[i - 1]);
+    }
+  }
+
+  // Need to pass in the state because vectors cannot be class members.
+  template <class VU64>
+  static VU64 RandomBits(VU64& state0, VU64& state1) {
+    VU64 s1 = state0;
+    VU64 s0 = state1;
+    const VU64 bits = Add(s1, s0);  // result: sum of the two incoming states
+    state0 = s0;                    // both states are then updated in place
+    s1 = Xor(s1, ShiftLeft<23>(s1));
+    state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
+    return bits;
+  }
+};
+
+template <class D, class VU64, HWY_IF_NOT_FLOAT_D(D)>
+Vec<D> RandomValues(D d, VU64& s0, VU64& s1, const VU64 mask) {
+  // Non-float keys: the masked random bits, reinterpreted as lanes of D.
+  return BitCast(d, And(Xorshift128Plus::RandomBits(s0, s1), mask));
+}
+
+// Floating-point keys. Denormals must be avoided because SIMD flushes them to
+// zero whereas scalar sorts do not; NaN because its ordering may differ
+// between scalar and SIMD sorts.
+template <class DF, class VU64, HWY_IF_FLOAT_D(DF)>
+Vec<DF> RandomValues(DF df, VU64& s0, VU64& s1, const VU64 mask) {
+  using TF = TFromD<DF>;
+  const RebindToUnsigned<decltype(df)> du;
+  using VU = Vec<decltype(du)>;
+  const VU64 masked = And(Xorshift128Plus::RandomBits(s0, s1), mask);
+
+#if HWY_TARGET != HWY_SCALAR
+  const VU rnd = BitCast(du, masked);
+#else  // Cannot repartition u64 to smaller types
+  using TU = MakeUnsigned<TF>;
+  const VU rnd = Set(du, static_cast<TU>(GetLane(masked) & LimitsMax<TU>()));
+#endif
+  // Keep only the random mantissa bits and take the remaining bits from the
+  // representation of 1.0: every result lies in [1, 2), so it is neither NaN
+  // nor denormal.
+  const VU one = BitCast(du, Set(df, TF{1.0}));
+  const VU in_1_2 = OrAnd(one, rnd, Set(du, MantissaMask<TF>()));
+  return BitCast(df, in_1_2);
+}
+
+template <class DU64>
+Vec<DU64> MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t) {
+  // u64 mask that keeps, within each sizeof_t-byte key, the bits `dist` allows.
+  const bool is8 = (dist == Dist::kUniform8);
+  const bool is16 = (dist == Dist::kUniform16);
+  if (sizeof_t == 2) {
+    return Set(du64, is8 ? 0x00FF00FF00FF00FFull : 0xFFFFFFFFFFFFFFFFull);
+  }
+  if (sizeof_t == 4) {
+    return Set(du64, is8    ? 0x000000FF000000FFull
+                     : is16 ? 0x0000FFFF0000FFFFull
+                            : 0xFFFFFFFFFFFFFFFFull);
+  }
+  if (sizeof_t == 8) {
+    return Set(du64, is8 ? 0xFFull : is16 ? 0xFFFFull : 0xFFFFFFFFull);
+  }
+  HWY_ABORT("Logic error");
+  return Zero(du64);
+}
+
+template <typename T>
+InputStats<T> GenerateInput(const Dist dist, T* v, size_t num) {
+  // Two u64 state vectors for the generator, loaded from freshly made seeds.
+  SortTag<uint64_t> du64;
+  using VU64 = Vec<decltype(du64)>;
+  const size_t N64 = Lanes(du64);
+  auto seeds = hwy::AllocateAligned<uint64_t>(2 * N64);
+  Xorshift128Plus::GenerateSeeds(du64, seeds.get());
+  VU64 s0 = Load(du64, seeds.get());
+  VU64 s1 = Load(du64, seeds.get() + N64);
+
+#if HWY_TARGET != HWY_SCALAR
+  const Repartition<T, decltype(du64)> d;
+#else
+  const Sisd<T> d;
+#endif
+  const size_t N = Lanes(d);
+  const VU64 mask = MaskForDist(du64, dist, sizeof(T));
+  auto buf = hwy::AllocateAligned<T>(N);
+
+  // Whole vectors are stored directly into `v`.
+  size_t pos = 0;
+  while (pos + N <= num) {
+    StoreU(RandomValues(d, s0, s1, mask), d, v + pos);
+    pos += N;
+  }
+  // Tail: generate one more vector into `buf` and copy only what still fits.
+  if (pos < num) {
+    StoreU(RandomValues(d, s0, s1, mask), d, buf.get());
+    memcpy(v + pos, buf.get(), (num - pos) * sizeof(T));
+  }
+  InputStats<T> input_stats;
+  for (const T* key = v; key != v + num; ++key) {
+    input_stats.Notify(*key);
+  }
+  return input_stats;
+}
+
+struct ThreadLocal {  // element type of SharedState::tls
+  Sorter sorter;  // called by Run for Algo::kVQSort
+};
+
+struct SharedState {  // passed by reference to Run
+#if HAVE_PARALLEL_IPS4O
+  const unsigned max_threads = hwy::LimitsMax<unsigned>();  // 16 for Table 1a
+  ips4o::StdThreadPool pool{static_cast<int>(
+      HWY_MIN(max_threads, std::thread::hardware_concurrency() / 2))};
+#endif
+  std::vector<ThreadLocal> tls{1};  // indexed by Run's `thread` argument
+};
+
+// Adapter from the keys passed to Run to the lanes HeapSort operates on. For
+// keys that are not 128 bits wide, keys and lanes are the same thing:
+template <class Order, typename KeyType, HWY_IF_NOT_LANE_SIZE(KeyType, 16)>
+void CallHeapSort(KeyType* HWY_RESTRICT keys, const size_t num_keys) {
+  using detail::SharedTraits;
+  using detail::TraitsLane;
+  // Order is default-constructed only to query IsAscending().
+  if (!Order().IsAscending()) {
+    const SharedTraits<TraitsLane<detail::OrderDescending<KeyType>>> desc;
+    return detail::HeapSort(desc, keys, num_keys);
+  }
+  const SharedTraits<TraitsLane<detail::OrderAscending<KeyType>>> asc;
+  detail::HeapSort(asc, keys, num_keys);
+}
+
+#if VQSORT_ENABLED
+template <class Order>
+void CallHeapSort(hwy::uint128_t* HWY_RESTRICT keys, const size_t num_keys) {
+  using detail::SharedTraits;
+  using detail::Traits128;
+  // HeapSort is handed u64 lanes: two per 128-bit key.
+  uint64_t* u64 = reinterpret_cast<uint64_t*>(keys);
+  const size_t num_u64 = 2 * num_keys;
+  if (!Order().IsAscending()) {
+    const SharedTraits<Traits128<detail::OrderDescending128>> desc;
+    return detail::HeapSort(desc, u64, num_u64);
+  }
+  const SharedTraits<Traits128<detail::OrderAscending128>> asc;
+  detail::HeapSort(asc, u64, num_u64);
+}
+
+template <class Order>
+void CallHeapSort(K64V64* HWY_RESTRICT keys, const size_t num_keys) {
+  using detail::SharedTraits;
+  using detail::Traits128;
+  // HeapSort is handed u64 lanes: two per K64V64 element.
+  uint64_t* u64 = reinterpret_cast<uint64_t*>(keys);
+  const size_t num_u64 = 2 * num_keys;
+  if (!Order().IsAscending()) {
+    const SharedTraits<Traits128<detail::OrderDescendingKV128>> desc;
+    return detail::HeapSort(desc, u64, num_u64);
+  }
+  const SharedTraits<Traits128<detail::OrderAscendingKV128>> asc;
+  detail::HeapSort(asc, u64, num_u64);
+}
+#endif  // VQSORT_ENABLED
+
+template <class Order, typename KeyType>
+void Run(Algo algo, KeyType* HWY_RESTRICT inout, size_t num,
+         SharedState& shared, size_t thread) {  // thread: index into shared.tls
+  const std::less<KeyType> less;
+  const std::greater<KeyType> greater;
+  // Sorts inout[0, num) with `algo`; Order().IsAscending() picks the direction.
+  switch (algo) {
+#if HAVE_AVX2SORT
+    case Algo::kSEA:  // Order is not consulted for this algorithm
+      return avx2::quicksort(inout, static_cast<int>(num));
+#endif
+
+#if HAVE_IPS4O
+    case Algo::kIPS4O:
+      if (Order().IsAscending()) {
+        return ips4o::sort(inout, inout + num, less);
+      } else {
+        return ips4o::sort(inout, inout + num, greater);
+      }
+#endif
+
+#if HAVE_PARALLEL_IPS4O
+    case Algo::kParallelIPS4O:
+      if (Order().IsAscending()) {
+        return ips4o::parallel::sort(inout, inout + num, less, shared.pool);
+      } else {
+        return ips4o::parallel::sort(inout, inout + num, greater, shared.pool);
+      }
+#endif
+
+#if HAVE_SORT512
+    case Algo::kSort512:
+      HWY_ABORT("not supported");
+      //    return Sort512::Sort(inout, num);
+#endif
+
+#if HAVE_PDQSORT
+    case Algo::kPDQ:
+      if (Order().IsAscending()) {
+        return boost::sort::pdqsort_branchless(inout, inout + num, less);
+      } else {
+        return boost::sort::pdqsort_branchless(inout, inout + num, greater);
+      }
+#endif
+
+#if HAVE_VXSORT
+    case Algo::kVXSort: {
+#if (VXSORT_AVX3 && HWY_TARGET != HWY_AVX3) || \
+    (!VXSORT_AVX3 && HWY_TARGET != HWY_AVX2)
+      fprintf(stderr, "Do not call for target %s\n",
+              hwy::TargetName(HWY_TARGET));
+      return;
+#else
+#if VXSORT_AVX3
+      vxsort::vxsort<KeyType, vxsort::AVX512> vx;
+#else
+      vxsort::vxsort<KeyType, vxsort::AVX2> vx;
+#endif
+      if (Order().IsAscending()) {
+        return vx.sort(inout, inout + num - 1);  // inclusive end? (confirm)
+      } else {
+        fprintf(stderr, "Skipping VX - does not support descending order\n");
+        return;
+      }
+#endif  // enabled for this target
+    }
+#endif  // HAVE_VXSORT
+
+    case Algo::kStd:
+      if (Order().IsAscending()) {
+        return std::sort(inout, inout + num, less);
+      } else {
+        return std::sort(inout, inout + num, greater);
+      }
+
+    case Algo::kVQSort:
+      return shared.tls[thread].sorter(inout, num, Order());
+
+    case Algo::kHeap:
+      return CallHeapSort<Order>(inout, num);
+
+    default:  // any Algo value without a case above
+      HWY_ABORT("Not implemented");
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif  // HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
@@ -0,0 +1,238 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Concurrent, independent sorts for generating more memory traffic and testing
+// scalability.
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <condition_variable>  //NOLINT
+#include <functional>
+#include <memory>
+#include <mutex>   //NOLINT
+#include <thread>  //NOLINT
+#include <utility>
+#include <vector>
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_parallel.cc"  //NOLINT
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/algo-inl.h"
+#include "hwy/contrib/sort/result-inl.h"
+#include "hwy/aligned_allocator.h"
+// Last
+#include "hwy/tests/test_util-inl.h"
+// clang-format on
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+namespace {
+
+// Fixed-size pool of worker threads used to run one closure concurrently on a
+// chosen number of threads. Workers are created by the constructor and joined
+// by the destructor; RunOnThreads() blocks until the task has finished.
+class ThreadPool {
+ public:
+  // Starts the given number of worker threads and blocks until they are ready.
+  // Aborts (HWY_ASSERT) if num_threads is zero.
+  explicit ThreadPool(
+      const size_t num_threads = std::thread::hardware_concurrency())
+      : num_threads_(num_threads) {
+    HWY_ASSERT(num_threads_ > 0);
+    threads_.reserve(num_threads_);
+    for (size_t i = 0; i < num_threads_; ++i) {
+      threads_.emplace_back(ThreadFunc, this, i);
+    }
+
+    WorkersReadyBarrier();
+  }
+
+  // Not copyable: every worker thread holds a pointer to this instance.
+  ThreadPool(const ThreadPool&) = delete;
+  ThreadPool& operator=(const ThreadPool&) = delete;
+
+  // Waits for all threads to exit.
+  ~ThreadPool() {
+    StartWorkers(kWorkerExit);
+
+    for (std::thread& thread : threads_) {
+      thread.join();
+    }
+  }
+
+  // Returns the number of worker threads.
+  size_t NumThreads() const { return threads_.size(); }
+
+  // Calls func(thread) on every worker whose index `thread` is less than
+  // max_threads, then blocks until all workers are ready again. max_threads
+  // must not exceed NumThreads(). `func` is accessed via pointer, not copied.
+  template <class Func>
+  void RunOnThreads(size_t max_threads, const Func& func) {
+    task_ = &CallClosure<Func>;
+    data_ = &func;
+    StartWorkers(max_threads);
+    WorkersReadyBarrier();
+  }
+
+ private:
+  // After construction and between calls to RunOnThreads, workers are "ready",
+  // i.e. waiting on worker_start_cv_. They are "started" by sending a "command"
+  // and notifying all worker_start_cv_ waiters. (That is why all workers
+  // must be ready/waiting - otherwise, the notification will not reach all of
+  // them and the main thread waits in vain for them to report readiness.)
+  //
+  // A command is either one of the two special values below or the maximum
+  // number of threads that should run the current task.
+  using WorkerCommand = uint64_t;
+
+  static constexpr WorkerCommand kWorkerWait = ~1ULL;
+  static constexpr WorkerCommand kWorkerExit = ~2ULL;
+
+  // Calls a closure (lambda with captures). `f` points to the caller's Closure
+  // object; `thread` is the index of the calling worker.
+  template <class Closure>
+  static void CallClosure(const void* f, size_t thread) {
+    (*reinterpret_cast<const Closure*>(f))(thread);
+  }
+
+  // Blocks until every worker has reported ready, then clears the ready count
+  // and resets the command so that later wakeups are treated as spurious.
+  void WorkersReadyBarrier() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    // Typically only a single iteration.
+    while (workers_ready_ != threads_.size()) {
+      workers_ready_cv_.wait(lock);
+    }
+    workers_ready_ = 0;
+
+    // Safely handle spurious worker wakeups.
+    worker_start_command_ = kWorkerWait;
+  }
+
+  // Publishes `worker_command` and wakes all workers.
+  // Precondition: all workers are ready.
+  void StartWorkers(const WorkerCommand worker_command) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    worker_start_command_ = worker_command;
+    // Workers will need this lock, so release it before they wake up.
+    lock.unlock();
+    worker_start_cv_.notify_all();
+  }
+
+  // Entry point of worker number `thread`: reports ready, waits for a command
+  // and runs the task if its index is below the commanded thread count.
+  //
+  // NOTE(review): worker_start_command_ is only reset to kWorkerWait by
+  // WorkersReadyBarrier(), i.e. after all workers have reported ready. A worker
+  // that already finished and is waiting again would still see the task
+  // command if it were woken spuriously in that window - confirm whether a
+  // per-run sequence number is needed.
+  static void ThreadFunc(ThreadPool* self, size_t thread) {
+    // Until kWorkerExit command received:
+    for (;;) {
+      std::unique_lock<std::mutex> lock(self->mutex_);
+      // Notify main thread that this thread is ready.
+      if (++self->workers_ready_ == self->num_threads_) {
+        self->workers_ready_cv_.notify_one();
+      }
+    RESUME_WAIT:
+      // Wait for a command.
+      self->worker_start_cv_.wait(lock);
+      const WorkerCommand command = self->worker_start_command_;
+      switch (command) {
+        case kWorkerWait:    // spurious wakeup:
+          goto RESUME_WAIT;  // lock still held, avoid incrementing ready.
+        case kWorkerExit:
+          return;  // exits thread
+        default:
+          break;
+      }
+
+      lock.unlock();
+      // Command is the maximum number of threads that should run the task;
+      // running on all NumThreads() workers is allowed.
+      HWY_ASSERT(command <= self->NumThreads());
+      if (thread < command) {
+        self->task_(self->data_, thread);
+      }
+    }
+  }
+
+  const size_t num_threads_;
+
+  // Unmodified after ctor, but cannot be const because we call thread::join().
+  std::vector<std::thread> threads_;
+
+  std::mutex mutex_;  // guards both cv and their variables.
+  std::condition_variable workers_ready_cv_;
+  size_t workers_ready_ = 0;
+  std::condition_variable worker_start_cv_;
+  // Starts as kWorkerWait so that a wakeup before the first barrier completes
+  // is handled like any other spurious wakeup.
+  WorkerCommand worker_start_command_ = kWorkerWait;
+
+  // Written by main thread, read by workers (after mutex lock/unlock).
+  std::function<void(const void*, size_t)> task_;  // points to CallClosure
+  const void* data_ = nullptr;                     // points to caller's Func
+};
+
+// Generates num_keys keys from `dist`, sorts them with `algo` on behalf of
+// worker `thread` and performs only a cheap plausibility check instead of the
+// full VerifySort. Timing is left to the caller, so no timestamp is taken here.
+//
+// st:     traits instance; its LanesPerKey() converts keys to lanes.
+// shared: state forwarded to Run together with `thread`.
+template <class Traits>
+void RunWithoutVerify(Traits st, const Dist dist, const size_t num_keys,
+                      const Algo algo, SharedState& shared, size_t thread) {
+  using LaneType = typename Traits::LaneType;
+  using KeyType = typename Traits::KeyType;
+  using Order = typename Traits::Order;
+  const size_t num_lanes = num_keys * st.LanesPerKey();
+  auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
+
+  // The returned input statistics are only needed for full verification.
+  (void)GenerateInput(dist, aligned.get(), num_lanes);
+
+  Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys, shared,
+             thread);
+  // Plausibility check only: the first lane must be less than the last one.
+  HWY_ASSERT(aligned[0] < aligned[num_lanes - 1]);
+}
+
+// Runs the same sort concurrently on an increasing number of pool threads and
+// prints one Result per thread count.
+void BenchParallel() {
+  // On x86, only AVX2 and AVX3 results are of interest.
+  if (HWY_ARCH_X86 && !(HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_AVX3)) {
+    return;
+  }
+
+  ThreadPool pool;
+  const size_t num_workers = pool.NumThreads();
+  // Increment of the thread count between measurements, at least one.
+  const size_t step = HWY_MAX(1, num_workers / 16);
+
+  using Traits =
+      detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>>;
+  Traits st;
+  const size_t key_bytes = sizeof(Traits::KeyType);
+  const size_t num_keys = size_t{100} * 1000 * 1000;
+  const Dist dist = Dist::kUniform32;
+#if HAVE_IPS4O
+  const Algo algo = Algo::kIPS4O;
+#else
+  const Algo algo = Algo::kVQSort;
+#endif
+
+  SharedState shared;
+  shared.tls.resize(num_workers);
+
+  std::vector<Result> results;
+  size_t active = 1;
+  while (active < num_workers) {
+    Timestamp start;
+    // Default capture because MSVC wants algo/dist but clang does not.
+    pool.RunOnThreads(active, [=, &shared](size_t thread) {
+      RunWithoutVerify(st, dist, num_keys, algo, shared, thread);
+    });
+    const double elapsed = SecondsSince(start);
+    results.emplace_back(algo, dist, num_keys, active, elapsed, key_bytes,
+                         st.KeyString());
+    results.back().Print();
+    active += step;
+  }
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+namespace {
+HWY_BEFORE_TEST(BenchParallel);  // NOTE(review): presumably declares the BenchParallel test suite - confirm.
+HWY_EXPORT_AND_TEST_P(BenchParallel, BenchParallel);  // second argument: the per-target function defined above
+}  // namespace
+}  // namespace hwy
+
+#endif  // HWY_ONCE
@@ -0,0 +1,310 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <vector>
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/algo-inl.h"
+#include "hwy/contrib/sort/result-inl.h"
+#include "hwy/contrib/sort/sorting_networks-inl.h"  // SharedTraits
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/traits128-inl.h"
+#include "hwy/tests/test_util-inl.h"
+// clang-format on
+
+// Mode for larger sorts because M1 is able to access more than the per-core
+// share of L2, so 1M elements might still be in cache.
+#define SORT_100M 0
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+// Defined within HWY_ONCE, used by BenchAllSort.
+extern int64_t first_sort_target;
+
+namespace HWY_NAMESPACE {
+namespace {
+using detail::TraitsLane;
+using detail::OrderAscending;
+using detail::OrderDescending;
+using detail::SharedTraits;
+
+#if VQSORT_ENABLED || HWY_IDE
+using detail::OrderAscending128;
+using detail::OrderAscendingKV128;
+using detail::Traits128;
+
+template <class Traits>  // Traits: key traits providing LaneType and KeyType.
+HWY_NOINLINE void BenchPartition() {  // Times detail::Partition alone and prints one Result.
+  using LaneType = typename Traits::LaneType;
+  using KeyType = typename Traits::KeyType;
+  const SortTag<LaneType> d;
+  detail::SharedTraits<Traits> st;
+  const Dist dist = Dist::kUniform8;
+  double sum = 0.0;  // accumulates values read back from the partitioned array
+
+  detail::Generator rng(&sum, 123);  // for ChoosePivot
+
+  const size_t max_log2 = AdjustedLog2Reps(20);
+  for (size_t log2 = max_log2; log2 < max_log2 + 1; ++log2) {  // exactly one iteration: log2 == max_log2
+    const size_t num_lanes = 1ull << log2;
+    const size_t num_keys = num_lanes / st.LanesPerKey();
+    auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
+    auto buf = hwy::AllocateAligned<LaneType>(  // sized for the larger of the two buffer uses below
+        HWY_MAX(hwy::SortConstants::PartitionBufNum(Lanes(d)),
+                hwy::SortConstants::PivotBufNum(sizeof(LaneType), Lanes(d))));
+
+    std::vector<double> seconds;  // one entry per repetition
+    const size_t num_reps = (1ull << (14 - log2 / 2)) * 30;  // more repetitions for smaller inputs
+    for (size_t rep = 0; rep < num_reps; ++rep) {
+      (void)GenerateInput(dist, aligned.get(), num_lanes);  // fresh input per repetition, outside the timed region
+
+      // The pivot value can influence performance. Do exactly what vqsort will
+      // do so that the performance (influenced by prefetching and branch
+      // prediction) is likely to predict the actual performance inside vqsort.
+      detail::DrawSamples(d, st, aligned.get(), num_lanes, buf.get(), rng);
+      detail::SortSamples(d, st, buf.get());
+      auto pivot = detail::ChoosePivotByRank(d, st, buf.get());
+
+      const Timestamp t0;  // only the Partition call is timed
+      detail::Partition(d, st, aligned.get(), num_lanes - 1, pivot, buf.get());
+      seconds.push_back(SecondsSince(t0));
+      // 'Use' the result to prevent optimizing out the partition.
+      sum += static_cast<double>(aligned.get()[num_lanes / 2]);
+    }
+
+    Result(Algo::kVQSort, dist, num_keys, 1, SummarizeMeasurements(seconds),  // single-threaded result
+           sizeof(KeyType), st.KeyString())
+        .Print();
+  }
+  HWY_ASSERT(sum != 999999);  // Prevent optimizing out
+}
+
+// Runs BenchPartition for a fixed selection of key types and sort orders.
+HWY_NOINLINE void BenchAllPartition() {
+  // Benchmark results for SSSE3 are not of interest.
+  if (HWY_TARGET != HWY_SSSE3) {
+    BenchPartition<TraitsLane<OrderDescending<float>>>();
+    BenchPartition<TraitsLane<OrderDescending<int32_t>>>();
+    BenchPartition<TraitsLane<OrderDescending<int64_t>>>();
+    BenchPartition<Traits128<OrderAscending128>>();
+    // BenchPartition<Traits128<OrderDescending128>>();
+    BenchPartition<Traits128<OrderAscendingKV128>>();
+  }
+}
+
+template <class Traits>  // Traits: key traits providing LaneType and KeyType.
+HWY_NOINLINE void BenchBase(std::vector<Result>& results) {  // Times detail::BaseCase; appends one Result.
+  // Not interested in benchmark results for these targets
+  if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
+    return;
+  }
+
+  using LaneType = typename Traits::LaneType;
+  using KeyType = typename Traits::KeyType;
+  const SortTag<LaneType> d;
+  detail::SharedTraits<Traits> st;
+  const Dist dist = Dist::kUniform32;
+
+  const size_t N = Lanes(d);
+  const size_t num_lanes = SortConstants::BaseCaseNum(N);
+  const size_t num_keys = num_lanes / st.LanesPerKey();
+  auto keys = hwy::AllocateAligned<LaneType>(num_lanes);
+  auto buf = hwy::AllocateAligned<LaneType>(num_lanes + N);
+
+  std::vector<double> seconds;
+  double sum = 0;                             // prevents elision
+  constexpr size_t kMul = AdjustedReps(600);  // ensures long enough to measure
+
+  for (size_t rep = 0; rep < 30; ++rep) {
+    InputStats<LaneType> input_stats =
+        GenerateInput(dist, keys.get(), num_lanes);
+
+    const Timestamp t0;
+    for (size_t i = 0; i < kMul; ++i) {  // input is generated once per rep, so later calls see the previous call's output
+      detail::BaseCase(d, st, keys.get(), keys.get() + num_lanes, num_lanes,
+                       buf.get());
+      sum += static_cast<double>(keys[0]);
+    }
+    seconds.push_back(SecondsSince(t0));  // time for all kMul calls together
+    // printf("%f\n", seconds.back());
+
+    HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num_lanes, "BenchBase"));
+  }
+  HWY_ASSERT(sum < 1E99);
+  results.emplace_back(Algo::kVQSort, dist, num_keys * kMul, 1,  // key count covers all kMul calls of one timed region
+                       SummarizeMeasurements(seconds), sizeof(KeyType),
+                       st.KeyString());
+}
+
+// Runs BenchBase for a fixed selection of key types, then prints the
+// collected results.
+HWY_NOINLINE void BenchAllBase() {
+  // Benchmark results for SSSE3 are not of interest.
+  if (HWY_TARGET != HWY_SSSE3) {
+    std::vector<Result> measurements;
+    BenchBase<TraitsLane<OrderAscending<float>>>(measurements);
+    BenchBase<TraitsLane<OrderDescending<int64_t>>>(measurements);
+    BenchBase<Traits128<OrderAscending128>>(measurements);
+    for (size_t i = 0; i < measurements.size(); ++i) {
+      measurements[i].Print();
+    }
+  }
+}
+
+#else
+void BenchAllPartition() {}  // no-op stub for builds where VQSORT_ENABLED is 0; still referenced by the test registration
+void BenchAllBase() {}  // no-op stub, same reason as above
+#endif  // VQSORT_ENABLED
+
+std::vector<Algo> AlgoForBench() {  // Returns the algorithms BenchSort iterates over, selected by the HAVE_* macros.
+  return {
+#if HAVE_AVX2SORT
+    Algo::kSEA,
+#endif
+#if HAVE_PARALLEL_IPS4O
+        Algo::kParallelIPS4O,
+#elif HAVE_IPS4O
+        Algo::kIPS4O,
+#endif
+#if HAVE_PDQSORT
+        Algo::kPDQ,
+#endif
+#if HAVE_SORT512
+        Algo::kSort512,
+#endif
+// Only include if we're compiling for the target it supports.
+#if HAVE_VXSORT && ((VXSORT_AVX3 && HWY_TARGET == HWY_AVX3) || \
+                    (!VXSORT_AVX3 && HWY_TARGET == HWY_AVX2))
+        Algo::kVXSort,
+#endif
+
+#if !HAVE_PARALLEL_IPS4O
+#if !SORT_100M
+        // These are 10-20x slower, but that's OK for the default size when we
+        // are not testing the parallel nor 100M modes.
+        Algo::kStd, Algo::kHeap,
+#endif
+
+        Algo::kVQSort,  // only ~4x slower, but not required for Table 1a
+#endif
+  };
+}
+
+template <class Traits>  // Traits: key traits providing Order, LaneType and KeyType.
+HWY_NOINLINE void BenchSort(size_t num_keys) {  // Times every AlgoForBench() entry on every distribution.
+  if (first_sort_target == 0) first_sort_target = HWY_TARGET;  // remember the first target this runs for
+
+  SharedState shared;
+  detail::SharedTraits<Traits> st;
+  using Order = typename Traits::Order;
+  using LaneType = typename Traits::LaneType;
+  using KeyType = typename Traits::KeyType;
+  const size_t num_lanes = num_keys * st.LanesPerKey();
+  auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
+
+  const size_t reps = num_keys > 1000 * 1000 ? 10 : 30;  // fewer repetitions above one million keys
+
+  for (Algo algo : AlgoForBench()) {
+    // Other algorithms don't depend on the vector instructions, so only run
+    // them for the first target.
+#if !HAVE_VXSORT
+    if (algo != Algo::kVQSort && HWY_TARGET != first_sort_target) {
+      continue;
+    }
+#endif
+
+    for (Dist dist : AllDist()) {
+      std::vector<double> seconds;  // one entry per repetition
+      for (size_t rep = 0; rep < reps; ++rep) {
+        InputStats<LaneType> input_stats =  // kept for VerifySort below
+            GenerateInput(dist, aligned.get(), num_lanes);
+
+        const Timestamp t0;  // only the Run call is timed
+        Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys,
+                   shared, /*thread=*/0);
+        seconds.push_back(SecondsSince(t0));
+        // printf("%f\n", seconds.back());
+
+        HWY_ASSERT(
+            VerifySort(st, input_stats, aligned.get(), num_lanes, "BenchSort"));
+      }
+      Result(algo, dist, num_keys, 1, SummarizeMeasurements(seconds),  // one printed line per (algo, dist)
+             sizeof(KeyType), st.KeyString())
+          .Print();
+    }  // dist
+  }    // algo
+}
+
+// Runs BenchSort for a fixed selection of key types at the input size chosen
+// by HAVE_PARALLEL_IPS4O / SORT_100M.
+HWY_NOINLINE void BenchAllSort() {
+  // Benchmark results for SSSE3 and SSE4 are not of interest.
+  const bool skip_target = HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4;
+  if (skip_target) return;
+
+  constexpr size_t kThousand = 1000;
+  constexpr size_t kMillion = kThousand * kThousand;
+#if HAVE_PARALLEL_IPS4O || SORT_100M
+  const size_t sizes[] = {100 * kMillion};
+#else
+  const size_t sizes[] = {1 * kMillion};
+#endif
+
+  for (const size_t num_keys : sizes) {
+    BenchSort<TraitsLane<OrderAscending<float>>>(num_keys);
+    // BenchSort<TraitsLane<OrderDescending<double>>>(num_keys);
+    // BenchSort<TraitsLane<OrderAscending<int16_t>>>(num_keys);
+    BenchSort<TraitsLane<OrderDescending<int32_t>>>(num_keys);
+    BenchSort<TraitsLane<OrderAscending<int64_t>>>(num_keys);
+    // BenchSort<TraitsLane<OrderDescending<uint16_t>>>(num_keys);
+    // BenchSort<TraitsLane<OrderDescending<uint32_t>>>(num_keys);
+    // BenchSort<TraitsLane<OrderAscending<uint64_t>>>(num_keys);
+
+#if !HAVE_VXSORT && VQSORT_ENABLED
+    BenchSort<Traits128<OrderAscending128>>(num_keys);
+    BenchSort<Traits128<OrderAscendingKV128>>(num_keys);
+#endif
+  }
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+int64_t first_sort_target = 0;  // none run yet; BenchSort stores the first HWY_TARGET it is run for
+namespace {
+HWY_BEFORE_TEST(BenchSort);  // NOTE(review): presumably declares the BenchSort test suite - confirm.
+HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllPartition);  // second argument: a per-target function defined above
+HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllBase);
+HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllSort);
+}  // namespace
+}  // namespace hwy
+
+#endif  // HWY_ONCE
@@ -0,0 +1,191 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+
+#include <algorithm>
+
+#include "hwy/base.h"
+
+// Based on A.7 in "Entwurf und Implementierung vektorisierter
+// Sortieralgorithmen" and code by Mark Blacher.
+//
+// The PrintMergeNetwork16x* functions write statements operating on 16
+// vectors named v0..vf to stdout. They differ only in the name of the
+// per-vector reversal operation and in the trailing per-vector steps, so the
+// common part lives in the file-local helpers below.
+
+// Prints `v<i> = st.<op>(d, v<i>);` with <i> as a hexadecimal number.
+static void PrintVectorOp(const char* op, int i) {
+  printf("v%x = st.%s(d, v%x);\n", i, op, i);
+}
+
+// Prints `st.Sort2(d, v<a>, v<b>);` with both indices in hexadecimal.
+static void PrintSort2(int a, int b) {
+  printf("st.Sort2(d, v%x, v%x);\n", a, b);
+}
+
+// Prints PrintVectorOp(op, i) for all 16 vectors in increasing index order.
+static void PrintOpForAllVectors(const char* op) {
+  for (int i = 0; i < 16; ++i) {
+    PrintVectorOp(op, i);
+  }
+}
+
+// Prints the eight stages common to all networks: alternating groups of
+// `reverse` operations and Sort2 calls over spans of 16, 8, 4 and 2 vectors.
+static void PrintSharedMergeStages(const char* reverse) {
+  // Span of 16 vectors.
+  for (int i = 8; i < 16; ++i) {
+    PrintVectorOp(reverse, i);
+  }
+  for (int i = 0; i < 8; ++i) {
+    PrintSort2(i, 15 - i);
+  }
+  // Spans of 8 vectors.
+  for (int i = 0; i < 4; ++i) {
+    PrintVectorOp(reverse, i + 4);
+    PrintVectorOp(reverse, i + 12);
+  }
+  for (int i = 0; i < 4; ++i) {
+    PrintSort2(i, 7 - i);
+    PrintSort2(i + 8, 15 - i);
+  }
+  // Spans of 4 vectors.
+  for (int i = 0; i < 16; i += 4) {
+    PrintVectorOp(reverse, i + 2);
+    PrintVectorOp(reverse, i + 3);
+  }
+  for (int i = 0; i < 16; i += 4) {
+    PrintSort2(i, i + 3);
+    PrintSort2(i + 1, i + 2);
+  }
+  // Spans of 2 vectors.
+  for (int i = 0; i < 16; i += 2) {
+    PrintVectorOp(reverse, i + 1);
+  }
+  for (int i = 0; i < 16; i += 2) {
+    PrintSort2(i, i + 1);
+  }
+}
+
+// Prints the network that uses SwapAdjacent, followed by an empty line.
+void PrintMergeNetwork16x2() {
+  PrintSharedMergeStages("SwapAdjacent");
+  PrintOpForAllVectors("SortPairsDistance1<kOrder>");
+  printf("\n");
+}
+
+// Prints an empty line followed by the network that uses Reverse4.
+void PrintMergeNetwork16x4() {
+  printf("\n");
+
+  PrintSharedMergeStages("Reverse4");
+  PrintOpForAllVectors("SortPairsReverse4");
+  PrintOpForAllVectors("SortPairsDistance1<kOrder>");
+}
+
+// Prints an empty line followed by the network that uses ReverseKeys8.
+void PrintMergeNetwork16x8() {
+  printf("\n");
+
+  PrintSharedMergeStages("ReverseKeys8");
+  PrintOpForAllVectors("SortPairsReverse8");
+  PrintOpForAllVectors("SortPairsDistance2<kOrder>");
+  PrintOpForAllVectors("SortPairsDistance1<kOrder>");
+}
+
+// Prints an empty line followed by the network that uses ReverseKeys16.
+void PrintMergeNetwork16x16() {
+  printf("\n");
+
+  PrintSharedMergeStages("ReverseKeys16");
+  PrintOpForAllVectors("SortPairsReverse16<kOrder>");
+  PrintOpForAllVectors("SortPairsDistance4<kOrder>");
+  PrintOpForAllVectors("SortPairsDistance2<kOrder>");
+  PrintOpForAllVectors("SortPairsDistance1<kOrder>");
+}
+
+// Prints all four merge networks (16x2, 16x4, 16x8, 16x16) in that order.
+// The command-line arguments are not used.
+int main(int argc, char** argv) {
+  using Printer = void (*)();
+  const Printer printers[] = {PrintMergeNetwork16x2, PrintMergeNetwork16x4,
+                              PrintMergeNetwork16x8, PrintMergeNetwork16x16};
+  for (const Printer print : printers) {
+    print();
+  }
+  return 0;
+}
@@ -0,0 +1,139 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/algo-inl.h"
+
+// Normal include guard for non-SIMD parts
+#ifndef HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
+#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
+
+#include <time.h>
+
+#include <algorithm>  // std::sort
+#include <string>
+
+#include "hwy/base.h"
+#include "hwy/nanobenchmark.h"
+
+namespace hwy {
+
+// Records the value of platform::Now() at the time of construction.
+struct Timestamp {
+  Timestamp() : t(platform::Now()) {}
+  double t;
+};
+
+// Returns the time elapsed since `t0` was constructed, as the difference
+// between a new Timestamp reading and the one stored in `t0`.
+static inline double SecondsSince(const Timestamp& t0) {
+  return Timestamp().t - t0.t;
+}
+
+// Sorts `seconds` in place (ascending) and returns the mean of the elements
+// at sorted indices [size / 4, size / 2), i.e. a mean that discards the
+// fastest quarter and the slower half (we don't want to run an
+// out-of-L3-cache sort often enough for the mode to be reliable).
+//
+// That index window is empty for fewer than two measurements: a single
+// measurement is returned as is, and an empty vector yields 0.
+static inline double SummarizeMeasurements(std::vector<double>& seconds) {
+  std::sort(seconds.begin(), seconds.end());
+  const size_t num = seconds.size();
+  if (num == 0) return 0.0;
+
+  const size_t begin = num / 4;
+  const size_t end = num / 2;
+  // Only possible for num == 1; avoids dividing zero by zero below.
+  if (begin == end) return seconds[begin];
+
+  double sum = 0;
+  for (size_t i = begin; i < end; ++i) {
+    sum += seconds[i];
+  }
+  return sum / static_cast<double>(end - begin);
+}
+
+}  // namespace hwy
+#endif  // HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
+
+// Per-target
+#if defined(HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
+#undef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
+#else
+#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// One benchmark measurement together with the metadata needed to print it.
+// The target is always the HWY_TARGET this code was compiled for.
+struct Result {
+  // Creates an empty result; every member has a defined (zero/empty) value.
+  Result() {}
+  // algo/dist:   what was run and on which input distribution.
+  // num_keys:    number of keys processed by each thread.
+  // num_threads: number of threads that ran concurrently.
+  // sec:         elapsed time used for the throughput computation in Print.
+  // sizeof_key:  size of one key in bytes.
+  // key_name:    human-readable name of the key type.
+  Result(const Algo algo, Dist dist, size_t num_keys, size_t num_threads,
+         double sec, size_t sizeof_key, const std::string& key_name)
+      : target(HWY_TARGET),
+        algo(algo),
+        dist(dist),
+        num_keys(num_keys),
+        num_threads(num_threads),
+        sec(sec),
+        sizeof_key(sizeof_key),
+        key_name(key_name) {}
+
+  // Prints one line to stdout: target, algorithm, key name, distribution, key
+  // count, throughput in MB/s and thread count. Throughput is computed from
+  // num_keys * num_threads * sizeof_key bytes divided by `sec`.
+  void Print() const {
+    const double bytes = static_cast<double>(num_keys) *
+                         static_cast<double>(num_threads) *
+                         static_cast<double>(sizeof_key);
+    printf("%10s: %12s: %7s: %9s: %.2E %4.0f MB/s (%2zu threads)\n",
+           hwy::TargetName(target), AlgoName(algo), key_name.c_str(),
+           DistName(dist), static_cast<double>(num_keys), bytes * 1E-6 / sec,
+           num_threads);
+  }
+
+  // Value-initialized so that a default-constructed Result can be copied
+  // without reading indeterminate values.
+  int64_t target = 0;
+  Algo algo{};
+  Dist dist{};
+  size_t num_keys = 0;
+  size_t num_threads = 0;
+  double sec = 0.0;
+  size_t sizeof_key = 0;
+  std::string key_name;
+};
+
+// Checks that `out` (num_lanes lanes, st.LanesPerKey() lanes per key) is
+// ordered according to `st` and that its statistics match those of the input.
+//
+// Prints a diagnostic prefixed with `caller` and aborts if two adjacent keys
+// are out of order. Otherwise returns whether the statistics gathered from
+// `out` compare equal to `input_stats`.
+template <class Traits, typename LaneType>
+bool VerifySort(Traits st, const InputStats<LaneType>& input_stats,
+                const LaneType* out, size_t num_lanes, const char* caller) {
+  constexpr size_t N1 = st.LanesPerKey();
+  HWY_ASSERT(num_lanes >= N1);
+
+  InputStats<LaneType> output_stats;
+  // Ensure it matches the sort order
+  for (size_t i = 0; i < num_lanes - N1; i += N1) {
+    output_stats.Notify(out[i]);
+    if (N1 == 2) output_stats.Notify(out[i + 1]);
+    // Reverse order instead of checking !Compare1 so we accept equal keys.
+    if (st.Compare1(out + i + N1, out + i)) {
+      if (N1 == 2) {
+        // Two lanes per key: print both lanes of both keys.
+        printf("%s: i=%d of %d lanes: N1=%d %5.0f %5.0f vs. %5.0f %5.0f\n\n",
+               caller, static_cast<int>(i), static_cast<int>(num_lanes),
+               static_cast<int>(N1), static_cast<double>(out[i + 1]),
+               static_cast<double>(out[i + 0]),
+               static_cast<double>(out[i + N1 + 1]),
+               static_cast<double>(out[i + N1]));
+      } else {
+        // Single-lane keys: out[i + N1 + 1] would belong to a third key and
+        // lies past the end of the array for the last pair, so print only the
+        // two lanes that were actually compared.
+        printf("%s: i=%d of %d lanes: N1=%d %5.0f vs. %5.0f\n\n", caller,
+               static_cast<int>(i), static_cast<int>(num_lanes),
+               static_cast<int>(N1), static_cast<double>(out[i]),
+               static_cast<double>(out[i + N1]));
+      }
+      HWY_ABORT("%d-bit sort is incorrect\n",
+                static_cast<int>(sizeof(LaneType) * 8 * N1));
+    }
+  }
+  // The loop stops before the last key; account for it here.
+  output_stats.Notify(out[num_lanes - N1]);
+  if (N1 == 2) output_stats.Notify(out[num_lanes - N1 + 1]);
+
+  return input_stats == output_stats;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif  // HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
@@ -0,0 +1,133 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Definitions shared between vqsort-inl and sorting_networks-inl.
+
+// Normal include guard for target-independent parts
+#ifndef HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
+#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
+
+#include "hwy/base.h"
+
+namespace hwy {
+
+// Internal constants - these are to avoid magic numbers/literals and cannot be
+// changed without also changing the associated code.
+struct SortConstants {
+// SortingNetwork reshapes its input into a matrix. This is the maximum number
+// of *keys* per vector.
+#if HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
+  static constexpr size_t kMaxCols = 8;  // avoid build timeout/stack overflow
+#else
+  static constexpr size_t kMaxCols = 16;  // enough for u32 in 512-bit vector
+#endif
+
+  // 16 rows is a compromise between using the 32 AVX-512/SVE/RVV registers,
+  // fitting within 16 AVX2 registers with only a few spills, keeping BaseCase
+  // code size reasonable (7 KiB for AVX-512 and 16 cols), and minimizing the
+  // extra logN factor for larger networks (for which only loose upper bounds
+  // on size are known).
+  static constexpr size_t kMaxRowsLog2 = 4;
+  static constexpr size_t kMaxRows = size_t{1} << kMaxRowsLog2;
+
+  static constexpr HWY_INLINE size_t BaseCaseNum(size_t N) {  // N: lanes per vector
+    return kMaxRows * HWY_MIN(N, kMaxCols);  // kMaxRows rows of at most kMaxCols columns
+  }
+
+  // Unrolling is important (pipelining and amortizing branch mispredictions);
+  // 2x is sufficient to reach full memory bandwidth on SKX in Partition, but
+  // somewhat slower for sorting than 4x.
+  //
+  // To change, must also update left + 3 * N etc. in the loop.
+  static constexpr size_t kPartitionUnroll = 4;
+
+  static constexpr HWY_INLINE size_t PartitionBufNum(size_t N) {  // result is in lanes
+    // The main loop reads kPartitionUnroll vectors, and first loads from
+    // both left and right beforehand, so it requires min = 2 *
+    // kPartitionUnroll vectors. To handle smaller amounts (only guaranteed
+    // >= BaseCaseNum), we partition the right side into a buffer. We need
+    // another vector at the end so CompressStore does not overwrite anything.
+    return (2 * kPartitionUnroll + 1) * N;
+  }
+
+  // Chunk := group of keys loaded for sampling a pivot. Matches the typical
+  // cache line size of 64 bytes to get maximum benefit per L2 miss. If vectors
+  // are larger, use entire vectors to ensure we do not overrun the array.
+  static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) {  // sizeof_t: bytes per lane
+    return HWY_MAX(64 / sizeof_t, N);
+  }
+
+  static constexpr HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) {  // result is in lanes
+    // 3 chunks of medians, 1 chunk of median medians plus two padding vectors.
+    return (3 + 1) * LanesPerChunk(sizeof_t, N) + 2 * N;
+  }
+
+  template <typename T>  // T: lane type
+  static constexpr HWY_INLINE size_t BufNum(size_t N) {  // lanes sufficient for any of the three uses
+    // One extra for padding plus another for full-vector loads.
+    return HWY_MAX(BaseCaseNum(N) + 2 * N,
+                   HWY_MAX(PartitionBufNum(N), PivotBufNum(sizeof(T), N)));
+  }
+
+  template <typename T>  // T: lane type
+  static constexpr HWY_INLINE size_t BufBytes(size_t vector_size) {  // vector_size is in bytes
+    return sizeof(T) * BufNum<T>(vector_size / sizeof(T));  // BufNum for that many lanes, converted to bytes
+  }
+};
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
+
+// Per-target
+#if defined(HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
+#undef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
+#else
+#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
+#endif
+
+#include "hwy/highway.h"
+
+// vqsort isn't available on HWY_SCALAR, and builds time out on MSVC opt and
+// Arm v7 debug.
+#undef VQSORT_ENABLED
+#if (HWY_TARGET == HWY_SCALAR) ||                 \
+    (HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \
+    (HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD)
+#define VQSORT_ENABLED 0
+#else
+#define VQSORT_ENABLED 1
+#endif
+
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Default tag / vector width selector: SortTag<T> is the descriptor type the sort code uses for lanes of type T.
+#if HWY_TARGET == HWY_RVV
+// Use LMUL = 1/2; for SEW=64 this ends up emulated via vsetvl.
+template <typename T>
+using SortTag = ScalableTag<T, -1>;
+#else
+template <typename T>
+using SortTag = ScalableTag<T>;  // all other targets: ScalableTag with its default second argument
+#endif
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
@@ -0,0 +1,626 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS  // before inttypes.h
+#endif
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>  // memcpy
+
+#include <unordered_map>
+#include <vector>
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+#include "hwy/contrib/sort/vqsort.h"
+// After foreach_target
+#include "hwy/contrib/sort/algo-inl.h"
+#include "hwy/contrib/sort/traits128-inl.h"
+#include "hwy/contrib/sort/result-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"  // BaseCase
+#include "hwy/tests/test_util-inl.h"
+// clang-format on
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+namespace {
+
+using detail::OrderAscending;
+using detail::OrderDescending;
+using detail::SharedTraits;
+using detail::TraitsLane;
+#if VQSORT_ENABLED || HWY_IDE
+using detail::OrderAscending128;
+using detail::OrderAscendingKV128;
+using detail::OrderAscendingKV64;
+using detail::OrderDescending128;
+using detail::OrderDescendingKV128;
+using detail::OrderDescendingKV64;
+using detail::Traits128;
+
+// Checks detail::MedianOf3 on single-lane vectors for every combination of
+// 0/1 inputs.
+template <class Traits>
+static HWY_NOINLINE void TestMedian3() {
+  using LaneType = typename Traits::LaneType;
+  using D = CappedTag<LaneType, 1>;
+  using V = Vec<D>;
+  const D d;
+  SharedTraits<Traits> st;
+  // Bit k of `mask` selects whether input k is 1 or 0.
+  for (uint32_t mask = 0; mask < 8; ++mask) {
+    const V in0 = Set(d, LaneType{(mask & 1u) ? 1u : 0u});
+    const V in1 = Set(d, LaneType{(mask & 2u) ? 1u : 0u});
+    const V in2 = Set(d, LaneType{(mask & 4u) ? 1u : 0u});
+    const LaneType median = GetLane(detail::MedianOf3(st, in0, in1, in2));
+    // The median is 1 exactly when at least two of the three inputs are 1.
+    const size_t num_ones = PopCount(mask);
+    const LaneType expected = (num_ones >= 2) ? static_cast<LaneType>(1) : 0;
+    HWY_ASSERT_EQ(expected, median);
+  }
+}
+
+// Runs the median-of-three test; only ascending uint64_t lanes are covered.
+HWY_NOINLINE void TestAllMedian() {
+  using Traits = TraitsLane<OrderAscending<uint64_t> >;
+  TestMedian3<Traits>();
+}
+
+// Runs detail::BaseCase on strictly ascending and strictly descending inputs
+// for several lengths and buffer misalignments. After each call, the output is
+// checked with VerifySort, and the sentinel lanes before and after the input
+// are checked to be unmodified.
+template <class Traits>
+static HWY_NOINLINE void TestBaseCaseAscDesc() {
+  using LaneType = typename Traits::LaneType;
+  SharedTraits<Traits> st;
+  const SortTag<LaneType> d;
+  const size_t N = Lanes(d);
+  const size_t base_case_num = SortConstants::BaseCaseNum(N);
+  const size_t N1 = st.LanesPerKey();
+
+  // Raise to 1 or 2 to enable the printf diagnostics below.
+  constexpr int kDebug = 0;
+  // N lanes of slack in front (for misalignment) plus base_case_num + N lanes
+  // that are written and checked relative to the misaligned pointer.
+  auto aligned_lanes = hwy::AllocateAligned<LaneType>(N + base_case_num + N);
+  // Scratch buffer passed to BaseCase.
+  auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N);
+
+  // Input lengths (in lanes), up to and including base_case_num.
+  std::vector<size_t> lengths;
+  lengths.push_back(HWY_MAX(1, N1));
+  lengths.push_back(3 * N1);
+  lengths.push_back(base_case_num / 2);
+  lengths.push_back(base_case_num / 2 + N1);
+  lengths.push_back(base_case_num - N1);
+  lengths.push_back(base_case_num);
+
+  // Offsets (in lanes) of the input from the aligned allocation.
+  std::vector<size_t> misalignments;
+  misalignments.push_back(0);
+  misalignments.push_back(1);
+  if (N >= 6) misalignments.push_back(N / 2 - 1);
+  misalignments.push_back(N / 2);
+  misalignments.push_back(N / 2 + 1);
+  misalignments.push_back(HWY_MIN(2 * N / 3 + 3, size_t{N - 1}));
+
+  for (bool asc : {false, true}) {
+    for (size_t len : lengths) {
+      for (size_t misalign : misalignments) {
+        LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign;
+        if (kDebug) {
+          printf("============%s asc %d N1 %d len %d misalign %d\n",
+                 st.KeyString().c_str(), asc, static_cast<int>(N1),
+                 static_cast<int>(len), static_cast<int>(misalign));
+        }
+
+        // Sentinels before the (misaligned) input.
+        for (size_t i = 0; i < misalign; ++i) {
+          aligned_lanes[i] = hwy::LowestValue<LaneType>();
+        }
+        InputStats<LaneType> input_stats;
+        // Ascending input is 1..len, descending input is len..1.
+        for (size_t i = 0; i < len; ++i) {
+          lanes[i] = asc ? static_cast<LaneType>(LaneType(i) + 1)
+                         : static_cast<LaneType>(LaneType(len) - LaneType(i));
+          input_stats.Notify(lanes[i]);
+          if (kDebug >= 2) {
+            printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
+          }
+        }
+        // Sentinels after the input.
+        for (size_t i = len; i < base_case_num + N; ++i) {
+          lanes[i] = hwy::LowestValue<LaneType>();
+        }
+
+        detail::BaseCase(d, st, lanes, lanes + len, len, buf.get());
+
+        if (kDebug >= 2) {
+          printf("out>>>>>>\n");
+          for (size_t i = 0; i < len; ++i) {
+            printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
+          }
+        }
+
+        HWY_ASSERT(VerifySort(st, input_stats, lanes, len, "BaseAscDesc"));
+        // Any changed sentinel means BaseCase wrote outside [lanes, lanes+len).
+        for (size_t i = 0; i < misalign; ++i) {
+          if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
+            HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
+        }
+        for (size_t i = len; i < base_case_num + N; ++i) {
+          if (lanes[i] != hwy::LowestValue<LaneType>())
+            HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
+        }
+      }  // misalign
+    }    // len
+  }      // asc
+}
+
+// Runs detail::BaseCase on inputs consisting only of 0 and 1, enumerating bit
+// patterns for the low lanes. After each call, the output is checked with
+// VerifySort and the sentinel lanes after the input are checked to be
+// unmodified.
+template <class Traits>
+static HWY_NOINLINE void TestBaseCase01() {
+  using LaneType = typename Traits::LaneType;
+  SharedTraits<Traits> st;
+  const SortTag<LaneType> d;
+  const size_t N = Lanes(d);
+  const size_t base_case_num = SortConstants::BaseCaseNum(N);
+  const size_t N1 = st.LanesPerKey();
+
+  // Raise to 1 or 2 to enable the printf diagnostics below.
+  constexpr int kDebug = 0;
+  // Input of up to base_case_num lanes followed by sentinel lanes.
+  auto lanes = hwy::AllocateAligned<LaneType>(base_case_num + N);
+  // Scratch buffer passed to BaseCase.
+  auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N);
+
+  // Input lengths (in lanes), up to and including base_case_num.
+  std::vector<size_t> lengths;
+  lengths.push_back(HWY_MAX(1, N1));
+  lengths.push_back(3 * N1);
+  lengths.push_back(base_case_num / 2);
+  lengths.push_back(base_case_num / 2 + N1);
+  lengths.push_back(base_case_num - N1);
+  lengths.push_back(base_case_num);
+
+  for (size_t len : lengths) {
+    if (kDebug) {
+      printf("============%s 01 N1 %d len %d\n", st.KeyString().c_str(),
+             static_cast<int>(N1), static_cast<int>(len));
+    }
+    // Number of low lanes whose 0/1 value is enumerated (at most 14 before
+    // adjustment by AdjustedLog2Reps).
+    const uint64_t kMaxBits = AdjustedLog2Reps(HWY_MIN(len, size_t{14}));
+    // NOTE(review): the bound stops one short of the all-ones pattern;
+    // confirm whether that is intended.
+    for (uint64_t bits = 0; bits < ((1ull << kMaxBits) - 1); ++bits) {
+      InputStats<LaneType> input_stats;
+      // Lane i is 1 if bit i of `bits` is set; lanes at index >= 64 are 0.
+      for (size_t i = 0; i < len; ++i) {
+        lanes[i] = (i < 64 && (bits & (1ull << i))) ? 1 : 0;
+        input_stats.Notify(lanes[i]);
+        if (kDebug >= 2) {
+          printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
+        }
+      }
+      // Sentinels after the input.
+      for (size_t i = len; i < base_case_num + N; ++i) {
+        lanes[i] = hwy::LowestValue<LaneType>();
+      }
+
+      detail::BaseCase(d, st, lanes.get(), lanes.get() + len, len, buf.get());
+
+      if (kDebug >= 2) {
+        printf("out>>>>>>\n");
+        for (size_t i = 0; i < len; ++i) {
+          printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
+        }
+      }
+
+      HWY_ASSERT(VerifySort(st, input_stats, lanes.get(), len, "Base01"));
+      // Any changed sentinel means BaseCase wrote past the input.
+      for (size_t i = len; i < base_case_num + N; ++i) {
+        if (lanes[i] != hwy::LowestValue<LaneType>())
+          HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
+      }
+    }  // bits
+  }    // len
+}
+
+// Runs both BaseCase tests (ascending/descending inputs, then 0/1 inputs) for
+// the given traits.
+template <class Traits>
+static HWY_NOINLINE void TestBaseCase() {
+  TestBaseCaseAscDesc<Traits>();
+  TestBaseCase01<Traits>();
+}
+
+// Runs TestBaseCase for two lane-based traits and both 128-bit orders.
+HWY_NOINLINE void TestAllBaseCase() {
+  // Workaround for stack overflow on MSVC debug.
+  // NOTE: the guard tests only _MSC_VER, so the calls below are skipped in
+  // every build where it is defined, not just debug builds.
+#if defined(_MSC_VER)
+  return;
+#endif
+  TestBaseCase<TraitsLane<OrderAscending<int32_t> > >();
+  TestBaseCase<TraitsLane<OrderDescending<int64_t> > >();
+  TestBaseCase<Traits128<OrderAscending128> >();
+  TestBaseCase<Traits128<OrderDescending128> >();
+}
+
+// Checks the result of a partition of lanes[left, right) around `pivot`:
+// aborts if st.Compare1(pivot, key) is true for any key in [left, border) or
+// false for any key in [border, right). Keys are N1 lanes apart, and all three
+// indices must be multiples of N1.
+template <class Traits>
+static HWY_NOINLINE void VerifyPartition(
+    Traits st, typename Traits::LaneType* HWY_RESTRICT lanes, size_t left,
+    size_t border, size_t right, const size_t N1,
+    const typename Traits::LaneType* pivot) {
+  HWY_ASSERT(left % N1 == 0);
+  HWY_ASSERT(border % N1 == 0);
+  HWY_ASSERT(right % N1 == 0);
+  const bool asc = typename Traits::Order().IsAscending();
+
+  // Left of the border: the comparison with the pivot must not hold.
+  for (size_t i = left; i < border; i += N1) {
+    if (!st.Compare1(pivot, lanes + i)) continue;
+    HWY_ABORT(
+        "%s: asc %d left[%d] piv %.0f %.0f compares before %.0f %.0f "
+        "border %d",
+        st.KeyString().c_str(), asc, static_cast<int>(i),
+        static_cast<double>(pivot[1]), static_cast<double>(pivot[0]),
+        static_cast<double>(lanes[i + 1]), static_cast<double>(lanes[i + 0]),
+        static_cast<int>(border));
+  }
+
+  // Right of the border: the comparison with the pivot must hold.
+  for (size_t i = border; i < right; i += N1) {
+    if (st.Compare1(pivot, lanes + i)) continue;
+    HWY_ABORT(
+        "%s: asc %d right[%d] piv %.0f %.0f compares after %.0f %.0f "
+        "border %d",
+        st.KeyString().c_str(), asc, static_cast<int>(i),
+        static_cast<double>(pivot[1]), static_cast<double>(pivot[0]),
+        static_cast<double>(lanes[i + 1]), static_cast<double>(lanes[i]),
+        static_cast<int>(border));
+  }
+}
+
+// Runs detail::Partition over combinations of input order, left offset,
+// length, pivot value and buffer misalignment. After each call it checks that
+// the multiset of lanes is unchanged, that VerifyPartition accepts the result,
+// and that the sentinel lanes around the input are unmodified.
+template <class Traits>
+static HWY_NOINLINE void TestPartition() {
+  using LaneType = typename Traits::LaneType;
+  const SortTag<LaneType> d;
+  SharedTraits<Traits> st;
+  const bool asc = typename Traits::Order().IsAscending();
+  const size_t N = Lanes(d);
+  // Raise to 1 or 2 to enable the printf diagnostics below.
+  constexpr int kDebug = 0;
+  const size_t base_case_num = SortConstants::BaseCaseNum(N);
+  // left + len + align
+  const size_t total = 32 + (base_case_num + 4 * HWY_MAX(N, 4)) + 2 * N;
+  auto aligned_lanes = hwy::AllocateAligned<LaneType>(total);
+  // Scratch buffer passed to Partition.
+  auto buf = hwy::AllocateAligned<LaneType>(SortConstants::PartitionBufNum(N));
+
+  const size_t N1 = st.LanesPerKey();
+  for (bool in_asc : {false, true}) {
+    for (int left_i : {0, 1, 4, 6, 7, 8, 12, 15, 22, 28, 30, 31}) {
+      // Clears the low bits selected by N1 - 1 (rounds down to a multiple of
+      // N1, assuming N1 is a power of two).
+      const size_t left = static_cast<size_t>(left_i) & ~(N1 - 1);
+      for (size_t ofs : {N, N + 1, N + 3, 2 * N, 2 * N + 2, 2 * N + 3,
+                         3 * N - 1, 4 * N - 3, 4 * N - 2}) {
+        // Always longer than base_case_num; masked the same way as `left`.
+        const size_t len = (base_case_num + ofs) & ~(N1 - 1);
+        for (LaneType pivot1 :
+             {LaneType(0), LaneType(len / 3), LaneType(len / 2),
+              LaneType(2 * len / 3), LaneType(len)}) {
+          // Two-lane pivot: lane 0 is the value under test, lane 1 is zero.
+          const LaneType pivot2[2] = {pivot1, 0};
+          const auto pivot = st.SetKey(d, pivot2);
+          for (size_t misalign = 0; misalign < N;
+               misalign += st.LanesPerKey()) {
+            LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign;
+            const size_t right = left + len;
+            if (kDebug) {
+              printf(
+                  "=========%s asc %d left %d len %d right %d piv %.0f %.0f\n",
+                  st.KeyString().c_str(), asc, static_cast<int>(left),
+                  static_cast<int>(len), static_cast<int>(right),
+                  static_cast<double>(pivot2[1]),
+                  static_cast<double>(pivot2[0]));
+            }
+
+            // Sentinels: before the misaligned pointer, then before `left`.
+            for (size_t i = 0; i < misalign; ++i) {
+              aligned_lanes[i] = hwy::LowestValue<LaneType>();
+            }
+            for (size_t i = 0; i < left; ++i) {
+              lanes[i] = hwy::LowestValue<LaneType>();
+            }
+            // Histogram of input values, used below to detect lost or
+            // duplicated lanes.
+            std::unordered_map<LaneType, int> counts;
+            // Input is 1..len (in_asc) or len..1 (otherwise).
+            for (size_t i = left; i < right; ++i) {
+              lanes[i] = static_cast<LaneType>(
+                  in_asc ? LaneType(i + 1) - static_cast<LaneType>(left)
+                         : static_cast<LaneType>(right) - LaneType(i));
+              ++counts[lanes[i]];
+              if (kDebug >= 2) {
+                printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
+              }
+            }
+            // Sentinels after the input, up to the end of the allocation.
+            for (size_t i = right; i < total - misalign; ++i) {
+              lanes[i] = hwy::LowestValue<LaneType>();
+            }
+
+            // Partition returns an offset relative to lanes + left.
+            size_t border =
+                left + detail::Partition(d, st, lanes + left, right - left,
+                                         pivot, buf.get());
+
+            if (kDebug >= 2) {
+              printf("out>>>>>>\n");
+              for (size_t i = left; i < right; ++i) {
+                printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
+              }
+              for (size_t i = right; i < total - misalign; ++i) {
+                printf("%3zu: sentinel %f\n", i, static_cast<double>(lanes[i]));
+              }
+            }
+            // Every value must appear exactly as often as in the input.
+            for (size_t i = left; i < right; ++i) {
+              --counts[lanes[i]];
+            }
+            for (auto kv : counts) {
+              if (kv.second != 0) {
+                PrintValue(kv.first);
+                HWY_ABORT("Incorrect count %d\n", kv.second);
+              }
+            }
+            VerifyPartition(st, lanes, left, border, right, N1, pivot2);
+            // Any changed sentinel means Partition wrote outside its range.
+            for (size_t i = 0; i < misalign; ++i) {
+              if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
+                HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
+            }
+            for (size_t i = 0; i < left; ++i) {
+              if (lanes[i] != hwy::LowestValue<LaneType>())
+                HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
+            }
+            for (size_t i = right; i < total - misalign; ++i) {
+              if (lanes[i] != hwy::LowestValue<LaneType>())
+                HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
+            }
+          }  // misalign
+        }    // pivot
+      }      // len
+    }        // left
+  }          // asc
+}
+
+// Runs TestPartition for several key types. Two instantiations always run;
+// the remaining ones are compiled only when HWY_IS_DEBUG_BUILD is false.
+HWY_NOINLINE void TestAllPartition() {
+  TestPartition<TraitsLane<OrderDescending<int32_t> > >();
+  TestPartition<Traits128<OrderAscending128> >();
+
+#if !HWY_IS_DEBUG_BUILD
+  TestPartition<TraitsLane<OrderAscending<int16_t> > >();
+  TestPartition<TraitsLane<OrderAscending<int64_t> > >();
+  TestPartition<TraitsLane<OrderDescending<float> > >();
+  // double is only tested where the target reports float64 support.
+#if HWY_HAVE_FLOAT64
+  TestPartition<TraitsLane<OrderDescending<double> > >();
+#endif
+  TestPartition<Traits128<OrderDescending128> >();
+#endif
+}
+
+// Exercises detail::Generator together with detail::RandomChunkIndex (used for
+// sample selection when choosing a pivot): every index must be in range and
+// the mean index must be within 10% of the middle of the range.
+template <typename TU>
+static HWY_NOINLINE void TestRandomGenerator() {
+  static_assert(!hwy::IsSigned<TU>(), "");
+  SortTag<TU> du;
+  const size_t N = Lanes(du);
+
+  // Seeded from the address of N and its value.
+  detail::Generator rng(&N, N);
+
+  const size_t lanes_per_block = HWY_MAX(64 / sizeof(TU), N);  // power of two
+
+  // num_blocks grows by a factor of 1.5 (integer arithmetic) per iteration.
+  uint32_t num_blocks = 2;
+  while (num_blocks < 100000) {
+    constexpr size_t kReps = 10000;
+    uint64_t index_sum = 0;
+    // Draw kReps indices and check that each one addresses a whole block.
+    for (size_t rep = 0; rep != kReps; ++rep) {
+      const uint32_t low_bits = static_cast<uint32_t>(rng() & 0xFFFFFFFF);
+      const size_t index = detail::RandomChunkIndex(num_blocks, low_bits);
+      HWY_ASSERT(((index + 1) * lanes_per_block) <=
+                 num_blocks * lanes_per_block);
+
+      index_sum += index;
+    }
+
+    // The mean index should lie near the middle of [0, num_blocks).
+    const double expected = (num_blocks - 1) / 2.0;
+    const double actual = static_cast<double>(index_sum) / kReps;
+    HWY_ASSERT(0.9 * expected <= actual && actual <= 1.1 * expected);
+
+    num_blocks = 3 * num_blocks / 2;
+  }
+}
+
+// Runs the random generator test for 32-bit and 64-bit unsigned lanes.
+HWY_NOINLINE void TestAllGenerator() {
+  TestRandomGenerator<uint32_t>();
+  TestRandomGenerator<uint64_t>();
+}
+
+#else
+// Empty stand-ins used when neither VQSORT_ENABLED nor HWY_IDE is set, so that
+// the test registration at the end of this file still has functions to name.
+static void TestAllMedian() {}
+static void TestAllBaseCase() {}
+static void TestAllPartition() {}
+static void TestAllGenerator() {}
+#endif  // VQSORT_ENABLED
+
+// Remembers input, and compares results to that of a reference algorithm.
+template <class Traits>
+class CompareResults {
+  using LaneType = typename Traits::LaneType;
+  using KeyType = typename Traits::KeyType;
+
+ public:
+  // Snapshots the `num_lanes` lanes starting at `in`. The vector's range
+  // constructor is used instead of resize() + memcpy() so that an empty input
+  // never passes a possibly-null data() pointer to memcpy.
+  CompareResults(const LaneType* in, size_t num_lanes)
+      : copy_(in, in + num_lanes) {}
+
+  // Sorts the snapshot in place with the reference algorithm (pdqsort if
+  // HAVE_PDQSORT, otherwise Algo::kStd) and compares it lane by lane with
+  // `output`, which must hold at least as many lanes as the snapshot.
+  // Returns true if all lanes match; otherwise prints the first mismatch to
+  // stderr and returns false.
+  bool Verify(const LaneType* output) {
+#if HAVE_PDQSORT
+    const Algo reference = Algo::kPDQ;
+#else
+    const Algo reference = Algo::kStd;
+#endif
+    SharedState shared;
+    using Order = typename Traits::Order;
+    const Traits st;
+    const size_t num_keys = copy_.size() / st.LanesPerKey();
+    Run<Order>(reference, reinterpret_cast<KeyType*>(copy_.data()), num_keys,
+               shared, /*thread=*/0);
+#if VQSORT_PRINT >= 3
+    fprintf(stderr, "\nExpected:\n");
+    for (size_t i = 0; i < copy_.size(); ++i) {
+      PrintValue(copy_[i]);
+    }
+    fprintf(stderr, "\n");
+#endif
+    for (size_t i = 0; i < copy_.size(); ++i) {
+      if (copy_[i] != output[i]) {
+        // 16-byte keys are reported as raw 64-bit lanes; other keys go
+        // through PrintValue.
+        if (sizeof(KeyType) == 16) {
+          fprintf(stderr,
+                  "%s Asc %d mismatch at %d of %d: %" PRIu64 " %" PRIu64 "\n",
+                  st.KeyString().c_str(), Order().IsAscending(),
+                  static_cast<int>(i), static_cast<int>(copy_.size()),
+                  static_cast<uint64_t>(copy_[i]),
+                  static_cast<uint64_t>(output[i]));
+        } else {
+          fprintf(stderr, "Type %s Asc %d mismatch at %d of %d: ",
+                  st.KeyString().c_str(), Order().IsAscending(),
+                  static_cast<int>(i), static_cast<int>(copy_.size()));
+          PrintValue(copy_[i]);
+          PrintValue(output[i]);
+          fprintf(stderr, "\n");
+        }
+        return false;
+      }
+    }
+    return true;
+  }
+
+ private:
+  // Copy of the input; holds the reference-sorted lanes once Verify has run.
+  std::vector<LaneType> copy_;
+};
+
+// Returns the algorithms exercised by TestSort: each optional third-party
+// sorter whose HAVE_* macro is enabled, followed by kHeap and kVQSort.
+std::vector<Algo> AlgoForTest() {
+  std::vector<Algo> algos;
+#if HAVE_AVX2SORT
+  algos.push_back(Algo::kSEA);
+#endif
+#if HAVE_IPS4O
+  algos.push_back(Algo::kIPS4O);
+#endif
+#if HAVE_PDQSORT
+  algos.push_back(Algo::kPDQ);
+#endif
+#if HAVE_SORT512
+  algos.push_back(Algo::kSort512);
+#endif
+  algos.push_back(Algo::kHeap);
+  algos.push_back(Algo::kVQSort);
+  return algos;
+}
+
+// Sorts `num_lanes` lanes with every algorithm from AlgoForTest, every input
+// distribution from AllDist and several buffer misalignments. Each result is
+// compared against a reference sort and VerifySort, and the red zones on both
+// sides of the keys are checked to be untouched.
+template <class Traits>
+void TestSort(size_t num_lanes) {
+// Workaround for stack overflow on clang-cl (/F 8388608 does not help).
+#if defined(_MSC_VER)
+  return;
+#endif
+  using Order = typename Traits::Order;
+  using LaneType = typename Traits::LaneType;
+  using KeyType = typename Traits::KeyType;
+  SharedState shared;
+  SharedTraits<Traits> st;
+
+  // For 128-bit keys, bump an odd lane count to a whole number of keys.
+  if (st.Is128() && (num_lanes & 1)) ++num_lanes;
+  const size_t lanes_per_key = st.LanesPerKey();
+  const size_t num_keys = num_lanes / lanes_per_key;
+
+  constexpr size_t kMaxMisalign = 16;
+  const size_t misalignments[] = {0, lanes_per_key, 3 * lanes_per_key,
+                                  kMaxMisalign / 2};
+  // Red-zone fill values: lowest before the keys, highest after them.
+  const LaneType low_guard = hwy::LowestValue<LaneType>();
+  const LaneType high_guard = hwy::HighestValue<LaneType>();
+
+  auto storage =
+      hwy::AllocateAligned<LaneType>(kMaxMisalign + num_lanes + kMaxMisalign);
+  for (Algo algo : AlgoForTest()) {
+    for (Dist dist : AllDist()) {
+      for (size_t misalign : misalignments) {
+        LaneType* lanes = storage.get() + misalign;
+
+        // Fill the red zones on both sides of the keys.
+        for (size_t i = 0; i < misalign; ++i) {
+          storage[i] = low_guard;
+        }
+        for (size_t i = num_lanes; i < num_lanes + kMaxMisalign; ++i) {
+          lanes[i] = high_guard;
+        }
+#if HWY_IS_MSAN
+        __msan_poison(storage.get(), misalign * sizeof(LaneType));
+        __msan_poison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType));
+#endif
+        InputStats<LaneType> input_stats =
+            GenerateInput(dist, lanes, num_lanes);
+
+        // `compare` snapshots the unsorted input before it is sorted in place.
+        CompareResults<Traits> compare(lanes, num_lanes);
+        Run<Order>(algo, reinterpret_cast<KeyType*>(lanes), num_keys, shared,
+                   /*thread=*/0);
+        HWY_ASSERT(compare.Verify(lanes));
+        HWY_ASSERT(VerifySort(st, input_stats, lanes, num_lanes, "TestSort"));
+
+        // The red zones must be readable again before they are inspected.
+#if HWY_IS_MSAN
+        __msan_unpoison(storage.get(), misalign * sizeof(LaneType));
+        __msan_unpoison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType));
+#endif
+        for (size_t i = 0; i < misalign; ++i) {
+          if (storage[i] != low_guard)
+            HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
+        }
+        for (size_t i = num_lanes; i < num_lanes + kMaxMisalign; ++i) {
+          if (lanes[i] != high_guard)
+            HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
+        }
+      }  // misalign
+    }    // dist
+  }      // algo
+}
+
+// Runs TestSort for each supported key type and order at several input sizes.
+// Each nominal size is first passed through AdjustedReps.
+void TestAllSort() {
+  for (int num : {129, 504, 3 * 1000, 34567}) {
+    const size_t num_lanes = AdjustedReps(static_cast<size_t>(num));
+    TestSort<TraitsLane<OrderAscending<int16_t> > >(num_lanes);
+    TestSort<TraitsLane<OrderDescending<uint16_t> > >(num_lanes);
+
+    TestSort<TraitsLane<OrderDescending<int32_t> > >(num_lanes);
+    TestSort<TraitsLane<OrderDescending<uint32_t> > >(num_lanes);
+
+    TestSort<TraitsLane<OrderAscending<int64_t> > >(num_lanes);
+    TestSort<TraitsLane<OrderAscending<uint64_t> > >(num_lanes);
+
+    // WARNING: for float types, SIMD comparisons will flush denormals to
+    // zero, causing mismatches with scalar sorts. In this test, we avoid
+    // generating denormal inputs.
+    TestSort<TraitsLane<OrderAscending<float> > >(num_lanes);
+#if HWY_HAVE_FLOAT64  // protects algo-inl's GenerateRandom
+    // Additionally gated at runtime on Sorter::HaveFloat64().
+    if (Sorter::HaveFloat64()) {
+      TestSort<TraitsLane<OrderDescending<double> > >(num_lanes);
+    }
+#endif
+
+// Our HeapSort does not support 128-bit keys.
+#if VQSORT_ENABLED
+    TestSort<Traits128<OrderAscending128> >(num_lanes);
+    TestSort<Traits128<OrderDescending128> >(num_lanes);
+
+    TestSort<TraitsLane<OrderAscendingKV64> >(num_lanes);
+    TestSort<TraitsLane<OrderDescendingKV64> >(num_lanes);
+
+    TestSort<Traits128<OrderAscendingKV128> >(num_lanes);
+    TestSort<Traits128<OrderDescendingKV128> >(num_lanes);
+#endif
+  }
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+namespace {
+// Test suite registration: one entry per TestAll* function defined above.
+// NOTE(review): presumably each macro exports the per-target function and
+// instantiates a test case for it; confirm against the Highway test utilities.
+HWY_BEFORE_TEST(SortTest);
+HWY_EXPORT_AND_TEST_P(SortTest, TestAllMedian);
+HWY_EXPORT_AND_TEST_P(SortTest, TestAllBaseCase);
+HWY_EXPORT_AND_TEST_P(SortTest, TestAllPartition);
+HWY_EXPORT_AND_TEST_P(SortTest, TestAllGenerator);
+HWY_EXPORT_AND_TEST_P(SortTest, TestAllSort);
+}  // namespace
+}  // namespace hwy
+
+#endif  // HWY_ONCE
@@ -0,0 +1,695 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Per-target
+#if defined(HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
+#undef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
+#else
+#define HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
+#endif
+
+#include "hwy/contrib/sort/shared-inl.h"  // SortConstants
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+namespace detail {
+
+#if VQSORT_ENABLED
+
+using Constants = hwy::SortConstants;
+
+// ------------------------------ SharedTraits
+
+// Code shared between all traits. It's unclear whether these can profitably be
+// specialized for Lane vs Block, or optimized like SortPairsDistance1 using
+// Compare/DupOdd.
+// Adds helpers built purely from operations that the Base traits provide.
+template <class Base>
+struct SharedTraits : public Base {
+  // Pairs each key with the one at distance two (0 with 2, 1 with 3, ...) and
+  // conditionally swaps them.
+  template <class D>
+  HWY_INLINE Vec<D> SortPairsDistance2(D d, Vec<D> v) const {
+    const Base& traits = *this;
+    Vec<D> partner = traits.SwapAdjacentPairs(d, v);
+    traits.Sort2(d, v, partner);
+    return traits.OddEvenPairs(d, partner, v);
+  }
+
+  // Pairs v with the vector formed by reversing contiguous groups of 8 keys
+  // and conditionally swaps them.
+  template <class D>
+  HWY_INLINE Vec<D> SortPairsReverse8(D d, Vec<D> v) const {
+    const Base& traits = *this;
+    Vec<D> partner = traits.ReverseKeys8(d, v);
+    traits.Sort2(d, v, partner);
+    return traits.OddEvenQuads(d, partner, v);
+  }
+
+  // Pairs v with the vector formed by reversing all of its keys and
+  // conditionally swaps them. Reversing the whole vector stands in for a
+  // 16-key reversal, which the static_assert on kMaxCols guards.
+  template <class D>
+  HWY_INLINE Vec<D> SortPairsReverse16(D d, Vec<D> v) const {
+    static_assert(Constants::kMaxCols <= 16, "Need actual Reverse16");
+    const Base& traits = *this;
+    Vec<D> partner = traits.ReverseKeys(d, v);
+    traits.Sort2(d, v, partner);
+    // Upper half comes from `partner`, lower half from `v`.
+    return ConcatUpperLower(d, partner, v);  // 8 = half of the vector
+  }
+};
+
+// ------------------------------ Sorting network
+
+// (Green's irregular) sorting network for independent columns in 16 vectors.
+// Applies a fixed sequence of 60 st.Sort2 steps to pairs of the vectors
+// v0..vf, which are passed by reference and updated in place.
+// NOTE(review): Sort2 is defined by the traits, not here; presumably it is a
+// per-lane compare-exchange of its two arguments.
+template <class D, class Traits, class V = Vec<D>>
+HWY_INLINE void Sort16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
+                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
+                       V& ve, V& vf) {
+  // Pairs whose indices differ by 1.
+  st.Sort2(d, v0, v1);
+  st.Sort2(d, v2, v3);
+  st.Sort2(d, v4, v5);
+  st.Sort2(d, v6, v7);
+  st.Sort2(d, v8, v9);
+  st.Sort2(d, va, vb);
+  st.Sort2(d, vc, vd);
+  st.Sort2(d, ve, vf);
+  // Pairs whose indices differ by 2.
+  st.Sort2(d, v0, v2);
+  st.Sort2(d, v1, v3);
+  st.Sort2(d, v4, v6);
+  st.Sort2(d, v5, v7);
+  st.Sort2(d, v8, va);
+  st.Sort2(d, v9, vb);
+  st.Sort2(d, vc, ve);
+  st.Sort2(d, vd, vf);
+  // Pairs whose indices differ by 4.
+  st.Sort2(d, v0, v4);
+  st.Sort2(d, v1, v5);
+  st.Sort2(d, v2, v6);
+  st.Sort2(d, v3, v7);
+  st.Sort2(d, v8, vc);
+  st.Sort2(d, v9, vd);
+  st.Sort2(d, va, ve);
+  st.Sort2(d, vb, vf);
+  // Pairs whose indices differ by 8.
+  st.Sort2(d, v0, v8);
+  st.Sort2(d, v1, v9);
+  st.Sort2(d, v2, va);
+  st.Sort2(d, v3, vb);
+  st.Sort2(d, v4, vc);
+  st.Sort2(d, v5, vd);
+  st.Sort2(d, v6, ve);
+  st.Sort2(d, v7, vf);
+  // Remaining, irregular part of the network.
+  st.Sort2(d, v5, va);
+  st.Sort2(d, v6, v9);
+  st.Sort2(d, v3, vc);
+  st.Sort2(d, v7, vb);
+  st.Sort2(d, vd, ve);
+  st.Sort2(d, v4, v8);
+  st.Sort2(d, v1, v2);
+  st.Sort2(d, v1, v4);
+  st.Sort2(d, v7, vd);
+  st.Sort2(d, v2, v8);
+  st.Sort2(d, vb, ve);
+  st.Sort2(d, v2, v4);
+  st.Sort2(d, v5, v6);
+  st.Sort2(d, v9, va);
+  st.Sort2(d, vb, vd);
+  st.Sort2(d, v3, v8);
+  st.Sort2(d, v7, vc);
+  st.Sort2(d, v3, v5);
+  st.Sort2(d, v6, v8);
+  st.Sort2(d, v7, v9);
+  st.Sort2(d, va, vc);
+  st.Sort2(d, v3, v4);
+  st.Sort2(d, v5, v6);
+  st.Sort2(d, v7, v8);
+  st.Sort2(d, v9, va);
+  st.Sort2(d, vb, vc);
+  st.Sort2(d, v6, v7);
+  st.Sort2(d, v8, v9);
+}
+
+// ------------------------------ Merging networks
+
+// Blacher's hybrid bitonic/odd-even networks, generated by print_network.cc.
+
+// Merge step over the 16 vectors v0..vf, which are updated in place. It runs
+// four rounds, each reversing keys in some vectors with st.ReverseKeys2 and
+// then applying st.Sort2 to vector pairs, and finishes with
+// st.SortPairsDistance1 on every vector.
+template <class D, class Traits, class V = Vec<D>>
+HWY_INLINE void Merge2(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
+                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
+                       V& ve, V& vf) {
+  // Round 1: reverse the upper eight vectors, then pair vi with v(15-i).
+  v8 = st.ReverseKeys2(d, v8);
+  v9 = st.ReverseKeys2(d, v9);
+  va = st.ReverseKeys2(d, va);
+  vb = st.ReverseKeys2(d, vb);
+  vc = st.ReverseKeys2(d, vc);
+  vd = st.ReverseKeys2(d, vd);
+  ve = st.ReverseKeys2(d, ve);
+  vf = st.ReverseKeys2(d, vf);
+  st.Sort2(d, v0, vf);
+  st.Sort2(d, v1, ve);
+  st.Sort2(d, v2, vd);
+  st.Sort2(d, v3, vc);
+  st.Sort2(d, v4, vb);
+  st.Sort2(d, v5, va);
+  st.Sort2(d, v6, v9);
+  st.Sort2(d, v7, v8);
+  // Round 2: within each group of eight vectors.
+  v4 = st.ReverseKeys2(d, v4);
+  vc = st.ReverseKeys2(d, vc);
+  v5 = st.ReverseKeys2(d, v5);
+  vd = st.ReverseKeys2(d, vd);
+  v6 = st.ReverseKeys2(d, v6);
+  ve = st.ReverseKeys2(d, ve);
+  v7 = st.ReverseKeys2(d, v7);
+  vf = st.ReverseKeys2(d, vf);
+  st.Sort2(d, v0, v7);
+  st.Sort2(d, v8, vf);
+  st.Sort2(d, v1, v6);
+  st.Sort2(d, v9, ve);
+  st.Sort2(d, v2, v5);
+  st.Sort2(d, va, vd);
+  st.Sort2(d, v3, v4);
+  st.Sort2(d, vb, vc);
+  // Round 3: within each group of four vectors.
+  v2 = st.ReverseKeys2(d, v2);
+  v3 = st.ReverseKeys2(d, v3);
+  v6 = st.ReverseKeys2(d, v6);
+  v7 = st.ReverseKeys2(d, v7);
+  va = st.ReverseKeys2(d, va);
+  vb = st.ReverseKeys2(d, vb);
+  ve = st.ReverseKeys2(d, ve);
+  vf = st.ReverseKeys2(d, vf);
+  st.Sort2(d, v0, v3);
+  st.Sort2(d, v1, v2);
+  st.Sort2(d, v4, v7);
+  st.Sort2(d, v5, v6);
+  st.Sort2(d, v8, vb);
+  st.Sort2(d, v9, va);
+  st.Sort2(d, vc, vf);
+  st.Sort2(d, vd, ve);
+  // Round 4: within each pair of adjacent vectors.
+  v1 = st.ReverseKeys2(d, v1);
+  v3 = st.ReverseKeys2(d, v3);
+  v5 = st.ReverseKeys2(d, v5);
+  v7 = st.ReverseKeys2(d, v7);
+  v9 = st.ReverseKeys2(d, v9);
+  vb = st.ReverseKeys2(d, vb);
+  vd = st.ReverseKeys2(d, vd);
+  vf = st.ReverseKeys2(d, vf);
+  st.Sort2(d, v0, v1);
+  st.Sort2(d, v2, v3);
+  st.Sort2(d, v4, v5);
+  st.Sort2(d, v6, v7);
+  st.Sort2(d, v8, v9);
+  st.Sort2(d, va, vb);
+  st.Sort2(d, vc, vd);
+  st.Sort2(d, ve, vf);
+  // Final pass within each vector.
+  v0 = st.SortPairsDistance1(d, v0);
+  v1 = st.SortPairsDistance1(d, v1);
+  v2 = st.SortPairsDistance1(d, v2);
+  v3 = st.SortPairsDistance1(d, v3);
+  v4 = st.SortPairsDistance1(d, v4);
+  v5 = st.SortPairsDistance1(d, v5);
+  v6 = st.SortPairsDistance1(d, v6);
+  v7 = st.SortPairsDistance1(d, v7);
+  v8 = st.SortPairsDistance1(d, v8);
+  v9 = st.SortPairsDistance1(d, v9);
+  va = st.SortPairsDistance1(d, va);
+  vb = st.SortPairsDistance1(d, vb);
+  vc = st.SortPairsDistance1(d, vc);
+  vd = st.SortPairsDistance1(d, vd);
+  ve = st.SortPairsDistance1(d, ve);
+  vf = st.SortPairsDistance1(d, vf);
+}
+
+// Merge step over the 16 vectors v0..vf, which are updated in place. It runs
+// four rounds, each reversing keys in some vectors with st.ReverseKeys4 and
+// then applying st.Sort2 to vector pairs, and finishes with
+// st.SortPairsReverse4 followed by st.SortPairsDistance1 on every vector.
+template <class D, class Traits, class V = Vec<D>>
+HWY_INLINE void Merge4(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
+                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
+                       V& ve, V& vf) {
+  // Round 1: reverse the upper eight vectors, then pair vi with v(15-i).
+  v8 = st.ReverseKeys4(d, v8);
+  v9 = st.ReverseKeys4(d, v9);
+  va = st.ReverseKeys4(d, va);
+  vb = st.ReverseKeys4(d, vb);
+  vc = st.ReverseKeys4(d, vc);
+  vd = st.ReverseKeys4(d, vd);
+  ve = st.ReverseKeys4(d, ve);
+  vf = st.ReverseKeys4(d, vf);
+  st.Sort2(d, v0, vf);
+  st.Sort2(d, v1, ve);
+  st.Sort2(d, v2, vd);
+  st.Sort2(d, v3, vc);
+  st.Sort2(d, v4, vb);
+  st.Sort2(d, v5, va);
+  st.Sort2(d, v6, v9);
+  st.Sort2(d, v7, v8);
+  // Round 2: within each group of eight vectors.
+  v4 = st.ReverseKeys4(d, v4);
+  vc = st.ReverseKeys4(d, vc);
+  v5 = st.ReverseKeys4(d, v5);
+  vd = st.ReverseKeys4(d, vd);
+  v6 = st.ReverseKeys4(d, v6);
+  ve = st.ReverseKeys4(d, ve);
+  v7 = st.ReverseKeys4(d, v7);
+  vf = st.ReverseKeys4(d, vf);
+  st.Sort2(d, v0, v7);
+  st.Sort2(d, v8, vf);
+  st.Sort2(d, v1, v6);
+  st.Sort2(d, v9, ve);
+  st.Sort2(d, v2, v5);
+  st.Sort2(d, va, vd);
+  st.Sort2(d, v3, v4);
+  st.Sort2(d, vb, vc);
+  // Round 3: within each group of four vectors.
+  v2 = st.ReverseKeys4(d, v2);
+  v3 = st.ReverseKeys4(d, v3);
+  v6 = st.ReverseKeys4(d, v6);
+  v7 = st.ReverseKeys4(d, v7);
+  va = st.ReverseKeys4(d, va);
+  vb = st.ReverseKeys4(d, vb);
+  ve = st.ReverseKeys4(d, ve);
+  vf = st.ReverseKeys4(d, vf);
+  st.Sort2(d, v0, v3);
+  st.Sort2(d, v1, v2);
+  st.Sort2(d, v4, v7);
+  st.Sort2(d, v5, v6);
+  st.Sort2(d, v8, vb);
+  st.Sort2(d, v9, va);
+  st.Sort2(d, vc, vf);
+  st.Sort2(d, vd, ve);
+  // Round 4: within each pair of adjacent vectors.
+  v1 = st.ReverseKeys4(d, v1);
+  v3 = st.ReverseKeys4(d, v3);
+  v5 = st.ReverseKeys4(d, v5);
+  v7 = st.ReverseKeys4(d, v7);
+  v9 = st.ReverseKeys4(d, v9);
+  vb = st.ReverseKeys4(d, vb);
+  vd = st.ReverseKeys4(d, vd);
+  vf = st.ReverseKeys4(d, vf);
+  st.Sort2(d, v0, v1);
+  st.Sort2(d, v2, v3);
+  st.Sort2(d, v4, v5);
+  st.Sort2(d, v6, v7);
+  st.Sort2(d, v8, v9);
+  st.Sort2(d, va, vb);
+  st.Sort2(d, vc, vd);
+  st.Sort2(d, ve, vf);
+  // Two final passes within each vector: SortPairsReverse4, then
+  // SortPairsDistance1.
+  v0 = st.SortPairsReverse4(d, v0);
+  v1 = st.SortPairsReverse4(d, v1);
+  v2 = st.SortPairsReverse4(d, v2);
+  v3 = st.SortPairsReverse4(d, v3);
+  v4 = st.SortPairsReverse4(d, v4);
+  v5 = st.SortPairsReverse4(d, v5);
+  v6 = st.SortPairsReverse4(d, v6);
+  v7 = st.SortPairsReverse4(d, v7);
+  v8 = st.SortPairsReverse4(d, v8);
+  v9 = st.SortPairsReverse4(d, v9);
+  va = st.SortPairsReverse4(d, va);
+  vb = st.SortPairsReverse4(d, vb);
+  vc = st.SortPairsReverse4(d, vc);
+  vd = st.SortPairsReverse4(d, vd);
+  ve = st.SortPairsReverse4(d, ve);
+  vf = st.SortPairsReverse4(d, vf);
+  v0 = st.SortPairsDistance1(d, v0);
+  v1 = st.SortPairsDistance1(d, v1);
+  v2 = st.SortPairsDistance1(d, v2);
+  v3 = st.SortPairsDistance1(d, v3);
+  v4 = st.SortPairsDistance1(d, v4);
+  v5 = st.SortPairsDistance1(d, v5);
+  v6 = st.SortPairsDistance1(d, v6);
+  v7 = st.SortPairsDistance1(d, v7);
+  v8 = st.SortPairsDistance1(d, v8);
+  v9 = st.SortPairsDistance1(d, v9);
+  va = st.SortPairsDistance1(d, va);
+  vb = st.SortPairsDistance1(d, vb);
+  vc = st.SortPairsDistance1(d, vc);
+  vd = st.SortPairsDistance1(d, vd);
+  ve = st.SortPairsDistance1(d, ve);
+  vf = st.SortPairsDistance1(d, vf);
+}
+
+template <class D, class Traits, class V = Vec<D>>
+HWY_INLINE void Merge8(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
+                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
+                       V& ve, V& vf) {  // all 16 vectors are updated in place
+  v8 = st.ReverseKeys8(d, v8);  // Step 1: ReverseKeys8 on v8..vf, ...
+  v9 = st.ReverseKeys8(d, v9);
+  va = st.ReverseKeys8(d, va);
+  vb = st.ReverseKeys8(d, vb);
+  vc = st.ReverseKeys8(d, vc);
+  vd = st.ReverseKeys8(d, vd);
+  ve = st.ReverseKeys8(d, ve);
+  vf = st.ReverseKeys8(d, vf);
+  st.Sort2(d, v0, vf);  // ... then Sort2 of (v0,vf), (v1,ve), ..., (v7,v8).
+  st.Sort2(d, v1, ve);
+  st.Sort2(d, v2, vd);
+  st.Sort2(d, v3, vc);
+  st.Sort2(d, v4, vb);
+  st.Sort2(d, v5, va);
+  st.Sort2(d, v6, v9);
+  st.Sort2(d, v7, v8);
+  v4 = st.ReverseKeys8(d, v4);  // Step 2: ReverseKeys8 on v4..v7, vc..vf, ...
+  vc = st.ReverseKeys8(d, vc);
+  v5 = st.ReverseKeys8(d, v5);
+  vd = st.ReverseKeys8(d, vd);
+  v6 = st.ReverseKeys8(d, v6);
+  ve = st.ReverseKeys8(d, ve);
+  v7 = st.ReverseKeys8(d, v7);
+  vf = st.ReverseKeys8(d, vf);
+  st.Sort2(d, v0, v7);  // ... then Sort2 of (v0,v7)..(v3,v4), (v8,vf)..(vb,vc).
+  st.Sort2(d, v8, vf);
+  st.Sort2(d, v1, v6);
+  st.Sort2(d, v9, ve);
+  st.Sort2(d, v2, v5);
+  st.Sort2(d, va, vd);
+  st.Sort2(d, v3, v4);
+  st.Sort2(d, vb, vc);
+  v2 = st.ReverseKeys8(d, v2);  // Step 3: upper two of each group of four, ...
+  v3 = st.ReverseKeys8(d, v3);
+  v6 = st.ReverseKeys8(d, v6);
+  v7 = st.ReverseKeys8(d, v7);
+  va = st.ReverseKeys8(d, va);
+  vb = st.ReverseKeys8(d, vb);
+  ve = st.ReverseKeys8(d, ve);
+  vf = st.ReverseKeys8(d, vf);
+  st.Sort2(d, v0, v3);  // ... then Sort2 of outer and inner pair per group.
+  st.Sort2(d, v1, v2);
+  st.Sort2(d, v4, v7);
+  st.Sort2(d, v5, v6);
+  st.Sort2(d, v8, vb);
+  st.Sort2(d, v9, va);
+  st.Sort2(d, vc, vf);
+  st.Sort2(d, vd, ve);
+  v1 = st.ReverseKeys8(d, v1);  // Step 4: the odd-numbered vectors, ...
+  v3 = st.ReverseKeys8(d, v3);
+  v5 = st.ReverseKeys8(d, v5);
+  v7 = st.ReverseKeys8(d, v7);
+  v9 = st.ReverseKeys8(d, v9);
+  vb = st.ReverseKeys8(d, vb);
+  vd = st.ReverseKeys8(d, vd);
+  vf = st.ReverseKeys8(d, vf);
+  st.Sort2(d, v0, v1);  // ... then Sort2 of adjacent vectors (v0,v1), (v2,v3)...
+  st.Sort2(d, v2, v3);
+  st.Sort2(d, v4, v5);
+  st.Sort2(d, v6, v7);
+  st.Sort2(d, v8, v9);
+  st.Sort2(d, va, vb);
+  st.Sort2(d, vc, vd);
+  st.Sort2(d, ve, vf);
+  v0 = st.SortPairsReverse8(d, v0);  // Step 5: per-vector passes on every row
+  v1 = st.SortPairsReverse8(d, v1);
+  v2 = st.SortPairsReverse8(d, v2);
+  v3 = st.SortPairsReverse8(d, v3);
+  v4 = st.SortPairsReverse8(d, v4);
+  v5 = st.SortPairsReverse8(d, v5);
+  v6 = st.SortPairsReverse8(d, v6);
+  v7 = st.SortPairsReverse8(d, v7);
+  v8 = st.SortPairsReverse8(d, v8);
+  v9 = st.SortPairsReverse8(d, v9);
+  va = st.SortPairsReverse8(d, va);
+  vb = st.SortPairsReverse8(d, vb);
+  vc = st.SortPairsReverse8(d, vc);
+  vd = st.SortPairsReverse8(d, vd);
+  ve = st.SortPairsReverse8(d, ve);
+  vf = st.SortPairsReverse8(d, vf);
+  v0 = st.SortPairsDistance2(d, v0);  // ... followed by distance 2 ...
+  v1 = st.SortPairsDistance2(d, v1);
+  v2 = st.SortPairsDistance2(d, v2);
+  v3 = st.SortPairsDistance2(d, v3);
+  v4 = st.SortPairsDistance2(d, v4);
+  v5 = st.SortPairsDistance2(d, v5);
+  v6 = st.SortPairsDistance2(d, v6);
+  v7 = st.SortPairsDistance2(d, v7);
+  v8 = st.SortPairsDistance2(d, v8);
+  v9 = st.SortPairsDistance2(d, v9);
+  va = st.SortPairsDistance2(d, va);
+  vb = st.SortPairsDistance2(d, vb);
+  vc = st.SortPairsDistance2(d, vc);
+  vd = st.SortPairsDistance2(d, vd);
+  ve = st.SortPairsDistance2(d, ve);
+  vf = st.SortPairsDistance2(d, vf);
+  v0 = st.SortPairsDistance1(d, v0);  // ... and finally distance 1.
+  v1 = st.SortPairsDistance1(d, v1);
+  v2 = st.SortPairsDistance1(d, v2);
+  v3 = st.SortPairsDistance1(d, v3);
+  v4 = st.SortPairsDistance1(d, v4);
+  v5 = st.SortPairsDistance1(d, v5);
+  v6 = st.SortPairsDistance1(d, v6);
+  v7 = st.SortPairsDistance1(d, v7);
+  v8 = st.SortPairsDistance1(d, v8);
+  v9 = st.SortPairsDistance1(d, v9);
+  va = st.SortPairsDistance1(d, va);
+  vb = st.SortPairsDistance1(d, vb);
+  vc = st.SortPairsDistance1(d, vc);
+  vd = st.SortPairsDistance1(d, vd);
+  ve = st.SortPairsDistance1(d, ve);
+  vf = st.SortPairsDistance1(d, vf);
+}
+
+// Not compiled on MSVC; the call in SortingNetwork also skips debug builds.
+#if !HWY_COMPILER_MSVC
+
+template <class D, class Traits, class V = Vec<D>>
+HWY_INLINE void Merge16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
+                        V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc,
+                        V& vd, V& ve, V& vf) {  // all 16 are updated in place
+  v8 = st.ReverseKeys16(d, v8);  // Step 1: ReverseKeys16 on v8..vf, ...
+  v9 = st.ReverseKeys16(d, v9);
+  va = st.ReverseKeys16(d, va);
+  vb = st.ReverseKeys16(d, vb);
+  vc = st.ReverseKeys16(d, vc);
+  vd = st.ReverseKeys16(d, vd);
+  ve = st.ReverseKeys16(d, ve);
+  vf = st.ReverseKeys16(d, vf);
+  st.Sort2(d, v0, vf);  // ... then Sort2 of (v0,vf), (v1,ve), ..., (v7,v8).
+  st.Sort2(d, v1, ve);
+  st.Sort2(d, v2, vd);
+  st.Sort2(d, v3, vc);
+  st.Sort2(d, v4, vb);
+  st.Sort2(d, v5, va);
+  st.Sort2(d, v6, v9);
+  st.Sort2(d, v7, v8);
+  v4 = st.ReverseKeys16(d, v4);  // Step 2: ReverseKeys16 on v4..v7, vc..vf, ...
+  vc = st.ReverseKeys16(d, vc);
+  v5 = st.ReverseKeys16(d, v5);
+  vd = st.ReverseKeys16(d, vd);
+  v6 = st.ReverseKeys16(d, v6);
+  ve = st.ReverseKeys16(d, ve);
+  v7 = st.ReverseKeys16(d, v7);
+  vf = st.ReverseKeys16(d, vf);
+  st.Sort2(d, v0, v7);  // ... then Sort2 of (v0,v7)..(v3,v4), (v8,vf)..(vb,vc).
+  st.Sort2(d, v8, vf);
+  st.Sort2(d, v1, v6);
+  st.Sort2(d, v9, ve);
+  st.Sort2(d, v2, v5);
+  st.Sort2(d, va, vd);
+  st.Sort2(d, v3, v4);
+  st.Sort2(d, vb, vc);
+  v2 = st.ReverseKeys16(d, v2);  // Step 3: upper two of each group of four, ...
+  v3 = st.ReverseKeys16(d, v3);
+  v6 = st.ReverseKeys16(d, v6);
+  v7 = st.ReverseKeys16(d, v7);
+  va = st.ReverseKeys16(d, va);
+  vb = st.ReverseKeys16(d, vb);
+  ve = st.ReverseKeys16(d, ve);
+  vf = st.ReverseKeys16(d, vf);
+  st.Sort2(d, v0, v3);  // ... then Sort2 of outer and inner pair per group.
+  st.Sort2(d, v1, v2);
+  st.Sort2(d, v4, v7);
+  st.Sort2(d, v5, v6);
+  st.Sort2(d, v8, vb);
+  st.Sort2(d, v9, va);
+  st.Sort2(d, vc, vf);
+  st.Sort2(d, vd, ve);
+  v1 = st.ReverseKeys16(d, v1);  // Step 4: the odd-numbered vectors, ...
+  v3 = st.ReverseKeys16(d, v3);
+  v5 = st.ReverseKeys16(d, v5);
+  v7 = st.ReverseKeys16(d, v7);
+  v9 = st.ReverseKeys16(d, v9);
+  vb = st.ReverseKeys16(d, vb);
+  vd = st.ReverseKeys16(d, vd);
+  vf = st.ReverseKeys16(d, vf);
+  st.Sort2(d, v0, v1);  // ... then Sort2 of adjacent vectors (v0,v1), (v2,v3)...
+  st.Sort2(d, v2, v3);
+  st.Sort2(d, v4, v5);
+  st.Sort2(d, v6, v7);
+  st.Sort2(d, v8, v9);
+  st.Sort2(d, va, vb);
+  st.Sort2(d, vc, vd);
+  st.Sort2(d, ve, vf);
+  v0 = st.SortPairsReverse16(d, v0);  // Step 5: per-vector passes on every row
+  v1 = st.SortPairsReverse16(d, v1);
+  v2 = st.SortPairsReverse16(d, v2);
+  v3 = st.SortPairsReverse16(d, v3);
+  v4 = st.SortPairsReverse16(d, v4);
+  v5 = st.SortPairsReverse16(d, v5);
+  v6 = st.SortPairsReverse16(d, v6);
+  v7 = st.SortPairsReverse16(d, v7);
+  v8 = st.SortPairsReverse16(d, v8);
+  v9 = st.SortPairsReverse16(d, v9);
+  va = st.SortPairsReverse16(d, va);
+  vb = st.SortPairsReverse16(d, vb);
+  vc = st.SortPairsReverse16(d, vc);
+  vd = st.SortPairsReverse16(d, vd);
+  ve = st.SortPairsReverse16(d, ve);
+  vf = st.SortPairsReverse16(d, vf);
+  v0 = st.SortPairsDistance4(d, v0);  // ... followed by distance 4 ...
+  v1 = st.SortPairsDistance4(d, v1);
+  v2 = st.SortPairsDistance4(d, v2);
+  v3 = st.SortPairsDistance4(d, v3);
+  v4 = st.SortPairsDistance4(d, v4);
+  v5 = st.SortPairsDistance4(d, v5);
+  v6 = st.SortPairsDistance4(d, v6);
+  v7 = st.SortPairsDistance4(d, v7);
+  v8 = st.SortPairsDistance4(d, v8);
+  v9 = st.SortPairsDistance4(d, v9);
+  va = st.SortPairsDistance4(d, va);
+  vb = st.SortPairsDistance4(d, vb);
+  vc = st.SortPairsDistance4(d, vc);
+  vd = st.SortPairsDistance4(d, vd);
+  ve = st.SortPairsDistance4(d, ve);
+  vf = st.SortPairsDistance4(d, vf);
+  v0 = st.SortPairsDistance2(d, v0);  // ... then distance 2 ...
+  v1 = st.SortPairsDistance2(d, v1);
+  v2 = st.SortPairsDistance2(d, v2);
+  v3 = st.SortPairsDistance2(d, v3);
+  v4 = st.SortPairsDistance2(d, v4);
+  v5 = st.SortPairsDistance2(d, v5);
+  v6 = st.SortPairsDistance2(d, v6);
+  v7 = st.SortPairsDistance2(d, v7);
+  v8 = st.SortPairsDistance2(d, v8);
+  v9 = st.SortPairsDistance2(d, v9);
+  va = st.SortPairsDistance2(d, va);
+  vb = st.SortPairsDistance2(d, vb);
+  vc = st.SortPairsDistance2(d, vc);
+  vd = st.SortPairsDistance2(d, vd);
+  ve = st.SortPairsDistance2(d, ve);
+  vf = st.SortPairsDistance2(d, vf);
+  v0 = st.SortPairsDistance1(d, v0);  // ... and finally distance 1.
+  v1 = st.SortPairsDistance1(d, v1);
+  v2 = st.SortPairsDistance1(d, v2);
+  v3 = st.SortPairsDistance1(d, v3);
+  v4 = st.SortPairsDistance1(d, v4);
+  v5 = st.SortPairsDistance1(d, v5);
+  v6 = st.SortPairsDistance1(d, v6);
+  v7 = st.SortPairsDistance1(d, v7);
+  v8 = st.SortPairsDistance1(d, v8);
+  v9 = st.SortPairsDistance1(d, v9);
+  va = st.SortPairsDistance1(d, va);
+  vb = st.SortPairsDistance1(d, vb);
+  vc = st.SortPairsDistance1(d, vc);
+  vd = st.SortPairsDistance1(d, vd);
+  ve = st.SortPairsDistance1(d, ve);
+  vf = st.SortPairsDistance1(d, vf);
+}
+
+#endif  // !HWY_COMPILER_MSVC
+
+// Reshapes `buf` into a matrix, sorts columns independently, and then merges
+// into a sorted 1D array without transposing.
+//
+// `st` is SharedTraits<Traits*<Order*>>. This abstraction layer bridges
+//   differences in sort order and single-lane vs 128-bit keys.
+// `buf` ensures full vectors are aligned, and enables loads/stores without
+//   bounds checks. Its 16 rows (Constants::kMaxRows) are rewritten in place.
+// `cols` is the offset in lanes between rows; at most Constants::kMaxCols.
+// NOINLINE because this is large and called twice from vqsort-inl.h.
+//
+// References:
+// https://drops.dagstuhl.de/opus/volltexte/2021/13775/pdf/LIPIcs-SEA-2021-3.pdf
+// https://github.com/simd-sorting/fast-and-robust/blob/master/avx2_sort_demo/avx2sort.h
+// "Entwurf und Implementierung vektorisierter Sortieralgorithmen" (M. Blacher)
+template <class Traits, typename T>
+HWY_NOINLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
+  const CappedTag<T, Constants::kMaxCols> d;
+  using V = decltype(Zero(d));
+
+  HWY_DASSERT(cols <= Constants::kMaxCols);
+
+  // The network width depends on the number of keys, not lanes.
+  constexpr size_t kLanesPerKey = st.LanesPerKey();
+  const size_t keys = cols / kLanesPerKey;  // keys per row (runtime)
+  constexpr size_t kMaxKeys = MaxLanes(d) / kLanesPerKey;  // compile-time bound
+
+  // These are aligned iff cols == Lanes(d). We prefer unaligned/non-constexpr
+  // offsets to duplicating this code for every value of cols.
+  static_assert(Constants::kMaxRows == 16, "Update loads/stores/args");
+  V v0 = LoadU(d, buf + 0x0 * cols);  // row i starts at buf + i * cols
+  V v1 = LoadU(d, buf + 0x1 * cols);
+  V v2 = LoadU(d, buf + 0x2 * cols);
+  V v3 = LoadU(d, buf + 0x3 * cols);
+  V v4 = LoadU(d, buf + 0x4 * cols);
+  V v5 = LoadU(d, buf + 0x5 * cols);
+  V v6 = LoadU(d, buf + 0x6 * cols);
+  V v7 = LoadU(d, buf + 0x7 * cols);
+  V v8 = LoadU(d, buf + 0x8 * cols);
+  V v9 = LoadU(d, buf + 0x9 * cols);
+  V va = LoadU(d, buf + 0xa * cols);
+  V vb = LoadU(d, buf + 0xb * cols);
+  V vc = LoadU(d, buf + 0xc * cols);
+  V vd = LoadU(d, buf + 0xd * cols);
+  V ve = LoadU(d, buf + 0xe * cols);
+  V vf = LoadU(d, buf + 0xf * cols);
+
+  Sort16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, vf);
+
+  // Checking MaxLanes avoids generating HWY_ASSERT code for the unreachable
+  // code paths: if MaxLanes < 2, then keys <= cols < 2.
+  if (HWY_LIKELY(keys >= 2 && kMaxKeys >= 2)) {
+    Merge2(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
+           vf);
+
+    if (HWY_LIKELY(keys >= 4 && kMaxKeys >= 4)) {
+      Merge4(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
+             vf);
+
+      if (HWY_LIKELY(keys >= 8 && kMaxKeys >= 8)) {
+        Merge8(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
+               ve, vf);
+
+        // Avoids build timeout. Must match #if condition in kMaxCols.
+#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
+        if (HWY_LIKELY(keys >= 16 && kMaxKeys >= 16)) {
+          Merge16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
+                  ve, vf);
+
+          static_assert(Constants::kMaxCols <= 16, "Add more branches");
+        }
+#endif
+      }
+    }
+  }
+
+  StoreU(v0, d, buf + 0x0 * cols);  // write rows back to where they came from
+  StoreU(v1, d, buf + 0x1 * cols);
+  StoreU(v2, d, buf + 0x2 * cols);
+  StoreU(v3, d, buf + 0x3 * cols);
+  StoreU(v4, d, buf + 0x4 * cols);
+  StoreU(v5, d, buf + 0x5 * cols);
+  StoreU(v6, d, buf + 0x6 * cols);
+  StoreU(v7, d, buf + 0x7 * cols);
+  StoreU(v8, d, buf + 0x8 * cols);
+  StoreU(v9, d, buf + 0x9 * cols);
+  StoreU(va, d, buf + 0xa * cols);
+  StoreU(vb, d, buf + 0xb * cols);
+  StoreU(vc, d, buf + 0xc * cols);
+  StoreU(vd, d, buf + 0xd * cols);
+  StoreU(ve, d, buf + 0xe * cols);
+  StoreU(vf, d, buf + 0xf * cols);
+}
+
+#else
+template <class BaseTraits>  // fallback: adds no members beyond BaseTraits
+struct SharedTraits : BaseTraits {};
+#endif  // VQSORT_ENABLED
+
+}  // namespace detail
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif  // HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
@@ -0,0 +1,527 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Per-target
+#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
+#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
+#else
+#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
+#endif
+
+#include <string>
+
+#include "hwy/contrib/sort/shared-inl.h"  // SortConstants
+#include "hwy/contrib/sort/vqsort.h"      // SortDescending
+#include "hwy/highway.h"
+#include "hwy/print.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+namespace detail {
+
+#if VQSORT_ENABLED || HWY_IDE
+
+// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
+// along with an abstraction layer for single-lane vs. lane-pair, which is
+// independent of the order.
+template <typename T>
+struct KeyLane {  // key traits for keys occupying exactly one lane of type T
+  static constexpr bool Is128() { return false; }
+  constexpr size_t LanesPerKey() const { return 1; }
+
+  // What type bench_sort should allocate for generating inputs.
+  using LaneType = T;
+  // What type to pass to Sorter::operator().
+  using KeyType = T;
+
+  std::string KeyString() const {  // name of KeyType, as written by TypeName
+    char string100[100];
+    hwy::detail::TypeName(hwy::detail::MakeTypeInfo<KeyType>(), 1, string100);
+    return string100;
+  }
+
+  // For HeapSort: exchanges the two keys that `a` and `b` point to.
+  HWY_INLINE void Swap(T* a, T* b) const {
+    const T temp = *a;
+    *a = *b;
+    *b = temp;
+  }
+
+  template <class V, class M>
+  HWY_INLINE V CompressKeys(V keys, M mask) const {
+    return CompressNot(keys, mask);
+  }
+
+  // Broadcasts one key into a vector
+  template <class D>
+  HWY_INLINE Vec<D> SetKey(D d, const T* key) const {
+    return Set(d, *key);
+  }
+
+  template <class D>
+  HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
+    return Eq(a, b);
+  }
+
+  template <class D>
+  HWY_INLINE Mask<D> NotEqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
+    return Ne(a, b);
+  }
+  // Scalar counterpart of EqualKeys; const so const traits objects can call it.
+  HWY_INLINE bool Equal1(const T* a, const T* b) const { return *a == *b; }
+
+  template <class D>
+  HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
+    return Reverse(d, v);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> ReverseKeys2(D d, Vec<D> v) const {
+    return Reverse2(d, v);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> ReverseKeys4(D d, Vec<D> v) const {
+    return Reverse4(d, v);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> ReverseKeys8(D d, Vec<D> v) const {
+    return Reverse8(d, v);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> ReverseKeys16(D d, Vec<D> v) const {
+    static_assert(SortConstants::kMaxCols <= 16, "Assumes u32x16 = 512 bit");
+    return ReverseKeys(d, v);  // forwards to the whole-vector Reverse
+  }
+
+  template <class V>
+  HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
+    return OddEven(odd, even);
+  }
+
+  template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
+  HWY_INLINE Vec<D> SwapAdjacentPairs(D d, const Vec<D> v) const {
+    const Repartition<uint32_t, D> du32;  // shuffle is done on u32 lanes
+    return BitCast(d, Shuffle2301(BitCast(du32, v)));
+  }
+  template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
+  HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
+    return Shuffle1032(v);
+  }
+  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
+  HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
+    return SwapAdjacentBlocks(v);
+  }
+
+  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
+  HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
+#if HWY_HAVE_FLOAT64  // in case D is float32
+    const RepartitionToWide<D> dw;
+#else
+    const RepartitionToWide<RebindToUnsigned<D> > dw;
+#endif
+    return BitCast(d, SwapAdjacentPairs(dw, BitCast(dw, v)));
+  }
+  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
+  HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
+    // Assumes max vector size = 512
+    return ConcatLowerUpper(d, v, v);
+  }
+
+  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
+  HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
+                                 const Vec<D> even) const {
+#if HWY_HAVE_FLOAT64  // in case D is float32
+    const RepartitionToWide<D> dw;
+#else
+    const RepartitionToWide<RebindToUnsigned<D> > dw;
+#endif
+    return BitCast(d, OddEven(BitCast(dw, odd), BitCast(dw, even)));
+  }
+  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
+  HWY_INLINE Vec<D> OddEvenPairs(D /* tag */, Vec<D> odd, Vec<D> even) const {
+    return OddEvenBlocks(odd, even);
+  }
+
+  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
+  HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
+#if HWY_HAVE_FLOAT64  // in case D is float32
+    const RepartitionToWide<D> dw;
+#else
+    const RepartitionToWide<RebindToUnsigned<D> > dw;
+#endif
+    return BitCast(d, OddEvenPairs(dw, BitCast(dw, odd), BitCast(dw, even)));
+  }
+  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
+  HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
+    return ConcatUpperLower(d, odd, even);
+  }
+};
+
+// Anything order-related depends on the key traits *and* the order (see
+// FirstOfLanes). We cannot implement just one Compare function because Lt128
+// only compiles if the lane type is u64. Thus we need either overloaded
+// functions with a tag type, class specializations, or separate classes.
+// We avoid overloaded functions because we want all functions to be callable
+// from a SortTraits without per-function wrappers. Specializing would work, but
+// we are anyway going to specialize at a higher level.
+template <typename T>
+struct OrderAscending : public KeyLane<T> {  // smaller keys are ordered first
+  using Order = SortAscending;
+  // Scalar form of Compare: true if *a < *b. Reads no state, hence const.
+  HWY_INLINE bool Compare1(const T* a, const T* b) const { return *a < *b; }
+
+  template <class D>
+  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
+    return Lt(a, b);
+  }
+
+  // Two halves of Sort2, used in ScanMinMax.
+  template <class D>
+  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+    return Min(a, b);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+    return Max(a, b);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
+                                 T* HWY_RESTRICT /* buf */) const {
+    return MinOfLanes(d, v);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
+                                T* HWY_RESTRICT /* buf */) const {
+    return MaxOfLanes(d, v);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> FirstValue(D d) const {
+    return Set(d, hwy::LowestValue<T>());
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> LastValue(D d) const {
+    return Set(d, hwy::HighestValue<T>());
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+    return Sub(v, Set(d, hwy::Epsilon<T>()));  // v minus one Epsilon<T>
+  }
+};
+
+template <typename T>
+struct OrderDescending : public KeyLane<T> {  // larger keys are ordered first
+  using Order = SortDescending;
+  // Scalar form of Compare: true if *b < *a. Reads no state, hence const.
+  HWY_INLINE bool Compare1(const T* a, const T* b) const { return *b < *a; }
+
+  template <class D>
+  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
+    return Lt(b, a);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+    return Max(a, b);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+    return Min(a, b);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
+                                 T* HWY_RESTRICT /* buf */) const {
+    return MaxOfLanes(d, v);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
+                                T* HWY_RESTRICT /* buf */) const {
+    return MinOfLanes(d, v);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> FirstValue(D d) const {
+    return Set(d, hwy::HighestValue<T>());
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> LastValue(D d) const {
+    return Set(d, hwy::LowestValue<T>());
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+    return Add(v, Set(d, hwy::Epsilon<T>()));  // v plus one Epsilon<T>
+  }
+};
+
+struct OrderAscendingKV64 : public KeyLane<uint64_t> {
+  using Order = SortAscending;
+  // Scalar form of Compare. Reads no state, hence const.
+  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const {
+    return (*a >> 32) < (*b >> 32);  // only the upper 32 bits are compared
+  }
+
+  template <class D>
+  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
+    return Lt(ShiftRight<32>(a), ShiftRight<32>(b));
+  }
+
+  // Not required to be stable (preserving the order of equivalent keys), so
+  // we can include the value in the comparison.
+  template <class D>
+  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+    return Min(a, b);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+    return Max(a, b);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
+                                 uint64_t* HWY_RESTRICT /* buf */) const {
+    return MinOfLanes(d, v);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
+                                uint64_t* HWY_RESTRICT /* buf */) const {
+    return MaxOfLanes(d, v);
+  }
+
+  // Same as for regular lanes.
+  template <class D>
+  HWY_INLINE Vec<D> FirstValue(D d) const {
+    return Set(d, hwy::LowestValue<TFromD<D> >());
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> LastValue(D d) const {
+    return Set(d, hwy::HighestValue<TFromD<D> >());
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+    return Sub(v, Set(d, 1));  // subtracts 1 from the whole 64-bit lane
+  }
+};
+
+struct OrderDescendingKV64 : public KeyLane<uint64_t> {
+  using Order = SortDescending;
+  // Scalar form of Compare. Reads no state, hence const.
+  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const {
+    return (*b >> 32) < (*a >> 32);  // only the upper 32 bits are compared
+  }
+
+  template <class D>
+  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
+    return Lt(ShiftRight<32>(b), ShiftRight<32>(a));
+  }
+
+  // Not required to be stable (preserving the order of equivalent keys), so
+  // we can include the value in the comparison.
+  template <class D>
+  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+    return Max(a, b);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+    return Min(a, b);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
+                                 uint64_t* HWY_RESTRICT /* buf */) const {
+    return MaxOfLanes(d, v);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
+                                uint64_t* HWY_RESTRICT /* buf */) const {
+    return MinOfLanes(d, v);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> FirstValue(D d) const {
+    return Set(d, hwy::HighestValue<TFromD<D> >());
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> LastValue(D d) const {
+    return Set(d, hwy::LowestValue<TFromD<D> >());
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+    return Add(v, Set(d, 1));  // adds 1 to the whole 64-bit lane
+  }
+};
+
+// Order-dependent helpers layered on top of the key/order traits in Base.
+template <class Base>
+struct TraitsLane : Base {
+  // Conditional swap, i.e. one "node" of a sorting network. Afterwards, for
+  // each lane i, a[i] holds the first and b[i] the second of the two inputs
+  // according to Base. Uses First/Last (Min/Max), which are cheaper than
+  // compare + blend at least for integers.
+  template <class D>
+  HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
+    const Vec<D> a_in = a;  // `a` is overwritten before `b` is computed
+
+    const Base& base = *this;
+    // Before AVX3 there is no native 64-bit Min/Max (they compile to 4
+    // instructions), so use a compare + 2 IfThenElse for 64-bit lanes instead.
+#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3
+    if (sizeof(TFromD<D>) == 8) {
+      const Mask<D> a_is_first = base.Compare(d, a, b);
+      a = IfThenElse(a_is_first, a, b);
+      b = IfThenElse(a_is_first, b, a_in);
+      return;
+    }
+#endif
+    a = base.First(d, a, b);
+    b = base.Last(d, a_in, b);
+  }
+
+  // Conditionally swaps even-numbered lanes with their odd-numbered neighbor.
+  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
+  HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
+    const Base& base = *this;
+    Vec<D> neighbors = base.ReverseKeys2(d, v);
+    // Continuing the Sort2 optimization: Sort2+OddEvenKeys compile to four
+    // instructions; combining the two blends saves one of them.
+#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3
+    const Vec<D> v_is_first = VecFromMask(d, base.Compare(d, v, neighbors));
+    return IfVecThenElse(DupOdd(v_is_first), neighbors, v);
+#else
+    Sort2(d, v, neighbors);
+    return base.OddEvenKeys(neighbors, v);
+#endif
+  }
+
+  // Same operation for non-64-bit lanes, which always go through Sort2.
+  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
+  HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
+    const Base& base = *this;
+    Vec<D> neighbors = base.ReverseKeys2(d, v);
+    Sort2(d, v, neighbors);
+    return base.OddEvenKeys(neighbors, v);
+  }
+
+  // Sort2 against the vector with each contiguous group of 4 keys reversed.
+  template <class D>
+  HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const {
+    const Base& base = *this;
+    Vec<D> reversed = base.ReverseKeys4(d, v);
+    Sort2(d, v, reversed);
+    return base.OddEvenPairs(d, reversed, v);
+  }
+
+  // Conditionally swaps lane 0 with 4, 1 with 5 etc.
+  template <class D>
+  HWY_INLINE Vec<D> SortPairsDistance4(D d, Vec<D> v) const {
+    const Base& base = *this;
+    Vec<D> other_quad = base.SwapAdjacentQuads(d, v);
+    // Only called from Merge16, hence never on AVX2 with u64 (only 4 lanes);
+    // the 64-bit AVX2 optimization used in Sort2 is therefore not needed here.
+    Sort2(d, v, other_quad);
+    return base.OddEvenQuads(d, other_quad, v);
+  }
+};
+
+#else
+
+// Base class shared between OrderAscending, OrderDescending.
+template <typename T>
+struct KeyLane {
+  static constexpr bool Is128() { return false; }  // static, as when enabled
+  constexpr size_t LanesPerKey() const { return 1; }
+
+  using LaneType = T;
+  using KeyType = T;
+
+  std::string KeyString() const {  // name of KeyType, as written by TypeName
+    char string100[100];
+    hwy::detail::TypeName(hwy::detail::MakeTypeInfo<KeyType>(), 1, string100);
+    return string100;
+  }
+};
+
+template <typename T>
+struct OrderAscending : public KeyLane<T> {  // smaller keys are ordered first
+  using Order = SortAscending;
+  // Both comparisons read no state, so they are const-qualified.
+  HWY_INLINE bool Compare1(const T* a, const T* b) const { return *a < *b; }
+
+  template <class D>
+  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
+    return Lt(a, b);
+  }
+};
+
+template <typename T>
+struct OrderDescending : public KeyLane<T> {  // larger keys are ordered first
+  using Order = SortDescending;
+  // Both comparisons read no state, so they are const-qualified.
+  HWY_INLINE bool Compare1(const T* a, const T* b) const { return *b < *a; }
+
+  template <class D>
+  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
+    return Lt(b, a);
+  }
+};
+
+template <class Order>
+struct TraitsLane : Order {
+  // For HeapSort: exchanges the two values that `a` and `b` point to.
+  template <typename T>  // Templated: MSVC fails to find Order::LaneType.
+  HWY_INLINE void Swap(T* a, T* b) const {
+    const T saved_a = a[0];
+    a[0] = b[0];
+    b[0] = saved_a;
+  }
+
+  template <class D>  // Broadcasts the key that `key` points to.
+  HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
+    return Set(d, key[0]);
+  }
+};
+
+#endif  // VQSORT_ENABLED
+
+}  // namespace detail
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif  // HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
@@ -0,0 +1,492 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Per-target
+#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
+#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
+#else
+#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
+#endif
+
+#include <string>
+
+#include "hwy/contrib/sort/shared-inl.h"
+#include "hwy/contrib/sort/vqsort.h"  // SortDescending
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+namespace detail {
+
+#if VQSORT_ENABLED || HWY_IDE
+
+// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
+// along with an abstraction layer for single-lane vs. lane-pair, which is
+// independent of the order.
+struct KeyAny128 {
+  constexpr size_t LanesPerKey() const { return 2; }
+  static constexpr bool Is128() { return true; }
+
+  // Lane type that bench_sort should allocate when generating inputs.
+  using LaneType = uint64_t;
+  // Derived classes provide KeyType and KeyString.
+  // Exchanges the two-lane keys stored at `a` and `b`.
+  HWY_INLINE void Swap(LaneType* a, LaneType* b) const {
+    const FixedTag<LaneType, 2> d128;
+    const auto key_a = LoadU(d128, a);
+    StoreU(LoadU(d128, b), d128, a);
+    StoreU(key_a, d128, b);
+  }
+  // Key-granularity compress: delegates to CompressBlocksNot.
+  template <class V, class M>
+  HWY_INLINE V CompressKeys(V keys, M mask) const {
+    return CompressBlocksNot(keys, mask);
+  }
+  // Loads the key at `key` and repeats it in every 128-bit block.
+  template <class D>
+  HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
+    return LoadDup128(d, key);
+  }
+  // Reverses the order of the keys (128-bit blocks) in `v`.
+  template <class D>
+  HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
+    return ReverseBlocks(d, v);
+  }
+  // Swaps each key with its neighbor (adjacent 128-bit blocks).
+  template <class D>
+  HWY_INLINE Vec<D> ReverseKeys2(D /* tag */, const Vec<D> v) const {
+    return SwapAdjacentBlocks(v);
+  }
+
+  // Vectors wider than 512 bits are unsupported, so this sees at most 4 keys.
+  template <class D>
+  HWY_INLINE Vec<D> ReverseKeys4(D d, const Vec<D> v) const {
+    HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
+    return ReverseBlocks(d, v);
+  }
+
+  // Vectors wider than 512 bits are unsupported, so this sees at most 4 keys.
+  template <class D>
+  HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
+                                 const Vec<D> even) const {
+    HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
+    return ConcatUpperLower(d, odd, even);
+  }
+  // Odd-numbered keys come from `odd`, even-numbered keys from `even`.
+  template <class V>
+  HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
+    return OddEvenBlocks(odd, even);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> ReverseKeys8(D, Vec<D>) const {
+    HWY_ASSERT(0);  // unsupported: 8 keys would need 1024-bit vectors
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> ReverseKeys16(D, Vec<D>) const {
+    HWY_ASSERT(0);  // unsupported: 16 keys would need 2048-bit vectors
+  }
+
+  // Needed only by 8/16-column networks (unsupported).
+  template <class D>
+  HWY_INLINE Vec<D> SwapAdjacentPairs(D, Vec<D>) const {
+    HWY_ASSERT(0);
+  }
+
+  // Needed only by 16-column networks (unsupported).
+  template <class D>
+  HWY_INLINE Vec<D> SwapAdjacentQuads(D, Vec<D>) const {
+    HWY_ASSERT(0);
+  }
+
+  // Needed only by 8-column networks (unsupported).
+  template <class D>
+  HWY_INLINE Vec<D> OddEvenQuads(D, Vec<D>, Vec<D>) const {
+    HWY_ASSERT(0);
+  }
+};
+
+// Base class shared between OrderAscending128, OrderDescending128.
+struct Key128 : public KeyAny128 {
+  // What type to pass to Sorter::operator().
+  using KeyType = hwy::uint128_t;
+
+  std::string KeyString() const { return "U128"; }
+  // Vector equality of whole 128-bit keys (delegates to Eq128).
+  template <class D>
+  HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const {
+    return Eq128(d, a, b);
+  }
+  // Vector inequality of whole 128-bit keys (delegates to Ne128).
+  template <class D>
+  HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const {
+    return Ne128(d, a, b);
+  }
+  // Scalar equality: both lanes must match. Const: reads no member state.
+  HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) const {
+    return a[0] == b[0] && a[1] == b[1];
+  }
+};
+
+// Anything order-related depends on the key traits *and* the order (see
+// FirstOfLanes). We cannot implement just one Compare function because Lt128
+// only compiles if the lane type is u64. Thus we need either overloaded
+// functions with a tag type, class specializations, or separate classes.
+// We avoid overloaded functions because we want all functions to be callable
+// from a SortTraits without per-function wrappers. Specializing would work, but
+// we are anyway going to specialize at a higher level.
+struct OrderAscending128 : public Key128 {
+  using Order = SortAscending;
+  // True if key a sorts before key b: lane 1 decides, lane 0 breaks ties.
+  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const {
+    return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
+  }
+  // Vector form of Compare1 (a < b as 128-bit keys).
+  template <class D>
+  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
+    return Lt128(d, a, b);
+  }
+
+  // Used by CompareTop
+  template <class V>
+  HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
+    return Lt(a, b);
+  }
+  // Per key, whichever of a and b comes first in ascending order.
+  template <class D>
+  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
+    return Min128(d, a, b);
+  }
+  // Per key, whichever of a and b comes last in ascending order.
+  template <class D>
+  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
+    return Max128(d, a, b);
+  }
+
+  // Same as for regular lanes because 128-bit lanes are u64.
+  template <class D>
+  HWY_INLINE Vec<D> FirstValue(D d) const {
+    return Set(d, hwy::LowestValue<TFromD<D> >());
+  }
+  // Every lane holds the highest lane value.
+  template <class D>
+  HWY_INLINE Vec<D> LastValue(D d) const {
+    return Set(d, hwy::HighestValue<TFromD<D> >());
+  }
+  // Subtracts one from each 128-bit key, borrowing from lane 1 if lane 0 is 0.
+  template <class D>
+  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+    const Vec<D> k0 = Zero(d);
+    const Vec<D> k1 = OddEven(k0, Set(d, 1));
+    const Mask<D> borrow = Eq(v, k0);  // don't-care, lo == 0
+    // lo == 0? 1 : 0, 0
+    const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(borrow, k1));
+    return Sub(Sub(v, k1), adjust);
+  }
+};
+
+struct OrderDescending128 : public Key128 {
+  using Order = SortDescending;
+  // True if key a sorts before key b (b < a): lane 1 decides, lane 0 ties.
+  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const {
+    return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
+  }
+  // Vector form of Compare1 (b < a as 128-bit keys).
+  template <class D>
+  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
+    return Lt128(d, b, a);
+  }
+
+  // Used by CompareTop
+  template <class V>
+  HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
+    return Lt(b, a);
+  }
+  // Per key, whichever of a and b comes first in descending order.
+  template <class D>
+  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
+    return Max128(d, a, b);
+  }
+  // Per key, whichever of a and b comes last in descending order.
+  template <class D>
+  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
+    return Min128(d, a, b);
+  }
+
+  // Same as for regular lanes because 128-bit lanes are u64.
+  template <class D>
+  HWY_INLINE Vec<D> FirstValue(D d) const {
+    return Set(d, hwy::HighestValue<TFromD<D> >());
+  }
+  // Every lane holds the lowest lane value.
+  template <class D>
+  HWY_INLINE Vec<D> LastValue(D d) const {
+    return Set(d, hwy::LowestValue<TFromD<D> >());
+  }
+  // Adds one to each 128-bit key, carrying into lane 1 if lane 0 wraps.
+  template <class D>
+  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+    const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
+    const Vec<D> added = Add(v, k1);
+    const Mask<D> overflowed = Lt(added, v);  // false, overflowed
+    // overflowed? 1 : 0, 0
+    const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(overflowed, k1));
+    return Add(added, adjust);
+  }
+};
+
+// Base class shared between OrderAscendingKV128, OrderDescendingKV128.
+struct KeyValue128 : public KeyAny128 {
+  // What type to pass to Sorter::operator().
+  using KeyType = K64V64;
+
+  std::string KeyString() const { return "KV128"; }
+  // Vector equality of the upper lanes only (delegates to Eq128Upper).
+  template <class D>
+  HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const {
+    return Eq128Upper(d, a, b);
+  }
+  // Vector inequality of the upper lanes only (delegates to Ne128Upper).
+  template <class D>
+  HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const {
+    return Ne128Upper(d, a, b);
+  }
+  // Scalar equality: only lane 1 is compared. Const: reads no member state.
+  HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) const {
+    return a[1] == b[1];
+  }
+};
+
+struct OrderAscendingKV128 : public KeyValue128 {
+  using Order = SortAscending;
+  // True if a sorts before b; only lane 1 takes part in the comparison.
+  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const {
+    return a[1] < b[1];
+  }
+  // Vector form of Compare1 (upper lanes only).
+  template <class D>
+  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
+    return Lt128Upper(d, a, b);
+  }
+
+  // Used by CompareTop
+  template <class V>
+  HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
+    return Lt(a, b);
+  }
+  // Per key, whichever of a and b comes first in ascending order.
+  template <class D>
+  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
+    return Min128Upper(d, a, b);
+  }
+  // Per key, whichever of a and b comes last in ascending order.
+  template <class D>
+  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
+    return Max128Upper(d, a, b);
+  }
+
+  // Same as for regular lanes because 128-bit lanes are u64.
+  template <class D>
+  HWY_INLINE Vec<D> FirstValue(D d) const {
+    return Set(d, hwy::LowestValue<TFromD<D> >());
+  }
+  // Every lane holds the highest lane value.
+  template <class D>
+  HWY_INLINE Vec<D> LastValue(D d) const {
+    return Set(d, hwy::HighestValue<TFromD<D> >());
+  }
+  // Subtracts one from the odd (upper) lanes only; even lanes are unchanged.
+  template <class D>
+  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+    const Vec<D> k1 = OddEven(Set(d, 1), Zero(d));
+    return Sub(v, k1);
+  }
+};
+
+struct OrderDescendingKV128 : public KeyValue128 {
+  using Order = SortDescending;
+  // True if a sorts before b (b[1] < a[1]); only lane 1 is compared.
+  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const {
+    return b[1] < a[1];
+  }
+  // Vector form of Compare1 (upper lanes only).
+  template <class D>
+  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
+    return Lt128Upper(d, b, a);
+  }
+
+  // Used by CompareTop
+  template <class V>
+  HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
+    return Lt(b, a);
+  }
+  // Per key, whichever of a and b comes first in descending order.
+  template <class D>
+  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
+    return Max128Upper(d, a, b);
+  }
+  // Per key, whichever of a and b comes last in descending order.
+  template <class D>
+  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
+    return Min128Upper(d, a, b);
+  }
+
+  // Same as for regular lanes because 128-bit lanes are u64.
+  template <class D>
+  HWY_INLINE Vec<D> FirstValue(D d) const {
+    return Set(d, hwy::HighestValue<TFromD<D> >());
+  }
+  // Every lane holds the lowest lane value.
+  template <class D>
+  HWY_INLINE Vec<D> LastValue(D d) const {
+    return Set(d, hwy::LowestValue<TFromD<D> >());
+  }
+  // Adds one to the odd (upper) lanes only; even lanes are unchanged.
+  template <class D>
+  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+    const Vec<D> k1 = OddEven(Set(d, 1), Zero(d));
+    return Add(v, k1);
+  }
+};
+
+// Shared code for 128-bit keys; order-specific operations come from Base.
+template <class Base>
+class Traits128 : public Base {
+  // Special case for >= 256 bit vectors
+#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SVE_256
+  // Returns vector with only the top u64 lane valid. Useful when the next step
+  // is to replicate the mask anyway.
+  template <class D>
+  HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const {
+    const Base* base = static_cast<const Base*>(this);
+    const Mask<D> eqHL = Eq(a, b);  // per-lane equality
+    const Vec<D> ltHL = VecFromMask(d, base->CompareLanes(a, b));
+#if HWY_TARGET == HWY_SVE_256
+    return IfThenElse(eqHL, DupEven(ltHL), ltHL);  // equal: use even lane
+#else
+    const Vec<D> ltLX = ShiftLeftLanes<1>(ltHL);  // results moved up one lane
+    return OrAnd(ltHL, VecFromMask(d, eqHL), ltLX);  // ltHL | (eqHL & ltLX)
+#endif
+  }
+
+  // We want to swap 2 u128, i.e. 4 u64 lanes, based on the 0 or FF..FF mask in
+  // the most-significant of those lanes (the result of CompareTop), so
+  // replicate it 4x. Only called for >= 256-bit vectors.
+  template <class V>
+  HWY_INLINE V ReplicateTop4x(V v) const {
+#if HWY_TARGET == HWY_SVE_256
+    return svdup_lane_u64(v, 3);
+#elif HWY_TARGET <= HWY_AVX3
+    return V{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
+#else  // AVX2
+    return V{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
+#endif
+  }
+#endif  // HWY_TARGET
+  // FirstOfLanes/LastOfLanes reduce all keys of v with Base::First/Last.
+ public:
+  template <class D>
+  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
+                                 TFromD<D>* HWY_RESTRICT buf) const {
+    const Base* base = static_cast<const Base*>(this);
+    const size_t N = Lanes(d);
+    Store(v, d, buf);  // spill so that each key can be re-broadcast below
+    v = base->SetKey(d, buf + 0);  // result must be broadcasted
+    for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) {
+      v = base->First(d, v, base->SetKey(d, buf + i));
+    }
+    return v;
+  }
+  // Same as FirstOfLanes, but combines the keys with Base::Last.
+  template <class D>
+  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
+                                TFromD<D>* HWY_RESTRICT buf) const {
+    const Base* base = static_cast<const Base*>(this);
+    const size_t N = Lanes(d);
+    Store(v, d, buf);  // spill so that each key can be re-broadcast below
+    v = base->SetKey(d, buf + 0);  // result must be broadcasted
+    for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) {
+      v = base->Last(d, v, base->SetKey(d, buf + i));
+    }
+    return v;
+  }
+  // Per key: `a` receives whichever input Compare orders first, `b` the other.
+  template <class D>
+  HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
+    const Base* base = static_cast<const Base*>(this);
+
+    const Vec<D> a_copy = a;  // `a` is overwritten before `b` is computed
+    const auto lt = base->Compare(d, a, b);
+    a = IfThenElse(lt, a, b);
+    b = IfThenElse(lt, b, a_copy);
+  }
+
+  // Conditionally swaps even-numbered keys with their odd-numbered neighbor.
+  template <class D>
+  HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
+    const Base* base = static_cast<const Base*>(this);
+    Vec<D> swapped = base->ReverseKeys2(d, v);
+
+#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SVE_256
+    const Vec<D> select = ReplicateTop4x(CompareTop(d, v, swapped));
+    return IfVecThenElse(select, swapped, v);
+#else
+    Sort2(d, v, swapped);
+    return base->OddEvenKeys(swapped, v);
+#endif
+  }
+
+  // Swaps with the vector formed by reversing contiguous groups of 4 keys.
+  template <class D>
+  HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const {
+    const Base* base = static_cast<const Base*>(this);
+    Vec<D> swapped = base->ReverseKeys4(d, v);
+
+    // Only specialize for AVX3 because this requires 512-bit vectors.
+#if HWY_TARGET <= HWY_AVX3
+    const Vec512<uint64_t> outHx = CompareTop(d, v, swapped);
+    // Similar to ReplicateTop4x, we want to gang together 2 comparison results
+    // (4 lanes). They are not contiguous, so use permute to replicate 4x.
+    alignas(64) uint64_t kIndices[8] = {7, 7, 5, 5, 5, 5, 7, 7};
+    const Vec512<uint64_t> select =
+        TableLookupLanes(outHx, SetTableIndices(d, kIndices));
+    return IfVecThenElse(select, swapped, v);
+#else
+    Sort2(d, v, swapped);
+    return base->OddEvenPairs(d, swapped, v);
+#endif
+  }
+
+  // Conditionally swaps lane 0 with 4, 1 with 5 etc.
+  template <class D>
+  HWY_INLINE Vec<D> SortPairsDistance4(D, Vec<D>) const {
+    // Only used by Merge16, which would require 2048 bit vectors (unsupported).
+    HWY_ASSERT(0);
+  }
+};
+
+#endif  // VQSORT_ENABLED
+
+}  // namespace detail
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif  // HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
@@ -0,0 +1,184 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#include <string.h>  // memset
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/shared-inl.h"
+
+// Architectures for which we know HWY_HAVE_SCALABLE == 0. This opts into an
+// optimization that replaces dynamic allocation with stack storage.
+#ifndef VQSORT_STACK
+#if HWY_ARCH_X86 || HWY_ARCH_WASM
+#define VQSORT_STACK 1
+#else
+#define VQSORT_STACK 0
+#endif
+#endif  // VQSORT_STACK
+
+#if !VQSORT_STACK
+#include "hwy/aligned_allocator.h"
+#endif
+
+// Check if we have sys/random.h. First skip some systems on which the check
+// itself (features.h) might be problematic.
+#if defined(ANDROID) || defined(__ANDROID__) || HWY_ARCH_RVV
+#define VQSORT_GETRANDOM 0
+#endif
+
+#if !defined(VQSORT_GETRANDOM) && HWY_OS_LINUX
+#include <features.h>
+
+// ---- which libc
+#if defined(__UCLIBC__)
+#define VQSORT_GETRANDOM 1  // added Mar 2015, before uclibc-ng 1.0
+
+#elif defined(__GLIBC__) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 25)
+#define VQSORT_GETRANDOM 1
+#else
+#define VQSORT_GETRANDOM 0
+#endif
+
+#else
+// Assume MUSL, which has getrandom since 2018. There is no macro to test, see
+// https://www.openwall.com/lists/musl/2013/03/29/13.
+#define VQSORT_GETRANDOM 1
+
+#endif  // ---- which libc
+#endif  // linux
+
+#if !defined(VQSORT_GETRANDOM)
+#define VQSORT_GETRANDOM 0
+#endif
+
+// Seed source for SFC generator: 0=addresses and clock(), 1=getrandom,
+// 2=CryptGenRandom. (Not all Android versions provide the getrandom wrapper.)
+#ifndef VQSORT_SECURE_SEED
+
+#if VQSORT_GETRANDOM
+#define VQSORT_SECURE_SEED 1
+#elif defined(_WIN32) || defined(_WIN64)
+#define VQSORT_SECURE_SEED 2
+#else
+#define VQSORT_SECURE_SEED 0
+#endif
+
+#endif  // VQSORT_SECURE_SEED
+
+#if !VQSORT_SECURE_RNG
+
+#include <time.h>
+#if VQSORT_SECURE_SEED == 1
+#include <sys/random.h>
+#elif VQSORT_SECURE_SEED == 2
+#include <windows.h>
+#pragma comment(lib, "advapi32.lib")
+// Must come after windows.h.
+#include <wincrypt.h>
+#endif  // VQSORT_SECURE_SEED
+
+#endif  // !VQSORT_SECURE_RNG
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+size_t VectorSize() { return Lanes(ScalableTag<uint8_t, 3>()); }  // u8 lanes
+bool HaveFloat64() { return HWY_HAVE_FLOAT64; }  // per-target capability flag
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(VectorSize);
+HWY_EXPORT(HaveFloat64);
+
+}  // namespace
+
+Sorter::Sorter() {
+#if VQSORT_STACK
+  ptr_ = nullptr;  // Sort will use stack storage instead
+#else
+  // Determine the largest buffer size required for any type by trying them all.
+  // (The capping of N in BaseCaseNum means that smaller N but larger sizeof_t
+  // may require a larger buffer.)
+  const size_t vector_size = HWY_DYNAMIC_DISPATCH(VectorSize)();
+  const size_t max_bytes =
+      HWY_MAX(HWY_MAX(SortConstants::BufBytes<uint16_t>(vector_size),
+                      SortConstants::BufBytes<uint32_t>(vector_size)),
+              SortConstants::BufBytes<uint64_t>(vector_size));
+  ptr_ = hwy::AllocateAlignedBytes(max_bytes, nullptr, nullptr);
+  HWY_ASSERT(ptr_ != nullptr);  // allocation failed; memset(nullptr) is UB
+  // Prevent msan errors by initializing.
+  memset(ptr_, 0, max_bytes);
+#endif
+}
+
+void Sorter::Delete() {  // Frees the buffer; does nothing if VQSORT_STACK.
+#if !VQSORT_STACK
+  FreeAlignedBytes(ptr_, nullptr, nullptr);
+  ptr_ = nullptr;  // do not keep a dangling pointer to the freed buffer
+#endif
+}
+
+#if !VQSORT_SECURE_RNG
+
+void Sorter::Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes) {
+#if VQSORT_SECURE_SEED == 1
+  // May block if urandom is not yet initialized.
+  const ssize_t ret = getrandom(bytes, 24, /*flags=*/0);
+  if (ret == 24) return;  // all 24 bytes written; otherwise fall through
+#elif VQSORT_SECURE_SEED == 2
+  HCRYPTPROV hProvider{};
+  if (CryptAcquireContextA(&hProvider, nullptr, nullptr, PROV_RSA_FULL,
+                           CRYPT_VERIFYCONTEXT)) {
+    const BOOL ok =
+        CryptGenRandom(hProvider, 24, reinterpret_cast<BYTE*>(bytes));
+    CryptReleaseContext(hProvider, 0);  // released on success and failure
+    if (ok) return;
+  }
+#endif
+  // Fallback: fill the 24 bytes (three 64-bit words) without OS randomness.
+  // VQSORT_SECURE_SEED == 0, or one of the above failed. Get some entropy from
+  // stack/heap/code addresses and the clock() timer.
+  uint64_t* words = reinterpret_cast<uint64_t*>(bytes);
+  uint64_t** seed_stack = &words;  // address of a local, i.e. a stack address
+  void (*seed_code)(const void*, size_t, void*) = &Fill24Bytes;
+  const uintptr_t bits_stack = reinterpret_cast<uintptr_t>(seed_stack);
+  const uintptr_t bits_heap = reinterpret_cast<uintptr_t>(seed_heap);
+  const uintptr_t bits_code = reinterpret_cast<uintptr_t>(seed_code);
+  const uint64_t bits_time = static_cast<uint64_t>(clock());
+  words[0] = bits_stack ^ bits_time ^ seed_num;
+  words[1] = bits_heap ^ bits_time ^ seed_num;
+  words[2] = bits_code ^ bits_time ^ seed_num;
+}
+
+#endif  // !VQSORT_SECURE_RNG
+
+bool Sorter::HaveFloat64() { return HWY_DYNAMIC_DISPATCH(HaveFloat64)(); }  // flag of the dispatched target
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,108 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Interface to vectorized quicksort with dynamic dispatch.
+// Blog post: https://tinyurl.com/vqsort-blog
+// Paper with measurements: https://arxiv.org/abs/2205.05982
+//
+// To ensure the overhead of using wide vectors (e.g. AVX2 or AVX-512) is
+// worthwhile, we recommend using this code for sorting arrays whose size is at
+// least 512 KiB.
+
+#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
+#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
+
+#include "hwy/base.h"
+
+namespace hwy {
+
+// Tag arguments that determine the sort order.
+struct SortAscending {  // pass to Sorter::operator() for ascending order
+  constexpr bool IsAscending() const { return true; }
+};
+struct SortDescending {  // pass to Sorter::operator() for descending order
+  constexpr bool IsAscending() const { return false; }
+};
+
+// Allocates O(1) space. Type-erased RAII wrapper over hwy/aligned_allocator.h.
+// This allows amortizing the allocation over multiple sorts.
+class HWY_CONTRIB_DLLEXPORT Sorter {
+ public:
+  Sorter();
+  ~Sorter() { Delete(); }
+
+  // Move-only: the buffer has one owner; a moved-from Sorter holds nullptr.
+  Sorter(const Sorter&) = delete;
+  Sorter& operator=(const Sorter&) = delete;
+  Sorter(Sorter&& other) noexcept : ptr_(other.ptr_) {
+    // A new object owns nothing yet, so just take over other's buffer.
+    other.ptr_ = nullptr;
+  }
+  Sorter& operator=(Sorter&& other) noexcept {
+    if (this == &other) return *this;  // self-move must keep our buffer
+    Delete();
+    ptr_ = other.ptr_;
+    other.ptr_ = nullptr;
+    return *this;
+  }
+
+  // Sorts keys[0, n). Dispatches to the best available instruction set,
+  // and does not allocate memory.
+  void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
+  void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
+  void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
+  void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
+  void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
+  void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
+
+  void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
+  void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
+  void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
+  void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
+  void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
+  void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
+
+  void operator()(float* HWY_RESTRICT keys, size_t n, SortAscending) const;
+  void operator()(float* HWY_RESTRICT keys, size_t n, SortDescending) const;
+  void operator()(double* HWY_RESTRICT keys, size_t n, SortAscending) const;
+  void operator()(double* HWY_RESTRICT keys, size_t n, SortDescending) const;
+
+  void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
+  void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
+
+  void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortAscending) const;
+  void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortDescending) const;
+
+  void operator()(K32V32* HWY_RESTRICT keys, size_t n, SortAscending) const;
+  void operator()(K32V32* HWY_RESTRICT keys, size_t n, SortDescending) const;
+
+  // For internal use only
+  static void Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes);
+  static bool HaveFloat64();
+
+ private:
+  void Delete();  // releases ptr_ (defined in vqsort.cc)
+
+  template <typename T>
+  T* Get() const {  // the buffer viewed as T*; may be nullptr
+    return static_cast<T*>(ptr_);
+  }
+
+  void* ptr_ = nullptr;
+};
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
@@ -0,0 +1,62 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128a.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits128-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void Sort128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
+                uint64_t* HWY_RESTRICT buf) {
+#if !VQSORT_ENABLED
+  (void)keys, (void)num, (void)buf;
+  HWY_ASSERT(0);  // vqsort is disabled for this target
+#else
+  // Ascending order over 128-bit keys stored as pairs of u64 lanes.
+  using Traits128Asc = detail::Traits128<detail::OrderAscending128>;
+  SortTag<uint64_t> tag;
+  detail::SharedTraits<Traits128Asc> traits;
+  Sort(tag, traits, keys, num, buf);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(Sort128Asc);
+}  // namespace
+
+void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n,
+                        SortAscending) const {  // sorts keys[0, n) ascending
+  HWY_DYNAMIC_DISPATCH(Sort128Asc)
+  (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());  // 2 u64 per key
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,62 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128d.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits128-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void Sort128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
+                 uint64_t* HWY_RESTRICT buf) {
+#if !VQSORT_ENABLED
+  (void)keys, (void)num, (void)buf;
+  HWY_ASSERT(0);  // vqsort is disabled for this target
+#else
+  // Descending order over 128-bit keys stored as pairs of u64 lanes.
+  using Traits128Desc = detail::Traits128<detail::OrderDescending128>;
+  SortTag<uint64_t> tag;
+  detail::SharedTraits<Traits128Desc> traits;
+  Sort(tag, traits, keys, num, buf);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(Sort128Desc);
+}  // namespace
+
+void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n,
+                        SortDescending) const {  // sorts keys[0, n) descending
+  HWY_DYNAMIC_DISPATCH(Sort128Desc)
+  (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());  // 2 u64 per key
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,53 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32a.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortF32Asc(float* HWY_RESTRICT keys, size_t num, float* HWY_RESTRICT buf) {
+  SortTag<float> tag;  // vector type descriptor for float keys
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<float>>> asc;
+  Sort(tag, asc, keys, num, buf);  // ascending, one float lane per key
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortF32Asc);
+}  // namespace
+
+void Sorter::operator()(float* HWY_RESTRICT keys, size_t n,
+                        SortAscending) const {  // sorts keys[0, n) ascending
+  HWY_DYNAMIC_DISPATCH(SortF32Asc)(keys, n, Get<float>());  // ptr_ as buffer
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,54 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32d.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortF32Desc(float* HWY_RESTRICT keys, size_t num,
+                 float* HWY_RESTRICT buf) {
+  SortTag<float> tag;  // vector type descriptor for float keys
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<float>>> desc;
+  Sort(tag, desc, keys, num, buf);  // descending, one float lane per key
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortF32Desc);
+}  // namespace
+
+void Sorter::operator()(float* HWY_RESTRICT keys, size_t n,
+                        SortDescending) const {  // sorts keys[0, n) descending
+  HWY_DYNAMIC_DISPATCH(SortF32Desc)(keys, n, Get<float>());  // ptr_ as buffer
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,61 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64a.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortF64Asc(double* HWY_RESTRICT keys, size_t num,  // one copy per compiled target (lives in hwy::HWY_NAMESPACE)
+                double* HWY_RESTRICT buf) {  // buf is just forwarded to Sort(); presumably scratch space, TODO confirm
+#if HWY_HAVE_FLOAT64  // real implementation only where HWY_HAVE_FLOAT64 is nonzero
+  SortTag<double> d;  // tag object passed to Sort() as its first argument
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<double>>> st;  // ordering policy: OrderAscending<double>
+  Sort(d, st, keys, num, buf);  // the sorting itself is implemented outside this file
+#else  // otherwise: mark the parameters as used, then fail unconditionally
+  (void)keys;
+  (void)num;
+  (void)buf;
+  HWY_ASSERT(0);  // condition is the constant 0, so the assertion fails whenever this is reached
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortF64Asc);  // per-target table of SortF64Asc, read by HWY_DYNAMIC_DISPATCH below
+}  // namespace
+
+void Sorter::operator()(double* HWY_RESTRICT keys, size_t n,  // overload selected by the double key type plus the SortAscending tag
+                        SortAscending) const {  // tag parameter is unnamed; only its type matters here
+  HWY_DYNAMIC_DISPATCH(SortF64Asc)(keys, n, Get<double>());  // call the SortF64Asc chosen for the running CPU; Get<double>() fills its buf parameter
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,61 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64d.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortF64Desc(double* HWY_RESTRICT keys, size_t num,  // one copy per compiled target (lives in hwy::HWY_NAMESPACE)
+                 double* HWY_RESTRICT buf) {  // buf is just forwarded to Sort(); presumably scratch space, TODO confirm
+#if HWY_HAVE_FLOAT64  // real implementation only where HWY_HAVE_FLOAT64 is nonzero
+  SortTag<double> d;  // tag object passed to Sort() as its first argument
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<double>>> st;  // ordering policy: OrderDescending<double>
+  Sort(d, st, keys, num, buf);  // the sorting itself is implemented outside this file
+#else  // otherwise: mark the parameters as used, then fail unconditionally
+  (void)keys;
+  (void)num;
+  (void)buf;
+  HWY_ASSERT(0);  // condition is the constant 0, so the assertion fails whenever this is reached
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortF64Desc);  // per-target table of SortF64Desc, read by HWY_DYNAMIC_DISPATCH below
+}  // namespace
+
+void Sorter::operator()(double* HWY_RESTRICT keys, size_t n,  // overload selected by the double key type plus the SortDescending tag
+                        SortDescending) const {  // tag parameter is unnamed; only its type matters here
+  HWY_DYNAMIC_DISPATCH(SortF64Desc)(keys, n, Get<double>());  // call the SortF64Desc chosen for the running CPU; Get<double>() fills its buf parameter
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,54 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16a.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortI16Asc(int16_t* HWY_RESTRICT keys, size_t num,  // one copy per compiled target (lives in hwy::HWY_NAMESPACE)
+                int16_t* HWY_RESTRICT buf) {  // buf is just forwarded to Sort(); presumably scratch space, TODO confirm
+  SortTag<int16_t> d;  // tag object passed to Sort() as its first argument
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int16_t>>> st;  // ordering policy: OrderAscending<int16_t>
+  Sort(d, st, keys, num, buf);  // the sorting itself is implemented outside this file
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortI16Asc);  // per-target table of SortI16Asc, read by HWY_DYNAMIC_DISPATCH below
+}  // namespace
+
+void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,  // overload selected by the int16_t key type plus the SortAscending tag
+                        SortAscending) const {  // tag parameter is unnamed; only its type matters here
+  HWY_DYNAMIC_DISPATCH(SortI16Asc)(keys, n, Get<int16_t>());  // call the SortI16Asc chosen for the running CPU; Get<int16_t>() fills its buf parameter
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,54 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16d.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortI16Desc(int16_t* HWY_RESTRICT keys, size_t num,  // one copy per compiled target (lives in hwy::HWY_NAMESPACE)
+                 int16_t* HWY_RESTRICT buf) {  // buf is just forwarded to Sort(); presumably scratch space, TODO confirm
+  SortTag<int16_t> d;  // tag object passed to Sort() as its first argument
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int16_t>>> st;  // ordering policy: OrderDescending<int16_t>
+  Sort(d, st, keys, num, buf);  // the sorting itself is implemented outside this file
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortI16Desc);  // per-target table of SortI16Desc, read by HWY_DYNAMIC_DISPATCH below
+}  // namespace
+
+void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,  // overload selected by the int16_t key type plus the SortDescending tag
+                        SortDescending) const {  // tag parameter is unnamed; only its type matters here
+  HWY_DYNAMIC_DISPATCH(SortI16Desc)(keys, n, Get<int16_t>());  // call the SortI16Desc chosen for the running CPU; Get<int16_t>() fills its buf parameter
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,54 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32a.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortI32Asc(int32_t* HWY_RESTRICT keys, size_t num,  // one copy per compiled target (lives in hwy::HWY_NAMESPACE)
+                int32_t* HWY_RESTRICT buf) {  // buf is just forwarded to Sort(); presumably scratch space, TODO confirm
+  SortTag<int32_t> d;  // tag object passed to Sort() as its first argument
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int32_t>>> st;  // ordering policy: OrderAscending<int32_t>
+  Sort(d, st, keys, num, buf);  // the sorting itself is implemented outside this file
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortI32Asc);  // per-target table of SortI32Asc, read by HWY_DYNAMIC_DISPATCH below
+}  // namespace
+
+void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n,  // overload selected by the int32_t key type plus the SortAscending tag
+                        SortAscending) const {  // tag parameter is unnamed; only its type matters here
+  HWY_DYNAMIC_DISPATCH(SortI32Asc)(keys, n, Get<int32_t>());  // call the SortI32Asc chosen for the running CPU; Get<int32_t>() fills its buf parameter
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,54 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32d.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortI32Desc(int32_t* HWY_RESTRICT keys, size_t num,  // one copy per compiled target (lives in hwy::HWY_NAMESPACE)
+                 int32_t* HWY_RESTRICT buf) {  // buf is just forwarded to Sort(); presumably scratch space, TODO confirm
+  SortTag<int32_t> d;  // tag object passed to Sort() as its first argument
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int32_t>>> st;  // ordering policy: OrderDescending<int32_t>
+  Sort(d, st, keys, num, buf);  // the sorting itself is implemented outside this file
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortI32Desc);  // per-target table of SortI32Desc, read by HWY_DYNAMIC_DISPATCH below
+}  // namespace
+
+void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n,  // overload selected by the int32_t key type plus the SortDescending tag
+                        SortDescending) const {  // tag parameter is unnamed; only its type matters here
+  HWY_DYNAMIC_DISPATCH(SortI32Desc)(keys, n, Get<int32_t>());  // call the SortI32Desc chosen for the running CPU; Get<int32_t>() fills its buf parameter
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,54 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64a.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortI64Asc(int64_t* HWY_RESTRICT keys, size_t num,  // one copy per compiled target (lives in hwy::HWY_NAMESPACE)
+                int64_t* HWY_RESTRICT buf) {  // buf is just forwarded to Sort(); presumably scratch space, TODO confirm
+  SortTag<int64_t> d;  // tag object passed to Sort() as its first argument
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st;  // ordering policy: OrderAscending<int64_t>
+  Sort(d, st, keys, num, buf);  // the sorting itself is implemented outside this file
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortI64Asc);  // per-target table of SortI64Asc, read by HWY_DYNAMIC_DISPATCH below
+}  // namespace
+
+void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n,  // overload selected by the int64_t key type plus the SortAscending tag
+                        SortAscending) const {  // tag parameter is unnamed; only its type matters here
+  HWY_DYNAMIC_DISPATCH(SortI64Asc)(keys, n, Get<int64_t>());  // call the SortI64Asc chosen for the running CPU; Get<int64_t>() fills its buf parameter
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,54 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64d.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortI64Desc(int64_t* HWY_RESTRICT keys, size_t num,  // one copy per compiled target (lives in hwy::HWY_NAMESPACE)
+                 int64_t* HWY_RESTRICT buf) {  // buf is just forwarded to Sort(); presumably scratch space, TODO confirm
+  SortTag<int64_t> d;  // tag object passed to Sort() as its first argument
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int64_t>>> st;  // ordering policy: OrderDescending<int64_t>
+  Sort(d, st, keys, num, buf);  // the sorting itself is implemented outside this file
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortI64Desc);  // per-target table of SortI64Desc, read by HWY_DYNAMIC_DISPATCH below
+}  // namespace
+
+void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n,  // overload selected by the int64_t key type plus the SortDescending tag
+                        SortDescending) const {  // tag parameter is unnamed; only its type matters here
+  HWY_DYNAMIC_DISPATCH(SortI64Desc)(keys, n, Get<int64_t>());  // call the SortI64Desc chosen for the running CPU; Get<int64_t>() fills its buf parameter
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,65 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+// clang-format off
+// (avoid line break, which would prevent Copybara rules from matching)
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128a.cc"  //NOLINT
+// clang-format on
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits128-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortKV128Asc(uint64_t* HWY_RESTRICT keys, size_t num,  // one copy per compiled target; num counts uint64_t elements, not key/value pairs
+                  uint64_t* HWY_RESTRICT buf) {  // buf is just forwarded to Sort(); presumably scratch space, TODO confirm
+#if VQSORT_ENABLED  // real implementation only where VQSORT_ENABLED is nonzero
+  SortTag<uint64_t> d;  // tag object passed to Sort() as its first argument
+  detail::SharedTraits<detail::Traits128<detail::OrderAscendingKV128>> st;  // ordering policy: OrderAscendingKV128 wrapped in Traits128
+  Sort(d, st, keys, num, buf);  // the sorting itself is implemented outside this file
+#else  // otherwise: mark the parameters as used, then fail unconditionally
+  (void) keys;
+  (void) num;
+  (void) buf;
+  HWY_ASSERT(0);  // condition is the constant 0, so the assertion fails whenever this is reached
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortKV128Asc);  // per-target table of SortKV128Asc, read by HWY_DYNAMIC_DISPATCH below
+}  // namespace
+
+void Sorter::operator()(K64V64* HWY_RESTRICT keys, size_t n,  // overload selected by the K64V64 item type plus the SortAscending tag
+                        SortAscending) const {  // tag parameter is unnamed; only its type matters here
+  HWY_DYNAMIC_DISPATCH(SortKV128Asc)  // pick the SortKV128Asc chosen for the running CPU
+  (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());  // n items viewed as n * 2 uint64_t (K64V64 presumably holds two; confirm in vqsort.h)
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,65 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+// clang-format off
+// (avoid line break, which would prevent Copybara rules from matching)
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128d.cc"  //NOLINT
+// clang-format on
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits128-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortKV128Desc(uint64_t* HWY_RESTRICT keys, size_t num,  // one copy per compiled target; num counts uint64_t elements, not key/value pairs
+                   uint64_t* HWY_RESTRICT buf) {  // buf is just forwarded to Sort(); presumably scratch space, TODO confirm
+#if VQSORT_ENABLED  // real implementation only where VQSORT_ENABLED is nonzero
+  SortTag<uint64_t> d;  // tag object passed to Sort() as its first argument
+  detail::SharedTraits<detail::Traits128<detail::OrderDescendingKV128>> st;  // ordering policy: OrderDescendingKV128 wrapped in Traits128
+  Sort(d, st, keys, num, buf);  // the sorting itself is implemented outside this file
+#else  // otherwise: mark the parameters as used, then fail unconditionally
+  (void) keys;
+  (void) num;
+  (void) buf;
+  HWY_ASSERT(0);  // condition is the constant 0, so the assertion fails whenever this is reached
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortKV128Desc);  // per-target table of SortKV128Desc, read by HWY_DYNAMIC_DISPATCH below
+}  // namespace
+
+void Sorter::operator()(K64V64* HWY_RESTRICT keys, size_t n,  // overload selected by the K64V64 item type plus the SortDescending tag
+                        SortDescending) const {  // tag parameter is unnamed; only its type matters here
+  HWY_DYNAMIC_DISPATCH(SortKV128Desc)  // pick the SortKV128Desc chosen for the running CPU
+  (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());  // n items viewed as n * 2 uint64_t (K64V64 presumably holds two; confirm in vqsort.h)
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,65 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+// clang-format off
+// (avoid line break, which would prevent Copybara rules from matching)
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64a.cc"  //NOLINT
+// clang-format on
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortKV64Asc(uint64_t* HWY_RESTRICT keys, size_t num,  // one copy per compiled target (lives in hwy::HWY_NAMESPACE)
+                 uint64_t* HWY_RESTRICT buf) {  // buf is just forwarded to Sort(); presumably scratch space, TODO confirm
+#if VQSORT_ENABLED  // real implementation only where VQSORT_ENABLED is nonzero
+  SortTag<uint64_t> d;  // tag object passed to Sort() as its first argument
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscendingKV64>> st;  // ordering policy: OrderAscendingKV64 wrapped in TraitsLane
+  Sort(d, st, keys, num, buf);  // the sorting itself is implemented outside this file
+#else  // otherwise: mark the parameters as used, then fail unconditionally
+  (void) keys;
+  (void) num;
+  (void) buf;
+  HWY_ASSERT(0);  // condition is the constant 0, so the assertion fails whenever this is reached
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortKV64Asc);  // per-target table of SortKV64Asc, read by HWY_DYNAMIC_DISPATCH below
+}  // namespace
+
+void Sorter::operator()(K32V32* HWY_RESTRICT keys, size_t n,  // overload selected by the K32V32 item type plus the SortAscending tag
+                        SortAscending) const {  // tag parameter is unnamed; only its type matters here
+  HWY_DYNAMIC_DISPATCH(SortKV64Asc)  // pick the SortKV64Asc chosen for the running CPU
+  (reinterpret_cast<uint64_t*>(keys), n, Get<uint64_t>());  // count stays n: each K32V32 is treated as one uint64_t (presumably 64 bits wide; confirm in vqsort.h)
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,65 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+// clang-format off
+// (avoid line break, which would prevent Copybara rules from matching)
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64d.cc"  //NOLINT
+// clang-format on
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortKV64Desc(uint64_t* HWY_RESTRICT keys, size_t num,  // one copy per compiled target (lives in hwy::HWY_NAMESPACE)
+                  uint64_t* HWY_RESTRICT buf) {  // buf is just forwarded to Sort(); presumably scratch space, TODO confirm
+#if VQSORT_ENABLED  // real implementation only where VQSORT_ENABLED is nonzero
+  SortTag<uint64_t> d;  // tag object passed to Sort() as its first argument
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescendingKV64>> st;  // ordering policy: OrderDescendingKV64 wrapped in TraitsLane
+  Sort(d, st, keys, num, buf);  // the sorting itself is implemented outside this file
+#else  // otherwise: mark the parameters as used, then fail unconditionally
+  (void) keys;
+  (void) num;
+  (void) buf;
+  HWY_ASSERT(0);  // condition is the constant 0, so the assertion fails whenever this is reached
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortKV64Desc);  // per-target table of SortKV64Desc, read by HWY_DYNAMIC_DISPATCH below
+}  // namespace
+
+void Sorter::operator()(K32V32* HWY_RESTRICT keys, size_t n,  // overload selected by the K32V32 item type plus the SortDescending tag
+                        SortDescending) const {  // tag parameter is unnamed; only its type matters here
+  HWY_DYNAMIC_DISPATCH(SortKV64Desc)  // pick the SortKV64Desc chosen for the running CPU
+  (reinterpret_cast<uint64_t*>(keys), n, Get<uint64_t>());  // count stays n: each K32V32 is treated as one uint64_t (presumably 64 bits wide; confirm in vqsort.h)
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,54 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16a.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortU16Asc(uint16_t* HWY_RESTRICT keys, size_t num,  // one copy per compiled target (lives in hwy::HWY_NAMESPACE)
+                uint16_t* HWY_RESTRICT buf) {  // buf is just forwarded to Sort(); presumably scratch space, TODO confirm
+  SortTag<uint16_t> d;  // tag object passed to Sort() as its first argument
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint16_t>>> st;  // ordering policy: OrderAscending<uint16_t>
+  Sort(d, st, keys, num, buf);  // the sorting itself is implemented outside this file
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortU16Asc);  // per-target table of SortU16Asc, read by HWY_DYNAMIC_DISPATCH below
+}  // namespace
+
+void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,  // overload selected by the uint16_t key type plus the SortAscending tag
+                        SortAscending) const {  // tag parameter is unnamed; only its type matters here
+  HWY_DYNAMIC_DISPATCH(SortU16Asc)(keys, n, Get<uint16_t>());  // call the SortU16Asc chosen for the running CPU; Get<uint16_t>() fills its buf parameter
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,55 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16d.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortU16Desc(uint16_t* HWY_RESTRICT keys, size_t num,  // one copy per compiled target (lives in hwy::HWY_NAMESPACE)
+                 uint16_t* HWY_RESTRICT buf) {  // buf is just forwarded to Sort(); presumably scratch space, TODO confirm
+  SortTag<uint16_t> d;  // tag object passed to Sort() as its first argument
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint16_t>>>
+      st;  // ordering policy: OrderDescending<uint16_t>
+  Sort(d, st, keys, num, buf);  // the sorting itself is implemented outside this file
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortU16Desc);  // per-target table of SortU16Desc, read by HWY_DYNAMIC_DISPATCH below
+}  // namespace
+
+void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,  // overload selected by the uint16_t key type plus the SortDescending tag
+                        SortDescending) const {  // tag parameter is unnamed; only its type matters here
+  HWY_DYNAMIC_DISPATCH(SortU16Desc)(keys, n, Get<uint16_t>());  // call the SortU16Desc chosen for the running CPU; Get<uint16_t>() fills its buf parameter
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,54 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32a.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortU32Asc(uint32_t* HWY_RESTRICT keys, size_t num,  // one copy per compiled target (lives in hwy::HWY_NAMESPACE)
+                uint32_t* HWY_RESTRICT buf) {  // buf is just forwarded to Sort(); presumably scratch space, TODO confirm
+  SortTag<uint32_t> d;  // tag object passed to Sort() as its first argument
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint32_t>>> st;  // ordering policy: OrderAscending<uint32_t>
+  Sort(d, st, keys, num, buf);  // the sorting itself is implemented outside this file
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortU32Asc);  // per-target table of SortU32Asc, read by HWY_DYNAMIC_DISPATCH below
+}  // namespace
+
+void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n,  // overload selected by the uint32_t key type plus the SortAscending tag
+                        SortAscending) const {  // tag parameter is unnamed; only its type matters here
+  HWY_DYNAMIC_DISPATCH(SortU32Asc)(keys, n, Get<uint32_t>());  // call the SortU32Asc chosen for the running CPU; Get<uint32_t>() fills its buf parameter
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,55 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32d.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortU32Desc(uint32_t* HWY_RESTRICT keys, size_t num,  // one copy per compiled target (lives in hwy::HWY_NAMESPACE)
+                 uint32_t* HWY_RESTRICT buf) {  // buf is just forwarded to Sort(); presumably scratch space, TODO confirm
+  SortTag<uint32_t> d;  // tag object passed to Sort() as its first argument
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint32_t>>>
+      st;  // ordering policy: OrderDescending<uint32_t>
+  Sort(d, st, keys, num, buf);  // the sorting itself is implemented outside this file
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortU32Desc);  // per-target table of SortU32Desc, read by HWY_DYNAMIC_DISPATCH below
+}  // namespace
+
+void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n,  // overload selected by the uint32_t key type plus the SortDescending tag
+                        SortDescending) const {  // tag parameter is unnamed; only its type matters here
+  HWY_DYNAMIC_DISPATCH(SortU32Desc)(keys, n, Get<uint32_t>());  // call the SortU32Desc chosen for the running CPU; Get<uint32_t>() fills its buf parameter
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,54 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64a.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortU64Asc(uint64_t* HWY_RESTRICT keys, size_t num,  // one copy per compiled target (lives in hwy::HWY_NAMESPACE)
+                uint64_t* HWY_RESTRICT buf) {  // buf is just forwarded to Sort(); presumably scratch space, TODO confirm
+  SortTag<uint64_t> d;  // tag object passed to Sort() as its first argument
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint64_t>>> st;  // ordering policy: OrderAscending<uint64_t>
+  Sort(d, st, keys, num, buf);  // the sorting itself is implemented outside this file
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortU64Asc);  // per-target table of SortU64Asc, read by HWY_DYNAMIC_DISPATCH below
+}  // namespace
+
+void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n,  // overload selected by the uint64_t key type plus the SortAscending tag
+                        SortAscending) const {  // tag parameter is unnamed; only its type matters here
+  HWY_DYNAMIC_DISPATCH(SortU64Asc)(keys, n, Get<uint64_t>());  // call the SortU64Asc chosen for the running CPU; Get<uint64_t>() fills its buf parameter
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,55 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64d.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortU64Desc(uint64_t* HWY_RESTRICT keys, size_t num,  // one copy per compiled target (lives in hwy::HWY_NAMESPACE)
+                 uint64_t* HWY_RESTRICT buf) {  // buf is just forwarded to Sort(); presumably scratch space, TODO confirm
+  SortTag<uint64_t> d;  // tag object passed to Sort() as its first argument
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint64_t>>>
+      st;  // ordering policy: OrderDescending<uint64_t>
+  Sort(d, st, keys, num, buf);  // the sorting itself is implemented outside this file
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortU64Desc);  // per-target table of SortU64Desc, read by HWY_DYNAMIC_DISPATCH below
+}  // namespace
+
+void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n,  // overload selected by the uint64_t key type plus the SortDescending tag
+                        SortDescending) const {  // tag parameter is unnamed; only its type matters here
+  HWY_DYNAMIC_DISPATCH(SortU64Desc)(keys, n, Get<uint64_t>());  // call the SortU64Desc chosen for the running CPU; Get<uint64_t>() fills its buf parameter
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
@@ -0,0 +1,234 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
+#define HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
+
+// Detects compiler and arch from predefined macros. Zero dependencies for
+// inclusion by foreach_target.h.
+
+// Add to #if conditions to prevent IDE from graying out code.
+#if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \
+    (defined Q_CREATOR_RUN) || (defined(__CLANGD__))
+#define HWY_IDE 1
+#else
+#define HWY_IDE 0
+#endif
+
+//------------------------------------------------------------------------------
+// Compiler
+
+// Actual MSVC, not clang-cl, which defines _MSC_VER but doesn't behave like
+// MSVC in other aspects (e.g. HWY_DIAGNOSTICS).
+#if defined(_MSC_VER) && !defined(__clang__)
+#define HWY_COMPILER_MSVC _MSC_VER
+#else
+#define HWY_COMPILER_MSVC 0
+#endif
+
+#if defined(_MSC_VER) && defined(__clang__)
+#define HWY_COMPILER_CLANGCL _MSC_VER
+#else
+#define HWY_COMPILER_CLANGCL 0
+#endif
+
+#ifdef __INTEL_COMPILER
+#define HWY_COMPILER_ICC __INTEL_COMPILER
+#else
+#define HWY_COMPILER_ICC 0
+#endif
+
+#ifdef __INTEL_LLVM_COMPILER
+#define HWY_COMPILER_ICX __INTEL_LLVM_COMPILER
+#else
+#define HWY_COMPILER_ICX 0
+#endif
+
+// HWY_COMPILER_GCC is a generic macro for all compilers implementing the GNU
+// compiler extensions (eg. Clang, Intel...)
+#ifdef __GNUC__
+#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
+#else
+#define HWY_COMPILER_GCC 0
+#endif
+
+// Clang or clang-cl, not GCC.
+#ifdef __clang__
+// In case of Apple LLVM (whose version number is unrelated to that of LLVM) or
+// an invalid version number, deduce it from the presence of warnings.
+// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
+#if defined(__apple_build_version__) || __clang_major__ >= 999
+#if __has_warning("-Wbitwise-instead-of-logical")
+#define HWY_COMPILER_CLANG 1400
+#elif __has_warning("-Wreserved-identifier")
+#define HWY_COMPILER_CLANG 1300
+#elif __has_warning("-Wformat-insufficient-args")
+#define HWY_COMPILER_CLANG 1200
+#elif __has_warning("-Wimplicit-const-int-float-conversion")
+#define HWY_COMPILER_CLANG 1100
+#elif __has_warning("-Wmisleading-indentation")
+#define HWY_COMPILER_CLANG 1000
+#elif defined(__FILE_NAME__)
+#define HWY_COMPILER_CLANG 900
+#elif __has_warning("-Wextra-semi-stmt") || \
+    __has_builtin(__builtin_rotateleft32)
+#define HWY_COMPILER_CLANG 800
+// For reasons unknown, XCode 10.3 (Apple LLVM version 10.0.1) is apparently
+// based on Clang 7, but does not support the warning we test.
+// See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions and
+// https://trac.macports.org/wiki/XcodeVersionInfo.
+#elif __has_warning("-Wc++98-compat-extra-semi") || \
+    (defined(__apple_build_version__) && __apple_build_version__ >= 10010000)
+#define HWY_COMPILER_CLANG 700
+#else  // Anything older than 7.0 is not recommended for Highway.
+#define HWY_COMPILER_CLANG 600
+#endif  // __has_warning chain
+#else   // use normal version
+#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
+#endif
+#else  // Not clang
+#define HWY_COMPILER_CLANG 0
+#endif
+
+#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
+#define HWY_COMPILER_GCC_ACTUAL HWY_COMPILER_GCC
+#else
+#define HWY_COMPILER_GCC_ACTUAL 0
+#endif
+
+// More than one may be nonzero, but we want at least one.
+#if 0 == (HWY_COMPILER_MSVC + HWY_COMPILER_CLANGCL + HWY_COMPILER_ICC + \
+          HWY_COMPILER_GCC + HWY_COMPILER_CLANG)
+#error "Unsupported compiler"
+#endif
+
+// We should only detect one of these (only clang/clangcl overlap)
+#if 1 <                                                                     \
+    (!!HWY_COMPILER_MSVC + !!HWY_COMPILER_ICC + !!HWY_COMPILER_GCC_ACTUAL + \
+     !!(HWY_COMPILER_CLANGCL | HWY_COMPILER_CLANG))
+#error "Detected multiple compilers"
+#endif
+
+// Wrappers around the compiler's feature-test macros. Each expands to the
+// corresponding __has_* query when the compiler defines it, and to 0
+// otherwise, so callers can use them in #if without their own #ifdef guard.
+#ifdef __has_builtin
+#define HWY_HAS_BUILTIN(name) __has_builtin(name)
+#else
+#define HWY_HAS_BUILTIN(name) 0
+#endif
+
+#ifdef __has_attribute
+#define HWY_HAS_ATTRIBUTE(name) __has_attribute(name)
+#else
+#define HWY_HAS_ATTRIBUTE(name) 0
+#endif
+
+#ifdef __has_feature
+#define HWY_HAS_FEATURE(name) __has_feature(name)
+#else
+#define HWY_HAS_FEATURE(name) 0
+#endif
+
+//------------------------------------------------------------------------------
+// Architecture
+
+#if defined(__i386__) || defined(_M_IX86)
+#define HWY_ARCH_X86_32 1
+#else
+#define HWY_ARCH_X86_32 0
+#endif
+
+#if defined(__x86_64__) || defined(_M_X64)
+#define HWY_ARCH_X86_64 1
+#else
+#define HWY_ARCH_X86_64 0
+#endif
+
+#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64
+#error "Cannot have both x86-32 and x86-64"
+#endif
+
+#if HWY_ARCH_X86_32 || HWY_ARCH_X86_64
+#define HWY_ARCH_X86 1
+#else
+#define HWY_ARCH_X86 0
+#endif
+
+#if defined(__powerpc64__) || defined(_M_PPC)
+#define HWY_ARCH_PPC 1
+#else
+#define HWY_ARCH_PPC 0
+#endif
+
+#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)
+#define HWY_ARCH_ARM_A64 1
+#else
+#define HWY_ARCH_ARM_A64 0
+#endif
+
+#if (defined(__ARM_ARCH) && __ARM_ARCH == 7) || (defined(_M_ARM) && _M_ARM == 7)
+#define HWY_ARCH_ARM_V7 1
+#else
+#define HWY_ARCH_ARM_V7 0
+#endif
+
+#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7
+#error "Cannot have both A64 and V7"
+#endif
+
+// Any *supported* version of Arm, i.e. 7 or later
+#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
+#define HWY_ARCH_ARM 1
+#else
+#define HWY_ARCH_ARM 0
+#endif
+
+// Older than v7 (e.g. armel aka Arm v5), in which case we do not support SIMD.
+#if (defined(__arm__) || defined(_M_ARM)) && !HWY_ARCH_ARM
+#define HWY_ARCH_ARM_OLD 1
+#else
+#define HWY_ARCH_ARM_OLD 0
+#endif
+
+#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
+#define HWY_ARCH_WASM 1
+#else
+#define HWY_ARCH_WASM 0
+#endif
+
+#ifdef __riscv
+#define HWY_ARCH_RVV 1
+#else
+#define HWY_ARCH_RVV 0
+#endif
+
+// It is an error to detect multiple architectures at the same time, but OK to
+// detect none of the above.
+#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_ARM_OLD + \
+     HWY_ARCH_WASM + HWY_ARCH_RVV) > 1
+#error "Must not detect more than one architecture"
+#endif
+
+// Operating system. HWY_OS_WIN and HWY_OS_LINUX are always defined (to 1 or
+// 0) so they can be tested with #if rather than #ifdef. Unlike the
+// architecture macros above, no mutual-exclusion check is performed here.
+#if defined(_WIN32) || defined(_WIN64)
+#define HWY_OS_WIN 1
+#else
+#define HWY_OS_WIN 0
+#endif
+
+#if defined(linux) || defined(__linux__)
+#define HWY_OS_LINUX 1
+#else
+#define HWY_OS_LINUX 0
+#endif
+
+#endif  // HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
@@ -0,0 +1,478 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_DETECT_TARGETS_H_
+#define HIGHWAY_HWY_DETECT_TARGETS_H_
+
+// Defines targets and chooses which to enable.
+
+#include "hwy/detect_compiler_arch.h"
+
+//------------------------------------------------------------------------------
+// Optional configuration
+
+// See g3doc/quick_reference.md for documentation of these macros.
+
+// Uncomment to override the default baseline determined from predefined macros:
+// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
+
+// Uncomment to override the default blocklist:
+// #define HWY_BROKEN_TARGETS HWY_AVX3
+
+// Uncomment to definitely avoid generating those target(s):
+// #define HWY_DISABLED_TARGETS HWY_SSE4
+
+// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
+// AVX2 target for VMs which support AVX2 but not the other instruction sets)
+// #define HWY_DISABLE_BMI2_FMA
+
+// Uncomment to enable SSSE3/SSE4 on MSVC even if AVX is not enabled. These are
+// evaluated as values in #if expressions, so they must be defined to 1.
+// #define HWY_WANT_SSSE3 1
+// #define HWY_WANT_SSE4 1
+
+//------------------------------------------------------------------------------
+// Targets
+
+// Unique bit value for each target. A lower value is "better" (e.g. more lanes)
+// than a higher value within the same group/platform - see HWY_STATIC_TARGET.
+//
+// All values are unconditionally defined so we can test HWY_TARGETS without
+// first checking the HWY_ARCH_*.
+//
+// The C99 preprocessor evaluates #if expressions using intmax_t types. This
+// holds at least 64 bits in practice (verified 2022-07-18 via Godbolt on
+// 32-bit clang/GCC/MSVC compilers for x86/Arm7/AArch32/RISC-V/WASM). We now
+// avoid overflow when computing HWY_TARGETS (subtracting one instead of
+// left-shifting 2^62), but still do not use bit 63 because it is the sign bit.
+
+// --------------------------- x86: 15 targets (+ one fallback)
+// Bits 0..6 reserved (7 targets)
+// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VPOPCNTDQ, VBMI, VBMI2,
+// VAES, BITALG). Later to be added: BF16 (Cooper Lake). VP2INTERSECT is only in
+// Tiger Lake? We do not yet have uses for GFNI.
+#define HWY_AVX3_DL (1LL << 7)  // see HWY_WANT_AVX3_DL below
+#define HWY_AVX3 (1LL << 8)
+#define HWY_AVX2 (1LL << 9)
+// Bit 10: reserved for AVX
+#define HWY_SSE4 (1LL << 11)
+#define HWY_SSSE3 (1LL << 12)
+// Bits 13..14 reserved for SSE3 or SSE2 (2 targets)
+// The highest bit in the HWY_TARGETS mask that a x86 target can have. Used for
+// dynamic dispatch. All x86 target bits must be lower or equal to
+// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
+// HWY_MAX_DYNAMIC_TARGETS in total.
+#define HWY_HIGHEST_TARGET_BIT_X86 14
+
+// --------------------------- Arm: 15 targets (+ one fallback)
+// Bits 15..23 reserved (9 targets)
+#define HWY_SVE2_128 (1LL << 24)  // specialized target (e.g. Arm N2)
+#define HWY_SVE_256 (1LL << 25)   // specialized target (e.g. Arm V1)
+#define HWY_SVE2 (1LL << 26)
+#define HWY_SVE (1LL << 27)
+#define HWY_NEON (1LL << 28)  // On A64, includes/requires AES
+// Bit 29 reserved (Helium?)
+#define HWY_HIGHEST_TARGET_BIT_ARM 29
+
+// --------------------------- RISC-V: 9 targets (+ one fallback)
+// Bits 30..36 reserved (7 targets)
+#define HWY_RVV (1LL << 37)
+// Bit 38 reserved
+#define HWY_HIGHEST_TARGET_BIT_RVV 38
+
+// --------------------------- Future expansion: 4 targets
+// Bits 39..42 reserved
+
+
+// --------------------------- IBM Power: 9 targets (+ one fallback)
+// Bits 43..48 reserved (6 targets)
+#define HWY_PPC8 (1LL << 49)  // v2.07 or 3
+// Bits 50..51 reserved for prior VSX/AltiVec (2 targets)
+#define HWY_HIGHEST_TARGET_BIT_PPC 51
+
+// --------------------------- WebAssembly: 9 targets (+ one fallback)
+// Bits 52..57 reserved (6 targets)
+#define HWY_WASM_EMU256 (1LL << 58)  // Experimental
+#define HWY_WASM (1LL << 59)
+// Bit 60 reserved
+#define HWY_HIGHEST_TARGET_BIT_WASM 60
+
+// --------------------------- Emulation: 2 targets
+
+#define HWY_EMU128 (1LL << 61)
+// We do not add/left-shift, so this will not overflow to a negative number.
+#define HWY_SCALAR (1LL << 62)
+#define HWY_HIGHEST_TARGET_BIT_SCALAR 62
+
+// Do not use bit 63 - would be confusing to have negative numbers.
+
+//------------------------------------------------------------------------------
+// Set default blocklists
+
+// Disabled means excluded from enabled at user's request. A separate config
+// macro allows disabling without deactivating the blocklist below.
+#ifndef HWY_DISABLED_TARGETS
+#define HWY_DISABLED_TARGETS 0
+#endif
+
+// Broken means excluded from enabled due to known compiler issues. Allow the
+// user to override this blocklist without any guarantee of success.
+#ifndef HWY_BROKEN_TARGETS
+
+// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
+// SSE4 codegen (possibly only for msan), so disable all those targets.
+#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
+#define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
+// This entails a major speed reduction, so warn unless the user explicitly
+// opts in to scalar-only.
+#if !defined(HWY_COMPILE_ONLY_SCALAR)
+#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
+#endif
+
+// 32-bit may fail to compile AVX2/3.
+#elif HWY_ARCH_X86_32
+#define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
+
+// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
+#elif HWY_COMPILER_MSVC != 0
+#define HWY_BROKEN_TARGETS (HWY_AVX3 | HWY_AVX3_DL)
+
+// armv7be has not been tested and is not yet supported.
+#elif HWY_ARCH_ARM_V7 &&          \
+    (defined(__ARM_BIG_ENDIAN) || \
+     (defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN))
+#define HWY_BROKEN_TARGETS (HWY_NEON)
+
+// SVE[2] require recent clang or gcc versions.
+#elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
+    (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000)
+#define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)
+
+#else
+#define HWY_BROKEN_TARGETS 0
+#endif
+
+#endif  // HWY_BROKEN_TARGETS
+
+// Enabled means not disabled nor blocklisted.
+#define HWY_ENABLED(targets) \
+  ((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
+
+// Opt-out for EMU128 (affected by a GCC bug on multiple arches, fixed in 12.3:
+// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322). This is separate
+// from HWY_BROKEN_TARGETS because it affects the fallback target, which must
+// always be enabled. If 1, we instead choose HWY_SCALAR even without
+// HWY_COMPILE_ONLY_SCALAR being set.
+#if !defined(HWY_BROKEN_EMU128)  // allow overriding
+#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1203
+#define HWY_BROKEN_EMU128 1
+#else
+#define HWY_BROKEN_EMU128 0
+#endif
+#endif  // HWY_BROKEN_EMU128
+
+//------------------------------------------------------------------------------
+// Detect baseline targets using predefined macros
+
+// Baseline means the targets for which the compiler is allowed to generate
+// instructions, implying the target CPU would have to support them. This does
+// not take the blocklist into account.
+
+#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
+#define HWY_BASELINE_SCALAR HWY_SCALAR
+#else
+#define HWY_BASELINE_SCALAR HWY_EMU128
+#endif
+
+// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
+// HWY_TARGET == HWY_BASELINE_SCALAR.
+
+#if HWY_ARCH_WASM && defined(__wasm_simd128__)
+#if defined(HWY_WANT_WASM2)
+#define HWY_BASELINE_WASM HWY_WASM_EMU256
+#else
+#define HWY_BASELINE_WASM HWY_WASM
+#endif  // HWY_WANT_WASM2
+#else
+#define HWY_BASELINE_WASM 0
+#endif
+
+// Avoid choosing the PPC target until we have an implementation.
+#if HWY_ARCH_PPC && defined(__VSX__) && 0
+#define HWY_BASELINE_PPC8 HWY_PPC8
+#else
+#define HWY_BASELINE_PPC8 0
+#endif
+
+#define HWY_BASELINE_SVE2 0
+#define HWY_BASELINE_SVE 0
+#define HWY_BASELINE_NEON 0
+
+#if HWY_ARCH_ARM
+
+#if defined(__ARM_FEATURE_SVE2)
+#undef HWY_BASELINE_SVE2  // was 0, will be re-defined
+// If user specified -msve-vector-bits=128, they assert the vector length is
+// 128 bits and we should use the HWY_SVE2_128 (more efficient for some ops).
+#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 128
+#define HWY_BASELINE_SVE2 HWY_SVE2_128
+// Otherwise we're not sure what the vector length will be. The baseline must be
+// unconditionally valid, so we can only assume HWY_SVE2. However, when running
+// on a CPU with 128-bit vectors, user code that supports dynamic dispatch will
+// still benefit from HWY_SVE2_128 because we add it to HWY_ATTAINABLE_TARGETS.
+#else
+#define HWY_BASELINE_SVE2 HWY_SVE2
+#endif  // __ARM_FEATURE_SVE_BITS
+#endif  // __ARM_FEATURE_SVE2
+
+#if defined(__ARM_FEATURE_SVE)
+#undef HWY_BASELINE_SVE  // was 0, will be re-defined
+// See above. If user-specified vector length matches our optimization, use it.
+#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
+#define HWY_BASELINE_SVE HWY_SVE_256
+#else
+#define HWY_BASELINE_SVE HWY_SVE
+#endif  // __ARM_FEATURE_SVE_BITS
+#endif  // __ARM_FEATURE_SVE
+
+// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#undef HWY_BASELINE_NEON
+#define HWY_BASELINE_NEON HWY_NEON
+#endif
+
+#endif  // HWY_ARCH_ARM
+
+// Special handling for MSVC because it has fewer predefined macros:
+#if HWY_COMPILER_MSVC
+
+// 1) We can only be sure SSSE3/SSE4 are enabled if AVX is:
+//    https://stackoverflow.com/questions/18563978/.
+#if defined(__AVX__)
+#define HWY_CHECK_SSSE3 1
+#define HWY_CHECK_SSE4 1
+#else
+#define HWY_CHECK_SSSE3 0
+#define HWY_CHECK_SSE4 0
+#endif
+
+// 2) Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
+//    PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
+#define HWY_CHECK_PCLMUL_AES 1
+#define HWY_CHECK_BMI2_FMA 1
+#define HWY_CHECK_F16C 1
+
+#else  // non-MSVC
+
+#if defined(__SSSE3__)
+#define HWY_CHECK_SSSE3 1
+#else
+#define HWY_CHECK_SSSE3 0
+#endif
+
+#if defined(__SSE4_1__) && defined(__SSE4_2__)
+#define HWY_CHECK_SSE4 1
+#else
+#define HWY_CHECK_SSE4 0
+#endif
+
+// If these are disabled, they should not gate the availability of SSE4/AVX2.
+#if defined(HWY_DISABLE_PCLMUL_AES) || (defined(__PCLMUL__) && defined(__AES__))
+#define HWY_CHECK_PCLMUL_AES 1
+#else
+#define HWY_CHECK_PCLMUL_AES 0
+#endif
+
+#if defined(HWY_DISABLE_BMI2_FMA) || (defined(__BMI2__) && defined(__FMA__))
+#define HWY_CHECK_BMI2_FMA 1
+#else
+#define HWY_CHECK_BMI2_FMA 0
+#endif
+
+#if defined(HWY_DISABLE_F16C) || defined(__F16C__)
+#define HWY_CHECK_F16C 1
+#else
+#define HWY_CHECK_F16C 0
+#endif
+
+#endif  // non-MSVC
+
+#if HWY_ARCH_X86 && (HWY_WANT_SSSE3 || HWY_CHECK_SSSE3)
+#define HWY_BASELINE_SSSE3 HWY_SSSE3
+#else
+#define HWY_BASELINE_SSSE3 0
+#endif
+
+#if HWY_ARCH_X86 && (HWY_WANT_SSE4 || (HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES))
+#define HWY_BASELINE_SSE4 HWY_SSE4
+#else
+#define HWY_BASELINE_SSE4 0
+#endif
+
+#if HWY_BASELINE_SSE4 != 0 && HWY_CHECK_BMI2_FMA && HWY_CHECK_F16C && \
+    defined(__AVX2__)
+#define HWY_BASELINE_AVX2 HWY_AVX2
+#else
+#define HWY_BASELINE_AVX2 0
+#endif
+
+// Require everything in AVX2 plus AVX-512 flags (also set by MSVC)
+#if HWY_BASELINE_AVX2 != 0 && defined(__AVX512F__) && defined(__AVX512BW__) && \
+    defined(__AVX512DQ__) && defined(__AVX512VL__)
+#define HWY_BASELINE_AVX3 HWY_AVX3
+#else
+#define HWY_BASELINE_AVX3 0
+#endif
+
+// TODO(janwas): not yet known whether these will be set by MSVC
+#if HWY_BASELINE_AVX3 != 0 && defined(__AVXVNNI__) && defined(__VAES__) && \
+    defined(__VPCLMULQDQ__) && defined(__AVX512VBMI__) &&                  \
+    defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) &&            \
+    defined(__AVX512BITALG__)
+#define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
+#else
+#define HWY_BASELINE_AVX3_DL 0
+#endif
+
+#if HWY_ARCH_RVV && defined(__riscv_vector)
+#define HWY_BASELINE_RVV HWY_RVV
+#else
+#define HWY_BASELINE_RVV 0
+#endif
+
+// Allow the user to override this without any guarantee of success.
+#ifndef HWY_BASELINE_TARGETS
+#define HWY_BASELINE_TARGETS                                     \
+  (HWY_BASELINE_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | \
+   HWY_BASELINE_SVE2 | HWY_BASELINE_SVE | HWY_BASELINE_NEON |    \
+   HWY_BASELINE_SSSE3 | HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 |  \
+   HWY_BASELINE_AVX3 | HWY_BASELINE_AVX3_DL | HWY_BASELINE_RVV)
+#endif  // HWY_BASELINE_TARGETS
+
+//------------------------------------------------------------------------------
+// Choose target for static dispatch
+
+#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
+#if HWY_ENABLED_BASELINE == 0
+#error "At least one baseline target must be defined and enabled"
+#endif
+
+// Best baseline, used for static dispatch. This is the least-significant 1-bit
+// within HWY_ENABLED_BASELINE and lower bit values imply "better".
+#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
+
+// Start by assuming static dispatch. If we later use dynamic dispatch, this
+// will be defined to other targets during the multiple-inclusion, and finally
+// return to the initial value. Defining this outside begin/end_target ensures
+// inl headers successfully compile by themselves (required by Bazel).
+#define HWY_TARGET HWY_STATIC_TARGET
+
+//------------------------------------------------------------------------------
+// Choose targets for dynamic dispatch according to one of four policies
+
+#if 1 < (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_EMU128) + \
+         defined(HWY_COMPILE_ONLY_STATIC))
+#error "Can only define one of HWY_COMPILE_ONLY_{SCALAR|EMU128|STATIC} - bug?"
+#endif
+// Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.
+
+// Clang, GCC and MSVC allow runtime dispatch on x86.
+#if HWY_ARCH_X86
+#define HWY_HAVE_RUNTIME_DISPATCH 1
+// On Arm, currently only GCC does, and we require Linux to detect CPU
+// capabilities.
+#elif HWY_ARCH_ARM && HWY_COMPILER_GCC_ACTUAL && HWY_OS_LINUX
+#define HWY_HAVE_RUNTIME_DISPATCH 1
+#else
+#define HWY_HAVE_RUNTIME_DISPATCH 0
+#endif
+
+// AVX3_DL is not widely available yet. To reduce code size and compile time,
+// only include it in the set of attainable targets (for dynamic dispatch) if
+// the user opts in, OR it is in the baseline (we check whether enabled below).
+// The baseline test must use HWY_BASELINE_TARGETS: an undefined identifier in
+// #if silently evaluates to 0, which would drop a baseline AVX3_DL from
+// HWY_ATTAINABLE_TARGETS and trip the "best baseline should be included in
+// dynamic targets" check at the end of this header.
+#if defined(HWY_WANT_AVX3_DL) || (HWY_BASELINE_TARGETS & HWY_AVX3_DL)
+#define HWY_ATTAINABLE_AVX3_DL HWY_AVX3_DL
+#else
+#define HWY_ATTAINABLE_AVX3_DL 0
+#endif
+
+#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
+                         (HWY_ENABLED_BASELINE & (HWY_SVE | HWY_SVE_256)))
+#define HWY_ATTAINABLE_SVE HWY_ENABLED(HWY_SVE | HWY_SVE_256)
+#else
+#define HWY_ATTAINABLE_SVE 0
+#endif
+
+#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
+                         (HWY_ENABLED_BASELINE & (HWY_SVE2 | HWY_SVE2_128)))
+#define HWY_ATTAINABLE_SVE2 HWY_ENABLED(HWY_SVE2 | HWY_SVE2_128)
+#else
+#define HWY_ATTAINABLE_SVE2 0
+#endif
+
+// Attainable means enabled and the compiler allows intrinsics (even when not
+// allowed to autovectorize). Used in 3 and 4.
+#if HWY_ARCH_X86
+#define HWY_ATTAINABLE_TARGETS                                        \
+  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | \
+              HWY_AVX3 | HWY_ATTAINABLE_AVX3_DL)
+#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
+#define HWY_ATTAINABLE_TARGETS                                      \
+  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_NEON | HWY_ATTAINABLE_SVE | \
+              HWY_ATTAINABLE_SVE2)
+#else
+#define HWY_ATTAINABLE_TARGETS \
+  (HWY_ENABLED_BASELINE | HWY_ATTAINABLE_SVE | HWY_ATTAINABLE_SVE2)
+#endif
+
+// 1) For older compilers: avoid SIMD intrinsics, but still support all ops.
+#if defined(HWY_COMPILE_ONLY_EMU128) && !HWY_BROKEN_EMU128
+#undef HWY_STATIC_TARGET
+#define HWY_STATIC_TARGET HWY_EMU128  // override baseline
+#define HWY_TARGETS HWY_EMU128
+
+// 1b) HWY_SCALAR is less capable than HWY_EMU128 (which supports all ops), but
+// we currently still support it for backwards compatibility.
+#elif defined(HWY_COMPILE_ONLY_SCALAR) || \
+    (defined(HWY_COMPILE_ONLY_EMU128) && HWY_BROKEN_EMU128)
+#undef HWY_STATIC_TARGET
+#define HWY_STATIC_TARGET HWY_SCALAR  // override baseline
+#define HWY_TARGETS HWY_SCALAR
+
+// 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
+#elif defined(HWY_COMPILE_ONLY_STATIC)
+#define HWY_TARGETS HWY_STATIC_TARGET
+
+// 3) For tests: include all attainable targets (in particular: scalar)
+#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
+#define HWY_TARGETS HWY_ATTAINABLE_TARGETS
+
+// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
+// excluding superseded targets, in particular scalar. Note: HWY_STATIC_TARGET
+// may be 2^62 (HWY_SCALAR), so we must not left-shift/add it. Subtracting one
+// sets all lower bits (better targets), then we also include the static target.
+#else
+#define HWY_TARGETS \
+  (HWY_ATTAINABLE_TARGETS & ((HWY_STATIC_TARGET - 1LL) | HWY_STATIC_TARGET))
+
+#endif  // target policy
+
+// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
+// one of the dynamic targets. This also implies HWY_TARGETS != 0 and
+// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
+#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
+#error "Logic error: best baseline should be included in dynamic targets"
+#endif
+
+#endif  // HIGHWAY_HWY_DETECT_TARGETS_H_
@@ -0,0 +1,254 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS  // before inttypes.h
+#endif
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>  // abort
+
+#include <cmath>  // std::abs
+#include <memory>
+#include <numeric>  // iota, inner_product
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// Must come after foreach_target.h to avoid redefinition errors.
+#include "hwy/aligned_allocator.h"
+#include "hwy/highway.h"
+#include "hwy/nanobenchmark.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+#if HWY_TARGET != HWY_SCALAR
+using hwy::HWY_NAMESPACE::CombineShiftRightLanes;
+#endif
+
+// Owns one aligned allocation holding two float arrays of NumItems() elements
+// each: a_ is the first half, b_ points at the second half. Both halves are
+// filled with the same ascending sequence.
+class TwoArray {
+ public:
+  // Must be a multiple of the vector lane count * 8.
+  static size_t NumItems() { return 3456; }
+
+  TwoArray() : a_(AllocateAligned<float>(NumItems() * 2)) {
+    float* const first_half = a_.get();
+    b_ = first_half + NumItems();
+
+    // Start value is 1, but obtained in a way the compiler cannot see through.
+    const float start = static_cast<float>(Unpredictable1());
+    std::iota(first_half, first_half + NumItems(), start);
+    std::iota(b_, b_ + NumItems(), start);
+  }
+
+ protected:
+  AlignedFreeUniquePtr<float[]> a_;
+  float* b_;
+};
+
+// Times one Benchmark instance via MeasureClosure, asks it to verify its own
+// output, and prints one line per measurement: input size, cycles per item and
+// the variability scaled to cycles per item.
+template <class Benchmark>
+void RunBenchmark(const char* caption) {
+  printf("%10s: ", caption);
+
+  const size_t kInputCount = 1;
+  // Multiplying by Unpredictable1() keeps the item count opaque to the
+  // compiler.
+  const size_t item_count = Benchmark::NumItems() * size_t(Unpredictable1());
+  const FuncInput inputs[kInputCount] = {item_count};
+  Result measurements[kInputCount];
+
+  Benchmark bench;
+
+  Params params;
+  params.verbose = false;
+  params.max_evals = 7;
+  params.target_rel_mad = 0.002;
+
+  const auto closure = [&bench](const FuncInput input) { return bench(input); };
+  const size_t measured =
+      MeasureClosure(closure, inputs, kInputCount, measurements, params);
+  if (measured != kInputCount) {
+    fprintf(stderr, "MeasureClosure failed.\n");
+  }
+
+  bench.Verify(item_count);
+
+  // Only the entries MeasureClosure reported are valid.
+  for (size_t idx = 0; idx < measured; ++idx) {
+    const Result& r = measurements[idx];
+    const double cycles_per_item = r.ticks / static_cast<double>(r.input);
+    const double mad = r.variability * cycles_per_item;
+    printf("%6" PRIu64 ": %6.3f (+/- %5.3f)\n", static_cast<uint64_t>(r.input),
+           cycles_per_item, mad);
+  }
+}
+
+// Minimal usage example: computes out[i] = 2 * in[i]^2 for a 16-element array
+// and prints the result for element 2.
+void Intro() {
+  const float in[16] = {1, 2, 3, 4, 5, 6};
+  float out[16];
+  // Cap the vector at the array length. An uncapped ScalableTag can have more
+  // than 16 float lanes on targets with vectors wider than 512 bits, in which
+  // case LoadU/StoreU below would access memory past the end of in/out.
+  const HWY_CAPPED(float, 16) d;
+  for (size_t i = 0; i < 16; i += Lanes(d)) {
+    const auto vec = LoadU(d, in + i);  // no alignment requirement
+    auto result = Mul(vec, vec);
+    result = Add(result, result);  // can update if not const
+    StoreU(result, d, out + i);
+  }
+  printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in[2], out[2]);
+}
+
+// BEGINNER: dot product
+// 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold!
+//
+// Computes the dot product of the two TwoArray halves using four independent
+// vector accumulators, and stores the result so Verify() can compare it
+// against a scalar std::inner_product.
+class BenchmarkDot : public TwoArray {
+ public:
+  // -1 is the "operator() has not run yet" sentinel checked by Verify().
+  BenchmarkDot() : dot_{-1.0f} {}
+
+  // Returns the dot product of the first `num_items` elements of a_ and b_.
+  // Each iteration consumes 4 * Lanes(d) elements and there is no remainder
+  // loop, so `num_items` is expected to be a multiple of that.
+  FuncOutput operator()(const size_t num_items) {
+    const ScalableTag<float> d;
+    const size_t N = Lanes(d);
+    using V = decltype(Zero(d));
+    // Compiler doesn't make independent sum* accumulators, so unroll manually.
+    // We cannot use an array because V might be a sizeless type. For reasonable
+    // code, we unroll 4x, but 8x might help (2 FMA ports * 4 cycle latency).
+    V sum0 = Zero(d);
+    V sum1 = Zero(d);
+    V sum2 = Zero(d);
+    V sum3 = Zero(d);
+    const float* const HWY_RESTRICT pa = &a_[0];
+    const float* const HWY_RESTRICT pb = b_;
+    for (size_t i = 0; i < num_items; i += 4 * N) {
+      const auto a0 = Load(d, pa + i + 0 * N);
+      const auto b0 = Load(d, pb + i + 0 * N);
+      sum0 = MulAdd(a0, b0, sum0);
+      const auto a1 = Load(d, pa + i + 1 * N);
+      const auto b1 = Load(d, pb + i + 1 * N);
+      sum1 = MulAdd(a1, b1, sum1);
+      const auto a2 = Load(d, pa + i + 2 * N);
+      const auto b2 = Load(d, pb + i + 2 * N);
+      sum2 = MulAdd(a2, b2, sum2);
+      const auto a3 = Load(d, pa + i + 3 * N);
+      const auto b3 = Load(d, pb + i + 3 * N);
+      sum3 = MulAdd(a3, b3, sum3);
+    }
+    // Reduction tree: sum of all accumulators by pairs into sum0.
+    sum0 = Add(sum0, sum1);
+    sum2 = Add(sum2, sum3);
+    sum0 = Add(sum0, sum2);
+    dot_ = GetLane(SumOfLanes(d, sum0));
+    return static_cast<FuncOutput>(dot_);
+  }
+
+  // Aborts if operator() has not been called yet, or if its result deviates
+  // from the scalar reference by more than 1.1E-6 (relative error).
+  void Verify(size_t num_items) {
+    if (dot_ == -1.0f) {
+      // Terminate the line so the message is not glued to later output.
+      fprintf(stderr, "Dot: must call Verify after benchmark\n");
+      abort();
+    }
+
+    const float expected =
+        std::inner_product(a_.get(), a_.get() + num_items, b_, 0.0f);
+    const float rel_err = std::abs(expected - dot_) / expected;
+    if (rel_err > 1.1E-6f) {
+      fprintf(stderr, "Dot: expected %e actual %e (%e)\n", expected, dot_,
+              rel_err);
+      abort();
+    }
+  }
+
+ private:
+  float dot_;  // for Verify
+};
+
+// INTERMEDIATE: delta coding
+// 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold!
+//
+// Writes b_[0] = a_[0] and b_[i] = a_[i] - a_[i - 1] for i >= 1, using one of
+// three code paths selected at compile time by target capability.
+struct BenchmarkDelta : public TwoArray {
+  // Fills b_[0, num_items) with the deltas of a_ and returns the last one.
+  // The vector loops have no remainder handling, so `num_items` is expected to
+  // be a nonzero multiple of the lane count.
+  FuncOutput operator()(const size_t num_items) const {
+#if HWY_TARGET == HWY_SCALAR
+    b_[0] = a_[0];
+    for (size_t i = 1; i < num_items; ++i) {
+      b_[i] = a_[i] - a_[i - 1];
+    }
+#elif HWY_CAP_GE256
+    // Larger vectors are split into 128-bit blocks, easiest to use the
+    // unaligned load support to shift between them.
+    const ScalableTag<float> df;
+    const size_t N = Lanes(df);
+    size_t i;
+    // The first vector has no predecessor; handle it with scalar code.
+    b_[0] = a_[0];
+    for (i = 1; i < N; ++i) {
+      b_[i] = a_[i] - a_[i - 1];
+    }
+    for (; i < num_items; i += N) {
+      const auto a = Load(df, &a_[i]);
+      const auto shifted = LoadU(df, &a_[i - 1]);
+      // Named op rather than operator-, matching the 128-bit path below.
+      Store(Sub(a, shifted), df, &b_[i]);
+    }
+#else  // 128-bit
+    // Slightly better than unaligned loads
+    const HWY_CAPPED(float, 4) df;
+    const size_t N = Lanes(df);
+    size_t i;
+    // The first vector has no predecessor; handle it with scalar code.
+    b_[0] = a_[0];
+    for (i = 1; i < N; ++i) {
+      b_[i] = a_[i] - a_[i - 1];
+    }
+    auto prev = Load(df, &a_[0]);
+    for (; i < num_items; i += N) {
+      const auto a = Load(df, &a_[i]);
+      // Combine the current and previous vectors to form the "shifted by one
+      // element" operand without a second (unaligned) load.
+      const auto shifted = CombineShiftRightLanes<3>(df, a, prev);
+      prev = a;
+      Store(Sub(a, shifted), df, &b_[i]);
+    }
+#endif
+    return static_cast<FuncOutput>(b_[num_items - 1]);
+  }
+
+  // Recomputes every delta with scalar code and reports (to stderr, without
+  // aborting) each element whose absolute error exceeds 1E-6.
+  void Verify(size_t num_items) {
+    for (size_t i = 0; i < num_items; ++i) {
+      const float expected = (i == 0) ? a_[0] : a_[i] - a_[i - 1];
+      const float err = std::abs(expected - b_[i]);
+      if (err > 1E-6f) {
+        fprintf(stderr, "Delta: expected %e, actual %e\n", expected, b_[i]);
+      }
+    }
+  }
+};
+
+// Per-target entry point (exported below via HWY_EXPORT): runs the Intro()
+// example, prints a separator with the name of HWY_TARGET, then runs the dot
+// and delta benchmarks.
+void RunBenchmarks() {
+  Intro();
+  printf("------------------------ %s\n", TargetName(HWY_TARGET));
+  RunBenchmark<BenchmarkDot>("dot");
+  RunBenchmark<BenchmarkDelta>("delta");
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+HWY_EXPORT(RunBenchmarks);
+
+// Runs RunBenchmarks once per entry of SupportedAndGeneratedTargets(). Before
+// each dispatch, the supported-targets mask is overridden with that single
+// target so HWY_DYNAMIC_DISPATCH selects it.
+void Run() {
+  for (int64_t target : SupportedAndGeneratedTargets()) {
+    SetSupportedTargetsForTest(target);
+    HWY_DYNAMIC_DISPATCH(RunBenchmarks)();
+  }
+  SetSupportedTargetsForTest(0);  // Reset the mask afterwards.
+}
+
+}  // namespace hwy
+
+// Program entry point; command-line arguments are ignored. Sits inside
+// HWY_ONCE so it is emitted only once despite per-target re-inclusion.
+int main(int /*argc*/, char** /*argv*/) {
+  hwy::Run();
+  return 0;
+}
+#endif  // HWY_ONCE
@@ -0,0 +1,66 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Demo of functions that might be called from multiple SIMD modules (either
+// other -inl.h files, or a .cc file between begin/end_target-inl). This is
+// optional - all SIMD code can reside in .cc files. However, this allows
+// splitting code into different files while still inlining instead of requiring
+// calling through function pointers.
+
+// Per-target include guard. This is only required when using dynamic dispatch,
+// i.e. including foreach_target.h. For static dispatch, a normal include
+// guard would be fine because the header is only compiled once.
+#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#else
+#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#endif
+
+// It is fine to #include normal or *-inl headers.
+#include <stddef.h>
+
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace skeleton {
+namespace HWY_NAMESPACE {
+
+// Highway ops reside here; ADL does not find templates nor builtins.
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Example of a type-agnostic (caller-specified lane type) and width-agnostic
+// (uses best available instruction set) function in a header.
+//
+// Computes x_array[i] = mul_array[i] * x_array[i] + add_array[i] for i < size.
+// Every iteration loads and stores one whole vector of Lanes(d) elements and
+// there is no remainder handling, so `size` is expected to be a multiple of
+// Lanes(d).
+template <class D, typename T>
+HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array,
+                                 const T* HWY_RESTRICT add_array,
+                                 const size_t size, T* HWY_RESTRICT x_array) {
+  const size_t lanes = hn::Lanes(d);
+  for (size_t ofs = 0; ofs < size; ofs += lanes) {
+    const auto factor = hn::Load(d, mul_array + ofs);
+    const auto addend = hn::Load(d, add_array + ofs);
+    const auto x_old = hn::Load(d, x_array + ofs);
+    // Overwrite the input in place with factor * x_old + addend.
+    hn::Store(hn::MulAdd(factor, x_old, addend), d, x_array + ofs);
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace skeleton
+HWY_AFTER_NAMESPACE();
+
+#endif  // include guard
@@ -0,0 +1,121 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/examples/skeleton.h"
+
+#include <stdio.h>
+
+// >>>> for dynamic dispatch only, skip if you want static dispatch
+
+// First undef to prevent error when re-included.
+#undef HWY_TARGET_INCLUDE
+// For dynamic dispatch, specify the name of the current file (unfortunately
+// __FILE__ is not reliable) so that foreach_target.h can re-include it.
+#define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc"
+// Generates code for each enabled target by re-including this source file.
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// <<<< end of dynamic dispatch
+
+// Must come after foreach_target.h to avoid redefinition errors.
+#include "hwy/highway.h"
+
+// Optional, can instead add HWY_ATTR to all functions.
+HWY_BEFORE_NAMESPACE();
+
+namespace skeleton {
+// This namespace name is unique per target, which allows code for multiple
+// targets to co-exist in the same translation unit. Required when using dynamic
+// dispatch, otherwise optional.
+namespace HWY_NAMESPACE {
+
+// Highway ops reside here; ADL does not find templates nor builtins.
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Computes log2 of one vector of bytes by converting them to floats and
+// extracting the exponent. Compiled once per target.
+template <class DF>
+HWY_ATTR_NO_MSAN void OneFloorLog2(const DF df,
+                                   const uint8_t* HWY_RESTRICT values,
+                                   uint8_t* HWY_RESTRICT log2) {
+  // Type tags with the same lane count as df but other element types.
+  const hn::Rebind<uint8_t, DF> du8;
+  const hn::RebindToSigned<DF> di;
+
+  // Widen the bytes, convert to float and reinterpret the result as integers.
+  const auto bytes = hn::Load(du8, values);
+  const auto as_int = hn::PromoteTo(di, bytes);
+  const auto as_float = hn::ConvertTo(df, as_int);
+  const auto float_bits = hn::BitCast(di, as_float);
+
+  // Shift right by 23 bits and subtract 127 to obtain the exponent.
+  const auto shifted = hn::ShiftRight<23>(float_bits);
+  const auto exponent = hn::Sub(shifted, hn::Set(di, 127));
+  hn::Store(hn::DemoteTo(du8, exponent), du8, log2);
+}
+
+// Prints the name of the current target and whether HWY_HAVE_INTEGER64 is set.
+void CodepathDemo() {
+  // Highway defaults to portability, but per-target codepaths may be selected
+  // via #if HWY_TARGET == HWY_SSE4 or by testing capability macros:
+#if HWY_HAVE_INTEGER64
+  const char* int64_support = "Has int64";
+#else
+  const char* int64_support = "No int64";
+#endif
+  const char* target_name = hwy::TargetName(HWY_TARGET);
+  printf("Target %s: %s\n", target_name, int64_support);
+}
+
+// Applies OneFloorLog2 to the `count` bytes in `values`, writing to `log2`:
+// whole vectors first, then the remaining elements one lane at a time.
+void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
+               uint8_t* HWY_RESTRICT log2) {
+  CodepathDemo();
+
+  const hn::ScalableTag<float> df;
+  const size_t N = hn::Lanes(df);
+  size_t pos = 0;
+  while (pos + N <= count) {
+    OneFloorLog2(df, values + pos, log2 + pos);
+    pos += N;
+  }
+
+  // Remainder: a tag capped to a single lane handles one element per call.
+  hn::CappedTag<float, 1> d1;
+  while (pos < count) {
+    OneFloorLog2(d1, values + pos, log2 + pos);
+    ++pos;
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace skeleton
+HWY_AFTER_NAMESPACE();
+
+// The table of pointers to the various implementations in HWY_NAMESPACE must
+// be compiled only once (foreach_target #includes this file multiple times).
+// HWY_ONCE is true for only one of these 'compilation passes'.
+#if HWY_ONCE
+
+namespace skeleton {
+
+// This macro declares a static array used for dynamic dispatch; it resides in
+// the same outer namespace that contains FloorLog2.
+HWY_EXPORT(FloorLog2);
+
+// Optional wrapper, only needed because the function is exposed in the header
+// file. Otherwise using HWY_DYNAMIC_DISPATCH(FloorLog2) in this module is
+// equivalent to inlining this function.
+HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
+                                 const size_t count,
+                                 uint8_t* HWY_RESTRICT out) {
+  // Defined outside of HWY_NAMESPACE because it references (and calls the
+  // appropriate one of) the per-target implementations residing there.
+  // For static dispatch, use HWY_STATIC_DISPATCH instead.
+  HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
+}
+
+// Optional: anything to compile only once, e.g. non-SIMD implementations of
+// public functions provided by this module, can go inside #if HWY_ONCE.
+
+}  // namespace skeleton
+#endif  // HWY_ONCE
@@ -0,0 +1,36 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Demo interface to target-specific code in skeleton.cc
+
+// Normal header with include guard and namespace.
+#ifndef HIGHWAY_HWY_EXAMPLES_SKELETON_H_
+#define HIGHWAY_HWY_EXAMPLES_SKELETON_H_
+
+#include <stddef.h>
+
+// Platform-specific definitions used for declaring an interface, independent of
+// the SIMD instruction set.
+#include "hwy/base.h"  // HWY_RESTRICT
+
+namespace skeleton {
+
+// Computes base-2 logarithm by converting to float. Supports dynamic dispatch.
+// `in` points to `count` input bytes; one result byte per input is written to
+// `out`. Both pointers are HWY_RESTRICT, i.e. presumably must not overlap.
+HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
+                                 const size_t count, uint8_t* HWY_RESTRICT out);
+
+}  // namespace skeleton
+
+#endif  // HIGHWAY_HWY_EXAMPLES_SKELETON_H_
@@ -0,0 +1,110 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Example of unit test for the "skeleton" library.
+
+#include "hwy/examples/skeleton.h"
+
+#include <stdio.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// Must come after foreach_target.h to avoid redefinition errors.
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+// Optional: factor out parts of the implementation into *-inl.h
+// (must also come after foreach_target.h to avoid redefinition errors)
+#include "hwy/examples/skeleton-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace skeleton {
+namespace HWY_NAMESPACE {
+
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Checks CallFloorLog2 (defined in skeleton.cc) on random powers of two.
+struct TestFloorLog2 {
+  template <class T, class DF>
+  HWY_NOINLINE void operator()(T /*unused*/, DF df) {
+    const size_t count = 5 * hn::Lanes(df);
+    auto in = hwy::AllocateAligned<uint8_t>(count);
+    auto expected = hwy::AllocateAligned<uint8_t>(count);
+
+    // Each input is 1 << k for a random k in [0, 7]; k is the expected output.
+    hwy::RandomState rng;
+    for (size_t idx = 0; idx < count; ++idx) {
+      const uint32_t exponent = Random32(&rng) & 7;
+      expected[idx] = static_cast<uint8_t>(exponent);
+      in[idx] = static_cast<uint8_t>(1u << exponent);
+    }
+
+    auto out = hwy::AllocateAligned<uint8_t>(count);
+    CallFloorLog2(in.get(), count, out.get());
+
+    int checksum = 0;
+    for (size_t idx = 0; idx < count; ++idx) {
+      HWY_ASSERT_EQ(expected[idx], out[idx]);
+      checksum += out[idx];
+    }
+    hwy::PreventElision(checksum);
+  }
+};
+
+// Runs TestFloorLog2 via ForPartialVectors with float lanes.
+HWY_NOINLINE void TestAllFloorLog2() {
+  hn::ForPartialVectors<TestFloorLog2> for_each_vector;
+  for_each_vector(float());
+}
+
+// Calls function defined in skeleton-inl.h.
+struct TestSumMulAdd {
+  // Fills three arrays with small random integers, runs MulAddLoop on them and
+  // checks both the scalar reference sum and the sum of the values that
+  // MulAddLoop actually wrote.
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    hwy::RandomState rng;
+    const size_t count = 4096;
+    // MulAddLoop has no remainder handling, so count must be a whole number of
+    // vectors.
+    EXPECT_EQ(0, count % hn::Lanes(d));
+    auto mul = hwy::AllocateAligned<T>(count);
+    auto x = hwy::AllocateAligned<T>(count);
+    auto add = hwy::AllocateAligned<T>(count);
+    for (size_t i = 0; i < count; ++i) {
+      mul[i] = static_cast<T>(Random32(&rng) & 0xF);
+      x[i] = static_cast<T>(Random32(&rng) & 0xFF);
+      add[i] = static_cast<T>(Random32(&rng) & 0xFF);
+    }
+    // Scalar reference; must be computed before MulAddLoop overwrites x.
+    double expected_sum = 0.0;
+    for (size_t i = 0; i < count; ++i) {
+      expected_sum += mul[i] * x[i] + add[i];
+    }
+
+    MulAddLoop(d, mul.get(), add.get(), count, x.get());
+    // Detects changes to the random inputs generated above.
+    HWY_ASSERT_EQ(4344240.0, expected_sum);
+
+    // Verify the output of MulAddLoop: x now holds mul * x + add. All values
+    // are integers no larger than 15 * 255 + 255, hence exactly representable,
+    // so the two sums must be identical.
+    double actual_sum = 0.0;
+    for (size_t i = 0; i < count; ++i) {
+      actual_sum += x[i];
+    }
+    HWY_ASSERT_EQ(expected_sum, actual_sum);
+  }
+};
+
+// Runs TestSumMulAdd via ForPartialVectors for the types of ForFloatTypes.
+HWY_NOINLINE void TestAllSumMulAdd() {
+  hn::ForPartialVectors<TestSumMulAdd> for_each_vector;
+  hn::ForFloatTypes(for_each_vector);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace skeleton
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace skeleton {
+HWY_BEFORE_TEST(SkeletonTest);
+HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2);
+HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd);
+}  // namespace skeleton
+
+#endif
@@ -0,0 +1,261 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_FOREACH_TARGET_H_
+#define HIGHWAY_HWY_FOREACH_TARGET_H_
+
+// Re-includes the translation unit zero or more times to compile for any
+// targets except HWY_STATIC_TARGET. Defines unique HWY_TARGET each time so that
+// highway.h defines the corresponding macro/namespace.
+
+#include "hwy/detect_targets.h"
+
+// *-inl.h may include other headers, which requires include guards to prevent
+// repeated inclusion. The guards must be reset after compiling each target, so
+// the header is again visible. This is done by flipping HWY_TARGET_TOGGLE,
+// defining it if undefined and vice versa. This macro is initially undefined
+// so that IDEs don't gray out the contents of each header.
+#ifdef HWY_TARGET_TOGGLE
+#error "This macro must not be defined outside foreach_target.h"
+#endif
+
+#ifdef HWY_HIGHWAY_INCLUDED  // highway.h include guard
+// Trigger fixup at the bottom of this header.
+#define HWY_ALREADY_INCLUDED
+
+// The next highway.h must re-include set_macros-inl.h because the first
+// highway.h chose the static target instead of what we will set below.
+#undef HWY_SET_MACROS_PER_TARGET
+#endif
+
+// Disable HWY_EXPORT in user code until we have generated all targets. Note
+// that a subsequent highway.h will not override this definition.
+#undef HWY_ONCE
+#define HWY_ONCE (0 || HWY_IDE)
+
+// Avoid warnings on #include HWY_TARGET_INCLUDE by hiding them from the IDE;
+// also skip if only 1 target defined (no re-inclusion will be necessary).
+#if !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)
+
+#if !defined(HWY_TARGET_INCLUDE)
+#error ">1 target enabled => define HWY_TARGET_INCLUDE before foreach_target.h"
+#endif
+
+#if (HWY_TARGETS & HWY_EMU128) && (HWY_STATIC_TARGET != HWY_EMU128)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_EMU128
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_SCALAR) && (HWY_STATIC_TARGET != HWY_SCALAR)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_SCALAR
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_NEON) && (HWY_STATIC_TARGET != HWY_NEON)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_NEON
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_RVV) && (HWY_STATIC_TARGET != HWY_RVV)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_RVV
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_SVE) && (HWY_STATIC_TARGET != HWY_SVE)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_SVE
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_SVE2) && (HWY_STATIC_TARGET != HWY_SVE2)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_SVE2
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_SVE_256) && (HWY_STATIC_TARGET != HWY_SVE_256)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_SVE_256
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_SVE2_128) && (HWY_STATIC_TARGET != HWY_SVE2_128)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_SVE2_128
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_SSSE3) && (HWY_STATIC_TARGET != HWY_SSSE3)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_SSSE3
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_SSE4) && (HWY_STATIC_TARGET != HWY_SSE4)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_SSE4
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_AVX2) && (HWY_STATIC_TARGET != HWY_AVX2)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_AVX2
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_AVX3) && (HWY_STATIC_TARGET != HWY_AVX3)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_AVX3
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_AVX3_DL) && (HWY_STATIC_TARGET != HWY_AVX3_DL)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_AVX3_DL
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_WASM_EMU256) && (HWY_STATIC_TARGET != HWY_WASM_EMU256)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_WASM_EMU256
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_WASM) && (HWY_STATIC_TARGET != HWY_WASM)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_WASM
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_PPC8) && (HWY_STATIC_TARGET != HWY_PPC8)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_PPC8
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#endif  // !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)
+
+// Now that all but the static target have been generated, re-enable HWY_EXPORT.
+#undef HWY_ONCE
+#define HWY_ONCE 1
+
+// If we re-include once per enabled target, the translation unit's
+// implementation would have to be skipped via #if to avoid redefining symbols.
+// We instead skip the re-include for HWY_STATIC_TARGET, and generate its
+// implementation when resuming compilation of the translation unit.
+#undef HWY_TARGET
+#define HWY_TARGET HWY_STATIC_TARGET
+
+#ifdef HWY_ALREADY_INCLUDED
+// Revert the previous toggle to prevent redefinitions for the static target.
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+
+// Force re-inclusion of set_macros-inl.h now that HWY_TARGET is restored.
+#ifdef HWY_SET_MACROS_PER_TARGET
+#undef HWY_SET_MACROS_PER_TARGET
+#else
+#define HWY_SET_MACROS_PER_TARGET
+#endif
+#endif
+
+#endif  // HIGHWAY_HWY_FOREACH_TARGET_H_
@@ -0,0 +1,378 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This include guard is checked by foreach_target, so avoid the usual _H_
+// suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included
+// after/outside this include guard.
+#ifndef HWY_HIGHWAY_INCLUDED
+#define HWY_HIGHWAY_INCLUDED
+
+// Main header required before using vector types.
+
+#include "hwy/base.h"
+#include "hwy/targets.h"
+
+namespace hwy {
+
+// API version (https://semver.org/); keep in sync with CMakeLists.txt.
+#define HWY_MAJOR 1
+#define HWY_MINOR 0
+#define HWY_PATCH 2
+
+//------------------------------------------------------------------------------
+// Shorthand for tags (defined in shared-inl.h) used to select overloads.
+// Note that ScalableTag<T> is preferred over HWY_FULL, and CappedTag<T, N> over
+// HWY_CAPPED(T, N).
+
+// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
+// registers in the group, and is ignored on targets that do not support groups.
+#define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
+#define HWY_FULL2(T, LMUL) \
+  hwy::HWY_NAMESPACE::ScalableTag<T, hwy::CeilLog2(HWY_MAX(0, LMUL))>
+#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
+// Workaround for MSVC grouping __VA_ARGS__ into a single argument
+#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
+// Trailing comma avoids -pedantic false alarm
+#define HWY_CHOOSE_FULL(...) \
+  HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
+#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)
+
+// Vector of up to MAX_N lanes. It's better to use full vectors where possible.
+#define HWY_CAPPED(T, MAX_N) hwy::HWY_NAMESPACE::CappedTag<T, MAX_N>
+
+//------------------------------------------------------------------------------
+// Export user functions for static/dynamic dispatch
+
+// Evaluates to 0 inside a translation unit if it is generating anything but the
+// static target (the last one if multiple targets are enabled). Used to prevent
+// redefinitions of HWY_EXPORT. Unless foreach_target.h is included, we only
+// compile once anyway, so this is 1 unless it is or has been included.
+#ifndef HWY_ONCE
+#define HWY_ONCE 1
+#endif
+
+// HWY_STATIC_DISPATCH(FUNC_NAME) is the namespace-qualified FUNC_NAME for
+// HWY_STATIC_TARGET (the only defined namespace unless HWY_TARGET_INCLUDE is
+// defined), and can be used to deduce the return type of Choose*.
+#if HWY_STATIC_TARGET == HWY_SCALAR
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_EMU128
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_EMU128::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_RVV
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_WASM_EMU256
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_WASM
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_NEON
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SVE
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SVE2
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SVE_256
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE_256::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SVE2_128
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2_128::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_PPC8
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SSSE3
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSSE3::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SSE4
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_AVX2
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_AVX3
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_AVX3_DL
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME
+#endif
+
+// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
+// nullptr if that target was not compiled.
+#if HWY_TARGETS & HWY_EMU128
+#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_EMU128::FUNC_NAME
+#elif HWY_TARGETS & HWY_SCALAR
+#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_SCALAR::FUNC_NAME
+#else
+// When HWY_SCALAR/HWY_EMU128 are not present and other targets were disabled at
+// runtime, fall back to the baseline with HWY_STATIC_DISPATCH().
+#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
+#endif
+
+#if HWY_TARGETS & HWY_WASM_EMU256
+#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME
+#else
+#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_WASM
+#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
+#else
+#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_RVV
+#define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME
+#else
+#define HWY_CHOOSE_RVV(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_NEON
+#define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME
+#else
+#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_SVE
+#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME
+#else
+#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_SVE2
+#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME
+#else
+#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_SVE_256
+#define HWY_CHOOSE_SVE_256(FUNC_NAME) &N_SVE_256::FUNC_NAME
+#else
+#define HWY_CHOOSE_SVE_256(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_SVE2_128
+#define HWY_CHOOSE_SVE2_128(FUNC_NAME) &N_SVE2_128::FUNC_NAME
+#else
+#define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr
+#endif
+
+// PPC8 variant of HWY_CHOOSE_*: pointer to the N_PPC8 implementation if that
+// target is compiled, otherwise nullptr. Both branches must define the same
+// macro name (the enabled branch previously defined a misspelled
+// HWY_CHOOSE_PCC8, leaving HWY_CHOOSE_PPC8 undefined).
+#if HWY_TARGETS & HWY_PPC8
+#define HWY_CHOOSE_PPC8(FUNC_NAME) &N_PPC8::FUNC_NAME
+#else
+#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_SSSE3
+#define HWY_CHOOSE_SSSE3(FUNC_NAME) &N_SSSE3::FUNC_NAME
+#else
+#define HWY_CHOOSE_SSSE3(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_SSE4
+#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
+#else
+#define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_AVX2
+#define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME
+#else
+#define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_AVX3
+#define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME
+#else
+#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_AVX3_DL
+#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) &N_AVX3_DL::FUNC_NAME
+#else
+#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
+#endif
+
+// MSVC 2017 workaround: the non-type template parameter to ChooseAndCall
+// apparently cannot be an array. Use a function pointer instead, which has the
+// disadvantage that we call the static (not best) target on the first call to
+// any HWY_DYNAMIC_DISPATCH.
+#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1915
+#define HWY_DISPATCH_WORKAROUND 1
+#else
+#define HWY_DISPATCH_WORKAROUND 0
+#endif
+
+// Supplies the static member function that runs on the first
+// HWY_DYNAMIC_DISPATCH, while GetIndex is still zero. Instantiations of that
+// function are the first entry in the tables created by HWY_EXPORT.
+template <typename RetType, typename... Args>
+struct FunctionCache {
+ public:
+  using FunctionType = RetType(Args...);
+
+#if HWY_DISPATCH_WORKAROUND
+  // Workaround variant: updates the chosen target, then calls `func`, the
+  // function passed as the template argument.
+  template <FunctionType* const func>
+  static RetType ChooseAndCall(Args... args) {
+    ChosenTarget& chosen = GetChosenTarget();
+    chosen.Update(SupportedTargets());
+    return (*func)(args...);
+  }
+#else
+  // When instantiated, this template has the same signature as the function
+  // being called. It initializes the bit array of targets supported by the
+  // current CPU and then calls the appropriate entry of the HWY_EXPORT table.
+  // Later calls via HWY_DYNAMIC_DISPATCH to any exported function, even those
+  // defined by different translation units, dispatch directly to the best
+  // available target.
+  template <FunctionType* const table[]>
+  static RetType ChooseAndCall(Args... args) {
+    ChosenTarget& chosen = GetChosenTarget();
+    chosen.Update(SupportedTargets());
+    FunctionType* const best = table[chosen.GetIndex()];
+    return best(args...);
+  }
+#endif  // HWY_DISPATCH_WORKAROUND
+};
+
+// Deduces the template parameters RetType and Args from a function pointer;
+// the pointer value itself is unused.
+template <typename RetType, typename... Args>
+FunctionCache<RetType, Args...> DeduceFunctionCache(RetType (*)(Args...)) {
+  return FunctionCache<RetType, Args...>{};
+}
+
+#define HWY_DISPATCH_TABLE(FUNC_NAME) \
+  HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)
+
+// HWY_EXPORT(FUNC_NAME); expands to a static array that is used by
+// HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime. This
+// static array must be defined at the same namespace level as the function
+// it is exporting.
+// After being exported, it can be called from other parts of the same source
+// file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
+// like in the following example:
+//
+//   #include "hwy/highway.h"
+//   HWY_BEFORE_NAMESPACE();
+//   namespace skeleton {
+//   namespace HWY_NAMESPACE {
+//
+//   void MyFunction(int a, char b, const char* c) { ... }
+//
+//   // NOLINTNEXTLINE(google-readability-namespace-comments)
+//   }  // namespace HWY_NAMESPACE
+//   }  // namespace skeleton
+//   HWY_AFTER_NAMESPACE();
+//
+//   namespace skeleton {
+//   HWY_EXPORT(MyFunction);  // Defines the dispatch table in this scope.
+//
+//   void MyFunction(int a, char b, const char* c) {
+//     return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c);
+//   }
+//   }  // namespace skeleton
+//
+
+#if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
+
+// Simplified version for IDE or the dynamic dispatch case with only one target.
+// This case still uses a table, although of a single element, to provide the
+// same compile error conditions as with the dynamic dispatch case when multiple
+// targets are being compiled.
+#define HWY_EXPORT(FUNC_NAME)                                             \
+  HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
+  HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}
+#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)
+
+#else
+
+// Simplified version for MSVC 2017: function pointer instead of table.
+#if HWY_DISPATCH_WORKAROUND
+
+#define HWY_EXPORT(FUNC_NAME)                                                \
+  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
+      FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = {                            \
+      /* The first entry in the table initializes the global cache and       \
+       * calls the function from HWY_STATIC_TARGET. */                       \
+      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(               \
+          FUNC_NAME)))::ChooseAndCall<&HWY_STATIC_DISPATCH(FUNC_NAME)>,      \
+      HWY_CHOOSE_TARGET_LIST(FUNC_NAME),                                     \
+      HWY_CHOOSE_FALLBACK(FUNC_NAME),                                        \
+  }
+
+#else
+
+// Dynamic dispatch case with one entry per dynamic target plus the fallback
+// target and the initialization wrapper.
+#define HWY_EXPORT(FUNC_NAME)                                                \
+  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
+      FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = {                            \
+      /* The first entry in the table initializes the global cache and       \
+       * calls the appropriate function. */                                  \
+      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(               \
+          FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>,        \
+      HWY_CHOOSE_TARGET_LIST(FUNC_NAME),                                     \
+      HWY_CHOOSE_FALLBACK(FUNC_NAME),                                        \
+  }
+
+#endif  // HWY_DISPATCH_WORKAROUND
+
+#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
+  (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]))
+
+#endif  // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
+
+// DEPRECATED names; please use HWY_HAVE_* instead.
+#define HWY_CAP_INTEGER64 HWY_HAVE_INTEGER64
+#define HWY_CAP_FLOAT16 HWY_HAVE_FLOAT16
+#define HWY_CAP_FLOAT64 HWY_HAVE_FLOAT64
+
+}  // namespace hwy
+
+#endif  // HWY_HIGHWAY_INCLUDED
+
+//------------------------------------------------------------------------------
+
+// NOTE: the following definitions and ops/*.h depend on HWY_TARGET, so we want
+// to include them once per target, which is ensured by the toggle check.
+// Because ops/*.h are included under it, they do not need their own guard.
+#if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
+#ifdef HWY_HIGHWAY_PER_TARGET
+#undef HWY_HIGHWAY_PER_TARGET
+#else
+#define HWY_HIGHWAY_PER_TARGET
+#endif
+
+// These define ops inside namespace hwy::HWY_NAMESPACE.
+#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
+#include "hwy/ops/x86_128-inl.h"
+#elif HWY_TARGET == HWY_AVX2
+#include "hwy/ops/x86_256-inl.h"
+#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL
+#include "hwy/ops/x86_512-inl.h"
+#elif HWY_TARGET == HWY_PPC8
+#error "PPC is not yet supported"
+#elif HWY_TARGET == HWY_NEON
+#include "hwy/ops/arm_neon-inl.h"
+#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || \
+    HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
+#include "hwy/ops/arm_sve-inl.h"
+#elif HWY_TARGET == HWY_WASM_EMU256
+#include "hwy/ops/wasm_256-inl.h"
+#elif HWY_TARGET == HWY_WASM
+#include "hwy/ops/wasm_128-inl.h"
+#elif HWY_TARGET == HWY_RVV
+#include "hwy/ops/rvv-inl.h"
+#elif HWY_TARGET == HWY_EMU128
+#include "hwy/ops/emu128-inl.h"
+#elif HWY_TARGET == HWY_SCALAR
+#include "hwy/ops/scalar-inl.h"
+#else
+#pragma message("HWY_TARGET does not match any known target")
+#endif  // HWY_TARGET
+
+#include "hwy/ops/generic_ops-inl.h"
+
+#endif  // HWY_HIGHWAY_PER_TARGET
@@ -0,0 +1,74 @@
+// Pseudo-generated file to handle both cmake & bazel build system.
+
+// Initial generation done using cmake code:
+// include(GenerateExportHeader)
+// generate_export_header(hwy EXPORT_MACRO_NAME HWY_DLLEXPORT EXPORT_FILE_NAME
+// hwy/highway_export.h)
+// code reformatted using clang-format --style=Google
+
+#ifndef HWY_DLLEXPORT_H
+#define HWY_DLLEXPORT_H
+
+// Defines three symbol-visibility macros, one per library:
+//   HWY_DLLEXPORT, HWY_CONTRIB_DLLEXPORT and HWY_TEST_DLLEXPORT.
+// Without HWY_SHARED_DEFINE all three expand to nothing.
+#if !defined(HWY_SHARED_DEFINE)
+#define HWY_DLLEXPORT
+#define HWY_CONTRIB_DLLEXPORT
+#define HWY_TEST_DLLEXPORT
+#else  // !HWY_SHARED_DEFINE
+
+// Each section below follows the same scheme and is skipped if the macro was
+// already defined by the includer. On _WIN32, the macro is dllexport when the
+// matching *_EXPORTS symbol is defined and dllimport otherwise; on all other
+// platforms both cases use default visibility.
+
+#ifndef HWY_DLLEXPORT
+#if defined(hwy_EXPORTS)
+/* We are building this library */
+#ifdef _WIN32
+#define HWY_DLLEXPORT __declspec(dllexport)
+#else
+#define HWY_DLLEXPORT __attribute__((visibility("default")))
+#endif
+#else  // defined(hwy_EXPORTS)
+/* We are using this library */
+#ifdef _WIN32
+#define HWY_DLLEXPORT __declspec(dllimport)
+#else
+#define HWY_DLLEXPORT __attribute__((visibility("default")))
+#endif
+#endif  // defined(hwy_EXPORTS)
+#endif  // HWY_DLLEXPORT
+
+// Same scheme, keyed on hwy_contrib_EXPORTS.
+#ifndef HWY_CONTRIB_DLLEXPORT
+#if defined(hwy_contrib_EXPORTS)
+/* We are building this library */
+#ifdef _WIN32
+#define HWY_CONTRIB_DLLEXPORT __declspec(dllexport)
+#else
+#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
+#endif
+#else  // defined(hwy_contrib_EXPORTS)
+/* We are using this library */
+#ifdef _WIN32
+#define HWY_CONTRIB_DLLEXPORT __declspec(dllimport)
+#else
+#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
+#endif
+#endif  // defined(hwy_contrib_EXPORTS)
+#endif  // HWY_CONTRIB_DLLEXPORT
+
+// Same scheme, keyed on hwy_test_EXPORTS.
+#ifndef HWY_TEST_DLLEXPORT
+#if defined(hwy_test_EXPORTS)
+/* We are building this library */
+#ifdef _WIN32
+#define HWY_TEST_DLLEXPORT __declspec(dllexport)
+#else
+#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
+#endif
+#else  // defined(hwy_test_EXPORTS)
+/* We are using this library */
+#ifdef _WIN32
+#define HWY_TEST_DLLEXPORT __declspec(dllimport)
+#else
+#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
+#endif
+#endif  // defined(hwy_test_EXPORTS)
+#endif  // HWY_TEST_DLLEXPORT
+
+#endif  // !HWY_SHARED_DEFINE
+
+#endif /* HWY_DLLEXPORT_H */
@@ -0,0 +1,485 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <bitset>
+
+#include "hwy/base.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "highway_test.cc"
+#include "hwy/foreach_target.h"    // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/nanobenchmark.h"  // Unpredictable1
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Checks that a CappedTag<T, kLimit> descriptor supports basic ops and, when
+// Lanes(d) exceeds kLimit, that Store leaves lanes [kLimit, Lanes(d)) untouched.
+template <size_t kLimit, typename T>
+HWY_NOINLINE void TestCappedLimit(T /* tag */) {
+  CappedTag<T, kLimit> d;
+  // Two different ops must compile and agree for this descriptor.
+  HWY_ASSERT_VEC_EQ(d, Zero(d), Set(d, T{0}));
+
+  const size_t num_lanes = Lanes(d);
+  // The overwrite check only applies if the limit is below the lane count.
+  if (kLimit >= num_lanes) return;
+
+  auto buffer = AllocateAligned<T>(num_lanes);
+  T* const begin = buffer.get();
+  std::fill(begin, begin + num_lanes, T{0});
+  Store(Set(d, T{1}), d, begin);
+  // Lanes from kLimit onward must still hold the zero written above.
+  for (size_t lane = kLimit; lane < num_lanes; ++lane) {
+    HWY_ASSERT_EQ(begin[lane], T{0});
+  }
+}
+
+// Functor handed to ForAllTypes. It builds its own capped tags via
+// TestCappedLimit, so ForPartialVectors etc. are not involved. Each lane type
+// is exercised with limits 1, 3, 5 and 2^15.
+struct TestCapped {
+  template <typename T>
+  void operator()(T tag) const {
+    TestCappedLimit<1>(tag);
+    TestCappedLimit<3>(tag);
+    TestCappedLimit<5>(tag);
+    TestCappedLimit<1ull << 15>(tag);
+  }
+};
+
+HWY_NOINLINE void TestAllCapped() {
+  ForAllTypes(TestCapped());
+}
+
+// For testing that ForPartialVectors reaches every possible size:
+using NumLanesSet = std::bitset<HWY_MAX_BYTES + 1>;
+
+// Monostate pattern because ForPartialVectors takes a template argument, not a
+// functor by reference.
+// Returns the set of lane counts recorded for lane types of `sizeof_t` bytes;
+// the sets live in a function-local static array indexed by that size.
+static NumLanesSet* NumLanesForSize(size_t sizeof_t) {
+  HWY_ASSERT(sizeof_t <= sizeof(uint64_t));
+  static NumLanesSet sets_by_size[sizeof(uint64_t) + 1];
+  return &sets_by_size[sizeof_t];
+}
+// Returns the slot holding the largest lane count recorded for lane types of
+// `sizeof_t` bytes; slots start at zero and persist across calls.
+static size_t* MaxLanesForSize(size_t sizeof_t) {
+  HWY_ASSERT(sizeof_t <= sizeof(uint64_t));
+  static size_t max_by_size[sizeof(uint64_t) + 1] = {0};
+  return &max_by_size[sizeof_t];
+}
+
+// Verifies Lanes(d) <= MaxLanes(d) <= HWY_MAX_BYTES / sizeof(T) and records the
+// observed lane count so TestAllMaxLanes can check coverage afterwards.
+struct TestMaxLanes {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    const size_t kMax = MaxLanes(d);  // for RVV, includes LMUL
+    HWY_ASSERT(N <= kMax);
+    HWY_ASSERT(kMax <= (HWY_MAX_BYTES / sizeof(T)));
+
+    // Remember that this lane count was visited for this lane size.
+    NumLanesSet* const visited = NumLanesForSize(sizeof(T));
+    visited->set(N);
+
+    // Raise the running maximum if this descriptor has more lanes.
+    size_t* const max_lanes = MaxLanesForSize(sizeof(T));
+    if (N > *max_lanes) {
+      *max_lanes = N;
+    }
+  }
+};
+
+// Runs TestMaxLanes for all types, then checks that every power-of-two lane
+// count up to the recorded maximum was visited for each lane size.
+HWY_NOINLINE void TestAllMaxLanes() {
+  ForAllTypes(ForPartialVectors<TestMaxLanes>());
+
+  // Ensure ForPartialVectors visited all powers of two [1, N].
+  const size_t kLaneSizes[] = {sizeof(uint8_t), sizeof(uint16_t),
+                               sizeof(uint32_t), sizeof(uint64_t)};
+  for (const size_t sizeof_t : kLaneSizes) {
+    const size_t max_lanes = *MaxLanesForSize(sizeof_t);
+    const NumLanesSet& visited = *NumLanesForSize(sizeof_t);
+    for (size_t pow2 = 1; pow2 <= max_lanes; pow2 *= 2) {
+      if (visited.test(pow2)) continue;
+
+      fprintf(stderr, "T=%d: did not visit for N=%d, max=%d\n",
+              static_cast<int>(sizeof_t), static_cast<int>(pow2),
+              static_cast<int>(max_lanes));
+      HWY_ASSERT(false);
+    }
+  }
+}
+
+// Checks Zero, Set and Iota against a scalar reference array, and that an
+// Undefined vector can at least be stored.
+struct TestSet {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t num_lanes = Lanes(d);
+    auto expected = AllocateAligned<T>(num_lanes);
+    T* const lanes = expected.get();
+
+    // Zero: every lane is 0.
+    std::fill(lanes, lanes + num_lanes, T(0));
+    HWY_ASSERT_VEC_EQ(d, lanes, Zero(d));
+
+    // Set: every lane holds the broadcast value.
+    for (size_t lane = 0; lane < num_lanes; ++lane) {
+      lanes[lane] = 2;
+    }
+    HWY_ASSERT_VEC_EQ(d, lanes, Set(d, T(2)));
+
+    // Iota: lane i holds 5 + i.
+    for (size_t lane = 0; lane < num_lanes; ++lane) {
+      lanes[lane] = T(5 + lane);
+    }
+    HWY_ASSERT_VEC_EQ(d, lanes, Iota(d, T(5)));
+
+    // Undefined: contents are unspecified, so only verify it can be stored.
+    Store(Undefined(d), d, lanes);
+  }
+};
+
+HWY_NOINLINE void TestAllSet() {
+  ForAllTypes(ForPartialVectors<TestSet>());
+}
+
+// Ensures integer Add/Sub wrap around (mod 2^bits) at the type's limits.
+struct TestOverflow {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto one = Set(d, T(1));
+    const auto highest = Set(d, LimitsMax<T>());
+    const auto lowest = Set(d, LimitsMin<T>());
+    // Unsigned underflow / negative -> positive
+    HWY_ASSERT_VEC_EQ(d, highest, Sub(lowest, one));
+    // Unsigned overflow / positive -> negative
+    HWY_ASSERT_VEC_EQ(d, lowest, Add(highest, one));
+  }
+};
+
+HWY_NOINLINE void TestAllOverflow() {
+  ForIntegerTypes(ForPartialVectors<TestOverflow>());
+}
+
+// Checks Clamp at both ends of the range [lo, hi]: a value above the range
+// yields the upper bound, a value below it yields the lower bound.
+struct TestClamp {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto zero = Zero(d);
+    const auto one = Set(d, 1);
+    const auto two = Set(d, 2);
+
+    // 2 clamped to [0, 1] is 1.
+    HWY_ASSERT_VEC_EQ(d, one, Clamp(two, zero, one));
+    // 0 clamped to [1, 2] is 1.
+    HWY_ASSERT_VEC_EQ(d, one, Clamp(zero, one, two));
+  }
+};
+
+HWY_NOINLINE void TestAllClamp() {
+  ForAllTypes(ForPartialVectors<TestClamp>());
+}
+
+// Checks SignBit(d) for integer lanes using only Add/Sub and logical ops,
+// which are available for all types.
+struct TestSignBitInteger {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto zero = Zero(d);
+    const auto sign = SignBit(d);
+    const auto all_ones = VecFromMask(d, Eq(zero, zero));
+    const auto below_sign = Sub(sign, Set(d, 1));
+
+    // Doubling is a left shift by one => overflow, result equals zero.
+    HWY_ASSERT_VEC_EQ(d, zero, Add(sign, sign));
+    // sign + (sign - 1) has every bit set only if the lower bits of the sign
+    // vector are all zero.
+    HWY_ASSERT_VEC_EQ(d, all_ones, Add(sign, below_sign));
+  }
+};
+
+// Checks SignBit(d) for floating-point lanes: OR-ing it in negates a positive
+// value and masking it out makes a negative value positive.
+struct TestSignBitFloat {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto sign = SignBit(d);
+    const auto positive = Set(d, 2.25);
+    const auto negative = Set(d, -2.25);
+    HWY_ASSERT_VEC_EQ(d, Or(positive, sign), negative);
+    HWY_ASSERT_VEC_EQ(d, AndNot(sign, negative), positive);
+    // NOTE(review): compares zero with the sign-bit-only vector; presumably
+    // passes because the float comparison treats -0.0 and +0.0 as equal.
+    HWY_ASSERT_VEC_EQ(d, Zero(d), sign);
+  }
+};
+
+HWY_NOINLINE void TestAllSignBit() {
+  ForIntegerTypes(ForPartialVectors<TestSignBitInteger>());
+  ForFloatTypes(ForPartialVectors<TestSignBitFloat>());
+}
+
+// Aborts with a diagnostic unless every lane of `v` is NaN.
+// inline to work around incorrect SVE codegen (only first 128 bits used).
+template <class D, class V>
+HWY_INLINE void AssertNaN(D d, VecArg<V> v, const char* file, int line) {
+  using T = TFromD<D>;
+  const size_t N = Lanes(d);
+  if (AllTrue(d, IsNaN(v))) return;
+
+  // Failure path: show the vector and the per-lane NaN mask, then abort.
+  Print(d, "not all NaN", v, 0, N);
+  Print(d, "mask", VecFromMask(d, IsNaN(v)), 0, N);
+  const std::string type_name = TypeName(T(), N);
+
+  // RVV lacks PRIu64 and MSYS still has problems with %zu, so print bytes to
+  // avoid truncating doubles. The buffer always has at least 8 bytes so that
+  // all eight %02x arguments below are valid for 4-byte lanes too.
+  uint8_t bytes[HWY_MAX(sizeof(T), 8)] = {0};
+  const T lane = GetLane(v);
+  CopyBytes<sizeof(T)>(&lane, bytes);
+  Abort(file, line,
+        "Expected %s NaN, got %E "
+        "(bytes %02x %02x %02x %02x %02x %02x %02x %02x)",
+        type_name.c_str(), lane, bytes[0], bytes[1], bytes[2], bytes[3],
+        bytes[4], bytes[5], bytes[6], bytes[7]);
+}
+
+#define HWY_ASSERT_NAN(d, v) AssertNaN(d, v, __FILE__, __LINE__)
+
+// Verifies how NaN inputs propagate through arithmetic, FMA, sign, rounding,
+// logical, comparison, reduction and Min/Max ops for floating-point lanes.
+struct TestNaN {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // v1 is 1 but obtained via Unpredictable1(), presumably so the compiler
+    // cannot constant-fold the operations below.
+    const auto v1 = Set(d, T(Unpredictable1()));
+    // All lanes compare equal to 1, so this selects NaN(d) in every lane.
+    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
+    HWY_ASSERT_NAN(d, nan);
+
+    // Arithmetic
+    HWY_ASSERT_NAN(d, Add(nan, v1));
+    HWY_ASSERT_NAN(d, Add(v1, nan));
+    HWY_ASSERT_NAN(d, Sub(nan, v1));
+    HWY_ASSERT_NAN(d, Sub(v1, nan));
+    HWY_ASSERT_NAN(d, Mul(nan, v1));
+    HWY_ASSERT_NAN(d, Mul(v1, nan));
+    HWY_ASSERT_NAN(d, Div(nan, v1));
+    HWY_ASSERT_NAN(d, Div(v1, nan));
+
+    // FMA: NaN in any of the three operand positions
+    HWY_ASSERT_NAN(d, MulAdd(nan, v1, v1));
+    HWY_ASSERT_NAN(d, MulAdd(v1, nan, v1));
+    HWY_ASSERT_NAN(d, MulAdd(v1, v1, nan));
+    HWY_ASSERT_NAN(d, MulSub(nan, v1, v1));
+    HWY_ASSERT_NAN(d, MulSub(v1, nan, v1));
+    HWY_ASSERT_NAN(d, MulSub(v1, v1, nan));
+    HWY_ASSERT_NAN(d, NegMulAdd(nan, v1, v1));
+    HWY_ASSERT_NAN(d, NegMulAdd(v1, nan, v1));
+    HWY_ASSERT_NAN(d, NegMulAdd(v1, v1, nan));
+    HWY_ASSERT_NAN(d, NegMulSub(nan, v1, v1));
+    HWY_ASSERT_NAN(d, NegMulSub(v1, nan, v1));
+    HWY_ASSERT_NAN(d, NegMulSub(v1, v1, nan));
+
+    // Sqrt (the reciprocal approximations are checked in TestF32NaN)
+    HWY_ASSERT_NAN(d, Sqrt(nan));
+
+    // Sign manipulation
+    HWY_ASSERT_NAN(d, Abs(nan));
+    HWY_ASSERT_NAN(d, Neg(nan));
+    HWY_ASSERT_NAN(d, CopySign(nan, v1));
+    HWY_ASSERT_NAN(d, CopySignToAbs(nan, v1));
+
+    // Rounding
+    HWY_ASSERT_NAN(d, Ceil(nan));
+    HWY_ASSERT_NAN(d, Floor(nan));
+    HWY_ASSERT_NAN(d, Round(nan));
+    HWY_ASSERT_NAN(d, Trunc(nan));
+
+    // Logical (And/AndNot/Xor will clear NaN!)
+    HWY_ASSERT_NAN(d, Or(nan, v1));
+
+    // Comparison: every comparison involving NaN must be false in all lanes
+    HWY_ASSERT(AllFalse(d, Eq(nan, v1)));
+    HWY_ASSERT(AllFalse(d, Gt(nan, v1)));
+    HWY_ASSERT(AllFalse(d, Lt(nan, v1)));
+    HWY_ASSERT(AllFalse(d, Ge(nan, v1)));
+    HWY_ASSERT(AllFalse(d, Le(nan, v1)));
+
+    // Reduction
+    HWY_ASSERT_NAN(d, SumOfLanes(d, nan));
+// TODO(janwas): re-enable after QEMU/Spike are fixed
+#if HWY_TARGET != HWY_RVV
+    HWY_ASSERT_NAN(d, MinOfLanes(d, nan));
+    HWY_ASSERT_NAN(d, MaxOfLanes(d, nan));
+#endif
+
+    // Min/Max: the expected result with exactly one NaN input is
+    // platform-dependent, hence the conditional sections below.
+#if HWY_ARCH_X86 && (HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_EMU128)
+    // x86 SIMD returns the second operand if any input is NaN.
+    HWY_ASSERT_VEC_EQ(d, v1, Min(nan, v1));
+    HWY_ASSERT_VEC_EQ(d, v1, Max(nan, v1));
+    HWY_ASSERT_NAN(d, Min(v1, nan));
+    HWY_ASSERT_NAN(d, Max(v1, nan));
+#elif HWY_ARCH_WASM
+    // Should return NaN if any input is NaN, but does not for scalar.
+    // TODO(janwas): remove once this is fixed.
+#elif HWY_TARGET == HWY_NEON && HWY_ARCH_ARM_V7
+    // ARMv7 NEON returns NaN if any input is NaN.
+    HWY_ASSERT_NAN(d, Min(v1, nan));
+    HWY_ASSERT_NAN(d, Max(v1, nan));
+    HWY_ASSERT_NAN(d, Min(nan, v1));
+    HWY_ASSERT_NAN(d, Max(nan, v1));
+#else
+    // IEEE 754-2019 minimumNumber is defined as the other argument if exactly
+    // one is NaN, and qNaN if both are.
+    HWY_ASSERT_VEC_EQ(d, v1, Min(nan, v1));
+    HWY_ASSERT_VEC_EQ(d, v1, Max(nan, v1));
+    HWY_ASSERT_VEC_EQ(d, v1, Min(v1, nan));
+    HWY_ASSERT_VEC_EQ(d, v1, Max(v1, nan));
+#endif
+    // With both inputs NaN, the result is NaN on every platform.
+    HWY_ASSERT_NAN(d, Min(nan, nan));
+    HWY_ASSERT_NAN(d, Max(nan, nan));
+  }
+};
+
+// NaN propagation for functions only available for float32.
+struct TestF32NaN {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto one = Set(d, T(Unpredictable1()));
+    // Every lane equals 1, so every lane selects NaN.
+    const auto all_nan = IfThenElse(Eq(one, Set(d, T(1))), NaN(d), one);
+
+    HWY_ASSERT_NAN(d, ApproximateReciprocal(all_nan));
+    HWY_ASSERT_NAN(d, ApproximateReciprocalSqrt(all_nan));
+    HWY_ASSERT_NAN(d, AbsDiff(all_nan, one));
+    HWY_ASSERT_NAN(d, AbsDiff(one, all_nan));
+  }
+};
+
+// Runs the generic NaN test for all float types, then the float32-only test.
+HWY_NOINLINE void TestAllNaN() {
+  ForFloatTypes(ForPartialVectors<TestNaN>());
+  ForPartialVectors<TestF32NaN>()(float());
+}
+
+// IsNaN must be true only for NaN (of either sign), and false for infinities,
+// zero, one and the extreme finite values.
+struct TestIsNaN {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto one = Set(d, T(Unpredictable1()));
+    const auto pos_inf = IfThenElse(Eq(one, Set(d, T(1))), Inf(d), one);
+    const auto not_a_number = IfThenElse(Eq(one, Set(d, T(1))), NaN(d), one);
+    const auto minus_one = Set(d, T{-1});
+    const auto none = MaskFalse(d);
+    const auto all = MaskTrue(d);
+
+    HWY_ASSERT_NAN(d, not_a_number);
+    HWY_ASSERT_MASK_EQ(d, none, IsNaN(pos_inf));
+    HWY_ASSERT_MASK_EQ(d, none, IsNaN(CopySign(pos_inf, minus_one)));
+    HWY_ASSERT_MASK_EQ(d, all, IsNaN(not_a_number));
+    HWY_ASSERT_MASK_EQ(d, all, IsNaN(CopySign(not_a_number, minus_one)));
+    HWY_ASSERT_MASK_EQ(d, none, IsNaN(one));
+    HWY_ASSERT_MASK_EQ(d, none, IsNaN(Zero(d)));
+    HWY_ASSERT_MASK_EQ(d, none, IsNaN(Set(d, hwy::LowestValue<T>())));
+    HWY_ASSERT_MASK_EQ(d, none, IsNaN(Set(d, hwy::HighestValue<T>())));
+  }
+};
+
+HWY_NOINLINE void TestAllIsNaN() {
+  ForFloatTypes(ForPartialVectors<TestIsNaN>());
+}
+
+// IsInf must be true only for infinities (of either sign), and false for NaN,
+// zero, one and the extreme finite values.
+struct TestIsInf {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto one = Set(d, T(Unpredictable1()));
+    const auto pos_inf = IfThenElse(Eq(one, Set(d, T(1))), Inf(d), one);
+    const auto not_a_number = IfThenElse(Eq(one, Set(d, T(1))), NaN(d), one);
+    const auto minus_one = Set(d, T{-1});
+    const auto none = MaskFalse(d);
+    const auto all = MaskTrue(d);
+
+    HWY_ASSERT_MASK_EQ(d, all, IsInf(pos_inf));
+    HWY_ASSERT_MASK_EQ(d, all, IsInf(CopySign(pos_inf, minus_one)));
+    HWY_ASSERT_MASK_EQ(d, none, IsInf(not_a_number));
+    HWY_ASSERT_MASK_EQ(d, none, IsInf(CopySign(not_a_number, minus_one)));
+    HWY_ASSERT_MASK_EQ(d, none, IsInf(one));
+    HWY_ASSERT_MASK_EQ(d, none, IsInf(Zero(d)));
+    HWY_ASSERT_MASK_EQ(d, none, IsInf(Set(d, hwy::LowestValue<T>())));
+    HWY_ASSERT_MASK_EQ(d, none, IsInf(Set(d, hwy::HighestValue<T>())));
+  }
+};
+
+HWY_NOINLINE void TestAllIsInf() {
+  ForFloatTypes(ForPartialVectors<TestIsInf>());
+}
+
+// IsFinite must be false for infinities and NaN (of either sign), and true for
+// zero, one and the extreme finite values.
+struct TestIsFinite {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto one = Set(d, T(Unpredictable1()));
+    const auto pos_inf = IfThenElse(Eq(one, Set(d, T(1))), Inf(d), one);
+    const auto not_a_number = IfThenElse(Eq(one, Set(d, T(1))), NaN(d), one);
+    const auto minus_one = Set(d, T{-1});
+    const auto none = MaskFalse(d);
+    const auto all = MaskTrue(d);
+
+    HWY_ASSERT_MASK_EQ(d, none, IsFinite(pos_inf));
+    HWY_ASSERT_MASK_EQ(d, none, IsFinite(CopySign(pos_inf, minus_one)));
+    HWY_ASSERT_MASK_EQ(d, none, IsFinite(not_a_number));
+    HWY_ASSERT_MASK_EQ(d, none, IsFinite(CopySign(not_a_number, minus_one)));
+    HWY_ASSERT_MASK_EQ(d, all, IsFinite(one));
+    HWY_ASSERT_MASK_EQ(d, all, IsFinite(Zero(d)));
+    HWY_ASSERT_MASK_EQ(d, all, IsFinite(Set(d, hwy::LowestValue<T>())));
+    HWY_ASSERT_MASK_EQ(d, all, IsFinite(Set(d, hwy::HighestValue<T>())));
+  }
+};
+
+HWY_NOINLINE void TestAllIsFinite() {
+  ForFloatTypes(ForPartialVectors<TestIsFinite>());
+}
+
+// Vectors must be copy-constructible and assignable, preserving their lanes.
+struct TestCopyAndAssign {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto original = Iota(d, 3);
+
+    // Copy construction.
+    auto copied(original);
+    HWY_ASSERT_VEC_EQ(d, original, copied);
+
+    // Assignment over an existing (undefined) vector.
+    auto assigned = Undefined(d);
+    assigned = original;
+    HWY_ASSERT_VEC_EQ(d, original, assigned);
+  }
+};
+
+HWY_NOINLINE void TestAllCopyAndAssign() {
+  ForAllTypes(ForPartialVectors<TestCopyAndAssign>());
+}
+
+// GetLane must return the value that Zero/Set placed in the vector.
+struct TestGetLane {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto zeros = Zero(d);
+    HWY_ASSERT_EQ(T(0), GetLane(zeros));
+
+    const auto ones = Set(d, 1);
+    HWY_ASSERT_EQ(T(1), GetLane(ones));
+  }
+};
+
+HWY_NOINLINE void TestAllGetLane() {
+  ForAllTypes(ForPartialVectors<TestGetLane>());
+}
+
+// A descriptor recovered from a vector type via DFromV may differ from the
+// original D, yet vectors created from either must interoperate.
+struct TestDFromV {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto zero = Zero(d);
+    using RecoveredD = DFromV<decltype(zero)>;  // not necessarily same as D
+    const auto one = Set(RecoveredD(), 1);
+    // Mixing both kinds of vectors in one op must compile; 0 & 1 == 0.
+    const auto masked = And(zero, one);
+    HWY_ASSERT_VEC_EQ(d, zero, masked);
+  }
+};
+
+HWY_NOINLINE void TestAllDFromV() {
+  ForAllTypes(ForPartialVectors<TestDFromV>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+// Test registration. Guarded by HWY_ONCE, presumably so that it is emitted a
+// single time even though this file is re-included per target via
+// foreach_target.h (confirm against hwy/foreach_target.h).
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HighwayTest);
+// One entry per TestAll* driver defined above.
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllCapped);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllMaxLanes);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSet);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllOverflow);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllClamp);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSignBit);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllNaN);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllIsNaN);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllIsInf);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllIsFinite);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllCopyAndAssign);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllGetLane);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllDFromV);
+}  // namespace hwy
+
+#endif
@@ -0,0 +1,19 @@
+HWY_0 {
+  global:
+    # Export every C++ symbol whose demangled name matches *hwy::*.
+    extern "C++" {
+      *hwy::*;
+    };
+
+  local:
+    # Hide all the std namespace symbols. std namespace is explicitly marked
+    # as visibility(default) and header-only functions or methods (such as those
+    # from templates) should be exposed in shared libraries as weak symbols but
+    # this is only needed when we expose those types in the shared library API
+    # in any way. We don't use C++ std types in the API and we also don't
+    # support exceptions in the library.
+    # See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36022 for a discussion
+    # about this.
+    extern "C++" {
+      *std::*;
+    };
+};
@@ -0,0 +1,762 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/nanobenchmark.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS  // before inttypes.h
+#endif
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>    // clock_gettime
+
+#include <algorithm>  // sort
+#include <array>
+#include <atomic>
+#include <chrono>  //NOLINT
+#include <limits>
+#include <numeric>  // iota
+#include <random>
+#include <string>
+#include <vector>
+
+#if defined(_WIN32) || defined(_WIN64)
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif  // NOMINMAX
+#include <windows.h>
+#endif
+
+#if defined(__APPLE__)
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+#endif
+
+#if defined(__HAIKU__)
+#include <OS.h>
+#endif
+
+#include "hwy/base.h"
+#if HWY_ARCH_PPC && defined(__GLIBC__)
+#include <sys/platform/ppc.h>  // NOLINT __ppc_get_timebase_freq
+#elif HWY_ARCH_X86
+
+#if HWY_COMPILER_MSVC
+#include <intrin.h>
+#else
+#include <cpuid.h>  // NOLINT
+#endif              // HWY_COMPILER_MSVC
+
+#endif  // HWY_ARCH_X86
+
+namespace hwy {
+namespace {
+namespace timer {
+
+// Ticks := platform-specific timer values (CPU cycles on x86). Must be
+// unsigned to guarantee wraparound on overflow.
+using Ticks = uint64_t;
+
+// Start/Stop return absolute timestamps and must be placed immediately before
+// and after the region to measure. We provide separate Start/Stop functions
+// because they use different fences.
+//
+// Background: RDTSC is not 'serializing'; earlier instructions may complete
+// after it, and/or later instructions may complete before it. 'Fences' ensure
+// regions' elapsed times are independent of such reordering. The only
+// documented unprivileged serializing instruction is CPUID, which acts as a
+// full fence (no reordering across it in either direction). Unfortunately
+// the latency of CPUID varies wildly (perhaps made worse by not initializing
+// its EAX input). Because it cannot reliably be deducted from the region's
+// elapsed time, it must not be included in the region to measure (i.e.
+// between the two RDTSC).
+//
+// The newer RDTSCP is sometimes described as serializing, but it actually
+// only serves as a half-fence with release semantics. Although all
+// instructions in the region will complete before the final timestamp is
+// captured, subsequent instructions may leak into the region and increase the
+// elapsed time. Inserting another fence after the final RDTSCP would prevent
+// such reordering without affecting the measured region.
+//
+// Fortunately, such a fence exists. The LFENCE instruction is only documented
+// to delay later loads until earlier loads are visible. However, Intel's
+// reference manual says it acts as a full fence (waiting until all earlier
+// instructions have completed, and delaying later instructions until it
+// completes). AMD assigns the same behavior to MFENCE.
+//
+// We need a fence before the initial RDTSC to prevent earlier instructions
+// from leaking into the region, and arguably another after RDTSC to avoid
+// region instructions from completing before the timestamp is recorded.
+// When surrounded by fences, the additional RDTSCP half-fence provides no
+// benefit, so the initial timestamp can be recorded via RDTSC, which has
+// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
+// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
+//
+// Using Start+Start leads to higher variance and overhead than Stop+Stop.
+// However, Stop+Stop includes an LFENCE in the region measurements, which
+// adds a delay dependent on earlier loads. The combination of Start+Stop
+// is faster than Start+Start and more consistent than Stop+Stop because
+// the first LFENCE already delayed subsequent loads before the measured
+// region. This combination seems not to have been considered in prior work:
+// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
+//
+// Note: performance counters can measure 'exact' instructions-retired or
+// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
+// requires fences. Unfortunately, it is not accessible on all OSes and we
+// prefer to avoid kernel-mode drivers. Performance counters are also affected
+// by several under/over-count errata, so we use the TSC instead.
+
+// Returns a 64-bit timestamp in units of 'ticks'; to convert to seconds,
+// divide by InvariantTicksPerSecond.
+// The timer source, and therefore the tick unit, depends on the platform
+// branch selected below.
+inline Ticks Start() {
+  Ticks t;
+#if HWY_ARCH_PPC && defined(__GLIBC__)
+  // Reads special-purpose register 268 (presumably the time base).
+  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
+#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
+  // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
+  asm volatile("mrs %0, cntvct_el0" : "=r"(t));
+#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
+  // LFENCE / RDTSC / LFENCE via intrinsics, with a compiler barrier around
+  // each step.
+  _ReadWriteBarrier();
+  _mm_lfence();
+  _ReadWriteBarrier();
+  t = __rdtsc();
+  _ReadWriteBarrier();
+  _mm_lfence();
+  _ReadWriteBarrier();
+#elif HWY_ARCH_X86_64
+  // LFENCE / RDTSC / LFENCE. RDTSC leaves the upper half in rdx, which is
+  // shifted and OR-ed into the output register (rax) to form 64 bits.
+  asm volatile(
+      "lfence\n\t"
+      "rdtsc\n\t"
+      "shl $32, %%rdx\n\t"
+      "or %%rdx, %0\n\t"
+      "lfence"
+      : "=a"(t)
+      :
+      // "memory" avoids reordering. rdx = TSC >> 32.
+      // "cc" = flags modified by SHL.
+      : "rdx", "memory", "cc");
+#elif HWY_ARCH_RVV
+  asm volatile("rdtime %0" : "=r"(t));
+#elif defined(_WIN32) || defined(_WIN64)
+  // Raw performance-counter value; the return status is deliberately ignored.
+  LARGE_INTEGER counter;
+  (void)QueryPerformanceCounter(&counter);
+  t = counter.QuadPart;
+#elif defined(__APPLE__)
+  t = mach_absolute_time();
+#elif defined(__HAIKU__)
+  t = system_time_nsecs();  // since boot
+#else  // POSIX
+  // Monotonic clock converted to nanoseconds.
+  timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
+#endif
+  return t;
+}
+
+// Returns the timestamp that ends a measured region, in the same tick unit as
+// Start(). Only the x86 branches differ from Start(); the PPC and ARM64
+// branches repeat its instruction and every other platform calls Start().
+// WARNING: on x86, caller must check HasRDTSCP before using this!
+inline Ticks Stop() {
+  uint64_t t;
+#if HWY_ARCH_PPC && defined(__GLIBC__)
+  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
+#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
+  // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
+  asm volatile("mrs %0, cntvct_el0" : "=r"(t));
+#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
+  // RDTSCP / LFENCE via intrinsics; the TSC_AUX value in `aux` is unused.
+  _ReadWriteBarrier();
+  unsigned aux;
+  t = __rdtscp(&aux);
+  _ReadWriteBarrier();
+  _mm_lfence();
+  _ReadWriteBarrier();
+#elif HWY_ARCH_X86_64
+  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
+  asm volatile(
+      "rdtscp\n\t"
+      "shl $32, %%rdx\n\t"
+      "or %%rdx, %0\n\t"
+      "lfence"
+      : "=a"(t)
+      :
+      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
+      // "cc" = flags modified by SHL.
+      : "rcx", "rdx", "memory", "cc");
+#else
+  // No separate end-of-region instruction on this platform.
+  t = Start();
+#endif
+  return t;
+}
+
+}  // namespace timer
+
+namespace robust_statistics {
+
+// Sorts integral values in ascending order (e.g. for Mode). About 3x faster
+// than std::sort for input distributions with very few unique values.
+// Works by tallying each distinct value, sorting the (value, count) pairs and
+// then rewriting "values" as runs of equal elements.
+template <class T>
+void CountingSort(T* values, size_t num_values) {
+  // Unique values and their frequency (similar to flat_map).
+  using Unique = std::pair<T, int>;
+  std::vector<Unique> unique;
+  for (const T* it = values; it != values + num_values; ++it) {
+    const T value = *it;
+    // Linear search: intended for inputs with few distinct values.
+    auto pos = unique.begin();
+    while (pos != unique.end() && !(pos->first == value)) {
+      ++pos;
+    }
+    if (pos != unique.end()) {
+      ++pos->second;
+    } else {
+      unique.emplace_back(value, 1);
+    }
+  }
+
+  // Sort in ascending order of value (pair.first).
+  std::sort(unique.begin(), unique.end());
+
+  // Write that many copies of each unique value to the array.
+  T* HWY_RESTRICT p = values;
+  for (const Unique& entry : unique) {
+    p = std::fill_n(p, entry.second, entry.first);
+  }
+  NANOBENCHMARK_CHECK(p == values + num_values);
+}
+
+// @return i in [idx_begin, idx_begin + half_count) that minimizes
+// sorted[i + half_count] - sorted[i]. Ties keep the smallest such i; returns 0
+// if no candidate range is below the maximum value of T.
+template <typename T>
+size_t MinRange(const T* const HWY_RESTRICT sorted, const size_t idx_begin,
+                const size_t half_count) {
+  const size_t idx_end = idx_begin + half_count;
+  T best_range = std::numeric_limits<T>::max();
+  size_t best_idx = 0;
+
+  for (size_t idx = idx_begin; idx < idx_end; ++idx) {
+    NANOBENCHMARK_CHECK(sorted[idx] <= sorted[idx + half_count]);
+    const T range = sorted[idx + half_count] - sorted[idx];
+    if (!(range < best_range)) continue;
+    best_range = range;
+    best_idx = idx;
+  }
+
+  return best_idx;
+}
+
+// Returns an estimate of the mode by calling MinRange on successively
+// halved intervals. "sorted" must be in ascending order. This is the
+// Half Sample Mode estimator proposed by Bickel in "On a fast, robust
+// estimator of the mode", with complexity O(N log N). The mode is less
+// affected by outliers in highly-skewed distributions than the median.
+// The averaging operation below assumes "T" is an unsigned integer type.
+template <typename T>
+T ModeOfSorted(const T* const HWY_RESTRICT sorted, const size_t num_values) {
+  size_t idx_begin = 0;
+  size_t half_count = num_values / 2;
+  // Narrow the window to its densest half until at most two values remain.
+  for (; half_count > 1; half_count >>= 1) {
+    idx_begin = MinRange(sorted, idx_begin, half_count);
+  }
+
+  const T x = sorted[idx_begin + 0];
+  // Fewer than two input values: nothing to average with.
+  if (half_count == 0) {
+    return x;
+  }
+  NANOBENCHMARK_CHECK(half_count == 1);
+  // Rounded average of the two remaining neighbors.
+  return (x + sorted[idx_begin + 1] + 1) / 2;
+}
+
+// Returns the mode. Side effect: sorts "values".
+template <typename T>
+T Mode(T* values, const size_t num_values) {
+  CountingSort(values, num_values);
+  const T* const sorted = values;
+  return ModeOfSorted(sorted, num_values);
+}
+
+// Convenience overload for fixed-size arrays; also sorts "values".
+template <typename T, size_t N>
+T Mode(T (&values)[N]) {
+  T* const first = &values[0];
+  return Mode(first, N);
+}
+
+// Returns the median value. Side effect: sorts "values".
+// "values" must point to "num_values" (> 0) elements. For an even count, the
+// result is the rounded average of the two middle elements.
+template <typename T>
+T Median(T* values, const size_t num_values) {
+  // "values" is a raw pointer, so emptiness is judged by the element count
+  // (same precondition check as MedianAbsoluteDeviation).
+  NANOBENCHMARK_CHECK(num_values != 0);
+  std::sort(values, values + num_values);
+  const size_t half = num_values / 2;
+  // Odd count: return middle
+  if (num_values % 2) {
+    return values[half];
+  }
+  // Even count: return average of middle two.
+  return (values[half] + values[half - 1] + 1) / 2;
+}
+
+// Returns a robust measure of variability: the median of the absolute
+// differences between each value and "median".
+template <typename T>
+T MedianAbsoluteDeviation(const T* values, const size_t num_values,
+                          const T median) {
+  NANOBENCHMARK_CHECK(num_values != 0);
+  // Differences are formed in int64_t so that values below the median do not
+  // wrap around for unsigned T.
+  const int64_t center = static_cast<int64_t>(median);
+  std::vector<T> abs_deviations;
+  abs_deviations.reserve(num_values);
+  for (const T* it = values; it != values + num_values; ++it) {
+    const int64_t delta = static_cast<int64_t>(*it) - center;
+    abs_deviations.push_back(static_cast<T>(std::abs(delta)));
+  }
+  return Median(abs_deviations.data(), abs_deviations.size());
+}
+
+}  // namespace robust_statistics
+}  // namespace
+namespace platform {
+namespace {
+
+// Prevents the compiler from eliding the computations that led to "output".
+// Two implementations, selected at compile time: an empty inline-asm statement
+// for non-MSVC compilers, and a relaxed atomic store for MSVC.
+template <class T>
+inline void PreventElision(T&& output) {
+#if HWY_COMPILER_MSVC == 0
+  // Works by indicating to the compiler that "output" is being read and
+  // modified. The +r constraint avoids unnecessary writes to memory, but only
+  // works for built-in types (typically FuncOutput).
+  asm volatile("" : "+r"(output) : : "memory");
+#else
+  // MSVC does not support inline assembly anymore (and never supported GCC's
+  // RTL constraints). Self-assignment with #pragma optimize("off") might be
+  // expected to prevent elision, but it does not with MSVC 2015. Type-punning
+  // with volatile pointers generates inefficient code on MSVC 2017.
+  // Instead, store the value into a function-local static atomic.
+  static std::atomic<T> dummy(T{});
+  dummy.store(output, std::memory_order_relaxed);
+#endif
+}
+
+// Measures the actual current frequency of Ticks. We cannot rely on the nominal
+// frequency encoded in x86 BrandString because it is misleading on M1 Rosetta,
+// and not reported by AMD. CPUID 0x15 is also not yet widely supported. Also
+// used on RISC-V and ARM64.
+// Returns the highest ticks-per-second ratio seen over a few short
+// (at least 10 ms) measurement windows.
+HWY_MAYBE_UNUSED double MeasureNominalClockRate() {
+  using Clock = std::chrono::steady_clock;
+  using Seconds = std::chrono::duration<double, std::ratio<1>>;
+  // Arbitrary, enough to ignore 2 outliers without excessive init time.
+  constexpr int kReps = 3;
+
+  double max_ticks_per_sec = 0.0;
+  for (int rep = 0; rep < kReps; ++rep) {
+    const Clock::time_point time0 = Clock::now();
+    const timer::Ticks ticks0 = timer::Start();
+    const Clock::time_point time_min = time0 + std::chrono::milliseconds(10);
+
+    // Sample both clocks until the minimum window has elapsed.
+    Clock::time_point time1;
+    timer::Ticks ticks1;
+    do {
+      time1 = Clock::now();
+      // Ideally this would be Stop, but that requires RDTSCP on x86. To avoid
+      // another codepath, just use Start instead. now() presumably has its own
+      // fence-like behavior.
+      ticks1 = timer::Start();  // Do not use Stop, see comment above
+    } while (time1 < time_min);
+
+    const double dticks = static_cast<double>(ticks1 - ticks0);
+    const Seconds dtime = time1 - time0;
+    max_ticks_per_sec = std::max(max_ticks_per_sec, dticks / dtime.count());
+  }
+  return max_ticks_per_sec;
+}
+
+#if HWY_ARCH_X86
+
+// Executes the CPUID instruction for the given "level" and "count" and stores
+// the four resulting register values (a, b, c, d) in abcd[0..3].
+void Cpuid(const uint32_t level, const uint32_t count,
+           uint32_t* HWY_RESTRICT abcd) {
+#if HWY_COMPILER_MSVC
+  // The MSVC intrinsic writes all four registers into an int array.
+  int cpu_info[4];
+  __cpuidex(cpu_info, level, count);
+  abcd[0] = cpu_info[0];
+  abcd[1] = cpu_info[1];
+  abcd[2] = cpu_info[2];
+  abcd[3] = cpu_info[3];
+#else
+  // __cpuid_count requires one lvalue per output register.
+  uint32_t reg_a;
+  uint32_t reg_b;
+  uint32_t reg_c;
+  uint32_t reg_d;
+  __cpuid_count(level, count, reg_a, reg_b, reg_c, reg_d);
+  abcd[0] = reg_a;
+  abcd[1] = reg_b;
+  abcd[2] = reg_c;
+  abcd[3] = reg_d;
+#endif
+}
+
+// Returns whether CPUID reports support for the RDTSCP instruction.
+bool HasRDTSCP() {
+  constexpr uint32_t kRdtscpBit = 1u << 27;  // RDTSCP
+  uint32_t regs[4];
+  Cpuid(0x80000001U, 0, regs);  // Extended feature flags
+  const uint32_t flags_d = regs[3];
+  return (flags_d & kRdtscpBit) != 0;
+}
+
+// Returns the CPU brand string assembled from CPUID levels 0x80000002 to
+// 0x80000004, or an empty string if CPUID does not report those levels.
+std::string BrandString() {
+  std::array<uint32_t, 4> regs;
+
+  // Check if brand string is supported (it is on all reasonable Intel/AMD)
+  Cpuid(0x80000000U, 0, regs.data());
+  if (regs[0] < 0x80000004U) {
+    return std::string();
+  }
+
+  // Three levels of 16 bytes each, plus the terminating zero.
+  char text[49];
+  char* dest = text;
+  for (uint32_t level = 0x80000002U; level <= 0x80000004U; ++level) {
+    Cpuid(level, 0, regs.data());
+    CopyBytes<sizeof(regs)>(&regs[0], dest);  // not same size
+    dest += 16;
+  }
+  text[48] = 0;
+  return text;
+}
+
+#endif  // HWY_ARCH_X86
+
+}  // namespace
+
+// Returns the number of timer ticks per second. Depending on the platform, the
+// value is queried from the OS/libc, measured once via MeasureNominalClockRate
+// (and then cached), or assumed to be 1E9 (nanosecond ticks).
+HWY_DLLEXPORT double InvariantTicksPerSecond() {
+#if HWY_ARCH_PPC && defined(__GLIBC__)
+  return static_cast<double>(__ppc_get_timebase_freq());
+#elif HWY_ARCH_X86 || HWY_ARCH_RVV || (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC)
+  // We assume the x86 TSC is invariant; it is on all recent Intel/AMD CPUs.
+  // Function-local static: the measurement only runs on the first call.
+  static const double freq = MeasureNominalClockRate();
+  return freq;
+#elif defined(_WIN32) || defined(_WIN64)
+  LARGE_INTEGER freq;
+  (void)QueryPerformanceFrequency(&freq);  // return value deliberately ignored
+  return static_cast<double>(freq.QuadPart);
+#elif defined(__APPLE__)
+  // https://developer.apple.com/library/mac/qa/qa1398/_index.html
+  mach_timebase_info_data_t timebase;
+  (void)mach_timebase_info(&timebase);  // return value deliberately ignored
+  return static_cast<double>(timebase.denom) / timebase.numer * 1E9;
+#else
+  return 1E9;  // Haiku and clock_gettime return nanoseconds.
+#endif
+}
+
+// Returns the current tick count converted to seconds.
+HWY_DLLEXPORT double Now() {
+  // Seconds per tick; computed on the first call only.
+  static const double seconds_per_tick = 1.0 / InvariantTicksPerSecond();
+  const double ticks = static_cast<double>(timer::Start());
+  return ticks * seconds_per_tick;
+}
+
+// Returns the mode of the elapsed ticks between back-to-back timer calls. Takes
+// kTimerSamples repetitions of kTimerSamples samples each and returns the mode
+// of the per-repetition modes.
+HWY_DLLEXPORT uint64_t TimerResolution() {
+#if HWY_ARCH_X86
+  // timer::Stop requires RDTSCP on x86; fall back to Start if it is missing.
+  bool can_use_stop = platform::HasRDTSCP();
+#else
+  constexpr bool can_use_stop = true;
+#endif
+
+  // Nested loop avoids exceeding stack/L1 capacity.
+  timer::Ticks repetitions[Params::kTimerSamples];
+  for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) {
+    timer::Ticks samples[Params::kTimerSamples];
+    if (can_use_stop) {
+      for (size_t i = 0; i < Params::kTimerSamples; ++i) {
+        const timer::Ticks t0 = timer::Start();
+        const timer::Ticks t1 = timer::Stop();  // we checked HasRDTSCP above
+        samples[i] = t1 - t0;
+      }
+    } else {
+      for (size_t i = 0; i < Params::kTimerSamples; ++i) {
+        const timer::Ticks t0 = timer::Start();
+        const timer::Ticks t1 = timer::Start();  // do not use Stop, see above
+        samples[i] = t1 - t0;
+      }
+    }
+    // Summarize this repetition by its most frequent elapsed value.
+    repetitions[rep] = robust_statistics::Mode(samples);
+  }
+  return robust_statistics::Mode(repetitions);
+}
+
+}  // namespace platform
+namespace {
+
+// Timer resolution in ticks, measured once when this variable is initialized
+// during static initialization; used by SampleUntilStable and NumSkip.
+static const timer::Ticks timer_resolution = platform::TimerResolution();
+
+// Estimates the expected value of "lambda" values with a variable number of
+// samples until the variability "rel_mad" is less than "max_rel_mad".
+// "max_rel_mad": relative variability at or below which sampling stops.
+// "rel_mad": receives the relative variability of the last estimate.
+// "p": sampling parameters (seconds_per_eval, min_samples_per_eval,
+//   min_mode_samples, max_evals, verbose).
+// "lambda": the code to time; invoked once per sample.
+// Returns the estimated duration in ticks (mode or median of all samples),
+// even if the target variability was not reached within p.max_evals.
+template <class Lambda>
+timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
+                               const Params& p, const Lambda& lambda) {
+  // Choose initial samples_per_eval based on a single estimated duration.
+  timer::Ticks t0 = timer::Start();
+  lambda();
+  timer::Ticks t1 = timer::Stop();  // Caller checks HasRDTSCP
+  timer::Ticks est = t1 - t0;
+  static const double ticks_per_second = platform::InvariantTicksPerSecond();
+  const size_t ticks_per_eval =
+      static_cast<size_t>(ticks_per_second * p.seconds_per_eval);
+  // Avoid dividing by a zero estimate; fall back to the minimum instead.
+  size_t samples_per_eval = est == 0
+                                ? p.min_samples_per_eval
+                                : static_cast<size_t>(ticks_per_eval / est);
+  samples_per_eval = HWY_MAX(samples_per_eval, p.min_samples_per_eval);
+
+  std::vector<timer::Ticks> samples;
+  samples.reserve(1 + samples_per_eval);
+  samples.push_back(est);
+
+  // Percentage is too strict for tiny differences, so also allow a small
+  // absolute "median absolute deviation".
+  const timer::Ticks max_abs_mad = (timer_resolution + 99) / 100;
+  *rel_mad = 0.0;  // ensure initialized
+
+  // Each eval doubles the number of additional samples; all samples taken so
+  // far contribute to the estimate.
+  for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) {
+    samples.reserve(samples.size() + samples_per_eval);
+    for (size_t i = 0; i < samples_per_eval; ++i) {
+      t0 = timer::Start();
+      lambda();
+      t1 = timer::Stop();  // Caller checks HasRDTSCP
+      samples.push_back(t1 - t0);
+    }
+
+    if (samples.size() >= p.min_mode_samples) {
+      est = robust_statistics::Mode(samples.data(), samples.size());
+    } else {
+      // For "few" (depends also on the variance) samples, Median is safer.
+      est = robust_statistics::Median(samples.data(), samples.size());
+    }
+    NANOBENCHMARK_CHECK(est != 0);
+
+    // Median absolute deviation (mad) is a robust measure of 'variability'.
+    const timer::Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation(
+        samples.data(), samples.size(), est);
+    *rel_mad = static_cast<double>(abs_mad) / static_cast<double>(est);
+
+    // Stop as soon as either the relative or the absolute criterion is met.
+    if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) {
+      if (p.verbose) {
+        printf("%6" PRIu64 " samples => %5" PRIu64 " (abs_mad=%4" PRIu64
+               ", rel_mad=%4.2f%%)\n",
+               static_cast<uint64_t>(samples.size()),
+               static_cast<uint64_t>(est), static_cast<uint64_t>(abs_mad),
+               *rel_mad * 100.0);
+      }
+      return est;
+    }
+  }
+
+  // Ran out of evals: report the shortfall but still return the estimate.
+  if (p.verbose) {
+    printf("WARNING: rel_mad=%4.2f%% still exceeds %4.2f%% after %6" PRIu64
+           " samples.\n",
+           *rel_mad * 100.0, max_rel_mad * 100.0,
+           static_cast<uint64_t>(samples.size()));
+  }
+  return est;
+}
+
+using InputVec = std::vector<FuncInput>;
+
+// Returns the distinct values among "inputs", sorted in ascending order.
+InputVec UniqueInputs(const FuncInput* inputs, const size_t num_inputs) {
+  InputVec sorted(inputs, inputs + num_inputs);
+  std::sort(sorted.begin(), sorted.end());
+  // std::unique moves duplicates to the tail; trim them off afterwards.
+  const auto first_duplicate = std::unique(sorted.begin(), sorted.end());
+  sorted.erase(first_duplicate, sorted.end());
+  return sorted;
+}
+
+// Returns how often we need to call func for sufficient precision: the
+// precision divisor divided by the shortest measured duration, rounded up (or
+// 0 if that duration is zero).
+// NOTE(review): "elapsed - timer_resolution" looks like it would wrap around
+// if Ticks is unsigned and elapsed < timer_resolution - confirm.
+size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique,
+               const Params& p) {
+  // Min elapsed ticks for any input; starts at the largest Ticks value.
+  timer::Ticks shortest = ~timer::Ticks(0);
+
+  for (const FuncInput input : unique) {
+    const auto call_once = [func, arg, input]() {
+      platform::PreventElision(func(arg, input));
+    };
+    double rel_mad;
+    const timer::Ticks elapsed =
+        SampleUntilStable(p.target_rel_mad, &rel_mad, p, call_once);
+    shortest = HWY_MIN(shortest, elapsed - timer_resolution);
+  }
+
+  // Number of repetitions required to reach the target resolution.
+  const size_t max_skip = p.precision_divisor;
+  // Number of repetitions given the estimated duration (rounded up).
+  size_t num_skip = 0;
+  if (shortest != 0) {
+    num_skip = static_cast<size_t>((max_skip + shortest - 1) / shortest);
+  }
+
+  if (p.verbose) {
+    printf("res=%" PRIu64 " max_skip=%" PRIu64 " min_dur=%" PRIu64
+           " num_skip=%" PRIu64 "\n",
+           static_cast<uint64_t>(timer_resolution),
+           static_cast<uint64_t>(max_skip), static_cast<uint64_t>(shortest),
+           static_cast<uint64_t>(num_skip));
+  }
+  return num_skip;
+}
+
+// Returns "inputs" repeated subset_ratio * num_skip times and shuffled, so that
+// "num_skip" occurrences of an input can later be omitted. If there is only one
+// unique input, returns that many copies of inputs[0] instead.
+InputVec ReplicateInputs(const FuncInput* inputs, const size_t num_inputs,
+                         const size_t num_unique, const size_t num_skip,
+                         const Params& p) {
+  const size_t num_copies = p.subset_ratio * num_skip;
+  if (num_unique == 1) {
+    return InputVec(num_copies, inputs[0]);
+  }
+
+  InputVec replicated;
+  replicated.reserve(num_copies * num_inputs);
+  for (size_t copy = 0; copy < num_copies; ++copy) {
+    replicated.insert(replicated.end(), inputs, inputs + num_inputs);
+  }
+  // Default-constructed engine, hence the same permutation on every call.
+  std::mt19937 rng;
+  std::shuffle(replicated.begin(), replicated.end(), rng);
+  return replicated;
+}
+
+// Copies the "full" to "subset" in the same order, but with "num_skip"
+// randomly selected occurrences of "input_to_skip" removed.
+// "subset" must already have its final size; Measure creates it with
+// full.size() - num_skip elements.
+void FillSubset(const InputVec& full, const FuncInput input_to_skip,
+                const size_t num_skip, InputVec* subset) {
+  const size_t count =
+      static_cast<size_t>(std::count(full.begin(), full.end(), input_to_skip));
+  // Generate num_skip random indices: which occurrence to skip.
+  std::vector<uint32_t> omit(count);
+  std::iota(omit.begin(), omit.end(), 0);
+  // omit[] is the same on every call, but that's OK because they identify the
+  // Nth instance of input_to_skip, so the position within full[] differs.
+  std::mt19937 rng;
+  std::shuffle(omit.begin(), omit.end(), rng);
+  omit.resize(num_skip);
+  // Sorted so that a single forward pass over full[] can consume omit[].
+  std::sort(omit.begin(), omit.end());
+
+  uint32_t occurrence = ~0u;  // 0 after preincrement
+  size_t idx_omit = 0;        // cursor within omit[]
+  size_t idx_subset = 0;      // cursor within *subset
+  for (const FuncInput next : full) {
+    if (next == input_to_skip) {
+      ++occurrence;
+      // Haven't removed enough already
+      if (idx_omit < num_skip) {
+        // This one is up for removal
+        if (occurrence == omit[idx_omit]) {
+          ++idx_omit;
+          continue;
+        }
+      }
+    }
+    // Bounds check so that we never write past the end of *subset.
+    if (idx_subset < subset->size()) {
+      (*subset)[idx_subset++] = next;
+    }
+  }
+  // Sanity checks (only active if NANOBENCHMARK_ENABLE_CHECKS): the subset was
+  // filled completely, all omissions happened and every occurrence was seen.
+  NANOBENCHMARK_CHECK(idx_subset == subset->size());
+  NANOBENCHMARK_CHECK(idx_omit == omit.size());
+  NANOBENCHMARK_CHECK(occurrence == count - 1);
+}
+
+// Returns the estimated ticks elapsed for one pass over all "inputs", and
+// raises *max_rel_mad to the variability of this measurement if it is larger.
+timer::Ticks TotalDuration(const Func func, const uint8_t* arg,
+                           const InputVec* inputs, const Params& p,
+                           double* max_rel_mad) {
+  // One sample = calling func once for every element of *inputs.
+  const auto call_all = [func, arg, inputs]() {
+    for (const FuncInput input : *inputs) {
+      platform::PreventElision(func(arg, input));
+    }
+  };
+
+  double rel_mad;
+  const timer::Ticks ticks =
+      SampleUntilStable(p.target_rel_mad, &rel_mad, p, call_all);
+  *max_rel_mad = HWY_MAX(*max_rel_mad, rel_mad);
+  return ticks;
+}
+
+// (Nearly) empty Func for measuring timer overhead/resolution. Returns "input"
+// unchanged; HWY_NOINLINE presumably keeps the call from being inlined so that
+// the call overhead is part of the measurement.
+HWY_NOINLINE FuncOutput EmptyFunc(const void* /*arg*/, const FuncInput input) {
+  return input;
+}
+
+// Returns the ticks spent on iterating over "inputs" and calling a (nearly)
+// empty function; this will be deducted from future TotalDuration return
+// values.
+timer::Ticks Overhead(const uint8_t* arg, const InputVec* inputs,
+                      const Params& p) {
+  const auto call_empty = [arg, inputs]() {
+    for (const FuncInput input : *inputs) {
+      platform::PreventElision(EmptyFunc(arg, input));
+    }
+  };
+  // Zero tolerance because repeatability is crucial and EmptyFunc is fast.
+  const double max_rel_mad = 0.0;
+  double rel_mad;  // required by SampleUntilStable, otherwise unused
+  return SampleUntilStable(max_rel_mad, &rel_mad, p, call_empty);
+}
+
+}  // namespace
+
+// Compares the current tick count against ~0ULL, so the result is 1 in practice
+// but cannot be determined at compile time.
+HWY_DLLEXPORT int Unpredictable1() { return timer::Start() != ~0ULL; }
+
+// Measures the ticks elapsed per call of "func" for each unique value in
+// "inputs" (see the declaration in nanobenchmark.h for the full contract).
+// The cost of one input is estimated by timing a pass over the replicated,
+// shuffled inputs and a pass over a subset with "num_skip" occurrences of that
+// input removed; the difference, minus the corresponding difference in loop
+// overhead, is divided by num_skip.
+// "results" must have room for one Result per unique input.
+// Returns the number of results written, or 0 (after printing an error message
+// to stderr) if the measurement failed.
+HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg,
+                             const FuncInput* inputs, const size_t num_inputs,
+                             Result* results, const Params& p) {
+  NANOBENCHMARK_CHECK(num_inputs != 0);
+
+#if HWY_ARCH_X86
+  // SampleUntilStable uses timer::Stop, which requires RDTSCP on x86.
+  if (!platform::HasRDTSCP()) {
+    fprintf(stderr, "CPU '%s' does not support RDTSCP, skipping benchmark.\n",
+            platform::BrandString().c_str());
+    return 0;
+  }
+#endif
+
+  const InputVec& unique = UniqueInputs(inputs, num_inputs);
+
+  const size_t num_skip = NumSkip(func, arg, unique, p);
+  if (num_skip == 0) {
+    // NumSkip only prints in verbose mode (and to stdout), so report the
+    // failure here to honor the documented "error message goes to stderr".
+    fprintf(stderr,
+            "Measurement failed: cannot determine the number of repetitions\n");
+    return 0;
+  }
+  // (slightly less work on x86 to cast from signed integer)
+  const float mul = 1.0f / static_cast<float>(static_cast<int>(num_skip));
+
+  const InputVec& full =
+      ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p);
+  InputVec subset(full.size() - num_skip);
+
+  // Loop/call overhead for both vector sizes, measured with an empty function.
+  const timer::Ticks overhead = Overhead(arg, &full, p);
+  const timer::Ticks overhead_skip = Overhead(arg, &subset, p);
+  if (overhead < overhead_skip) {
+    fprintf(stderr, "Measurement failed: overhead %" PRIu64 " < %" PRIu64 "\n",
+            static_cast<uint64_t>(overhead),
+            static_cast<uint64_t>(overhead_skip));
+    return 0;
+  }
+
+  if (p.verbose) {
+    printf("#inputs=%5" PRIu64 ",%5" PRIu64 " overhead=%5" PRIu64 ",%5" PRIu64
+           "\n",
+           static_cast<uint64_t>(full.size()),
+           static_cast<uint64_t>(subset.size()),
+           static_cast<uint64_t>(overhead),
+           static_cast<uint64_t>(overhead_skip));
+  }
+
+  double max_rel_mad = 0.0;
+  const timer::Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad);
+
+  for (size_t i = 0; i < unique.size(); ++i) {
+    // Time the inputs again, this time without num_skip copies of unique[i].
+    FillSubset(full, unique[i], num_skip, &subset);
+    const timer::Ticks total_skip =
+        TotalDuration(func, arg, &subset, p, &max_rel_mad);
+
+    if (total < total_skip) {
+      fprintf(stderr, "Measurement failed: total %" PRIu64 " < %" PRIu64 "\n",
+              static_cast<uint64_t>(total), static_cast<uint64_t>(total_skip));
+      return 0;
+    }
+
+    // Ticks attributable to the num_skip omitted calls, net of loop overhead.
+    const timer::Ticks duration =
+        (total - overhead) - (total_skip - overhead_skip);
+    results[i].input = unique[i];
+    results[i].ticks = static_cast<float>(duration) * mul;
+    results[i].variability = static_cast<float>(max_rel_mad);
+  }
+
+  return unique.size();
+}
+
+}  // namespace hwy
@@ -0,0 +1,194 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_NANOBENCHMARK_H_
+#define HIGHWAY_HWY_NANOBENCHMARK_H_
+
+// Benchmarks functions of a single integer argument with realistic branch
+// prediction hit rates. Uses a robust estimator to summarize the measurements.
+// The precision is about 0.2%.
+//
+// Examples: see nanobenchmark_test.cc.
+//
+// Background: Microbenchmarks such as http://github.com/google/benchmark
+// can measure elapsed times on the order of a microsecond. Shorter functions
+// are typically measured by repeating them thousands of times and dividing
+// the total elapsed time by this count. Unfortunately, repetition (especially
+// with the same input parameter!) influences the runtime. In time-critical
+// code, it is reasonable to expect warm instruction/data caches and TLBs,
+// but a perfect record of which branches will be taken is unrealistic.
+// Unless the application also repeatedly invokes the measured function with
+// the same parameter, the benchmark is measuring something very different -
+// a best-case result, almost as if the parameter were made a compile-time
+// constant. This may lead to erroneous conclusions about branch-heavy
+// algorithms outperforming branch-free alternatives.
+//
+// Our approach differs in three ways. Adding fences to the timer functions
+// reduces variability due to instruction reordering, improving the timer
+// resolution to about 40 CPU cycles. However, shorter functions must still
+// be invoked repeatedly. For more realistic branch prediction performance,
+// we vary the input parameter according to a user-specified distribution.
+// Thus, instead of VaryInputs(Measure(Repeat(func))), we change the
+// loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the
+// central tendency of the measurement samples with the "half sample mode",
+// which is more robust to outliers and skewed data than the mean or median.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "hwy/highway_export.h"
+
+// Enables sanity checks that verify correct operation at the cost of
+// longer benchmark runs.
+#ifndef NANOBENCHMARK_ENABLE_CHECKS
+#define NANOBENCHMARK_ENABLE_CHECKS 0
+#endif
+
+// Prints the current line number to stderr and aborts if "condition" is false.
+// The expansion calls fprintf and abort, which are not declared by the standard
+// headers included directly above - presumably users of the macro include
+// <stdio.h> and <stdlib.h> themselves; TODO confirm.
+#define NANOBENCHMARK_CHECK_ALWAYS(condition)                             \
+  while (!(condition)) {                                                  \
+    fprintf(stderr, "Nanobenchmark check failed at line %d\n", __LINE__); \
+    abort();                                                              \
+  }
+
+// Same as NANOBENCHMARK_CHECK_ALWAYS if NANOBENCHMARK_ENABLE_CHECKS is nonzero;
+// otherwise expands to nothing, so "condition" is not evaluated.
+#if NANOBENCHMARK_ENABLE_CHECKS
+#define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition)
+#else
+#define NANOBENCHMARK_CHECK(condition)
+#endif
+
+namespace hwy {
+
+// Timer-related queries that depend on the platform.
+namespace platform {
+
+// Returns tick rate, useful for converting measurements to seconds. Invariant
+// means the tick counter frequency is independent of CPU throttling or sleep.
+// This call may be expensive, callers should cache the result.
+HWY_DLLEXPORT double InvariantTicksPerSecond();
+
+// Returns current timestamp [in seconds] relative to an unspecified origin.
+// Features: monotonic (no negative elapsed time), steady (unaffected by system
+// time changes), high-resolution (on the order of microseconds).
+HWY_DLLEXPORT double Now();
+
+// Returns ticks elapsed in back to back timer calls, i.e. a function of the
+// timer resolution (minimum measurable difference) and overhead.
+// This call is expensive, callers should cache the result.
+// The unit is ticks; divide by InvariantTicksPerSecond() to obtain seconds.
+HWY_DLLEXPORT uint64_t TimerResolution();
+
+}  // namespace platform
+
+// Returns 1, but without the compiler knowing what the value is. This prevents
+// optimizing out code.
+HWY_DLLEXPORT int Unpredictable1();
+
+// Input influencing the function being measured (e.g. number of bytes to copy).
+using FuncInput = size_t;
+
+// "Proof of work" returned by Func to ensure the compiler does not elide it.
+using FuncOutput = uint64_t;
+
+// Function to measure: either 1) a captureless lambda or function with two
+// arguments or 2) a lambda with capture, in which case the first argument
+// is reserved for use by MeasureClosure.
+using Func = FuncOutput (*)(const void*, FuncInput);
+
+// Internal parameters that determine precision/resolution/measuring time.
+// All members have defaults, so a default-constructed Params is usable as is.
+struct Params {
+  // For measuring timer overhead/resolution. Used in a nested loop =>
+  // quadratic time, acceptable because we know timer overhead is "low".
+  // constexpr because this is used to define array bounds.
+  static constexpr size_t kTimerSamples = 256;
+
+  // Best-case precision, expressed as a divisor of the timer resolution.
+  // Larger => more calls to Func and higher precision.
+  size_t precision_divisor = 1024;
+
+  // Ratio between full and subset input distribution sizes. Cannot be less
+  // than 2; larger values increase measurement time but more faithfully
+  // model the given input distribution.
+  size_t subset_ratio = 2;
+
+  // Together with the estimated Func duration, determines how many times to
+  // call Func before checking the sample variability. Larger values increase
+  // measurement time, memory/cache use and precision.
+  double seconds_per_eval = 4E-3;
+
+  // The minimum number of samples before estimating the central tendency.
+  // Also used as the sample count if the initial duration estimate is zero.
+  size_t min_samples_per_eval = 7;
+
+  // The mode is better than median for estimating the central tendency of
+  // skewed/fat-tailed distributions, but it requires sufficient samples
+  // relative to the width of half-ranges.
+  size_t min_mode_samples = 64;
+
+  // Maximum permissible variability (= median absolute deviation / center).
+  double target_rel_mad = 0.002;
+
+  // Abort after this many evals without reaching target_rel_mad. This
+  // prevents infinite loops. The number of samples doubles with each eval.
+  size_t max_evals = 9;
+
+  // Whether to print additional statistics to stdout.
+  bool verbose = true;
+};
+
+// Measurement result for each unique input.
+struct Result {
+  // The input value this result refers to.
+  FuncInput input;
+
+  // Robust estimate (mode or median) of duration.
+  float ticks;
+
+  // Measure of variability (median absolute deviation relative to "ticks").
+  // Measure() stores the largest value observed so far in that call.
+  float variability;
+};
+
+// Precisely measures the number of ticks elapsed when calling "func" with the
+// given inputs, shuffled to ensure realistic branch prediction hit rates.
+//
+// "func" returns a 'proof of work' to ensure its computations are not elided.
+// "arg" is passed to Func, or reserved for internal use by MeasureClosure.
+// "inputs" is an array of "num_inputs" (not necessarily unique) arguments to
+//   "func". The values should be chosen to maximize coverage of "func". This
+//   represents a distribution, so a value's frequency should reflect its
+//   probability in the real application. Order does not matter; for example, a
+//   uniform distribution over [0, 4) could be represented as {3,0,2,1}.
+// "results" receives one Result per unique input, so it must have room for up
+//   to "num_inputs" elements.
+// "p" tunes precision versus measuring time; see Params.
+// Returns how many Result were written to "results": one per unique input, or
+//   zero if the measurement failed (an error message goes to stderr).
+HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg,
+                             const FuncInput* inputs, const size_t num_inputs,
+                             Result* results, const Params& p = Params());
+
+// Calls operator() of the given closure (lambda function).
+// "f" points to the closure; "input" is forwarded to it and its result is
+// returned. MeasureClosure passes the address of this function to Measure.
+template <class Closure>
+static FuncOutput CallClosure(const Closure* f, const FuncInput input) {
+  return (*f)(input);
+}
+
+// Same as Measure, except "closure" is typically a lambda function of
+// FuncInput -> FuncOutput with a capture list.
+// The address of "closure" is passed as Measure's "arg" and CallClosure is
+// passed (cast to Func) as the function to measure.
+template <class Closure>
+static inline size_t MeasureClosure(const Closure& closure,
+                                    const FuncInput* inputs,
+                                    const size_t num_inputs, Result* results,
+                                    const Params& p = Params()) {
+  return Measure(reinterpret_cast<Func>(&CallClosure<Closure>),
+                 reinterpret_cast<const uint8_t*>(&closure), inputs, num_inputs,
+                 results, p);
+}
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_NANOBENCHMARK_H_
@@ -0,0 +1,94 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/nanobenchmark.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS  // before inttypes.h
+#endif
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <random>
+
+#include "hwy/tests/test_util-inl.h"
+
+namespace hwy {
+namespace {
+
+// Governs duration of test; avoid timeout in debug builds.
+#if HWY_IS_DEBUG_BUILD
+constexpr size_t kMaxEvals = 3;
+#else
+constexpr size_t kMaxEvals = 4;
+#endif
+
+// Function to measure: divides a constant by "in", which must be nonzero.
+FuncOutput Div(const void*, FuncInput in) {
+  // Here we're measuring the throughput because benchmark invocations are
+  // independent. Any dividend will do; the divisor is nonzero.
+  return 0xFFFFF / in;
+}
+
+// Measures Div for the given inputs and prints one line per result.
+template <size_t N>
+void MeasureDiv(const FuncInput (&inputs)[N]) {
+  printf("Measuring integer division (output on final two lines)\n");
+
+  Params div_params;
+  div_params.max_evals = kMaxEvals;
+
+  Result results[N];
+  const size_t num_written =
+      Measure(&Div, nullptr, inputs, N, results, div_params);
+  for (const Result* r = results; r != results + num_written; ++r) {
+    const uint64_t input = static_cast<uint64_t>(r->input);
+    const double mad_percent = r->variability * 100.0;
+    printf("%5" PRIu64 ": %6.2f ticks; MAD=%4.2f%%\n", input, r->ticks,
+           mad_percent);
+  }
+}
+
+std::mt19937 rng;
+
+// A function whose runtime depends on rng: divides "in" a random number of
+// times (0 to 15) by a random divisor (2 or 3).
+FuncOutput Random(const void* /*arg*/, FuncInput in) {
+  FuncOutput quotient = static_cast<FuncOutput>(in);
+  for (size_t remaining = rng() & 0xF; remaining != 0; --remaining) {
+    quotient /= ((rng() & 1) + 2);
+  }
+  return quotient;
+}
+
+// Ensure the measured variability is high.
+// Note that NANOBENCHMARK_CHECK expands to nothing unless
+// NANOBENCHMARK_ENABLE_CHECKS is nonzero, in which case the loop below does not
+// verify anything.
+template <size_t N>
+void MeasureRandom(const FuncInput (&inputs)[N]) {
+  Result results[N];
+  Params p;
+  p.max_evals = kMaxEvals;
+  p.verbose = false;  // suppress the statistics Measure prints by default
+  const size_t num_results = Measure(&Random, nullptr, inputs, N, results, p);
+  for (size_t i = 0; i < num_results; ++i) {
+    NANOBENCHMARK_CHECK(results[i].variability > 1E-3);
+  }
+}
+
+// Runs both measurements with two inputs the compiler cannot constant-fold.
+TEST(NanobenchmarkTest, RunAll) {
+  // Equal to 1 at runtime, but unknown to the compiler.
+  const FuncInput one = static_cast<FuncInput>(Unpredictable1());
+  static const FuncInput inputs[] = {one + 2, one + 9};
+
+  MeasureDiv(inputs);
+  MeasureRandom(inputs);
+}
+
+}  // namespace
+}  // namespace hwy
@@ -0,0 +1,444 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Sets macros based on HWY_TARGET.
+
+// This include guard is toggled by foreach_target, so avoid the usual _H_
+// suffix to prevent copybara from renaming it.
+// Note that the #if below only encloses the toggle itself: its #endif follows
+// immediately, so the remainder of this file is not covered by the guard.
+#if defined(HWY_SET_MACROS_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
+#ifdef HWY_SET_MACROS_PER_TARGET
+#undef HWY_SET_MACROS_PER_TARGET
+#else
+#define HWY_SET_MACROS_PER_TARGET
+#endif
+
+#endif  // HWY_SET_MACROS_PER_TARGET
+
+#include "hwy/detect_targets.h"
+
+#undef HWY_NAMESPACE
+#undef HWY_ALIGN
+#undef HWY_MAX_BYTES
+#undef HWY_LANES
+
+#undef HWY_HAVE_SCALABLE
+#undef HWY_HAVE_INTEGER64
+#undef HWY_HAVE_FLOAT16
+#undef HWY_HAVE_FLOAT64
+#undef HWY_MEM_OPS_MIGHT_FAULT
+#undef HWY_NATIVE_FMA
+#undef HWY_CAP_GE256
+#undef HWY_CAP_GE512
+
+#undef HWY_TARGET_STR
+
+// Comma-separated feature lists from which the x86 HWY_TARGET_STR values are
+// assembled. Defining one of the HWY_DISABLE_* macros replaces the respective
+// features with an empty string.
+#if defined(HWY_DISABLE_PCLMUL_AES)
+#define HWY_TARGET_STR_PCLMUL_AES ""
+#else
+#define HWY_TARGET_STR_PCLMUL_AES ",pclmul,aes"
+#endif
+
+#if defined(HWY_DISABLE_BMI2_FMA)
+#define HWY_TARGET_STR_BMI2_FMA ""
+#else
+#define HWY_TARGET_STR_BMI2_FMA ",bmi,bmi2,fma"
+#endif
+
+#if defined(HWY_DISABLE_F16C)
+#define HWY_TARGET_STR_F16C ""
+#else
+#define HWY_TARGET_STR_F16C ",f16c"
+#endif
+
+#define HWY_TARGET_STR_SSSE3 "sse2,ssse3"
+
+// Each list starts with the previous one (adjacent string literals are
+// concatenated) and appends the features added by that target.
+#define HWY_TARGET_STR_SSE4 \
+  HWY_TARGET_STR_SSSE3 ",sse4.1,sse4.2" HWY_TARGET_STR_PCLMUL_AES
+// Include previous targets, which are the half-vectors of the next target.
+#define HWY_TARGET_STR_AVX2 \
+  HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C
+#define HWY_TARGET_STR_AVX3 \
+  HWY_TARGET_STR_AVX2 ",avx512f,avx512vl,avx512dq,avx512bw"
+
+// Before include guard so we redefine HWY_TARGET_STR on each include,
+// governed by the current HWY_TARGET.
+
+//-----------------------------------------------------------------------------
+// SSSE3
+#if HWY_TARGET == HWY_SSSE3
+
+#define HWY_NAMESPACE N_SSSE3
+#define HWY_ALIGN alignas(16)
+#define HWY_MAX_BYTES 16
+#define HWY_LANES(T) (16 / sizeof(T))
+
+#define HWY_HAVE_SCALABLE 0
+#define HWY_HAVE_INTEGER64 1
+#define HWY_HAVE_FLOAT16 1
+#define HWY_HAVE_FLOAT64 1
+#define HWY_MEM_OPS_MIGHT_FAULT 1
+#define HWY_NATIVE_FMA 0
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+
+#define HWY_TARGET_STR HWY_TARGET_STR_SSSE3
+
+//-----------------------------------------------------------------------------
+// SSE4
+#elif HWY_TARGET == HWY_SSE4
+
+#define HWY_NAMESPACE N_SSE4
+#define HWY_ALIGN alignas(16)
+#define HWY_MAX_BYTES 16
+#define HWY_LANES(T) (16 / sizeof(T))
+
+#define HWY_HAVE_SCALABLE 0
+#define HWY_HAVE_INTEGER64 1
+#define HWY_HAVE_FLOAT16 1
+#define HWY_HAVE_FLOAT64 1
+#define HWY_MEM_OPS_MIGHT_FAULT 1
+#define HWY_NATIVE_FMA 0
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+
+#define HWY_TARGET_STR HWY_TARGET_STR_SSE4
+
+//-----------------------------------------------------------------------------
+// AVX2
+#elif HWY_TARGET == HWY_AVX2
+
+#define HWY_NAMESPACE N_AVX2
+#define HWY_ALIGN alignas(32)
+#define HWY_MAX_BYTES 32
+#define HWY_LANES(T) (32 / sizeof(T))
+
+#define HWY_HAVE_SCALABLE 0
+#define HWY_HAVE_INTEGER64 1
+#define HWY_HAVE_FLOAT16 1
+#define HWY_HAVE_FLOAT64 1
+#define HWY_MEM_OPS_MIGHT_FAULT 1
+
+#ifdef HWY_DISABLE_BMI2_FMA
+#define HWY_NATIVE_FMA 0
+#else
+#define HWY_NATIVE_FMA 1
+#endif
+
+#define HWY_CAP_GE256 1
+#define HWY_CAP_GE512 0
+
+#define HWY_TARGET_STR HWY_TARGET_STR_AVX2
+
+//-----------------------------------------------------------------------------
+// AVX3[_DL]
+#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL
+
+#define HWY_ALIGN alignas(64)
+#define HWY_MAX_BYTES 64
+#define HWY_LANES(T) (64 / sizeof(T))
+
+#define HWY_HAVE_SCALABLE 0
+#define HWY_HAVE_INTEGER64 1
+#define HWY_HAVE_FLOAT16 1
+#define HWY_HAVE_FLOAT64 1
+#define HWY_MEM_OPS_MIGHT_FAULT 0
+#define HWY_NATIVE_FMA 1
+#define HWY_CAP_GE256 1
+#define HWY_CAP_GE512 1
+
+// Namespace and target string differ between the two AVX3 variants; all other
+// macros above are shared.
+#if HWY_TARGET == HWY_AVX3
+
+#define HWY_NAMESPACE N_AVX3
+#define HWY_TARGET_STR HWY_TARGET_STR_AVX3
+
+#elif HWY_TARGET == HWY_AVX3_DL
+
+#define HWY_NAMESPACE N_AVX3_DL
+// AVX3_DL extends the 512-bit AVX3 feature set, so request the AVX-512 variant
+// of VNNI ("avx512vnni"); "avxvnni" names the separate VEX-encoded AVX-VNNI
+// extension.
+#define HWY_TARGET_STR                                               \
+  HWY_TARGET_STR_AVX3                                                \
+  ",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avx512vnni,avx512bitalg," \
+  "avx512vpopcntdq"
+
+#else
+#error "Logic error"
+#endif  // HWY_TARGET == HWY_AVX3_DL
+
+//-----------------------------------------------------------------------------
+// PPC8
+#elif HWY_TARGET == HWY_PPC8
+
+#define HWY_ALIGN alignas(16)
+#define HWY_MAX_BYTES 16
+#define HWY_LANES(T) (16 / sizeof(T))
+
+#define HWY_HAVE_SCALABLE 0
+#define HWY_HAVE_INTEGER64 1
+#define HWY_HAVE_FLOAT16 0
+#define HWY_HAVE_FLOAT64 1
+#define HWY_MEM_OPS_MIGHT_FAULT 1
+#define HWY_NATIVE_FMA 1
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+
+#define HWY_NAMESPACE N_PPC8
+
+#define HWY_TARGET_STR "altivec,vsx"
+
+//-----------------------------------------------------------------------------
+// NEON: fixed 16-byte vectors; float64 and FMA depend on the Arm variant.
+#elif HWY_TARGET == HWY_NEON
+
+#define HWY_ALIGN alignas(16)
+#define HWY_MAX_BYTES 16
+#define HWY_LANES(T) (16 / sizeof(T))  // lanes of T in HWY_MAX_BYTES bytes
+
+#define HWY_HAVE_SCALABLE 0
+#define HWY_HAVE_INTEGER64 1
+#define HWY_HAVE_FLOAT16 1
+// Float64 lanes are only enabled when targeting A64.
+#if HWY_ARCH_ARM_A64
+#define HWY_HAVE_FLOAT64 1
+#else
+#define HWY_HAVE_FLOAT64 0
+#endif
+
+#define HWY_MEM_OPS_MIGHT_FAULT 1
+// FMA is native on A64, or when the compiler defines __ARM_VFPV4__.
+#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
+#define HWY_NATIVE_FMA 1
+#else
+#define HWY_NATIVE_FMA 0
+#endif
+
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+
+#define HWY_NAMESPACE N_NEON
+
+// Can use pragmas instead of -march compiler flag
+#if HWY_HAVE_RUNTIME_DISPATCH
+#if HWY_ARCH_ARM_V7
+#define HWY_TARGET_STR "+neon-vfpv4"
+#else
+#define HWY_TARGET_STR "+crypto"
+#endif  // HWY_ARCH_ARM_V7
+#else
+// HWY_TARGET_STR remains undefined, so HWY_ATTR is a no-op.
+#endif  // HWY_HAVE_RUNTIME_DISPATCH
+
+//-----------------------------------------------------------------------------
+// SVE[2]: scalable vectors; four variants share the settings below.
+#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE || \
+    HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
+
+// SVE only requires lane alignment, not natural alignment of the entire vector.
+#define HWY_ALIGN alignas(8)
+
+// Value ensures MaxLanes() is the tightest possible upper bound to reduce
+// overallocation.
+#define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T))  // HWY_MAX_BYTES: set below
+
+#define HWY_HAVE_SCALABLE 1
+#define HWY_HAVE_INTEGER64 1
+#define HWY_HAVE_FLOAT16 1
+#define HWY_HAVE_FLOAT64 1
+#define HWY_MEM_OPS_MIGHT_FAULT 0
+#define HWY_NATIVE_FMA 1
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+// Per-variant namespace and upper bound on the vector size in bytes.
+#if HWY_TARGET == HWY_SVE2
+#define HWY_NAMESPACE N_SVE2
+#define HWY_MAX_BYTES 256
+#elif HWY_TARGET == HWY_SVE_256
+#define HWY_NAMESPACE N_SVE_256
+#define HWY_MAX_BYTES 32
+#elif HWY_TARGET == HWY_SVE2_128
+#define HWY_NAMESPACE N_SVE2_128
+#define HWY_MAX_BYTES 16
+#else  // HWY_SVE, the only remaining target admitted by the enclosing #elif
+#define HWY_NAMESPACE N_SVE
+#define HWY_MAX_BYTES 256
+#endif
+
+// Can use pragmas instead of -march compiler flag
+#if HWY_HAVE_RUNTIME_DISPATCH
+#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
+#define HWY_TARGET_STR "+sve2-aes"
+#else
+#define HWY_TARGET_STR "+sve"
+#endif
+#else
+// HWY_TARGET_STR remains undefined, so HWY_ATTR is a no-op.
+#endif  // HWY_HAVE_RUNTIME_DISPATCH
+
+//-----------------------------------------------------------------------------
+// WASM: fixed 16-byte vectors.
+#elif HWY_TARGET == HWY_WASM
+
+#define HWY_ALIGN alignas(16)
+#define HWY_MAX_BYTES 16
+#define HWY_LANES(T) (16 / sizeof(T))  // lanes of T in HWY_MAX_BYTES bytes
+
+#define HWY_HAVE_SCALABLE 0
+#define HWY_HAVE_INTEGER64 1
+#define HWY_HAVE_FLOAT16 1
+#define HWY_HAVE_FLOAT64 0
+#define HWY_MEM_OPS_MIGHT_FAULT 1
+#define HWY_NATIVE_FMA 0
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+
+#define HWY_NAMESPACE N_WASM
+// Attribute string consumed by HWY_BEFORE_NAMESPACE and HWY_ATTR.
+#define HWY_TARGET_STR "simd128"
+
+//-----------------------------------------------------------------------------
+// WASM_EMU256: 32-byte vectors; same attribute string as HWY_WASM.
+#elif HWY_TARGET == HWY_WASM_EMU256
+
+#define HWY_ALIGN alignas(32)
+#define HWY_MAX_BYTES 32
+#define HWY_LANES(T) (32 / sizeof(T))  // lanes of T in HWY_MAX_BYTES bytes
+
+#define HWY_HAVE_SCALABLE 0
+#define HWY_HAVE_INTEGER64 1
+#define HWY_HAVE_FLOAT16 1
+#define HWY_HAVE_FLOAT64 0
+#define HWY_MEM_OPS_MIGHT_FAULT 1
+#define HWY_NATIVE_FMA 0
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+
+#define HWY_NAMESPACE N_WASM_EMU256
+// Attribute string consumed by HWY_BEFORE_NAMESPACE and HWY_ATTR.
+#define HWY_TARGET_STR "simd128"
+
+//-----------------------------------------------------------------------------
+// RVV: scalable vectors; the sizes below are compile-time upper bounds.
+#elif HWY_TARGET == HWY_RVV
+
+// RVV only requires lane alignment, not natural alignment of the entire vector,
+// and the compiler already aligns builtin types, so nothing to do here.
+#define HWY_ALIGN
+
+// The spec requires VLEN <= 2^16 bits, so the limit is 2^16 bytes (LMUL=8).
+#define HWY_MAX_BYTES 65536
+
+// = HWY_MAX_BYTES divided by max LMUL=8 because MaxLanes includes the actual
+// LMUL. This is the tightest possible upper bound.
+#define HWY_LANES(T) (8192 / sizeof(T))
+
+#define HWY_HAVE_SCALABLE 1
+#define HWY_HAVE_INTEGER64 1
+#define HWY_HAVE_FLOAT64 1
+#define HWY_MEM_OPS_MIGHT_FAULT 0
+#define HWY_NATIVE_FMA 1
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+// Float16 lanes are only enabled when the compiler defines __riscv_zvfh.
+#if defined(__riscv_zvfh)
+#define HWY_HAVE_FLOAT16 1
+#else
+#define HWY_HAVE_FLOAT16 0
+#endif  // __riscv_zvfh
+
+#define HWY_NAMESPACE N_RVV
+
+// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
+// (rv64gcv is not a valid target)
+
+//-----------------------------------------------------------------------------
+// EMU128: fixed 16-byte vectors without a target attribute string.
+#elif HWY_TARGET == HWY_EMU128
+
+#define HWY_ALIGN alignas(16)
+#define HWY_MAX_BYTES 16
+#define HWY_LANES(T) (16 / sizeof(T))  // lanes of T in HWY_MAX_BYTES bytes
+
+#define HWY_HAVE_SCALABLE 0
+#define HWY_HAVE_INTEGER64 1
+#define HWY_HAVE_FLOAT16 1
+#define HWY_HAVE_FLOAT64 1
+#define HWY_MEM_OPS_MIGHT_FAULT 1
+#define HWY_NATIVE_FMA 0
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+
+#define HWY_NAMESPACE N_EMU128
+
+// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
+
+//-----------------------------------------------------------------------------
+// SCALAR: a single lane per vector, no alignment or target attribute string.
+#elif HWY_TARGET == HWY_SCALAR
+
+#define HWY_ALIGN  // intentionally empty
+#define HWY_MAX_BYTES 8
+#define HWY_LANES(T) 1  // one lane regardless of T
+
+#define HWY_HAVE_SCALABLE 0
+#define HWY_HAVE_INTEGER64 1
+#define HWY_HAVE_FLOAT16 1
+#define HWY_HAVE_FLOAT64 1
+#define HWY_MEM_OPS_MIGHT_FAULT 0
+#define HWY_NATIVE_FMA 0
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+
+#define HWY_NAMESPACE N_SCALAR
+
+// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
+// Unknown target: only a message is emitted; no per-target macro is defined.
+#else
+#pragma message("HWY_TARGET does not match any known target")
+#endif  // HWY_TARGET
+
+// Override the per-target value with 1 in asan/msan builds, which still fault.
+#if HWY_IS_ASAN || HWY_IS_MSAN
+#undef HWY_MEM_OPS_MIGHT_FAULT  // discard the per-target value chosen above
+#define HWY_MEM_OPS_MIGHT_FAULT 1
+#endif  // HWY_IS_ASAN || HWY_IS_MSAN
+
+// Clang <9 requires this be invoked at file scope, before any namespace.
+#undef HWY_BEFORE_NAMESPACE  // drop any earlier definition before redefining
+#if defined(HWY_TARGET_STR)
+#define HWY_BEFORE_NAMESPACE()        \
+  HWY_PUSH_ATTRIBUTES(HWY_TARGET_STR) \
+  static_assert(true, "For requiring trailing semicolon")
+#else
+// No HWY_TARGET_STR: expand to the static_assert only (avoids a warning).
+#define HWY_BEFORE_NAMESPACE() \
+  static_assert(true, "For requiring trailing semicolon")
+#endif  // defined(HWY_TARGET_STR)
+
+// Clang <9 requires any namespaces be closed before this macro.
+#undef HWY_AFTER_NAMESPACE  // drop any earlier definition before redefining
+#if defined(HWY_TARGET_STR)
+#define HWY_AFTER_NAMESPACE() \
+  HWY_POP_ATTRIBUTES          \
+  static_assert(true, "For requiring trailing semicolon")
+#else
+// No HWY_TARGET_STR: expand to the static_assert only (avoids a warning).
+#define HWY_AFTER_NAMESPACE() \
+  static_assert(true, "For requiring trailing semicolon")
+#endif  // defined(HWY_TARGET_STR)
+// Per-function target attribute; empty without HWY_TARGET_STR or the attribute.
+#undef HWY_ATTR
+#if defined(HWY_TARGET_STR) && HWY_HAS_ATTRIBUTE(target)
+#define HWY_ATTR __attribute__((target(HWY_TARGET_STR)))
+#else
+#define HWY_ATTR
+#endif
@@ -0,0 +1,311 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Per-target definitions shared by ops/*.h and user code.
+
+#include <cmath>
+
+#include "hwy/base.h"
+
+// Separate header because foreach_target.h re-enables its include guard.
+#include "hwy/ops/set_macros-inl.h"
+
+// Relies on the external include guard in highway.h.
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Highway operations are implemented as overloaded functions selected using an
+// internal-only tag type D := Simd<T, N, kPow2>. T is the lane type. kPow2 is a
+// shift count applied to scalable vectors. Instead of referring to Simd<>
+// directly, users create D via aliases ScalableTag<T[, kPow2]>() (defaults to a
+// full vector, or fractions/groups if the argument is negative/positive),
+// CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of lanes is
+// Lanes(D()), a power of two. For scalable vectors, N is either HWY_LANES or a
+// cap. For constexpr-size vectors, N is the actual number of lanes. This
+// ensures Half<Full512<T>> is the same type as Full256<T>, as required by x86.
+template <typename Lane, size_t N, int kPow2>
+struct Simd {
+  constexpr Simd() = default;
+  using T = Lane;  // lane type; read by TFromD
+  static_assert((N & (N - 1)) == 0 && N != 0, "N must be a power of two");
+
+  // Only for use by MaxLanes, Pow2 and HWY_IF_{LT,GE}128_D; required by MSVC.
+  // Cannot be enum because GCC warns when using enums and non-enums in the same
+  // expression. Cannot be static constexpr function (another MSVC limitation).
+  static constexpr size_t kPrivateN = N;
+  static constexpr int kPrivatePow2 = kPow2;
+
+  template <typename NewT>
+  static constexpr size_t NewN() {
+    // Round up to correctly handle scalars with N=1.
+    return (N * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT);
+  }
+  // kPow2 delta for Rebind: +/-CeilLog2 of the lane size ratio NewT vs. T.
+#if HWY_HAVE_SCALABLE
+  template <typename NewT>
+  static constexpr int Pow2Ratio() {
+    return (sizeof(NewT) > sizeof(T))
+               ? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T)))
+               : -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT)));
+  }
+#endif  // HWY_HAVE_SCALABLE
+
+  // Widening/narrowing ops change the number of lanes and/or their type.
+  // To initialize such vectors, we need the corresponding tag types:
+
+// PromoteTo/DemoteTo() with another lane type, but same number of lanes.
+#if HWY_HAVE_SCALABLE
+  template <typename NewT>
+  using Rebind = Simd<NewT, N, kPow2 + Pow2Ratio<NewT>()>;
+#else
+  template <typename NewT>
+  using Rebind = Simd<NewT, N, kPow2>;  // kPow2 is passed through unchanged
+#endif
+
+  // Change lane type while keeping the same vector size, e.g. for MulEven.
+  template <typename NewT>
+  using Repartition = Simd<NewT, NewN<NewT>(), kPow2>;
+
+// Half the lanes while keeping the same lane type, e.g. for LowerHalf.
+// Round up to correctly handle scalars with N=1.
+#if HWY_HAVE_SCALABLE
+  // Reducing the cap (N) is required for SVE - if N is the limiter for f32xN,
+  // then we expect Half<Rebind<u16>> to have N/2 lanes (rounded up).
+  using Half = Simd<T, (N + 1) / 2, kPow2 - 1>;
+#else
+  using Half = Simd<T, (N + 1) / 2, kPow2>;
+#endif
+
+// Twice the lanes while keeping the same lane type, e.g. for Combine.
+#if HWY_HAVE_SCALABLE
+  using Twice = Simd<T, 2 * N, kPow2 + 1>;  // N and the shift both grow
+#else
+  using Twice = Simd<T, 2 * N, kPow2>;
+#endif
+};
+
+namespace detail {
+// True iff the tag has no shift (kPow2 == 0) and exactly HWY_LANES(T) lanes.
+template <typename T, size_t N, int kPow2>
+constexpr bool IsFull(Simd<T, N, kPow2> /* tag */) {
+  return (kPow2 == 0) && (N == HWY_LANES(T));
+}
+
+// Scales the lane count N by 2^pow2 (result may be zero for fractions):
+// - pow2 < 0, i.e. [-3,-1]: a fraction of a vector from 1/8 to 1/2;
+// - pow2 == 0: N is returned unchanged;
+// - pow2 > 0, i.e. [1,3]: a group of 2,4,8 vectors on RVV, else N unchanged.
+constexpr size_t ScaleByPower(size_t N, int pow2) {
+#if HWY_TARGET == HWY_RVV
+  return (pow2 < 0) ? (N >> (-pow2)) : (N << pow2);
+#else
+  return (pow2 < 0) ? (N >> (-pow2)) : N;
+#endif
+}
+
+// Struct wrappers enable validation of arguments via static_assert.
+template <typename T, int kPow2>
+struct ScalableTagChecker {
+  static_assert(-3 <= kPow2 && kPow2 <= 3, "Fraction must be 1/8 to 8");
+#if HWY_TARGET == HWY_RVV
+  // Only RVV supports register groups.
+  using type = Simd<T, HWY_LANES(T), kPow2>;
+#elif HWY_HAVE_SCALABLE
+  // For SVE[2], only allow full or fractions.
+  using type = Simd<T, HWY_LANES(T), HWY_MIN(kPow2, 0)>;
+#elif HWY_TARGET == HWY_SCALAR
+  using type = Simd<T, /*N=*/1, 0>;  // always one lane; kPow2 is not used
+#else
+  // Only allow full or fractions; the shift is folded into N, so kPow2 is 0.
+  using type = Simd<T, ScaleByPower(HWY_LANES(T), HWY_MIN(kPow2, 0)), 0>;
+#endif
+};
+
+template <typename T, size_t kLimit>  // maps CappedTag's kLimit to a Simd tag
+struct CappedTagChecker {
+  static_assert(kLimit != 0, "Does not make sense to have zero lanes");
+  // Safely handle non-power-of-two inputs by rounding down, which is allowed by
+  // CappedTag. Otherwise, Simd<T, 3, 0> would static_assert.
+  static constexpr size_t kLimitPow2 = size_t{1} << hwy::FloorLog2(kLimit);
+  using type = Simd<T, HWY_MIN(kLimitPow2, HWY_LANES(T)), 0>;  // kPow2 is 0
+};
+
+template <typename T, size_t kNumLanes>  // validates FixedTag's lane count
+struct FixedTagChecker {
+  static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
+  static_assert(kNumLanes <= HWY_LANES(T), "Too many lanes");
+  using type = Simd<T, kNumLanes, 0>;  // Simd asserts the power-of-two part
+};
+
+}  // namespace detail
+
+// Alias for a tag describing a full vector (kPow2 == 0: the most common usage,
+// e.g. 1D loops where the application does not care about the vector size) or a
+// fraction/multiple of one; kPow2 must be in [-3, 3]. Multiples are the same as
+// full vectors for all targets except RVV. Fractions (kPow2 < 0) are useful as
+// the argument/return value of type promotion and demotion.
+template <typename T, int kPow2 = 0>
+using ScalableTag = typename detail::ScalableTagChecker<T, kPow2>::type;
+
+// Alias for a tag describing a vector with *up to* kLimit active lanes, even on
+// targets with scalable vectors and HWY_SCALAR. The runtime lane count
+// `Lanes(tag)` may be less than kLimit, and is 1 on HWY_SCALAR. This alias is
+// typically used for 1D loops with a relatively low application-defined upper
+// bound, e.g. for 8x8 DCTs. However, it is better if data structures are
+// designed to be vector-length-agnostic (e.g. a hybrid SoA where there are
+// chunks of `M >= MaxLanes(d)` DC components followed by M AC1, .., and M AC63;
+// this would enable vector-length-agnostic loops using ScalableTag).
+template <typename T, size_t kLimit>  // nonzero; rounded down to a power of 2
+using CappedTag = typename detail::CappedTagChecker<T, kLimit>::type;
+
+// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
+// even on targets with scalable vectors. Requires `kNumLanes` to be a power of
+// two not exceeding `HWY_LANES(T)`.
+//
+// NOTE: if the application does not need to support HWY_SCALAR (+), use this
+// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
+// This is useful for data structures that rely on exactly 128-bit SIMD, but
+// these are discouraged because they cannot benefit from wider vectors.
+// Instead, applications would ideally define a larger problem size and loop
+// over it with the (unknown size) vectors from ScalableTag.
+//
+// + e.g. if the baseline is known to support SIMD, or the application requires
+//   ops such as TableLookupBytes not supported by HWY_SCALAR.
+template <typename T, size_t kNumLanes>  // limits enforced via static_assert
+using FixedTag = typename detail::FixedTagChecker<T, kNumLanes>::type;
+// Lane type of the tag D.
+template <class D>
+using TFromD = typename D::T;
+
+// Tag for the same number of lanes as D, but with the LaneType T.
+template <class T, class D>
+using Rebind = typename D::template Rebind<T>;
+// Shorthands: Rebind D to the MakeSigned/MakeUnsigned/MakeFloat of its lanes.
+template <class D>
+using RebindToSigned = Rebind<MakeSigned<TFromD<D>>, D>;
+template <class D>
+using RebindToUnsigned = Rebind<MakeUnsigned<TFromD<D>>, D>;
+template <class D>
+using RebindToFloat = Rebind<MakeFloat<TFromD<D>>, D>;
+
+// Tag for the same total size as D, but with the LaneType T.
+template <class T, class D>
+using Repartition = typename D::template Repartition<T>;
+// Shorthands: Repartition D to the MakeWide/MakeNarrow of its lane type.
+template <class D>
+using RepartitionToWide = Repartition<MakeWide<TFromD<D>>, D>;
+template <class D>
+using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>;
+
+// Tag for the same lane type as D, but half the lanes.
+template <class D>
+using Half = typename D::Half;  // forwards to the member alias of the tag
+
+// Tag for the same lane type as D, but twice the lanes.
+template <class D>
+using Twice = typename D::Twice;  // forwards to the member alias of the tag
+// Tags for 4-, 8- and 16-byte vectors. A larger T gives N=0, rejected by Simd.
+template <typename T>
+using Full32 = Simd<T, 4 / sizeof(T), 0>;  // 4 bytes of T lanes
+
+template <typename T>
+using Full64 = Simd<T, 8 / sizeof(T), 0>;  // 8 bytes of T lanes
+
+template <typename T>
+using Full128 = Simd<T, 16 / sizeof(T), 0>;  // 16 bytes of T lanes
+
+// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
+#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
+#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)
+#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>)
+#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(TFromD<D>)
+#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD<D>, bytes)
+#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD<D>, bytes)
+// Enabled if the tag's kPrivateN lanes occupy less than / at least 16 bytes.
+// MSVC workaround: use kPrivateN directly instead of MaxLanes.
+#define HWY_IF_LT128_D(D) \
+  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) < 16>* = nullptr
+#define HWY_IF_GE128_D(D) \
+  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) >= 16>* = nullptr
+
+// Same, but with a vector argument. ops/*-inl.h define their own TFromV.
+#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
+#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
+#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
+#define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
+#define HWY_IF_NOT_LANE_SIZE_V(V, bytes) HWY_IF_NOT_LANE_SIZE(TFromV<V>, bytes)
+// Returns the kPow2 shift count of the tag type D (its kPrivatePow2 member).
+template <class D>
+HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D /* d */) {
+  return D::kPrivatePow2;
+}
+
+// Enabled if the shift count of D is at least MIN. MSVC requires the explicit <D>.
+#define HWY_IF_POW2_GE(D, MIN) hwy::EnableIf<Pow2<D>(D()) >= (MIN)>* = nullptr
+
+#if HWY_HAVE_SCALABLE
+// Scalable targets: HWY_MIN of the cap and HWY_LANES, then the kPow2 shift.
+// Upper bound on the number of lanes. Intended for template arguments and
+// reducing code size (e.g. for SSE4, we know at compile-time that vectors will
+// not exceed 16 bytes). WARNING: this may be a loose bound, use Lanes() as the
+// actual size for allocating storage. WARNING: MSVC might not be able to deduce
+// arguments if this is used in EnableIf. See HWY_IF_LT128_D above.
+template <class D>
+HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
+  return detail::ScaleByPower(HWY_MIN(D::kPrivateN, HWY_LANES(TFromD<D>)),
+                              D::kPrivatePow2);
+}
+
+#else  // fixed-size targets: the tag's N is returned as-is
+// Workaround for MSVC 2017: T,N,kPow2 argument deduction fails, so returning N
+// is not an option, nor does a member function work.
+template <class D>
+HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
+  return D::kPrivateN;
+}
+
+// (Potentially) non-constant actual size of the vector at runtime, subject to
+// the limit imposed by the Simd. Useful for advancing loop counters.
+// Targets with scalable vectors define this themselves.
+template <typename T, size_t N, int kPow2>
+HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N, kPow2>) {
+  return N;  // on these targets the lane count is the compile-time N
+}
+
+#endif  // !HWY_HAVE_SCALABLE
+
+// NOTE: GCC generates incorrect code for vector arguments to non-inlined
+// functions in two situations:
+// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
+//   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
+// - on ARM64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not
+//   all) tests to fail.
+//
+// We therefore pass by const& only on GCC and (Windows or ARM64). This alias
+// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
+// and possibly also other functions that are not inlined.
+#if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64)
+template <class V>
+using VecArg = const V&;  // by reference on the affected GCC configurations
+#else
+template <class V>
+using VecArg = V;  // by value everywhere else
+#endif
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
--- a/Show More
+++ b/Show More