ported from UXP:

- Issue #1769 - Part 1: Add vendored libjxl and highway sources. (7983f5d8)
- Issue #1769 - Part 1 Follow-up: Tidy up moz.build for highway and libjxl (00a5d640)
- Issue #1769 - Part 1 Follow-up: Use standard [[deprecated]] for JXL_DEPRECATED. (40f27cd3)
- Issue #1769 - Follow-up: Fix typo in MOZ_ARG_ENABLE_BOOL (4d78f53d)
- Issue #1769: Update symbols for libxul linkage (1ef390db)
- PR #2050 follow-up: add symbols to build shared on Windows. (514ed142)
- Issue #2061 Follow-up: Export jxl/version.h. (b1811451)
- Issue #2061 Follow-up: Fix moz.build to compile on all platforms. (743d1f66)
- Issue #2061 - Follow-up: Silence compiler warnings for libjxl (84dc161d)
- Issue #2061 - Follow-up: Silence compiler warnings for libjxl (MSVC) (80c20628)
This commit is contained in:
2023-06-26 11:40:31 +08:00
parent f7c8a895e8
commit 3c3852c268
955 changed files with 231619 additions and 0 deletions
+1
View File
@@ -159,6 +159,7 @@ def old_configure_options(*options):
'--enable-alsa',
'--enable-android-omx',
'--enable-av1',
'--enable-jxl',
'--enable-b2g-bt',
'--enable-b2g-camera',
'--enable-b2g-ril',
+3
View File
@@ -53,6 +53,9 @@ if CONFIG['MOZ_WEBSPEECH_POCKETSPHINX']:
if CONFIG['MOZ_FFVPX']:
external_dirs += ['media/ffvpx']
if CONFIG["MOZ_JXL"]:
external_dirs += ["media/libjxl", "media/highway"]
external_dirs += [
'media/kiss_fft',
'media/libcubeb',
+18
View File
@@ -80,6 +80,24 @@ aom_codec_peek_stream_info
aom_img_alloc
aom_img_free
#endif
#ifdef MOZ_JXL
JxlDecoderCreate
JxlDecoderDestroy
JxlDecoderSetParallelRunner
JxlDecoderSubscribeEvents
JxlDecoderProcessInput
JxlDecoderSetInput
JxlDecoderReleaseInput
JxlDecoderGetBasicInfo
JxlDecoderImageOutBufferSize
JxlDecoderSetImageOutBuffer
JxlDecoderGetFrameHeader
JxlDecoderFlushImage
JxlThreadParallelRunner
JxlThreadParallelRunnerCreate
JxlThreadParallelRunnerDestroy
JxlThreadParallelRunnerDefaultNumWorkerThreads
#endif
#ifdef MOZ_VORBIS
ogg_page_bos
ogg_page_granulepos
+12
View File
@@ -0,0 +1,12 @@
This directory contains build files for the Highway C++
SIMD library.
Any patches or additional configuration to be applied to the
upstream source should be kept here in the media/highway
directory.
The upstream highway git repository is:
https://github.com/google/highway
The version used was tagged 1.0.2.
+48
View File
@@ -0,0 +1,48 @@
# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
# vim: set filetype=python:
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
LOCAL_INCLUDES += [
"/media/highway/src/",
]
SOURCES += [
"/media/highway/src/hwy/aligned_allocator.cc",
"/media/highway/src/hwy/contrib/image/image.cc",
"/media/highway/src/hwy/per_target.cc",
"/media/highway/src/hwy/targets.cc",
]
EXPORTS.hwy += [
"/media/highway/src/hwy/aligned_allocator.h",
"/media/highway/src/hwy/base.h",
"/media/highway/src/hwy/cache_control.h",
"/media/highway/src/hwy/detect_compiler_arch.h",
"/media/highway/src/hwy/detect_targets.h",
"/media/highway/src/hwy/foreach_target.h",
"/media/highway/src/hwy/highway.h",
"/media/highway/src/hwy/highway_export.h",
"/media/highway/src/hwy/targets.h",
]
EXPORTS.hwy.ops += [
"/media/highway/src/hwy/ops/arm_neon-inl.h",
"/media/highway/src/hwy/ops/arm_sve-inl.h",
"/media/highway/src/hwy/ops/emu128-inl.h",
"/media/highway/src/hwy/ops/generic_ops-inl.h",
"/media/highway/src/hwy/ops/rvv-inl.h",
"/media/highway/src/hwy/ops/scalar-inl.h",
"/media/highway/src/hwy/ops/set_macros-inl.h",
"/media/highway/src/hwy/ops/shared-inl.h",
"/media/highway/src/hwy/ops/wasm_128-inl.h",
"/media/highway/src/hwy/ops/x86_128-inl.h",
"/media/highway/src/hwy/ops/x86_256-inl.h",
"/media/highway/src/hwy/ops/x86_512-inl.h",
]
FINAL_LIBRARY = "gkmedias"
# We allow warnings for third-party code that can be updated from upstream.
ALLOW_COMPILER_WARNINGS = True
+413
View File
@@ -0,0 +1,413 @@
load("@bazel_skylib//lib:selects.bzl", "selects")
load("@rules_cc//cc:defs.bzl", "cc_test")
package(
default_visibility = ["//visibility:public"],
)
licenses(["notice"])
exports_files(["LICENSE"])
# Detect compiler:
config_setting(
name = "compiler_clang",
flag_values = {"@bazel_tools//tools/cpp:compiler": "clang"},
)
config_setting(
name = "compiler_clangcl",
flag_values = {"@bazel_tools//tools/cpp:compiler": "lexan"},
)
config_setting(
name = "compiler_msvc_actual",
flag_values = {"@bazel_tools//tools/cpp:compiler": "msvc"},
)
# The above is insufficient for Bazel on Windows, which does not seem to
# detect/set a compiler flag. This workaround prevents compile errors due to
# passing clang-only warning flags to MSVC.
config_setting(
name = "compiler_msvc_cpu",
values = {
"cpu": "x64_windows",
},
)
selects.config_setting_group(
name = "compiler_msvc",
match_any = [
":compiler_msvc_actual",
":compiler_msvc_cpu",
],
)
config_setting(
name = "compiler_emscripten",
values = {"cpu": "wasm32"},
)
# See https://github.com/bazelbuild/bazel/issues/12707
config_setting(
name = "compiler_gcc_bug",
flag_values = {
"@bazel_tools//tools/cpp:compiler": "compiler",
},
)
config_setting(
name = "compiler_gcc_actual",
flag_values = {
"@bazel_tools//tools/cpp:compiler": "gcc",
},
)
selects.config_setting_group(
name = "compiler_gcc",
match_any = [
":compiler_gcc_bug",
":compiler_gcc_actual",
],
)
# Additional warnings for Clang OR GCC (skip for MSVC)
CLANG_GCC_COPTS = [
"-Wunused-parameter",
"-Wunused-variable",
"-Wextra-semi",
"-Wunreachable-code",
]
# Warnings supported by Clang and Clang-cl
CLANG_OR_CLANGCL_OPTS = CLANG_GCC_COPTS + [
"-Wfloat-overflow-conversion",
"-Wfloat-zero-conversion",
"-Wfor-loop-analysis",
"-Wgnu-redeclared-enum",
"-Winfinite-recursion",
"-Wliteral-conversion",
"-Wno-c++98-compat",
"-Wno-unused-command-line-argument",
"-Wprivate-header",
"-Wself-assign",
"-Wstring-conversion",
"-Wtautological-overlap-compare",
"-Wthread-safety-analysis",
"-Wundefined-func-template",
"-Wunused-comparison",
]
# Warnings only supported by Clang, but not Clang-cl
CLANG_ONLY_COPTS = CLANG_OR_CLANGCL_OPTS + [
# Do not treat the third_party headers as system headers when building
# highway - the errors are pertinent.
"--no-system-header-prefix=third_party/highway",
]
COPTS = select({
":compiler_msvc": [],
":compiler_gcc": CLANG_GCC_COPTS,
":compiler_clangcl": CLANG_OR_CLANGCL_OPTS,
# Default to clang because compiler detection only works in Bazel
"//conditions:default": CLANG_ONLY_COPTS,
}) + select({
"@platforms//cpu:riscv64": [
"-march=rv64gcv1p0",
"-menable-experimental-extensions",
],
"//conditions:default": [
],
})
DEFINES = select({
":compiler_msvc": ["HWY_SHARED_DEFINE"],
":compiler_clangcl": ["HWY_SHARED_DEFINE"],
"//conditions:default": [],
})
# Unused on Bazel builds, where this is not defined/known; Copybara replaces
# usages with an empty list.
COMPAT = [
"//buildenv/target:non_prod", # includes mobile/vendor.
]
# WARNING: changing flags such as HWY_DISABLED_TARGETS may break users without
# failing integration tests, if the machine running tests does not support the
# newly enabled instruction set, or the failure is only caught by sanitizers
# which do not run in CI.
cc_library(
name = "hwy",
srcs = [
"hwy/aligned_allocator.cc",
"hwy/per_target.cc",
"hwy/print.cc",
"hwy/targets.cc",
],
# Normal headers with include guards
hdrs = [
"hwy/aligned_allocator.h",
"hwy/base.h",
"hwy/cache_control.h",
"hwy/detect_compiler_arch.h", # private
"hwy/print.h",
],
compatible_with = [],
copts = COPTS,
defines = DEFINES,
local_defines = ["hwy_EXPORTS"],
textual_hdrs = [
# These are textual because config macros influence them:
"hwy/detect_targets.h", # private
"hwy/targets.h",
# This .cc file #includes itself through foreach_target.h
"hwy/per_target.cc",
# End of list
"hwy/highway.h", # public
"hwy/foreach_target.h", # public
"hwy/per_target.h", # public
"hwy/print-inl.h", # public
"hwy/highway_export.h", # public
"hwy/ops/arm_neon-inl.h",
"hwy/ops/arm_sve-inl.h",
"hwy/ops/emu128-inl.h",
"hwy/ops/generic_ops-inl.h",
"hwy/ops/scalar-inl.h",
"hwy/ops/set_macros-inl.h",
"hwy/ops/shared-inl.h",
"hwy/ops/x86_128-inl.h",
"hwy/ops/x86_256-inl.h",
"hwy/ops/x86_512-inl.h",
# Select avoids recompiling native arch if only non-native changed
] + select({
":compiler_emscripten": ["hwy/ops/wasm_128-inl.h"],
"//conditions:default": [],
}) + select({
"@platforms//cpu:riscv64": ["hwy/ops/rvv-inl.h"],
"//conditions:default": [],
}),
)
cc_library(
name = "algo",
compatible_with = [],
copts = COPTS,
textual_hdrs = [
"hwy/contrib/algo/copy-inl.h",
"hwy/contrib/algo/find-inl.h",
"hwy/contrib/algo/transform-inl.h",
],
deps = [
":hwy",
],
)
cc_library(
name = "dot",
compatible_with = [],
copts = COPTS,
textual_hdrs = [
"hwy/contrib/dot/dot-inl.h",
],
deps = [
":hwy",
],
)
cc_library(
name = "image",
srcs = [
"hwy/contrib/image/image.cc",
],
hdrs = [
"hwy/contrib/image/image.h",
],
compatible_with = [],
copts = COPTS,
local_defines = ["hwy_contrib_EXPORTS"],
deps = [
":hwy",
],
)
cc_library(
name = "math",
compatible_with = [],
copts = COPTS,
textual_hdrs = [
"hwy/contrib/math/math-inl.h",
],
deps = [
":hwy",
],
)
# Everything required for tests that use Highway.
cc_library(
name = "hwy_test_util",
srcs = ["hwy/tests/test_util.cc"],
hdrs = ["hwy/tests/test_util.h"],
compatible_with = [],
copts = COPTS,
local_defines = ["hwy_test_EXPORTS"],
textual_hdrs = [
"hwy/tests/test_util-inl.h",
"hwy/tests/hwy_gtest.h",
],
# Must not depend on a gtest variant, which can conflict with the
# GUNIT_INTERNAL_BUILD_MODE defined by the test.
deps = [
":hwy",
],
)
cc_library(
name = "nanobenchmark",
srcs = ["hwy/nanobenchmark.cc"],
hdrs = ["hwy/nanobenchmark.h"],
compatible_with = [],
copts = COPTS,
local_defines = ["hwy_EXPORTS"],
deps = [":hwy"],
)
cc_binary(
name = "benchmark",
srcs = ["hwy/examples/benchmark.cc"],
copts = COPTS,
deps = [
":hwy",
":nanobenchmark",
],
)
cc_library(
name = "skeleton",
srcs = ["hwy/examples/skeleton.cc"],
hdrs = ["hwy/examples/skeleton.h"],
copts = COPTS,
local_defines = ["hwy_EXPORTS"],
textual_hdrs = ["hwy/examples/skeleton-inl.h"],
deps = [
":hwy",
],
)
cc_binary(
name = "list_targets",
srcs = ["hwy/tests/list_targets.cc"],
deps = [":hwy"],
)
# path, name
HWY_TESTS = [
("hwy/contrib/algo/", "copy_test"),
("hwy/contrib/algo/", "find_test"),
("hwy/contrib/algo/", "transform_test"),
("hwy/contrib/dot/", "dot_test"),
("hwy/contrib/image/", "image_test"),
("hwy/contrib/math/", "math_test"),
# contrib/sort has its own BUILD, we add it to GUITAR_TESTS.
("hwy/examples/", "skeleton_test"),
("hwy/", "nanobenchmark_test"),
("hwy/", "aligned_allocator_test"),
("hwy/", "base_test"),
("hwy/", "highway_test"),
("hwy/", "targets_test"),
("hwy/tests/", "arithmetic_test"),
("hwy/tests/", "blockwise_test"),
("hwy/tests/", "blockwise_shift_test"),
("hwy/tests/", "combine_test"),
("hwy/tests/", "compare_test"),
("hwy/tests/", "compress_test"),
("hwy/tests/", "convert_test"),
("hwy/tests/", "crypto_test"),
("hwy/tests/", "demote_test"),
("hwy/tests/", "float_test"),
("hwy/tests/", "if_test"),
("hwy/tests/", "interleaved_test"),
("hwy/tests/", "logical_test"),
("hwy/tests/", "mask_test"),
("hwy/tests/", "mask_mem_test"),
("hwy/tests/", "memory_test"),
("hwy/tests/", "mul_test"),
("hwy/tests/", "reduction_test"),
("hwy/tests/", "reverse_test"),
("hwy/tests/", "shift_test"),
("hwy/tests/", "swizzle_test"),
("hwy/tests/", "test_util_test"),
]
HWY_TEST_COPTS = select({
":compiler_msvc": [],
"//conditions:default": [
# gTest triggers this warning (which is enabled by the
# extra-semi in COPTS), so we need to disable it here,
# but it's still enabled for :hwy.
"-Wno-c++98-compat-extra-semi",
],
})
HWY_TEST_DEPS = [
":algo",
":dot",
":hwy",
":hwy_test_util",
":image",
":math",
":nanobenchmark",
":skeleton",
"//hwy/contrib/sort:vqsort",
"@com_google_googletest//:gtest_main",
]
[
[
cc_test(
name = test,
size = "medium",
timeout = "long", # default moderate is not enough for math_test
srcs = [
subdir + test + ".cc",
],
copts = COPTS + HWY_TEST_COPTS,
features = select({
"@platforms//cpu:riscv64": ["fully_static_link"],
"//conditions:default": [],
}),
linkopts = select({
":compiler_emscripten": [
"-s ASSERTIONS=2",
"-s ENVIRONMENT=node,shell,web",
"-s ERROR_ON_UNDEFINED_SYMBOLS=1",
"-s DEMANGLE_SUPPORT=1",
"-s EXIT_RUNTIME=1",
"-s ALLOW_MEMORY_GROWTH=1",
"--pre-js $(location :preamble.js.lds)",
],
"//conditions:default": [],
}),
linkstatic = select({
"@platforms//cpu:riscv64": True,
"//conditions:default": False,
}),
local_defines = ["HWY_IS_TEST"],
# for test_suite.
tags = ["hwy_ops_test"],
deps = HWY_TEST_DEPS + select({
":compiler_emscripten": [":preamble.js.lds"],
"//conditions:default": [],
}),
),
]
for subdir, test in HWY_TESTS
]
# For manually building the tests we define here (:all does not work in --config=msvc)
test_suite(
name = "hwy_ops_tests",
tags = ["hwy_ops_test"],
)
# Placeholder for integration test, do not remove
+580
View File
@@ -0,0 +1,580 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.10)
# Set PIE flags for POSITION_INDEPENDENT_CODE targets, added in 3.14.
if(POLICY CMP0083)
cmake_policy(SET CMP0083 NEW)
endif()
# Workaround for 3.19 raising error 'IMPORTED_LOCATION not set for imported
# target "GTest::gtest_main"'.
if(POLICY CMP0111)
cmake_policy(SET CMP0111 OLD)
endif()
project(hwy VERSION 1.0.2) # Keep in sync with highway.h version
# Directly define the ABI version from the cmake project() version values:
set(LIBRARY_VERSION "${hwy_VERSION}")
set(LIBRARY_SOVERSION ${hwy_VERSION_MAJOR})
set(CMAKE_CXX_EXTENSIONS OFF)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
# Search for Atomics implementation:
find_package(Atomics REQUIRED)
# Enable PIE binaries by default if supported.
include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED)
if(CHECK_PIE_SUPPORTED)
check_pie_supported(LANGUAGES CXX)
if(CMAKE_CXX_LINK_PIE_SUPPORTED)
set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
endif()
endif()
include(GNUInstallDirs)
if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE RelWithDebInfo)
endif()
set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON (requires vfpv4)?")
# Unconditionally adding -Werror risks breaking the build when new warnings
# arise due to compiler/platform changes. Enable this in CI/tests.
set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?")
set(HWY_ENABLE_CONTRIB ON CACHE BOOL "Include contrib/")
set(HWY_ENABLE_EXAMPLES ON CACHE BOOL "Build examples")
set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install library")
set(HWY_ENABLE_TESTS ON CACHE BOOL "Enable HWY tests")
include(CheckCXXSourceCompiles)
check_cxx_source_compiles(
"int main() {
#if !defined(__EMSCRIPTEN__)
static_assert(false, \"__EMSCRIPTEN__ is not defined\");
#endif
return 0;
}"
HWY_EMSCRIPTEN
)
check_cxx_source_compiles(
"int main() {
#if !defined(__riscv)
static_assert(false, \"__riscv is not defined\");
#endif
return 0;
}"
HWY_RISCV
)
if (HWY_ENABLE_CONTRIB)
# Glob all the traits so we don't need to modify this file when adding
# additional special cases.
file(GLOB HWY_CONTRIB_SOURCES "hwy/contrib/sort/vqsort_*.cc")
list(APPEND HWY_CONTRIB_SOURCES
hwy/contrib/dot/dot-inl.h
hwy/contrib/image/image.cc
hwy/contrib/image/image.h
hwy/contrib/math/math-inl.h
hwy/contrib/sort/shared-inl.h
hwy/contrib/sort/sorting_networks-inl.h
hwy/contrib/sort/traits-inl.h
hwy/contrib/sort/traits128-inl.h
hwy/contrib/sort/vqsort-inl.h
hwy/contrib/sort/vqsort.cc
hwy/contrib/sort/vqsort.h
hwy/contrib/algo/copy-inl.h
hwy/contrib/algo/find-inl.h
hwy/contrib/algo/transform-inl.h
)
endif() # HWY_ENABLE_CONTRIB
set(HWY_SOURCES
hwy/aligned_allocator.cc
hwy/aligned_allocator.h
hwy/base.h
hwy/cache_control.h
hwy/detect_compiler_arch.h # private
hwy/detect_targets.h # private
hwy/foreach_target.h
hwy/highway.h
hwy/highway_export.h
hwy/nanobenchmark.cc
hwy/nanobenchmark.h
hwy/ops/arm_neon-inl.h
hwy/ops/arm_sve-inl.h
hwy/ops/emu128-inl.h
hwy/ops/generic_ops-inl.h
hwy/ops/rvv-inl.h
hwy/ops/scalar-inl.h
hwy/ops/set_macros-inl.h
hwy/ops/shared-inl.h
hwy/ops/wasm_128-inl.h
hwy/ops/x86_128-inl.h
hwy/ops/x86_256-inl.h
hwy/ops/x86_512-inl.h
hwy/per_target.cc
hwy/per_target.h
hwy/print-inl.h
hwy/print.cc
hwy/print.h
hwy/targets.cc
hwy/targets.h
)
set(HWY_TEST_SOURCES
hwy/tests/hwy_gtest.h
hwy/tests/test_util-inl.h
hwy/tests/test_util.cc
hwy/tests/test_util.h
)
if (MSVC)
set(HWY_FLAGS
# fix build error C1128 in blockwise*_test & arithmetic_test
/bigobj
)
else()
set(HWY_FLAGS
# Avoid changing binaries based on the current time and date.
-Wno-builtin-macro-redefined
-D__DATE__="redacted"
-D__TIMESTAMP__="redacted"
-D__TIME__="redacted"
# Optimizations
-fmerge-all-constants
# Warnings
-Wall
-Wextra
# These are not included in Wall nor Wextra:
-Wconversion
-Wsign-conversion
-Wvla
-Wnon-virtual-dtor
)
if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
list(APPEND HWY_FLAGS
-Wfloat-overflow-conversion
-Wfloat-zero-conversion
-Wfor-loop-analysis
-Wgnu-redeclared-enum
-Winfinite-recursion
-Wself-assign
-Wstring-conversion
-Wtautological-overlap-compare
-Wthread-safety-analysis
-Wundefined-func-template
-fno-cxx-exceptions
-fno-slp-vectorize
-fno-vectorize
# Use color in messages
-fdiagnostics-show-option -fcolor-diagnostics
)
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 6.0)
list(APPEND HWY_FLAGS -Wc++2a-extensions)
endif()
endif()
if (WIN32)
if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
list(APPEND HWY_FLAGS
-Wno-global-constructors
-Wno-language-extension-token
-Wno-used-but-marked-unused
-Wno-shadow-field-in-constructor
-Wno-unused-member-function
-Wno-unused-template
-Wno-c++98-compat-pedantic
-Wno-used-but-marked-unused
-Wno-zero-as-null-pointer-constant
)
endif()
list(APPEND HWY_FLAGS
-Wno-cast-align
-Wno-double-promotion
-Wno-float-equal
-Wno-format-nonliteral
-Wno-shadow
-Wno-sign-conversion
)
else()
list(APPEND HWY_FLAGS
-fmath-errno
-fno-exceptions
)
endif() # WIN32
if (HWY_CMAKE_ARM7)
list(APPEND HWY_FLAGS
-march=armv7-a
-mfpu=neon-vfpv4
-mfloat-abi=hard # must match the toolchain specified as CXX=
-mfp16-format=ieee # required for vcvt_f32_f16
)
endif() # HWY_CMAKE_ARM7
if(HWY_RISCV)
if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
# Not yet supported by GCC. When runtime dispatch is supported and
# implemented, we will remove v from the required flags. Until then, using
# clang for RISC-V will require the CPU to support the V extension (1.0).
list(APPEND HWY_FLAGS -march=rv64gcv1p0)
list(APPEND HWY_FLAGS -menable-experimental-extensions)
endif()
endif()
if (HWY_WARNINGS_ARE_ERRORS)
list(APPEND HWY_FLAGS -Werror)
endif()
# Prevent "wasm-ld: error: --shared-memory is disallowed by targets.cc.o
# because it was not compiled with 'atomics' or 'bulk-memory' features."
if (HWY_EMSCRIPTEN)
list(APPEND HWY_FLAGS -matomics)
endif()
endif() # !MSVC
# By default prefer STATIC build (legacy behavior)
option(BUILD_SHARED_LIBS "Build shared libraries" OFF)
option(HWY_FORCE_STATIC_LIBS "Ignore BUILD_SHARED_LIBS" OFF)
# only expose shared/static options to advanced users:
mark_as_advanced(BUILD_SHARED_LIBS)
mark_as_advanced(HWY_FORCE_STATIC_LIBS)
# Define visibility settings globally:
set(CMAKE_CXX_VISIBILITY_PRESET hidden)
set(CMAKE_VISIBILITY_INLINES_HIDDEN 1)
# Mirror add_library()'s SHARED/STATIC selection (driven by BUILD_SHARED_LIBS),
# and let HWY_FORCE_STATIC_LIBS override it.
set(HWY_LIBRARY_TYPE "SHARED")
if (NOT BUILD_SHARED_LIBS OR HWY_FORCE_STATIC_LIBS)
set(HWY_LIBRARY_TYPE "STATIC")
endif()
# This preprocessor define will drive the build, also used in the *.pc files:
if("${HWY_LIBRARY_TYPE}" STREQUAL "SHARED")
set(DLLEXPORT_TO_DEFINE "HWY_SHARED_DEFINE")
else()
set(DLLEXPORT_TO_DEFINE "HWY_STATIC_DEFINE")
endif()
add_library(hwy ${HWY_LIBRARY_TYPE} ${HWY_SOURCES})
target_compile_definitions(hwy PUBLIC "${DLLEXPORT_TO_DEFINE}")
target_compile_options(hwy PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(hwy PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
target_include_directories(hwy PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
target_compile_features(hwy PUBLIC cxx_std_11)
set_target_properties(hwy PROPERTIES
LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
# For GCC __atomic_store_8, see #887
target_link_libraries(hwy PRIVATE ${ATOMICS_LIBRARIES})
if(UNIX AND NOT APPLE)
# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
set_property(TARGET hwy APPEND_STRING PROPERTY
LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
endif()
if (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
# uname -p is broken on this system. Try uname -m
EXECUTE_PROCESS( COMMAND uname -m
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
OUTPUT_VARIABLE HWY_ARCH)
else (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
set(HWY_ARCH ${CMAKE_SYSTEM_PROCESSOR})
endif (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
message(STATUS "Architecture: " ${HWY_ARCH})
if (HWY_ARCH MATCHES "mips")
target_link_options(hwy PUBLIC "LINKER:-z,noexecstack")
endif (HWY_ARCH MATCHES "mips")
if (HWY_ENABLE_CONTRIB)
add_library(hwy_contrib ${HWY_LIBRARY_TYPE} ${HWY_CONTRIB_SOURCES})
target_link_libraries(hwy_contrib hwy)
target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(hwy_contrib PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
target_include_directories(hwy_contrib PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
target_compile_features(hwy_contrib PUBLIC cxx_std_11)
set_target_properties(hwy_contrib PROPERTIES
LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
if(UNIX AND NOT APPLE)
set_property(TARGET hwy_contrib APPEND_STRING PROPERTY
LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
endif()
endif() # HWY_ENABLE_CONTRIB
add_library(hwy_test ${HWY_LIBRARY_TYPE} ${HWY_TEST_SOURCES})
target_link_libraries(hwy_test hwy)
target_compile_options(hwy_test PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy_test PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(hwy_test PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
target_include_directories(hwy_test PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
target_compile_features(hwy_test PUBLIC cxx_std_11)
set_target_properties(hwy_test PROPERTIES
LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
if(UNIX AND NOT APPLE)
set_property(TARGET hwy_test APPEND_STRING PROPERTY
LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
endif()
# -------------------------------------------------------- hwy_list_targets
# Generate a tool to print the compiled-in targets as defined by the current
# flags. This tool will print to stderr at build time, after building hwy.
add_executable(hwy_list_targets hwy/tests/list_targets.cc)
target_compile_options(hwy_list_targets PRIVATE ${HWY_FLAGS})
target_link_libraries(hwy_list_targets hwy)
target_include_directories(hwy_list_targets PRIVATE
$<TARGET_PROPERTY:hwy,INCLUDE_DIRECTORIES>)
# TARGET_FILE always returns the path to the executable.
# A bare target name cannot always be run (it may lack the '.\' prefix),
# so the command to run must contain the full path
# and the emulator prefix (if any).
if (NOT CMAKE_CROSSCOMPILING OR CMAKE_CROSSCOMPILING_EMULATOR)
add_custom_command(TARGET hwy_list_targets POST_BUILD
COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:hwy_list_targets> || (exit 0))
endif()
# --------------------------------------------------------
# Allow skipping the following sections for projects that do not need them:
# tests, examples, benchmarks and installation.
# -------------------------------------------------------- install library
if (HWY_ENABLE_INSTALL)
install(TARGETS hwy
LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
# Install all the headers keeping the relative path to the current directory
# when installing them.
# The pattern is spelled "\\.h$" because CMake consumes one level of backslash
# escapes inside a quoted argument: "\.h$" reaches the regex engine as ".h$",
# in which the dot matches any character instead of a literal '.'.
foreach (source ${HWY_SOURCES})
  if ("${source}" MATCHES "\\.h$")
    get_filename_component(dirname "${source}" DIRECTORY)
    install(FILES "${source}"
      DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
  endif()
endforeach()
if (HWY_ENABLE_CONTRIB)
  install(TARGETS hwy_contrib
    LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
    ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
    RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
  # Install all the headers keeping the relative path to the current directory
  # when installing them.
  foreach (source ${HWY_CONTRIB_SOURCES})
    if ("${source}" MATCHES "\\.h$")
      get_filename_component(dirname "${source}" DIRECTORY)
      install(FILES "${source}"
        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
    endif()
  endforeach()
endif() # HWY_ENABLE_CONTRIB
install(TARGETS hwy_test
  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
  RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
# Install all the headers keeping the relative path to the current directory
# when installing them.
foreach (source ${HWY_TEST_SOURCES})
  if ("${source}" MATCHES "\\.h$")
    get_filename_component(dirname "${source}" DIRECTORY)
    install(FILES "${source}"
      DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
  endif()
endforeach()
# Add a pkg-config file for libhwy and the contrib/test libraries.
# The version written into the .pc files is this project's own version, as set
# by project(hwy VERSION ...). CMAKE_PROJECT_VERSION is deliberately not used:
# it holds the version of the top-level project, so it is wrong when highway is
# included via add_subdirectory(), and it does not exist before CMake 3.12
# although this file only requires 3.10.
set(HWY_LIBRARY_VERSION "${hwy_VERSION}")
set(HWY_PC_FILES libhwy.pc libhwy-test.pc)
if (HWY_ENABLE_CONTRIB)
  list(APPEND HWY_PC_FILES libhwy-contrib.pc)
endif() # HWY_ENABLE_CONTRIB
# Expand each <name>.pc.in template (only @VAR@ references) into the build
# directory and install the result next to the libraries.
foreach (pc ${HWY_PC_FILES})
  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${pc}.in" "${pc}" @ONLY)
  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${pc}"
    DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
endforeach()
endif() # HWY_ENABLE_INSTALL
# -------------------------------------------------------- Examples
if (HWY_ENABLE_EXAMPLES)
# Avoids mismatch between GTest's static CRT and our dynamic.
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
# Programming exercise with integrated benchmark
add_executable(hwy_benchmark hwy/examples/benchmark.cc)
target_sources(hwy_benchmark PRIVATE
hwy/nanobenchmark.h)
# Try adding one of -DHWY_COMPILE_ONLY_SCALAR, -DHWY_COMPILE_ONLY_EMU128 or
# -DHWY_COMPILE_ONLY_STATIC to observe the difference in targets printed.
target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS})
target_link_libraries(hwy_benchmark hwy)
set_target_properties(hwy_benchmark
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/")
endif() # HWY_ENABLE_EXAMPLES
# -------------------------------------------------------- Tests
include(CTest)
if(BUILD_TESTING AND HWY_ENABLE_TESTS)
enable_testing()
include(GoogleTest)
set(HWY_SYSTEM_GTEST OFF CACHE BOOL "Use pre-installed googletest?")
if(HWY_SYSTEM_GTEST)
find_package(GTest REQUIRED)
else()
# Download and unpack googletest at configure time
configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
RESULT_VARIABLE result
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
if(result)
message(FATAL_ERROR "CMake step for googletest failed: ${result}")
endif()
execute_process(COMMAND ${CMAKE_COMMAND} --build .
RESULT_VARIABLE result
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
if(result)
message(FATAL_ERROR "Build step for googletest failed: ${result}")
endif()
# Prevent overriding the parent project's compiler/linker
# settings on Windows
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
# Add googletest directly to our build. This defines
# the gtest and gtest_main targets.
add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
${CMAKE_CURRENT_BINARY_DIR}/googletest-build
EXCLUDE_FROM_ALL)
endif() # HWY_SYSTEM_GTEST
set(HWY_TEST_FILES
hwy/contrib/algo/copy_test.cc
hwy/contrib/algo/find_test.cc
hwy/contrib/algo/transform_test.cc
hwy/aligned_allocator_test.cc
hwy/base_test.cc
hwy/highway_test.cc
hwy/nanobenchmark_test.cc
hwy/targets_test.cc
hwy/examples/skeleton_test.cc
hwy/tests/arithmetic_test.cc
hwy/tests/blockwise_test.cc
hwy/tests/blockwise_shift_test.cc
hwy/tests/combine_test.cc
hwy/tests/compare_test.cc
hwy/tests/compress_test.cc
hwy/tests/convert_test.cc
hwy/tests/crypto_test.cc
hwy/tests/demote_test.cc
hwy/tests/float_test.cc
hwy/tests/if_test.cc
hwy/tests/interleaved_test.cc
hwy/tests/logical_test.cc
hwy/tests/mask_test.cc
hwy/tests/mask_mem_test.cc
hwy/tests/memory_test.cc
hwy/tests/mul_test.cc
hwy/tests/reduction_test.cc
hwy/tests/reverse_test.cc
hwy/tests/shift_test.cc
hwy/tests/swizzle_test.cc
hwy/tests/test_util_test.cc
)
set(HWY_TEST_LIBS hwy hwy_test)
if (HWY_ENABLE_CONTRIB)
list(APPEND HWY_TEST_LIBS hwy_contrib)
list(APPEND HWY_TEST_FILES
hwy/contrib/dot/dot_test.cc
hwy/contrib/image/image_test.cc
# Disabled due to SIGILL in clang7 debug build during gtest discovery phase,
# not reproducible locally. Still tested via bazel build.
# hwy/contrib/math/math_test.cc
hwy/contrib/sort/sort_test.cc
)
endif() # HWY_ENABLE_CONTRIB
if(HWY_SYSTEM_GTEST)
if (CMAKE_VERSION VERSION_LESS 3.20)
set(HWY_GTEST_LIBS GTest::GTest GTest::Main)
else()
set(HWY_GTEST_LIBS GTest::gtest GTest::gtest_main)
endif()
else()
set(HWY_GTEST_LIBS gtest gtest_main)
endif()
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests)
foreach (TESTFILE IN LISTS HWY_TEST_FILES)
# The TESTNAME is the name without the extension or directory.
get_filename_component(TESTNAME ${TESTFILE} NAME_WE)
add_executable(${TESTNAME} ${TESTFILE})
target_compile_options(${TESTNAME} PRIVATE ${HWY_FLAGS})
# Test all targets, not just the best/baseline. This changes the default
# policy to all-attainable; note that setting -DHWY_COMPILE_* directly can
# cause compile errors because only one may be set, and other CMakeLists.txt
# that include us may set them.
target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1)
target_link_libraries(${TESTNAME} ${HWY_TEST_LIBS} ${HWY_GTEST_LIBS})
# Output test targets in the test directory.
set_target_properties(${TESTNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "tests")
if (HWY_EMSCRIPTEN)
set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "-s SINGLE_FILE=1")
endif()
if(${CMAKE_VERSION} VERSION_LESS "3.10.3")
gtest_discover_tests(${TESTNAME} TIMEOUT 60)
else ()
gtest_discover_tests(${TESTNAME} DISCOVERY_TIMEOUT 60)
endif ()
endforeach ()
# The skeleton test uses the skeleton library code.
target_sources(skeleton_test PRIVATE hwy/examples/skeleton.cc)
endif() # BUILD_TESTING
+15
View File
@@ -0,0 +1,15 @@
cmake_minimum_required(VERSION 2.8.12)
project(googletest-download NONE)
include(ExternalProject)
ExternalProject_Add(googletest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG 43efa0a4efd40c78b9210d15373112081899a97c
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build"
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
+33
View File
@@ -0,0 +1,33 @@
# How to Contribute
We'd love to accept your patches and contributions to this project. There are
just a few small guidelines you need to follow.
## Contributor License Agreement
Contributions to this project must be accompanied by a Contributor License
Agreement. You (or your employer) retain the copyright to your contribution;
this simply gives us permission to use and redistribute your contributions as
part of the project. Head over to <https://cla.developers.google.com/> to see
your current agreements on file or to sign a new one.
You generally only need to submit a CLA once, so if you've already submitted one
(even if it was for a different project), you probably don't need to do it
again.
## Code reviews
All submissions, including submissions by project members, require review. We
use GitHub pull requests for this purpose. Consult
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
information on using pull requests.
## Testing
This repository is used by JPEG XL, so major API changes will require
coordination. Please get in touch with us beforehand, e.g. by raising an issue.
## Community Guidelines
This project follows
[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
+201
View File
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
+322
View File
@@ -0,0 +1,322 @@
# Efficient and performance-portable vector software
[//]: # (placeholder, do not remove)
Highway is a C++ library that provides portable SIMD/vector intrinsics.
## Why
We are passionate about high-performance software. We see major untapped
potential in CPUs (servers, mobile, desktops). Highway is for engineers who want
to reliably and economically push the boundaries of what is possible in
software.
## How
CPUs provide SIMD/vector instructions that apply the same operation to multiple
data items. This can reduce energy usage e.g. *fivefold* because fewer
instructions are executed. We also often see *5-10x* speedups.
Highway makes SIMD/vector programming practical and workable according to these
guiding principles:
**Does what you expect**: Highway is a C++ library with carefully-chosen
functions that map well to CPU instructions without extensive compiler
transformations. The resulting code is more predictable and robust to code
changes/compiler updates than autovectorization.
**Works on widely-used platforms**: Highway supports four architectures; the
same application code can target eight instruction sets, including those with
'scalable' vectors (size unknown at compile time). Highway only requires C++11
and supports four families of compilers. If you would like to use Highway on
other platforms, please raise an issue.
**Flexible to deploy**: Applications using Highway can run on heterogeneous
clouds or client devices, choosing the best available instruction set at
runtime. Alternatively, developers may choose to target a single instruction set
without any runtime overhead. In both cases, the application code is the same
except for swapping `HWY_STATIC_DISPATCH` with `HWY_DYNAMIC_DISPATCH` plus one
line of code.
**Suitable for a variety of domains**: Highway provides an extensive set of
operations, used for image processing (floating-point), compression, video
analysis, linear algebra, cryptography, sorting and random generation. We
recognise that new use-cases may require additional ops and are happy to add
them where it makes sense (e.g. no performance cliffs on some architectures). If
you would like to discuss, please file an issue.
**Rewards data-parallel design**: Highway provides tools such as Gather,
MaskedLoad, and FixedTag to enable speedups for legacy data structures. However,
the biggest gains are unlocked by designing algorithms and data structures for
scalable vectors. Helpful techniques include batching, structure-of-array
layouts, and aligned/padded allocations.
## Examples
Online demos using Compiler Explorer:
- [multiple targets with dynamic dispatch](https://gcc.godbolt.org/z/zP7MYe9Yf)
(recommended)
- [single target using -m flags](https://gcc.godbolt.org/z/rGnjMevKG)
Projects using Highway: (to add yours, feel free to raise an issue or contact us
via the below email)
* [iresearch database index](https://github.com/iresearch-toolkit/iresearch/blob/e7638e7a4b99136ca41f82be6edccf01351a7223/core/utils/simd_utils.hpp)
* [JPEG XL image codec](https://github.com/libjxl/libjxl)
* [Grok JPEG 2000 image codec](https://github.com/GrokImageCompression/grok)
* [vectorized Quicksort](https://github.com/google/highway/tree/master/hwy/contrib/sort) ([paper](https://arxiv.org/abs/2205.05982))
## Current status
### Targets
Supported targets: scalar, S-SSE3, SSE4, AVX2, AVX-512, AVX3_DL (~Icelake,
requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE, SVE2,
WASM SIMD, RISC-V V.
SVE was initially tested using farm-sve (see acknowledgments).
### Versioning
Highway releases aim to follow the semver.org system (MAJOR.MINOR.PATCH),
incrementing MINOR after backward-compatible additions and PATCH after
backward-compatible fixes. We recommend using releases (rather than the Git tip)
because they are tested more extensively, see below.
The current version 1.0 signals an increased focus on backwards compatibility.
Applications using documented functionality will remain compatible with future
updates that have the same major version number.
### Testing
Continuous integration tests build with a recent version of Clang (running on
native x86, or QEMU for RVV and ARM) and MSVC 2019 (v19.28, running on native
x86).
Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via GCC
cross-compile. See the [testing process](g3doc/release_testing_process.md) for
details.
### Related modules
The `contrib` directory contains SIMD-related utilities: an image class with
aligned rows, a math library (16 functions already implemented, mostly
trigonometry), and functions for computing dot products and sorting.
## Installation
This project uses CMake to generate and build. In a Debian-based system you can
install it via:
```bash
sudo apt install cmake
```
Highway's unit tests use [googletest](https://github.com/google/googletest).
By default, Highway's CMake downloads this dependency at configuration time.
You can disable this by setting the `HWY_SYSTEM_GTEST` CMake variable to ON and
installing gtest separately:
```bash
sudo apt install libgtest-dev
```
To build Highway as a shared or static library (depending on BUILD_SHARED_LIBS),
the standard CMake workflow can be used:
```bash
mkdir -p build && cd build
cmake ..
make -j && make test
```
Or you can run `run_tests.sh` (`run_tests.bat` on Windows).
Bazel is also supported for building, but it is not as widely used/tested.
## Quick start
You can use the `benchmark` inside examples/ as a starting point.
A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations
and their parameters, and the [instruction_matrix](g3doc/instruction_matrix.pdf)
indicates the number of instructions per operation.
The [FAQ](g3doc/faq.md) answers questions about portability, API design and
where to find more information.
We recommend using full SIMD vectors whenever possible for maximum performance
portability. To obtain them, pass a `ScalableTag<float>` (or equivalently
`HWY_FULL(float)`) tag to functions such as `Zero/Set/Load`. There are two
alternatives for use-cases requiring an upper bound on the lanes:
- For up to `N` lanes, specify `CappedTag<T, N>` or the equivalent
`HWY_CAPPED(T, N)`. The actual number of lanes will be `N` rounded down to
the nearest power of two, such as 4 if `N` is 5, or 8 if `N` is 8. This is
useful for data structures such as a narrow matrix. A loop is still required
because vectors may actually have fewer than `N` lanes.
- For exactly a power of two `N` lanes, specify `FixedTag<T, N>`. The largest
supported `N` depends on the target, but is guaranteed to be at least
`16/sizeof(T)`.
Due to ADL restrictions, user code calling Highway ops must either:
* Reside inside `namespace hwy { namespace HWY_NAMESPACE {`; or
* prefix each op with an alias such as `namespace hn = hwy::HWY_NAMESPACE;
hn::Add()`; or
* add using-declarations for each op used: `using hwy::HWY_NAMESPACE::Add;`.
Additionally, each function that calls Highway ops (such as `Load`) must either
be prefixed with `HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
`HWY_AFTER_NAMESPACE()`. Lambda functions currently require `HWY_ATTR` before
their opening brace.
The entry points into code using Highway differ slightly depending on whether
they use static or dynamic dispatch.
* For static dispatch, `HWY_TARGET` will be the best available target among
`HWY_BASELINE_TARGETS`, i.e. those allowed for use by the compiler (see
[quick-reference](g3doc/quick_reference.md)). Functions inside
`HWY_NAMESPACE` can be called using `HWY_STATIC_DISPATCH(func)(args)` within
the same module they are defined in. You can call the function from other
modules by wrapping it in a regular function and declaring the regular
function in a header.
* For dynamic dispatch, a table of function pointers is generated via the
`HWY_EXPORT` macro that is used by `HWY_DYNAMIC_DISPATCH(func)(args)` to
call the best function pointer for the current CPU's supported targets. A
module is automatically compiled for each target in `HWY_TARGETS` (see
[quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is
defined and `foreach_target.h` is included.
When using dynamic dispatch, `foreach_target.h` is included from translation
units (.cc files), not headers. Headers containing vector code shared between
several translation units require a special include guard, for example the
following taken from `examples/skeleton-inl.h`:
```
#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#else
#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#endif
#include "hwy/highway.h"
// Your vector code
#endif
```
By convention, we name such headers `-inl.h` because their contents (often
function templates) are usually inlined.
## Compiler flags
Applications should be compiled with optimizations enabled - without inlining,
SIMD code may slow down by factors of 10 to 100. For clang and GCC, `-O2` is
generally sufficient.
For MSVC, we recommend compiling with `/Gv` to allow non-inlined functions to
pass vector arguments in registers. If intending to use the AVX2 target together
with half-width vectors (e.g. for `PromoteTo`), it is also important to compile
with `/arch:AVX2`. This seems to be the only way to generate VEX-encoded SSE4
instructions on MSVC. Otherwise, mixing VEX-encoded AVX2 instructions and
non-VEX SSE4 may cause severe performance degradation. Unfortunately, the
resulting binary will then require AVX2. Note that no such flag is needed for
clang and GCC because they support target-specific attributes, which we use to
ensure proper VEX code generation for AVX2 targets.
## Strip-mining loops
To vectorize a loop, "strip-mining" transforms it into an outer loop and inner
loop with number of iterations matching the preferred vector width.
In this section, let `T` denote the element type, `d = ScalableTag<T>`, `count`
the number of elements to process, and `N = Lanes(d)` the number of lanes in a
full vector. Assume the loop body is given as a function `template<bool partial,
class D> void LoopBody(D d, size_t index, size_t max_n)`.
Highway offers several ways to express loops where `N` need not divide `count`:
* Ensure all inputs/outputs are padded. Then the loop is simply
```
for (size_t i = 0; i < count; i += N) LoopBody<false>(d, i, 0);
```
Here, the template parameter and third function argument (`max_n`) are not needed.
This is the preferred option, unless `N` is in the thousands and vector
operations are pipelined with long latencies. This was the case for
supercomputers in the 90s, but nowadays ALUs are cheap and we see most
implementations split vectors into 1, 2 or 4 parts, so there is little cost
to processing entire vectors even if we do not need all their lanes. Indeed
this avoids the (potentially large) cost of predication or partial
loads/stores on older targets, and does not duplicate code.
* Use the `Transform*` functions in hwy/contrib/algo/transform-inl.h. This
takes care of the loop and remainder handling and you simply define a
generic lambda function (C++14) or functor which receives the current vector
from the input/output array, plus optionally vectors from up to two extra
input arrays, and returns the value to write to the input/output array.
Here is an example implementing the BLAS function SAXPY (`alpha * x + y`):
```
Transform1(d, x, n, y, [](auto d, const auto v, const auto v1) HWY_ATTR {
return MulAdd(Set(d, alpha), v, v1);
});
```
* Process whole vectors as above, followed by a scalar loop:
```
size_t i = 0;
for (; i + N <= count; i += N) LoopBody<false>(d, i, 0);
for (; i < count; ++i) LoopBody<false>(CappedTag<T, 1>(), i, 0);
```
The template parameter and third function argument are again not needed.
This avoids duplicating code, and is reasonable if `count` is large.
If `count` is small, the second loop may be slower than the next option.
* Process whole vectors as above, followed by a single call to a modified
`LoopBody` with masking:
```
size_t i = 0;
for (; i + N <= count; i += N) {
LoopBody<false>(d, i, 0);
}
if (i < count) {
LoopBody<true>(d, i, count - i);
}
```
Now the template parameter and third function argument (`max_n`, called
`num_remaining` below) can be used inside `LoopBody` to non-atomically 'blend'
the first `num_remaining` lanes of `v` with the previous contents of memory at
subsequent locations:
`BlendedStore(v, FirstN(d, num_remaining), d, pointer);`. Similarly,
`MaskedLoad(FirstN(d, num_remaining), d, pointer)` loads the first
`num_remaining` elements and returns zero in other lanes.
This is a good default when it is infeasible to ensure vectors are padded,
but is only safe `#if !HWY_MEM_OPS_MIGHT_FAULT`!
In contrast to the scalar loop, only a single final iteration is needed.
The increased code size from two loop bodies is expected to be worthwhile
because it avoids the cost of masking in all but the final iteration.
## Additional resources
* [Highway introduction (slides)](g3doc/highway_intro.pdf)
* [Overview of instructions per operation on different architectures](g3doc/instruction_matrix.pdf)
* [Design philosophy and comparison](g3doc/design_philosophy.md)
* [Implementation details](g3doc/impl_details.md)
## Acknowledgments
We have used [farm-sve](https://gitlab.inria.fr/bramas/farm-sve) by Berenger
Bramas; it has proved useful for checking the SVE port on an x86 development
machine.
This is not an officially supported Google product.
Contact: janwas@google.com
+24
View File
@@ -0,0 +1,24 @@
workspace(name = "highway")

load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

# googletest, fetched as an archive of a fixed commit and verified by checksum.
http_archive(
    name = "com_google_googletest",
    sha256 = "5cf189eb6847b4f8fc603a3ffff3b0771c08eec7dd4bd961bfd45477dd13eb73",
    strip_prefix = "googletest-609281088cfefc76f9d0ce82e1ff6c30cc3591e5",
    urls = ["https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip"],
)

# See https://google.github.io/googletest/quickstart-bazel.html
http_archive(
    name = "rules_cc",
    sha256 = "56ac9633c13d74cb71e0546f103ce1c58810e4a76aa8325da593ca4277908d72",
    strip_prefix = "rules_cc-40548a2974f1aea06215272d9c2b47a14a24e556",
    urls = ["https://github.com/bazelbuild/rules_cc/archive/40548a2974f1aea06215272d9c2b47a14a24e556.zip"],
)

# Need recent version for config_setting_group
# NOTE(review): unlike the two archives above, this one has no sha256, so the
# download is not integrity-checked - consider pinning the release checksum.
http_archive(
    name = "bazel_skylib",
    urls = ["https://github.com/bazelbuild/bazel-skylib/releases/download/0.9.0/bazel_skylib-0.9.0.tar.gz"],
)
+157
View File
@@ -0,0 +1,157 @@
highway (1.0.2-1) UNRELEASED; urgency=medium
* Add ExclusiveNeither, FindKnownFirstTrue, Ne128
* Add 16-bit SumOfLanes/ReorderWidenMulAccumulate/ReorderDemote2To
* Faster sort for low-entropy input, improved pivot selection
* Add GN build system, Highway FAQ, k32v32 type to vqsort
* CMake: Support find_package(GTest), add rvv-inl.h, add HWY_ENABLE_TESTS
* Fix MIPS and C++20 build, Apple LLVM 10.3 detection, EMU128 AllTrue on RVV
* Fix missing exec_prefix, RVV build, warnings, libatomic linking
* Work around GCC 10.4 issue, disabled RDCYCLE, arm7 with vfpv3
* Documentation/example improvements
* Support static dispatch to SVE2_128 and SVE_256
-- Jan Wassenberg <janwas@google.com> Thu, 27 Oct 2022 17:00:00 +0200
highway (1.0.1-1) UNRELEASED; urgency=medium
* Add Eq128, i64 Mul, unsigned->float ConvertTo
* Faster sort for few unique keys, more robust pivot selection
* Fix: floating-point generator for sort tests, Min/MaxOfLanes for i16
* Fix: avoid always_inline in debug, link atomic
* GCC warnings: string.h, maybe-uninitialized, ignored-attributes
* GCC warnings: preprocessor int overflow, spurious use-after-free/overflow
* Doc: <=HWY_AVX3, Full32/64/128, how to use generic-inl
-- Jan Wassenberg <janwas@google.com> Tue, 23 Aug 2022 10:00:00 +0200
highway (1.0.0-1) UNRELEASED; urgency=medium
* ABI change: 64-bit target values, more room for expansion
* Add CompressBlocksNot, CompressNot, Lt128Upper, Min/Max128Upper, TruncateTo
* Add HWY_SVE2_128 target
* Sort speedups especially for 128-bit
* Documentation clarifications
* Faster NEON CountTrue/FindFirstTrue/AllFalse/AllTrue
* Improved SVE codegen
* Fix u16x8 ConcatEven/Odd, SSSE3 i64 Lt
* MSVC 2017 workarounds
* Support for runtime dispatch on Arm/GCC/Linux
-- Jan Wassenberg <janwas@google.com> Wed, 27 Jul 2022 10:00:00 +0200
highway (0.17.0-1) UNRELEASED; urgency=medium
* Add ExtractLane, InsertLane, IsInf, IsFinite, IsNaN
* Add StoreInterleaved2, LoadInterleaved2/3/4, BlendedStore, SafeFillN
* Add MulFixedPoint15, Or3
* Add Copy[If], Find[If], Generate, Replace[If] algos
* Add HWY_EMU128 target (replaces HWY_SCALAR)
* HWY_RVV is feature-complete
* Add HWY_ENABLE_CONTRIB build flag, HWY_NATIVE_FMA, HWY_WANT_SSSE3/SSE4 macros
* Extend ConcatOdd/Even and StoreInterleaved* to all types
* Allow CappedTag<T, nonPowerOfTwo>
* Sort speedups: 2x for AVX2, 1.09x for AVX3; avoid x86 malloc
* Expand documentation
* Fix RDTSCP crash in nanobenchmark
* Fix XCR0 check (was ignoring AVX3 on ICL)
* Support Arm/RISC-V timers
-- Jan Wassenberg <janwas@google.com> Fri, 20 May 2022 10:00:00 +0200
highway (0.16.0-1) UNRELEASED; urgency=medium
* Add contrib/sort (vectorized quicksort)
* Add IfNegativeThenElse, IfVecThenElse
* Add Reverse2,4,8, ReverseBlocks, DupEven/Odd, AESLastRound
* Add OrAnd, Min128, Max128, Lt128, SumsOf8
* Support capped/partial vectors on RVV/SVE, int64 in WASM
* Support SVE2, shared library build
* Remove deprecated overloads without the required d arg (UpperHalf etc.)
-- Jan Wassenberg <janwas@google.com> Thu, 03 Feb 2022 11:00:00 +0100
highway (0.15.0-1) UNRELEASED; urgency=medium
* New ops: CompressBlendedStore, ConcatOdd/Even, IndicesFromVec
* New ops: OddEvenBlocks, SwapAdjacentBlocks, Reverse, RotateRight
* Add bf16, unsigned comparisons, more lane types for Reverse/TableLookupLanes
* Contrib: add sort(ing network) and dot(product)
* Targets: update RVV for LLVM, add experimental WASM2
* Separate library hwy_test for test utils
* Add non-macro Simd<> aliases
* Fixes: const V& for GCC, AVX3 BZHI, POPCNT with AVX on MSVC, avoid %zu
-- Jan Wassenberg <janwas@google.com> Wed, 10 Nov 2021 10:00:00 +0100
highway (0.14.2-1) UNRELEASED; urgency=medium
* Add MaskedLoad
* Fix non-glibc PPC, Windows GCC, MSVC 19.14
* Opt-in for -Werror; separate design_philosophy.md
-- Jan Wassenberg <janwas@google.com> Tue, 24 Aug 2021 15:00:00 +0200
highway (0.14.1-1) UNRELEASED; urgency=medium
* Add LoadMaskBits, CompressBits[Store]
* Fix CPU feature check (AES/F16C) and warnings
* Improved DASSERT - disabled in optimized builds
-- Jan Wassenberg <janwas@google.com> Tue, 17 Aug 2021 14:00:00 +0200
highway (0.14.0-1) UNRELEASED; urgency=medium
* Add SVE, S-SSE3, AVX3_DL targets
* Support partial vectors in all ops
* Add PopulationCount, FindFirstTrue, Ne, TableLookupBytesOr0
* Add AESRound, CLMul, MulOdd, HWY_CAP_FLOAT16
-- Jan Wassenberg <janwas@google.com> Thu, 29 Jul 2021 15:00:00 +0200
highway (0.12.2-1) UNRELEASED; urgency=medium
* fix scalar-only test and Windows macro conflict with Load/StoreFence
* replace deprecated wasm intrinsics
-- Jan Wassenberg <janwas@google.com> Mon, 31 May 2021 16:00:00 +0200
highway (0.12.1-1) UNRELEASED; urgency=medium
* doc updates, ARM GCC support, fix s390/ppc, complete partial vectors
* fix warnings, faster ARM div/sqrt, separate hwy_contrib library
* add Abs(i64)/FirstN/Pause, enable AVX2 on MSVC
-- Jan Wassenberg <janwas@google.com> Wed, 19 May 2021 15:00:00 +0200
highway (0.12.0-1) UNRELEASED; urgency=medium
* Add Shift*8, Compress16, emulated Scatter/Gather, StoreInterleaved3/4
* Remove deprecated HWY_*_LANES, deprecate HWY_GATHER_LANES
* Proper IEEE rounding, reduce libstdc++ usage, inlined math
-- Jan Wassenberg <janwas@google.com> Thu, 15 Apr 2021 20:00:00 +0200
highway (0.11.1-1) UNRELEASED; urgency=medium
* Fix clang7 asan error, finish f16 conversions and add test
-- Jan Wassenberg <janwas@google.com> Thu, 25 Feb 2021 16:00:00 +0200
highway (0.11.0-1) UNRELEASED; urgency=medium
* Add RVV+mask logical ops, allow Shl/ShiftLeftSame on all targets, more math
-- Jan Wassenberg <janwas@google.com> Thu, 18 Feb 2021 20:00:00 +0200
highway (0.7.0-1) UNRELEASED; urgency=medium
* Added API stability notice, Compress[Store], contrib/, SignBit, CopySign
-- Jan Wassenberg <janwas@google.com> Tue, 5 Jan 2021 17:00:00 +0200
highway (0.1-1) UNRELEASED; urgency=medium
* Initial debian package.
-- Alex Deymo <deymo@google.com> Mon, 19 Oct 2020 16:48:07 +0200
+1
View File
@@ -0,0 +1 @@
10
+23
View File
@@ -0,0 +1,23 @@
Source: highway
Maintainer: JPEG XL Maintainers <jpegxl@google.com>
Section: misc
Priority: optional
Standards-Version: 3.9.8
Build-Depends: cmake,
debhelper (>= 9),
libgtest-dev
Homepage: https://github.com/google/highway
Package: libhwy-dev
Architecture: any
Section: libdevel
Depends: ${misc:Depends}
Description: Efficient and performance-portable SIMD wrapper (developer files)
This library provides type-safe and source-code portable wrappers over
existing platform-specific intrinsics. Its design aims for simplicity,
reliable efficiency across platforms, and immediate usability with current
compilers.
.
This package installs the development files. There's no runtime library
since most of Highway is implemented in headers and only a very small
static library is needed.
+20
View File
@@ -0,0 +1,20 @@
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: highway
Files: *
Copyright: 2020 Google LLC
License: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
.
http://www.apache.org/licenses/LICENSE-2.0
.
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
.
On Debian systems, the complete text of the Apache License, Version 2
can be found in "/usr/share/common-licenses/Apache-2.0".
+6
View File
@@ -0,0 +1,6 @@
#!/usr/bin/make -f
# Debian packaging rules for Highway.
# Catch-all target: every requested target is handed to the debhelper `dh`
# sequencer, which is told to use the CMake build system.
%:
dh $@ --buildsystem=cmake
# Overrides the configure step so that -DHWY_SYSTEM_GTEST=ON is passed through
# to CMake. (Presumably this selects the system GoogleTest provided by the
# libgtest-dev build dependency; confirm against Highway's CMakeLists.txt.)
override_dh_auto_configure:
dh_auto_configure -- -DHWY_SYSTEM_GTEST=ON
+1
View File
@@ -0,0 +1 @@
3.0 (quilt)
+152
View File
@@ -0,0 +1,152 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/aligned_allocator.h"
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h> // malloc
#include <atomic>
#include <limits>
#include "hwy/base.h"
namespace hwy {
namespace {
// kAlignment: every payload address returned by AllocateAlignedBytes is a
// multiple of this value.
#if HWY_ARCH_RVV && defined(__riscv_vector)
// Not actually an upper bound on the size, but this value prevents crossing a
// 4K boundary (relevant on Andes).
constexpr size_t kAlignment = HWY_MAX(HWY_ALIGNMENT, 4096);
#else
constexpr size_t kAlignment = HWY_ALIGNMENT;
#endif
// kAlias: AllocateAlignedBytes first rounds the raw allocation up to a
// multiple of kAlias and then adds an offset that is a multiple of kAlignment
// (see NextAlignedOffset). Where kAlias > kAlignment (x86), that offset cycles
// so successive payloads land at different addresses modulo kAlias.
#if HWY_ARCH_X86
// On x86, aliasing can only occur at multiples of 2K, but that's too wasteful
// if this is used for single-vector allocations. 256 is more reasonable.
constexpr size_t kAlias = kAlignment * 4;
#else
constexpr size_t kAlias = kAlignment;
#endif
#pragma pack(push, 1)
// Bookkeeping record that AllocateAlignedBytes writes immediately before each
// payload it returns; the free/delete functions read it back from there.
struct AllocationHeader {
// Pointer obtained from malloc() or the user-supplied allocation function.
// This (not the payload pointer) is what is later passed to free() or to the
// user-supplied free function.
void* allocated;
// Number of payload bytes requested by the caller. DeleteAlignedArray passes
// it to the array deleter.
size_t payload_size;
};
#pragma pack(pop)
// Hands out the next offset for AllocateAlignedBytes from a repeating sequence
// of multiples of kAlignment that are smaller than kAlias. A process-wide
// atomic counter selects the position in the sequence.
size_t NextAlignedOffset() {
  constexpr uint32_t kNumOffsets = kAlias / kAlignment;
  static std::atomic<uint32_t> counter{0};
  const uint32_t ticket = counter.fetch_add(1, std::memory_order_relaxed);
  const size_t offset = kAlignment * (ticket % kNumOffsets);
  HWY_DASSERT((offset % kAlignment == 0) && offset <= kAlias);
  return offset;
}
} // namespace
// Allocates `payload_size` bytes whose address is a multiple of kAlignment.
// Memory comes from `alloc_ptr(opaque_ptr, size)` or, if `alloc_ptr` is null,
// from malloc(). Returns nullptr if `payload_size` is at least half of the
// size_t range or if the underlying allocation fails. A zero `payload_size`
// triggers HWY_ASSERT.
HWY_DLLEXPORT void* AllocateAlignedBytes(const size_t payload_size,
                                         AllocPtr alloc_ptr, void* opaque_ptr) {
  HWY_ASSERT(payload_size != 0);  // likely a bug in caller
  const size_t size_limit = std::numeric_limits<size_t>::max() / 2;
  if (payload_size >= size_limit) {
    HWY_DASSERT(false && "payload_size too large");
    return nullptr;
  }

  // Layout of the raw allocation:
  //
  //   | misalign  | unused ... AllocationHeader | payload      |
  //   | <= kAlias | offset                      | payload_size |
  //   ^allocated  ^aligned                      ^payload
  //
  // The header occupies the tail of `unused`, directly in front of the
  // payload, so `unused` may never be empty: an offset of zero is replaced by
  // kAlignment ( = RoundUpTo(sizeof(AllocationHeader), kAlignment) ).
  static_assert(sizeof(AllocationHeader) <= kAlignment, "Else: round up");
  const size_t cyclic_offset = NextAlignedOffset();
  const size_t offset = (cyclic_offset != 0) ? cyclic_offset : kAlignment;

  const size_t allocated_size = kAlias + offset + payload_size;
  void* const allocated = (alloc_ptr != nullptr)
                              ? (*alloc_ptr)(opaque_ptr, allocated_size)
                              : malloc(allocated_size);
  if (allocated == nullptr) return nullptr;

  // Unconditionally advance to the next kAlias boundary, even when the raw
  // pointer is already aligned: the extra kAlias bytes were requested anyway
  // and cannot be returned.
  static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2");
  static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias");
  const uintptr_t past_alias = reinterpret_cast<uintptr_t>(allocated) + kAlias;
  const uintptr_t aligned = past_alias & ~(kAlias - 1);
  const uintptr_t payload = aligned + offset;  // still aligned

  // Record what the free/delete functions need: the raw pointer to release and
  // the payload size (from which the allocated size could be reconstructed).
  AllocationHeader* const header =
      reinterpret_cast<AllocationHeader*>(payload) - 1;
  header->payload_size = payload_size;
  header->allocated = allocated;

  return HWY_ASSUME_ALIGNED(reinterpret_cast<void*>(payload), kAlignment);
}
// Releases memory obtained from AllocateAlignedBytes. A null `aligned_pointer`
// is ignored. The raw pointer stored in the header preceding the payload is
// handed to `free_ptr(opaque_ptr, raw)` or, if `free_ptr` is null, to free().
HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer,
                                    FreePtr free_ptr, void* opaque_ptr) {
  if (!aligned_pointer) return;

  const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
  HWY_DASSERT(payload % kAlignment == 0);

  // The header sits directly in front of the payload.
  const AllocationHeader& header =
      reinterpret_cast<const AllocationHeader*>(payload)[-1];
  void* const raw_block = header.allocated;

  if (free_ptr != nullptr) {
    (*free_ptr)(opaque_ptr, raw_block);
    return;
  }
  free(raw_block);
}
// static
// Runs `deleter` (if non-null) on the payload, passing the payload size stored
// in the allocation header, and then releases the raw allocation through
// `free_ptr(opaque_ptr, raw)` or free() when `free_ptr` is null. A null
// `aligned_pointer` is ignored.
HWY_DLLEXPORT void AlignedDeleter::DeleteAlignedArray(void* aligned_pointer,
                                                      FreePtr free_ptr,
                                                      void* opaque_ptr,
                                                      ArrayDeleter deleter) {
  if (!aligned_pointer) return;

  const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
  HWY_DASSERT(payload % kAlignment == 0);

  // The header sits directly in front of the payload.
  const AllocationHeader& header =
      reinterpret_cast<const AllocationHeader*>(payload)[-1];

  // Destroy the elements first; the storage is released afterwards.
  if (deleter != nullptr) {
    (*deleter)(aligned_pointer, header.payload_size);
  }

  void* const raw_block = header.allocated;
  if (free_ptr != nullptr) {
    (*free_ptr)(opaque_ptr, raw_block);
    return;
  }
  free(raw_block);
}
} // namespace hwy
+212
View File
@@ -0,0 +1,212 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
#define HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
// Memory allocator with support for alignment and offsets.
#include <stddef.h>
#include <memory>
#include "hwy/highway_export.h"
namespace hwy {
// Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which
// requires a literal. This matches typical L1 cache line sizes, which prevents
// false sharing.
#define HWY_ALIGNMENT 64
// Pointers to functions equivalent to malloc/free with an opaque void* passed
// to them.
// AllocPtr: a null return value is treated by AllocateAlignedBytes as an
// allocation failure.
using AllocPtr = void* (*)(void* opaque, size_t bytes);
// FreePtr: `memory` is the pointer that the matching allocation function
// originally returned, not the aligned pointer handed to the user.
using FreePtr = void (*)(void* opaque, void* memory);
// Returns null or a pointer to at least `payload_size` bytes of newly
// allocated memory, aligned to the larger of HWY_ALIGNMENT and the vector
// size. `payload_size` must be nonzero: the implementation checks this with
// HWY_ASSERT. Null is returned if `payload_size` is at least half of the
// size_t range or if the underlying allocation fails. Calls `alloc_ptr` with
// the passed `opaque_ptr` pointer to obtain memory, or malloc() if `alloc_ptr`
// is null.
HWY_DLLEXPORT void* AllocateAlignedBytes(size_t payload_size,
AllocPtr alloc_ptr, void* opaque_ptr);
// Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it
// must have been returned from a previous call to `AllocateAlignedBytes`.
// Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if
// `free_ptr` function is null, uses the default free().
// Note that the pointer passed to `free_ptr`/free() is the one originally
// returned by the allocation function, not `aligned_pointer` itself.
HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer,
FreePtr free_ptr, void* opaque_ptr);
// Class that deletes the aligned pointer passed to operator() calling the
// destructor before freeing the pointer. This is equivalent to the
// std::default_delete but for aligned objects. For a similar deleter equivalent
// to free() for aligned memory see AlignedFreer().
class AlignedDeleter {
public:
AlignedDeleter() : free_(nullptr), opaque_ptr_(nullptr) {}
AlignedDeleter(FreePtr free_ptr, void* opaque_ptr)
: free_(free_ptr), opaque_ptr_(opaque_ptr) {}
template <typename T>
void operator()(T* aligned_pointer) const {
return DeleteAlignedArray(aligned_pointer, free_, opaque_ptr_,
TypedArrayDeleter<T>);
}
private:
template <typename T>
static void TypedArrayDeleter(void* ptr, size_t size_in_bytes) {
size_t elems = size_in_bytes / sizeof(T);
for (size_t i = 0; i < elems; i++) {
// Explicitly call the destructor on each element.
(static_cast<T*>(ptr) + i)->~T();
}
}
// Function prototype that calls the destructor for each element in a typed
// array. TypeArrayDeleter<T> would match this prototype.
using ArrayDeleter = void (*)(void* t_ptr, size_t t_size);
HWY_DLLEXPORT static void DeleteAlignedArray(void* aligned_pointer,
FreePtr free_ptr,
void* opaque_ptr,
ArrayDeleter deleter);
FreePtr free_;
void* opaque_ptr_;
};
// Unique pointer to T with a custom aligned deleter. T can be a single
// element type U or, if T is U[], an array of elements. The custom aligned
// deleter calls the destructor on U, or on each element of a U[] in the array
// case.
template <typename T>
using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>;
// Aligned memory equivalent of make_unique<T> using the custom allocators
// alloc/free with the passed `opaque` pointer. This function calls the
// constructor with the passed Args... and calls the destructor of the object
// when the AlignedUniquePtr is destroyed.
//
// Returns an empty AlignedUniquePtr (holding nullptr) if the allocation fails;
// in that case no constructor is called and `args` are left untouched.
template <typename T, typename... Args>
AlignedUniquePtr<T> MakeUniqueAlignedWithAlloc(AllocPtr alloc, FreePtr free,
                                               void* opaque, Args&&... args) {
  T* ptr = static_cast<T*>(AllocateAlignedBytes(sizeof(T), alloc, opaque));
  // AllocateAlignedBytes may return nullptr. Placement-new must not be given a
  // null address, so only construct when storage was actually obtained (same
  // policy as MakeUniqueAlignedArrayWithAlloc).
  if (ptr != nullptr) {
    ptr = new (ptr) T(std::forward<Args>(args)...);
  }
  return AlignedUniquePtr<T>(ptr, AlignedDeleter(free, opaque));
}
// Similar to MakeUniqueAlignedWithAlloc but using the default alloc/free
// functions.
//
// Returns an empty AlignedUniquePtr (holding nullptr) if the allocation fails;
// in that case no constructor is called and `args` are left untouched.
template <typename T, typename... Args>
AlignedUniquePtr<T> MakeUniqueAligned(Args&&... args) {
  T* ptr = static_cast<T*>(AllocateAlignedBytes(
      sizeof(T), /*alloc_ptr=*/nullptr, /*opaque_ptr=*/nullptr));
  // AllocateAlignedBytes may return nullptr. Placement-new must not be given a
  // null address, so only construct when storage was actually obtained.
  if (ptr != nullptr) {
    ptr = new (ptr) T(std::forward<Args>(args)...);
  }
  return AlignedUniquePtr<T>(ptr, AlignedDeleter());
}
// Helpers for array allocators (avoids overflow)
namespace detail {
// Returns floor(log2(n)) for n >= 1 and 0 for n == 0. For a power of two this
// is the x for which (1u << x) == n. Written recursively so that it remains a
// single-return constexpr function.
static inline constexpr size_t ShiftCount(size_t n) {
  return (n > 1) ? ShiftCount(n >> 1) + 1 : 0;
}
// Allocates aligned, uninitialized storage for `items` objects of type T via
// AllocateAlignedBytes. Returns nullptr if `items * sizeof(T)` does not fit in
// size_t, or if AllocateAlignedBytes returns nullptr.
// Note: `items == 0` results in a zero-byte request, which
// AllocateAlignedBytes rejects with HWY_ASSERT.
template <typename T>
T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) {
  constexpr size_t kItemSize = sizeof(T);
  constexpr bool kIsPow2 = (kItemSize & (kItemSize - 1)) == 0;
  constexpr size_t kShift = ShiftCount(kItemSize);
  static_assert(!kIsPow2 || (1ull << kShift) == kItemSize,
                "ShiftCount is incorrect");

  // Compute the byte count, then undo the computation. If the round trip does
  // not reproduce `items`, the multiplication wrapped around.
  size_t bytes;
  size_t round_trip;
  if (kIsPow2) {
    bytes = items << kShift;
    round_trip = bytes >> kShift;
  } else {
    bytes = items * kItemSize;
    round_trip = bytes / kItemSize;
  }
  if (round_trip != items) {
    return nullptr;  // overflowed
  }

  void* const storage = AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr);
  return static_cast<T*>(storage);
}
} // namespace detail
// Aligned counterpart of make_unique<T[]> that obtains and releases memory
// through the custom alloc/free functions. Each of the `items` elements is
// constructed from the passed Args..., and each element's destructor runs when
// the returned AlignedUniquePtr is destroyed. If the allocation fails, the
// returned pointer is empty and no constructor is called.
// Note: the arguments are forwarded anew for every element, so an argument
// passed as an rvalue may already have been moved from when the second and
// later elements are constructed.
template <typename T, typename... Args>
AlignedUniquePtr<T[]> MakeUniqueAlignedArrayWithAlloc(
    size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) {
  T* const base = detail::AllocateAlignedItems<T>(items, alloc, opaque);
  if (base != nullptr) {
    T* const end = base + items;
    for (T* slot = base; slot != end; ++slot) {
      new (slot) T(std::forward<Args>(args)...);
    }
  }
  return AlignedUniquePtr<T[]>(base, AlignedDeleter(free, opaque));
}
// Same as MakeUniqueAlignedArrayWithAlloc, but with the default allocation and
// free functions (selected by passing null function pointers).
template <typename T, typename... Args>
AlignedUniquePtr<T[]> MakeUniqueAlignedArray(size_t items, Args&&... args) {
  return MakeUniqueAlignedArrayWithAlloc<T, Args...>(
      items, /*alloc=*/nullptr, /*free=*/nullptr, /*opaque=*/nullptr,
      std::forward<Args>(args)...);
}
// Custom deleter for std::unique_ptr equivalent to using free() as a deleter
// but for aligned memory.
class AlignedFreer {
public:
// Pass address of this to ctor to skip deleting externally-owned memory.
static void DoNothing(void* /*opaque*/, void* /*aligned_pointer*/) {}
AlignedFreer() : free_(nullptr), opaque_ptr_(nullptr) {}
AlignedFreer(FreePtr free_ptr, void* opaque_ptr)
: free_(free_ptr), opaque_ptr_(opaque_ptr) {}
template <typename T>
void operator()(T* aligned_pointer) const {
// TODO(deymo): assert that we are using a POD type T.
FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_);
}
private:
FreePtr free_;
void* opaque_ptr_;
};
// Unique pointer to a single POD or, if T is U[], an array of PODs. For
// non-POD data use AlignedUniquePtr.
template <typename T>
using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>;
// Allocates an aligned, uninitialized array of `items` POD values using the
// custom alloc/free functions and wraps it in a unique_ptr. The array is freed
// (without running destructors) when the unique_ptr is destroyed.
template <typename T>
AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items, AllocPtr alloc,
                                          FreePtr free, void* opaque) {
  T* const storage = detail::AllocateAlignedItems<T>(items, alloc, opaque);
  return AlignedFreeUniquePtr<T[]>(storage, AlignedFreer(free, opaque));
}
// Overload of AllocateAligned() that uses the default allocate/free functions
// (selected by passing null function pointers).
template <typename T>
AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) {
  return AllocateAligned<T>(items, /*alloc=*/nullptr, /*free=*/nullptr,
                            /*opaque=*/nullptr);
}
} // namespace hwy
#endif // HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
@@ -0,0 +1,278 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/aligned_allocator.h"
#include <stddef.h>
#include <stdint.h>  // uintptr_t, uint8_t
#include <stdlib.h>  // malloc, free
#include <array>
#include <new>
#include <random>
#include <set>
#include <vector>
#include "gtest/gtest.h"
namespace {
// Test object of exactly N bytes that reports construction and destruction
// through an optional external counter: the explicit constructor increments
// the counter and the destructor decrements it. The first data byte records
// which constructor ran ('a' = default, 'b' = explicit).
template <size_t N>
class SampleObject {
  static_assert(N > sizeof(int*), "SampleObject size too small.");

 public:
  SampleObject() { data_[0] = 'a'; }

  explicit SampleObject(int* counter) : counter_(counter) {
    if (counter != nullptr) {
      ++*counter;
    }
    data_[0] = 'b';
  }

  ~SampleObject() {
    if (counter_ != nullptr) {
      --*counter_;
    }
  }

  int* counter_ = nullptr;
  char data_[N - sizeof(int*)];
};
// Allocator test double that forwards to malloc/free while remembering every
// block that has been allocated and not yet freed.
class FakeAllocator {
 public:
  // Static AllocPtr and FreePtr entry points for use with the aligned
  // allocator. `opaque` must point to a FakeAllocator; the call is forwarded to
  // that instance's private non-static member.
  static void* StaticAlloc(void* opaque, size_t bytes) {
    FakeAllocator* const self = static_cast<FakeAllocator*>(opaque);
    return self->Alloc(bytes);
  }

  static void StaticFree(void* opaque, void* memory) {
    FakeAllocator* const self = static_cast<FakeAllocator*>(opaque);
    self->Free(memory);
  }

  // Returns the number of pending allocations to be freed.
  size_t PendingAllocs() { return allocs_.size(); }

 private:
  void* Alloc(size_t bytes) {
    void* const block = malloc(bytes);
    allocs_.insert(block);
    return block;
  }

  void Free(void* memory) {
    if (memory == nullptr) return;
    // Only blocks handed out by Alloc() may be freed here.
    EXPECT_NE(allocs_.end(), allocs_.find(memory));
    allocs_.erase(memory);
    free(memory);
  }

  std::set<void*> allocs_;
};
} // namespace
namespace hwy {
// Empty fixture type carrying the suite name. The tests in this file are
// declared with TEST rather than TEST_F.
class AlignedAllocatorTest : public testing::Test {};
// FreeAlignedBytes must accept a null pointer (with default free) and do
// nothing.
TEST(AlignedAllocatorTest, FreeNullptr) {
// Calling free with a nullptr is always ok.
FreeAlignedBytes(/*aligned_pointer=*/nullptr, /*free_ptr=*/nullptr,
/*opaque_ptr=*/nullptr);
}
// detail::ShiftCount returns the base-2 logarithm of a power of two.
TEST(AlignedAllocatorTest, Log2) {
EXPECT_EQ(0u, detail::ShiftCount(1));
EXPECT_EQ(1u, detail::ShiftCount(2));
EXPECT_EQ(3u, detail::ShiftCount(8));
}
// Allocator returns null when it detects overflow of items * sizeof(T).
// Every item count below is chosen so that the byte count wraps around size_t,
// covering both power-of-two element sizes (uint16_t, uint32_t, double) and
// non-power-of-two element sizes (Size5, Size10).
TEST(AlignedAllocatorTest, Overflow) {
// Largest size_t value, and the value with only the most significant bit set.
constexpr size_t max = ~size_t(0);
constexpr size_t msb = (max >> 1) + 1;
using Size5 = std::array<uint8_t, 5>;
using Size10 = std::array<uint8_t, 10>;
EXPECT_EQ(nullptr,
detail::AllocateAlignedItems<uint32_t>(max / 2, nullptr, nullptr));
EXPECT_EQ(nullptr,
detail::AllocateAlignedItems<uint32_t>(max / 3, nullptr, nullptr));
EXPECT_EQ(nullptr,
detail::AllocateAlignedItems<Size5>(max / 4, nullptr, nullptr));
EXPECT_EQ(nullptr,
detail::AllocateAlignedItems<uint16_t>(msb, nullptr, nullptr));
EXPECT_EQ(nullptr,
detail::AllocateAlignedItems<double>(msb + 1, nullptr, nullptr));
EXPECT_EQ(nullptr,
detail::AllocateAlignedItems<Size10>(msb / 4, nullptr, nullptr));
}
// Allocates with the default malloc/free (null function pointers), checks the
// alignment of the result, writes to every byte and frees the memory again.
TEST(AlignedAllocatorTest, AllocDefaultPointers) {
const size_t kSize = 7777;
void* ptr = AllocateAlignedBytes(kSize, /*alloc_ptr=*/nullptr,
/*opaque_ptr=*/nullptr);
ASSERT_NE(nullptr, ptr);
// Make sure the pointer is actually aligned.
EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % HWY_ALIGNMENT);
char* p = static_cast<char*>(ptr);
size_t ret = 0;
for (size_t i = 0; i < kSize; i++) {
// Performs a computation using p[] to prevent it being optimized away.
p[i] = static_cast<char>(i & 0x7F);
if (i) ret += static_cast<size_t>(p[i] * p[i - 1]);
}
EXPECT_NE(0U, ret);
FreeAlignedBytes(ptr, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr);
}
// Constructing and destroying null AlignedUniquePtr instances (single object
// and array form) must be harmless.
TEST(AlignedAllocatorTest, EmptyAlignedUniquePtr) {
AlignedUniquePtr<SampleObject<32>> ptr(nullptr, AlignedDeleter());
AlignedUniquePtr<SampleObject<32>[]> arr(nullptr, AlignedDeleter());
}
// Constructing and destroying null AlignedFreeUniquePtr instances (single
// object and array form) must be harmless.
TEST(AlignedAllocatorTest, EmptyAlignedFreeUniquePtr) {
AlignedFreeUniquePtr<SampleObject<32>> ptr(nullptr, AlignedFreer());
AlignedFreeUniquePtr<SampleObject<32>[]> arr(nullptr, AlignedFreer());
}
// Allocation and deallocation must go through the user-supplied functions:
// exactly one underlying allocation is made, and it is released on free.
TEST(AlignedAllocatorTest, CustomAlloc) {
FakeAllocator fake_alloc;
const size_t kSize = 7777;
void* ptr =
AllocateAlignedBytes(kSize, &FakeAllocator::StaticAlloc, &fake_alloc);
ASSERT_NE(nullptr, ptr);
// We should have only requested one alloc from the allocator.
EXPECT_EQ(1U, fake_alloc.PendingAllocs());
// Make sure the pointer is actually aligned.
EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % HWY_ALIGNMENT);
FreeAlignedBytes(ptr, &FakeAllocator::StaticFree, &fake_alloc);
EXPECT_EQ(0U, fake_alloc.PendingAllocs());
}
// MakeUniqueAligned without arguments must run the default constructor.
TEST(AlignedAllocatorTest, MakeUniqueAlignedDefaultConstructor) {
{
auto ptr = MakeUniqueAligned<SampleObject<24>>();
// Default constructor sets the data_[0] to 'a'.
EXPECT_EQ('a', ptr->data_[0]);
EXPECT_EQ(nullptr, ptr->counter_);
}
}
// MakeUniqueAligned must forward its argument to the explicit constructor, and
// destroying the returned pointer must run the destructor (the counter returns
// to zero).
TEST(AlignedAllocatorTest, MakeUniqueAligned) {
int counter = 0;
{
// Creates the object, initializes it with the explicit constructor and
// returns an unique_ptr to it.
auto ptr = MakeUniqueAligned<SampleObject<24>>(&counter);
EXPECT_EQ(1, counter);
// Custom constructor sets the data_[0] to 'b'.
EXPECT_EQ('b', ptr->data_[0]);
}
EXPECT_EQ(0, counter);
}
// MakeUniqueAlignedArray must construct every element with the explicit
// constructor and destroy every element when the array goes out of scope.
TEST(AlignedAllocatorTest, MakeUniqueAlignedArray) {
int counter = 0;
{
// Creates the array of objects and initializes them with the explicit
// constructor.
auto arr = MakeUniqueAlignedArray<SampleObject<24>>(7, &counter);
EXPECT_EQ(7, counter);
for (size_t i = 0; i < 7; i++) {
// Custom constructor sets the data_[0] to 'b'.
EXPECT_EQ('b', arr[i].data_[0]) << "Where i = " << i;
}
}
EXPECT_EQ(0, counter);
}
// A one-element AllocateAligned result is aligned and can be reset explicitly.
TEST(AlignedAllocatorTest, AllocSingleInt) {
auto ptr = AllocateAligned<uint32_t>(1);
ASSERT_NE(nullptr, ptr.get());
EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % HWY_ALIGNMENT);
// Force delete of the unique_ptr now to check that it doesn't crash.
ptr.reset(nullptr);
EXPECT_EQ(nullptr, ptr.get());
}
// A multi-element AllocateAligned result is aligned, contiguous and writable
// over its whole length.
TEST(AlignedAllocatorTest, AllocMultipleInt) {
const size_t kSize = 7777;
auto ptr = AllocateAligned<uint32_t>(kSize);
ASSERT_NE(nullptr, ptr.get());
EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % HWY_ALIGNMENT);
// ptr[i] is actually (*ptr.get())[i] which will use the operator[] of the
// underlying type chosen by AllocateAligned() for the std::unique_ptr.
EXPECT_EQ(&(ptr[0]) + 1, &(ptr[1]));
size_t ret = 0;
for (size_t i = 0; i < kSize; i++) {
// Performs a computation using ptr[] to prevent it being optimized away.
ptr[i] = static_cast<uint32_t>(i);
if (i) ret += ptr[i] * ptr[i - 1];
}
EXPECT_NE(0U, ret);
}
// AllocateAligned only manages raw storage: neither the constructor nor the
// destructor of the element type is invoked.
TEST(AlignedAllocatorTest, AllocateAlignedObjectWithoutDestructor) {
int counter = 0;
{
// This doesn't call the constructor.
auto obj = AllocateAligned<SampleObject<24>>(1);
// If the destructor ran, it would decrement `counter` below zero.
obj[0].counter_ = &counter;
}
// Destroying the unique_ptr shouldn't have called the destructor of the
// SampleObject<24>.
EXPECT_EQ(0, counter);
}
// MakeUniqueAlignedArrayWithAlloc must use the custom allocator for a single
// underlying allocation, construct and destroy all elements, and release the
// allocation through the custom free function.
TEST(AlignedAllocatorTest, MakeUniqueAlignedArrayWithCustomAlloc) {
FakeAllocator fake_alloc;
int counter = 0;
{
// Creates the array of objects and initializes them with the explicit
// constructor.
auto arr = MakeUniqueAlignedArrayWithAlloc<SampleObject<24>>(
7, FakeAllocator::StaticAlloc, FakeAllocator::StaticFree, &fake_alloc,
&counter);
ASSERT_NE(nullptr, arr.get());
// An array should still only call a single allocation.
EXPECT_EQ(1u, fake_alloc.PendingAllocs());
EXPECT_EQ(7, counter);
for (size_t i = 0; i < 7; i++) {
// Custom constructor sets the data_[0] to 'b'.
EXPECT_EQ('b', arr[i].data_[0]) << "Where i = " << i;
}
}
EXPECT_EQ(0, counter);
EXPECT_EQ(0u, fake_alloc.PendingAllocs());
}
// Both unique_ptr aliases must be default-constructible so that they can be
// stored in containers that value-initialize their elements.
TEST(AlignedAllocatorTest, DefaultInit) {
// The test is whether this compiles. Default-init is useful for output params
// and per-thread storage.
std::vector<AlignedUniquePtr<int[]>> ptrs;
std::vector<AlignedFreeUniquePtr<double[]>> free_ptrs;
ptrs.resize(128);
free_ptrs.resize(128);
// The following is to prevent elision of the pointers.
std::mt19937 rng(129); // Emscripten lacks random_device.
std::uniform_int_distribution<size_t> dist(0, 127);
ptrs[dist(rng)] = MakeUniqueAlignedArray<int>(123);
free_ptrs[dist(rng)] = AllocateAligned<double>(456);
// "Use" pointer without resorting to printf. 0 == 0. Can't shift by 64.
const auto addr1 = reinterpret_cast<uintptr_t>(ptrs[dist(rng)].get());
const auto addr2 = reinterpret_cast<uintptr_t>(free_ptrs[dist(rng)].get());
constexpr size_t kBits = sizeof(uintptr_t) * 8;
// Shifting twice by (kBits - 1) makes both sides zero for any address.
EXPECT_EQ((addr1 >> (kBits - 1)) >> (kBits - 1),
(addr2 >> (kBits - 1)) >> (kBits - 1));
}
} // namespace hwy
+946
View File
@@ -0,0 +1,946 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_BASE_H_
#define HIGHWAY_HWY_BASE_H_
// For SIMD module implementations and their callers, target-independent.
#include <stddef.h>
#include <stdint.h>
#include "hwy/detect_compiler_arch.h"
#include "hwy/highway_export.h"
#if HWY_COMPILER_MSVC
#include <string.h> // memcpy
#endif
#if HWY_ARCH_X86
#include <atomic>
#endif
//------------------------------------------------------------------------------
// Compiler-specific definitions
// Two-level stringification: HWY_STR_IMPL stringifies its argument as written,
// whereas HWY_STR lets a macro argument expand first and then stringifies the
// result.
#define HWY_STR_IMPL(macro) #macro
#define HWY_STR(macro) HWY_STR_IMPL(macro)
// Portability macros, MSVC spellings first and GCC/Clang-style spellings in
// the #else branch.
#if HWY_COMPILER_MSVC
#include <intrin.h>
#define HWY_RESTRICT __restrict
#define HWY_INLINE __forceinline
#define HWY_NOINLINE __declspec(noinline)
// Expands to nothing on MSVC.
#define HWY_FLATTEN
#define HWY_NORETURN __declspec(noreturn)
// On MSVC the branch hints simply evaluate the expression.
#define HWY_LIKELY(expr) (expr)
#define HWY_UNLIKELY(expr) (expr)
#define HWY_PRAGMA(tokens) __pragma(tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
// Takes one argument per compiler family and forwards only the one that
// applies to the current compiler (here: `msc`).
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
// Expands to nothing on MSVC.
#define HWY_MAYBE_UNUSED
#define HWY_HAS_ASSUME_ALIGNED 0
#if (_MSC_VER >= 1700)
#define HWY_MUST_USE_RESULT _Check_return_
#else
#define HWY_MUST_USE_RESULT
#endif
#else
#define HWY_RESTRICT __restrict__
// force inlining without optimization enabled creates very inefficient code
// that can cause compiler timeout
#ifdef __OPTIMIZE__
#define HWY_INLINE inline __attribute__((always_inline))
#else
#define HWY_INLINE inline
#endif
#define HWY_NOINLINE __attribute__((noinline))
#define HWY_FLATTEN __attribute__((flatten))
#define HWY_NORETURN __attribute__((noreturn))
// `!!` normalizes the expression to 0 or 1 before it is compared against the
// expected value.
#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
#define HWY_PRAGMA(tokens) _Pragma(#tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
// Forwards only the `gcc` argument; the `msc` argument is ignored here.
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
// Encountered "attribute list cannot appear here" when using the C++17
// [[maybe_unused]], so only use the old style attribute for now.
#define HWY_MAYBE_UNUSED __attribute__((unused))
#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))
#endif // !HWY_COMPILER_MSVC
//------------------------------------------------------------------------------
// Builtin/attributes
// Enables error-checking of format strings.
// `idx_fmt` and `idx_arg` are passed unchanged as the second and third
// arguments of the printf-style `__format__` attribute. Expands to nothing if
// the attribute is unavailable.
#if HWY_HAS_ATTRIBUTE(__format__)
#define HWY_FORMAT(idx_fmt, idx_arg) \
__attribute__((__format__(__printf__, idx_fmt, idx_arg)))
#else
#define HWY_FORMAT(idx_fmt, idx_arg)
#endif
// Returns a void* pointer which the compiler then assumes is N-byte aligned.
// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
//
// The assignment semantics are required by GCC/Clang. ICC provides an in-place
// __assume_aligned, whereas MSVC's __assume appears unsuitable.
// Without the builtin, the macro yields `ptr` unchanged and conveys no
// alignment information.
#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
#else
#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
#endif
// Clang and GCC require attributes on each function into which SIMD intrinsics
// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
// automatic annotation via pragmas.
// HWY_PUSH_ATTRIBUTES must be paired with HWY_POP_ATTRIBUTES. On compilers
// other than Clang and GCC both expand to nothing.
#if HWY_COMPILER_CLANG
#define HWY_PUSH_ATTRIBUTES(targets_str) \
HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
apply_to = function))
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
#elif HWY_COMPILER_GCC
#define HWY_PUSH_ATTRIBUTES(targets_str) \
HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
#else
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#endif
//------------------------------------------------------------------------------
// Macros
// Function specifier combining internal linkage, inlining, flattening and
// suppression of unused-function warnings.
#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED
// Token pasting; the extra level lets macro arguments expand before they are
// concatenated.
#define HWY_CONCAT_IMPL(a, b) a##b
#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)
// Note: both macros mention their arguments twice, so an argument with side
// effects may be evaluated twice.
#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
// Loop unrolling hints. Both macros expand to nothing on compilers not listed
// below.
#if HWY_COMPILER_GCC_ACTUAL
// nielskm: GCC does not support '#pragma GCC unroll' without the factor.
#define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
#define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
#else
#define HWY_UNROLL(factor)
#define HWY_DEFAULT_UNROLL
#endif
// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
// does, without generating code.
#if HWY_ARCH_X86
#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
#else
// TODO(janwas): investigate alternatives. On ARM, the above generates barriers.
#define HWY_FENCE
#endif
// 4 instances of a given literal value, useful as input to LoadDup128.
#define HWY_REP4(literal) literal, literal, literal, literal
// Calls ::hwy::Abort with the current file and line followed by a
// printf-style format string and its arguments.
#define HWY_ABORT(format, ...) \
::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)
// Always enabled.
// Aborts with the stringified condition if `condition` evaluates to false.
#define HWY_ASSERT(condition) \
do { \
if (!(condition)) { \
HWY_ABORT("Assert %s", #condition); \
} \
} while (0)
// HWY_IS_MSAN / HWY_IS_ASAN / HWY_IS_TSAN are always defined, as 1 if the
// corresponding sanitizer is detected (via HWY_HAS_FEATURE or a predefined
// *_SANITIZER macro) and as 0 otherwise.
#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
#define HWY_IS_MSAN 1
#else
#define HWY_IS_MSAN 0
#endif
#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
#define HWY_IS_ASAN 1
#else
#define HWY_IS_ASAN 0
#endif
#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
#define HWY_IS_TSAN 1
#else
#define HWY_IS_TSAN 0
#endif
// MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
// You can disable MSAN by adding this attribute to the function that fails.
#if HWY_IS_MSAN
#define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
#else
#define HWY_ATTR_NO_MSAN
#endif
// For enabling HWY_DASSERT and shortening tests in slower debug builds
// A definition of HWY_IS_DEBUG_BUILD supplied before this point takes
// precedence over the detection below.
#if !defined(HWY_IS_DEBUG_BUILD)
// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
// MSVC defines NDEBUG (if not, could instead check _DEBUG).
#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
#define HWY_IS_DEBUG_BUILD 1
#else
#define HWY_IS_DEBUG_BUILD 0
#endif
#endif // HWY_IS_DEBUG_BUILD
// Debug-only assertion: identical to HWY_ASSERT in debug builds. Otherwise it
// expands to an empty statement, so `condition` is not evaluated.
#if HWY_IS_DEBUG_BUILD
#define HWY_DASSERT(condition) HWY_ASSERT(condition)
#else
#define HWY_DASSERT(condition) \
do { \
} while (0)
#endif
namespace hwy {
//------------------------------------------------------------------------------
// kMaxVectorSize (undocumented, pending removal)
// Value in bytes, selected per target architecture.
#if HWY_ARCH_X86
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512
#elif HWY_ARCH_RVV && defined(__riscv_vector)
// Not actually an upper bound on the size.
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
#else
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
#endif
//------------------------------------------------------------------------------
// Alignment
// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
// should be allocated dynamically via aligned_allocator.h because Lanes() may
// exceed the stack size.
// HWY_ALIGN_MAX expands to an alignas() specifier chosen per architecture.
#if HWY_ARCH_X86
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RVV && defined(__riscv_vector)
#define HWY_ALIGN_MAX alignas(8) // only elements need be aligned
#else
#define HWY_ALIGN_MAX alignas(16)
#endif
//------------------------------------------------------------------------------
// Lane types
// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
// by concatenating base type and bits.
#pragma pack(push, 1)
// float16_t is a native half-precision type where the compiler provides one,
// otherwise a struct that merely stores the 16 raw bits.
// ACLE (https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html):
// always supported on aarch64, for v7 only if -mfp16-format is given.
#if ((HWY_ARCH_ARM_A64 || (__ARM_FP & 2)) && HWY_COMPILER_GCC)
using float16_t = __fp16;
// C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
// Required for Clang RVV if the float16 extension is used.
#elif HWY_ARCH_RVV && HWY_COMPILER_CLANG && defined(__riscv_zvfh)
using float16_t = _Float16;
// Otherwise emulate
#else
struct float16_t {
uint16_t bits;
};
#endif
// bfloat16_t is always a struct that stores the 16 raw bits.
struct bfloat16_t {
uint16_t bits;
};
#pragma pack(pop)
using float32_t = float;
using float64_t = double;
// Aggregate lane types made of two integer halves. Each is explicitly aligned
// to its full size via alignas, and all are declared under pack(1).
#pragma pack(push, 1)
// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
// https://reviews.llvm.org/D86310
struct alignas(16) uint128_t {
uint64_t lo; // little-endian layout
uint64_t hi;
};
// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
// field is to be compared (Lt128Upper instead of Lt128).
struct alignas(16) K64V64 {
uint64_t value; // little-endian layout
uint64_t key;
};
// 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier
// than when considering both to be a 64-bit key.
struct alignas(8) K32V32 {
uint32_t value; // little-endian layout
uint32_t key;
};
#pragma pack(pop)
// Orders uint128_t values numerically: the upper halves decide unless they are
// equal, in which case the lower halves are compared.
static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
                                              const uint128_t& b) {
  if (a.hi != b.hi) return a.hi < b.hi;
  return a.lo < b.lo;
}
// Mirror image of operator<; std::greater needs it.
static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
                                              const uint128_t& b) {
  return b < a;
}
// Two values are equal only if both halves match.
static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
                                               const uint128_t& b) {
  return a.hi == b.hi && a.lo == b.lo;
}
// Orders K64V64 pairs by `key` only; `value` does not take part.
static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
                                              const K64V64& b) {
  return b.key > a.key;
}
// Mirror image of operator<; std::greater needs it.
static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
                                              const K64V64& b) {
  return b < a;
}
// Orders K32V32 pairs by `key` only; `value` does not take part.
static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
                                              const K32V32& b) {
  return b.key > a.key;
}
// Mirror image of operator<; std::greater needs it.
static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
                                              const K32V32& b) {
  return b < a;
}
//------------------------------------------------------------------------------
// Controlling overload resolution (SFINAE)
// EnableIf<Condition> names `void` when Condition is true. For false, the
// primary template has no `type` member, so the alias is ill-formed and the
// enclosing overload drops out of overload resolution.
template <bool Condition>
struct EnableIfT {};
template <>
struct EnableIfT<true> {
using type = void;
};
template <bool Condition>
using EnableIf = typename EnableIfT<Condition>::type;
// Type-equality trait: the primary template yields 0 and the partial
// specialization for identical types yields 1.
template <typename T, typename U>
struct IsSameT {
enum { value = 0 };
};
template <typename T>
struct IsSameT<T, T> {
enum { value = 1 };
};
// Returns true iff T and U are the same type (cv-qualifiers are significant).
template <typename T, typename U>
HWY_API constexpr bool IsSame() {
return IsSameT<T, U>::value;
}
// Insert into template/function arguments to enable this overload only for
// vectors of AT MOST this many bits.
//
// Note that enabling for exactly 128 bits is unnecessary because a function can
// simply be overloaded with Vec128<T> and/or Full128<T> tag. Enabling for other
// sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T), 0>.
// SFINAE constraints intended to be appended to a template parameter list.
// Each expands to an unnamed, defaulted `hwy::EnableIf<cond>*` parameter.
//
// All helper names are qualified with `hwy::` so the macros also work when
// expanded outside namespace hwy, and the `N` argument is parenthesized so
// that expressions such as `A + B` can be passed safely.

// Constraints on the total vector size, N * sizeof(T), in bytes.
#define HWY_IF_LE128(T, N) hwy::EnableIf<(N) * sizeof(T) <= 16>* = nullptr
#define HWY_IF_LE64(T, N) hwy::EnableIf<(N) * sizeof(T) <= 8>* = nullptr
#define HWY_IF_LE32(T, N) hwy::EnableIf<(N) * sizeof(T) <= 4>* = nullptr
#define HWY_IF_GE32(T, N) hwy::EnableIf<(N) * sizeof(T) >= 4>* = nullptr
#define HWY_IF_GE64(T, N) hwy::EnableIf<(N) * sizeof(T) >= 8>* = nullptr
#define HWY_IF_GE128(T, N) hwy::EnableIf<(N) * sizeof(T) >= 16>* = nullptr
// The extra parentheses keep `>` from closing the template argument list.
#define HWY_IF_GT128(T, N) hwy::EnableIf<((N) * sizeof(T) > 16)>* = nullptr

// Constraints on the category of the lane type.
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
#define HWY_IF_SIGNED(T) \
  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr

// Constraints on the size of a single lane, in bytes.
#define HWY_IF_LANE_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_LANE_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
#define HWY_IF_LANE_SIZE_LT(T, bytes) \
  hwy::EnableIf<sizeof(T) < (bytes)>* = nullptr

// Enabled when the lane count within one block (size capped at 16 bytes)
// equals LANES.
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
  hwy::EnableIf<HWY_MIN(sizeof(T) * (N), 16) / sizeof(T) == (LANES)>* = nullptr
// Empty struct used as a size tag type.
// Carries a compile-time size_t in its type; has no members.
template <size_t N>
struct SizeTag {};
// RemoveConst<T> strips a top-level `const` from T, if present; other types
// pass through unchanged.
template <class T>
struct RemoveConstT {
using type = T;
};
template <class T>
struct RemoveConstT<const T> {
using type = T;
};
template <class T>
using RemoveConst = typename RemoveConstT<T>::type;
//------------------------------------------------------------------------------
// Type relations
namespace detail {
// Maps a lane type T to its related types. Only the specializations below are
// defined. Each provides the aliases that exist for that type:
//   Unsigned / Signed / Float: same-size type of the given category.
//   Wide / Narrow: same-category type of twice / half the size.
// plus the is_signed / is_float flags. Aliases with no counterpart are simply
// omitted, so requesting them is a compile error.
template <typename T>
struct Relations;
template <>
struct Relations<uint8_t> {
using Unsigned = uint8_t;
using Signed = int8_t;
using Wide = uint16_t;
enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int8_t> {
using Unsigned = uint8_t;
using Signed = int8_t;
using Wide = int16_t;
enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint16_t> {
using Unsigned = uint16_t;
using Signed = int16_t;
using Wide = uint32_t;
using Narrow = uint8_t;
enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int16_t> {
using Unsigned = uint16_t;
using Signed = int16_t;
using Wide = int32_t;
using Narrow = int8_t;
enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint32_t> {
using Unsigned = uint32_t;
using Signed = int32_t;
using Float = float;
using Wide = uint64_t;
using Narrow = uint16_t;
enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int32_t> {
using Unsigned = uint32_t;
using Signed = int32_t;
using Float = float;
using Wide = int64_t;
using Narrow = int16_t;
enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint64_t> {
using Unsigned = uint64_t;
using Signed = int64_t;
using Float = double;
using Wide = uint128_t;
using Narrow = uint32_t;
enum { is_signed = 0, is_float = 0 };
};
// No Wide alias: there is no signed 128-bit lane type.
template <>
struct Relations<int64_t> {
using Unsigned = uint64_t;
using Signed = int64_t;
using Float = double;
using Narrow = int32_t;
enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint128_t> {
using Unsigned = uint128_t;
using Narrow = uint64_t;
enum { is_signed = 0, is_float = 0 };
};
// The 16-bit float types are flagged as both signed and float.
template <>
struct Relations<float16_t> {
using Unsigned = uint16_t;
using Signed = int16_t;
using Float = float16_t;
using Wide = float;
enum { is_signed = 1, is_float = 1 };
};
template <>
struct Relations<bfloat16_t> {
using Unsigned = uint16_t;
using Signed = int16_t;
using Wide = float;
enum { is_signed = 1, is_float = 1 };
};
template <>
struct Relations<float> {
using Unsigned = uint32_t;
using Signed = int32_t;
using Float = float;
using Wide = double;
using Narrow = float16_t;
enum { is_signed = 1, is_float = 1 };
};
template <>
struct Relations<double> {
using Unsigned = uint64_t;
using Signed = int64_t;
using Float = double;
using Narrow = float;
enum { is_signed = 1, is_float = 1 };
};
// Maps a lane size in bytes (1, 2, 4, 8 or 16) to the lane types of that size.
// `Float` exists only for 4 and 8 bytes; 16 bytes provides `Unsigned` only.
template <size_t N>
struct TypeFromSize;
template <>
struct TypeFromSize<1> {
using Unsigned = uint8_t;
using Signed = int8_t;
};
template <>
struct TypeFromSize<2> {
using Unsigned = uint16_t;
using Signed = int16_t;
};
template <>
struct TypeFromSize<4> {
using Unsigned = uint32_t;
using Signed = int32_t;
using Float = float;
};
template <>
struct TypeFromSize<8> {
using Unsigned = uint64_t;
using Signed = int64_t;
using Float = double;
};
template <>
struct TypeFromSize<16> {
using Unsigned = uint128_t;
};
} // namespace detail
// Aliases for types of a different category, but the same size.
// Public shorthands for the detail::Relations / detail::TypeFromSize tables.
// Using an alias that the table does not define for T (or N) fails to compile.
template <typename T>
using MakeUnsigned = typename detail::Relations<T>::Unsigned;
template <typename T>
using MakeSigned = typename detail::Relations<T>::Signed;
template <typename T>
using MakeFloat = typename detail::Relations<T>::Float;
// Aliases for types of the same category, but different size.
template <typename T>
using MakeWide = typename detail::Relations<T>::Wide;
template <typename T>
using MakeNarrow = typename detail::Relations<T>::Narrow;
// Obtain type from its size [bytes].
template <size_t N>
using UnsignedFromSize = typename detail::TypeFromSize<N>::Unsigned;
template <size_t N>
using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
template <size_t N>
using FloatFromSize = typename detail::TypeFromSize<N>::Float;
// Avoid confusion with SizeTag where the parameter is a lane size.
// Category tags for dispatching on the kind of lane type. The values are
// multiples of 0x100.
using UnsignedTag = SizeTag<0>;
using SignedTag = SizeTag<0x100>; // integer
using FloatTag = SizeTag<0x200>;
// Returns the category tag of T: is_signed + is_float is 0 for unsigned, 1 for
// signed integers and 2 for floats, which after << 8 yields the tags above.
template <typename T, class R = detail::Relations<T>>
constexpr auto TypeTag() -> hwy::SizeTag<((R::is_signed + R::is_float) << 8)> {
return hwy::SizeTag<((R::is_signed + R::is_float) << 8)>();
}
// For when we only want to distinguish FloatTag from everything else.
using NonFloatTag = SizeTag<0x400>;
// Returns FloatTag if T is flagged as float in Relations, else NonFloatTag.
template <typename T, class R = detail::Relations<T>>
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>();
}
//------------------------------------------------------------------------------
// Type traits
// Returns true only for exactly `float` and `double`; float16_t and bfloat16_t
// yield false here.
template <typename T>
HWY_API constexpr bool IsFloat() {
// Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
// from a float, not compared.
return IsSame<T, float>() || IsSame<T, double>();
}
// Returns true if T can represent negative values: T(-1) compares below T(0)
// only when the conversion did not wrap around to a large positive value.
template <typename T>
HWY_API constexpr bool IsSigned() {
return T(0) > T(-1);
}
// The 16-bit float types are specialized explicitly and report true.
template <>
constexpr bool IsSigned<float16_t>() {
return true;
}
template <>
constexpr bool IsSigned<bfloat16_t>() {
return true;
}
// Largest/smallest representable integer values.
// Returns the largest value of integer type T: all bits set in the unsigned
// counterpart, shifted right by one (clearing the top bit) if T is signed.
template <typename T>
HWY_API constexpr T LimitsMax() {
static_assert(!IsFloat<T>(), "Only for integer types");
using TU = MakeUnsigned<T>;
return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
: static_cast<TU>(~0ull));
}
// Returns the smallest value of integer type T: zero if unsigned, otherwise
// -1 - LimitsMax<T>(), which avoids negating a value that has no positive
// counterpart.
template <typename T>
HWY_API constexpr T LimitsMin() {
static_assert(!IsFloat<T>(), "Only for integer types");
return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
}
// Largest/smallest representable value (integer or float). This naming avoids
// confusion with numeric_limits<float>::min() (the smallest positive value).
// Generic versions defer to LimitsMin/LimitsMax (integers only); float and
// double are specialized with explicit literals.
template <typename T>
HWY_API constexpr T LowestValue() {
return LimitsMin<T>();
}
template <>
constexpr float LowestValue<float>() {
return -3.402823466e+38F;
}
template <>
constexpr double LowestValue<double>() {
return -1.7976931348623158e+308;
}
template <typename T>
HWY_API constexpr T HighestValue() {
return LimitsMax<T>();
}
template <>
constexpr float HighestValue<float>() {
return 3.402823466e+38F;
}
template <>
constexpr double HighestValue<double>() {
return 1.7976931348623158e+308;
}
// Difference between 1.0 and the next representable value.
// The generic version returns 1; float and double are specialized with
// explicit literals.
template <typename T>
HWY_API constexpr T Epsilon() {
return 1;
}
template <>
constexpr float Epsilon<float>() {
return 1.192092896e-7f;
}
template <>
constexpr double Epsilon<double>() {
return 2.2204460492503131e-16;
}
// Returns width in bits of the mantissa field in IEEE binary32/64.
// Only the float (23) and double (52) specializations are usable.
template <typename T>
constexpr int MantissaBits() {
// sizeof(T) == 0 is never true but depends on T, so this only fires when the
// primary template is actually instantiated.
static_assert(sizeof(T) == 0, "Only instantiate the specializations");
return 0;
}
template <>
constexpr int MantissaBits<float>() {
return 23;
}
template <>
constexpr int MantissaBits<double>() {
return 52;
}
// Returns the (left-shifted by one bit) IEEE binary32/64 representation with
// the largest possible (biased) exponent field. Used by IsInf.
// Computed as -(1 << (MantissaBits + 1)) in the signed integer type of the
// same size as T.
template <typename T>
constexpr MakeSigned<T> MaxExponentTimes2() {
return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
}
// Returns bitmask of the sign bit in IEEE binary32/64.
// Only the most significant bit of the same-size unsigned type is set.
template <typename T>
constexpr MakeUnsigned<T> SignMask() {
return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
}
// Returns bitmask of the exponent field in IEEE binary32/64.
// ~(1 << m) + 1 is the two's-complement negation of (1 << m), i.e. all bits at
// or above position m; the sign bit is then cleared.
template <typename T>
constexpr MakeUnsigned<T> ExponentMask() {
return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
}
// Returns bitmask of the mantissa field in IEEE binary32/64.
// All bits below position MantissaBits<T>() are set.
template <typename T>
constexpr MakeUnsigned<T> MantissaMask() {
return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
}
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value is less than this can be represented exactly.
// Only the float and double specializations are usable.
template <typename T>
constexpr T MantissaEnd() {
// Fires only if the primary template is instantiated.
static_assert(sizeof(T) == 0, "Only instantiate the specializations");
return 0;
}
template <>
constexpr float MantissaEnd<float>() {
return 8388608.0f; // 1 << 23
}
template <>
constexpr double MantissaEnd<double>() {
// floating point literal with p52 requires C++17.
return 4503599627370496.0; // 1 << 52
}
// Returns width in bits of the exponent field in IEEE binary32/64.
// T must be float or double, because MantissaBits<T> is only defined for those.
template <typename T>
constexpr int ExponentBits() {
// Exponent := remaining bits after deducting sign and mantissa.
return 8 * sizeof(T) - 1 - MantissaBits<T>();
}
// Returns largest value of the biased exponent field in IEEE binary32/64,
// right-shifted so that the LSB is bit zero. Example: 0xFF for float.
// This is expressed as a signed integer for more efficient comparison.
// Computed as (1 << ExponentBits) - 1, i.e. ExponentBits low bits set.
template <typename T>
constexpr MakeSigned<T> MaxExponentField() {
return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
}
//------------------------------------------------------------------------------
// Helper functions
// Returns a / b rounded up, computed as (a + b - 1) / b and converted to T1.
// `b` must be nonzero; the intermediate sum must not overflow.
template <typename T1, typename T2>
constexpr inline T1 DivCeil(T1 a, T2 b) {
  return static_cast<T1>((a + b - 1) / b);
}
// Returns the smallest multiple of `align` that is >= `what`.
// Works for any `align`; if a power of two, compiler emits ADD+AND.
constexpr inline size_t RoundUpTo(size_t what, size_t align) {
  return align * DivCeil(what, align);
}
// Undefined results for x == 0.
// Returns the number of zero bits below the lowest set bit of `x`
// (count of trailing zeros). `x` must be nonzero.
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
#if !HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctz(x));
#else
  unsigned long bit_pos;  // NOLINT
  _BitScanForward(&bit_pos, x);
  return bit_pos;
#endif  // !HWY_COMPILER_MSVC
}
// 64-bit variant of the trailing-zero count. `x` must be nonzero.
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
#if !HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctzll(x));
#elif HWY_ARCH_X86_64
  unsigned long bit_pos;  // NOLINT
  _BitScanForward64(&bit_pos, x);
  return bit_pos;
#else
  // _BitScanForward64 not available: scan the two 32-bit halves, lower first.
  unsigned long bit_pos;  // NOLINT
  const uint32_t lower = static_cast<uint32_t>(x & 0xFFFFFFFF);
  if (lower != 0) {
    _BitScanForward(&bit_pos, lower);
    return bit_pos;
  }
  // The lowest set bit is in the upper half, so skip the 32 lower bits.
  const uint32_t upper = static_cast<uint32_t>(x >> 32u);
  _BitScanForward(&bit_pos, upper);
  return 32 + bit_pos;
#endif
}
// Undefined results for x == 0.
// Returns the number of zero bits above the highest set bit of `x`
// (count of leading zeros). `x` must be nonzero.
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
#if !HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_clz(x));
#else
  // _BitScanReverse reports the position of the highest set bit; convert that
  // to a count of the zero bits above it.
  unsigned long bit_pos;  // NOLINT
  _BitScanReverse(&bit_pos, x);
  return 31 - bit_pos;
#endif  // !HWY_COMPILER_MSVC
}
// 64-bit variant of the leading-zero count. `x` must be nonzero.
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
#if !HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_clzll(x));
#elif HWY_ARCH_X86_64
  unsigned long bit_pos;  // NOLINT
  _BitScanReverse64(&bit_pos, x);
  return 63 - bit_pos;
#else
  // _BitScanReverse64 not available: scan the two 32-bit halves, upper first.
  unsigned long bit_pos;  // NOLINT
  const uint32_t upper = static_cast<uint32_t>(x >> 32u);
  if (upper != 0) {
    _BitScanReverse(&bit_pos, upper);
    return 31 - bit_pos;
  }
  // The upper half is all zero: 32 leading zeros plus those of the lower half,
  // i.e. 32 + (31 - bit_pos).
  const uint32_t lower = static_cast<uint32_t>(x & 0xFFFFFFFF);
  _BitScanReverse(&bit_pos, lower);
  return 63 - bit_pos;
#endif
}
// Returns the number of set bits in `x`.
HWY_API size_t PopCount(uint64_t x) {
#if HWY_COMPILER_GCC  // includes clang
  return static_cast<size_t>(__builtin_popcountll(x));
  // The popcnt instruction has its own feature flag, but this function is often
  // called from non-SIMD code where dynamic dispatch is undesirable. Intel
  // first supported it in Nehalem (SSE4.2); MSVC only predefines a macro for
  // AVX, hence the __AVX__ checks.
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
  return _mm_popcnt_u64(x);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
  return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
         _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
#else
  // Portable fallback: sum bits within pairs, then nibbles, then bytes, and
  // finally fold all byte counts into the lowest byte.
  constexpr uint64_t kEveryOtherBit = 0x5555555555555555ULL;
  constexpr uint64_t kEveryOtherPair = 0x3333333333333333ULL;
  constexpr uint64_t kLowNibbles = 0x0F0F0F0F0F0F0F0FULL;
  uint64_t sums = x - ((x >> 1) & kEveryOtherBit);
  sums = (sums & kEveryOtherPair) + ((sums >> 2) & kEveryOtherPair);
  sums = (sums + (sums >> 4)) & kLowNibbles;
  sums += sums >> 8;
  sums += sums >> 16;
  sums += sums >> 32;
  // The total is at most 64, so 7 bits suffice.
  return static_cast<size_t>(sums & 0x7Fu);
#endif
}
// Skip HWY_API due to GCC "function not considered for inlining". Previously
// such errors were caused by underlying type mismatches, but it's not clear
// what is still mismatched despite all the casts.
// Returns floor(log2(x)) by recursively halving `x` until it reaches 1.
// `x` must be >= 1. Written as a single expression so it remains a valid
// C++11 constexpr function.
template <typename TI>
/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
  return (x != TI{1})
             ? static_cast<size_t>(1 + FloorLog2(static_cast<TI>(x >> 1)))
             : 0;
}
// Returns ceil(log2(x)), i.e. FloorLog2(x - 1) + 1 for x > 1 and 0 for x == 1.
// `x` must be >= 1.
template <typename TI>
/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
  return (x != TI{1})
             ? static_cast<size_t>(1 + FloorLog2(static_cast<TI>(x - 1)))
             : 0;
}
#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
#pragma intrinsic(_umul128)
#endif
// 64 x 64 = 128 bit multiplication
// Multiplies two 64-bit values into a 128-bit product. Returns the lower 64
// bits and stores the upper 64 bits to `*upper`.
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
#if defined(__SIZEOF_INT128__)
  const __uint128_t wide =
      static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
  *upper = static_cast<uint64_t>(wide >> 64);
  return static_cast<uint64_t>(wide & 0xFFFFFFFFFFFFFFFFULL);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
  return _umul128(a, b, upper);
#else
  // Schoolbook multiplication on 32-bit halves.
  constexpr uint64_t kLow32 = 0xFFFFFFFFU;
  const uint64_t a_lo = a & kLow32;
  const uint64_t a_hi = a >> 32;
  const uint64_t b_lo = b & kLow32;
  const uint64_t b_hi = b >> 32;

  const uint64_t lo_lo = a_lo * b_lo;
  const uint64_t hi_lo = a_hi * b_lo;
  const uint64_t lo_hi = a_lo * b_hi;
  const uint64_t hi_hi = a_hi * b_hi;

  // Middle column: carry out of lo_lo plus the low half of hi_lo plus lo_hi.
  const uint64_t middle = (lo_lo >> 32) + (hi_lo & kLow32) + lo_hi;
  *upper = (hi_lo >> 32) + (middle >> 32) + hi_hi;
  return (middle << 32) | (lo_lo & kLow32);
#endif
}
#if HWY_COMPILER_MSVC
#pragma intrinsic(memcpy)
#pragma intrinsic(memset)
#endif
// The source/destination must not overlap/alias.
// Copies exactly `kBytes` bytes from `from` to `to`, regardless of the pointee
// types. The two regions must not overlap.
template <size_t kBytes, typename From, typename To>
HWY_API void CopyBytes(const From* from, To* to) {
#if !HWY_COMPILER_MSVC
  const void* const src = static_cast<const void*>(from);
  void* const dst = static_cast<void*>(to);
  __builtin_memcpy(dst, src, kBytes);
#else
  memcpy(to, from, kBytes);
#endif
}
// Same as CopyBytes, but for same-sized objects; avoids a size argument.
// Copies sizeof(From) bytes from `*from` to `*to`. Fails to compile unless
// From and To have the same size.
template <typename From, typename To>
HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
static_assert(sizeof(From) == sizeof(To), "");
CopyBytes<sizeof(From)>(from, to);
}
// Sets the first `kBytes` bytes starting at `to` to zero.
template <size_t kBytes, typename To>
HWY_API void ZeroBytes(To* to) {
#if !HWY_COMPILER_MSVC
  __builtin_memset(to, 0, kBytes);
#else
  memset(to, 0, kBytes);
#endif
}
// Widens a bfloat16 to float: its 16 bits become the upper half of the 32-bit
// float representation and the lower half is zero.
HWY_API float F32FromBF16(bfloat16_t bf) {
  const uint32_t widened = static_cast<uint32_t>(bf.bits) << 16;
  float result;
  CopySameSize(&widened, &result);
  return result;
}
// Narrows a float to bfloat16 by keeping only the upper 16 bits of its
// representation; the lower 16 bits are discarded without rounding.
HWY_API bfloat16_t BF16FromF32(float f) {
  uint32_t raw;
  CopySameSize(&f, &raw);
  bfloat16_t truncated;
  truncated.bits = static_cast<uint16_t>(raw >> 16);
  return truncated;
}
// Declaration only; the definition is not in this header. Marked as
// non-returning. Takes a source location (`file`, `line`) followed by a
// printf-style `format` string and its variadic arguments (HWY_FORMAT(3, 4)
// refers to parameters 3 and 4).
HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
Abort(const char* file, int line, const char* format, ...);
} // namespace hwy
#endif // HIGHWAY_HWY_BASE_H_
+178
View File
@@ -0,0 +1,178 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
#include <stdint.h>
#include <limits>
#include "hwy/base.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "base_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Checks LimitsMin/LimitsMax against hard-coded expected values for all 8-,
// 16-, 32- and 64-bit signed and unsigned integer types.
HWY_NOINLINE void TestAllLimits() {
HWY_ASSERT_EQ(uint8_t{0}, LimitsMin<uint8_t>());
HWY_ASSERT_EQ(uint16_t{0}, LimitsMin<uint16_t>());
HWY_ASSERT_EQ(uint32_t{0}, LimitsMin<uint32_t>());
HWY_ASSERT_EQ(uint64_t{0}, LimitsMin<uint64_t>());
HWY_ASSERT_EQ(int8_t{-128}, LimitsMin<int8_t>());
HWY_ASSERT_EQ(int16_t{-32768}, LimitsMin<int16_t>());
HWY_ASSERT_EQ(static_cast<int32_t>(0x80000000u), LimitsMin<int32_t>());
HWY_ASSERT_EQ(static_cast<int64_t>(0x8000000000000000ull),
LimitsMin<int64_t>());
HWY_ASSERT_EQ(uint8_t{0xFF}, LimitsMax<uint8_t>());
HWY_ASSERT_EQ(uint16_t{0xFFFF}, LimitsMax<uint16_t>());
HWY_ASSERT_EQ(uint32_t{0xFFFFFFFFu}, LimitsMax<uint32_t>());
HWY_ASSERT_EQ(uint64_t{0xFFFFFFFFFFFFFFFFull}, LimitsMax<uint64_t>());
HWY_ASSERT_EQ(int8_t{0x7F}, LimitsMax<int8_t>());
HWY_ASSERT_EQ(int16_t{0x7FFF}, LimitsMax<int16_t>());
HWY_ASSERT_EQ(int32_t{0x7FFFFFFFu}, LimitsMax<int32_t>());
HWY_ASSERT_EQ(int64_t{0x7FFFFFFFFFFFFFFFull}, LimitsMax<int64_t>());
}
// Checks that LowestValue/HighestValue agree with std::numeric_limits
// lowest()/max() for the type T it is invoked with.
struct TestLowestHighest {
template <class T>
HWY_NOINLINE void operator()(T /*unused*/) const {
HWY_ASSERT_EQ(std::numeric_limits<T>::lowest(), LowestValue<T>());
HWY_ASSERT_EQ(std::numeric_limits<T>::max(), HighestValue<T>());
}
};
// Runs the check via ForAllTypes.
HWY_NOINLINE void TestAllLowestHighest() { ForAllTypes(TestLowestHighest()); }
// Compile-time checks of IsFloat/IsSigned. Each functor states the expected
// answers for one category of type; the argument value itself is unused.
struct TestIsUnsigned {
template <class T>
HWY_NOINLINE void operator()(T /*unused*/) const {
static_assert(!IsFloat<T>(), "Expected !IsFloat");
static_assert(!IsSigned<T>(), "Expected !IsSigned");
}
};
struct TestIsSigned {
template <class T>
HWY_NOINLINE void operator()(T /*unused*/) const {
static_assert(!IsFloat<T>(), "Expected !IsFloat");
static_assert(IsSigned<T>(), "Expected IsSigned");
}
};
struct TestIsFloat {
template <class T>
HWY_NOINLINE void operator()(T /*unused*/) const {
static_assert(IsFloat<T>(), "Expected IsFloat");
static_assert(IsSigned<T>(), "Floats are also considered signed");
}
};
// Applies each functor to its category of types, then checks the sizes of the
// 128-bit related types.
HWY_NOINLINE void TestAllType() {
ForUnsignedTypes(TestIsUnsigned());
ForSignedTypes(TestIsSigned());
ForFloatTypes(TestIsFloat());
static_assert(sizeof(MakeUnsigned<hwy::uint128_t>) == 16, "");
static_assert(sizeof(MakeWide<uint64_t>) == 16, "Expected uint128_t");
static_assert(sizeof(MakeNarrow<hwy::uint128_t>) == 8, "Expected uint64_t");
}
// Compile-time checks that IsSame is true for identical types and false, in
// either argument order, for the signed vs. unsigned counterparts of T.
struct TestIsSame {
template <class T>
HWY_NOINLINE void operator()(T /*unused*/) const {
static_assert(IsSame<T, T>(), "T == T");
static_assert(!IsSame<MakeSigned<T>, MakeUnsigned<T>>(), "S != U");
static_assert(!IsSame<MakeUnsigned<T>, MakeSigned<T>>(), "U != S");
}
};
// Runs the check via ForAllTypes.
HWY_NOINLINE void TestAllIsSame() { ForAllTypes(TestIsSame()); }
// Checks the leading-zero (AboveMS1Bit) and trailing-zero (BelowLS1Bit) counts
// for 32- and 64-bit inputs, including values with only the top or only the
// bottom bit set and values with additional bits set.
HWY_NOINLINE void TestAllBitScan() {
HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0x80000000u));
HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0xFFFFFFFFu));
HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40000000u));
HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40108210u));
HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(2u));
HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(3u));
HWY_ASSERT_EQ(size_t{31}, Num0BitsAboveMS1Bit_Nonzero32(1u));
HWY_ASSERT_EQ(size_t{0},
Num0BitsAboveMS1Bit_Nonzero64(0x8000000000000000ull));
HWY_ASSERT_EQ(size_t{0},
Num0BitsAboveMS1Bit_Nonzero64(0xFFFFFFFFFFFFFFFFull));
HWY_ASSERT_EQ(size_t{1},
Num0BitsAboveMS1Bit_Nonzero64(0x4000000000000000ull));
HWY_ASSERT_EQ(size_t{1},
Num0BitsAboveMS1Bit_Nonzero64(0x4010821004200011ull));
HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(2ull));
HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(3ull));
HWY_ASSERT_EQ(size_t{63}, Num0BitsAboveMS1Bit_Nonzero64(1ull));
HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero32(1u));
HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero32(2u));
HWY_ASSERT_EQ(size_t{30}, Num0BitsBelowLS1Bit_Nonzero32(0xC0000000u));
HWY_ASSERT_EQ(size_t{31}, Num0BitsBelowLS1Bit_Nonzero32(0x80000000u));
HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero64(1ull));
HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero64(2ull));
HWY_ASSERT_EQ(size_t{62},
Num0BitsBelowLS1Bit_Nonzero64(0xC000000000000000ull));
HWY_ASSERT_EQ(size_t{63},
Num0BitsBelowLS1Bit_Nonzero64(0x8000000000000000ull));
}
// Checks PopCount for small values, for values that fill the lower 32 bits,
// and for values with bits set in the upper 32 bits.
HWY_NOINLINE void TestAllPopCount() {
HWY_ASSERT_EQ(size_t{0}, PopCount(0u));
HWY_ASSERT_EQ(size_t{1}, PopCount(1u));
HWY_ASSERT_EQ(size_t{1}, PopCount(2u));
HWY_ASSERT_EQ(size_t{2}, PopCount(3u));
HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000u));
HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFu));
HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFu));
HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000ull));
HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFull));
HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFull));
HWY_ASSERT_EQ(size_t{33}, PopCount(0x10FFFFFFFFull));
HWY_ASSERT_EQ(size_t{63}, PopCount(0xFFFEFFFFFFFFFFFFull));
HWY_ASSERT_EQ(size_t{64}, PopCount(0xFFFFFFFFFFFFFFFFull));
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
// Registers each TestAll* function defined above under the BaseTest suite.
// The surrounding HWY_ONCE guard keeps this from being repeated.
namespace hwy {
HWY_BEFORE_TEST(BaseTest);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLimits);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLowestHighest);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllType);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllIsSame);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllBitScan);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllPopCount);
} // namespace hwy
#endif
+110
View File
@@ -0,0 +1,110 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_CACHE_CONTROL_H_
#define HIGHWAY_HWY_CACHE_CONTROL_H_
#include <stddef.h>
#include <stdint.h>
#include "hwy/base.h"
// Requires SSE2; fails to compile on 32-bit Clang 7 (see
// https://github.com/gperftools/gperftools/issues/946).
#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32)
#undef HWY_DISABLE_CACHE_CONTROL
#define HWY_DISABLE_CACHE_CONTROL
#endif
// intrin.h is sufficient on MSVC and already included by base.h.
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
#include <emmintrin.h> // SSE2
#endif
// Windows.h #defines these, which causes infinite recursion. Temporarily
// undefine them in this header; these functions are anyway deprecated.
// TODO(janwas): remove when these functions are removed.
#pragma push_macro("LoadFence")
#undef LoadFence
namespace hwy {
// Even if N*sizeof(T) is smaller, Stream may write a multiple of this size.
#define HWY_STREAM_MULTIPLE 16
// The following functions may also require an attribute.
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
#define HWY_ATTR_CACHE __attribute__((target("sse2")))
#else
#define HWY_ATTR_CACHE
#endif
// Delays subsequent loads until prior loads are visible. Beware of potentially
// differing behavior across architectures and vendors: on Intel but not
// AMD CPUs, also serves as a full fence (waits for all prior instructions to
// complete).
HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
// Issues _mm_lfence on x86; a no-op on other targets or when
// HWY_DISABLE_CACHE_CONTROL is defined.
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
_mm_lfence();
#endif
}
// Ensures values written by previous `Stream` calls are visible on the current
// core. This is NOT sufficient for synchronizing across cores; when `Stream`
// outputs are to be consumed by other core(s), the producer must publish
// availability (e.g. via mutex or atomic_flag) after `FlushStream`.
HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
// Issues _mm_sfence on x86; a no-op on other targets or when
// HWY_DISABLE_CACHE_CONTROL is defined.
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
_mm_sfence();
#endif
}
// Optionally begins loading the cache line containing "p" to reduce latency of
// subsequent actual loads.
template <typename T>
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
// Three variants: the x86 intrinsic with the T0 hint, the GCC/Clang builtin
// on other targets, and otherwise nothing (`p` is only marked as used).
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
_mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
#elif HWY_COMPILER_GCC // includes clang
// Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
// desirable, so use the default 3 (keep in caches).
__builtin_prefetch(p, /*write=*/0, /*hint=*/3);
#else
(void)p;
#endif
}
// Invalidates and flushes the cache line containing "p", if possible.
HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
// Issues _mm_clflush on x86; elsewhere, or when HWY_DISABLE_CACHE_CONTROL is
// defined, does nothing and only marks `p` as used.
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
_mm_clflush(p);
#else
(void)p;
#endif
}
// When called inside a spin-loop, may reduce power consumption.
HWY_INLINE HWY_ATTR_CACHE void Pause() {
// Issues _mm_pause on x86; a no-op on other targets or when
// HWY_DISABLE_CACHE_CONTROL is defined.
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
_mm_pause();
#endif
}
} // namespace hwy
// TODO(janwas): remove when these functions are removed. (See above.)
#pragma pop_macro("LoadFence")
#endif // HIGHWAY_HWY_CACHE_CONTROL_H_
@@ -0,0 +1,136 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Per-target include guard
#if defined(HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
#undef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
#endif
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// These functions avoid having to write a loop plus remainder handling in the
// (unfortunately still common) case where arrays are not aligned/padded. If the
// inputs are known to be aligned/padded, it is more efficient to write a single
// loop using Load(). We do not provide a CopyAlignedPadded because it
// would be more verbose than such a loop.
// Fills `to`[0, `count`) with `value`.
template <class D, typename T = TFromD<D>>
void Fill(D d, T value, size_t count, T* HWY_RESTRICT to) {
const size_t N = Lanes(d);
const Vec<D> v = Set(d, value);
size_t idx = 0;
for (; idx + N <= count; idx += N) {
StoreU(v, d, to + idx);
}
// `count` was a multiple of the vector length `N`: already done.
if (HWY_UNLIKELY(idx == count)) return;
const size_t remaining = count - idx;
HWY_DASSERT(0 != remaining && remaining < N);
SafeFillN(remaining, value, d, to + idx);
}
// Copies `from`[0, `count`) to `to`, which must not overlap `from`.
// Copies `count` elements from `from` to `to`; the two ranges must not
// overlap. Whole vectors use unaligned loads and stores; any leftover elements
// (fewer than one vector) are copied via SafeCopyN.
template <class D, typename T = TFromD<D>>
void Copy(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to) {
  const size_t lanes = Lanes(d);

  // Transfer as many whole vectors as fit.
  size_t done = 0;
  while (done + lanes <= count) {
    StoreU(LoadU(d, from + done), d, to + done);
    done += lanes;
  }

  // Nothing left over when `count` is a multiple of the vector length.
  if (HWY_UNLIKELY(done == count)) return;

  const size_t leftover = count - done;
  HWY_DASSERT(0 != leftover && leftover < lanes);
  SafeCopyN(leftover, d, from + done, to + done);
}
// For idx in [0, count) in ascending order, appends `from[idx]` to `to` if the
// corresponding mask element of `func(d, v)` is true. Returns the STL-style end
// of the newly written elements in `to`.
//
// `func` is either a functor with a templated operator()(d, v) returning a
// mask, or a generic lambda if using C++14. Due to apparent limitations of
// Clang on Windows, it is currently necessary to add HWY_ATTR before the
// opening { of the lambda to avoid errors about "function .. requires target".
//
// NOTE: this is only supported for 16-, 32- or 64-bit types.
// NOTE: Func may be called a second time for elements it has already seen, but
// these elements will not be written to `to` again.
// Parameters:
//   d     - vector tag; determines the lane type and the vector length N.
//   from  - source array of `count` elements.
//   to    - destination; advanced past each element that is kept.
//   func  - callable as func(tag, vector), returning a mask of lanes to keep.
// Returns the advanced `to`, i.e. one past the last element written.
template <class D, class Func, typename T = TFromD<D>>
T* CopyIf(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to,
const Func& func) {
const size_t N = Lanes(d);
size_t idx = 0;
// Main loop over whole vectors: compress the selected lanes and advance `to`
// by the number of lanes actually stored.
for (; idx + N <= count; idx += N) {
const Vec<D> v = LoadU(d, from + idx);
to += CompressBlendedStore(v, func(d, v), d, to);
}
// `count` was a multiple of the vector length `N`: already done.
if (HWY_UNLIKELY(idx == count)) return to;
#if HWY_MEM_OPS_MIGHT_FAULT
// Proceed one by one.
const CappedTag<T, 1> d1;
for (; idx < count; ++idx) {
using V1 = Vec<decltype(d1)>;
// Workaround for -Waggressive-loop-optimizations on GCC 8
// (iteration 2305843009213693951 invokes undefined behavior for T=i64)
const uintptr_t addr = reinterpret_cast<uintptr_t>(from);
const T* HWY_RESTRICT from_idx =
reinterpret_cast<const T * HWY_RESTRICT>(addr + (idx * sizeof(T)));
const V1 v = LoadU(d1, from_idx);
// Avoid storing to `to` unless we know it should be kept - otherwise, we
// might overrun the end if it was allocated for the exact count.
if (CountTrue(d1, func(d1, v)) == 0) continue;
StoreU(v, d1, to);
to += 1;
}
#else
// Remainder handled with one masked vector that ends exactly at the array
// end; lanes that were already processed (or precede the array) are masked
// off, both for the load and for the store.
// NOTE(review): when count < N, `count - N` wraps around as size_t; this
// appears to rely on the mask excluding those lanes - confirm.
// Start index of the last unaligned whole vector, ending at the array end.
const size_t last = count - N;
// Number of elements before `from` or already written.
const size_t invalid = idx - last;
HWY_DASSERT(0 != invalid && invalid < N);
const Mask<D> mask = Not(FirstN(d, invalid));
const Vec<D> v = MaskedLoad(mask, d, from + last);
to += CompressBlendedStore(v, And(mask, func(d, v)), d, to);
#endif
return to;
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
@@ -0,0 +1,199 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/aligned_allocator.h"
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/algo/copy_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/algo/copy-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
// If your project requires C++14 or later, you can ignore this and pass lambdas
// directly to CopyIf, without requiring an lvalue as we do here for C++11.
#if __cplusplus < 201402L
#define HWY_GENERIC_LAMBDA 0
#else
#define HWY_GENERIC_LAMBDA 1
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Draws a random integer from [0, 128); every lane type can represent such a
// value.
template <typename T>
T Random7Bit(RandomState& rng) {
  // Keep only the low seven bits of the 32-bit random value.
  const auto low7 = Random32(&rng) & 127;
  return static_cast<T>(low7);
}
// In C++14, we can instead define these as generic lambdas next to where they
// are invoked.
#if !HWY_GENERIC_LAMBDA
// Predicate functor for CopyIf: selects lanes whose lowest bit is set.
struct IsOdd {
  template <class D, class V>
  Mask<D> operator()(D d, V v) const {
    using T = TFromD<D>;
    const auto lowest_bit = Set(d, T{1});
    return TestBit(v, lowest_bit);
  }
};
#endif // !HWY_GENERIC_LAMBDA
// Runs Test (e.g. TestCopyIf) for every count in [0, 2 * Lanes(d)) combined
// with every pair of misalignments from {0, N / 4, 3 * N / 5}. The lane type T
// is supplied by the type dispatcher that wraps this functor (ForAllTypes or
// ForUI163264 in this file).
template <class Test>
struct ForeachCountAndMisalign {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) const {
    RandomState rng;
    const size_t N = Lanes(d);
    constexpr size_t kNumMisalign = 3;
    const size_t misalignments[kNumMisalign] = {0, N / 4, 3 * N / 5};
    const size_t end_count = 2 * N;

    for (size_t count = 0; count < end_count; ++count) {
      for (size_t ia = 0; ia < kNumMisalign; ++ia) {
        for (size_t ib = 0; ib < kNumMisalign; ++ib) {
          Test()(d, count, misalignments[ia], misalignments[ib], rng);
        }
      }
    }
  }
};
struct TestFill {
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
RandomState& rng) {
using T = TFromD<D>;
// HWY_MAX prevents error when misalign == count == 0.
AlignedFreeUniquePtr<T[]> pa =
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
T* expected = pa.get() + misalign_a;
const T value = Random7Bit<T>(rng);
for (size_t i = 0; i < count; ++i) {
expected[i] = value;
}
AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(misalign_b + count + 1);
T* actual = pb.get() + misalign_b;
actual[count] = T{0}; // sentinel
Fill(d, value, count, actual);
HWY_ASSERT_EQ(T{0}, actual[count]); // did not write past end
const auto info = hwy::detail::MakeTypeInfo<T>();
const char* target_name = hwy::TargetName(HWY_TARGET);
hwy::detail::AssertArrayEqual(info, expected, actual, count, target_name,
__FILE__, __LINE__);
}
};
void TestAllFill() {
ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFill>>());
}
struct TestCopy {
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
RandomState& rng) {
using T = TFromD<D>;
// Prevents error if size to allocate is zero.
AlignedFreeUniquePtr<T[]> pa =
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
T* a = pa.get() + misalign_a;
for (size_t i = 0; i < count; ++i) {
a[i] = Random7Bit<T>(rng);
}
AlignedFreeUniquePtr<T[]> pb =
AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
T* b = pb.get() + misalign_b;
Copy(d, a, count, b);
const auto info = hwy::detail::MakeTypeInfo<T>();
const char* target_name = hwy::TargetName(HWY_TARGET);
hwy::detail::AssertArrayEqual(info, a, b, count, target_name, __FILE__,
__LINE__);
}
};
void TestAllCopy() {
ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestCopy>>());
}
struct TestCopyIf {
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
RandomState& rng) {
using T = TFromD<D>;
// Prevents error if size to allocate is zero.
AlignedFreeUniquePtr<T[]> pa =
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
T* a = pa.get() + misalign_a;
for (size_t i = 0; i < count; ++i) {
a[i] = Random7Bit<T>(rng);
}
const size_t padding = Lanes(ScalableTag<T>());
AlignedFreeUniquePtr<T[]> pb =
AllocateAligned<T>(HWY_MAX(1, misalign_b + count + padding));
T* b = pb.get() + misalign_b;
AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
size_t num_odd = 0;
for (size_t i = 0; i < count; ++i) {
if (a[i] & 1) {
expected[num_odd++] = a[i];
}
}
#if HWY_GENERIC_LAMBDA
const auto is_odd = [](const auto d, const auto v) HWY_ATTR {
return TestBit(v, Set(d, TFromD<decltype(d)>{1}));
};
#else
const IsOdd is_odd;
#endif
T* end = CopyIf(d, a, count, b, is_odd);
const size_t num_written = static_cast<size_t>(end - b);
HWY_ASSERT_EQ(num_odd, num_written);
const auto info = hwy::detail::MakeTypeInfo<T>();
const char* target_name = hwy::TargetName(HWY_TARGET);
hwy::detail::AssertArrayEqual(info, expected.get(), b, num_odd, target_name,
__FILE__, __LINE__);
}
};
void TestAllCopyIf() {
ForUI163264(ForPartialVectors<ForeachCountAndMisalign<TestCopyIf>>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
// Test registration for the CopyTest suite: each TestAll* function defined
// above is exported and registered through HWY_EXPORT_AND_TEST_P.
// NOTE(review): HWY_ONCE presumably limits this section to a single
// compilation pass even though foreach_target.h re-includes this file per
// target - confirm against the Highway headers.
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(CopyTest);
HWY_EXPORT_AND_TEST_P(CopyTest, TestAllFill);
HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopy);
HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopyIf);
} // namespace hwy
#endif
@@ -0,0 +1,109 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Per-target include guard
#if defined(HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
#undef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
#endif
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Returns index of the first element equal to `value` in `in[0, count)`, or
// `count` if not found.
template <class D, typename T = TFromD<D>>
size_t Find(D d, T value, const T* HWY_RESTRICT in, size_t count) {
const size_t N = Lanes(d);
const Vec<D> broadcasted = Set(d, value);
size_t i = 0;
for (; i + N <= count; i += N) {
const intptr_t pos = FindFirstTrue(d, Eq(broadcasted, LoadU(d, in + i)));
if (pos >= 0) return i + static_cast<size_t>(pos);
}
if (i != count) {
#if HWY_MEM_OPS_MIGHT_FAULT
// Scan single elements.
const CappedTag<T, 1> d1;
using V1 = Vec<decltype(d1)>;
const V1 broadcasted1 = Set(d1, GetLane(broadcasted));
for (; i < count; ++i) {
if (AllTrue(d1, Eq(broadcasted1, LoadU(d1, in + i)))) {
return i;
}
}
#else
const size_t remaining = count - i;
HWY_DASSERT(0 != remaining && remaining < N);
const Mask<D> mask = FirstN(d, remaining);
const Vec<D> v = MaskedLoad(mask, d, in + i);
// Apply mask so that we don't 'find' the zero-padding from MaskedLoad.
const intptr_t pos = FindFirstTrue(d, And(Eq(broadcasted, v), mask));
if (pos >= 0) return i + static_cast<size_t>(pos);
#endif // HWY_MEM_OPS_MIGHT_FAULT
}
return count; // not found
}
// Returns index of the first element in `in[0, count)` for which `func(d, vec)`
// returns true, otherwise `count`.
template <class D, class Func, typename T = TFromD<D>>
size_t FindIf(D d, const T* HWY_RESTRICT in, size_t count, const Func& func) {
const size_t N = Lanes(d);
size_t i = 0;
for (; i + N <= count; i += N) {
const intptr_t pos = FindFirstTrue(d, func(d, LoadU(d, in + i)));
if (pos >= 0) return i + static_cast<size_t>(pos);
}
if (i != count) {
#if HWY_MEM_OPS_MIGHT_FAULT
// Scan single elements.
const CappedTag<T, 1> d1;
for (; i < count; ++i) {
if (AllTrue(d1, func(d1, LoadU(d1, in + i)))) {
return i;
}
}
#else
const size_t remaining = count - i;
HWY_DASSERT(0 != remaining && remaining < N);
const Mask<D> mask = FirstN(d, remaining);
const Vec<D> v = MaskedLoad(mask, d, in + i);
// Apply mask so that we don't 'find' the zero-padding from MaskedLoad.
const intptr_t pos = FindFirstTrue(d, And(func(d, v), mask));
if (pos >= 0) return i + static_cast<size_t>(pos);
#endif // HWY_MEM_OPS_MIGHT_FAULT
}
return count; // not found
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
@@ -0,0 +1,219 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <vector>
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/print.h"
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/algo/find_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/algo/find-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
// If your project requires C++14 or later, you can ignore this and pass lambdas
// directly to FindIf, without requiring an lvalue as we do here for C++11.
#if __cplusplus < 201402L
#define HWY_GENERIC_LAMBDA 0
#else
#define HWY_GENERIC_LAMBDA 1
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Produces a random value in [-8, 8). The tests rely on this range so they can
// call Find() with values that are guaranteed to be absent.
template <typename T>
T Random(RandomState& rng) {
  // Ten random bits give an integer in [0, 1024).
  const int32_t low10 = static_cast<int32_t>(Random32(&rng)) & 1023;
  // Centre on zero and scale by 1/64.
  const int32_t centered = low10 - 512;
  const double val = centered / 64.0;
  // Clamp negative to zero for unsigned types.
  return static_cast<T>(HWY_MAX(hwy::LowestValue<T>(), val));
}
// In C++14, we can instead define these as generic lambdas next to where they
// are invoked.
#if !HWY_GENERIC_LAMBDA
// Predicate functor for FindIf (C++11 fallback for the generic lambda):
// selects lanes whose value is greater than the threshold given at
// construction.
class GreaterThan {
 public:
  // `explicit` prevents an int from silently converting to a predicate.
  explicit GreaterThan(int val) : val_(val) {}

  // Returns a mask that is true in lanes where `v` exceeds the threshold,
  // after casting the threshold to the lane type of `d`.
  template <class D, class V>
  Mask<D> operator()(D d, V v) const {
    return Gt(v, Set(d, static_cast<TFromD<D>>(val_)));
  }

 private:
  int val_;  // threshold, stored as passed to the constructor
};
#endif // !HWY_GENERIC_LAMBDA
// Runs Test (e.g. TestFind) for a random sample of counts, each combined with
// every misalignment from {0, N / 4, 3 * N / 5}.
template <class Test>
struct ForeachCountAndMisalign {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) const {
    RandomState rng;
    const size_t N = Lanes(d);
    const size_t misalignments[3] = {0, N / 4, 3 * N / 5};

    // Cover a fairly large range of counts, [0, 16 * N], by sampling at random
    // instead of checking every possible count.
    const size_t modulus = 16 * N + 1;
    std::vector<size_t> counts(AdjustedReps(512));
    for (size_t i = 0; i < counts.size(); ++i) {
      counts[i] = static_cast<size_t>(rng()) % modulus;
    }
    counts[0] = 0;  // ensure we test count=0.

    for (const size_t count : counts) {
      for (const size_t m : misalignments) {
        Test()(d, count, m, rng);
      }
    }
  }
};
struct TestFind {
template <class D>
void operator()(D d, size_t count, size_t misalign, RandomState& rng) {
using T = TFromD<D>;
// Must allocate at least one even if count is zero.
AlignedFreeUniquePtr<T[]> storage =
AllocateAligned<T>(HWY_MAX(1, misalign + count));
T* in = storage.get() + misalign;
for (size_t i = 0; i < count; ++i) {
in[i] = Random<T>(rng);
}
// For each position, search for that element (which we know is there)
for (size_t pos = 0; pos < count; ++pos) {
const size_t actual = Find(d, in[pos], in, count);
// We may have found an earlier occurrence of the same value; ensure the
// value is the same, and that it is the first.
if (!IsEqual(in[pos], in[actual])) {
fprintf(stderr, "%s count %d, found %.15f at %d but wanted %.15f\n",
hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
static_cast<double>(in[actual]), static_cast<int>(actual),
static_cast<double>(in[pos]));
HWY_ASSERT(false);
}
for (size_t i = 0; i < actual; ++i) {
if (IsEqual(in[i], in[pos])) {
fprintf(stderr, "%s count %d, found %f at %d but Find returned %d\n",
hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
static_cast<double>(in[i]), static_cast<int>(i),
static_cast<int>(actual));
HWY_ASSERT(false);
}
}
}
// Also search for values we know not to be present (out of range)
HWY_ASSERT_EQ(count, Find(d, T{9}, in, count));
HWY_ASSERT_EQ(count, Find(d, static_cast<T>(-9), in, count));
}
};
void TestAllFind() {
ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFind>>());
}
// Checks FindIf with a "greater than val" predicate against std::find_if for
// a range of thresholds, including ones for which nothing can match.
struct TestFindIf {
template <class D>
void operator()(D d, size_t count, size_t misalign, RandomState& rng) {
using T = TFromD<D>;
using TI = MakeSigned<T>;
// Must allocate at least one even if count is zero.
AlignedFreeUniquePtr<T[]> storage =
AllocateAligned<T>(HWY_MAX(1, misalign + count));
T* in = storage.get() + misalign;
// Fill with random values and verify they lie within the range that the
// thresholds below rely on.
for (size_t i = 0; i < count; ++i) {
in[i] = Random<T>(rng);
HWY_ASSERT(in[i] < 8);
HWY_ASSERT(!hwy::IsSigned<T>() || static_cast<TI>(in[i]) >= -8);
}
// Track whether both the found and the not-found outcomes were exercised.
bool found_any = false;
bool not_found_any = false;
// unsigned T would be promoted to signed and compare greater than any
// negative val, whereas Set() would just cast to an unsigned value and the
// comparison remains unsigned, so avoid negative numbers there.
const int min_val = IsSigned<T>() ? -9 : 0;
// Includes out-of-range value 9 to test the not-found path.
for (int val = min_val; val <= 9; ++val) {
#if HWY_GENERIC_LAMBDA
const auto greater = [val](const auto d, const auto v) HWY_ATTR {
return Gt(v, Set(d, static_cast<T>(val)));
};
#else
const GreaterThan greater(val);
#endif
const size_t actual = FindIf(d, in, count, greater);
found_any |= actual < count;
not_found_any |= actual == count;
// Scalar reference using the same threshold cast to T.
const auto pos = std::find_if(
in, in + count, [val](T x) { return x > static_cast<T>(val); });
// Convert returned iterator to index.
const size_t expected = static_cast<size_t>(pos - in);
if (expected != actual) {
fprintf(stderr, "%s count %d val %d, expected %d actual %d\n",
hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
val, static_cast<int>(expected), static_cast<int>(actual));
hwy::detail::PrintArray(hwy::detail::MakeTypeInfo<T>(), "in", in, count,
0, count);
HWY_ASSERT(false);
}
}
// We will always not-find something due to val=9.
HWY_ASSERT(not_found_any);
// We'll find something unless the input is empty or {0} - because 0 > i
// is false for all i=[0,9].
if (count != 0 && in[0] != 0) {
HWY_ASSERT(found_any);
}
}
};
void TestAllFindIf() {
ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFindIf>>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
// Test registration for the FindTest suite: each TestAll* function defined
// above is exported and registered through HWY_EXPORT_AND_TEST_P.
// NOTE(review): HWY_ONCE presumably limits this section to a single
// compilation pass even though foreach_target.h re-includes this file per
// target - confirm against the Highway headers.
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(FindTest);
HWY_EXPORT_AND_TEST_P(FindTest, TestAllFind);
HWY_EXPORT_AND_TEST_P(FindTest, TestAllFindIf);
} // namespace hwy
#endif
@@ -0,0 +1,262 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Per-target include guard
#if defined(HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
#undef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
#endif
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// These functions avoid having to write a loop plus remainder handling in the
// (unfortunately still common) case where arrays are not aligned/padded. If the
// inputs are known to be aligned/padded, it is more efficient to write a single
// loop using Load(). We do not provide a TransformAlignedPadded because it
// would be more verbose than such a loop.
//
// Func is either a functor with a templated operator()(d, v[, v1[, v2]]), or a
// generic lambda if using C++14. Due to apparent limitations of Clang on
// Windows, it is currently necessary to add HWY_ATTR before the opening { of
// the lambda to avoid errors about "always_inline function .. requires target".
//
// If HWY_MEM_OPS_MIGHT_FAULT, we use scalar code instead of masking. Otherwise,
// we use `MaskedLoad` and `BlendedStore` to read/write the final partial
// vector.
// Fills `out[0, count)` with the vectors returned by `func(d, index_vec)`,
// where `index_vec` is `Vec<RebindToUnsigned<D>>`. On the first call to `func`,
// the value of its lane i is i, and increases by `Lanes(d)` after every call.
// Note that some of these indices may be `>= count`, but the elements that
// `func` returns in those lanes will not be written to `out`.
template <class D, class Func, typename T = TFromD<D>>
void Generate(D d, T* HWY_RESTRICT out, size_t count, const Func& func) {
const RebindToUnsigned<D> du;
using TU = TFromD<decltype(du)>;
const size_t N = Lanes(d);
size_t idx = 0;
Vec<decltype(du)> vidx = Iota(du, 0);
for (; idx + N <= count; idx += N) {
StoreU(func(d, vidx), d, out + idx);
vidx = Add(vidx, Set(du, static_cast<TU>(N)));
}
// `count` was a multiple of the vector length `N`: already done.
if (HWY_UNLIKELY(idx == count)) return;
#if HWY_MEM_OPS_MIGHT_FAULT
// Proceed one by one.
const CappedTag<T, 1> d1;
const RebindToUnsigned<decltype(d1)> du1;
for (; idx < count; ++idx) {
StoreU(func(d1, Set(du1, static_cast<TU>(idx))), d1, out + idx);
}
#else
const size_t remaining = count - idx;
HWY_DASSERT(0 != remaining && remaining < N);
const Mask<D> mask = FirstN(d, remaining);
BlendedStore(func(d, vidx), mask, d, out + idx);
#endif
}
// Replaces `inout[idx]` with `func(d, inout[idx])`. Example usage: multiplying
// array elements by a constant.
template <class D, class Func, typename T = TFromD<D>>
void Transform(D d, T* HWY_RESTRICT inout, size_t count, const Func& func) {
const size_t N = Lanes(d);
size_t idx = 0;
for (; idx + N <= count; idx += N) {
const Vec<D> v = LoadU(d, inout + idx);
StoreU(func(d, v), d, inout + idx);
}
// `count` was a multiple of the vector length `N`: already done.
if (HWY_UNLIKELY(idx == count)) return;
#if HWY_MEM_OPS_MIGHT_FAULT
// Proceed one by one.
const CappedTag<T, 1> d1;
for (; idx < count; ++idx) {
using V1 = Vec<decltype(d1)>;
const V1 v = LoadU(d1, inout + idx);
StoreU(func(d1, v), d1, inout + idx);
}
#else
const size_t remaining = count - idx;
HWY_DASSERT(0 != remaining && remaining < N);
const Mask<D> mask = FirstN(d, remaining);
const Vec<D> v = MaskedLoad(mask, d, inout + idx);
BlendedStore(func(d, v), mask, d, inout + idx);
#endif
}
// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx])`. Example usage:
// multiplying array elements by those of another array.
template <class D, class Func, typename T = TFromD<D>>
void Transform1(D d, T* HWY_RESTRICT inout, size_t count,
const T* HWY_RESTRICT in1, const Func& func) {
const size_t N = Lanes(d);
size_t idx = 0;
for (; idx + N <= count; idx += N) {
const Vec<D> v = LoadU(d, inout + idx);
const Vec<D> v1 = LoadU(d, in1 + idx);
StoreU(func(d, v, v1), d, inout + idx);
}
// `count` was a multiple of the vector length `N`: already done.
if (HWY_UNLIKELY(idx == count)) return;
#if HWY_MEM_OPS_MIGHT_FAULT
// Proceed one by one.
const CappedTag<T, 1> d1;
for (; idx < count; ++idx) {
using V1 = Vec<decltype(d1)>;
const V1 v = LoadU(d1, inout + idx);
const V1 v1 = LoadU(d1, in1 + idx);
StoreU(func(d1, v, v1), d1, inout + idx);
}
#else
const size_t remaining = count - idx;
HWY_DASSERT(0 != remaining && remaining < N);
const Mask<D> mask = FirstN(d, remaining);
const Vec<D> v = MaskedLoad(mask, d, inout + idx);
const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx);
BlendedStore(func(d, v, v1), mask, d, inout + idx);
#endif
}
// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx], in2[idx])`. Example
// usage: FMA of elements from three arrays, stored into the first array.
template <class D, class Func, typename T = TFromD<D>>
void Transform2(D d, T* HWY_RESTRICT inout, size_t count,
const T* HWY_RESTRICT in1, const T* HWY_RESTRICT in2,
const Func& func) {
const size_t N = Lanes(d);
size_t idx = 0;
for (; idx + N <= count; idx += N) {
const Vec<D> v = LoadU(d, inout + idx);
const Vec<D> v1 = LoadU(d, in1 + idx);
const Vec<D> v2 = LoadU(d, in2 + idx);
StoreU(func(d, v, v1, v2), d, inout + idx);
}
// `count` was a multiple of the vector length `N`: already done.
if (HWY_UNLIKELY(idx == count)) return;
#if HWY_MEM_OPS_MIGHT_FAULT
// Proceed one by one.
const CappedTag<T, 1> d1;
for (; idx < count; ++idx) {
using V1 = Vec<decltype(d1)>;
const V1 v = LoadU(d1, inout + idx);
const V1 v1 = LoadU(d1, in1 + idx);
const V1 v2 = LoadU(d1, in2 + idx);
StoreU(func(d1, v, v1, v2), d1, inout + idx);
}
#else
const size_t remaining = count - idx;
HWY_DASSERT(0 != remaining && remaining < N);
const Mask<D> mask = FirstN(d, remaining);
const Vec<D> v = MaskedLoad(mask, d, inout + idx);
const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx);
const Vec<D> v2 = MaskedLoad(mask, d, in2 + idx);
BlendedStore(func(d, v, v1, v2), mask, d, inout + idx);
#endif
}
template <class D, typename T = TFromD<D>>
void Replace(D d, T* HWY_RESTRICT inout, size_t count, T new_t, T old_t) {
const size_t N = Lanes(d);
const Vec<D> old_v = Set(d, old_t);
const Vec<D> new_v = Set(d, new_t);
size_t idx = 0;
for (; idx + N <= count; idx += N) {
Vec<D> v = LoadU(d, inout + idx);
StoreU(IfThenElse(Eq(v, old_v), new_v, v), d, inout + idx);
}
// `count` was a multiple of the vector length `N`: already done.
if (HWY_UNLIKELY(idx == count)) return;
#if HWY_MEM_OPS_MIGHT_FAULT
// Proceed one by one.
const CappedTag<T, 1> d1;
const Vec<decltype(d1)> old_v1 = Set(d1, old_t);
const Vec<decltype(d1)> new_v1 = Set(d1, new_t);
for (; idx < count; ++idx) {
using V1 = Vec<decltype(d1)>;
const V1 v1 = LoadU(d1, inout + idx);
StoreU(IfThenElse(Eq(v1, old_v1), new_v1, v1), d1, inout + idx);
}
#else
const size_t remaining = count - idx;
HWY_DASSERT(0 != remaining && remaining < N);
const Mask<D> mask = FirstN(d, remaining);
const Vec<D> v = MaskedLoad(mask, d, inout + idx);
BlendedStore(IfThenElse(Eq(v, old_v), new_v, v), mask, d, inout + idx);
#endif
}
template <class D, class Func, typename T = TFromD<D>>
void ReplaceIf(D d, T* HWY_RESTRICT inout, size_t count, T new_t,
const Func& func) {
const size_t N = Lanes(d);
const Vec<D> new_v = Set(d, new_t);
size_t idx = 0;
for (; idx + N <= count; idx += N) {
Vec<D> v = LoadU(d, inout + idx);
StoreU(IfThenElse(func(d, v), new_v, v), d, inout + idx);
}
// `count` was a multiple of the vector length `N`: already done.
if (HWY_UNLIKELY(idx == count)) return;
#if HWY_MEM_OPS_MIGHT_FAULT
// Proceed one by one.
const CappedTag<T, 1> d1;
const Vec<decltype(d1)> new_v1 = Set(d1, new_t);
for (; idx < count; ++idx) {
using V1 = Vec<decltype(d1)>;
const V1 v = LoadU(d1, inout + idx);
StoreU(IfThenElse(func(d1, v), new_v1, v), d1, inout + idx);
}
#else
const size_t remaining = count - idx;
HWY_DASSERT(0 != remaining && remaining < N);
const Mask<D> mask = FirstN(d, remaining);
const Vec<D> v = MaskedLoad(mask, d, inout + idx);
BlendedStore(IfThenElse(func(d, v), new_v, v), mask, d, inout + idx);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
@@ -0,0 +1,372 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h> // memcpy
#include "hwy/aligned_allocator.h"
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/algo/transform_test.cc" //NOLINT
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/algo/transform-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
// If your project requires C++14 or later, you can ignore this and pass lambdas
// directly to Transform, without requiring an lvalue as we do here for C++11.
#if __cplusplus < 201402L
#define HWY_GENERIC_LAMBDA 0
#else
#define HWY_GENERIC_LAMBDA 1
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Scale factor used by the SCAL/AXPY reference and SIMD code: the arbitrary
// scalar 1.5 converted to T.
template <typename T>
T Alpha() {
  constexpr double kAlpha = 1.5;
  return static_cast<T>(kAlpha);
}
// Produces a random value in [-8, 8), a range small enough that the test
// computations do not exceed float32 precision.
template <typename T>
T Random(RandomState& rng) {
  // Ten random bits give an integer in [0, 1024).
  const int32_t low10 = static_cast<int32_t>(Random32(&rng)) & 1023;
  // Centre on zero and scale by 1/64.
  const int32_t centered = low10 - 512;
  const double val = centered / 64.0;
  // Clamp negative to zero for unsigned types.
  return static_cast<T>(HWY_MAX(hwy::LowestValue<T>(), val));
}
// SCAL, AXPY names are from BLAS.
// Scalar reference for SCAL: out[i] = Alpha * x[i] for i in [0, count).
template <typename T>
HWY_NOINLINE void SimpleSCAL(const T* x, T* out, size_t count) {
  size_t i = 0;
  while (i < count) {
    out[i] = Alpha<T>() * x[i];
    ++i;
  }
}
// Scalar reference for AXPY: out[i] = Alpha * x[i] + y[i] for i in [0, count).
template <typename T>
HWY_NOINLINE void SimpleAXPY(const T* x, const T* y, T* out, size_t count) {
  size_t i = 0;
  while (i < count) {
    // Kept as a single expression, exactly as in the reference formula.
    out[i] = Alpha<T>() * x[i] + y[i];
    ++i;
  }
}
// Scalar reference for the three-input multiply-add:
// out[i] = x[i] * y[i] + z[i] for i in [0, count).
template <typename T>
HWY_NOINLINE void SimpleFMA4(const T* x, const T* y, const T* z, T* out,
                             size_t count) {
  size_t i = 0;
  while (i < count) {
    // Kept as a single expression, exactly as in the reference formula.
    out[i] = x[i] * y[i] + z[i];
    ++i;
  }
}
// In C++14, we can instead define these as generic lambdas next to where they
// are invoked.
#if !HWY_GENERIC_LAMBDA
// Generator functor for Generate: doubles the index vector, so the output
// holds even numbers.
struct Gen2 {
  template <class D, class VU>
  Vec<D> operator()(D d, VU vidx) const {
    const VU doubled = Add(vidx, vidx);
    return BitCast(d, doubled);
  }
};
// Functor for Transform: multiplies each lane of `v` by Alpha.
struct SCAL {
  template <class D, class V>
  Vec<D> operator()(D d, V v) const {
    using T = TFromD<D>;
    const auto alpha = Set(d, Alpha<T>());
    return Mul(alpha, v);
  }
};
// Functor for Transform1: passes Alpha, `v` and `v1` to MulAdd.
struct AXPY {
  template <class D, class V>
  Vec<D> operator()(D d, V v, V v1) const {
    using T = TFromD<D>;
    const auto alpha = Set(d, Alpha<T>());
    return MulAdd(alpha, v, v1);
  }
};
// Functor for Transform2: passes `v`, `v1` and `v2` to MulAdd; the tag
// argument is unused.
struct FMA4 {
  template <class D, class V>
  Vec<D> operator()(D /*d*/, V v, V v1, V v2) const {
    const Vec<D> fused = MulAdd(v, v1, v2);
    return fused;
  }
};
#endif // !HWY_GENERIC_LAMBDA
// Runs Test (e.g. TestTransform1) for every count in [0, 2 * Lanes(d))
// combined with every pair of misalignments from {0, N / 4, 3 * N / 5}. The
// lane type T is supplied by the type dispatcher that wraps this functor.
template <class Test>
struct ForeachCountAndMisalign {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) const {
    RandomState rng;
    const size_t N = Lanes(d);
    constexpr size_t kNumMisalign = 3;
    const size_t misalignments[kNumMisalign] = {0, N / 4, 3 * N / 5};
    const size_t end_count = 2 * N;

    for (size_t count = 0; count < end_count; ++count) {
      for (size_t ia = 0; ia < kNumMisalign; ++ia) {
        for (size_t ib = 0; ib < kNumMisalign; ++ib) {
          Test()(d, count, misalignments[ia], misalignments[ib], rng);
        }
      }
    }
  }
};
// Output-only, no loads
struct TestGenerate {
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t /*misalign_b*/,
RandomState& /*rng*/) {
using T = TFromD<D>;
AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + count + 1);
T* actual = pa.get() + misalign_a;
AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
for (size_t i = 0; i < count; ++i) {
expected[i] = static_cast<T>(2 * i);
}
// TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that
// the attribute also applies to lambdas? If so, remove HWY_ATTR.
#if HWY_GENERIC_LAMBDA
const auto gen2 = [](const auto d, const auto vidx)
HWY_ATTR { return BitCast(d, Add(vidx, vidx)); };
#else
const Gen2 gen2;
#endif
actual[count] = T{0}; // sentinel
Generate(d, actual, count, gen2);
HWY_ASSERT_EQ(T{0}, actual[count]); // did not write past end
const auto info = hwy::detail::MakeTypeInfo<T>();
const char* target_name = hwy::TargetName(HWY_TARGET);
hwy::detail::AssertArrayEqual(info, expected.get(), actual, count,
target_name, __FILE__, __LINE__);
}
};
// Zero extra input arrays
struct TestTransform {
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
RandomState& rng) {
if (misalign_b != 0) return;
using T = TFromD<D>;
// Prevents error if size to allocate is zero.
AlignedFreeUniquePtr<T[]> pa =
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
T* a = pa.get() + misalign_a;
for (size_t i = 0; i < count; ++i) {
a[i] = Random<T>(rng);
}
AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
SimpleSCAL(a, expected.get(), count);
// TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that
// the attribute also applies to lambdas? If so, remove HWY_ATTR.
#if HWY_GENERIC_LAMBDA
const auto scal = [](const auto d, const auto v)
HWY_ATTR { return Mul(Set(d, Alpha<T>()), v); };
#else
const SCAL scal;
#endif
Transform(d, a, count, scal);
const auto info = hwy::detail::MakeTypeInfo<T>();
const char* target_name = hwy::TargetName(HWY_TARGET);
hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name,
__FILE__, __LINE__);
}
};
// One extra input array
struct TestTransform1 {
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
RandomState& rng) {
using T = TFromD<D>;
// Prevents error if size to allocate is zero.
AlignedFreeUniquePtr<T[]> pa =
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
AlignedFreeUniquePtr<T[]> pb =
AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
T* a = pa.get() + misalign_a;
T* b = pb.get() + misalign_b;
for (size_t i = 0; i < count; ++i) {
a[i] = Random<T>(rng);
b[i] = Random<T>(rng);
}
AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
SimpleAXPY(a, b, expected.get(), count);
#if HWY_GENERIC_LAMBDA
const auto axpy = [](const auto d, const auto v, const auto v1) HWY_ATTR {
return MulAdd(Set(d, Alpha<T>()), v, v1);
};
#else
const AXPY axpy;
#endif
Transform1(d, a, count, b, axpy);
const auto info = hwy::detail::MakeTypeInfo<T>();
const char* target_name = hwy::TargetName(HWY_TARGET);
hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name,
__FILE__, __LINE__);
}
};
// Two extra input arrays
struct TestTransform2 {
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
RandomState& rng) {
using T = TFromD<D>;
// Prevents error if size to allocate is zero.
AlignedFreeUniquePtr<T[]> pa =
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
AlignedFreeUniquePtr<T[]> pb =
AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
AlignedFreeUniquePtr<T[]> pc =
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
T* a = pa.get() + misalign_a;
T* b = pb.get() + misalign_b;
T* c = pc.get() + misalign_a;
for (size_t i = 0; i < count; ++i) {
a[i] = Random<T>(rng);
b[i] = Random<T>(rng);
c[i] = Random<T>(rng);
}
AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
SimpleFMA4(a, b, c, expected.get(), count);
#if HWY_GENERIC_LAMBDA
const auto fma4 = [](auto /*d*/, auto v, auto v1, auto v2)
HWY_ATTR { return MulAdd(v, v1, v2); };
#else
const FMA4 fma4;
#endif
Transform2(d, a, count, b, c, fma4);
const auto info = hwy::detail::MakeTypeInfo<T>();
const char* target_name = hwy::TargetName(HWY_TARGET);
hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name,
__FILE__, __LINE__);
}
};
// Predicate functor (used with ReplaceIf below): yields the mask of lanes in
// `v` that compare equal to the value given at construction.
template <typename T>
class IfEq {
 public:
  IfEq(T val) : val_(val) {}

  template <class D, class V>
  Mask<D> operator()(D d, V v) const {
    const auto wanted = Set(d, val_);
    return Eq(v, wanted);
  }

 private:
  T val_;  // Value to compare each lane against.
};
// Verifies Replace and ReplaceIf against a scalar reference: for every index
// `pos`, all elements equal to a[pos] must be replaced by a new random value.
struct TestReplace {
  template <class D>
  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
                  RandomState& rng) {
    // Only one input array is used, so run for a single value of misalign_b.
    if (misalign_b != 0) return;
    // Nothing to replace; also avoids zero-sized allocations below.
    if (count == 0) return;

    using T = TFromD<D>;
    AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + count);
    T* a = pa.get() + misalign_a;
    for (size_t i = 0; i < count; ++i) {
      a[i] = Random<T>(rng);
    }
    AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(count);
    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(count);

    // Every index is tested. (A vector of random positions used to be
    // generated here but was never read, so it has been removed.)
    for (size_t pos = 0; pos < count; ++pos) {
      const T old_t = a[pos];
      const T new_t = Random<T>(rng);
      for (size_t i = 0; i < count; ++i) {
        expected[i] = IsEqual(a[i], old_t) ? new_t : a[i];
      }

      // Copy so ReplaceIf gets the same input (and thus also outputs expected)
      memcpy(pb.get(), a, count * sizeof(T));

      Replace(d, a, count, new_t, old_t);
      HWY_ASSERT_ARRAY_EQ(expected.get(), a, count);

      ReplaceIf(d, pb.get(), count, new_t, IfEq<T>(old_t));
      HWY_ASSERT_ARRAY_EQ(expected.get(), pb.get(), count);
    }
  }
};
void TestAllGenerate() {
// The test BitCast-s the indices, which does not work for floats.
ForIntegerTypes(ForPartialVectors<ForeachCountAndMisalign<TestGenerate>>());
}
void TestAllTransform() {
ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform>>());
}
void TestAllTransform1() {
ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform1>>());
}
void TestAllTransform2() {
ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform2>>());
}
void TestAllReplace() {
ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestReplace>>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
// Test registration for the TransformTest suite.
// NOTE(review): guarded by HWY_ONCE, presumably so that it is emitted a single
// time rather than once per compiled target - confirm against foreach_target.h.
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(TransformTest);
// One entry per TestAll* function defined above.
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllGenerate);
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform);
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform1);
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform2);
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllReplace);
} // namespace hwy
#endif
+252
View File
@@ -0,0 +1,252 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Include guard (still compiled once per target)
#include <cmath>
#if defined(HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
#undef HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
#endif
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Dot product of two arrays. Compute() has two overloads: one for float or
// double inputs (result type T) and one for bfloat16_t inputs (result type
// float). Both accept a bitfield of Assumptions about num_elements.
struct Dot {
// Specify zero or more of these, ORed together, as the kAssumptions template
// argument to Compute. Each one may improve performance or reduce code size,
// at the cost of additional requirements on the arguments.
enum Assumptions {
// num_elements is at least N, which may be up to HWY_MAX_BYTES / sizeof(T).
kAtLeastOneVector = 1,
// num_elements is divisible by N (a power of two, so this can be used if
// the problem size is known to be a power of two >= HWY_MAX_BYTES /
// sizeof(T)).
kMultipleOfVector = 2,
// RoundUpTo(num_elements, N) elements are accessible; their value does not
// matter (will be treated as if they were zero).
kPaddedToVector = 4,
};
// Returns sum{pa[i] * pb[i]} for float or double inputs. Aligning the
// pointers to a multiple of N elements is helpful but not required.
// NOTE(review): HWY_IF_NOT_LANE_SIZE_D(D, 2) presumably removes this overload
// for 2-byte lanes so that bfloat16_t selects the overload below - confirm.
template <int kAssumptions, class D, typename T = TFromD<D>,
HWY_IF_NOT_LANE_SIZE_D(D, 2)>
static HWY_INLINE T Compute(const D d, const T* const HWY_RESTRICT pa,
const T* const HWY_RESTRICT pb,
const size_t num_elements) {
static_assert(IsFloat<T>(), "MulAdd requires float type");
using V = decltype(Zero(d));
const size_t N = Lanes(d);
// Index of the next element to process; shared by all loops below.
size_t i = 0;
// Decode the assumption bits once, as compile-time constants.
constexpr bool kIsAtLeastOneVector =
(kAssumptions & kAtLeastOneVector) != 0;
constexpr bool kIsMultipleOfVector =
(kAssumptions & kMultipleOfVector) != 0;
constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
// Won't be able to do a full vector load without padding => scalar loop.
// Only reachable when the caller made none of the three assumptions.
if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
HWY_UNLIKELY(num_elements < N)) {
// Only 2x unroll to avoid excessive code size.
T sum0 = T(0);
T sum1 = T(0);
for (; i + 2 <= num_elements; i += 2) {
sum0 += pa[i + 0] * pb[i + 0];
sum1 += pa[i + 1] * pb[i + 1];
}
// At most one element remains after the 2x unrolled loop.
if (i < num_elements) {
sum1 += pa[i] * pb[i];
}
return sum0 + sum1;
}
// Compiler doesn't make independent sum* accumulators, so unroll manually.
// 2 FMA ports * 4 cycle latency = up to 8 in-flight, but that is excessive
// for unaligned inputs (each unaligned pointer halves the throughput
// because it occupies both L1 load ports for a cycle). We cannot have
// arrays of vectors on RVV/SVE, so always unroll 4x.
V sum0 = Zero(d);
V sum1 = Zero(d);
V sum2 = Zero(d);
V sum3 = Zero(d);
// Main loop: unrolled
for (; i + 4 * N <= num_elements; /* i += 4 * N */) { // incr in loop
const auto a0 = LoadU(d, pa + i);
const auto b0 = LoadU(d, pb + i);
i += N;
sum0 = MulAdd(a0, b0, sum0);
const auto a1 = LoadU(d, pa + i);
const auto b1 = LoadU(d, pb + i);
i += N;
sum1 = MulAdd(a1, b1, sum1);
const auto a2 = LoadU(d, pa + i);
const auto b2 = LoadU(d, pb + i);
i += N;
sum2 = MulAdd(a2, b2, sum2);
const auto a3 = LoadU(d, pa + i);
const auto b3 = LoadU(d, pb + i);
i += N;
sum3 = MulAdd(a3, b3, sum3);
}
// Up to 3 iterations of whole vectors
for (; i + N <= num_elements; i += N) {
const auto a = LoadU(d, pa + i);
const auto b = LoadU(d, pb + i);
sum0 = MulAdd(a, b, sum0);
}
// Remainder of fewer than N elements; skipped entirely if the caller
// promised that num_elements is a multiple of N.
if (!kIsMultipleOfVector) {
const size_t remaining = num_elements - i;
if (remaining != 0) {
if (kIsPaddedToVector) {
// Loading a whole vector is allowed thanks to the padding; lanes at or
// beyond `remaining` are replaced by zero before accumulating.
const auto mask = FirstN(d, remaining);
const auto a = LoadU(d, pa + i);
const auto b = LoadU(d, pb + i);
sum1 = MulAdd(IfThenElseZero(mask, a), IfThenElseZero(mask, b), sum1);
} else {
// Unaligned load such that the last element is in the highest lane -
// ensures we do not touch any elements outside the valid range.
// If we get here, then num_elements >= N.
HWY_DASSERT(i >= N);
// Step back so that the vector ends exactly at num_elements. Its first
// N - remaining lanes were already accumulated above, hence zeroed here.
i += remaining - N;
const auto skip = FirstN(d, N - remaining);
const auto a = LoadU(d, pa + i); // always unaligned
const auto b = LoadU(d, pb + i);
sum1 = MulAdd(IfThenZeroElse(skip, a), IfThenZeroElse(skip, b), sum1);
}
}
} // kMultipleOfVector
// Reduction tree: sum of all accumulators by pairs, then across lanes.
sum0 = Add(sum0, sum1);
sum2 = Add(sum2, sum3);
sum0 = Add(sum0, sum2);
return GetLane(SumOfLanes(d, sum0));
}
// Returns sum{pa[i] * pb[i]} for bfloat16 inputs. Aligning the pointers to a
// multiple of N elements is helpful but not required.
// Products are accumulated in float vectors (tag df32) and the result is float.
template <int kAssumptions, class D>
static HWY_INLINE float Compute(const D d,
const bfloat16_t* const HWY_RESTRICT pa,
const bfloat16_t* const HWY_RESTRICT pb,
const size_t num_elements) {
// du16 is used for masking the 16-bit lanes; df32 for the accumulators.
const RebindToUnsigned<D> du16;
const Repartition<float, D> df32;
using V = decltype(Zero(df32));
// Number of bfloat16 lanes per vector.
const size_t N = Lanes(d);
// Index of the next element to process; shared by all loops below.
size_t i = 0;
// Decode the assumption bits once, as compile-time constants.
constexpr bool kIsAtLeastOneVector =
(kAssumptions & kAtLeastOneVector) != 0;
constexpr bool kIsMultipleOfVector =
(kAssumptions & kMultipleOfVector) != 0;
constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
// Won't be able to do a full vector load without padding => scalar loop.
if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
HWY_UNLIKELY(num_elements < N)) {
float sum0 = 0.0f; // Only 2x unroll to avoid excessive code size for..
float sum1 = 0.0f; // this unlikely(?) case.
for (; i + 2 <= num_elements; i += 2) {
sum0 += F32FromBF16(pa[i + 0]) * F32FromBF16(pb[i + 0]);
sum1 += F32FromBF16(pa[i + 1]) * F32FromBF16(pb[i + 1]);
}
// At most one element remains after the 2x unrolled loop.
if (i < num_elements) {
sum1 += F32FromBF16(pa[i]) * F32FromBF16(pb[i]);
}
return sum0 + sum1;
}
// See comment in the other Compute() overload. Unroll 2x, but we need
// twice as many sums for ReorderWidenMulAccumulate.
// NOTE(review): the last argument of ReorderWidenMulAccumulate (sum1/sum3)
// is presumably an in/out second accumulator - confirm in the op's docs.
V sum0 = Zero(df32);
V sum1 = Zero(df32);
V sum2 = Zero(df32);
V sum3 = Zero(df32);
// Main loop: unrolled
for (; i + 2 * N <= num_elements; /* i += 2 * N */) { // incr in loop
const auto a0 = LoadU(d, pa + i);
const auto b0 = LoadU(d, pb + i);
i += N;
sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1);
const auto a1 = LoadU(d, pa + i);
const auto b1 = LoadU(d, pb + i);
i += N;
sum2 = ReorderWidenMulAccumulate(df32, a1, b1, sum2, sum3);
}
// Possibly one more iteration of whole vectors
if (i + N <= num_elements) {
const auto a0 = LoadU(d, pa + i);
const auto b0 = LoadU(d, pb + i);
i += N;
sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1);
}
// Remainder of fewer than N elements; skipped entirely if the caller
// promised that num_elements is a multiple of N.
if (!kIsMultipleOfVector) {
const size_t remaining = num_elements - i;
if (remaining != 0) {
if (kIsPaddedToVector) {
// Loading a whole vector is allowed thanks to the padding; lanes at or
// beyond `remaining` are zeroed via their 16-bit unsigned representation.
const auto mask = FirstN(du16, remaining);
const auto va = LoadU(d, pa + i);
const auto vb = LoadU(d, pb + i);
const auto a16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, va)));
const auto b16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, vb)));
sum2 = ReorderWidenMulAccumulate(df32, a16, b16, sum2, sum3);
} else {
// Unaligned load such that the last element is in the highest lane -
// ensures we do not touch any elements outside the valid range.
// If we get here, then num_elements >= N.
HWY_DASSERT(i >= N);
// Step back so that the vector ends exactly at num_elements. Its first
// N - remaining lanes were already accumulated above, hence zeroed here.
i += remaining - N;
const auto skip = FirstN(du16, N - remaining);
const auto va = LoadU(d, pa + i); // always unaligned
const auto vb = LoadU(d, pb + i);
const auto a16 = BitCast(d, IfThenZeroElse(skip, BitCast(du16, va)));
const auto b16 = BitCast(d, IfThenZeroElse(skip, BitCast(du16, vb)));
sum2 = ReorderWidenMulAccumulate(df32, a16, b16, sum2, sum3);
}
}
} // kMultipleOfVector
// Reduction tree: sum of all accumulators by pairs, then across lanes.
sum0 = Add(sum0, sum1);
sum2 = Add(sum2, sum3);
sum0 = Add(sum0, sum2);
return GetLane(SumOfLanes(df32, sum0));
}
};
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
@@ -0,0 +1,167 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include "hwy/aligned_allocator.h"
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/dot/dot_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/dot/dot-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Scalar reference dot product. Each product is formed in T and then added to
// a double accumulator; the total is converted back to T on return.
template <typename T>
HWY_NOINLINE T SimpleDot(const T* pa, const T* pb, size_t num) {
  double total = 0.0;
  size_t idx = 0;
  while (idx < num) {
    total += pa[idx] * pb[idx];
    ++idx;
  }
  return static_cast<T>(total);
}
// Scalar reference dot product for bfloat16 inputs: both factors are widened
// to float via F32FromBF16 and accumulated in float.
HWY_NOINLINE float SimpleDot(const bfloat16_t* pa, const bfloat16_t* pb,
                             size_t num) {
  float total = 0.0f;
  size_t idx = 0;
  while (idx < num) {
    total += F32FromBF16(pa[idx]) * F32FromBF16(pb[idx]);
    ++idx;
  }
  return total;
}
// Stores `value`, converted to T via static_cast, into *ptr.
template <typename T>
void SetValue(const float value, T* HWY_RESTRICT ptr) {
  const T converted = static_cast<T>(value);
  *ptr = converted;
}
void SetValue(const float value, bfloat16_t* HWY_RESTRICT ptr) {
*ptr = BF16FromF32(value);
}
class TestDot {
// Computes/verifies one dot product.
template <int kAssumptions, class D>
void Test(D d, size_t num, size_t misalign_a, size_t misalign_b,
RandomState& rng) {
using T = TFromD<D>;
const size_t N = Lanes(d);
const auto random_t = [&rng]() {
const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
return static_cast<float>(bits - 512) * (1.0f / 64);
};
const size_t padded =
(kAssumptions & Dot::kPaddedToVector) ? RoundUpTo(num, N) : num;
AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + padded);
AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(misalign_b + padded);
T* a = pa.get() + misalign_a;
T* b = pb.get() + misalign_b;
size_t i = 0;
for (; i < num; ++i) {
SetValue(random_t(), a + i);
SetValue(random_t(), b + i);
}
// Fill padding with NaN - the values are not used, but avoids MSAN errors.
for (; i < padded; ++i) {
ScalableTag<float> df1;
SetValue(GetLane(NaN(df1)), a + i);
SetValue(GetLane(NaN(df1)), b + i);
}
const auto expected = SimpleDot(a, b, num);
const auto actual = Dot::Compute<kAssumptions>(d, a, b, num);
const auto max = static_cast<decltype(actual)>(8 * 8 * num);
HWY_ASSERT(-max <= actual && actual <= max);
HWY_ASSERT(expected - 1E-4 <= actual && actual <= expected + 1E-4);
}
// Runs tests with various alignments.
template <int kAssumptions, class D>
void ForeachMisalign(D d, size_t num, RandomState& rng) {
const size_t N = Lanes(d);
const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
for (size_t ma : misalignments) {
for (size_t mb : misalignments) {
Test<kAssumptions>(d, num, ma, mb, rng);
}
}
}
// Runs tests with various lengths compatible with the given assumptions.
template <int kAssumptions, class D>
void ForeachCount(D d, RandomState& rng) {
const size_t N = Lanes(d);
const size_t counts[] = {1,
3,
7,
16,
HWY_MAX(N / 2, 1),
HWY_MAX(2 * N / 3, 1),
N,
N + 1,
4 * N / 3,
3 * N,
8 * N,
8 * N + 2};
for (size_t num : counts) {
if ((kAssumptions & Dot::kAtLeastOneVector) && num < N) continue;
if ((kAssumptions & Dot::kMultipleOfVector) && (num % N) != 0) continue;
ForeachMisalign<kAssumptions>(d, num, rng);
}
}
public:
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
// All 8 combinations of the three length-related flags:
ForeachCount<0>(d, rng);
ForeachCount<Dot::kAtLeastOneVector>(d, rng);
ForeachCount<Dot::kMultipleOfVector>(d, rng);
ForeachCount<Dot::kMultipleOfVector | Dot::kAtLeastOneVector>(d, rng);
ForeachCount<Dot::kPaddedToVector>(d, rng);
ForeachCount<Dot::kPaddedToVector | Dot::kAtLeastOneVector>(d, rng);
ForeachCount<Dot::kPaddedToVector | Dot::kMultipleOfVector>(d, rng);
ForeachCount<Dot::kPaddedToVector | Dot::kMultipleOfVector |
Dot::kAtLeastOneVector>(d, rng);
}
};
void TestAllDot() { ForFloatTypes(ForPartialVectors<TestDot>()); }
void TestAllDotBF16() { ForShrinkableVectors<TestDot>()(bfloat16_t()); }
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
// Test registration for the DotTest suite.
// NOTE(review): guarded by HWY_ONCE, presumably so that it is emitted a single
// time rather than once per compiled target - confirm against foreach_target.h.
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(DotTest);
// One entry per TestAll* function defined above.
HWY_EXPORT_AND_TEST_P(DotTest, TestAllDot);
HWY_EXPORT_AND_TEST_P(DotTest, TestAllDotBF16);
} // namespace hwy
#endif
@@ -0,0 +1,145 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/image/image.h"
#include <algorithm> // swap
#include <cstddef>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/image/image.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
size_t GetVectorSize() { return Lanes(ScalableTag<uint8_t>()); }
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
// File-local: makes the per-target GetVectorSize functions defined above
// available to HWY_DYNAMIC_DISPATCH in ImageBase::VectorSize().
namespace {
HWY_EXPORT(GetVectorSize); // Local function.
} // namespace
// Returns GetVectorSize() of the target chosen by dynamic dispatch.
size_t ImageBase::VectorSize() {
  // Do not cache result - must return the current value, which may be greater
  // than the first call if it was subject to DisableTargets!
  const size_t current_vector_bytes = HWY_DYNAMIC_DISPATCH(GetVectorSize)();
  return current_vector_bytes;
}
// Returns the distance in bytes between the starts of two consecutive rows of
// an image with `xsize` pixels of `sizeof_t` bytes each. The result is a
// multiple of max(VectorSize(), HWY_ALIGNMENT) and, where that is possible,
// not a multiple of 2 KiB.
size_t ImageBase::BytesPerRow(const size_t xsize, const size_t sizeof_t) {
  // Row strides that are a multiple of this many bytes are avoided, see below.
  constexpr size_t kAlias = 2048;

  const size_t vec_size = VectorSize();
  size_t valid_bytes = xsize * sizeof_t;

  // Allow unaligned accesses starting at the last valid value - this may raise
  // msan errors unless the user calls InitializePaddingForUnalignedAccesses.
  // Skip for the scalar case because no extra lanes will be loaded.
  if (vec_size != 1) {
    HWY_DASSERT(vec_size >= sizeof_t);
    valid_bytes += vec_size - sizeof_t;
  }

  // Round up to vector and cache line size.
  const size_t align = HWY_MAX(vec_size, HWY_ALIGNMENT);
  size_t bytes_per_row = RoundUpTo(valid_bytes, align);

  // During the lengthy window before writes are committed to memory, CPUs
  // guard against read after write hazards by checking the address, but
  // only the lower 11 bits. We avoid a false dependency between writes to
  // consecutive rows by ensuring their sizes are not multiples of 2 KiB.
  // Avoid2K prevents the same problem for the planes of an Image3.
  //
  // The check must be against kAlias: after the rounding above, the stride is
  // always a multiple of HWY_ALIGNMENT, so testing against HWY_ALIGNMENT would
  // pad every row and could itself produce a multiple of 2 KiB.
  if (bytes_per_row % kAlias == 0) {
    bytes_per_row += align;
  }

  HWY_DASSERT(bytes_per_row % align == 0);
  return bytes_per_row;
}
// Allocating constructor: owns xsize x ysize pixels of sizeof_t bytes each,
// or nothing at all if either dimension is zero.
ImageBase::ImageBase(const size_t xsize, const size_t ysize,
                     const size_t sizeof_t)
    : xsize_(static_cast<uint32_t>(xsize)),
      ysize_(static_cast<uint32_t>(ysize)),
      bytes_per_row_(0),
      bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {
  HWY_ASSERT(sizeof_t == 1 || sizeof_t == 2 || sizeof_t == 4 || sizeof_t == 8);

  // Dimensions can be zero, e.g. for lazily-allocated images. Only allocate
  // if nonzero, because "zero" bytes still have padding/bookkeeping overhead.
  if (xsize == 0 || ysize == 0) return;

  bytes_per_row_ = BytesPerRow(xsize, sizeof_t);
  bytes_ = AllocateAligned<uint8_t>(bytes_per_row_ * ysize);
  HWY_ASSERT(bytes_.get() != nullptr);
  InitializePadding(sizeof_t, Padding::kRoundUp);
}
// Non-owning constructor: wraps caller-provided memory at `aligned` with the
// given row stride. The DoNothing freer means `aligned` is not released by
// this object. Both the stride and the address must be multiples of
// VectorSize(); this is checked via HWY_ASSERT.
ImageBase::ImageBase(const size_t xsize, const size_t ysize,
const size_t bytes_per_row, void* const aligned)
: xsize_(static_cast<uint32_t>(xsize)),
ysize_(static_cast<uint32_t>(ysize)),
bytes_per_row_(bytes_per_row),
bytes_(static_cast<uint8_t*>(aligned),
AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {
const size_t vec_size = VectorSize();
// Every row must start at a vector-aligned address.
HWY_ASSERT(bytes_per_row % vec_size == 0);
HWY_ASSERT(reinterpret_cast<uintptr_t>(aligned) % vec_size == 0);
}
// Zero-fills the bytes to the right of the valid pixels in every row, up to
// the extent selected by `padding`. Only active in msan (or IDE) builds;
// otherwise both arguments are unused.
void ImageBase::InitializePadding(const size_t sizeof_t, Padding padding) {
#if HWY_IS_MSAN || HWY_IDE
  if (xsize_ == 0 || ysize_ == 0) return;

  const size_t vector_bytes = VectorSize();  // Bytes, independent of sizeof_t!
  if (vector_bytes == 1) return;             // Scalar mode: no padding needed

  const size_t valid_bytes = xsize_ * sizeof_t;
  size_t init_bytes;
  if (padding == Padding::kRoundUp) {
    init_bytes = RoundUpTo(valid_bytes, vector_bytes);
  } else {
    init_bytes = valid_bytes + vector_bytes - sizeof_t;
  }
  if (init_bytes == valid_bytes) return;  // Nothing beyond the valid pixels.

  for (size_t y = 0; y < ysize_; ++y) {
    uint8_t* HWY_RESTRICT row = static_cast<uint8_t*>(VoidRow(y));
#if defined(__clang__) && (__clang_major__ <= 6)
    // There's a bug in msan in clang-6 when handling AVX2 operations. This
    // workaround allows tests to pass on msan, although it is slower and
    // prevents msan warnings from uninitialized images.
    memset(row, 0, init_bytes);
#else
    memset(row + valid_bytes, 0, init_bytes - valid_bytes);
#endif  // clang6
  }
#else
  (void)sizeof_t;
  (void)padding;
#endif  // HWY_IS_MSAN
}
// Exchanges dimensions, stride and pixel storage with `other`.
void ImageBase::Swap(ImageBase& other) {
  bytes_.swap(other.bytes_);
  std::swap(bytes_per_row_, other.bytes_per_row_);
  std::swap(ysize_, other.ysize_);
  std::swap(xsize_, other.xsize_);
}
} // namespace hwy
#endif // HWY_ONCE
+471
View File
@@ -0,0 +1,471 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_
#define HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_
// SIMD/multicore-friendly planar image representation with row accessors.
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <cstddef>
#include <utility> // std::move
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/highway_export.h"
namespace hwy {
// Type-independent parts of Image<> - reduces code duplication and facilitates
// moving member function implementations to cc file.
// Holds the dimensions, the row stride and the (owned or borrowed) byte
// storage; the element type is only known to the derived Image<T>.
struct HWY_CONTRIB_DLLEXPORT ImageBase {
// Returns required alignment in bytes for externally allocated memory.
static size_t VectorSize();
// Returns distance [bytes] between the start of two consecutive rows, a
// multiple of VectorSize. The implementation also adds padding intended to
// prevent row starts from aliasing in their low address bits (see
// BytesPerRow in the cc file).
static size_t BytesPerRow(const size_t xsize, const size_t sizeof_t);
// No allocation (for output params or unused images)
ImageBase()
: xsize_(0),
ysize_(0),
bytes_per_row_(0),
bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {}
// Allocates memory (this is the common case)
ImageBase(size_t xsize, size_t ysize, size_t sizeof_t);
// References but does not take ownership of external memory. Useful for
// interoperability with other libraries. `aligned` must be aligned to a
// multiple of VectorSize() and `bytes_per_row` must also be a multiple of
// VectorSize() or preferably equal to BytesPerRow().
ImageBase(size_t xsize, size_t ysize, size_t bytes_per_row, void* aligned);
// Copy construction/assignment is forbidden to avoid inadvertent copies,
// which can be very expensive. Use CopyImageTo() instead.
ImageBase(const ImageBase& other) = delete;
ImageBase& operator=(const ImageBase& other) = delete;
// Move constructor (required for returning Image from function)
ImageBase(ImageBase&& other) noexcept = default;
// Move assignment (required for std::vector)
ImageBase& operator=(ImageBase&& other) noexcept = default;
// Exchanges dimensions, stride and storage with `other`.
void Swap(ImageBase& other);
// Useful for pre-allocating image with some padding for alignment purposes
// and later reporting the actual valid dimensions. Caller is responsible
// for ensuring xsize/ysize are <= the original dimensions.
void ShrinkTo(const size_t xsize, const size_t ysize) {
xsize_ = static_cast<uint32_t>(xsize);
ysize_ = static_cast<uint32_t>(ysize);
// NOTE: we can't recompute bytes_per_row for more compact storage and
// better locality because that would invalidate the image contents.
}
// How many pixels.
HWY_INLINE size_t xsize() const { return xsize_; }
HWY_INLINE size_t ysize() const { return ysize_; }
// NOTE: do not use this for copying rows - the valid xsize may be much less.
HWY_INLINE size_t bytes_per_row() const { return bytes_per_row_; }
// Raw access to byte contents, for interfacing with other libraries.
// Unsigned char instead of char to avoid surprises (sign extension).
// The returned pointer is annotated as 64-byte aligned.
HWY_INLINE uint8_t* bytes() {
void* p = bytes_.get();
return static_cast<uint8_t * HWY_RESTRICT>(HWY_ASSUME_ALIGNED(p, 64));
}
HWY_INLINE const uint8_t* bytes() const {
const void* p = bytes_.get();
return static_cast<const uint8_t * HWY_RESTRICT>(HWY_ASSUME_ALIGNED(p, 64));
}
protected:
// Returns pointer to the start of a row.
// The bounds check on `y` is only compiled into sanitizer builds.
HWY_INLINE void* VoidRow(const size_t y) const {
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
if (y >= ysize_) {
HWY_ABORT("Row(%d) >= %u\n", static_cast<int>(y), ysize_);
}
#endif
void* row = bytes_.get() + y * bytes_per_row_;
return HWY_ASSUME_ALIGNED(row, 64);
}
enum class Padding {
// Allow Load(d, row + x) for x = 0; x < xsize(); x += Lanes(d). Default.
kRoundUp,
// Allow LoadU(d, row + x) for x <= xsize() - 1. This requires an extra
// vector to be initialized. If done by default, this would suppress
// legitimate msan warnings. We therefore require users to explicitly call
// InitializePadding before using unaligned loads (e.g. convolution).
kUnaligned
};
// Initializes the minimum bytes required to suppress msan warnings from
// legitimate (according to Padding mode) vector loads/stores on the right
// border, where some lanes are uninitialized and assumed to be unused.
void InitializePadding(size_t sizeof_t, Padding padding);
// (Members are non-const to enable assignment during move-assignment.)
uint32_t xsize_; // In valid pixels, not including any padding.
uint32_t ysize_;
size_t bytes_per_row_; // Includes padding.
// Pixel storage; uses a DoNothing freer when empty or when wrapping
// external memory.
AlignedFreeUniquePtr<uint8_t[]> bytes_;
};
// Single channel, aligned rows separated by padding. T must be POD.
//
// 'Single channel' (one 2D array per channel) simplifies vectorization
// (repeating the same operation on multiple adjacent components) without the
// complexity of a hybrid layout (8 R, 8 G, 8 B, ...). In particular, clients
// can easily iterate over all components in a row and Image requires no
// knowledge of the pixel format beyond the component type "T".
//
// 'Aligned' means each row is aligned to the L1 cache line size. This prevents
// false sharing between two threads operating on adjacent rows.
//
// 'Padding' is still relevant because vectors could potentially be larger than
// a cache line. By rounding up row sizes to the vector size, we allow
// reading/writing ALIGNED vectors whose first lane is a valid sample. This
// avoids needing a separate loop to handle remaining unaligned lanes.
//
// This image layout could also be achieved with a vector and a row accessor
// function, but a class wrapper with support for "deleter" allows wrapping
// existing memory allocated by clients without copying the pixels. It also
// provides convenient accessors for xsize/ysize, which shortens function
// argument lists. Supports move-construction so it can be stored in containers.
template <typename ComponentType>
class Image : public ImageBase {
 public:
  using T = ComponentType;

  // Empty image without any allocation.
  Image() = default;

  // Allocates xsize x ysize components of type T.
  Image(const size_t xsize, const size_t ysize)
      : ImageBase(xsize, ysize, sizeof(T)) {}

  // Wraps external memory without taking ownership.
  Image(const size_t xsize, const size_t ysize, size_t bytes_per_row,
        void* aligned)
      : ImageBase(xsize, ysize, bytes_per_row, aligned) {}

  // Must be called before using unaligned loads near the right border, see
  // Padding::kUnaligned.
  void InitializePaddingForUnalignedAccesses() {
    InitializePadding(sizeof(T), Padding::kUnaligned);
  }

  // Read-only access to row `y`.
  HWY_INLINE const T* ConstRow(const size_t y) const {
    const void* const row = VoidRow(y);
    return static_cast<const T*>(row);
  }
  HWY_INLINE const T* ConstRow(const size_t y) {
    const void* const row = VoidRow(y);
    return static_cast<const T*>(row);
  }

  // Returns pointer to non-const. This allows passing const Image* parameters
  // when the callee is only supposed to fill the pixels, as opposed to
  // allocating or resizing the image.
  HWY_INLINE T* MutableRow(const size_t y) const {
    void* const row = VoidRow(y);
    return static_cast<T*>(row);
  }
  HWY_INLINE T* MutableRow(const size_t y) {
    void* const row = VoidRow(y);
    return static_cast<T*>(row);
  }

  // Returns number of pixels (some of which are padding) per row. Useful for
  // computing other rows via pointer arithmetic. WARNING: this must
  // NOT be used to determine xsize.
  HWY_INLINE intptr_t PixelsPerRow() const {
    const size_t pixels = bytes_per_row_ / sizeof(T);
    return static_cast<intptr_t>(pixels);
  }
};
using ImageF = Image<float>;
// A bundle of 3 same-sized images. To fill an existing Image3 using
// single-channel producers, we also need access to each const Image*. Const
// prevents breaking the same-size invariant, while still allowing pixels to be
// changed via MutableRow.
template <typename ComponentType>
class Image3 {
 public:
  using T = ComponentType;
  using ImageT = Image<T>;
  static constexpr size_t kNumPlanes = 3;

  // Three empty planes without any allocation.
  Image3() : planes_{ImageT(), ImageT(), ImageT()} {}

  // Allocates three planes of xsize x ysize components each.
  Image3(const size_t xsize, const size_t ysize)
      : planes_{ImageT(xsize, ysize), ImageT(xsize, ysize),
                ImageT(xsize, ysize)} {}

  Image3(Image3&& other) noexcept {
    for (size_t i = 0; i < kNumPlanes; i++) {
      planes_[i] = std::move(other.planes_[i]);
    }
  }

  // Takes ownership of three existing planes; aborts unless all three have
  // the same dimensions.
  Image3(ImageT&& plane0, ImageT&& plane1, ImageT&& plane2) {
    if (!SameSize(plane0, plane1) || !SameSize(plane0, plane2)) {
      HWY_ABORT(
          "Not same size: %d x %d, %d x %d, %d x %d\n",
          static_cast<int>(plane0.xsize()), static_cast<int>(plane0.ysize()),
          static_cast<int>(plane1.xsize()), static_cast<int>(plane1.ysize()),
          static_cast<int>(plane2.xsize()), static_cast<int>(plane2.ysize()));
    }
    planes_[0] = std::move(plane0);
    planes_[1] = std::move(plane1);
    planes_[2] = std::move(plane2);
  }

  // Copy construction/assignment is forbidden to avoid inadvertent copies,
  // which can be very expensive. Use CopyImageTo instead.
  Image3(const Image3& other) = delete;
  Image3& operator=(const Image3& other) = delete;

  Image3& operator=(Image3&& other) noexcept {
    for (size_t i = 0; i < kNumPlanes; i++) {
      planes_[i] = std::move(other.planes_[i]);
    }
    return *this;
  }

  // Read-only access to row `y` of plane `c`.
  HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) const {
    return static_cast<const T*>(VoidPlaneRow(c, y));
  }
  HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) {
    return static_cast<const T*>(VoidPlaneRow(c, y));
  }

  // Writable access to row `y` of plane `c`, also for const Image3 (pixels may
  // be changed, the dimensions may not).
  HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) const {
    return static_cast<T*>(VoidPlaneRow(c, y));
  }
  HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) {
    return static_cast<T*>(VoidPlaneRow(c, y));
  }

  HWY_INLINE const ImageT& Plane(size_t idx) const { return planes_[idx]; }

  void Swap(Image3& other) {
    for (size_t c = 0; c < kNumPlanes; ++c) {
      other.planes_[c].Swap(planes_[c]);
    }
  }

  void ShrinkTo(const size_t xsize, const size_t ysize) {
    for (ImageT& plane : planes_) {
      plane.ShrinkTo(xsize, ysize);
    }
  }

  // Sizes of all three images are guaranteed to be equal.
  HWY_INLINE size_t xsize() const { return planes_[0].xsize(); }
  HWY_INLINE size_t ysize() const { return planes_[0].ysize(); }
  // Returns offset [bytes] from one row to the next row of the same plane.
  // WARNING: this must NOT be used to determine xsize, nor for copying rows -
  // the valid xsize may be much less.
  HWY_INLINE size_t bytes_per_row() const { return planes_[0].bytes_per_row(); }
  // Returns number of pixels (some of which are padding) per row. Useful for
  // computing other rows via pointer arithmetic. WARNING: this must NOT be used
  // to determine xsize.
  HWY_INLINE intptr_t PixelsPerRow() const { return planes_[0].PixelsPerRow(); }

 private:
  // Returns pointer to the start of a row.
  // The bounds check is only compiled into sanitizer builds.
  HWY_INLINE void* VoidPlaneRow(const size_t c, const size_t y) const {
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
    if (c >= kNumPlanes || y >= ysize()) {
      HWY_ABORT("PlaneRow(%d, %d) >= %d\n", static_cast<int>(c),
                static_cast<int>(y), static_cast<int>(ysize()));
    }
#endif
    // Use the first plane's stride because the compiler might not realize they
    // are all equal. Thus we only need a single multiplication for all planes.
    const size_t row_offset = y * planes_[0].bytes_per_row();
    const void* row = planes_[c].bytes() + row_offset;
    // bytes() is const-qualified here because this member function is const,
    // but the declared return type is void* (MutablePlaneRow relies on that).
    // Returning a pointer-to-const would not convert to void*, so the
    // qualifier is removed explicitly.
    return const_cast<void*>(
        static_cast<const void*>(HWY_ASSUME_ALIGNED(row, HWY_ALIGNMENT)));
  }

 private:
  ImageT planes_[kNumPlanes];
};
using Image3F = Image3<float>;
// Rectangular region in image(s). Factoring this out of Image instead of
// shifting the pointer by x0/y0 allows this to apply to multiple images with
// different resolutions. Can compare size via SameSize(rect1, rect2).
class Rect {
 public:
  // Most windows are xsize_max * ysize_max, except those on the borders where
  // begin + size_max > end.
  constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize_max,
                 size_t ysize_max, size_t xend, size_t yend)
      : x0_(xbegin),
        y0_(ybegin),
        xsize_(ClampedSize(xbegin, xsize_max, xend)),
        ysize_(ClampedSize(ybegin, ysize_max, yend)) {}

  // Construct with origin and known size (typically from another Rect).
  constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize, size_t ysize)
      : x0_(xbegin), y0_(ybegin), xsize_(xsize), ysize_(ysize) {}

  // Construct a rect that covers a whole image.
  template <typename Image>
  explicit Rect(const Image& image)
      : Rect(0, 0, image.xsize(), image.ysize()) {}

  // Empty rect at the origin.
  Rect() : Rect(0, 0, 0, 0) {}

  Rect(const Rect&) = default;
  Rect& operator=(const Rect&) = default;

  // Returns a rect whose origin is offset by (xbegin, ybegin) from this rect's
  // origin and whose size is at most xsize_max x ysize_max, clamped so that it
  // does not extend past this rect. Does not modify *this, hence const.
  Rect Subrect(size_t xbegin, size_t ybegin, size_t xsize_max,
               size_t ysize_max) const {
    return Rect(x0_ + xbegin, y0_ + ybegin, xsize_max, ysize_max, x0_ + xsize_,
                y0_ + ysize_);
  }

  // Row accessors: `y` is relative to this rect; the returned pointer is
  // already advanced to the rect's first column.
  template <typename T>
  const T* ConstRow(const Image<T>* image, size_t y) const {
    return image->ConstRow(y + y0_) + x0_;
  }

  template <typename T>
  T* MutableRow(const Image<T>* image, size_t y) const {
    return image->MutableRow(y + y0_) + x0_;
  }

  template <typename T>
  const T* ConstPlaneRow(const Image3<T>& image, size_t c, size_t y) const {
    return image.ConstPlaneRow(c, y + y0_) + x0_;
  }

  template <typename T>
  T* MutablePlaneRow(Image3<T>* image, const size_t c, size_t y) const {
    return image->MutablePlaneRow(c, y + y0_) + x0_;
  }

  // Returns true if this Rect fully resides in the given image. ImageT could be
  // Image<T> or Image3<T>; however if ImageT is Rect, results are nonsensical.
  template <class ImageT>
  bool IsInside(const ImageT& image) const {
    const size_t image_xsize = image.xsize();
    const size_t image_ysize = image.ysize();
    // Compare against the remaining extent instead of computing x0_ + xsize_,
    // which could wrap around for very large values.
    return (x0_ <= image_xsize) && (xsize_ <= image_xsize - x0_) &&
           (y0_ <= image_ysize) && (ysize_ <= image_ysize - y0_);
  }

  size_t x0() const { return x0_; }
  size_t y0() const { return y0_; }
  size_t xsize() const { return xsize_; }
  size_t ysize() const { return ysize_; }

 private:
  // Returns size_max, or whatever is left in [begin, end). Written without
  // `begin + size_max` so that a large size_max cannot wrap around and be
  // mistaken for a window that fits.
  static constexpr size_t ClampedSize(size_t begin, size_t size_max,
                                      size_t end) {
    return (end <= begin)
               ? 0
               : ((size_max < end - begin) ? size_max : (end - begin));
  }

  size_t x0_;
  size_t y0_;
  size_t xsize_;
  size_t ysize_;
};
// Works for any image-like input type(s).
// Returns true if both arguments report identical xsize() and ysize().
template <class Image1, class Image2>
HWY_MAYBE_UNUSED bool SameSize(const Image1& image1, const Image2& image2) {
  if (image1.xsize() != image2.xsize()) {
    return false;
  }
  return image1.ysize() == image2.ysize();
}
// Mirrors out of bounds coordinates and returns valid coordinates unchanged.
// We assume the radius (distance outside the image) is small compared to the
// image size, otherwise this might not terminate.
// The mirror is outside the last column (border pixel is also replicated).
static HWY_INLINE HWY_MAYBE_UNUSED size_t Mirror(int64_t x,
                                                 const int64_t xsize) {
  HWY_DASSERT(xsize != 0);
  // TODO(janwas): replace with branchless version
  // Reflect repeatedly until the coordinate lands inside [0, xsize).
  for (;;) {
    if (x < 0) {
      // Reflection about the left edge: -1 -> 0, -2 -> 1, ...
      x = -x - 1;
      continue;
    }
    if (x >= xsize) {
      // Reflection about the right edge: xsize -> xsize - 1, ...
      x = 2 * xsize - 1 - x;
      continue;
    }
    return static_cast<size_t>(x);
  }
}
// Wrap modes for ensuring X/Y coordinates are in the valid range [0, size):
// Mirrors (repeating the edge pixel once). Useful for convolutions.
struct WrapMirror {
  // Maps `coord` into [0, size) by mirroring via Mirror().
  HWY_INLINE size_t operator()(const int64_t coord, const size_t size) const {
    const int64_t signed_size = static_cast<int64_t>(size);
    return Mirror(coord, signed_size);
  }
};
// Returns the same coordinate, for when we know "coord" is already valid (e.g.
// interior of an image).
struct WrapUnchanged {
  // Returns `coord` converted to size_t; the size argument is ignored.
  HWY_INLINE size_t operator()(const int64_t coord, size_t /*size*/) const {
    const size_t unchanged = static_cast<size_t>(coord);
    return unchanged;
  }
};
// Similar to Wrap* but for row pointers (reduces Row() multiplications).
class WrapRowMirror {
 public:
  // Remembers the pointers to row 0 and row (ysize - 1) of `image`.
  template <class View>
  WrapRowMirror(const View& image, size_t ysize)
      : first_row_(image.ConstRow(0)), last_row_(image.ConstRow(ysize - 1)) {}

  // Returns `row` if it lies between the first and last row (inclusive),
  // otherwise the mirrored row pointer. `stride` is the distance between
  // consecutive rows, in the same units as the pointer differences below.
  const float* operator()(const float* const HWY_RESTRICT row,
                          const int64_t stride) const {
    const bool is_before = row < first_row_;
    if (!is_before && !(row > last_row_)) {
      return row;
    }
    if (is_before) {
      // Mirrored; one row before => row 0, two before = row 1, ...
      const int64_t num_before = first_row_ - row;
      return first_row_ + num_before - stride;
    }
    // Mirrored; one row after => last row, two after = last - 1, ...
    const int64_t num_after = row - last_row_;
    return last_row_ - num_after + stride;
  }

 private:
  const float* const HWY_RESTRICT first_row_;
  const float* const HWY_RESTRICT last_row_;
};
struct WrapRowUnchanged {
  // Row-pointer counterpart of WrapUnchanged: hands back `row` as is and
  // ignores the stride.
  HWY_INLINE const float* operator()(const float* const HWY_RESTRICT row,
                                     int64_t /*stride*/) const {
    const float* const unchanged = row;
    return unchanged;
  }
};
} // namespace hwy
#endif // HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_
@@ -0,0 +1,152 @@
// Copyright (c) the JPEG XL Project
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/image/image.h"
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <random>
#include <utility>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/image/image_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target:
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Ensure we can always write full aligned vectors.
struct TestAlignedT {
  // For every image size in [1, 4) x [1, 64), fills each row with whole-vector
  // aligned stores (the last store of a row may extend past xsize) and then
  // reads back one randomly chosen sample.
  template <typename T>
  void operator()(T /*unused*/) const {
    const ScalableTag<T> d;
    std::mt19937 generator(129);
    std::uniform_int_distribution<int> first_lane(0, 16);
    const size_t lanes_per_store = Lanes(d);

    for (size_t ysize = 1; ysize < 4; ++ysize) {
      for (size_t xsize = 1; xsize < 64; ++xsize) {
        Image<T> img(xsize, ysize);

        for (size_t row_idx = 0; row_idx < ysize; ++row_idx) {
          T* HWY_RESTRICT row = img.MutableRow(row_idx);
          size_t col = 0;
          while (col < xsize) {
            const auto values =
                Iota(d, static_cast<T>(first_lane(generator)));
            Store(values, d, row + col);
            col += lanes_per_store;
          }
        }

        // Sanity check to prevent optimizing out the writes
        std::uniform_int_distribution<size_t> pick_x(0, xsize - 1);
        std::uniform_int_distribution<size_t> pick_y(0, ysize - 1);
        const size_t x = pick_x(generator);
        const size_t y = pick_y(generator);
        HWY_ASSERT(img.ConstRow(y)[x] < 16 + Lanes(d));
      }
    }
  }
};

// Runs TestAlignedT via ForUnsignedTypes.
void TestAligned() { ForUnsignedTypes(TestAlignedT()); }
// Ensure we can write an unaligned vector starting at the last valid value.
// Functor invoked once per lane type T. For every image size in
// [1, 4) x [1, 128) it exercises unaligned vector accesses that start at a
// valid sample and may extend past the end of the row.
struct TestUnalignedT {
template <typename T>
void operator()(T /*unused*/) const {
// Fixed seed; `rng` and `dist` are only used by the MSAN/IDE branch below.
std::mt19937 rng(129);
std::uniform_int_distribution<int> dist(0, 3);
const ScalableTag<T> d;
for (size_t ysize = 1; ysize < 4; ++ysize) {
for (size_t xsize = 1; xsize < 128; ++xsize) {
Image<T> img(xsize, ysize);
img.InitializePaddingForUnalignedAccesses();
// This test reads padding, which only works if it was initialized,
// which only happens in MSAN builds.
#if HWY_IS_MSAN || HWY_IDE
// Initialize only the valid samples. Each sample is 1, 2, 4 or 8 because
// dist yields a shift count in [0, 3].
for (size_t y = 0; y < ysize; ++y) {
T* HWY_RESTRICT row = img.MutableRow(y);
for (size_t x = 0; x < xsize; ++x) {
row[x] = static_cast<T>(1u << dist(rng));
}
}
// Read padding bits: OR together an unaligned vector load starting at every
// valid sample, including those that run past xsize.
auto accum = Zero(d);
for (size_t y = 0; y < ysize; ++y) {
T* HWY_RESTRICT row = img.MutableRow(y);
for (size_t x = 0; x < xsize; ++x) {
accum = Or(accum, LoadU(d, row + x));
}
}
// Ensure padding was zero: every valid sample is below 16, so an OR result
// of 16 or more in any lane must have come from a padding read.
const size_t N = Lanes(d);
auto lanes = AllocateAligned<T>(N);
Store(accum, d, lanes.get());
for (size_t i = 0; i < N; ++i) {
HWY_ASSERT(lanes[i] < 16);
}
#else // Check that writing padding does not overwrite valid samples
// Initialize only the valid samples; each one holds its own column index.
for (size_t y = 0; y < ysize; ++y) {
T* HWY_RESTRICT row = img.MutableRow(y);
for (size_t x = 0; x < xsize; ++x) {
row[x] = static_cast<T>(x);
}
}
// Zero padding and rightmost sample: one unaligned store that begins at the
// last valid column of each row.
for (size_t y = 0; y < ysize; ++y) {
T* HWY_RESTRICT row = img.MutableRow(y);
StoreU(Zero(d), d, row + xsize - 1);
}
// Ensure no samples except the rightmost were overwritten
for (size_t y = 0; y < ysize; ++y) {
T* HWY_RESTRICT row = img.MutableRow(y);
for (size_t x = 0; x < xsize - 1; ++x) {
HWY_ASSERT_EQ(static_cast<T>(x), row[x]);
}
}
#endif
}
}
}
};
// Runs TestUnalignedT via ForUnsignedTypes.
void TestUnaligned() { ForUnsignedTypes(TestUnalignedT()); }
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
// Registers TestAligned and TestUnaligned under the ImageTest suite. Guarded
// by HWY_ONCE, presumably so that registration is emitted a single time even
// though this file is re-included per target via foreach_target.h - confirm.
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(ImageTest);
HWY_EXPORT_AND_TEST_P(ImageTest, TestAligned);
HWY_EXPORT_AND_TEST_P(ImageTest, TestUnaligned);
} // namespace hwy
#endif
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,227 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stdio.h>
#include <cfloat> // FLT_MAX
#include <type_traits>
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/math/math_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/math/math-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Reinterprets the bytes of `in` as a value of type Out. Both types must have
// the same size, which is enforced at compile time.
template <class Out, class In>
inline Out BitCast(const In& in) {
  static_assert(sizeof(Out) == sizeof(In), "");
  Out result;
  CopyBytes<sizeof(Out)>(&in, &result);
  return result;
}
// Compares a vector math function against its scalar reference over a range of
// inputs and asserts that the largest observed error stays within a bound.
//
// name:          label used in the diagnostic output.
// fx1:           scalar reference implementation.
// fxN:           vector implementation under test; only lane 0 is inspected.
// d:             vector tag passed to fxN.
// min, max:      inclusive input range. Inputs are sampled by stepping through
//                the bit representations between the two bounds.
// max_error_ulp: largest tolerated difference, in units in the last place.
template <class T, class D>
HWY_NOINLINE void TestMath(const std::string name, T (*fx1)(T),
                           Vec<D> (*fxN)(D, VecArg<Vec<D>>), D d, T min, T max,
                           uint64_t max_error_ulp) {
  using UintT = MakeUnsigned<T>;

  const UintT min_bits = BitCast<UintT>(min);
  const UintT max_bits = BitCast<UintT>(max);

  // If min is negative and max is positive, the range needs to be broken into
  // two pieces, [+0, max] and [-0, min], otherwise [min, max].
  int range_count = 1;
  UintT ranges[2][2] = {{min_bits, max_bits}, {0, 0}};
  if ((min < 0.0) && (max > 0.0)) {
    ranges[0][0] = BitCast<UintT>(static_cast<T>(+0.0));
    ranges[0][1] = max_bits;
    ranges[1][0] = BitCast<UintT>(static_cast<T>(-0.0));
    ranges[1][1] = min_bits;
    range_count = 2;
  }

  uint64_t max_ulp = 0;
  // Emulation is slower, so cannot afford as many.
  constexpr UintT kSamplesPerRange = static_cast<UintT>(AdjustedReps(4000));
  for (int range_index = 0; range_index < range_count; ++range_index) {
    const UintT start = ranges[range_index][0];
    const UintT stop = ranges[range_index][1];
    // A range whose start lies above its stop yields no samples.
    if (start > stop) continue;

    const UintT step = HWY_MAX(1, ((stop - start) / kSamplesPerRange));
    // Iterate by sample index rather than by `value_bits += step`. When stop
    // is close to the largest UintT and step is large (few samples per range),
    // that addition wraps around to a small bit pattern which still compares
    // <= stop, so the loop would carry on with values below start. Here
    // start + i * step never exceeds stop, so no wrap-around is possible and
    // no clamping of the sampled value is required.
    const UintT num_steps = static_cast<UintT>((stop - start) / step);
    for (UintT i = 0; i <= num_steps; ++i) {
      const UintT value_bits = static_cast<UintT>(start + i * step);
      const T value = BitCast<T>(value_bits);
      const T actual = GetLane(fxN(d, Set(d, value)));
      const T expected = fx1(value);

      // Skip small inputs and outputs on armv7, it flushes subnormals to zero.
#if HWY_TARGET == HWY_NEON && HWY_ARCH_ARM_V7
      if ((std::abs(value) < 1e-37f) || (std::abs(expected) < 1e-37f)) {
        continue;
      }
#endif

      const auto ulp = hwy::detail::ComputeUlpDelta(actual, expected);
      max_ulp = HWY_MAX(max_ulp, ulp);
      if (ulp > max_error_ulp) {
        fprintf(stderr,
                "%s: %s(%f) expected %f actual %f ulp %" PRIu64 " max ulp %u\n",
                hwy::TypeName(T(), Lanes(d)).c_str(), name.c_str(), value,
                expected, actual, static_cast<uint64_t>(ulp),
                static_cast<uint32_t>(max_error_ulp));
      }
    }
  }
  fprintf(stderr, "%s: %s max_ulp %" PRIu64 "\n",
          hwy::TypeName(T(), Lanes(d)).c_str(), name.c_str(), max_ulp);
  HWY_ASSERT(max_ulp <= max_error_ulp);
}
// Defines TestAll<NAME>(), which runs the functor Test<NAME> through
// ForFloatTypes(ForPartialVectors<...>()).
#define DEFINE_MATH_TEST_FUNC(NAME) \
HWY_NOINLINE void TestAll##NAME() { \
ForFloatTypes(ForPartialVectors<Test##NAME>()); \
}
#undef DEFINE_MATH_TEST
// Defines the functor Test<NAME> plus TestAll<NAME>() (via the macro above).
// The functor calls TestMath with the F32_* arguments when sizeof(T) == 4 and
// with the F64_* arguments otherwise. Per precision the arguments are: scalar
// reference function (x1), vector function (xN), input range [MIN, MAX] and
// the error bound (ERROR) forwarded as TestMath's max_error_ulp.
#define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \
F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR) \
struct Test##NAME { \
template <class T, class D> \
HWY_NOINLINE void operator()(T, D d) { \
if (sizeof(T) == 4) { \
TestMath<T, D>(HWY_STR(NAME), F32x1, F32xN, d, F32_MIN, F32_MAX, \
F32_ERROR); \
} else { \
TestMath<T, D>(HWY_STR(NAME), F64x1, F64xN, d, \
static_cast<T>(F64_MIN), static_cast<T>(F64_MAX), \
F64_ERROR); \
} \
} \
}; \
DEFINE_MATH_TEST_FUNC(NAME)
// Floating point values closest to but less than 1.0, built from their bit
// patterns. Used below as the (negated and positive) input bounds for Atanh.
const float kNearOneF = BitCast<float>(0x3F7FFFFF);
const double kNearOneD = BitCast<double>(0x3FEFFFFFFFFFFFFFULL);
// The discrepancy is unacceptably large for MSYS2 (less accurate libm?), so
// only increase the error tolerance there.
constexpr uint64_t Cos64ULP() {
  // Error bound for the double-precision Cos test: 3 ULP everywhere except
  // MinGW builds, which get the looser bound of 23.
#if !defined(__MINGW32__)
  return 3;
#else
  return 23;
#endif
}
constexpr uint64_t ACosh32ULP() {
  // Error bound for the single-precision Acosh test: 3 ULP everywhere except
  // MinGW builds, which get the looser bound of 8.
#if !defined(__MINGW32__)
  return 3;
#else
  return 8;
#endif
}
// clang-format off
// One test per math function. After the name, the first row of each invocation
// is the float configuration and the second row the double configuration:
// scalar reference, vector function, min input, max input, error bound in ULP.
DEFINE_MATH_TEST(Acos,
std::acos, CallAcos, -1.0f, +1.0f, 3, // NEON is 3 instead of 2
std::acos, CallAcos, -1.0, +1.0, 2)
DEFINE_MATH_TEST(Acosh,
std::acosh, CallAcosh, +1.0f, +FLT_MAX, ACosh32ULP(),
std::acosh, CallAcosh, +1.0, +DBL_MAX, 3)
DEFINE_MATH_TEST(Asin,
std::asin, CallAsin, -1.0f, +1.0f, 4, // ARMv7 is 4 instead of 2
std::asin, CallAsin, -1.0, +1.0, 2)
DEFINE_MATH_TEST(Asinh,
std::asinh, CallAsinh, -FLT_MAX, +FLT_MAX, 3,
std::asinh, CallAsinh, -DBL_MAX, +DBL_MAX, 3)
DEFINE_MATH_TEST(Atan,
std::atan, CallAtan, -FLT_MAX, +FLT_MAX, 3,
std::atan, CallAtan, -DBL_MAX, +DBL_MAX, 3)
DEFINE_MATH_TEST(Atanh,
std::atanh, CallAtanh, -kNearOneF, +kNearOneF, 4, // NEON is 4 instead of 3
std::atanh, CallAtanh, -kNearOneD, +kNearOneD, 3)
DEFINE_MATH_TEST(Cos,
std::cos, CallCos, -39000.0f, +39000.0f, 3,
std::cos, CallCos, -39000.0, +39000.0, Cos64ULP())
DEFINE_MATH_TEST(Exp,
std::exp, CallExp, -FLT_MAX, +104.0f, 1,
std::exp, CallExp, -DBL_MAX, +104.0, 1)
DEFINE_MATH_TEST(Expm1,
std::expm1, CallExpm1, -FLT_MAX, +104.0f, 4,
std::expm1, CallExpm1, -DBL_MAX, +104.0, 4)
DEFINE_MATH_TEST(Log,
std::log, CallLog, +FLT_MIN, +FLT_MAX, 1,
std::log, CallLog, +DBL_MIN, +DBL_MAX, 1)
DEFINE_MATH_TEST(Log10,
std::log10, CallLog10, +FLT_MIN, +FLT_MAX, 2,
std::log10, CallLog10, +DBL_MIN, +DBL_MAX, 2)
DEFINE_MATH_TEST(Log1p,
std::log1p, CallLog1p, +0.0f, +1e37f, 3, // NEON is 3 instead of 2
std::log1p, CallLog1p, +0.0, +DBL_MAX, 2)
DEFINE_MATH_TEST(Log2,
std::log2, CallLog2, +FLT_MIN, +FLT_MAX, 2,
std::log2, CallLog2, +DBL_MIN, +DBL_MAX, 2)
DEFINE_MATH_TEST(Sin,
std::sin, CallSin, -39000.0f, +39000.0f, 3,
std::sin, CallSin, -39000.0, +39000.0, 4) // MSYS is 4 instead of 3
DEFINE_MATH_TEST(Sinh,
std::sinh, CallSinh, -80.0f, +80.0f, 4,
std::sinh, CallSinh, -709.0, +709.0, 4)
DEFINE_MATH_TEST(Tanh,
std::tanh, CallTanh, -FLT_MAX, +FLT_MAX, 4,
std::tanh, CallTanh, -DBL_MAX, +DBL_MAX, 4)
// clang-format on
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
// Registers every TestAll<NAME> function generated by DEFINE_MATH_TEST under
// the HwyMathTest suite. Guarded by HWY_ONCE, presumably so that registration
// is emitted a single time despite the per-target re-inclusion - confirm.
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(HwyMathTest);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcos);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcosh);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsin);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsinh);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtan);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtanh);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllCos);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExp);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExpm1);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog10);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog1p);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog2);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSin);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinh);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllTanh);
} // namespace hwy
#endif
+190
View File
@@ -0,0 +1,190 @@
package(default_visibility = ["//visibility:public"])
licenses(["notice"])
# Unused on Bazel builds, where this is not defined/known; Copybara replaces
# usages with an empty list.
COMPAT = [
"//buildenv/target:non_prod", # includes mobile/vendor.
]
# cc_library(
# name = "vxsort",
# srcs = [
# "vxsort/isa_detection.cpp",
# "vxsort/isa_detection_msvc.cpp",
# "vxsort/isa_detection_sane.cpp",
# "vxsort/machine_traits.avx2.cpp",
# "vxsort/smallsort/avx2_load_mask_tables.cpp",
# "vxsort/smallsort/bitonic_sort.AVX2.double.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX2.float.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX512.double.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX512.float.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.cpp",
# "vxsort/vxsort_stats.cpp",
# ],
# hdrs = [
# "vxsort/alignment.h",
# "vxsort/defs.h",
# "vxsort/isa_detection.h",
# "vxsort/machine_traits.avx2.h",
# "vxsort/machine_traits.avx512.h",
# "vxsort/machine_traits.h",
# "vxsort/packer.h",
# "vxsort/smallsort/bitonic_sort.AVX2.double.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX2.float.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX512.double.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX512.float.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.h",
# "vxsort/smallsort/bitonic_sort.h",
# "vxsort/vxsort.h",
# "vxsort/vxsort_stats.h",
# ],
# compatible_with = [],
# textual_hdrs = [
# "vxsort/vxsort_targets_disable.h",
# "vxsort/vxsort_targets_enable_avx2.h",
# "vxsort/vxsort_targets_enable_avx512.h",
# ],
# )
# The vqsort library. `vqsort.h` is the only public header; the `*-inl.h` files
# are listed as textual headers. One source file exists per key type and sort
# order (suffix `a` / `d`), plus the common `vqsort.cc`.
cc_library(
name = "vqsort",
srcs = [
# Split into separate files to reduce MSVC build time.
"vqsort.cc",
"vqsort_128a.cc",
"vqsort_128d.cc",
"vqsort_f32a.cc",
"vqsort_f32d.cc",
"vqsort_f64a.cc",
"vqsort_f64d.cc",
"vqsort_i16a.cc",
"vqsort_i16d.cc",
"vqsort_i32a.cc",
"vqsort_i32d.cc",
"vqsort_i64a.cc",
"vqsort_i64d.cc",
"vqsort_kv64a.cc",
"vqsort_kv64d.cc",
"vqsort_kv128a.cc",
"vqsort_kv128d.cc",
"vqsort_u16a.cc",
"vqsort_u16d.cc",
"vqsort_u32a.cc",
"vqsort_u32d.cc",
"vqsort_u64a.cc",
"vqsort_u64d.cc",
],
hdrs = [
"vqsort.h", # public interface
],
compatible_with = [],
local_defines = ["hwy_contrib_EXPORTS"],
textual_hdrs = [
"shared-inl.h",
"sorting_networks-inl.h",
"traits-inl.h",
"traits128-inl.h",
"vqsort-inl.h",
# Placeholder for internal instrumentation. Do not remove.
],
deps = [
# Only if VQSORT_SECURE_RNG is set.
# "//third_party/absl/random",
"//:hwy",
# ":vxsort", # required if HAVE_VXSORT
],
)
# -----------------------------------------------------------------------------
# Internal-only targets
# Test-only textual headers (`algo-inl.h`, `result-inl.h`) shared by the test
# and benchmark targets below.
cc_library(
name = "helpers",
testonly = 1,
textual_hdrs = [
"algo-inl.h",
"result-inl.h",
],
deps = [
":vqsort",
"//:nanobenchmark",
# Required for HAVE_PDQSORT, but that is unused and this is
# unavailable to Bazel builds, hence commented out.
# "//third_party/boost/allowed",
# Avoid ips4o and thus TBB to work around hwloc build failure.
],
)
# Test-only binary built from `print_network.cc`.
cc_binary(
name = "print_network",
testonly = 1,
srcs = ["print_network.cc"],
deps = [
":helpers",
":vqsort",
"//:hwy",
],
)
# Test built from `sort_test.cc`; it links gtest_main and carries the
# `hwy_ops_test` tag used by the test_suite.
cc_test(
name = "sort_test",
size = "medium",
srcs = ["sort_test.cc"],
# Do not enable fully_static_link (pthread crash on bazel)
local_defines = ["HWY_IS_TEST"],
# for test_suite.
tags = ["hwy_ops_test"],
deps = [
":helpers",
":vqsort",
"@com_google_googletest//:gtest_main",
"//:hwy",
"//:hwy_test_util",
],
)
# Test-only benchmark binary built from `bench_sort.cc`. It is a cc_binary
# rather than a cc_test, although it links gtest_main.
cc_binary(
name = "bench_sort",
testonly = 1,
srcs = ["bench_sort.cc"],
# Do not enable fully_static_link (pthread crash on bazel)
local_defines = ["HWY_IS_TEST"],
deps = [
":helpers",
":vqsort",
"@com_google_googletest//:gtest_main",
"//:hwy",
"//:hwy_test_util",
],
)
# Test-only benchmark binary built from `bench_parallel.cc`; it has the same
# defines and dependencies as `bench_sort`.
cc_binary(
name = "bench_parallel",
testonly = 1,
srcs = ["bench_parallel.cc"],
# Do not enable fully_static_link (pthread crash on bazel)
local_defines = ["HWY_IS_TEST"],
deps = [
":helpers",
":vqsort",
"@com_google_googletest//:gtest_main",
"//:hwy",
"//:hwy_test_util",
],
)
@@ -0,0 +1,87 @@
# Vectorized and performance-portable Quicksort
## Introduction
As of 2022-06-07 this sorts large arrays of built-in types about ten times as
fast as `std::sort`. See also our
[blog post](https://opensource.googleblog.com/2022/06/Vectorized%20and%20performance%20portable%20Quicksort.html)
and [paper](https://arxiv.org/abs/2205.05982).
## Instructions
Here are instructions for reproducing our results on Linux and AWS (SVE, NEON).
### Linux
Please first ensure that Go and Clang (tested with 13.0.1) are installed via
your system's package manager.
```
go install github.com/bazelbuild/bazelisk@latest
git clone https://github.com/google/highway
cd highway
CC=clang CXX=clang++ ~/go/bin/bazelisk build -c opt hwy/contrib/sort:all
bazel-bin/hwy/contrib/sort/sort_test
bazel-bin/hwy/contrib/sort/bench_sort
```
### AWS Graviton3
Instance config: amazon linux 5.10 arm64, c7g.8xlarge (largest allowed config is
32 vCPU). Initial launch will fail. Wait a few minutes for an email saying the
config is verified, then re-launch. See IPv4 hostname in list of instances.
`ssh -i /path/key.pem ec2-user@hostname`
Note that the AWS CMake package is too old for llvm, so we build it first:
```
wget https://cmake.org/files/v3.23/cmake-3.23.2.tar.gz
tar -xvzf cmake-3.23.2.tar.gz && cd cmake-3.23.2/
./bootstrap -- -DCMAKE_USE_OPENSSL=OFF
make -j8 && sudo make install
cd ..
```
AWS clang is at version 11.1, which generates unnecessary `AND` instructions
which slow down the sort by 1.15x. We tested with clang trunk as of June 13
(which reports Git hash 8f6512fea000c3a0d394864bb94e524bee375069). To build:
```
git clone --depth 1 https://github.com/llvm/llvm-project.git
cd llvm-project
mkdir -p build && cd build
/usr/local/bin/cmake ../llvm -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" -DCMAKE_BUILD_TYPE=Release
make -j32 && sudo make install
```
```
sudo yum install go
go install github.com/bazelbuild/bazelisk@latest
git clone https://github.com/google/highway
cd highway
CC=/usr/local/bin/clang CXX=/usr/local/bin/clang++ ~/go/bin/bazelisk build -c opt --copt=-march=armv8.2-a+sve hwy/contrib/sort:all
bazel-bin/hwy/contrib/sort/sort_test
bazel-bin/hwy/contrib/sort/bench_sort
```
The above command line enables SVE, which is currently only available on
Graviton 3. You can also test NEON on the same processor, or on other Arm CPUs,
by replacing `--copt=-march=armv8.2-a+sve` with
`--copt=-march=armv8.2-a+crypto`. Note that such flags will be unnecessary once
Clang supports `#pragma target` for NEON and SVE intrinsics, as it does for x86.
## Results
`bench_sort` outputs the instruction set (AVX3 refers to AVX-512), the sort
algorithm (std for `std::sort`, vq for our vqsort), the type of keys being
sorted (f32 is float), the distribution of keys (uniform32 for uniform random
with range 0-2^32), the number of keys, then the throughput of sorted keys (i.e.
number of key bytes output per second).
Example excerpt from Xeon 6154 (Skylake-X) CPU clocked at 3 GHz:
```
[ RUN ] BenchSortGroup/BenchSort.BenchAllSort/AVX3
AVX3: std: f32: uniform32: 1.00E+06 54 MB/s ( 1 threads)
AVX3: vq: f32: uniform32: 1.00E+06 1143 MB/s ( 1 threads)
```
@@ -0,0 +1,512 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Normal include guard for target-independent parts
#ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
#include <stdint.h>
#include <string.h> // memcpy
#include <algorithm>
#include <cmath> // std::abs
#include <vector>
#include "hwy/base.h"
#include "hwy/contrib/sort/vqsort.h"
// Third-party algorithms
#define HAVE_AVX2SORT 0
#define HAVE_IPS4O 0
// When enabling, consider changing max_threads (required for Table 1a)
#define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1)
#define HAVE_PDQSORT 0
#define HAVE_SORT512 0
#define HAVE_VXSORT 0
#if HAVE_AVX2SORT
HWY_PUSH_ATTRIBUTES("avx2,avx")
#include "avx2sort.h" //NOLINT
HWY_POP_ATTRIBUTES
#endif
#if HAVE_IPS4O || HAVE_PARALLEL_IPS4O
#include "third_party/ips4o/include/ips4o.hpp"
#include "third_party/ips4o/include/ips4o/thread_pool.hpp"
#endif
#if HAVE_PDQSORT
#include "third_party/boost/allowed/sort/sort.hpp"
#endif
#if HAVE_SORT512
#include "sort512.h" //NOLINT
#endif
// vxsort is difficult to compile for multiple targets because it also uses
// .cpp files, and we'd also have to #undef its include guards. Instead, compile
// only for AVX2 or AVX3 depending on this macro.
#define VXSORT_AVX3 1
#if HAVE_VXSORT
// inlined from vxsort_targets_enable_avx512 (must close before end of header)
#ifdef __GNUC__
#ifdef __clang__
#if VXSORT_AVX3
#pragma clang attribute push(__attribute__((target("avx512f,avx512dq"))), \
apply_to = any(function))
#else
#pragma clang attribute push(__attribute__((target("avx2"))), \
apply_to = any(function))
#endif // VXSORT_AVX3
#else
#pragma GCC push_options
#if VXSORT_AVX3
#pragma GCC target("avx512f,avx512dq")
#else
#pragma GCC target("avx2")
#endif // VXSORT_AVX3
#endif
#endif
#if VXSORT_AVX3
#include "vxsort/machine_traits.avx512.h"
#else
#include "vxsort/machine_traits.avx2.h"
#endif // VXSORT_AVX3
#include "vxsort/vxsort.h"
#ifdef __GNUC__
#ifdef __clang__
#pragma clang attribute pop
#else
#pragma GCC pop_options
#endif
#endif
#endif // HAVE_VXSORT
namespace hwy {
// Input key distributions known to the tests and benchmarks.
enum class Dist { kUniform8, kUniform16, kUniform32 };

// Returns the distributions that are enabled. Only kUniform32 is listed;
// kUniform8 and kUniform16 are deliberately left out.
static inline std::vector<Dist> AllDist() {
  std::vector<Dist> enabled;
  /* enabled.push_back(Dist::kUniform8); enabled.push_back(Dist::kUniform16); */
  enabled.push_back(Dist::kUniform32);
  return enabled;
}

// Returns a short name for `dist`, or "unreachable" for a value that is not
// one of the enumerators.
static inline const char* DistName(Dist dist) {
  const char* name = "unreachable";
  switch (dist) {
    case Dist::kUniform8:
      name = "uniform8";
      break;
    case Dist::kUniform16:
      name = "uniform16";
      break;
    case Dist::kUniform32:
      name = "uniform32";
      break;
  }
  return name;
}
// Running summary (min, max, checksum, count) of a sequence of values. Two
// summaries of the same multiset of values compare equal; any difference
// aborts with a diagnostic rather than returning false.
template <typename T>
class InputStats {
 public:
  // Folds `value` into the summary.
  void Notify(T value) {
    if (value < min_) min_ = value;
    if (max_ < value) max_ = value;

    // Converting to integer would truncate floats, multiplying to save digits
    // risks overflow especially when casting, so instead take the sum of the
    // bit representations as the checksum.
    static_assert(sizeof(T) <= 8, "Expected a built-in type");
    uint64_t bits = 0;
    CopyBytes<sizeof(T)>(&value, &bits);  // not same size
    sum_ += bits;
    ++count_;
  }

  // Returns true if all fields match; otherwise aborts, reporting the first
  // field that differs (count, then min/max, then sum).
  bool operator==(const InputStats& other) const {
    const bool same_count = (count_ == other.count_);
    if (!same_count) {
      HWY_ABORT("count %d vs %d\n", static_cast<int>(count_),
                static_cast<int>(other.count_));
    }

    const bool same_range = !(min_ != other.min_ || max_ != other.max_);
    if (!same_range) {
      HWY_ABORT("minmax %f/%f vs %f/%f\n", static_cast<double>(min_),
                static_cast<double>(max_), static_cast<double>(other.min_),
                static_cast<double>(other.max_));
    }

    // Sum helps detect duplicated/lost values
    const bool same_sum = (sum_ == other.sum_);
    if (!same_sum) {
      HWY_ABORT("Sum mismatch %g %g; min %g max %g\n",
                static_cast<double>(sum_), static_cast<double>(other.sum_),
                static_cast<double>(min_), static_cast<double>(max_));
    }

    return true;
  }

 private:
  // min_/max_ start at the opposite extremes so the first Notify sets both.
  T min_ = hwy::HighestValue<T>();
  T max_ = hwy::LowestValue<T>();
  uint64_t sum_ = 0;
  size_t count_ = 0;
};
// Sort algorithms selectable by the tests and benchmarks. Third-party entries
// exist only when the matching HAVE_* macro is nonzero.
enum class Algo {
#if HAVE_AVX2SORT
  kSEA,
#endif
#if HAVE_IPS4O
  kIPS4O,
#endif
#if HAVE_PARALLEL_IPS4O
  kParallelIPS4O,
#endif
#if HAVE_PDQSORT
  kPDQ,
#endif
#if HAVE_SORT512
  kSort512,
#endif
#if HAVE_VXSORT
  kVXSort,
#endif
  kStd,
  kVQSort,
  kHeap,
};

// Returns a short name for `algo`, or "unreachable" for a value that is not
// one of the enumerators compiled in.
static inline const char* AlgoName(Algo algo) {
  const char* label = "unreachable";
  switch (algo) {
#if HAVE_AVX2SORT
    case Algo::kSEA:
      label = "sea";
      break;
#endif
#if HAVE_IPS4O
    case Algo::kIPS4O:
      label = "ips4o";
      break;
#endif
#if HAVE_PARALLEL_IPS4O
    case Algo::kParallelIPS4O:
      label = "par_ips4o";
      break;
#endif
#if HAVE_PDQSORT
    case Algo::kPDQ:
      label = "pdq";
      break;
#endif
#if HAVE_SORT512
    case Algo::kSort512:
      label = "sort512";
      break;
#endif
#if HAVE_VXSORT
    case Algo::kVXSort:
      label = "vxsort";
      break;
#endif
    case Algo::kStd:
      label = "std";
      break;
    case Algo::kVQSort:
      label = "vq";
      break;
    case Algo::kHeap:
      label = "heap";
      break;
  }
  return label;
}
} // namespace hwy
#endif // HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
#endif
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h" // HeapSort
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Vector pseudo-random bit generator. The state lives in two caller-owned
// vectors; this class only provides the seeding and update functions.
class Xorshift128Plus {
// Scrambles `z` with two xor-shift-multiply rounds followed by a final
// xor-shift, and returns the result.
static HWY_INLINE uint64_t SplitMix64(uint64_t z) {
z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
return z ^ (z >> 31);
}
public:
// Generates two vectors of 64-bit seeds via SplitMix64 and stores into
// `seeds`. Generating these afresh in each ChoosePivot is too expensive.
// Writes 2 * Lanes(du64) values, so `seeds` must have room for that many.
// Each seed is derived from the previous one; the first from a fixed constant.
template <class DU64>
static void GenerateSeeds(DU64 du64, TFromD<DU64>* HWY_RESTRICT seeds) {
seeds[0] = SplitMix64(0x9E3779B97F4A7C15ull);
for (size_t i = 1; i < 2 * Lanes(du64); ++i) {
seeds[i] = SplitMix64(seeds[i - 1]);
}
}
// The state is passed in by reference because vectors cannot be class
// members. Returns the sum of the two incoming state vectors as the random
// bits and updates both state vectors in place.
template <class VU64>
static VU64 RandomBits(VU64& state0, VU64& state1) {
VU64 s1 = state0;
VU64 s0 = state1;
// The output is computed from the state as it was on entry.
const VU64 bits = Add(s1, s0);
state0 = s0;
s1 = Xor(s1, ShiftLeft<23>(s1));
state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
return bits;
}
};
// Overload for non-floating-point lanes: draws the next random bits from the
// generator state (advancing `s0`/`s1`), restricts them with `mask` and
// reinterprets the result as a vector of `d`.
template <class D, class VU64, HWY_IF_NOT_FLOAT_D(D)>
Vec<D> RandomValues(D d, VU64& s0, VU64& s1, const VU64 mask) {
  return BitCast(d, And(Xorshift128Plus::RandomBits(s0, s1), mask));
}
// Overload for floating-point lanes; advances `s0`/`s1` like the integer
// overload and also applies `mask` to the random bits before use.
// It is important to avoid denormals, which are flushed to zero by SIMD but not
// scalar sorts, and NaN, which may be ordered differently in scalar vs. SIMD.
template <class DF, class VU64, HWY_IF_FLOAT_D(DF)>
Vec<DF> RandomValues(DF df, VU64& s0, VU64& s1, const VU64 mask) {
using TF = TFromD<DF>;
const RebindToUnsigned<decltype(df)> du;
using VU = Vec<decltype(du)>;
const VU64 bits64 = And(Xorshift128Plus::RandomBits(s0, s1), mask);
#if HWY_TARGET == HWY_SCALAR // Cannot repartition u64 to smaller types
using TU = MakeUnsigned<TF>;
// Keep only the low bits that fit into the unsigned type of the same size.
const VU bits = Set(du, static_cast<TU>(GetLane(bits64) & LimitsMax<TU>()));
#else
const VU bits = BitCast(du, bits64);
#endif
// Avoid NaN/denormal by only generating values in [1, 2), i.e. random
// mantissas with the exponent taken from the representation of 1.0.
const VU k1 = BitCast(du, Set(df, TF{1.0}));
const VU mantissa_mask = Set(du, MantissaMask<TF>());
// representation = k1 | (bits & mantissa_mask)
const VU representation = OrAnd(k1, bits, mantissa_mask);
return BitCast(df, representation);
}
// Returns a vector of 64-bit lanes that, when ANDed with random bits, limits
// every `sizeof_t`-byte element to the value range requested by `dist`:
// the low 8 bits for kUniform8, the low 16 bits for kUniform16 (element sizes
// 4 and 8 only), otherwise all bits - except that 8-byte elements are always
// limited to at most their low 32 bits. Aborts for any other `sizeof_t`.
template <class DU64>
Vec<DU64> MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t) {
  const bool is_uniform8 = (dist == Dist::kUniform8);
  const bool is_uniform16 = (dist == Dist::kUniform16);

  uint64_t lane_mask = 0;
  switch (sizeof_t) {
    case 2:
      lane_mask = is_uniform8 ? 0x00FF00FF00FF00FFull : 0xFFFFFFFFFFFFFFFFull;
      break;
    case 4:
      if (is_uniform8) {
        lane_mask = 0x000000FF000000FFull;
      } else if (is_uniform16) {
        lane_mask = 0x0000FFFF0000FFFFull;
      } else {
        lane_mask = 0xFFFFFFFFFFFFFFFFull;
      }
      break;
    case 8:
      if (is_uniform8) {
        lane_mask = 0x00000000000000FFull;
      } else if (is_uniform16) {
        lane_mask = 0x000000000000FFFFull;
      } else {
        lane_mask = 0x00000000FFFFFFFFull;
      }
      break;
    default:
      HWY_ABORT("Logic error");
      return Zero(du64);
  }
  return Set(du64, lane_mask);
}
// Fills v[0, num) with pseudo-random values distributed according to `dist`
// and returns the statistics gathered over the generated values (used later to
// verify that sorting preserved them). The sequence is deterministic because
// the generator is always seeded the same way.
template <typename T>
InputStats<T> GenerateInput(const Dist dist, T* v, size_t num) {
  // Seed the two state vectors of the generator.
  SortTag<uint64_t> du64;
  using VU64 = Vec<decltype(du64)>;
  const size_t lanes64 = Lanes(du64);
  auto seed_buf = hwy::AllocateAligned<uint64_t>(2 * lanes64);
  Xorshift128Plus::GenerateSeeds(du64, seed_buf.get());
  VU64 state0 = Load(du64, seed_buf.get());
  VU64 state1 = Load(du64, seed_buf.get() + lanes64);

#if HWY_TARGET == HWY_SCALAR
  const Sisd<T> d;
#else
  const Repartition<T, decltype(du64)> d;
#endif
  using V = Vec<decltype(d)>;
  const size_t lanes = Lanes(d);
  const VU64 mask = MaskForDist(du64, dist, sizeof(T));
  auto tail = hwy::AllocateAligned<T>(lanes);

  // Whole vectors are written straight into the output.
  size_t pos = 0;
  while (pos + lanes <= num) {
    const V random = RandomValues(d, state0, state1, mask);
    StoreU(random, d, v + pos);
    pos += lanes;
  }

  // A partial final vector goes through a temporary buffer so that nothing is
  // written past v[num - 1].
  if (pos < num) {
    const V random = RandomValues(d, state0, state1, mask);
    StoreU(random, d, tail.get());
    memcpy(v + pos, tail.get(), (num - pos) * sizeof(T));
  }

  InputStats<T> stats;
  for (size_t k = 0; k < num; ++k) {
    stats.Notify(v[k]);
  }
  return stats;
}
// Per-thread state: the Sorter instance that Run() invokes for Algo::kVQSort.
struct ThreadLocal {
Sorter sorter;
};
// State shared by all calls to Run(): one ThreadLocal per worker thread and,
// when parallel IPS4o is available, the thread pool it sorts with.
struct SharedState {
#if HAVE_PARALLEL_IPS4O
const unsigned max_threads = hwy::LimitsMax<unsigned>(); // 16 for Table 1a
// Pool size is min(max_threads, hardware_concurrency() / 2).
ips4o::StdThreadPool pool{static_cast<int>(
HWY_MIN(max_threads, std::thread::hardware_concurrency() / 2))};
#endif
// Starts with a single entry; multi-threaded callers resize it so that it can
// be indexed by thread.
std::vector<ThreadLocal> tls{1};
};
// Bridge from keys (passed to Run) to lanes as expected by HeapSort. For
// non-128-bit keys they are the same:
template <class Order, typename KeyType, HWY_IF_NOT_LANE_SIZE(KeyType, 16)>
void CallHeapSort(KeyType* HWY_RESTRICT keys, const size_t num_keys) {
using detail::TraitsLane;
using detail::SharedTraits;
if (Order().IsAscending()) {
const SharedTraits<TraitsLane<detail::OrderAscending<KeyType>>> st;
return detail::HeapSort(st, keys, num_keys);
} else {
const SharedTraits<TraitsLane<detail::OrderDescending<KeyType>>> st;
return detail::HeapSort(st, keys, num_keys);
}
}
#if VQSORT_ENABLED
// Overload for 128-bit keys: the key array is handed to HeapSort as twice as
// many 64-bit lanes, ordered by the 128-bit traits matching `Order`.
template <class Order>
void CallHeapSort(hwy::uint128_t* HWY_RESTRICT keys, const size_t num_keys) {
  using detail::SharedTraits;
  using detail::Traits128;
  uint64_t* lanes = reinterpret_cast<uint64_t*>(keys);
  const size_t num_lanes = 2 * num_keys;  // two lanes per key
  if (!Order().IsAscending()) {
    const SharedTraits<Traits128<detail::OrderDescending128>> descending;
    detail::HeapSort(descending, lanes, num_lanes);
    return;
  }
  const SharedTraits<Traits128<detail::OrderAscending128>> ascending;
  detail::HeapSort(ascending, lanes, num_lanes);
}
// Overload for K64V64 entries: the array is handed to HeapSort as twice as
// many 64-bit lanes, ordered by the KV128 traits matching `Order`.
template <class Order>
void CallHeapSort(K64V64* HWY_RESTRICT keys, const size_t num_keys) {
  using detail::SharedTraits;
  using detail::Traits128;
  uint64_t* lanes = reinterpret_cast<uint64_t*>(keys);
  const size_t num_lanes = 2 * num_keys;  // two lanes per entry
  if (!Order().IsAscending()) {
    const SharedTraits<Traits128<detail::OrderDescendingKV128>> descending;
    detail::HeapSort(descending, lanes, num_lanes);
    return;
  }
  const SharedTraits<Traits128<detail::OrderAscendingKV128>> ascending;
  detail::HeapSort(ascending, lanes, num_lanes);
}
#endif // VQSORT_ENABLED
// Sorts inout[0, num) in place with the implementation selected by `algo`, in
// the direction given by `Order`.
// `shared` supplies the parallel IPS4o pool and the per-thread Sorter;
// `thread` is the index into shared.tls and is only used for Algo::kVQSort.
// Aborts for Algo::kSort512 and for values without a case in this build.
template <class Order, typename KeyType>
void Run(Algo algo, KeyType* HWY_RESTRICT inout, size_t num,
SharedState& shared, size_t thread) {
// Comparators for the comparison-based third-party/std algorithms.
const std::less<KeyType> less;
const std::greater<KeyType> greater;
switch (algo) {
#if HAVE_AVX2SORT
case Algo::kSEA:
// NOTE(review): `Order` is not consulted here - presumably this sorter only
// supports ascending order; confirm.
return avx2::quicksort(inout, static_cast<int>(num));
#endif
#if HAVE_IPS4O
case Algo::kIPS4O:
if (Order().IsAscending()) {
return ips4o::sort(inout, inout + num, less);
} else {
return ips4o::sort(inout, inout + num, greater);
}
#endif
#if HAVE_PARALLEL_IPS4O
case Algo::kParallelIPS4O:
if (Order().IsAscending()) {
return ips4o::parallel::sort(inout, inout + num, less, shared.pool);
} else {
return ips4o::parallel::sort(inout, inout + num, greater, shared.pool);
}
#endif
#if HAVE_SORT512
case Algo::kSort512:
HWY_ABORT("not supported");
// return Sort512::Sort(inout, num);
#endif
#if HAVE_PDQSORT
case Algo::kPDQ:
if (Order().IsAscending()) {
return boost::sort::pdqsort_branchless(inout, inout + num, less);
} else {
return boost::sort::pdqsort_branchless(inout, inout + num, greater);
}
#endif
#if HAVE_VXSORT
case Algo::kVXSort: {
// vxsort is only run when compiling for the one target it was built for;
// for any other target, report and leave the input unsorted.
#if (VXSORT_AVX3 && HWY_TARGET != HWY_AVX3) || \
(!VXSORT_AVX3 && HWY_TARGET != HWY_AVX2)
fprintf(stderr, "Do not call for target %s\n",
hwy::TargetName(HWY_TARGET));
return;
#else
#if VXSORT_AVX3
vxsort::vxsort<KeyType, vxsort::AVX512> vx;
#else
vxsort::vxsort<KeyType, vxsort::AVX2> vx;
#endif
if (Order().IsAscending()) {
// NOTE(review): passes a pointer to the last element rather than one past
// the end - presumably vxsort takes an inclusive range; confirm.
return vx.sort(inout, inout + num - 1);
} else {
fprintf(stderr, "Skipping VX - does not support descending order\n");
return;
}
#endif // enabled for this target
}
#endif // HAVE_VXSORT
case Algo::kStd:
if (Order().IsAscending()) {
return std::sort(inout, inout + num, less);
} else {
return std::sort(inout, inout + num, greater);
}
case Algo::kVQSort:
return shared.tls[thread].sorter(inout, num, Order());
case Algo::kHeap:
return CallHeapSort<Order>(inout, num);
default:
HWY_ABORT("Not implemented");
}
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
@@ -0,0 +1,238 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Concurrent, independent sorts for generating more memory traffic and testing
// scalability.
#include <stdint.h>
#include <stdio.h>
#include <condition_variable> //NOLINT
#include <functional>
#include <memory>
#include <mutex> //NOLINT
#include <thread> //NOLINT
#include <utility>
#include <vector>
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_parallel.cc" //NOLINT
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/aligned_allocator.h"
// Last
#include "hwy/tests/test_util-inl.h"
// clang-format on
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace {
// Minimal thread pool for running the same closure concurrently on a subset of
// its worker threads. The creating ("main") thread blocks inside
// RunOnThreads() until every worker has finished and is ready again.
class ThreadPool {
 public:
  // Starts the given number of worker threads and blocks until they are ready.
  // Aborts if `num_threads` is zero.
  explicit ThreadPool(
      const size_t num_threads = std::thread::hardware_concurrency())
      : num_threads_(num_threads) {
    HWY_ASSERT(num_threads_ > 0);
    threads_.reserve(num_threads_);
    for (size_t i = 0; i < num_threads_; ++i) {
      threads_.emplace_back(ThreadFunc, this, i);
    }

    WorkersReadyBarrier();
  }

  // Not copyable: workers hold a pointer to this object.
  ThreadPool(const ThreadPool&) = delete;
  ThreadPool& operator=(const ThreadPool&) = delete;

  // Waits for all threads to exit.
  ~ThreadPool() {
    StartWorkers(kWorkerExit);

    for (std::thread& thread : threads_) {
      thread.join();
    }
  }

  // Returns the number of worker threads.
  size_t NumThreads() const { return threads_.size(); }

  // Calls func(thread) on the workers whose index is less than `max_threads`
  // and returns once all workers are ready again. `max_threads` must be less
  // than NumThreads() (checked by each worker). `func` is only referenced, not
  // copied; it is no longer used after this function returns.
  template <class Func>
  void RunOnThreads(size_t max_threads, const Func& func) {
    task_ = &CallClosure<Func>;
    data_ = &func;
    StartWorkers(max_threads);
    WorkersReadyBarrier();
  }

 private:
  // After construction and between calls to Run, workers are "ready", i.e.
  // waiting on worker_start_cv_. They are "started" by sending a "command"
  // and notifying all worker_start_cv_ waiters. (That is why all workers
  // must be ready/waiting - otherwise, the notification will not reach all of
  // them and the main thread waits in vain for them to report readiness.)
  using WorkerCommand = uint64_t;

  // Special commands; every other value is the number of threads that should
  // run the task.
  static constexpr WorkerCommand kWorkerWait = ~1ULL;
  static constexpr WorkerCommand kWorkerExit = ~2ULL;

  // Calls a closure (lambda with captures).
  template <class Closure>
  static void CallClosure(const void* f, size_t thread) {
    (*reinterpret_cast<const Closure*>(f))(thread);
  }

  // Blocks until all workers have reported readiness, then resets the count
  // and the command.
  void WorkersReadyBarrier() {
    std::unique_lock<std::mutex> lock(mutex_);
    // Typically only a single iteration.
    while (workers_ready_ != threads_.size()) {
      workers_ready_cv_.wait(lock);
    }
    workers_ready_ = 0;

    // Safely handle spurious worker wakeups.
    worker_start_command_ = kWorkerWait;
  }

  // Precondition: all workers are ready.
  void StartWorkers(const WorkerCommand worker_command) {
    std::unique_lock<std::mutex> lock(mutex_);
    worker_start_command_ = worker_command;
    // Workers will need this lock, so release it before they wake up.
    lock.unlock();
    worker_start_cv_.notify_all();
  }

  // Worker thread entry point; `thread` is the index of this worker.
  static void ThreadFunc(ThreadPool* self, size_t thread) {
    // Until kWorkerExit command received:
    for (;;) {
      std::unique_lock<std::mutex> lock(self->mutex_);
      // Notify main thread that this thread is ready.
      if (++self->workers_ready_ == self->num_threads_) {
        self->workers_ready_cv_.notify_one();
      }
    RESUME_WAIT:
      // Wait for a command.
      self->worker_start_cv_.wait(lock);
      const WorkerCommand command = self->worker_start_command_;
      switch (command) {
        case kWorkerWait:    // spurious wakeup:
          goto RESUME_WAIT;  // lock still held, avoid incrementing ready.
        case kWorkerExit:
          return;  // exits thread
        default:
          break;
      }

      lock.unlock();
      // Command is the maximum number of threads that should run the task.
      HWY_ASSERT(command < self->NumThreads());
      if (thread < command) {
        self->task_(self->data_, thread);
      }
    }
  }

  const size_t num_threads_;

  // Unmodified after ctor, but cannot be const because we call thread::join().
  std::vector<std::thread> threads_;

  std::mutex mutex_;  // guards both cv and their variables.
  std::condition_variable workers_ready_cv_;
  size_t workers_ready_ = 0;
  std::condition_variable worker_start_cv_;
  // Must start out as kWorkerWait: workers begin waiting during construction,
  // before the first WorkersReadyBarrier() has assigned it, and a spurious
  // wakeup in that window would otherwise read an indeterminate value.
  WorkerCommand worker_start_command_ = kWorkerWait;

  // Written by main thread, read by workers (after mutex lock/unlock).
  std::function<void(const void*, size_t)> task_;  // points to CallClosure
  const void* data_ = nullptr;                     // points to caller's Func
};
// Generates `num_keys` random keys according to `dist` and sorts them with
// `algo` on behalf of worker `thread`. The result is not fully verified; it is
// only checked that the first lane is less than the last one.
template <class Traits>
void RunWithoutVerify(Traits st, const Dist dist, const size_t num_keys,
                      const Algo algo, SharedState& shared, size_t thread) {
  using LaneType = typename Traits::LaneType;
  using KeyType = typename Traits::KeyType;
  using Order = typename Traits::Order;

  const size_t total_lanes = st.LanesPerKey() * num_keys;
  auto lanes = hwy::AllocateAligned<LaneType>(total_lanes);
  (void)GenerateInput(dist, lanes.get(), total_lanes);

  const Timestamp t0;
  KeyType* const keys = reinterpret_cast<KeyType*>(lanes.get());
  Run<Order>(algo, keys, num_keys, shared, thread);

  const size_t last_lane = total_lanes - 1;
  HWY_ASSERT(lanes[0] < lanes[last_lane]);
}
void BenchParallel() {
// Not interested in benchmark results for other targets on x86
if (HWY_ARCH_X86 && (HWY_TARGET != HWY_AVX2 && HWY_TARGET != HWY_AVX3)) {
return;
}
ThreadPool pool;
const size_t NT = pool.NumThreads();
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st;
using KeyType = typename decltype(st)::KeyType;
const size_t num_keys = size_t{100} * 1000 * 1000;
#if HAVE_IPS4O
const Algo algo = Algo::kIPS4O;
#else
const Algo algo = Algo::kVQSort;
#endif
const Dist dist = Dist::kUniform32;
SharedState shared;
shared.tls.resize(NT);
std::vector<Result> results;
for (size_t nt = 1; nt < NT; nt += HWY_MAX(1, NT / 16)) {
Timestamp t0;
// Default capture because MSVC wants algo/dist but clang does not.
pool.RunOnThreads(nt, [=, &shared](size_t thread) {
RunWithoutVerify(st, dist, num_keys, algo, shared, thread);
});
const double sec = SecondsSince(t0);
results.emplace_back(algo, dist, num_keys, nt, sec, sizeof(KeyType),
st.KeyString());
results.back().Print();
}
}
} // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_BEFORE_TEST(BenchParallel);
HWY_EXPORT_AND_TEST_P(BenchParallel, BenchParallel);
} // namespace
} // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,310 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdint.h>
#include <stdio.h>
#include <vector>
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/contrib/sort/sorting_networks-inl.h" // SharedTraits
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
// Mode for larger sorts because M1 is able to access more than the per-core
// share of L2, so 1M elements might still be in cache.
#define SORT_100M 0
HWY_BEFORE_NAMESPACE();
namespace hwy {
// Defined within HWY_ONCE, used by BenchAllSort.
extern int64_t first_sort_target;
namespace HWY_NAMESPACE {
namespace {
using detail::TraitsLane;
using detail::OrderAscending;
using detail::OrderDescending;
using detail::SharedTraits;
#if VQSORT_ENABLED || HWY_IDE
using detail::OrderAscending128;
using detail::OrderAscendingKV128;
using detail::Traits128;
// Measures the throughput of detail::Partition for keys described by `Traits`
// on kUniform8 input and prints one summarized Result per input size.
template <class Traits>
HWY_NOINLINE void BenchPartition() {
using LaneType = typename Traits::LaneType;
using KeyType = typename Traits::KeyType;
const SortTag<LaneType> d;
detail::SharedTraits<Traits> st;
const Dist dist = Dist::kUniform8;
double sum = 0.0;
detail::Generator rng(&sum, 123); // for ChoosePivot
const size_t max_log2 = AdjustedLog2Reps(20);
// The bounds make this loop run exactly once, for log2 == max_log2.
for (size_t log2 = max_log2; log2 < max_log2 + 1; ++log2) {
const size_t num_lanes = 1ull << log2;
const size_t num_keys = num_lanes / st.LanesPerKey();
auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
// One scratch buffer large enough for both partitioning and pivot selection.
auto buf = hwy::AllocateAligned<LaneType>(
HWY_MAX(hwy::SortConstants::PartitionBufNum(Lanes(d)),
hwy::SortConstants::PivotBufNum(sizeof(LaneType), Lanes(d))));
std::vector<double> seconds;
// More repetitions for smaller inputs.
const size_t num_reps = (1ull << (14 - log2 / 2)) * 30;
for (size_t rep = 0; rep < num_reps; ++rep) {
(void)GenerateInput(dist, aligned.get(), num_lanes);
// The pivot value can influence performance. Do exactly what vqsort will
// do so that the performance (influenced by prefetching and branch
// prediction) is likely to predict the actual performance inside vqsort.
detail::DrawSamples(d, st, aligned.get(), num_lanes, buf.get(), rng);
detail::SortSamples(d, st, buf.get());
auto pivot = detail::ChoosePivotByRank(d, st, buf.get());
// Only the Partition call itself is timed.
const Timestamp t0;
detail::Partition(d, st, aligned.get(), num_lanes - 1, pivot, buf.get());
seconds.push_back(SecondsSince(t0));
// 'Use' the result to prevent optimizing out the partition.
sum += static_cast<double>(aligned.get()[num_lanes / 2]);
}
Result(Algo::kVQSort, dist, num_keys, 1, SummarizeMeasurements(seconds),
sizeof(KeyType), st.KeyString())
.Print();
}
HWY_ASSERT(sum != 999999); // Prevent optimizing out
}
// Runs the partition benchmark for a selection of key types and orders.
HWY_NOINLINE void BenchAllPartition() {
  // Not interested in benchmark results for these targets
  if (HWY_TARGET != HWY_SSSE3) {
    BenchPartition<TraitsLane<OrderDescending<float>>>();
    BenchPartition<TraitsLane<OrderDescending<int32_t>>>();
    BenchPartition<TraitsLane<OrderDescending<int64_t>>>();
    BenchPartition<Traits128<OrderAscending128>>();
    // BenchPartition<Traits128<OrderDescending128>>();
    BenchPartition<Traits128<OrderAscendingKV128>>();
  }
}
// Measures the throughput of detail::BaseCase (the sort used for small
// inputs) for keys described by `Traits` on kUniform32 input, verifies the
// output of every repetition and appends one summarized Result to `results`.
template <class Traits>
HWY_NOINLINE void BenchBase(std::vector<Result>& results) {
// Not interested in benchmark results for these targets
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
return;
}
using LaneType = typename Traits::LaneType;
using KeyType = typename Traits::KeyType;
const SortTag<LaneType> d;
detail::SharedTraits<Traits> st;
const Dist dist = Dist::kUniform32;
const size_t N = Lanes(d);
const size_t num_lanes = SortConstants::BaseCaseNum(N);
const size_t num_keys = num_lanes / st.LanesPerKey();
auto keys = hwy::AllocateAligned<LaneType>(num_lanes);
auto buf = hwy::AllocateAligned<LaneType>(num_lanes + N);
std::vector<double> seconds;
double sum = 0; // prevents elision
constexpr size_t kMul = AdjustedReps(600); // ensures long enough to measure
for (size_t rep = 0; rep < 30; ++rep) {
InputStats<LaneType> input_stats =
GenerateInput(dist, keys.get(), num_lanes);
// Input is generated once per repetition, so only the first of the kMul
// timed calls sees unsorted input; later calls operate on its output.
const Timestamp t0;
for (size_t i = 0; i < kMul; ++i) {
detail::BaseCase(d, st, keys.get(), keys.get() + num_lanes, num_lanes,
buf.get());
sum += static_cast<double>(keys[0]);
}
seconds.push_back(SecondsSince(t0));
// printf("%f\n", seconds.back());
HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num_lanes, "BenchBase"));
}
HWY_ASSERT(sum < 1E99);
// The reported key count includes the kMul repetitions inside one timing.
results.emplace_back(Algo::kVQSort, dist, num_keys * kMul, 1,
SummarizeMeasurements(seconds), sizeof(KeyType),
st.KeyString());
}
// Runs the base-case benchmark for a selection of key types and orders, then
// prints all collected results.
HWY_NOINLINE void BenchAllBase() {
  // Not interested in benchmark results for these targets
  if (HWY_TARGET != HWY_SSSE3) {
    std::vector<Result> results;
    BenchBase<TraitsLane<OrderAscending<float>>>(results);
    BenchBase<TraitsLane<OrderDescending<int64_t>>>(results);
    BenchBase<Traits128<OrderAscending128>>(results);

    for (size_t idx = 0; idx < results.size(); ++idx) {
      results[idx].Print();
    }
  }
}
#else
// Empty stand-ins used when the partition/base-case benchmarks above are
// compiled out, so that the test registrations at the end of this file still
// have functions to refer to.
void BenchAllPartition() {}
void BenchAllBase() {}
#endif // VQSORT_ENABLED
// Returns the algorithms that BenchSort should compare in this build. The list
// depends on which third-party sorters are available, on the current target
// and on whether the parallel / 100M modes are active.
std::vector<Algo> AlgoForBench() {
return {
#if HAVE_AVX2SORT
Algo::kSEA,
#endif
// Parallel IPS4o takes precedence over the single-threaded variant.
#if HAVE_PARALLEL_IPS4O
Algo::kParallelIPS4O,
#elif HAVE_IPS4O
Algo::kIPS4O,
#endif
#if HAVE_PDQSORT
Algo::kPDQ,
#endif
#if HAVE_SORT512
Algo::kSort512,
#endif
// Only include if we're compiling for the target it supports.
#if HAVE_VXSORT && ((VXSORT_AVX3 && HWY_TARGET == HWY_AVX3) || \
(!VXSORT_AVX3 && HWY_TARGET == HWY_AVX2))
Algo::kVXSort,
#endif
#if !HAVE_PARALLEL_IPS4O
#if !SORT_100M
// These are 10-20x slower, but that's OK for the default size when we
// are not testing the parallel nor 100M modes.
Algo::kStd, Algo::kHeap,
#endif
Algo::kVQSort, // only ~4x slower, but not required for Table 1a
#endif
};
}
// Benchmarks every algorithm from AlgoForBench() on `num_keys` keys described
// by `Traits`, for every input distribution. Each run is verified, and one
// summarized Result is printed per (algorithm, distribution) pair.
template <class Traits>
HWY_NOINLINE void BenchSort(size_t num_keys) {
// Remember the first target for which this runs (see the skip logic below).
if (first_sort_target == 0) first_sort_target = HWY_TARGET;
SharedState shared;
detail::SharedTraits<Traits> st;
using Order = typename Traits::Order;
using LaneType = typename Traits::LaneType;
using KeyType = typename Traits::KeyType;
const size_t num_lanes = num_keys * st.LanesPerKey();
auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
// Fewer repetitions for large inputs.
const size_t reps = num_keys > 1000 * 1000 ? 10 : 30;
for (Algo algo : AlgoForBench()) {
// Other algorithms don't depend on the vector instructions, so only run
// them for the first target.
#if !HAVE_VXSORT
if (algo != Algo::kVQSort && HWY_TARGET != first_sort_target) {
continue;
}
#endif
for (Dist dist : AllDist()) {
std::vector<double> seconds;
for (size_t rep = 0; rep < reps; ++rep) {
InputStats<LaneType> input_stats =
GenerateInput(dist, aligned.get(), num_lanes);
// Only the sort itself is timed; generation and verification are not.
const Timestamp t0;
Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys,
shared, /*thread=*/0);
seconds.push_back(SecondsSince(t0));
// printf("%f\n", seconds.back());
HWY_ASSERT(
VerifySort(st, input_stats, aligned.get(), num_lanes, "BenchSort"));
}
Result(algo, dist, num_keys, 1, SummarizeMeasurements(seconds),
sizeof(KeyType), st.KeyString())
.Print();
} // dist
} // algo
}
// Runs BenchSort for a selection of key types and orders. The input size is
// 100M keys in the parallel / SORT_100M modes and 1M keys otherwise.
HWY_NOINLINE void BenchAllSort() {
// Not interested in benchmark results for these targets
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
return;
}
constexpr size_t K = 1000;
constexpr size_t M = K * K;
// The void casts mark K and M as used regardless of which size is selected.
(void)K;
(void)M;
for (size_t num_keys : {
#if HAVE_PARALLEL_IPS4O || SORT_100M
100 * M,
#else
1 * M,
#endif
}) {
BenchSort<TraitsLane<OrderAscending<float>>>(num_keys);
// BenchSort<TraitsLane<OrderDescending<double>>>(num_keys);
// BenchSort<TraitsLane<OrderAscending<int16_t>>>(num_keys);
BenchSort<TraitsLane<OrderDescending<int32_t>>>(num_keys);
BenchSort<TraitsLane<OrderAscending<int64_t>>>(num_keys);
// BenchSort<TraitsLane<OrderDescending<uint16_t>>>(num_keys);
// BenchSort<TraitsLane<OrderDescending<uint32_t>>>(num_keys);
// BenchSort<TraitsLane<OrderAscending<uint64_t>>>(num_keys);
// 128-bit keys are only benchmarked when vxsort is absent and VQSort is
// enabled.
#if !HAVE_VXSORT && VQSORT_ENABLED
BenchSort<Traits128<OrderAscending128>>(num_keys);
BenchSort<Traits128<OrderAscendingKV128>>(num_keys);
#endif
}
}
} // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
int64_t first_sort_target = 0; // none run yet
namespace {
HWY_BEFORE_TEST(BenchSort);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllPartition);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllBase);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllSort);
} // namespace
} // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,191 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <algorithm>
#include "hwy/base.h"
// Based on A.7 in "Entwurf und Implementierung vektorisierter
// Sortieralgorithmen" and code by Mark Blacher.
// Writes to stdout the generated statements of the 16x2 merge network:
// alternating groups of st.SwapAdjacent and st.Sort2 calls on the vectors
// v0..vf, a final st.SortPairsDistance1 pass over all of them, and a
// terminating blank line.
void PrintMergeNetwork16x2() {
  const auto reverse = [](int v) {
    printf("v%x = st.SwapAdjacent(d, v%x);\n", v, v);
  };
  const auto sort2 = [](int a, int b) {
    printf("st.Sort2(d, v%x, v%x);\n", a, b);
  };

  // Merge across all 16 vectors.
  for (int v = 8; v < 16; ++v) reverse(v);
  for (int v = 0; v < 8; ++v) sort2(v, 15 - v);

  // Merge within each group of 8.
  for (int v = 4; v < 8; ++v) {
    reverse(v);
    reverse(v + 8);
  }
  for (int v = 0; v < 4; ++v) {
    sort2(v, 7 - v);
    sort2(v + 8, 15 - v);
  }

  // Merge within each group of 4.
  for (int base = 0; base < 16; base += 4) {
    reverse(base + 2);
    reverse(base + 3);
  }
  for (int base = 0; base < 16; base += 4) {
    sort2(base, base + 3);
    sort2(base + 1, base + 2);
  }

  // Merge within each pair.
  for (int v = 1; v < 16; v += 2) reverse(v);
  for (int v = 0; v < 16; v += 2) sort2(v, v + 1);

  for (int v = 0; v < 16; ++v) {
    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", v, v);
  }
  printf("\n");
}
// Writes to stdout a blank line followed by the generated statements of the
// 16x4 merge network: alternating groups of st.Reverse4 and st.Sort2 calls on
// the vectors v0..vf, then st.SortPairsReverse4 and st.SortPairsDistance1
// passes over all of them.
void PrintMergeNetwork16x4() {
  const auto reverse = [](int v) {
    printf("v%x = st.Reverse4(d, v%x);\n", v, v);
  };
  const auto sort2 = [](int a, int b) {
    printf("st.Sort2(d, v%x, v%x);\n", a, b);
  };

  printf("\n");

  // Merge across all 16 vectors.
  for (int v = 8; v < 16; ++v) reverse(v);
  for (int v = 0; v < 8; ++v) sort2(v, 15 - v);

  // Merge within each group of 8.
  for (int v = 4; v < 8; ++v) {
    reverse(v);
    reverse(v + 8);
  }
  for (int v = 0; v < 4; ++v) {
    sort2(v, 7 - v);
    sort2(v + 8, 15 - v);
  }

  // Merge within each group of 4.
  for (int base = 0; base < 16; base += 4) {
    reverse(base + 2);
    reverse(base + 3);
  }
  for (int base = 0; base < 16; base += 4) {
    sort2(base, base + 3);
    sort2(base + 1, base + 2);
  }

  // Merge within each pair.
  for (int v = 1; v < 16; v += 2) reverse(v);
  for (int v = 0; v < 16; v += 2) sort2(v, v + 1);

  // Finish within each vector.
  for (int v = 0; v < 16; ++v) {
    printf("v%x = st.SortPairsReverse4(d, v%x);\n", v, v);
  }
  for (int v = 0; v < 16; ++v) {
    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", v, v);
  }
}
// Writes to stdout a blank line followed by the generated statements of the
// 16x8 merge network: alternating groups of st.ReverseKeys8 and st.Sort2 calls
// on the vectors v0..vf, then st.SortPairsReverse8, st.SortPairsDistance2 and
// st.SortPairsDistance1 passes over all of them.
void PrintMergeNetwork16x8() {
  const auto reverse = [](int v) {
    printf("v%x = st.ReverseKeys8(d, v%x);\n", v, v);
  };
  const auto sort2 = [](int a, int b) {
    printf("st.Sort2(d, v%x, v%x);\n", a, b);
  };

  printf("\n");

  // Merge across all 16 vectors.
  for (int v = 8; v < 16; ++v) reverse(v);
  for (int v = 0; v < 8; ++v) sort2(v, 15 - v);

  // Merge within each group of 8.
  for (int v = 4; v < 8; ++v) {
    reverse(v);
    reverse(v + 8);
  }
  for (int v = 0; v < 4; ++v) {
    sort2(v, 7 - v);
    sort2(v + 8, 15 - v);
  }

  // Merge within each group of 4.
  for (int base = 0; base < 16; base += 4) {
    reverse(base + 2);
    reverse(base + 3);
  }
  for (int base = 0; base < 16; base += 4) {
    sort2(base, base + 3);
    sort2(base + 1, base + 2);
  }

  // Merge within each pair.
  for (int v = 1; v < 16; v += 2) reverse(v);
  for (int v = 0; v < 16; v += 2) sort2(v, v + 1);

  // Finish within each vector.
  for (int v = 0; v < 16; ++v) {
    printf("v%x = st.SortPairsReverse8(d, v%x);\n", v, v);
  }
  for (int v = 0; v < 16; ++v) {
    printf("v%x = st.SortPairsDistance2<kOrder>(d, v%x);\n", v, v);
  }
  for (int v = 0; v < 16; ++v) {
    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", v, v);
  }
}
// Writes to stdout a blank line followed by the generated statements of the
// 16x16 merge network: alternating groups of st.ReverseKeys16 and st.Sort2
// calls on the vectors v0..vf, then st.SortPairsReverse16 and
// st.SortPairsDistance4/2/1 passes over all of them.
void PrintMergeNetwork16x16() {
  const auto reverse = [](int v) {
    printf("v%x = st.ReverseKeys16(d, v%x);\n", v, v);
  };
  const auto sort2 = [](int a, int b) {
    printf("st.Sort2(d, v%x, v%x);\n", a, b);
  };

  printf("\n");

  // Merge across all 16 vectors.
  for (int v = 8; v < 16; ++v) reverse(v);
  for (int v = 0; v < 8; ++v) sort2(v, 15 - v);

  // Merge within each group of 8.
  for (int v = 4; v < 8; ++v) {
    reverse(v);
    reverse(v + 8);
  }
  for (int v = 0; v < 4; ++v) {
    sort2(v, 7 - v);
    sort2(v + 8, 15 - v);
  }

  // Merge within each group of 4.
  for (int base = 0; base < 16; base += 4) {
    reverse(base + 2);
    reverse(base + 3);
  }
  for (int base = 0; base < 16; base += 4) {
    sort2(base, base + 3);
    sort2(base + 1, base + 2);
  }

  // Merge within each pair.
  for (int v = 1; v < 16; v += 2) reverse(v);
  for (int v = 0; v < 16; v += 2) sort2(v, v + 1);

  // Finish within each vector.
  for (int v = 0; v < 16; ++v) {
    printf("v%x = st.SortPairsReverse16<kOrder>(d, v%x);\n", v, v);
  }
  for (int v = 0; v < 16; ++v) {
    printf("v%x = st.SortPairsDistance4<kOrder>(d, v%x);\n", v, v);
  }
  for (int v = 0; v < 16; ++v) {
    printf("v%x = st.SortPairsDistance2<kOrder>(d, v%x);\n", v, v);
  }
  for (int v = 0; v < 16; ++v) {
    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", v, v);
  }
}
// Prints the four merge networks (16x2, 16x4, 16x8, 16x16) to stdout, in that
// order. Command-line arguments are ignored, hence the unnamed parameters
// (avoids unused-parameter warnings).
int main(int /*argc*/, char** /*argv*/) {
  PrintMergeNetwork16x2();
  PrintMergeNetwork16x4();
  PrintMergeNetwork16x8();
  PrintMergeNetwork16x16();
  return 0;
}
@@ -0,0 +1,139 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/algo-inl.h"
// Normal include guard for non-SIMD parts
#ifndef HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
#include <time.h>
#include <algorithm> // std::sort
#include <string>
#include "hwy/base.h"
#include "hwy/nanobenchmark.h"
namespace hwy {
// Point in time, captured from platform::Now() when the object is constructed.
struct Timestamp {
  Timestamp() : t(platform::Now()) {}
  double t;  // value returned by platform::Now() at construction
};
static inline double SecondsSince(const Timestamp& t0) {
const Timestamp t1;
return t1.t - t0.t;
}
// Returns trimmed mean (we don't want to run an out-of-L3-cache sort often
// enough for the mode to be reliable): the average of the second quartile,
// i.e. of elements [num / 4, num / 2) after sorting, which discards the
// fastest quarter and the slower half of the measurements.
//
// Sorts `seconds` in place. That window is empty for fewer than two
// measurements, which previously caused a 0/0 division (NaN): a single
// measurement is now returned as-is, and 0.0 is returned for empty input.
static inline double SummarizeMeasurements(std::vector<double>& seconds) {
  std::sort(seconds.begin(), seconds.end());

  const size_t num = seconds.size();
  if (num == 0) return 0.0;

  const size_t begin = num / 4;
  const size_t end = num / 2;
  // Only possible for num == 1 (begin == end == 0).
  if (begin == end) return seconds[begin];

  double sum = 0;
  for (size_t i = begin; i < end; ++i) {
    sum += seconds[i];
  }
  return sum / static_cast<double>(end - begin);
}
} // namespace hwy
#endif // HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// One benchmark measurement: which algorithm sorted how many keys of which
// type and distribution, on how many threads, and how long that took.
struct Result {
  // All members have default initializers, so a default-constructed Result can
  // be copied and printed without reading indeterminate values.
  Result() {}

  // Records the measurement; the target is the one this code is compiled for.
  // `sec` is the elapsed time and `sizeof_key` the key size in bytes (both are
  // only used to compute the throughput shown by Print).
  Result(const Algo algo, Dist dist, size_t num_keys, size_t num_threads,
         double sec, size_t sizeof_key, const std::string& key_name)
      : target(HWY_TARGET),
        algo(algo),
        dist(dist),
        num_keys(num_keys),
        num_threads(num_threads),
        sec(sec),
        sizeof_key(sizeof_key),
        key_name(key_name) {}

  // Prints one line with target, algorithm, key type, distribution, number of
  // keys, throughput in MB/s and thread count.
  void Print() const {
    // Every thread sorts its own `num_keys` keys.
    const double bytes = static_cast<double>(num_keys) *
                         static_cast<double>(num_threads) *
                         static_cast<double>(sizeof_key);
    printf("%10s: %12s: %7s: %9s: %.2E %4.0f MB/s (%2zu threads)\n",
           hwy::TargetName(target), AlgoName(algo), key_name.c_str(),
           DistName(dist), static_cast<double>(num_keys), bytes * 1E-6 / sec,
           num_threads);
  }

  int64_t target = 0;
  Algo algo{};
  Dist dist{};
  size_t num_keys = 0;
  size_t num_threads = 0;
  double sec = 0.0;
  size_t sizeof_key = 0;
  std::string key_name;
};
// Checks that out[0, num_lanes) is ordered according to `st` and that its
// statistics match `input_stats`, i.e. those of the unsorted input.
//
// Aborts after printing the offending keys (prefixed by `caller`) if two
// adjacent keys are out of order, and also if `num_lanes` is less than the
// number of lanes per key. Otherwise returns the result of comparing the
// input and output statistics.
template <class Traits, typename LaneType>
bool VerifySort(Traits st, const InputStats<LaneType>& input_stats,
                const LaneType* out, size_t num_lanes, const char* caller) {
  constexpr size_t N1 = st.LanesPerKey();
  HWY_ASSERT(num_lanes >= N1);

  InputStats<LaneType> output_stats;
  // Ensure it matches the sort order
  for (size_t i = 0; i < num_lanes - N1; i += N1) {
    output_stats.Notify(out[i]);
    if (N1 == 2) output_stats.Notify(out[i + 1]);
    // Reverse order instead of checking !Compare1 so we accept equal keys.
    if (st.Compare1(out + i + N1, out + i)) {
      // The message shows two lanes per key. For single-lane keys,
      // i + N1 + 1 equals num_lanes when the last pair is out of order, so
      // clamp that index to avoid reading past the end of `out`.
      const size_t last_lane = num_lanes - 1;
      const size_t next_upper =
          (i + N1 + 1 <= last_lane) ? (i + N1 + 1) : last_lane;
      printf("%s: i=%d of %d lanes: N1=%d %5.0f %5.0f vs. %5.0f %5.0f\n\n",
             caller, static_cast<int>(i), static_cast<int>(num_lanes),
             static_cast<int>(N1), static_cast<double>(out[i + 1]),
             static_cast<double>(out[i + 0]),
             static_cast<double>(out[next_upper]),
             static_cast<double>(out[i + N1]));
      HWY_ABORT("%d-bit sort is incorrect\n",
                static_cast<int>(sizeof(LaneType) * 8 * N1));
    }
  }
  // The loop stops before the final key, so account for it here.
  output_stats.Notify(out[num_lanes - N1]);
  if (N1 == 2) output_stats.Notify(out[num_lanes - N1 + 1]);

  return input_stats == output_stats;
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
@@ -0,0 +1,133 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Definitions shared between vqsort-inl and sorting_networks-inl.
// Normal include guard for target-independent parts
#ifndef HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
#include "hwy/base.h"
namespace hwy {
// Sizing constants used internally by the sort. They exist to avoid magic
// numbers/literals; none of them can be changed without also changing the
// code that relies on it.
struct SortConstants {
  // Maximum number of *keys* per vector, i.e. the column count of the matrix
  // into which SortingNetwork reshapes its input.
#if HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
  static constexpr size_t kMaxCols = 8;  // avoid build timeout/stack overflow
#else
  static constexpr size_t kMaxCols = 16;  // enough for u32 in 512-bit vector
#endif

  // Why 16 rows: a compromise between using the 32 AVX-512/SVE/RVV
  // registers, fitting within 16 AVX2 registers with only a few spills,
  // keeping BaseCase code size reasonable (7 KiB for AVX-512 and 16 cols),
  // and minimizing the extra logN factor for larger networks (for which only
  // loose upper bounds on size are known).
  static constexpr size_t kMaxRowsLog2 = 4;
  static constexpr size_t kMaxRows = size_t{1} << kMaxRowsLog2;

  // Rows times columns, where the column count is N capped at kMaxCols.
  static constexpr HWY_INLINE size_t BaseCaseNum(size_t N) {
    return HWY_MIN(N, kMaxCols) * kMaxRows;
  }

  // Unroll factor for Partition. Unrolling matters (pipelining and amortizing
  // branch mispredictions); 2x already reaches full memory bandwidth on SKX
  // in Partition but is somewhat slower for sorting than 4x.
  //
  // Changing this also requires updating `left + 3 * N` etc. in the loop.
  static constexpr size_t kPartitionUnroll = 4;

  // Number of lanes in the Partition buffer.
  static constexpr HWY_INLINE size_t PartitionBufNum(size_t N) {
    // The main loop consumes kPartitionUnroll vectors per iteration and loads
    // from both left and right beforehand, hence a minimum of
    // 2 * kPartitionUnroll vectors. Smaller inputs (only guaranteed to be
    // >= BaseCaseNum) have their right side partitioned into a buffer. One
    // further vector at the end keeps CompressStore from overwriting
    // anything.
    return N * (2 * kPartitionUnroll + 1);
  }

  // Chunk := group of keys loaded for sampling a pivot. Sized to the typical
  // 64-byte cache line for maximum benefit per L2 miss; if vectors are
  // larger, whole vectors are used so that we do not overrun the array.
  static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) {
    return HWY_MAX(N, 64 / sizeof_t);
  }

  // Number of lanes in the pivot-sampling buffer.
  static constexpr HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) {
    // Two padding vectors, three chunks of medians and one chunk of median
    // medians.
    return 2 * N + 4 * LanesPerChunk(sizeof_t, N);
  }

  // Number of lanes of type T required to satisfy all three users above.
  template <typename T>
  static constexpr HWY_INLINE size_t BufNum(size_t N) {
    // The base case gets one extra vector for padding plus another for
    // full-vector loads.
    return HWY_MAX(HWY_MAX(PartitionBufNum(N), PivotBufNum(sizeof(T), N)),
                   BaseCaseNum(N) + 2 * N);
  }

  // Same as BufNum, but takes the vector size in bytes and returns bytes.
  template <typename T>
  static constexpr HWY_INLINE size_t BufBytes(size_t vector_size) {
    return BufNum<T>(vector_size / sizeof(T)) * sizeof(T);
  }
};
} // namespace hwy
#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
#endif
#include "hwy/highway.h"
// vqsort isn't available on HWY_SCALAR, and builds time out on MSVC opt and
// Arm v7 debug.
// VQSORT_ENABLED is therefore 0 in exactly those three configurations and 1
// otherwise. This sits in the per-target section, so any previous definition
// is removed first and the value is re-derived for the current HWY_TARGET.
#undef VQSORT_ENABLED
#if (HWY_TARGET == HWY_SCALAR) || \
(HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \
(HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD)
#define VQSORT_ENABLED 0
#else
#define VQSORT_ENABLED 1
#endif
namespace hwy {
namespace HWY_NAMESPACE {
// Default tag / vector width selector.
// SortTag<T> is ScalableTag<T, -1> on the RVV target and ScalableTag<T> on
// every other target.
#if HWY_TARGET == HWY_RVV
// Use LMUL = 1/2; for SEW=64 this ends up emulated via vsetvl.
template <typename T>
using SortTag = ScalableTag<T, -1>;
#else
template <typename T>
using SortTag = ScalableTag<T>;
#endif
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
@@ -0,0 +1,626 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h> // memcpy
#include <unordered_map>
#include <vector>
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/sort/vqsort.h"
// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h" // BaseCase
#include "hwy/tests/test_util-inl.h"
// clang-format on
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace {
using detail::OrderAscending;
using detail::OrderDescending;
using detail::SharedTraits;
using detail::TraitsLane;
#if VQSORT_ENABLED || HWY_IDE
using detail::OrderAscending128;
using detail::OrderAscendingKV128;
using detail::OrderAscendingKV64;
using detail::OrderDescending128;
using detail::OrderDescendingKV128;
using detail::OrderDescendingKV64;
using detail::Traits128;
// Checks detail::MedianOf3 against all eight combinations of three inputs
// that are each either 0 or 1.
template <class Traits>
static HWY_NOINLINE void TestMedian3() {
  using LaneType = typename Traits::LaneType;
  using D = CappedTag<LaneType, 1>;
  using V = Vec<D>;
  const D d;
  SharedTraits<Traits> st;

  // Bit k of `pattern` is the value of input k.
  for (uint32_t pattern = 0; pattern < 8; ++pattern) {
    const V in0 = Set(d, LaneType{(pattern & (1u << 0)) ? 1u : 0u});
    const V in1 = Set(d, LaneType{(pattern & (1u << 1)) ? 1u : 0u});
    const V in2 = Set(d, LaneType{(pattern & (1u << 2)) ? 1u : 0u});
    const LaneType median = GetLane(detail::MedianOf3(st, in0, in1, in2));

    // The median is 1 exactly when at least two of the three inputs are 1.
    const size_t num_ones = PopCount(pattern);
    const LaneType expected = (num_ones >= 2) ? static_cast<LaneType>(1) : 0;
    HWY_ASSERT_EQ(expected, median);
  }
}
// Runs TestMedian3 for ascending uint64_t lanes.
HWY_NOINLINE void TestAllMedian() {
TestMedian3<TraitsLane<OrderAscending<uint64_t> > >();
}
// Runs detail::BaseCase on ascending and descending ramps, for several input
// lengths and several misaligned starting positions. After each run the
// output is verified with VerifySort, and the sentinel lanes placed before
// and after the input are checked to be unmodified.
template <class Traits>
static HWY_NOINLINE void TestBaseCaseAscDesc() {
  using LaneType = typename Traits::LaneType;
  SharedTraits<Traits> st;
  const SortTag<LaneType> d;
  const size_t N = Lanes(d);
  const size_t base_case_num = SortConstants::BaseCaseNum(N);
  const size_t N1 = st.LanesPerKey();
  constexpr int kDebug = 0;

  // N lanes of slack for the misalignment, plus N lanes after the input.
  auto aligned_lanes = hwy::AllocateAligned<LaneType>(N + base_case_num + N);
  auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N);

  const std::vector<size_t> lengths = {HWY_MAX(1, N1),
                                       3 * N1,
                                       base_case_num / 2,
                                       base_case_num / 2 + N1,
                                       base_case_num - N1,
                                       base_case_num};

  std::vector<size_t> misalignments;
  misalignments.push_back(0);
  misalignments.push_back(1);
  if (N >= 6) misalignments.push_back(N / 2 - 1);
  misalignments.push_back(N / 2);
  misalignments.push_back(N / 2 + 1);
  misalignments.push_back(HWY_MIN(2 * N / 3 + 3, size_t{N - 1}));

  // Value written around the input; it must survive the sort unchanged.
  const LaneType sentinel = hwy::LowestValue<LaneType>();
  // One past the last lane (relative to `lanes`) that holds a sentinel.
  const size_t sentinel_end = base_case_num + N;

  for (const bool asc : {false, true}) {
    for (const size_t len : lengths) {
      for (const size_t misalign : misalignments) {
        LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign;
        if (kDebug) {
          printf("============%s asc %d N1 %d len %d misalign %d\n",
                 st.KeyString().c_str(), asc, static_cast<int>(N1),
                 static_cast<int>(len), static_cast<int>(misalign));
        }

        // Sentinels in front of the (misaligned) input.
        for (size_t k = 0; k < misalign; ++k) {
          aligned_lanes[k] = sentinel;
        }

        // Input: 1..len when `asc`, otherwise len..1.
        InputStats<LaneType> input_stats;
        for (size_t k = 0; k < len; ++k) {
          lanes[k] = asc ? static_cast<LaneType>(LaneType(k) + 1)
                         : static_cast<LaneType>(LaneType(len) - LaneType(k));
          input_stats.Notify(lanes[k]);
          if (kDebug >= 2) {
            printf("%3zu: %f\n", k, static_cast<double>(lanes[k]));
          }
        }

        // Sentinels behind the input.
        for (size_t k = len; k < sentinel_end; ++k) {
          lanes[k] = sentinel;
        }

        detail::BaseCase(d, st, lanes, lanes + len, len, buf.get());

        if (kDebug >= 2) {
          printf("out>>>>>>\n");
          for (size_t k = 0; k < len; ++k) {
            printf("%3zu: %f\n", k, static_cast<double>(lanes[k]));
          }
        }

        HWY_ASSERT(VerifySort(st, input_stats, lanes, len, "BaseAscDesc"));

        // Neither sentinel region may have been written to.
        for (size_t k = 0; k < misalign; ++k) {
          if (aligned_lanes[k] != sentinel) {
            HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(k));
          }
        }
        for (size_t k = len; k < sentinel_end; ++k) {
          if (lanes[k] != sentinel) {
            HWY_ABORT("Overrun right at %d\n", static_cast<int>(k));
          }
        }
      }  // misalign
    }    // len
  }      // asc
}
// Runs detail::BaseCase on inputs whose lanes are all 0 or 1, enumerating bit
// patterns for the leading lanes, for several input lengths. Each output is
// verified with VerifySort, and the sentinel lanes after the input are
// checked to be unmodified.
template <class Traits>
static HWY_NOINLINE void TestBaseCase01() {
  using LaneType = typename Traits::LaneType;
  SharedTraits<Traits> st;
  const SortTag<LaneType> d;
  const size_t N = Lanes(d);
  const size_t base_case_num = SortConstants::BaseCaseNum(N);
  const size_t N1 = st.LanesPerKey();
  constexpr int kDebug = 0;

  auto lanes = hwy::AllocateAligned<LaneType>(base_case_num + N);
  auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N);

  const std::vector<size_t> lengths = {HWY_MAX(1, N1),
                                       3 * N1,
                                       base_case_num / 2,
                                       base_case_num / 2 + N1,
                                       base_case_num - N1,
                                       base_case_num};

  // Value written after the input; it must survive the sort unchanged.
  const LaneType sentinel = hwy::LowestValue<LaneType>();
  // One past the last lane that holds a sentinel.
  const size_t sentinel_end = base_case_num + N;

  for (const size_t len : lengths) {
    if (kDebug) {
      printf("============%s 01 N1 %d len %d\n", st.KeyString().c_str(),
             static_cast<int>(N1), static_cast<int>(len));
    }

    const uint64_t kMaxBits = AdjustedLog2Reps(HWY_MIN(len, size_t{14}));
    // Exclusive upper bound of the enumerated bit patterns.
    const uint64_t pattern_end = (1ull << kMaxBits) - 1;

    for (uint64_t bits = 0; bits < pattern_end; ++bits) {
      // Lane k is 1 iff bit k of `bits` is set; lanes from 64 on are 0.
      InputStats<LaneType> input_stats;
      for (size_t k = 0; k < len; ++k) {
        lanes[k] = (k < 64 && (bits & (1ull << k))) ? 1 : 0;
        input_stats.Notify(lanes[k]);
        if (kDebug >= 2) {
          printf("%3zu: %f\n", k, static_cast<double>(lanes[k]));
        }
      }

      // Sentinels behind the input.
      for (size_t k = len; k < sentinel_end; ++k) {
        lanes[k] = sentinel;
      }

      detail::BaseCase(d, st, lanes.get(), lanes.get() + len, len, buf.get());

      if (kDebug >= 2) {
        printf("out>>>>>>\n");
        for (size_t k = 0; k < len; ++k) {
          printf("%3zu: %f\n", k, static_cast<double>(lanes[k]));
        }
      }

      HWY_ASSERT(VerifySort(st, input_stats, lanes.get(), len, "Base01"));

      for (size_t k = len; k < sentinel_end; ++k) {
        if (lanes[k] != sentinel) {
          HWY_ABORT("Overrun right at %d\n", static_cast<int>(k));
        }
      }
    }  // bits
  }    // len
}
// Runs both BaseCase tests (ascending/descending ramps, then 0/1 patterns)
// for the given Traits.
template <class Traits>
static HWY_NOINLINE void TestBaseCase() {
TestBaseCaseAscDesc<Traits>();
TestBaseCase01<Traits>();
}
// Runs TestBaseCase for two lane-based orders and both 128-bit orders.
// Whenever _MSC_VER is defined, the function returns before running any of
// them.
HWY_NOINLINE void TestAllBaseCase() {
// Workaround for stack overflow on MSVC debug.
#if defined(_MSC_VER)
return;
#endif
TestBaseCase<TraitsLane<OrderAscending<int32_t> > >();
TestBaseCase<TraitsLane<OrderDescending<int64_t> > >();
TestBaseCase<Traits128<OrderAscending128> >();
TestBaseCase<Traits128<OrderDescending128> >();
}
// Verifies the outcome of a partition of lanes[left, right) around `pivot`,
// where `border` is the first index of the right-hand side. Aborts if a key
// in [left, border) has the pivot comparing before it, or if a key in
// [border, right) does not. `left`, `border` and `right` must be multiples
// of N1 (the number of lanes per key).
template <class Traits>
static HWY_NOINLINE void VerifyPartition(
    Traits st, typename Traits::LaneType* HWY_RESTRICT lanes, size_t left,
    size_t border, size_t right, const size_t N1,
    const typename Traits::LaneType* pivot) {
  // Debugging aid, disabled:
  //   for (size_t i = left; i < right; ++i) {
  //     if (i == border) printf("--\n");
  //     printf("%4zu: %3d\n", i, lanes[i]);
  //   }

  HWY_ASSERT(left % N1 == 0);
  HWY_ASSERT(border % N1 == 0);
  HWY_ASSERT(right % N1 == 0);
  const bool asc = typename Traits::Order().IsAscending();

  // Left-hand side: the pivot must not compare before any of these keys.
  for (size_t pos = left; pos < border; pos += N1) {
    if (!st.Compare1(pivot, lanes + pos)) continue;
    HWY_ABORT(
        "%s: asc %d left[%d] piv %.0f %.0f compares before %.0f %.0f "
        "border %d",
        st.KeyString().c_str(), asc, static_cast<int>(pos),
        static_cast<double>(pivot[1]), static_cast<double>(pivot[0]),
        static_cast<double>(lanes[pos + 1]),
        static_cast<double>(lanes[pos + 0]), static_cast<int>(border));
  }

  // Right-hand side: the pivot must compare before every one of these keys.
  for (size_t pos = border; pos < right; pos += N1) {
    if (st.Compare1(pivot, lanes + pos)) continue;
    HWY_ABORT(
        "%s: asc %d right[%d] piv %.0f %.0f compares after %.0f %.0f "
        "border %d",
        st.KeyString().c_str(), asc, static_cast<int>(pos),
        static_cast<double>(pivot[1]), static_cast<double>(pivot[0]),
        static_cast<double>(lanes[pos + 1]), static_cast<double>(lanes[pos]),
        static_cast<int>(border));
  }
}
// Exercises detail::Partition for the given Traits over a grid of
// input directions (descending/ascending ramps), left offsets, lengths
// slightly above BaseCaseNum, pivots and misalignments. After each call it
// checks that (1) the multiset of lanes in [left, right) is unchanged,
// (2) the lanes are correctly split around the returned border
// (VerifyPartition), and (3) the sentinel lanes before `left`, before the
// misaligned start and after `right` were not overwritten.
template <class Traits>
static HWY_NOINLINE void TestPartition() {
using LaneType = typename Traits::LaneType;
const SortTag<LaneType> d;
SharedTraits<Traits> st;
const bool asc = typename Traits::Order().IsAscending();
const size_t N = Lanes(d);
constexpr int kDebug = 0;
const size_t base_case_num = SortConstants::BaseCaseNum(N);
// left + len + align
const size_t total = 32 + (base_case_num + 4 * HWY_MAX(N, 4)) + 2 * N;
auto aligned_lanes = hwy::AllocateAligned<LaneType>(total);
auto buf = hwy::AllocateAligned<LaneType>(SortConstants::PartitionBufNum(N));
const size_t N1 = st.LanesPerKey();
for (bool in_asc : {false, true}) {
for (int left_i : {0, 1, 4, 6, 7, 8, 12, 15, 22, 28, 30, 31}) {
// Round down to a multiple of N1 so that `left` starts on a whole key.
const size_t left = static_cast<size_t>(left_i) & ~(N1 - 1);
for (size_t ofs : {N, N + 1, N + 3, 2 * N, 2 * N + 2, 2 * N + 3,
3 * N - 1, 4 * N - 3, 4 * N - 2}) {
// Likewise round the length down to a whole number of keys.
const size_t len = (base_case_num + ofs) & ~(N1 - 1);
for (LaneType pivot1 :
{LaneType(0), LaneType(len / 3), LaneType(len / 2),
LaneType(2 * len / 3), LaneType(len)}) {
// Two lanes so that the same array also serves two-lane keys; the second
// lane is zero.
const LaneType pivot2[2] = {pivot1, 0};
const auto pivot = st.SetKey(d, pivot2);
for (size_t misalign = 0; misalign < N;
misalign += st.LanesPerKey()) {
LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign;
const size_t right = left + len;
if (kDebug) {
printf(
"=========%s asc %d left %d len %d right %d piv %.0f %.0f\n",
st.KeyString().c_str(), asc, static_cast<int>(left),
static_cast<int>(len), static_cast<int>(right),
static_cast<double>(pivot2[1]),
static_cast<double>(pivot2[0]));
}
// Sentinels before the misaligned start and before `left`.
for (size_t i = 0; i < misalign; ++i) {
aligned_lanes[i] = hwy::LowestValue<LaneType>();
}
for (size_t i = 0; i < left; ++i) {
lanes[i] = hwy::LowestValue<LaneType>();
}
// Fill [left, right) with a ramp (1..len or len..1) and count how often
// each value occurs.
std::unordered_map<LaneType, int> counts;
for (size_t i = left; i < right; ++i) {
lanes[i] = static_cast<LaneType>(
in_asc ? LaneType(i + 1) - static_cast<LaneType>(left)
: static_cast<LaneType>(right) - LaneType(i));
++counts[lanes[i]];
if (kDebug >= 2) {
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
}
// Sentinels after `right`, up to the end of the allocation.
for (size_t i = right; i < total - misalign; ++i) {
lanes[i] = hwy::LowestValue<LaneType>();
}
// Partition returns an offset relative to `lanes + left`; convert it to an
// index into `lanes`.
size_t border =
left + detail::Partition(d, st, lanes + left, right - left,
pivot, buf.get());
if (kDebug >= 2) {
printf("out>>>>>>\n");
for (size_t i = left; i < right; ++i) {
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
for (size_t i = right; i < total - misalign; ++i) {
printf("%3zu: sentinel %f\n", i, static_cast<double>(lanes[i]));
}
}
// Every value must occur exactly as often as before the call, i.e. all
// counts return to zero.
for (size_t i = left; i < right; ++i) {
--counts[lanes[i]];
}
for (auto kv : counts) {
if (kv.second != 0) {
PrintValue(kv.first);
HWY_ABORT("Incorrect count %d\n", kv.second);
}
}
VerifyPartition(st, lanes, left, border, right, N1, pivot2);
// None of the three sentinel regions may have been written to.
for (size_t i = 0; i < misalign; ++i) {
if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
}
for (size_t i = 0; i < left; ++i) {
if (lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
}
for (size_t i = right; i < total - misalign; ++i) {
if (lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
}
} // misalign
} // pivot
} // len
} // left
} // asc
}
// Runs TestPartition for several key types and orders. Debug builds
// (HWY_IS_DEBUG_BUILD) only run the first two instantiations; the double
// instantiation additionally requires HWY_HAVE_FLOAT64.
HWY_NOINLINE void TestAllPartition() {
TestPartition<TraitsLane<OrderDescending<int32_t> > >();
TestPartition<Traits128<OrderAscending128> >();
#if !HWY_IS_DEBUG_BUILD
TestPartition<TraitsLane<OrderAscending<int16_t> > >();
TestPartition<TraitsLane<OrderAscending<int64_t> > >();
TestPartition<TraitsLane<OrderDescending<float> > >();
#if HWY_HAVE_FLOAT64
TestPartition<TraitsLane<OrderDescending<double> > >();
#endif
TestPartition<Traits128<OrderDescending128> >();
#endif
}
// Tests detail::Generator together with detail::RandomChunkIndex (used for
// sample selection for choosing a pivot): for a range of block counts, every
// chunk index drawn must be in range, and the mean of the indices must lie
// within 10% of the middle of the range.
template <typename TU>
static HWY_NOINLINE void TestRandomGenerator() {
  static_assert(!hwy::IsSigned<TU>(), "");
  SortTag<TU> du;
  const size_t N = Lanes(du);
  detail::Generator rng(&N, N);

  const size_t lanes_per_block = HWY_MAX(64 / sizeof(TU), N);  // power of two

  uint32_t num_blocks = 2;
  while (num_blocks < 100000) {
    constexpr size_t kReps = 10000;

    // Draw kReps indices, checking each one and accumulating their sum.
    uint64_t index_sum = 0;
    for (size_t draw = 0; draw < kReps; ++draw) {
      const uint32_t bits = rng() & 0xFFFFFFFF;
      const size_t index = detail::RandomChunkIndex(num_blocks, bits);
      HWY_ASSERT(((index + 1) * lanes_per_block) <=
                 num_blocks * lanes_per_block);
      index_sum += index;
    }

    // The mean index should be close to the middle of [0, num_blocks).
    const double expected = (num_blocks - 1) / 2.0;
    const double actual = static_cast<double>(index_sum) / kReps;
    HWY_ASSERT(0.9 * expected <= actual && actual <= 1.1 * expected);

    // Grow the block count by roughly 1.5x per round.
    num_blocks = 3 * num_blocks / 2;
  }
}
// Runs TestRandomGenerator for the unsigned 32-bit and 64-bit lane types.
HWY_NOINLINE void TestAllGenerator() {
TestRandomGenerator<uint32_t>();
TestRandomGenerator<uint64_t>();
}
#else
// Empty stand-ins used when the vqsort-specific tests above are compiled out,
// so that the test registration at the end of this file still finds these
// four names.
static void TestAllMedian() {}
static void TestAllBaseCase() {}
static void TestAllPartition() {}
static void TestAllGenerator() {}
#endif // VQSORT_ENABLED
// Keeps a private copy of the unsorted input so that a sort's output can
// later be compared, lane by lane, with what a reference algorithm produces
// from the same input.
template <class Traits>
class CompareResults {
  using LaneType = typename Traits::LaneType;
  using KeyType = typename Traits::KeyType;

 public:
  // Copies the `num_lanes` lanes starting at `in`.
  CompareResults(const LaneType* in, size_t num_lanes)
      : copy_(in, in + num_lanes) {}

  // Sorts the stored copy with the reference algorithm (kPDQ if
  // HAVE_PDQSORT, otherwise kStd) and compares it with `output`. On the first
  // differing lane, prints a diagnostic to stderr and returns false; returns
  // true if all lanes match.
  bool Verify(const LaneType* output) {
#if HAVE_PDQSORT
    const Algo reference = Algo::kPDQ;
#else
    const Algo reference = Algo::kStd;
#endif
    SharedState shared;
    using Order = typename Traits::Order;
    const Traits st;
    const size_t num_lanes = copy_.size();
    const size_t num_keys = num_lanes / st.LanesPerKey();
    Run<Order>(reference, reinterpret_cast<KeyType*>(copy_.data()), num_keys,
               shared, /*thread=*/0);

#if VQSORT_PRINT >= 3
    fprintf(stderr, "\nExpected:\n");
    for (size_t lane = 0; lane < num_lanes; ++lane) {
      PrintValue(copy_[lane]);
    }
    fprintf(stderr, "\n");
#endif

    for (size_t lane = 0; lane < num_lanes; ++lane) {
      if (copy_[lane] == output[lane]) continue;

      // First mismatch: report it and stop.
      if (sizeof(KeyType) == 16) {
        fprintf(stderr,
                "%s Asc %d mismatch at %d of %d: %" PRIu64 " %" PRIu64 "\n",
                st.KeyString().c_str(), Order().IsAscending(),
                static_cast<int>(lane), static_cast<int>(num_lanes),
                static_cast<uint64_t>(copy_[lane]),
                static_cast<uint64_t>(output[lane]));
      } else {
        fprintf(stderr, "Type %s Asc %d mismatch at %d of %d: ",
                st.KeyString().c_str(), Order().IsAscending(),
                static_cast<int>(lane), static_cast<int>(num_lanes));
        PrintValue(copy_[lane]);
        PrintValue(output[lane]);
        fprintf(stderr, "\n");
      }
      return false;
    }
    return true;
  }

 private:
  // Initially the unsorted input; Verify() sorts it in place.
  std::vector<LaneType> copy_;
};
std::vector<Algo> AlgoForTest() {
return {
#if HAVE_AVX2SORT
Algo::kSEA,
#endif
#if HAVE_IPS4O
Algo::kIPS4O,
#endif
#if HAVE_PDQSORT
Algo::kPDQ,
#endif
#if HAVE_SORT512
Algo::kSort512,
#endif
Algo::kHeap, Algo::kVQSort,
};
}
// Sorts `num_lanes` lanes (rounded up to a whole number of keys) with every
// algorithm from AlgoForTest() and every distribution from AllDist(), at
// several starting misalignments. Each run is checked against the reference
// algorithm (CompareResults), against the input statistics (VerifySort), and
// for writes into the red zones placed before and after the keys.
// Whenever _MSC_VER is defined, the function returns without testing
// anything.
template <class Traits>
void TestSort(size_t num_lanes) {
// Workaround for stack overflow on clang-cl (/F 8388608 does not help).
#if defined(_MSC_VER)
return;
#endif
using Order = typename Traits::Order;
using LaneType = typename Traits::LaneType;
using KeyType = typename Traits::KeyType;
SharedState shared;
SharedTraits<Traits> st;
// Round up to a whole number of keys.
num_lanes += (st.Is128() && (num_lanes & 1));
const size_t num_keys = num_lanes / st.LanesPerKey();
// Size of the red zone after the keys, and an upper bound on the
// misalignments tried below.
constexpr size_t kMaxMisalign = 16;
auto aligned =
hwy::AllocateAligned<LaneType>(kMaxMisalign + num_lanes + kMaxMisalign);
for (Algo algo : AlgoForTest()) {
for (Dist dist : AllDist()) {
for (size_t misalign : {size_t{0}, size_t{st.LanesPerKey()},
size_t{3 * st.LanesPerKey()}, kMaxMisalign / 2}) {
LaneType* lanes = aligned.get() + misalign;
// Set up red zones before/after the keys to sort
for (size_t i = 0; i < misalign; ++i) {
aligned[i] = hwy::LowestValue<LaneType>();
}
for (size_t i = 0; i < kMaxMisalign; ++i) {
lanes[num_lanes + i] = hwy::HighestValue<LaneType>();
}
// Under MSAN, the red zones are poisoned while the sort runs and
// unpoisoned again before they are read back below.
#if HWY_IS_MSAN
__msan_poison(aligned.get(), misalign * sizeof(LaneType));
__msan_poison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType));
#endif
InputStats<LaneType> input_stats =
GenerateInput(dist, lanes, num_lanes);
// Snapshot of the unsorted input; must be taken before the sort below
// modifies `lanes`.
CompareResults<Traits> compare(lanes, num_lanes);
Run<Order>(algo, reinterpret_cast<KeyType*>(lanes), num_keys, shared,
/*thread=*/0);
HWY_ASSERT(compare.Verify(lanes));
HWY_ASSERT(VerifySort(st, input_stats, lanes, num_lanes, "TestSort"));
// Check red zones
#if HWY_IS_MSAN
__msan_unpoison(aligned.get(), misalign * sizeof(LaneType));
__msan_unpoison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType));
#endif
for (size_t i = 0; i < misalign; ++i) {
if (aligned[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
}
for (size_t i = num_lanes; i < num_lanes + kMaxMisalign; ++i) {
if (lanes[i] != hwy::HighestValue<LaneType>())
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
}
} // misalign
} // dist
} // algo
}
// Runs TestSort for a range of key types and orders at four input sizes,
// each passed through AdjustedReps. The double instantiation requires both
// HWY_HAVE_FLOAT64 and Sorter::HaveFloat64(); the 128-bit and key-value
// instantiations are only compiled when VQSORT_ENABLED is set.
void TestAllSort() {
for (int num : {129, 504, 3 * 1000, 34567}) {
const size_t num_lanes = AdjustedReps(static_cast<size_t>(num));
TestSort<TraitsLane<OrderAscending<int16_t> > >(num_lanes);
TestSort<TraitsLane<OrderDescending<uint16_t> > >(num_lanes);
TestSort<TraitsLane<OrderDescending<int32_t> > >(num_lanes);
TestSort<TraitsLane<OrderDescending<uint32_t> > >(num_lanes);
TestSort<TraitsLane<OrderAscending<int64_t> > >(num_lanes);
TestSort<TraitsLane<OrderAscending<uint64_t> > >(num_lanes);
// WARNING: for float types, SIMD comparisons will flush denormals to
// zero, causing mismatches with scalar sorts. In this test, we avoid
// generating denormal inputs.
TestSort<TraitsLane<OrderAscending<float> > >(num_lanes);
#if HWY_HAVE_FLOAT64 // protects algo-inl's GenerateRandom
if (Sorter::HaveFloat64()) {
TestSort<TraitsLane<OrderDescending<double> > >(num_lanes);
}
#endif
// Our HeapSort does not support 128-bit keys.
#if VQSORT_ENABLED
TestSort<Traits128<OrderAscending128> >(num_lanes);
TestSort<Traits128<OrderDescending128> >(num_lanes);
TestSort<TraitsLane<OrderAscendingKV64> >(num_lanes);
TestSort<TraitsLane<OrderDescendingKV64> >(num_lanes);
TestSort<Traits128<OrderAscendingKV128> >(num_lanes);
TestSort<Traits128<OrderDescendingKV128> >(num_lanes);
#endif
}
}
} // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
// Test registration; compiled only when HWY_ONCE is set.
// NOTE(review): presumably HWY_ONCE is true for exactly one of the per-target
// compilation passes, so each test is registered a single time - confirm
// against hwy/foreach_target.h.
#if HWY_ONCE
namespace hwy {
namespace {
HWY_BEFORE_TEST(SortTest);
// One entry per TestAll* function defined in this file.
HWY_EXPORT_AND_TEST_P(SortTest, TestAllMedian);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllBaseCase);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllPartition);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllGenerator);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllSort);
} // namespace
} // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,695 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#endif
#include "hwy/contrib/sort/shared-inl.h" // SortConstants
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {
#if VQSORT_ENABLED
using Constants = hwy::SortConstants;
// ------------------------------ SharedTraits

// Operations common to all traits, implemented on top of the primitives that
// `Base` provides. It's unclear whether these can profitably be specialized
// for Lane vs Block, or optimized like SortPairsDistance1 using
// Compare/DupOdd.
template <class Base>
struct SharedTraits : public Base {
  // Conditionally swaps keys that are two apart: lane 0 with 2, 1 with 3 etc.
  template <class D>
  HWY_INLINE Vec<D> SortPairsDistance2(D d, Vec<D> v) const {
    const Base& self = *this;
    Vec<D> partner = self.SwapAdjacentPairs(d, v);
    self.Sort2(d, v, partner);  // updates both `v` and `partner`
    return self.OddEvenPairs(d, partner, v);
  }

  // Conditionally swaps `v` with the vector obtained by reversing each
  // contiguous group of 8 keys.
  template <class D>
  HWY_INLINE Vec<D> SortPairsReverse8(D d, Vec<D> v) const {
    const Base& self = *this;
    Vec<D> partner = self.ReverseKeys8(d, v);
    self.Sort2(d, v, partner);  // updates both `v` and `partner`
    return self.OddEvenQuads(d, partner, v);
  }

  // Conditionally swaps `v` with the vector obtained by reversing groups of
  // 16 keys. Because at most 16 keys fit in a vector (see the static_assert),
  // reversing the whole vector via ReverseKeys suffices.
  template <class D>
  HWY_INLINE Vec<D> SortPairsReverse16(D d, Vec<D> v) const {
    const Base& self = *this;
    static_assert(Constants::kMaxCols <= 16, "Need actual Reverse16");
    Vec<D> partner = self.ReverseKeys(d, v);
    self.Sort2(d, v, partner);  // updates both `v` and `partner`
    return ConcatUpperLower(d, partner, v);  // 8 = half of the vector
  }
};
// ------------------------------ Sorting network
// (Green's irregular) sorting network for independent columns in 16 vectors.
// Applies 60 st.Sort2 compare-exchange steps to the sixteen vectors v0..vf,
// which are updated in place. The order of the steps is part of the network
// and must not be changed.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Sort16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
V& ve, V& vf) {
// Layer 1: vectors whose indices differ by 1.
st.Sort2(d, v0, v1);
st.Sort2(d, v2, v3);
st.Sort2(d, v4, v5);
st.Sort2(d, v6, v7);
st.Sort2(d, v8, v9);
st.Sort2(d, va, vb);
st.Sort2(d, vc, vd);
st.Sort2(d, ve, vf);
// Layer 2: indices differ by 2.
st.Sort2(d, v0, v2);
st.Sort2(d, v1, v3);
st.Sort2(d, v4, v6);
st.Sort2(d, v5, v7);
st.Sort2(d, v8, va);
st.Sort2(d, v9, vb);
st.Sort2(d, vc, ve);
st.Sort2(d, vd, vf);
// Layer 3: indices differ by 4.
st.Sort2(d, v0, v4);
st.Sort2(d, v1, v5);
st.Sort2(d, v2, v6);
st.Sort2(d, v3, v7);
st.Sort2(d, v8, vc);
st.Sort2(d, v9, vd);
st.Sort2(d, va, ve);
st.Sort2(d, vb, vf);
// Layer 4: indices differ by 8.
st.Sort2(d, v0, v8);
st.Sort2(d, v1, v9);
st.Sort2(d, v2, va);
st.Sort2(d, v3, vb);
st.Sort2(d, v4, vc);
st.Sort2(d, v5, vd);
st.Sort2(d, v6, ve);
st.Sort2(d, v7, vf);
// Remaining 28 steps: the irregular part of the network.
st.Sort2(d, v5, va);
st.Sort2(d, v6, v9);
st.Sort2(d, v3, vc);
st.Sort2(d, v7, vb);
st.Sort2(d, vd, ve);
st.Sort2(d, v4, v8);
st.Sort2(d, v1, v2);
st.Sort2(d, v1, v4);
st.Sort2(d, v7, vd);
st.Sort2(d, v2, v8);
st.Sort2(d, vb, ve);
st.Sort2(d, v2, v4);
st.Sort2(d, v5, v6);
st.Sort2(d, v9, va);
st.Sort2(d, vb, vd);
st.Sort2(d, v3, v8);
st.Sort2(d, v7, vc);
st.Sort2(d, v3, v5);
st.Sort2(d, v6, v8);
st.Sort2(d, v7, v9);
st.Sort2(d, va, vc);
st.Sort2(d, v3, v4);
st.Sort2(d, v5, v6);
st.Sort2(d, v7, v8);
st.Sort2(d, v9, va);
st.Sort2(d, vb, vc);
st.Sort2(d, v6, v7);
st.Sort2(d, v8, v9);
}
// ------------------------------ Merging networks
// Blacher's hybrid bitonic/odd-even networks, generated by print_network.cc.
// Merge2 updates the sixteen vectors v0..vf in place. It consists of four
// rounds, each of which applies st.ReverseKeys2 to half of the vectors and
// then st.Sort2 to eight pairs of vectors, followed by a final
// st.SortPairsDistance1 on every vector. The code is generated and the order
// of the statements is significant.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge2(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
V& ve, V& vf) {
// Round 1: reverse the upper eight vectors, then pair v[i] with v[15 - i].
v8 = st.ReverseKeys2(d, v8);
v9 = st.ReverseKeys2(d, v9);
va = st.ReverseKeys2(d, va);
vb = st.ReverseKeys2(d, vb);
vc = st.ReverseKeys2(d, vc);
vd = st.ReverseKeys2(d, vd);
ve = st.ReverseKeys2(d, ve);
vf = st.ReverseKeys2(d, vf);
st.Sort2(d, v0, vf);
st.Sort2(d, v1, ve);
st.Sort2(d, v2, vd);
st.Sort2(d, v3, vc);
st.Sort2(d, v4, vb);
st.Sort2(d, v5, va);
st.Sort2(d, v6, v9);
st.Sort2(d, v7, v8);
// Round 2: within each group of eight vectors, reverse the upper four and
// pair the first with the last, the second with the second-to-last, etc.
v4 = st.ReverseKeys2(d, v4);
vc = st.ReverseKeys2(d, vc);
v5 = st.ReverseKeys2(d, v5);
vd = st.ReverseKeys2(d, vd);
v6 = st.ReverseKeys2(d, v6);
ve = st.ReverseKeys2(d, ve);
v7 = st.ReverseKeys2(d, v7);
vf = st.ReverseKeys2(d, vf);
st.Sort2(d, v0, v7);
st.Sort2(d, v8, vf);
st.Sort2(d, v1, v6);
st.Sort2(d, v9, ve);
st.Sort2(d, v2, v5);
st.Sort2(d, va, vd);
st.Sort2(d, v3, v4);
st.Sort2(d, vb, vc);
// Round 3: the same within each group of four vectors.
v2 = st.ReverseKeys2(d, v2);
v3 = st.ReverseKeys2(d, v3);
v6 = st.ReverseKeys2(d, v6);
v7 = st.ReverseKeys2(d, v7);
va = st.ReverseKeys2(d, va);
vb = st.ReverseKeys2(d, vb);
ve = st.ReverseKeys2(d, ve);
vf = st.ReverseKeys2(d, vf);
st.Sort2(d, v0, v3);
st.Sort2(d, v1, v2);
st.Sort2(d, v4, v7);
st.Sort2(d, v5, v6);
st.Sort2(d, v8, vb);
st.Sort2(d, v9, va);
st.Sort2(d, vc, vf);
st.Sort2(d, vd, ve);
// Round 4: reverse every odd-numbered vector and pair adjacent vectors.
v1 = st.ReverseKeys2(d, v1);
v3 = st.ReverseKeys2(d, v3);
v5 = st.ReverseKeys2(d, v5);
v7 = st.ReverseKeys2(d, v7);
v9 = st.ReverseKeys2(d, v9);
vb = st.ReverseKeys2(d, vb);
vd = st.ReverseKeys2(d, vd);
vf = st.ReverseKeys2(d, vf);
st.Sort2(d, v0, v1);
st.Sort2(d, v2, v3);
st.Sort2(d, v4, v5);
st.Sort2(d, v6, v7);
st.Sort2(d, v8, v9);
st.Sort2(d, va, vb);
st.Sort2(d, vc, vd);
st.Sort2(d, ve, vf);
// Final step, applied to each vector individually.
v0 = st.SortPairsDistance1(d, v0);
v1 = st.SortPairsDistance1(d, v1);
v2 = st.SortPairsDistance1(d, v2);
v3 = st.SortPairsDistance1(d, v3);
v4 = st.SortPairsDistance1(d, v4);
v5 = st.SortPairsDistance1(d, v5);
v6 = st.SortPairsDistance1(d, v6);
v7 = st.SortPairsDistance1(d, v7);
v8 = st.SortPairsDistance1(d, v8);
v9 = st.SortPairsDistance1(d, v9);
va = st.SortPairsDistance1(d, va);
vb = st.SortPairsDistance1(d, vb);
vc = st.SortPairsDistance1(d, vc);
vd = st.SortPairsDistance1(d, vd);
ve = st.SortPairsDistance1(d, ve);
vf = st.SortPairsDistance1(d, vf);
}
// Merge step of the sorting network for groups of four keys. SortingNetwork
// calls this after Merge2 when each row holds at least four keys.
//
// `d` is the vector tag and `st` the traits object supplying ReverseKeys4,
// Sort2, SortPairsReverse4 and SortPairsDistance1. `v0`..`vf` are the sixteen
// row vectors, updated in place. Statement order is significant: each stage
// consumes the results of the previous one.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge4(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
V& ve, V& vf) {
// Stage 1: reverse the keys of the upper eight rows, then compare-exchange
// row i with row 15 - i.
v8 = st.ReverseKeys4(d, v8);
v9 = st.ReverseKeys4(d, v9);
va = st.ReverseKeys4(d, va);
vb = st.ReverseKeys4(d, vb);
vc = st.ReverseKeys4(d, vc);
vd = st.ReverseKeys4(d, vd);
ve = st.ReverseKeys4(d, ve);
vf = st.ReverseKeys4(d, vf);
st.Sort2(d, v0, vf);
st.Sort2(d, v1, ve);
st.Sort2(d, v2, vd);
st.Sort2(d, v3, vc);
st.Sort2(d, v4, vb);
st.Sort2(d, v5, va);
st.Sort2(d, v6, v9);
st.Sort2(d, v7, v8);
// Stage 2: reverse the upper four rows of each group of eight, then
// compare-exchange mirrored rows within each group of eight.
v4 = st.ReverseKeys4(d, v4);
vc = st.ReverseKeys4(d, vc);
v5 = st.ReverseKeys4(d, v5);
vd = st.ReverseKeys4(d, vd);
v6 = st.ReverseKeys4(d, v6);
ve = st.ReverseKeys4(d, ve);
v7 = st.ReverseKeys4(d, v7);
vf = st.ReverseKeys4(d, vf);
st.Sort2(d, v0, v7);
st.Sort2(d, v8, vf);
st.Sort2(d, v1, v6);
st.Sort2(d, v9, ve);
st.Sort2(d, v2, v5);
st.Sort2(d, va, vd);
st.Sort2(d, v3, v4);
st.Sort2(d, vb, vc);
// Stage 3: reverse the upper two rows of each group of four, then
// compare-exchange mirrored rows within each group of four.
v2 = st.ReverseKeys4(d, v2);
v3 = st.ReverseKeys4(d, v3);
v6 = st.ReverseKeys4(d, v6);
v7 = st.ReverseKeys4(d, v7);
va = st.ReverseKeys4(d, va);
vb = st.ReverseKeys4(d, vb);
ve = st.ReverseKeys4(d, ve);
vf = st.ReverseKeys4(d, vf);
st.Sort2(d, v0, v3);
st.Sort2(d, v1, v2);
st.Sort2(d, v4, v7);
st.Sort2(d, v5, v6);
st.Sort2(d, v8, vb);
st.Sort2(d, v9, va);
st.Sort2(d, vc, vf);
st.Sort2(d, vd, ve);
// Stage 4: reverse every odd-numbered row, then compare-exchange adjacent
// rows.
v1 = st.ReverseKeys4(d, v1);
v3 = st.ReverseKeys4(d, v3);
v5 = st.ReverseKeys4(d, v5);
v7 = st.ReverseKeys4(d, v7);
v9 = st.ReverseKeys4(d, v9);
vb = st.ReverseKeys4(d, vb);
vd = st.ReverseKeys4(d, vd);
vf = st.ReverseKeys4(d, vf);
st.Sort2(d, v0, v1);
st.Sort2(d, v2, v3);
st.Sort2(d, v4, v5);
st.Sort2(d, v6, v7);
st.Sort2(d, v8, v9);
st.Sort2(d, va, vb);
st.Sort2(d, vc, vd);
st.Sort2(d, ve, vf);
// Final passes operate within each row: first SortPairsReverse4 ...
v0 = st.SortPairsReverse4(d, v0);
v1 = st.SortPairsReverse4(d, v1);
v2 = st.SortPairsReverse4(d, v2);
v3 = st.SortPairsReverse4(d, v3);
v4 = st.SortPairsReverse4(d, v4);
v5 = st.SortPairsReverse4(d, v5);
v6 = st.SortPairsReverse4(d, v6);
v7 = st.SortPairsReverse4(d, v7);
v8 = st.SortPairsReverse4(d, v8);
v9 = st.SortPairsReverse4(d, v9);
va = st.SortPairsReverse4(d, va);
vb = st.SortPairsReverse4(d, vb);
vc = st.SortPairsReverse4(d, vc);
vd = st.SortPairsReverse4(d, vd);
ve = st.SortPairsReverse4(d, ve);
vf = st.SortPairsReverse4(d, vf);
// ... then SortPairsDistance1 on every row.
v0 = st.SortPairsDistance1(d, v0);
v1 = st.SortPairsDistance1(d, v1);
v2 = st.SortPairsDistance1(d, v2);
v3 = st.SortPairsDistance1(d, v3);
v4 = st.SortPairsDistance1(d, v4);
v5 = st.SortPairsDistance1(d, v5);
v6 = st.SortPairsDistance1(d, v6);
v7 = st.SortPairsDistance1(d, v7);
v8 = st.SortPairsDistance1(d, v8);
v9 = st.SortPairsDistance1(d, v9);
va = st.SortPairsDistance1(d, va);
vb = st.SortPairsDistance1(d, vb);
vc = st.SortPairsDistance1(d, vc);
vd = st.SortPairsDistance1(d, vd);
ve = st.SortPairsDistance1(d, ve);
vf = st.SortPairsDistance1(d, vf);
}
// Merge step of the sorting network for groups of eight keys. SortingNetwork
// calls this after Merge4 when each row holds at least eight keys.
//
// `d` is the vector tag and `st` the traits object supplying ReverseKeys8,
// Sort2, SortPairsReverse8, SortPairsDistance2 and SortPairsDistance1.
// `v0`..`vf` are the sixteen row vectors, updated in place. Statement order is
// significant: each stage consumes the results of the previous one.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge8(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
V& ve, V& vf) {
// Stage 1: reverse the keys of the upper eight rows, then compare-exchange
// row i with row 15 - i.
v8 = st.ReverseKeys8(d, v8);
v9 = st.ReverseKeys8(d, v9);
va = st.ReverseKeys8(d, va);
vb = st.ReverseKeys8(d, vb);
vc = st.ReverseKeys8(d, vc);
vd = st.ReverseKeys8(d, vd);
ve = st.ReverseKeys8(d, ve);
vf = st.ReverseKeys8(d, vf);
st.Sort2(d, v0, vf);
st.Sort2(d, v1, ve);
st.Sort2(d, v2, vd);
st.Sort2(d, v3, vc);
st.Sort2(d, v4, vb);
st.Sort2(d, v5, va);
st.Sort2(d, v6, v9);
st.Sort2(d, v7, v8);
// Stage 2: reverse the upper four rows of each group of eight, then
// compare-exchange mirrored rows within each group of eight.
v4 = st.ReverseKeys8(d, v4);
vc = st.ReverseKeys8(d, vc);
v5 = st.ReverseKeys8(d, v5);
vd = st.ReverseKeys8(d, vd);
v6 = st.ReverseKeys8(d, v6);
ve = st.ReverseKeys8(d, ve);
v7 = st.ReverseKeys8(d, v7);
vf = st.ReverseKeys8(d, vf);
st.Sort2(d, v0, v7);
st.Sort2(d, v8, vf);
st.Sort2(d, v1, v6);
st.Sort2(d, v9, ve);
st.Sort2(d, v2, v5);
st.Sort2(d, va, vd);
st.Sort2(d, v3, v4);
st.Sort2(d, vb, vc);
// Stage 3: reverse the upper two rows of each group of four, then
// compare-exchange mirrored rows within each group of four.
v2 = st.ReverseKeys8(d, v2);
v3 = st.ReverseKeys8(d, v3);
v6 = st.ReverseKeys8(d, v6);
v7 = st.ReverseKeys8(d, v7);
va = st.ReverseKeys8(d, va);
vb = st.ReverseKeys8(d, vb);
ve = st.ReverseKeys8(d, ve);
vf = st.ReverseKeys8(d, vf);
st.Sort2(d, v0, v3);
st.Sort2(d, v1, v2);
st.Sort2(d, v4, v7);
st.Sort2(d, v5, v6);
st.Sort2(d, v8, vb);
st.Sort2(d, v9, va);
st.Sort2(d, vc, vf);
st.Sort2(d, vd, ve);
// Stage 4: reverse every odd-numbered row, then compare-exchange adjacent
// rows.
v1 = st.ReverseKeys8(d, v1);
v3 = st.ReverseKeys8(d, v3);
v5 = st.ReverseKeys8(d, v5);
v7 = st.ReverseKeys8(d, v7);
v9 = st.ReverseKeys8(d, v9);
vb = st.ReverseKeys8(d, vb);
vd = st.ReverseKeys8(d, vd);
vf = st.ReverseKeys8(d, vf);
st.Sort2(d, v0, v1);
st.Sort2(d, v2, v3);
st.Sort2(d, v4, v5);
st.Sort2(d, v6, v7);
st.Sort2(d, v8, v9);
st.Sort2(d, va, vb);
st.Sort2(d, vc, vd);
st.Sort2(d, ve, vf);
// Final passes operate within each row: first SortPairsReverse8 ...
v0 = st.SortPairsReverse8(d, v0);
v1 = st.SortPairsReverse8(d, v1);
v2 = st.SortPairsReverse8(d, v2);
v3 = st.SortPairsReverse8(d, v3);
v4 = st.SortPairsReverse8(d, v4);
v5 = st.SortPairsReverse8(d, v5);
v6 = st.SortPairsReverse8(d, v6);
v7 = st.SortPairsReverse8(d, v7);
v8 = st.SortPairsReverse8(d, v8);
v9 = st.SortPairsReverse8(d, v9);
va = st.SortPairsReverse8(d, va);
vb = st.SortPairsReverse8(d, vb);
vc = st.SortPairsReverse8(d, vc);
vd = st.SortPairsReverse8(d, vd);
ve = st.SortPairsReverse8(d, ve);
vf = st.SortPairsReverse8(d, vf);
// ... then SortPairsDistance2 ...
v0 = st.SortPairsDistance2(d, v0);
v1 = st.SortPairsDistance2(d, v1);
v2 = st.SortPairsDistance2(d, v2);
v3 = st.SortPairsDistance2(d, v3);
v4 = st.SortPairsDistance2(d, v4);
v5 = st.SortPairsDistance2(d, v5);
v6 = st.SortPairsDistance2(d, v6);
v7 = st.SortPairsDistance2(d, v7);
v8 = st.SortPairsDistance2(d, v8);
v9 = st.SortPairsDistance2(d, v9);
va = st.SortPairsDistance2(d, va);
vb = st.SortPairsDistance2(d, vb);
vc = st.SortPairsDistance2(d, vc);
vd = st.SortPairsDistance2(d, vd);
ve = st.SortPairsDistance2(d, ve);
vf = st.SortPairsDistance2(d, vf);
// ... and finally SortPairsDistance1 on every row.
v0 = st.SortPairsDistance1(d, v0);
v1 = st.SortPairsDistance1(d, v1);
v2 = st.SortPairsDistance1(d, v2);
v3 = st.SortPairsDistance1(d, v3);
v4 = st.SortPairsDistance1(d, v4);
v5 = st.SortPairsDistance1(d, v5);
v6 = st.SortPairsDistance1(d, v6);
v7 = st.SortPairsDistance1(d, v7);
v8 = st.SortPairsDistance1(d, v8);
v9 = st.SortPairsDistance1(d, v9);
va = st.SortPairsDistance1(d, va);
vb = st.SortPairsDistance1(d, vb);
vc = st.SortPairsDistance1(d, vc);
vd = st.SortPairsDistance1(d, vd);
ve = st.SortPairsDistance1(d, ve);
vf = st.SortPairsDistance1(d, vf);
}
// Unused on MSVC, see below
#if !HWY_COMPILER_MSVC
// Merge step of the sorting network for groups of sixteen keys. SortingNetwork
// calls this after Merge8 when each row holds at least sixteen keys, and only
// in non-MSVC, non-debug builds.
//
// `d` is the vector tag and `st` the traits object supplying ReverseKeys16,
// Sort2, SortPairsReverse16 and SortPairsDistance4/2/1. `v0`..`vf` are the
// sixteen row vectors, updated in place. Statement order is significant: each
// stage consumes the results of the previous one.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc,
V& vd, V& ve, V& vf) {
// Stage 1: reverse the keys of the upper eight rows, then compare-exchange
// row i with row 15 - i.
v8 = st.ReverseKeys16(d, v8);
v9 = st.ReverseKeys16(d, v9);
va = st.ReverseKeys16(d, va);
vb = st.ReverseKeys16(d, vb);
vc = st.ReverseKeys16(d, vc);
vd = st.ReverseKeys16(d, vd);
ve = st.ReverseKeys16(d, ve);
vf = st.ReverseKeys16(d, vf);
st.Sort2(d, v0, vf);
st.Sort2(d, v1, ve);
st.Sort2(d, v2, vd);
st.Sort2(d, v3, vc);
st.Sort2(d, v4, vb);
st.Sort2(d, v5, va);
st.Sort2(d, v6, v9);
st.Sort2(d, v7, v8);
// Stage 2: reverse the upper four rows of each group of eight, then
// compare-exchange mirrored rows within each group of eight.
v4 = st.ReverseKeys16(d, v4);
vc = st.ReverseKeys16(d, vc);
v5 = st.ReverseKeys16(d, v5);
vd = st.ReverseKeys16(d, vd);
v6 = st.ReverseKeys16(d, v6);
ve = st.ReverseKeys16(d, ve);
v7 = st.ReverseKeys16(d, v7);
vf = st.ReverseKeys16(d, vf);
st.Sort2(d, v0, v7);
st.Sort2(d, v8, vf);
st.Sort2(d, v1, v6);
st.Sort2(d, v9, ve);
st.Sort2(d, v2, v5);
st.Sort2(d, va, vd);
st.Sort2(d, v3, v4);
st.Sort2(d, vb, vc);
// Stage 3: reverse the upper two rows of each group of four, then
// compare-exchange mirrored rows within each group of four.
v2 = st.ReverseKeys16(d, v2);
v3 = st.ReverseKeys16(d, v3);
v6 = st.ReverseKeys16(d, v6);
v7 = st.ReverseKeys16(d, v7);
va = st.ReverseKeys16(d, va);
vb = st.ReverseKeys16(d, vb);
ve = st.ReverseKeys16(d, ve);
vf = st.ReverseKeys16(d, vf);
st.Sort2(d, v0, v3);
st.Sort2(d, v1, v2);
st.Sort2(d, v4, v7);
st.Sort2(d, v5, v6);
st.Sort2(d, v8, vb);
st.Sort2(d, v9, va);
st.Sort2(d, vc, vf);
st.Sort2(d, vd, ve);
// Stage 4: reverse every odd-numbered row, then compare-exchange adjacent
// rows.
v1 = st.ReverseKeys16(d, v1);
v3 = st.ReverseKeys16(d, v3);
v5 = st.ReverseKeys16(d, v5);
v7 = st.ReverseKeys16(d, v7);
v9 = st.ReverseKeys16(d, v9);
vb = st.ReverseKeys16(d, vb);
vd = st.ReverseKeys16(d, vd);
vf = st.ReverseKeys16(d, vf);
st.Sort2(d, v0, v1);
st.Sort2(d, v2, v3);
st.Sort2(d, v4, v5);
st.Sort2(d, v6, v7);
st.Sort2(d, v8, v9);
st.Sort2(d, va, vb);
st.Sort2(d, vc, vd);
st.Sort2(d, ve, vf);
// Final passes operate within each row: first SortPairsReverse16 ...
v0 = st.SortPairsReverse16(d, v0);
v1 = st.SortPairsReverse16(d, v1);
v2 = st.SortPairsReverse16(d, v2);
v3 = st.SortPairsReverse16(d, v3);
v4 = st.SortPairsReverse16(d, v4);
v5 = st.SortPairsReverse16(d, v5);
v6 = st.SortPairsReverse16(d, v6);
v7 = st.SortPairsReverse16(d, v7);
v8 = st.SortPairsReverse16(d, v8);
v9 = st.SortPairsReverse16(d, v9);
va = st.SortPairsReverse16(d, va);
vb = st.SortPairsReverse16(d, vb);
vc = st.SortPairsReverse16(d, vc);
vd = st.SortPairsReverse16(d, vd);
ve = st.SortPairsReverse16(d, ve);
vf = st.SortPairsReverse16(d, vf);
// ... then SortPairsDistance4 (swaps lane 0 with 4, 1 with 5, etc.) ...
v0 = st.SortPairsDistance4(d, v0);
v1 = st.SortPairsDistance4(d, v1);
v2 = st.SortPairsDistance4(d, v2);
v3 = st.SortPairsDistance4(d, v3);
v4 = st.SortPairsDistance4(d, v4);
v5 = st.SortPairsDistance4(d, v5);
v6 = st.SortPairsDistance4(d, v6);
v7 = st.SortPairsDistance4(d, v7);
v8 = st.SortPairsDistance4(d, v8);
v9 = st.SortPairsDistance4(d, v9);
va = st.SortPairsDistance4(d, va);
vb = st.SortPairsDistance4(d, vb);
vc = st.SortPairsDistance4(d, vc);
vd = st.SortPairsDistance4(d, vd);
ve = st.SortPairsDistance4(d, ve);
vf = st.SortPairsDistance4(d, vf);
// ... then SortPairsDistance2 ...
v0 = st.SortPairsDistance2(d, v0);
v1 = st.SortPairsDistance2(d, v1);
v2 = st.SortPairsDistance2(d, v2);
v3 = st.SortPairsDistance2(d, v3);
v4 = st.SortPairsDistance2(d, v4);
v5 = st.SortPairsDistance2(d, v5);
v6 = st.SortPairsDistance2(d, v6);
v7 = st.SortPairsDistance2(d, v7);
v8 = st.SortPairsDistance2(d, v8);
v9 = st.SortPairsDistance2(d, v9);
va = st.SortPairsDistance2(d, va);
vb = st.SortPairsDistance2(d, vb);
vc = st.SortPairsDistance2(d, vc);
vd = st.SortPairsDistance2(d, vd);
ve = st.SortPairsDistance2(d, ve);
vf = st.SortPairsDistance2(d, vf);
// ... and finally SortPairsDistance1 on every row.
v0 = st.SortPairsDistance1(d, v0);
v1 = st.SortPairsDistance1(d, v1);
v2 = st.SortPairsDistance1(d, v2);
v3 = st.SortPairsDistance1(d, v3);
v4 = st.SortPairsDistance1(d, v4);
v5 = st.SortPairsDistance1(d, v5);
v6 = st.SortPairsDistance1(d, v6);
v7 = st.SortPairsDistance1(d, v7);
v8 = st.SortPairsDistance1(d, v8);
v9 = st.SortPairsDistance1(d, v9);
va = st.SortPairsDistance1(d, va);
vb = st.SortPairsDistance1(d, vb);
vc = st.SortPairsDistance1(d, vc);
vd = st.SortPairsDistance1(d, vd);
ve = st.SortPairsDistance1(d, ve);
vf = st.SortPairsDistance1(d, vf);
}
#endif // !HWY_COMPILER_MSVC
// Reshapes `buf` into a matrix, sorts columns independently, and then merges
// into a sorted 1D array without transposing.
//
// `st` is SharedTraits<Traits*<Order*>>. This abstraction layer bridges
// differences in sort order and single-lane vs 128-bit keys.
// `buf` ensures full vectors are aligned, and enables loads/stores without
// bounds checks.
//
// NOINLINE because this is large and called twice from vqsort-inl.h.
//
// References:
// https://drops.dagstuhl.de/opus/volltexte/2021/13775/pdf/LIPIcs-SEA-2021-3.pdf
// https://github.com/simd-sorting/fast-and-robust/blob/master/avx2_sort_demo/avx2sort.h
// "Entwurf und Implementierung vektorisierter Sortieralgorithmen" (M. Blacher)
// Parameters:
//   st:   traits object; its LanesPerKey() converts `cols` (lanes) to keys.
//   buf:  holds Constants::kMaxRows (16) rows spaced `cols` lanes apart. Each
//         row is loaded and stored as one vector of tag `d`, so `buf` must be
//         readable and writable for a whole vector starting at every row.
//   cols: lanes per row; must not exceed Constants::kMaxCols (debug-checked).
// The result is written back to `buf` in place.
template <class Traits, typename T>
HWY_NOINLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
const CappedTag<T, Constants::kMaxCols> d;
using V = decltype(Zero(d));
HWY_DASSERT(cols <= Constants::kMaxCols);
// The network width depends on the number of keys, not lanes.
constexpr size_t kLanesPerKey = st.LanesPerKey();
const size_t keys = cols / kLanesPerKey;
constexpr size_t kMaxKeys = MaxLanes(d) / kLanesPerKey;
// These are aligned iff cols == Lanes(d). We prefer unaligned/non-constexpr
// offsets to duplicating this code for every value of cols.
static_assert(Constants::kMaxRows == 16, "Update loads/stores/args");
// Load the sixteen rows; row r starts at buf + r * cols.
V v0 = LoadU(d, buf + 0x0 * cols);
V v1 = LoadU(d, buf + 0x1 * cols);
V v2 = LoadU(d, buf + 0x2 * cols);
V v3 = LoadU(d, buf + 0x3 * cols);
V v4 = LoadU(d, buf + 0x4 * cols);
V v5 = LoadU(d, buf + 0x5 * cols);
V v6 = LoadU(d, buf + 0x6 * cols);
V v7 = LoadU(d, buf + 0x7 * cols);
V v8 = LoadU(d, buf + 0x8 * cols);
V v9 = LoadU(d, buf + 0x9 * cols);
V va = LoadU(d, buf + 0xa * cols);
V vb = LoadU(d, buf + 0xb * cols);
V vc = LoadU(d, buf + 0xc * cols);
V vd = LoadU(d, buf + 0xd * cols);
V ve = LoadU(d, buf + 0xe * cols);
V vf = LoadU(d, buf + 0xf * cols);
// Sort16 is defined elsewhere in this file; presumably it performs the
// independent per-column sort described in the function comment.
Sort16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, vf);
// Merge progressively wider groups of keys. Each level runs only if the
// previous one did, and only while the row actually holds that many keys.
// Checking MaxLanes avoids generating HWY_ASSERT code for the unreachable
// code paths: if MaxLanes < 2, then keys <= cols < 2.
if (HWY_LIKELY(keys >= 2 && kMaxKeys >= 2)) {
Merge2(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
vf);
if (HWY_LIKELY(keys >= 4 && kMaxKeys >= 4)) {
Merge4(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
vf);
if (HWY_LIKELY(keys >= 8 && kMaxKeys >= 8)) {
Merge8(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
ve, vf);
// Avoids build timeout. Must match #if condition in kMaxCols.
#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
if (HWY_LIKELY(keys >= 16 && kMaxKeys >= 16)) {
Merge16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
ve, vf);
// Merge16 is the widest merge implemented here.
static_assert(Constants::kMaxCols <= 16, "Add more branches");
}
#endif
}
}
}
// Store the rows back to the locations they were loaded from.
StoreU(v0, d, buf + 0x0 * cols);
StoreU(v1, d, buf + 0x1 * cols);
StoreU(v2, d, buf + 0x2 * cols);
StoreU(v3, d, buf + 0x3 * cols);
StoreU(v4, d, buf + 0x4 * cols);
StoreU(v5, d, buf + 0x5 * cols);
StoreU(v6, d, buf + 0x6 * cols);
StoreU(v7, d, buf + 0x7 * cols);
StoreU(v8, d, buf + 0x8 * cols);
StoreU(v9, d, buf + 0x9 * cols);
StoreU(va, d, buf + 0xa * cols);
StoreU(vb, d, buf + 0xb * cols);
StoreU(vc, d, buf + 0xc * cols);
StoreU(vd, d, buf + 0xd * cols);
StoreU(ve, d, buf + 0xe * cols);
StoreU(vf, d, buf + 0xf * cols);
}
#else
// Fallback used when VQSORT_ENABLED is false: exposes Base unchanged and adds
// no members of its own.
template <class Base>
struct SharedTraits : public Base {};
#endif // VQSORT_ENABLED
} // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
@@ -0,0 +1,527 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
#endif
#include <string>
#include "hwy/contrib/sort/shared-inl.h" // SortConstants
#include "hwy/contrib/sort/vqsort.h" // SortDescending
#include "hwy/highway.h"
#include "hwy/print.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {
#if VQSORT_ENABLED || HWY_IDE
// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
// along with an abstraction layer for single-lane vs. lane-pair, which is
// independent of the order.
template <typename T>
struct KeyLane {
  // Keys handled by this class occupy exactly one lane of type T.
  static constexpr bool Is128() { return false; }
  constexpr size_t LanesPerKey() const { return 1; }

  // What type bench_sort should allocate for generating inputs.
  using LaneType = T;
  // What type to pass to Sorter::operator().
  using KeyType = T;

  // Returns the name that hwy::detail::TypeName writes for KeyType.
  std::string KeyString() const {
    char string100[100];
    hwy::detail::TypeName(hwy::detail::MakeTypeInfo<KeyType>(), 1, string100);
    return string100;
  }

  // For HeapSort: exchanges the two keys pointed to by `a` and `b`.
  HWY_INLINE void Swap(T* a, T* b) const {
    const T temp = *a;
    *a = *b;
    *b = temp;
  }

  // Forwards to CompressNot(keys, mask).
  template <class V, class M>
  HWY_INLINE V CompressKeys(V keys, M mask) const {
    return CompressNot(keys, mask);
  }

  // Broadcasts one key into a vector
  template <class D>
  HWY_INLINE Vec<D> SetKey(D d, const T* key) const {
    return Set(d, *key);
  }

  // Per-lane equality mask.
  template <class D>
  HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
    return Eq(a, b);
  }

  // Per-lane inequality mask.
  template <class D>
  HWY_INLINE Mask<D> NotEqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
    return Ne(a, b);
  }

  // Scalar equality of two keys. Const-qualified like every other member so
  // that it can also be called through a const traits object; it reads no
  // member state.
  HWY_INLINE bool Equal1(const T* a, const T* b) const { return *a == *b; }

  // Reverses all keys in the vector.
  template <class D>
  HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
    return Reverse(d, v);
  }

  // ReverseKeysN forwards to the matching ReverseN op, which reverses within
  // contiguous groups of N keys.
  template <class D>
  HWY_INLINE Vec<D> ReverseKeys2(D d, Vec<D> v) const {
    return Reverse2(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys4(D d, Vec<D> v) const {
    return Reverse4(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys8(D d, Vec<D> v) const {
    return Reverse8(d, v);
  }

  // With at most 16 columns, a group of 16 keys is the whole vector, so a
  // full reversal suffices.
  template <class D>
  HWY_INLINE Vec<D> ReverseKeys16(D d, Vec<D> v) const {
    static_assert(SortConstants::kMaxCols <= 16, "Assumes u32x16 = 512 bit");
    return ReverseKeys(d, v);
  }

  // Forwards to OddEven(odd, even).
  template <class V>
  HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
    return OddEven(odd, even);
  }

  // SwapAdjacentPairs has one overload per lane size; each selects an op
  // that operates on units twice as wide as a lane.
  // 2-byte lanes: reinterpret as u32 and apply Shuffle2301.
  template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D d, const Vec<D> v) const {
    const Repartition<uint32_t, D> du32;
    return BitCast(d, Shuffle2301(BitCast(du32, v)));
  }

  // 4-byte lanes: Shuffle1032.
  template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
    return Shuffle1032(v);
  }

  // 8-byte lanes: a pair of lanes is one 128-bit block.
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
    return SwapAdjacentBlocks(v);
  }

  // Lanes narrower than 8 bytes: reinterpret as lanes of twice the width and
  // reuse SwapAdjacentPairs on those.
  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
#if HWY_HAVE_FLOAT64  // in case D is float32
    const RepartitionToWide<D> dw;
#else
    const RepartitionToWide<RebindToUnsigned<D> > dw;
#endif
    return BitCast(d, SwapAdjacentPairs(dw, BitCast(dw, v)));
  }

  // 8-byte lanes: exchange the two vector halves.
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
    // Assumes max vector size = 512
    return ConcatLowerUpper(d, v, v);
  }

  // Lanes narrower than 8 bytes: OddEven on lanes of twice the width.
  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
                                 const Vec<D> even) const {
#if HWY_HAVE_FLOAT64  // in case D is float32
    const RepartitionToWide<D> dw;
#else
    const RepartitionToWide<RebindToUnsigned<D> > dw;
#endif
    return BitCast(d, OddEven(BitCast(dw, odd), BitCast(dw, even)));
  }

  // 8-byte lanes: a pair of lanes is one 128-bit block.
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenPairs(D /* tag */, Vec<D> odd, Vec<D> even) const {
    return OddEvenBlocks(odd, even);
  }

  // Lanes narrower than 8 bytes: OddEvenPairs on lanes of twice the width.
  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
#if HWY_HAVE_FLOAT64  // in case D is float32
    const RepartitionToWide<D> dw;
#else
    const RepartitionToWide<RebindToUnsigned<D> > dw;
#endif
    return BitCast(d, OddEvenPairs(dw, BitCast(dw, odd), BitCast(dw, even)));
  }

  // 8-byte lanes: upper half from `odd`, lower half from `even`.
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
    return ConcatUpperLower(d, odd, even);
  }
};
// Anything order-related depends on the key traits *and* the order (see
// FirstOfLanes). We cannot implement just one Compare function because Lt128
// only compiles if the lane type is u64. Thus we need either overloaded
// functions with a tag type, class specializations, or separate classes.
// We avoid overloaded functions because we want all functions to be callable
// from a SortTraits without per-function wrappers. Specializing would work, but
// we are anyway going to specialize at a higher level.
template <typename T>
struct OrderAscending : public KeyLane<T> {
  using Order = SortAscending;

  // Scalar comparison: true if *a sorts before *b. Const-qualified so it can
  // be called through a const traits object, like the vector Compare below.
  HWY_INLINE bool Compare1(const T* a, const T* b) const { return *a < *b; }

  // Per-lane mask: true where a sorts before b.
  template <class D>
  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
    return Lt(a, b);
  }

  // Two halves of Sort2, used in ScanMinMax.
  template <class D>
  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Min(a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Max(a, b);
  }

  // Reductions across the lanes of v; the buffer argument is unused here.
  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 T* HWY_RESTRICT /* buf */) const {
    return MinOfLanes(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                T* HWY_RESTRICT /* buf */) const {
    return MaxOfLanes(d, v);
  }

  // Vectors filled with the values that come first and last in this order.
  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::LowestValue<T>());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::HighestValue<T>());
  }

  // Subtracts hwy::Epsilon<T>() from every lane, i.e. steps towards
  // FirstValue.
  template <class D>
  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
    return Sub(v, Set(d, hwy::Epsilon<T>()));
  }
};
template <typename T>
struct OrderDescending : public KeyLane<T> {
  using Order = SortDescending;

  // Scalar comparison: true if *a sorts before *b, i.e. *a is the larger.
  // Const-qualified so it can be called through a const traits object, like
  // the vector Compare below.
  HWY_INLINE bool Compare1(const T* a, const T* b) const { return *b < *a; }

  // Per-lane mask: true where a sorts before b (a is larger).
  template <class D>
  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
    return Lt(b, a);
  }

  // Two halves of Sort2: per lane, the key that comes first (the larger) and
  // the one that comes last (the smaller).
  template <class D>
  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Max(a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Min(a, b);
  }

  // Reductions across the lanes of v; the buffer argument is unused here.
  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 T* HWY_RESTRICT /* buf */) const {
    return MaxOfLanes(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                T* HWY_RESTRICT /* buf */) const {
    return MinOfLanes(d, v);
  }

  // Vectors filled with the values that come first and last in this order.
  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::HighestValue<T>());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::LowestValue<T>());
  }

  // Adds hwy::Epsilon<T>() to every lane, i.e. steps towards FirstValue.
  template <class D>
  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
    return Add(v, Set(d, hwy::Epsilon<T>()));
  }
};
struct OrderAscendingKV64 : public KeyLane<uint64_t> {
  using Order = SortAscending;

  // Scalar comparison on the upper 32 bits of each 64-bit lane only; the
  // lower 32 bits (presumably the value of a key-value pair) are ignored.
  // Const-qualified so it can be called through a const traits object, like
  // the vector Compare below.
  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const {
    return (*a >> 32) < (*b >> 32);
  }

  // Per-lane mask comparing only the upper 32 bits of each lane.
  template <class D>
  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
    return Lt(ShiftRight<32>(a), ShiftRight<32>(b));
  }

  // Not required to be stable (preserving the order of equivalent keys), so
  // we can include the value in the comparison.
  template <class D>
  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Min(a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Max(a, b);
  }

  // Reductions across the full 64-bit lanes; the buffer argument is unused.
  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 uint64_t* HWY_RESTRICT /* buf */) const {
    return MinOfLanes(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                uint64_t* HWY_RESTRICT /* buf */) const {
    return MaxOfLanes(d, v);
  }

  // Same as for regular lanes.
  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::LowestValue<TFromD<D> >());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::HighestValue<TFromD<D> >());
  }

  // Subtracts 1 from every 64-bit lane, i.e. steps towards FirstValue.
  template <class D>
  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
    return Sub(v, Set(d, 1));
  }
};
struct OrderDescendingKV64 : public KeyLane<uint64_t> {
  using Order = SortDescending;

  // Scalar comparison on the upper 32 bits of each 64-bit lane only; true if
  // *a has the larger upper half. The lower 32 bits (presumably the value of
  // a key-value pair) are ignored. Const-qualified so it can be called
  // through a const traits object, like the vector Compare below.
  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const {
    return (*b >> 32) < (*a >> 32);
  }

  // Per-lane mask comparing only the upper 32 bits of each lane.
  template <class D>
  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
    return Lt(ShiftRight<32>(b), ShiftRight<32>(a));
  }

  // Not required to be stable (preserving the order of equivalent keys), so
  // we can include the value in the comparison.
  template <class D>
  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Max(a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Min(a, b);
  }

  // Reductions across the full 64-bit lanes; the buffer argument is unused.
  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 uint64_t* HWY_RESTRICT /* buf */) const {
    return MaxOfLanes(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                uint64_t* HWY_RESTRICT /* buf */) const {
    return MinOfLanes(d, v);
  }

  // Vectors filled with the values that come first and last in this order.
  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::HighestValue<TFromD<D> >());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::LowestValue<TFromD<D> >());
  }

  // Adds 1 to every 64-bit lane, i.e. steps towards FirstValue.
  template <class D>
  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
    return Add(v, Set(d, 1));
  }
};
// Shared code that depends on Order.
template <class Base>
struct TraitsLane : public Base {
  // One compare-exchange "node" of a sorting network. Afterwards, every lane
  // of `a` holds whichever of the two inputs comes first according to Base,
  // and the same lane of `b` holds the other one. Min/Max are cheaper than
  // compare + blend, at least for integers.
  template <class D>
  HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
    const Base* self = static_cast<const Base*>(this);
    const Vec<D> a_in = a;  // `a` is overwritten before `b` is computed.
    // Targets older than AVX3 lack native 64-bit Min/Max (they expand to four
    // instructions); one compare plus two IfThenElse is cheaper there.
#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3
    if (sizeof(TFromD<D>) == 8) {
      const Mask<D> a_is_first = self->Compare(d, a, b);
      a = IfThenElse(a_is_first, a, b);
      b = IfThenElse(a_is_first, b, a_in);
      return;
    }
#endif
    a = self->First(d, a, b);
    b = self->Last(d, a_in, b);
  }

  // Compare-exchanges each even-numbered lane with its odd-numbered
  // neighbor (64-bit lanes).
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
    const Base* self = static_cast<const Base*>(this);
    Vec<D> neighbors = self->ReverseKeys2(d, v);
    // Extends the Sort2 special case: Sort2 + OddEvenKeys would take four
    // instructions; merging the two blends saves one.
#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3
    const Mask<D> is_before = self->Compare(d, v, neighbors);
    const Vec<D> cmp = VecFromMask(d, is_before);
    return IfVecThenElse(DupOdd(cmp), neighbors, v);
#else
    Sort2(d, v, neighbors);
    return self->OddEvenKeys(neighbors, v);
#endif
  }

  // Same operation for all other lane sizes, which always go through Sort2.
  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
    const Base* self = static_cast<const Base*>(this);
    Vec<D> neighbors = self->ReverseKeys2(d, v);
    Sort2(d, v, neighbors);
    return self->OddEvenKeys(neighbors, v);
  }

  // Compare-exchanges `v` with a copy in which every contiguous group of 4
  // keys has been reversed.
  template <class D>
  HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const {
    const Base* self = static_cast<const Base*>(this);
    Vec<D> reversed = self->ReverseKeys4(d, v);
    Sort2(d, v, reversed);
    return self->OddEvenPairs(d, reversed, v);
  }

  // Compare-exchanges lane 0 with lane 4, lane 1 with lane 5, and so on.
  template <class D>
  HWY_INLINE Vec<D> SortPairsDistance4(D d, Vec<D> v) const {
    const Base* self = static_cast<const Base*>(this);
    Vec<D> quads_swapped = self->SwapAdjacentQuads(d, v);
    // Merge16 is the only caller, and AVX2 has just 4 u64 lanes, so this
    // never runs for 64-bit AVX2; the Sort2-bypassing shortcut is therefore
    // not needed here.
    Sort2(d, v, quads_swapped);
    return self->OddEvenQuads(d, quads_swapped, v);
  }
};
#else
// Base class shared between OrderAscending, OrderDescending. Reduced variant
// for builds where VQSORT_ENABLED is false.
template <typename T>
struct KeyLane {
  // Static, matching the VQSORT_ENABLED definition of KeyLane and KeyAny128,
  // so that both `st.Is128()` and `Traits::Is128()` compile in either
  // configuration.
  static constexpr bool Is128() { return false; }
  constexpr size_t LanesPerKey() const { return 1; }

  // Lane and key types coincide for single-lane keys.
  using LaneType = T;
  using KeyType = T;

  // Returns the name that hwy::detail::TypeName writes for KeyType.
  std::string KeyString() const {
    char string100[100];
    hwy::detail::TypeName(hwy::detail::MakeTypeInfo<KeyType>(), 1, string100);
    return string100;
  }
};
// Ascending order, reduced variant for builds where VQSORT_ENABLED is false.
template <typename T>
struct OrderAscending : public KeyLane<T> {
  using Order = SortAscending;

  // Scalar comparison: true if *a sorts before *b. Const-qualified because it
  // reads no member state.
  HWY_INLINE bool Compare1(const T* a, const T* b) const { return *a < *b; }

  // Per-lane mask: true where a sorts before b. Const-qualified to match the
  // VQSORT_ENABLED definition of this struct.
  template <class D>
  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
    return Lt(a, b);
  }
};
// Descending order, reduced variant for builds where VQSORT_ENABLED is false.
template <typename T>
struct OrderDescending : public KeyLane<T> {
  using Order = SortDescending;

  // Scalar comparison: true if *a sorts before *b, i.e. *a is the larger.
  // Const-qualified because it reads no member state.
  HWY_INLINE bool Compare1(const T* a, const T* b) const { return *b < *a; }

  // Per-lane mask: true where a sorts before b (a is larger). Const-qualified
  // to match the VQSORT_ENABLED definition of this struct.
  template <class D>
  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
    return Lt(b, a);
  }
};
// Reduced variant for builds where VQSORT_ENABLED is false: adds only the
// scalar swap and key broadcast on top of Order.
template <class Order>
struct TraitsLane : public Order {
  // For HeapSort: exchanges the two keys pointed to by `a` and `b`.
  template <typename T>  // MSVC doesn't find typename Order::LaneType.
  HWY_INLINE void Swap(T* a, T* b) const {
    const T old_a = *a;
    *a = *b;
    *b = old_a;
  }

  // Broadcasts the key pointed to by `key` into every lane.
  template <class D>
  HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
    const TFromD<D> value = *key;
    return Set(d, value);
  }
};
#endif // VQSORT_ENABLED
} // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
@@ -0,0 +1,492 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
#endif
#include <string>
#include "hwy/contrib/sort/shared-inl.h"
#include "hwy/contrib/sort/vqsort.h" // SortDescending
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {
#if VQSORT_ENABLED || HWY_IDE
// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
// along with an abstraction layer for single-lane vs. lane-pair, which is
// independent of the order.
struct KeyAny128 {
  // Every key spans two adjacent uint64_t lanes.
  static constexpr bool Is128() { return true; }
  constexpr size_t LanesPerKey() const { return 2; }

  // What type bench_sort should allocate for generating inputs.
  using LaneType = uint64_t;
  // KeyType and KeyString are defined by derived classes.

  // Exchanges the two-lane keys starting at `a` and `b`. Both keys are read
  // before either is written.
  HWY_INLINE void Swap(LaneType* a, LaneType* b) const {
    const FixedTag<LaneType, 2> d_key;
    const auto key_a = LoadU(d_key, a);
    const auto key_b = LoadU(d_key, b);
    StoreU(key_b, d_key, a);
    StoreU(key_a, d_key, b);
  }

  // Forwards to CompressBlocksNot(keys, mask).
  template <class V, class M>
  HWY_INLINE V CompressKeys(V keys, M mask) const {
    return CompressBlocksNot(keys, mask);
  }

  // Loads the 128-bit key at `key` via LoadDup128.
  template <class D>
  HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
    return LoadDup128(d, key);
  }

  // Reverses the order of the keys, i.e. of the 128-bit blocks.
  template <class D>
  HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
    return ReverseBlocks(d, v);
  }

  // Exchanges each key with its neighbor.
  template <class D>
  HWY_INLINE Vec<D> ReverseKeys2(D /* tag */, const Vec<D> v) const {
    return SwapAdjacentBlocks(v);
  }

  // Vectors wider than 512 bits are unsupported, so this is only called when
  // the vector holds exactly 4 keys and a full reversal is equivalent.
  template <class D>
  HWY_INLINE Vec<D> ReverseKeys4(D d, const Vec<D> v) const {
    HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
    return ReverseKeys(d, v);
  }

  // Likewise restricted to 4-key vectors: upper half from `odd`, lower half
  // from `even`.
  template <class D>
  HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
                                 const Vec<D> even) const {
    HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
    return ConcatUpperLower(d, odd, even);
  }

  // Forwards to OddEvenBlocks(odd, even).
  template <class V>
  HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
    return OddEvenBlocks(odd, even);
  }

  // The remaining members exist only so that the wider networks compile;
  // calling any of them triggers HWY_ASSERT(0).
  template <class D>
  HWY_INLINE Vec<D> ReverseKeys8(D, Vec<D>) const {
    HWY_ASSERT(0);  // not supported: would require 1024-bit vectors
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys16(D, Vec<D>) const {
    HWY_ASSERT(0);  // not supported: would require 2048-bit vectors
  }

  // This is only called for 8/16 col networks (not supported).
  template <class D>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D, Vec<D>) const {
    HWY_ASSERT(0);
  }

  // This is only called for 16 col networks (not supported).
  template <class D>
  HWY_INLINE Vec<D> SwapAdjacentQuads(D, Vec<D>) const {
    HWY_ASSERT(0);
  }

  // This is only called for 8 col networks (not supported).
  template <class D>
  HWY_INLINE Vec<D> OddEvenQuads(D, Vec<D>, Vec<D>) const {
    HWY_ASSERT(0);
  }
};
// Base class shared between OrderAscending128, OrderDescending128.
// Both u64 lanes of a key take part in equality comparisons.
struct Key128 : public KeyAny128 {
  // What type to pass to Sorter::operator().
  using KeyType = hwy::uint128_t;

  // Short name identifying this key type.
  std::string KeyString() const { return "U128"; }

  // Vector equality of whole keys, via Eq128.
  template <class D>
  HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const {
    return Eq128(d, a, b);
  }

  // Vector inequality of whole keys, via Ne128.
  template <class D>
  HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const {
    return Ne128(d, a, b);
  }

  // Scalar equality of the keys at `a` and `b`: both lanes must match.
  // const-qualified like every other member of this struct so that it can be
  // called through a const traits object or pointer.
  HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) const {
    return a[0] == b[0] && a[1] == b[1];
  }
};
// Anything order-related depends on the key traits *and* the order (see
// FirstOfLanes). We cannot implement just one Compare function because Lt128
// only compiles if the lane type is u64. Thus we need either overloaded
// functions with a tag type, class specializations, or separate classes.
// We avoid overloaded functions because we want all functions to be callable
// from a SortTraits without per-function wrappers. Specializing would work, but
// we are anyway going to specialize at a higher level.
struct OrderAscending128 : public Key128 {
  using Order = SortAscending;

  // Scalar "a sorts before b": lane 1 is compared first and lane 0 only
  // breaks ties. const-qualified like the other members of this struct so
  // that it can be called through a const traits object or pointer.
  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const {
    return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
  }

  // Vector comparison of whole keys, via Lt128.
  template <class D>
  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
    return Lt128(d, a, b);
  }

  // Used by CompareTop
  template <class V>
  HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
    return Lt(a, b);
  }

  // Per-key minimum, i.e. whichever key comes first in ascending order.
  template <class D>
  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
    return Min128(d, a, b);
  }

  // Per-key maximum, i.e. whichever key comes last in ascending order.
  template <class D>
  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
    return Max128(d, a, b);
  }

  // Same as for regular lanes because 128-bit lanes are u64.
  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::LowestValue<TFromD<D> >());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::HighestValue<TFromD<D> >());
  }

  // Returns v - 1 per key: subtracts one from the even (lower) lane and
  // borrows from the odd (upper) lane where the lower lane was zero.
  template <class D>
  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
    const Vec<D> k0 = Zero(d);
    const Vec<D> k1 = OddEven(k0, Set(d, 1));
    const Mask<D> borrow = Eq(v, k0);  // don't-care, lo == 0
    // lo == 0? 1 : 0, 0
    const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(borrow, k1));
    return Sub(Sub(v, k1), adjust);
  }
};
// Descending-order traits for 128-bit keys: every comparison has its
// operands swapped relative to the natural less-than.
struct OrderDescending128 : public Key128 {
  using Order = SortDescending;

  // Scalar "a sorts before b" for descending order: true if b < a, comparing
  // lane 1 first and lane 0 only to break ties. const-qualified like the
  // other members of this struct so that it can be called through a const
  // traits object or pointer.
  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const {
    return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
  }

  // Vector comparison of whole keys, via Lt128 with swapped operands.
  template <class D>
  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
    return Lt128(d, b, a);
  }

  // Used by CompareTop
  template <class V>
  HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
    return Lt(b, a);
  }

  // Per-key maximum, i.e. whichever key comes first in descending order.
  template <class D>
  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
    return Max128(d, a, b);
  }

  // Per-key minimum, i.e. whichever key comes last in descending order.
  template <class D>
  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
    return Min128(d, a, b);
  }

  // Same as for regular lanes because 128-bit lanes are u64.
  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::HighestValue<TFromD<D> >());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::LowestValue<TFromD<D> >());
  }

  // Returns v + 1 per key (the "previous" value in descending order): adds
  // one to the even (lower) lane and carries into the odd (upper) lane where
  // the lower lane wrapped around.
  template <class D>
  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
    const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
    const Vec<D> added = Add(v, k1);
    const Mask<D> overflowed = Lt(added, v);  // false, overflowed
    // overflowed? 1 : 0, 0
    const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(overflowed, k1));
    return Add(added, adjust);
  }
};
// Base class shared between OrderAscendingKV128, OrderDescendingKV128.
// Only lane 1 of each two-lane item takes part in equality comparisons.
struct KeyValue128 : public KeyAny128 {
  // What type to pass to Sorter::operator().
  using KeyType = K64V64;

  // Short name identifying this key type.
  std::string KeyString() const { return "KV128"; }

  // Vector equality of the upper lanes only, via Eq128Upper.
  template <class D>
  HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const {
    return Eq128Upper(d, a, b);
  }

  // Vector inequality of the upper lanes only, via Ne128Upper.
  template <class D>
  HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const {
    return Ne128Upper(d, a, b);
  }

  // Scalar equality of the items at `a` and `b`; only lane 1 is compared.
  // const-qualified like every other member of this struct so that it can be
  // called through a const traits object or pointer.
  HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) const {
    return a[1] == b[1];
  }
};
// Ascending-order traits for key/value items in which only lane 1 is
// compared.
struct OrderAscendingKV128 : public KeyValue128 {
  using Order = SortAscending;

  // Scalar "a sorts before b", looking only at lane 1. const-qualified like
  // the other members of this struct so that it can be called through a
  // const traits object or pointer.
  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const {
    return a[1] < b[1];
  }

  // Vector comparison of the upper lanes, via Lt128Upper.
  template <class D>
  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
    return Lt128Upper(d, a, b);
  }

  // Used by CompareTop
  template <class V>
  HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
    return Lt(a, b);
  }

  // Item that comes first in ascending order, via Min128Upper.
  template <class D>
  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
    return Min128Upper(d, a, b);
  }

  // Item that comes last in ascending order, via Max128Upper.
  template <class D>
  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
    return Max128Upper(d, a, b);
  }

  // Same as for regular lanes because 128-bit lanes are u64.
  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::LowestValue<TFromD<D> >());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::HighestValue<TFromD<D> >());
  }

  // Subtracts one from the odd (upper) lane of every item and leaves the
  // even lane unchanged.
  template <class D>
  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
    const Vec<D> k1 = OddEven(Set(d, 1), Zero(d));
    return Sub(v, k1);
  }
};
// Descending-order traits for key/value items in which only lane 1 is
// compared; every comparison has its operands swapped.
struct OrderDescendingKV128 : public KeyValue128 {
  using Order = SortDescending;

  // Scalar "a sorts before b" for descending order, looking only at lane 1.
  // const-qualified like the other members of this struct so that it can be
  // called through a const traits object or pointer.
  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const {
    return b[1] < a[1];
  }

  // Vector comparison of the upper lanes, via Lt128Upper with swapped
  // operands.
  template <class D>
  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
    return Lt128Upper(d, b, a);
  }

  // Used by CompareTop
  template <class V>
  HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
    return Lt(b, a);
  }

  // Item that comes first in descending order, via Max128Upper.
  template <class D>
  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
    return Max128Upper(d, a, b);
  }

  // Item that comes last in descending order, via Min128Upper.
  template <class D>
  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
    return Min128Upper(d, a, b);
  }

  // Same as for regular lanes because 128-bit lanes are u64.
  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::HighestValue<TFromD<D> >());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::LowestValue<TFromD<D> >());
  }

  // Adds one to the odd (upper) lane of every item and leaves the even lane
  // unchanged.
  template <class D>
  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
    const Vec<D> k1 = OddEven(Set(d, 1), Zero(d));
    return Add(v, k1);
  }
};
// Order-dependent helpers layered on top of a key/order class `Base`.
template <class Base>
class Traits128 : public Base {
  // Helpers that only exist for targets with >= 256 bit vectors.
#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SVE_256
  // Compares keys lane-wise; only the top u64 lane of each key in the result
  // is valid. That is sufficient when the next step replicates that lane.
  template <class D>
  HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const {
    const Base* self = static_cast<const Base*>(this);
    const Mask<D> lanes_equal = Eq(a, b);
    const Vec<D> lanes_before = VecFromMask(d, self->CompareLanes(a, b));
#if HWY_TARGET == HWY_SVE_256
    return IfThenElse(lanes_equal, DupEven(lanes_before), lanes_before);
#else
    const Vec<D> lower_before = ShiftLeftLanes<1>(lanes_before);
    return OrAnd(lanes_before, VecFromMask(d, lanes_equal), lower_before);
#endif
  }

  // We want to swap 2 u128, i.e. 4 u64 lanes, based on the 0 or FF..FF mask
  // in the most-significant of those lanes (the result of CompareTop), so
  // replicate it 4x. Only called for >= 256-bit vectors.
  template <class V>
  HWY_INLINE V ReplicateTop4x(V v) const {
#if HWY_TARGET == HWY_SVE_256
    return svdup_lane_u64(v, 3);
#elif HWY_TARGET <= HWY_AVX3
    return V{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
#else  // AVX2
    return V{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
#endif
  }
#endif  // HWY_TARGET

 public:
  // Reduces all keys of `v` with Base::First and returns the winner in every
  // key position. `buf` receives a copy of `v` and must hold Lanes(d) lanes.
  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 TFromD<D>* HWY_RESTRICT buf) const {
    const Base* self = static_cast<const Base*>(this);
    const size_t num_lanes = Lanes(d);
    const size_t key_lanes = self->LanesPerKey();
    Store(v, d, buf);
    Vec<D> first = self->SetKey(d, buf);  // result must be broadcasted
    for (size_t pos = key_lanes; pos < num_lanes; pos += key_lanes) {
      first = self->First(d, first, self->SetKey(d, buf + pos));
    }
    return first;
  }

  // Same as FirstOfLanes, but reduces with Base::Last.
  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                TFromD<D>* HWY_RESTRICT buf) const {
    const Base* self = static_cast<const Base*>(this);
    const size_t num_lanes = Lanes(d);
    const size_t key_lanes = self->LanesPerKey();
    Store(v, d, buf);
    Vec<D> last = self->SetKey(d, buf);  // result must be broadcasted
    for (size_t pos = key_lanes; pos < num_lanes; pos += key_lanes) {
      last = self->Last(d, last, self->SetKey(d, buf + pos));
    }
    return last;
  }

  // Orders the pair in place: where Compare(a, b) holds the keys stay put,
  // elsewhere the keys of `a` and `b` are exchanged.
  template <class D>
  HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
    const Base* self = static_cast<const Base*>(this);
    const auto in_order = self->Compare(d, a, b);
    const Vec<D> sorted_a = IfThenElse(in_order, a, b);
    const Vec<D> sorted_b = IfThenElse(in_order, b, a);
    a = sorted_a;
    b = sorted_b;
  }

  // Conditionally swaps even-numbered keys with their odd-numbered neighbor.
  template <class D>
  HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
    const Base* self = static_cast<const Base*>(this);
    Vec<D> neighbors = self->ReverseKeys2(d, v);
#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SVE_256
    const Vec<D> take_neighbor = ReplicateTop4x(CompareTop(d, v, neighbors));
    return IfVecThenElse(take_neighbor, neighbors, v);
#else
    Sort2(d, v, neighbors);
    return self->OddEvenKeys(neighbors, v);
#endif
  }

  // Swaps with the vector formed by reversing contiguous groups of 4 keys.
  template <class D>
  HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const {
    const Base* self = static_cast<const Base*>(this);
    Vec<D> reversed = self->ReverseKeys4(d, v);
    // Only specialize for AVX3 because this requires 512-bit vectors.
#if HWY_TARGET <= HWY_AVX3
    const Vec512<uint64_t> top_result = CompareTop(d, v, reversed);
    // Similar to ReplicateTop4x, we want to gang together 2 comparison
    // results (4 lanes). They are not contiguous, so use permute to
    // replicate 4x.
    alignas(64) uint64_t kIndices[8] = {7, 7, 5, 5, 5, 5, 7, 7};
    const Vec512<uint64_t> take_reversed =
        TableLookupLanes(top_result, SetTableIndices(d, kIndices));
    return IfVecThenElse(take_reversed, reversed, v);
#else
    Sort2(d, v, reversed);
    return self->OddEvenPairs(d, reversed, v);
#endif
  }

  // Conditionally swaps lane 0 with 4, 1 with 5 etc.
  template <class D>
  HWY_INLINE Vec<D> SortPairsDistance4(D, Vec<D>) const {
    // Only used by Merge16, which would require 2048 bit vectors
    // (unsupported).
    HWY_ASSERT(0);
  }
};
#endif // VQSORT_ENABLED
} // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,184 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#include <string.h> // memset
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/shared-inl.h"
// Architectures for which we know HWY_HAVE_SCALABLE == 0. This opts into an
// optimization that replaces dynamic allocation with stack storage.
#ifndef VQSORT_STACK
#if HWY_ARCH_X86 || HWY_ARCH_WASM
#define VQSORT_STACK 1
#else
#define VQSORT_STACK 0
#endif
#endif // VQSORT_STACK
#if !VQSORT_STACK
#include "hwy/aligned_allocator.h"
#endif
// Check if we have sys/random.h. First skip some systems on which the check
// itself (features.h) might be problematic.
#if defined(ANDROID) || defined(__ANDROID__) || HWY_ARCH_RVV
#define VQSORT_GETRANDOM 0
#endif
#if !defined(VQSORT_GETRANDOM) && HWY_OS_LINUX
#include <features.h>
// ---- which libc
#if defined(__UCLIBC__)
#define VQSORT_GETRANDOM 1 // added Mar 2015, before uclibc-ng 1.0
#elif defined(__GLIBC__) && defined(__GLIBC_PREREQ)
#if __GLIBC_PREREQ(2, 25)
#define VQSORT_GETRANDOM 1
#else
#define VQSORT_GETRANDOM 0
#endif
#else
// Assume MUSL, which has getrandom since 2018. There is no macro to test, see
// https://www.openwall.com/lists/musl/2013/03/29/13.
#define VQSORT_GETRANDOM 1
#endif // ---- which libc
#endif // linux
#if !defined(VQSORT_GETRANDOM)
#define VQSORT_GETRANDOM 0
#endif
// Seed source for SFC generator: 1=getrandom, 2=CryptGenRandom
// (not all Android support the getrandom wrapper)
#ifndef VQSORT_SECURE_SEED
#if VQSORT_GETRANDOM
#define VQSORT_SECURE_SEED 1
#elif defined(_WIN32) || defined(_WIN64)
#define VQSORT_SECURE_SEED 2
#else
#define VQSORT_SECURE_SEED 0
#endif
#endif // VQSORT_SECURE_SEED
#if !VQSORT_SECURE_RNG
#include <time.h>
#if VQSORT_SECURE_SEED == 1
#include <sys/random.h>
#elif VQSORT_SECURE_SEED == 2
#include <windows.h>
#pragma comment(lib, "advapi32.lib")
// Must come after windows.h.
#include <wincrypt.h>
#endif // VQSORT_SECURE_SEED
#endif // !VQSORT_SECURE_RNG
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
size_t VectorSize() { return Lanes(ScalableTag<uint8_t, 3>()); }
// Reports the value of HWY_HAVE_FLOAT64 for the target being compiled.
bool HaveFloat64() {
  const bool have_f64 = HWY_HAVE_FLOAT64;
  return have_f64;
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(VectorSize);
HWY_EXPORT(HaveFloat64);
} // namespace
// Prepares the scratch buffer used by the sort functions. With VQSORT_STACK
// no allocation happens; otherwise one zero-initialized buffer large enough
// for every supported lane type is allocated.
Sorter::Sorter() {
#if VQSORT_STACK
  ptr_ = nullptr;  // Sort will use stack storage instead
#else
  // Determine the largest buffer size required for any type by trying them
  // all. (The capping of N in BaseCaseNum means that smaller N but larger
  // sizeof_t may require a larger buffer.)
  const size_t vector_size = HWY_DYNAMIC_DISPATCH(VectorSize)();
  const size_t max_bytes =
      HWY_MAX(HWY_MAX(SortConstants::BufBytes<uint16_t>(vector_size),
                      SortConstants::BufBytes<uint32_t>(vector_size)),
              SortConstants::BufBytes<uint64_t>(vector_size));
  ptr_ = hwy::AllocateAlignedBytes(max_bytes, nullptr, nullptr);
  // Fail loudly at the allocation site if no buffer was obtained: passing a
  // null pointer to memset is undefined behavior, and the buffer is handed
  // to the sort functions afterwards.
  HWY_ASSERT(ptr_ != nullptr);
  // Prevent msan errors by initializing.
  memset(ptr_, 0, max_bytes);
#endif
}
// Releases the buffer owned by this Sorter, if any, and resets the pointer.
void Sorter::Delete() {
#if VQSORT_STACK
  // Nothing was allocated: the sort functions use stack storage.
#else
  FreeAlignedBytes(ptr_, nullptr, nullptr);
  ptr_ = nullptr;
#endif
}
#if !VQSORT_SECURE_RNG
// Writes 24 bytes of seed material to `bytes`. Depending on
// VQSORT_SECURE_SEED, first tries getrandom (1) or CryptGenRandom (2); if
// neither is configured or the attempt fails, falls back to mixing a stack
// address, `seed_heap`, a code address, clock() and `seed_num`.
void Sorter::Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes) {
#if VQSORT_SECURE_SEED == 1
  // May block if urandom is not yet initialized.
  const ssize_t num_filled = getrandom(bytes, 24, /*flags=*/0);
  if (num_filled == 24) return;
#elif VQSORT_SECURE_SEED == 2
  HCRYPTPROV provider{};
  if (CryptAcquireContextA(&provider, nullptr, nullptr, PROV_RSA_FULL,
                           CRYPT_VERIFYCONTEXT)) {
    const BOOL filled =
        CryptGenRandom(provider, 24, reinterpret_cast<BYTE*>(bytes));
    CryptReleaseContext(provider, 0);
    if (filled) return;
  }
#endif

  // VQSORT_SECURE_SEED == 0, or one of the above failed. Get some entropy
  // from stack/heap/code addresses and the clock() timer.
  uint64_t* out = reinterpret_cast<uint64_t*>(bytes);
  uint64_t** stack_addr = &out;
  void (*code_addr)(const void*, size_t, void*) = &Fill24Bytes;

  const uintptr_t stack_bits = reinterpret_cast<uintptr_t>(stack_addr);
  const uintptr_t heap_bits = reinterpret_cast<uintptr_t>(seed_heap);
  const uintptr_t code_bits = reinterpret_cast<uintptr_t>(code_addr);
  const uint64_t time_bits = static_cast<uint64_t>(clock());

  // The same time/seed_num mix is XORed into all three output words.
  const uint64_t common_bits = time_bits ^ seed_num;
  out[0] = stack_bits ^ common_bits;
  out[1] = heap_bits ^ common_bits;
  out[2] = code_bits ^ common_bits;
}
#endif // !VQSORT_SECURE_RNG
// Returns HaveFloat64() of the target selected by dynamic dispatch.
bool Sorter::HaveFloat64() {
  const bool supported = HWY_DYNAMIC_DISPATCH(HaveFloat64)();
  return supported;
}
} // namespace hwy
#endif // HWY_ONCE
+108
View File
@@ -0,0 +1,108 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Interface to vectorized quicksort with dynamic dispatch.
// Blog post: https://tinyurl.com/vqsort-blog
// Paper with measurements: https://arxiv.org/abs/2205.05982
//
// To ensure the overhead of using wide vectors (e.g. AVX2 or AVX-512) is
// worthwhile, we recommend using this code for sorting arrays whose size is at
// least 512 KiB.
#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
#include "hwy/base.h"
namespace hwy {
// Tag arguments that determine the sort order.
// Tag type selecting ascending order; IsAscending() is always true.
struct SortAscending {
  constexpr bool IsAscending() const {
    return true;
  }
};
// Tag type selecting descending order; IsAscending() is always false.
struct SortDescending {
  constexpr bool IsAscending() const {
    return false;
  }
};
// Allocates O(1) space. Type-erased RAII wrapper over hwy/aligned_allocator.h.
// This allows amortizing the allocation over multiple sorts.
class HWY_CONTRIB_DLLEXPORT Sorter {
public:
Sorter();
~Sorter() { Delete(); }
// Move-only
Sorter(const Sorter&) = delete;
Sorter& operator=(const Sorter&) = delete;
Sorter(Sorter&& other) {
Delete();
ptr_ = other.ptr_;
other.ptr_ = nullptr;
}
Sorter& operator=(Sorter&& other) {
Delete();
ptr_ = other.ptr_;
other.ptr_ = nullptr;
return *this;
}
// Sorts keys[0, n). Dispatches to the best available instruction set,
// and does not allocate memory.
void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(float* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(float* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(double* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(double* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(K32V32* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(K32V32* HWY_RESTRICT keys, size_t n, SortDescending) const;
// For internal use only
static void Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes);
static bool HaveFloat64();
private:
void Delete();
template <typename T>
T* Get() const {
return static_cast<T*>(ptr_);
}
void* ptr_ = nullptr;
};
} // namespace hwy
#endif // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
@@ -0,0 +1,62 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128a.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Per-target entry point for ascending sort of 128-bit keys stored as pairs
// of u64 lanes. `num` counts u64 lanes; `buf` is forwarded to Sort. Fails
// HWY_ASSERT when VQSORT_ENABLED is 0.
void Sort128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
                uint64_t* HWY_RESTRICT buf) {
#if VQSORT_ENABLED
  using OrderT = detail::OrderAscending128;
  using TraitsT = detail::SharedTraits<detail::Traits128<OrderT>>;
  SortTag<uint64_t> tag;
  TraitsT traits;
  Sort(tag, traits, keys, num, buf);
#else
  (void)keys;
  (void)num;
  (void)buf;
  HWY_ASSERT(0);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(Sort128Asc);
} // namespace
// Sorts keys[0, n) in ascending order by dispatching to Sort128Asc with the
// keys viewed as u64 lanes.
void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  uint64_t* const lanes = reinterpret_cast<uint64_t*>(keys);
  const size_t num_lanes = n * 2;  // two u64 lanes per 128-bit key
  HWY_DYNAMIC_DISPATCH(Sort128Asc)(lanes, num_lanes, Get<uint64_t>());
}
} // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,62 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128d.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Per-target entry point for descending sort of 128-bit keys stored as pairs
// of u64 lanes. `num` counts u64 lanes; `buf` is forwarded to Sort. Fails
// HWY_ASSERT when VQSORT_ENABLED is 0.
void Sort128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
                 uint64_t* HWY_RESTRICT buf) {
#if VQSORT_ENABLED
  using OrderT = detail::OrderDescending128;
  using TraitsT = detail::SharedTraits<detail::Traits128<OrderT>>;
  SortTag<uint64_t> tag;
  TraitsT traits;
  Sort(tag, traits, keys, num, buf);
#else
  (void)keys;
  (void)num;
  (void)buf;
  HWY_ASSERT(0);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(Sort128Desc);
} // namespace
// Sorts keys[0, n) in descending order by dispatching to Sort128Desc with
// the keys viewed as u64 lanes.
void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  uint64_t* const lanes = reinterpret_cast<uint64_t*>(keys);
  const size_t num_lanes = n * 2;  // two u64 lanes per 128-bit key
  HWY_DYNAMIC_DISPATCH(Sort128Desc)(lanes, num_lanes, Get<uint64_t>());
}
} // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,53 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32a.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortF32Asc(float* HWY_RESTRICT keys, size_t num, float* HWY_RESTRICT buf) {
SortTag<float> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<float>>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF32Asc);
} // namespace
// Sorts keys[0, n) in ascending order by dispatching to SortF32Asc.
void Sorter::operator()(float* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  float* const buf = Get<float>();
  HWY_DYNAMIC_DISPATCH(SortF32Asc)(keys, n, buf);
}
} // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,54 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32d.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortF32Desc(float* HWY_RESTRICT keys, size_t num,
float* HWY_RESTRICT buf) {
SortTag<float> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<float>>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF32Desc);
} // namespace
// Sorts keys[0, n) in descending order by dispatching to SortF32Desc.
void Sorter::operator()(float* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  float* const buf = Get<float>();
  HWY_DYNAMIC_DISPATCH(SortF32Desc)(keys, n, buf);
}
} // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,61 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64a.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Per-target entry point for ascending sort of `num` double keys; `buf` is
// forwarded to Sort. Fails HWY_ASSERT on targets where HWY_HAVE_FLOAT64 is 0.
void SortF64Asc(double* HWY_RESTRICT keys, size_t num,
                double* HWY_RESTRICT buf) {
#if HWY_HAVE_FLOAT64
  using OrderT = detail::OrderAscending<double>;
  using TraitsT = detail::SharedTraits<detail::TraitsLane<OrderT>>;
  SortTag<double> tag;
  TraitsT traits;
  Sort(tag, traits, keys, num, buf);
#else
  (void)keys;
  (void)num;
  (void)buf;
  HWY_ASSERT(0);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF64Asc);
} // namespace
// Sorts keys[0, n) in ascending order by dispatching to SortF64Asc.
void Sorter::operator()(double* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  double* const buf = Get<double>();
  HWY_DYNAMIC_DISPATCH(SortF64Asc)(keys, n, buf);
}
} // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,61 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64d.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Per-target entry point for descending sort of `num` double keys; `buf` is
// forwarded to Sort. Fails HWY_ASSERT on targets where HWY_HAVE_FLOAT64 is 0.
void SortF64Desc(double* HWY_RESTRICT keys, size_t num,
                 double* HWY_RESTRICT buf) {
#if HWY_HAVE_FLOAT64
  using OrderT = detail::OrderDescending<double>;
  using TraitsT = detail::SharedTraits<detail::TraitsLane<OrderT>>;
  SortTag<double> tag;
  TraitsT traits;
  Sort(tag, traits, keys, num, buf);
#else
  (void)keys;
  (void)num;
  (void)buf;
  HWY_ASSERT(0);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF64Desc);
} // namespace
// Sorts keys[0, n) in descending order by dispatching to SortF64Desc.
void Sorter::operator()(double* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  double* const buf = Get<double>();
  HWY_DYNAMIC_DISPATCH(SortF64Desc)(keys, n, buf);
}
} // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,54 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16a.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortI16Asc(int16_t* HWY_RESTRICT keys, size_t num,
int16_t* HWY_RESTRICT buf) {
SortTag<int16_t> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int16_t>>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI16Asc);
} // namespace
// Sorts keys[0, n) in ascending order by dispatching to SortI16Asc.
void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  int16_t* const buf = Get<int16_t>();
  HWY_DYNAMIC_DISPATCH(SortI16Asc)(keys, n, buf);
}
} // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,54 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16d.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortI16Desc(int16_t* HWY_RESTRICT keys, size_t num,
int16_t* HWY_RESTRICT buf) {
SortTag<int16_t> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int16_t>>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
// Exported so the HWY_DYNAMIC_DISPATCH(SortI16Desc) call below can resolve it.
HWY_EXPORT(SortI16Desc);
}  // namespace

// Descending overload for int16_t keys: fetches this Sorter's int16_t buffer
// via Get<int16_t>() and hands `keys`, `n` and that buffer to the dispatched
// SortI16Desc implementation.
void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  auto scratch = Get<int16_t>();
  HWY_DYNAMIC_DISPATCH(SortI16Desc)(keys, n, scratch);
}
}  // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,54 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32a.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortI32Asc(int32_t* HWY_RESTRICT keys, size_t num,
int32_t* HWY_RESTRICT buf) {
SortTag<int32_t> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int32_t>>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
// Exported so the HWY_DYNAMIC_DISPATCH(SortI32Asc) call below can resolve it.
HWY_EXPORT(SortI32Asc);
}  // namespace

// Ascending overload for int32_t keys: fetches this Sorter's int32_t buffer
// via Get<int32_t>() and hands `keys`, `n` and that buffer to the dispatched
// SortI32Asc implementation.
void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  auto scratch = Get<int32_t>();
  HWY_DYNAMIC_DISPATCH(SortI32Asc)(keys, n, scratch);
}
}  // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,54 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32d.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortI32Desc(int32_t* HWY_RESTRICT keys, size_t num,
int32_t* HWY_RESTRICT buf) {
SortTag<int32_t> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int32_t>>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
// Exported so the HWY_DYNAMIC_DISPATCH(SortI32Desc) call below can resolve it.
HWY_EXPORT(SortI32Desc);
}  // namespace

// Descending overload for int32_t keys: fetches this Sorter's int32_t buffer
// via Get<int32_t>() and hands `keys`, `n` and that buffer to the dispatched
// SortI32Desc implementation.
void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  auto scratch = Get<int32_t>();
  HWY_DYNAMIC_DISPATCH(SortI32Desc)(keys, n, scratch);
}
}  // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,54 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64a.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortI64Asc(int64_t* HWY_RESTRICT keys, size_t num,
int64_t* HWY_RESTRICT buf) {
SortTag<int64_t> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
// Exported so the HWY_DYNAMIC_DISPATCH(SortI64Asc) call below can resolve it.
HWY_EXPORT(SortI64Asc);
}  // namespace

// Ascending overload for int64_t keys: fetches this Sorter's int64_t buffer
// via Get<int64_t>() and hands `keys`, `n` and that buffer to the dispatched
// SortI64Asc implementation.
void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  auto scratch = Get<int64_t>();
  HWY_DYNAMIC_DISPATCH(SortI64Asc)(keys, n, scratch);
}
}  // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,54 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64d.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortI64Desc(int64_t* HWY_RESTRICT keys, size_t num,
int64_t* HWY_RESTRICT buf) {
SortTag<int64_t> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int64_t>>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
// Exported so the HWY_DYNAMIC_DISPATCH(SortI64Desc) call below can resolve it.
HWY_EXPORT(SortI64Desc);
}  // namespace

// Descending overload for int64_t keys: fetches this Sorter's int64_t buffer
// via Get<int64_t>() and hands `keys`, `n` and that buffer to the dispatched
// SortI64Desc implementation.
void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  auto scratch = Get<int64_t>();
  HWY_DYNAMIC_DISPATCH(SortI64Desc)(keys, n, scratch);
}
}  // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,65 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
// clang-format off
// (avoid line break, which would prevent Copybara rules from matching)
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128a.cc" //NOLINT
// clang-format on
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Per-target entry point for 128-bit key/value items viewed as uint64_t lanes,
// ascending order. With VQSORT_ENABLED it forwards to Sort() using the
// Traits128<OrderAscendingKV128> traits; otherwise the arguments are only
// marked as used and HWY_ASSERT(0) is invoked, i.e. this path is not expected
// to be reached.
void SortKV128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
                  uint64_t* HWY_RESTRICT buf) {
#if !VQSORT_ENABLED
  // Silence unused-parameter warnings before asserting.
  (void)keys;
  (void)num;
  (void)buf;
  HWY_ASSERT(0);
#else
  using Traits =
      detail::SharedTraits<detail::Traits128<detail::OrderAscendingKV128>>;
  SortTag<uint64_t> tag;
  Traits traits;
  Sort(tag, traits, keys, num, buf);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
// Exported so the HWY_DYNAMIC_DISPATCH(SortKV128Asc) call below can resolve it.
HWY_EXPORT(SortKV128Asc);
}  // namespace

// Ascending overload for K64V64 items. The array is reinterpreted as uint64_t
// lanes and the count is doubled (presumably because each K64V64 occupies two
// 64-bit lanes - confirm against the K64V64 definition in vqsort.h).
void Sorter::operator()(K64V64* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  uint64_t* const lanes = reinterpret_cast<uint64_t*>(keys);
  const size_t num_lanes = n * 2;
  auto scratch = Get<uint64_t>();
  HWY_DYNAMIC_DISPATCH(SortKV128Asc)(lanes, num_lanes, scratch);
}
}  // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,65 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
// clang-format off
// (avoid line break, which would prevent Copybara rules from matching)
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128d.cc" //NOLINT
// clang-format on
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Per-target entry point for 128-bit key/value items viewed as uint64_t lanes,
// descending order. With VQSORT_ENABLED it forwards to Sort() using the
// Traits128<OrderDescendingKV128> traits; otherwise the arguments are only
// marked as used and HWY_ASSERT(0) is invoked, i.e. this path is not expected
// to be reached.
void SortKV128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
                   uint64_t* HWY_RESTRICT buf) {
#if !VQSORT_ENABLED
  // Silence unused-parameter warnings before asserting.
  (void)keys;
  (void)num;
  (void)buf;
  HWY_ASSERT(0);
#else
  using Traits =
      detail::SharedTraits<detail::Traits128<detail::OrderDescendingKV128>>;
  SortTag<uint64_t> tag;
  Traits traits;
  Sort(tag, traits, keys, num, buf);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
// Exported so the HWY_DYNAMIC_DISPATCH(SortKV128Desc) call below can resolve
// it.
HWY_EXPORT(SortKV128Desc);
}  // namespace

// Descending overload for K64V64 items. The array is reinterpreted as uint64_t
// lanes and the count is doubled (presumably because each K64V64 occupies two
// 64-bit lanes - confirm against the K64V64 definition in vqsort.h).
void Sorter::operator()(K64V64* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  uint64_t* const lanes = reinterpret_cast<uint64_t*>(keys);
  const size_t num_lanes = n * 2;
  auto scratch = Get<uint64_t>();
  HWY_DYNAMIC_DISPATCH(SortKV128Desc)(lanes, num_lanes, scratch);
}
}  // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,65 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
// clang-format off
// (avoid line break, which would prevent Copybara rules from matching)
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64a.cc" //NOLINT
// clang-format on
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Per-target entry point for 64-bit key/value items viewed as uint64_t lanes,
// ascending order. With VQSORT_ENABLED it forwards to Sort() using the
// TraitsLane<OrderAscendingKV64> traits; otherwise the arguments are only
// marked as used and HWY_ASSERT(0) is invoked, i.e. this path is not expected
// to be reached.
void SortKV64Asc(uint64_t* HWY_RESTRICT keys, size_t num,
                 uint64_t* HWY_RESTRICT buf) {
#if !VQSORT_ENABLED
  // Silence unused-parameter warnings before asserting.
  (void)keys;
  (void)num;
  (void)buf;
  HWY_ASSERT(0);
#else
  using Traits =
      detail::SharedTraits<detail::TraitsLane<detail::OrderAscendingKV64>>;
  SortTag<uint64_t> tag;
  Traits traits;
  Sort(tag, traits, keys, num, buf);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
// Exported so the HWY_DYNAMIC_DISPATCH(SortKV64Asc) call below can resolve it.
HWY_EXPORT(SortKV64Asc);
}  // namespace

// Ascending overload for K32V32 items. The array is reinterpreted as uint64_t
// lanes and `n` is passed through unchanged (presumably one K32V32 per 64-bit
// lane - confirm against the K32V32 definition in vqsort.h).
void Sorter::operator()(K32V32* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  uint64_t* const lanes = reinterpret_cast<uint64_t*>(keys);
  auto scratch = Get<uint64_t>();
  HWY_DYNAMIC_DISPATCH(SortKV64Asc)(lanes, n, scratch);
}
}  // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,65 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
// clang-format off
// (avoid line break, which would prevent Copybara rules from matching)
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64d.cc" //NOLINT
// clang-format on
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Per-target entry point for 64-bit key/value items viewed as uint64_t lanes,
// descending order. With VQSORT_ENABLED it forwards to Sort() using the
// TraitsLane<OrderDescendingKV64> traits; otherwise the arguments are only
// marked as used and HWY_ASSERT(0) is invoked, i.e. this path is not expected
// to be reached.
void SortKV64Desc(uint64_t* HWY_RESTRICT keys, size_t num,
                  uint64_t* HWY_RESTRICT buf) {
#if !VQSORT_ENABLED
  // Silence unused-parameter warnings before asserting.
  (void)keys;
  (void)num;
  (void)buf;
  HWY_ASSERT(0);
#else
  using Traits =
      detail::SharedTraits<detail::TraitsLane<detail::OrderDescendingKV64>>;
  SortTag<uint64_t> tag;
  Traits traits;
  Sort(tag, traits, keys, num, buf);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
// Exported so the HWY_DYNAMIC_DISPATCH(SortKV64Desc) call below can resolve it.
HWY_EXPORT(SortKV64Desc);
}  // namespace

// Descending overload for K32V32 items. The array is reinterpreted as uint64_t
// lanes and `n` is passed through unchanged (presumably one K32V32 per 64-bit
// lane - confirm against the K32V32 definition in vqsort.h).
void Sorter::operator()(K32V32* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  uint64_t* const lanes = reinterpret_cast<uint64_t*>(keys);
  auto scratch = Get<uint64_t>();
  HWY_DYNAMIC_DISPATCH(SortKV64Desc)(lanes, n, scratch);
}
}  // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,54 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16a.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortU16Asc(uint16_t* HWY_RESTRICT keys, size_t num,
uint16_t* HWY_RESTRICT buf) {
SortTag<uint16_t> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint16_t>>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
// Exported so the HWY_DYNAMIC_DISPATCH(SortU16Asc) call below can resolve it.
HWY_EXPORT(SortU16Asc);
}  // namespace

// Ascending overload for uint16_t keys: fetches this Sorter's uint16_t buffer
// via Get<uint16_t>() and hands `keys`, `n` and that buffer to the dispatched
// SortU16Asc implementation.
void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  auto scratch = Get<uint16_t>();
  HWY_DYNAMIC_DISPATCH(SortU16Asc)(keys, n, scratch);
}
}  // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,55 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16d.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortU16Desc(uint16_t* HWY_RESTRICT keys, size_t num,
uint16_t* HWY_RESTRICT buf) {
SortTag<uint16_t> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint16_t>>>
st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
// Exported so the HWY_DYNAMIC_DISPATCH(SortU16Desc) call below can resolve it.
HWY_EXPORT(SortU16Desc);
}  // namespace

// Descending overload for uint16_t keys: fetches this Sorter's uint16_t buffer
// via Get<uint16_t>() and hands `keys`, `n` and that buffer to the dispatched
// SortU16Desc implementation.
void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  auto scratch = Get<uint16_t>();
  HWY_DYNAMIC_DISPATCH(SortU16Desc)(keys, n, scratch);
}
}  // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,54 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32a.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortU32Asc(uint32_t* HWY_RESTRICT keys, size_t num,
uint32_t* HWY_RESTRICT buf) {
SortTag<uint32_t> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint32_t>>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
// Exported so the HWY_DYNAMIC_DISPATCH(SortU32Asc) call below can resolve it.
HWY_EXPORT(SortU32Asc);
}  // namespace

// Ascending overload for uint32_t keys: fetches this Sorter's uint32_t buffer
// via Get<uint32_t>() and hands `keys`, `n` and that buffer to the dispatched
// SortU32Asc implementation.
void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  auto scratch = Get<uint32_t>();
  HWY_DYNAMIC_DISPATCH(SortU32Asc)(keys, n, scratch);
}
}  // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,55 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32d.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortU32Desc(uint32_t* HWY_RESTRICT keys, size_t num,
uint32_t* HWY_RESTRICT buf) {
SortTag<uint32_t> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint32_t>>>
st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
// Exported so the HWY_DYNAMIC_DISPATCH(SortU32Desc) call below can resolve it.
HWY_EXPORT(SortU32Desc);
}  // namespace

// Descending overload for uint32_t keys: fetches this Sorter's uint32_t buffer
// via Get<uint32_t>() and hands `keys`, `n` and that buffer to the dispatched
// SortU32Desc implementation.
void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  auto scratch = Get<uint32_t>();
  HWY_DYNAMIC_DISPATCH(SortU32Desc)(keys, n, scratch);
}
}  // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,54 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64a.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortU64Asc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
SortTag<uint64_t> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint64_t>>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
// Exported so the HWY_DYNAMIC_DISPATCH(SortU64Asc) call below can resolve it.
HWY_EXPORT(SortU64Asc);
}  // namespace

// Ascending overload for uint64_t keys: fetches this Sorter's uint64_t buffer
// via Get<uint64_t>() and hands `keys`, `n` and that buffer to the dispatched
// SortU64Asc implementation.
void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  auto scratch = Get<uint64_t>();
  HWY_DYNAMIC_DISPATCH(SortU64Asc)(keys, n, scratch);
}
}  // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,55 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64d.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortU64Desc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
SortTag<uint64_t> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint64_t>>>
st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
// Exported so the HWY_DYNAMIC_DISPATCH(SortU64Desc) call below can resolve it.
HWY_EXPORT(SortU64Desc);
}  // namespace

// Descending overload for uint64_t keys: fetches this Sorter's uint64_t buffer
// via Get<uint64_t>() and hands `keys`, `n` and that buffer to the dispatched
// SortU64Desc implementation.
void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  auto scratch = Get<uint64_t>();
  HWY_DYNAMIC_DISPATCH(SortU64Desc)(keys, n, scratch);
}
}  // namespace hwy
#endif // HWY_ONCE
@@ -0,0 +1,234 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
#define HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
// Detects compiler and arch from predefined macros. Zero dependencies for
// inclusion by foreach_target.h.
// Add to #if conditions to prevent IDE from graying out code.
// HWY_IDE is 1 when any of the IDE/code-indexer macros tested below is defined
// (__CDT_PARSER__, __INTELLISENSE__, Q_CREATOR_RUN, __CLANGD__), otherwise 0.
// It is always defined, so it can be used directly in #if expressions.
#if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \
(defined Q_CREATOR_RUN) || (defined(__CLANGD__))
#define HWY_IDE 1
#else
#define HWY_IDE 0
#endif
//------------------------------------------------------------------------------
// Compiler
// Actual MSVC, not clang-cl, which defines _MSC_VER but doesn't behave like
// MSVC in other aspects (e.g. HWY_DIAGNOSTICS).
// HWY_COMPILER_MSVC: value of _MSC_VER when _MSC_VER is defined and __clang__
// is not; otherwise 0.
#if defined(_MSC_VER) && !defined(__clang__)
#define HWY_COMPILER_MSVC _MSC_VER
#else
#define HWY_COMPILER_MSVC 0
#endif
// HWY_COMPILER_CLANGCL: value of _MSC_VER when both _MSC_VER and __clang__ are
// defined (clang-cl); otherwise 0. Mutually exclusive with HWY_COMPILER_MSVC.
#if defined(_MSC_VER) && defined(__clang__)
#define HWY_COMPILER_CLANGCL _MSC_VER
#else
#define HWY_COMPILER_CLANGCL 0
#endif
// HWY_COMPILER_ICC: value of __INTEL_COMPILER if defined; otherwise 0.
#ifdef __INTEL_COMPILER
#define HWY_COMPILER_ICC __INTEL_COMPILER
#else
#define HWY_COMPILER_ICC 0
#endif
// HWY_COMPILER_ICX: value of __INTEL_LLVM_COMPILER if defined; otherwise 0.
#ifdef __INTEL_LLVM_COMPILER
#define HWY_COMPILER_ICX __INTEL_LLVM_COMPILER
#else
#define HWY_COMPILER_ICX 0
#endif
// HWY_COMPILER_GCC is a generic macro for all compilers implementing the GNU
// compiler extensions (e.g. Clang, Intel...)
// Its value is __GNUC__ * 100 + __GNUC_MINOR__ when __GNUC__ is defined, and 0
// otherwise.
#ifdef __GNUC__
#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
#else
#define HWY_COMPILER_GCC 0
#endif
// Clang or clang-cl, not GCC.
// HWY_COMPILER_CLANG is 0 when __clang__ is not defined. Otherwise it is
// __clang_major__ * 100 + __clang_minor__, except for Apple builds or when
// __clang_major__ >= 999: there a value in steps of 100 (600..1400) is chosen
// by probing features from newest to oldest, so the first match wins.
#ifdef __clang__
// In case of Apple LLVM (whose version number is unrelated to that of LLVM) or
// an invalid version number, deduce it from the presence of warnings.
// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
#if defined(__apple_build_version__) || __clang_major__ >= 999
#if __has_warning("-Wbitwise-instead-of-logical")
#define HWY_COMPILER_CLANG 1400
#elif __has_warning("-Wreserved-identifier")
#define HWY_COMPILER_CLANG 1300
#elif __has_warning("-Wformat-insufficient-args")
#define HWY_COMPILER_CLANG 1200
#elif __has_warning("-Wimplicit-const-int-float-conversion")
#define HWY_COMPILER_CLANG 1100
#elif __has_warning("-Wmisleading-indentation")
#define HWY_COMPILER_CLANG 1000
#elif defined(__FILE_NAME__)
#define HWY_COMPILER_CLANG 900
#elif __has_warning("-Wextra-semi-stmt") || \
__has_builtin(__builtin_rotateleft32)
#define HWY_COMPILER_CLANG 800
// For reasons unknown, XCode 10.3 (Apple LLVM version 10.0.1) is apparently
// based on Clang 7, but does not support the warning we test.
// See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions and
// https://trac.macports.org/wiki/XcodeVersionInfo.
#elif __has_warning("-Wc++98-compat-extra-semi") || \
(defined(__apple_build_version__) && __apple_build_version__ >= 10010000)
#define HWY_COMPILER_CLANG 700
#else // Anything older than 7.0 is not recommended for Highway.
#define HWY_COMPILER_CLANG 600
#endif // __has_warning chain
#else // use normal version
#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
#endif
#else // Not clang
#define HWY_COMPILER_CLANG 0
#endif
// HWY_COMPILER_GCC_ACTUAL equals HWY_COMPILER_GCC only when
// HWY_COMPILER_CLANG is 0, and is 0 otherwise. Unlike HWY_COMPILER_GCC, it is
// therefore never set for Clang, which also defines __GNUC__.
#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
#define HWY_COMPILER_GCC_ACTUAL HWY_COMPILER_GCC
#else
#define HWY_COMPILER_GCC_ACTUAL 0
#endif
// More than one may be nonzero, but we want at least one.
// (HWY_COMPILER_ICX is not part of this sum.)
#if 0 == (HWY_COMPILER_MSVC + HWY_COMPILER_CLANGCL + HWY_COMPILER_ICC + \
HWY_COMPILER_GCC + HWY_COMPILER_CLANG)
#error "Unsupported compiler"
#endif
// We should only detect one of these (only clang/clangcl overlap)
// Each !! collapses a version number to 0 or 1; CLANGCL and CLANG are OR-ed
// first so that clang-cl counts as a single compiler. GCC_ACTUAL is used
// instead of HWY_COMPILER_GCC because the latter is also nonzero for Clang.
#if 1 < \
(!!HWY_COMPILER_MSVC + !!HWY_COMPILER_ICC + !!HWY_COMPILER_GCC_ACTUAL + \
!!(HWY_COMPILER_CLANGCL | HWY_COMPILER_CLANG))
#error "Detected multiple compilers"
#endif
// Portable wrappers around the compiler's feature-test operators. Each one
// forwards to the underlying operator when the compiler provides it and
// otherwise evaluates to 0, so they are safe to use in any #if expression.
// HWY_HAS_BUILTIN(name): wraps __has_builtin.
#ifdef __has_builtin
#define HWY_HAS_BUILTIN(name) __has_builtin(name)
#else
#define HWY_HAS_BUILTIN(name) 0
#endif
// HWY_HAS_ATTRIBUTE(name): wraps __has_attribute.
#ifdef __has_attribute
#define HWY_HAS_ATTRIBUTE(name) __has_attribute(name)
#else
#define HWY_HAS_ATTRIBUTE(name) 0
#endif
// HWY_HAS_FEATURE(name): wraps __has_feature.
#ifdef __has_feature
#define HWY_HAS_FEATURE(name) __has_feature(name)
#else
#define HWY_HAS_FEATURE(name) 0
#endif
//------------------------------------------------------------------------------
// Architecture
// HWY_ARCH_X86_32: 1 if __i386__ or _M_IX86 is defined, else 0.
#if defined(__i386__) || defined(_M_IX86)
#define HWY_ARCH_X86_32 1
#else
#define HWY_ARCH_X86_32 0
#endif
// HWY_ARCH_X86_64: 1 if __x86_64__ or _M_X64 is defined, else 0.
#if defined(__x86_64__) || defined(_M_X64)
#define HWY_ARCH_X86_64 1
#else
#define HWY_ARCH_X86_64 0
#endif
// The 32-bit and 64-bit variants are mutually exclusive.
#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64
#error "Cannot have both x86-32 and x86-64"
#endif
// HWY_ARCH_X86: 1 for either x86 variant, else 0.
#if HWY_ARCH_X86_32 || HWY_ARCH_X86_64
#define HWY_ARCH_X86 1
#else
#define HWY_ARCH_X86 0
#endif
// HWY_ARCH_PPC: 1 if __powerpc64__ or _M_PPC is defined, else 0.
#if defined(__powerpc64__) || defined(_M_PPC)
#define HWY_ARCH_PPC 1
#else
#define HWY_ARCH_PPC 0
#endif
// HWY_ARCH_ARM_A64: 1 if any of __ARM_ARCH_ISA_A64, __aarch64__ or _M_ARM64 is
// defined, else 0.
#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)
#define HWY_ARCH_ARM_A64 1
#else
#define HWY_ARCH_ARM_A64 0
#endif
// HWY_ARCH_ARM_V7: 1 if __ARM_ARCH or _M_ARM is defined and equal to 7, else 0.
#if (defined(__ARM_ARCH) && __ARM_ARCH == 7) || (defined(_M_ARM) && _M_ARM == 7)
#define HWY_ARCH_ARM_V7 1
#else
#define HWY_ARCH_ARM_V7 0
#endif
// The two Arm variants are mutually exclusive.
#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7
#error "Cannot have both A64 and V7"
#endif
// Any *supported* version of Arm, i.e. 7 or later
#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
#define HWY_ARCH_ARM 1
#else
#define HWY_ARCH_ARM 0
#endif
// Older than v7 (e.g. armel aka Arm v5), in which case we do not support SIMD.
// Set when an Arm macro (__arm__ or _M_ARM) is defined but neither A64 nor V7
// was detected above.
#if (defined(__arm__) || defined(_M_ARM)) && !HWY_ARCH_ARM
#define HWY_ARCH_ARM_OLD 1
#else
#define HWY_ARCH_ARM_OLD 0
#endif
// HWY_ARCH_WASM: 1 if __EMSCRIPTEN__, __wasm__ or __WASM__ is defined, else 0.
#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
#define HWY_ARCH_WASM 1
#else
#define HWY_ARCH_WASM 0
#endif
// HWY_ARCH_RVV: 1 if __riscv is defined, else 0. Note that only __riscv is
// tested here; no vector-extension macro is checked in this header.
#ifdef __riscv
#define HWY_ARCH_RVV 1
#else
#define HWY_ARCH_RVV 0
#endif
// It is an error to detect multiple architectures at the same time, but OK to
// detect none of the above.
// Each HWY_ARCH_* macro summed here is exactly 0 or 1, so the sum is the
// number of architecture families detected.
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_ARM_OLD + \
HWY_ARCH_WASM + HWY_ARCH_RVV) > 1
#error "Must not detect more than one architecture"
#endif
// HWY_OS_WIN: 1 if _WIN32 or _WIN64 is defined, else 0.
#if defined(_WIN32) || defined(_WIN64)
#define HWY_OS_WIN 1
#else
#define HWY_OS_WIN 0
#endif
// HWY_OS_LINUX: 1 if linux or __linux__ is defined, else 0.
#if defined(linux) || defined(__linux__)
#define HWY_OS_LINUX 1
#else
#define HWY_OS_LINUX 0
#endif
#endif // HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
+478
View File
@@ -0,0 +1,478 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_DETECT_TARGETS_H_
#define HIGHWAY_HWY_DETECT_TARGETS_H_
// Defines targets and chooses which to enable.
#include "hwy/detect_compiler_arch.h"
//------------------------------------------------------------------------------
// Optional configuration
// See g3doc/quick_reference.md for documentation of these macros.
// Uncomment to override the default baseline determined from predefined macros:
// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
// Uncomment to override the default blocklist:
// #define HWY_BROKEN_TARGETS HWY_AVX3
// Uncomment to definitely avoid generating those target(s):
// #define HWY_DISABLED_TARGETS HWY_SSE4
// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
// AVX2 target for VMs which support AVX2 but not the other instruction sets)
// #define HWY_DISABLE_BMI2_FMA
// Uncomment to enable SSSE3/SSE4 on MSVC even if AVX is not enabled
// #define HWY_WANT_SSSE3
// #define HWY_WANT_SSE4
//------------------------------------------------------------------------------
// Targets
// Unique bit value for each target. A lower value is "better" (e.g. more lanes)
// than a higher value within the same group/platform - see HWY_STATIC_TARGET.
//
// All values are unconditionally defined so we can test HWY_TARGETS without
// first checking the HWY_ARCH_*.
//
// The C99 preprocessor evaluates #if expressions using intmax_t types. This
// holds at least 64 bits in practice (verified 2022-07-18 via Godbolt on
// 32-bit clang/GCC/MSVC compilers for x86/Arm7/AArch32/RISC-V/WASM). We now
// avoid overflow when computing HWY_TARGETS (subtracting one instead of
// left-shifting 2^62), but still do not use bit 63 because it is the sign bit.
// Bit layout summary (derived from the definitions below):
//   0..14  x86            15..29  Arm           30..38  RISC-V
//   39..42 future use     43..51  IBM Power     52..60  WebAssembly
//   61..62 emulation (EMU128, SCALAR); bit 63 is never used.
// Each HWY_HIGHEST_TARGET_BIT_* is the index of the last bit in its group.
// --------------------------- x86: 15 targets (+ one fallback)
// Bits 0..6 reserved (7 targets)
// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VPOPCNTDQ, VBMI, VBMI2,
// VAES, BITALG). Later to be added: BF16 (Cooper Lake). VP2INTERSECT is only in
// Tiger Lake? We do not yet have uses for GFNI.
#define HWY_AVX3_DL (1LL << 7) // see HWY_WANT_AVX3_DL below
#define HWY_AVX3 (1LL << 8)
#define HWY_AVX2 (1LL << 9)
// Bit 10: reserved for AVX
#define HWY_SSE4 (1LL << 11)
#define HWY_SSSE3 (1LL << 12)
// Bits 13..14 reserved for SSE3 or SSE2 (2 targets)
// The highest bit in the HWY_TARGETS mask that a x86 target can have. Used for
// dynamic dispatch. All x86 target bits must be lower or equal to
// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
// HWY_MAX_DYNAMIC_TARGETS in total.
#define HWY_HIGHEST_TARGET_BIT_X86 14
// --------------------------- Arm: 15 targets (+ one fallback)
// Bits 15..23 reserved (9 targets)
#define HWY_SVE2_128 (1LL << 24) // specialized target (e.g. Arm N2)
#define HWY_SVE_256 (1LL << 25) // specialized target (e.g. Arm V1)
#define HWY_SVE2 (1LL << 26)
#define HWY_SVE (1LL << 27)
#define HWY_NEON (1LL << 28) // On A64, includes/requires AES
// Bit 29 reserved (Helium?)
#define HWY_HIGHEST_TARGET_BIT_ARM 29
// --------------------------- RISC-V: 9 targets (+ one fallback)
// Bits 30..36 reserved (7 targets)
#define HWY_RVV (1LL << 37)
// Bit 38 reserved
#define HWY_HIGHEST_TARGET_BIT_RVV 38
// --------------------------- Future expansion: 4 targets
// Bits 39..42 reserved
// --------------------------- IBM Power: 9 targets (+ one fallback)
// Bits 43..48 reserved (6 targets)
#define HWY_PPC8 (1LL << 49) // v2.07 or 3
// Bits 50..51 reserved for prior VSX/AltiVec (2 targets)
#define HWY_HIGHEST_TARGET_BIT_PPC 51
// --------------------------- WebAssembly: 9 targets (+ one fallback)
// Bits 52..57 reserved (6 targets)
#define HWY_WASM_EMU256 (1LL << 58) // Experimental
#define HWY_WASM (1LL << 59)
// Bit 60 reserved
#define HWY_HIGHEST_TARGET_BIT_WASM 60
// --------------------------- Emulation: 2 targets
// These are the "fallback" targets referred to in the group headings above.
#define HWY_EMU128 (1LL << 61)
// We do not add/left-shift, so this will not overflow to a negative number.
#define HWY_SCALAR (1LL << 62)
#define HWY_HIGHEST_TARGET_BIT_SCALAR 62
// Do not use bit 63 - would be confusing to have negative numbers.
//------------------------------------------------------------------------------
// Set default blocklists
// Disabled means excluded from enabled at user's request. A separate config
// macro allows disabling without deactivating the blocklist below.
// Default: the user has not asked for any target to be excluded.
#ifndef HWY_DISABLED_TARGETS
#define HWY_DISABLED_TARGETS 0
#endif
// Broken means excluded from enabled due to known compiler issues. Allow the
// user to override this blocklist without any guarantee of success.
// Only the first matching branch of the #if/#elif chain below is taken, so
// exactly one blocklist is selected per build; the final #else yields 0.
#ifndef HWY_BROKEN_TARGETS
// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
// SSE4 codegen (possibly only for msan), so disable all those targets.
#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
#define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
// This entails a major speed reduction, so warn unless the user explicitly
// opts in to scalar-only.
#if !defined(HWY_COMPILE_ONLY_SCALAR)
#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
#endif
// 32-bit may fail to compile AVX2/3.
#elif HWY_ARCH_X86_32
#define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
#elif HWY_COMPILER_MSVC != 0
#define HWY_BROKEN_TARGETS (HWY_AVX3 | HWY_AVX3_DL)
// armv7be has not been tested and is not yet supported.
#elif HWY_ARCH_ARM_V7 && \
(defined(__ARM_BIG_ENDIAN) || \
(defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN))
#define HWY_BROKEN_TARGETS (HWY_NEON)
// SVE[2] require recent clang or gcc versions.
// Note: this branch is not restricted to Arm; on other architectures the SVE
// bits are simply never part of the baseline, so masking them is harmless.
#elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
(HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000)
#define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)
#else
#define HWY_BROKEN_TARGETS 0
#endif
#endif // HWY_BROKEN_TARGETS
// Enabled means not disabled nor blocklisted.
// `targets` is a bitmask of HWY_* target bits; the result is the same mask with
// every bit in HWY_DISABLED_TARGETS or HWY_BROKEN_TARGETS cleared.
#define HWY_ENABLED(targets) \
((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
// Opt-out for EMU128 (affected by a GCC bug on multiple arches, fixed in 12.3:
// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322). This is separate
// from HWY_BROKEN_TARGETS because it affects the fallback target, which must
// always be enabled. If 1, we instead choose HWY_SCALAR even without
// HWY_COMPILE_ONLY_SCALAR being set.
#if !defined(HWY_BROKEN_EMU128) // allow overriding
// 1203 presumably encodes GCC 12.3 (major * 100 + minor), matching the
// "fixed in 12.3" note above - TODO confirm against HWY_COMPILER_GCC_ACTUAL.
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1203
#define HWY_BROKEN_EMU128 1
#else
#define HWY_BROKEN_EMU128 0
#endif
#endif // HWY_BROKEN_EMU128
//------------------------------------------------------------------------------
// Detect baseline targets using predefined macros
// Baseline means the targets for which the compiler is allowed to generate
// instructions, implying the target CPU would have to support them. This does
// not take the blocklist into account.
// The fallback target: HWY_EMU128 unless the user requested scalar-only or
// EMU128 is known to be broken, in which case HWY_SCALAR is used instead.
#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
#define HWY_BASELINE_SCALAR HWY_SCALAR
#else
#define HWY_BASELINE_SCALAR HWY_EMU128
#endif
// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
// HWY_TARGET == HWY_BASELINE_SCALAR.
// WebAssembly baseline: requires SIMD128 to be enabled in the compiler. If the
// user defines HWY_WANT_WASM2, the experimental HWY_WASM_EMU256 target is
// chosen instead of HWY_WASM.
#if HWY_ARCH_WASM && defined(__wasm_simd128__)
#if defined(HWY_WANT_WASM2)
#define HWY_BASELINE_WASM HWY_WASM_EMU256
#else
#define HWY_BASELINE_WASM HWY_WASM
#endif // HWY_WANT_WASM2
#else
#define HWY_BASELINE_WASM 0
#endif
// Avoid choosing the PPC target until we have an implementation.
// The trailing `&& 0` deliberately makes this condition always false, so
// HWY_BASELINE_PPC8 is currently always 0.
#if HWY_ARCH_PPC && defined(__VSX__) && 0
#define HWY_BASELINE_PPC8 HWY_PPC8
#else
#define HWY_BASELINE_PPC8 0
#endif
// Arm baselines. All three default to 0 so the macros exist on every
// architecture; they are re-defined below only when compiling for Arm and the
// corresponding compiler feature macro is present.
#define HWY_BASELINE_SVE2 0
#define HWY_BASELINE_SVE 0
#define HWY_BASELINE_NEON 0
#if HWY_ARCH_ARM
#if defined(__ARM_FEATURE_SVE2)
#undef HWY_BASELINE_SVE2 // was 0, will be re-defined
// If user specified -msve-vector-bits=128, they assert the vector length is
// 128 bits and we should use the HWY_SVE2_128 (more efficient for some ops).
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 128
#define HWY_BASELINE_SVE2 HWY_SVE2_128
// Otherwise we're not sure what the vector length will be. The baseline must be
// unconditionally valid, so we can only assume HWY_SVE2. However, when running
// on a CPU with 128-bit vectors, user code that supports dynamic dispatch will
// still benefit from HWY_SVE2_128 because we add it to HWY_ATTAINABLE_TARGETS.
#else
#define HWY_BASELINE_SVE2 HWY_SVE2
#endif // __ARM_FEATURE_SVE_BITS
#endif // __ARM_FEATURE_SVE2
#if defined(__ARM_FEATURE_SVE)
#undef HWY_BASELINE_SVE // was 0, will be re-defined
// See above. If user-specified vector length matches our optimization, use it.
// For plain SVE the specialized width is 256 bits (HWY_SVE_256), not 128.
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
#define HWY_BASELINE_SVE HWY_SVE_256
#else
#define HWY_BASELINE_SVE HWY_SVE
#endif // __ARM_FEATURE_SVE_BITS
#endif // __ARM_FEATURE_SVE
// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#undef HWY_BASELINE_NEON
#define HWY_BASELINE_NEON HWY_NEON
#endif
#endif // HWY_ARCH_ARM
// Special handling for MSVC because it has fewer predefined macros:
// Defines HWY_CHECK_{SSSE3,SSE4,PCLMUL_AES,BMI2_FMA,F16C} to 0 or 1. These are
// inputs to the x86 HWY_BASELINE_* decisions that follow; they are defined on
// every architecture (the x86 check happens where they are used).
#if HWY_COMPILER_MSVC
// 1) We can only be sure SSSE3/SSE4 are enabled if AVX is:
// https://stackoverflow.com/questions/18563978/.
#if defined(__AVX__)
#define HWY_CHECK_SSSE3 1
#define HWY_CHECK_SSE4 1
#else
#define HWY_CHECK_SSSE3 0
#define HWY_CHECK_SSE4 0
#endif
// 2) Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
// PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
// Hence these are unconditionally 1 on MSVC and never block a target.
#define HWY_CHECK_PCLMUL_AES 1
#define HWY_CHECK_BMI2_FMA 1
#define HWY_CHECK_F16C 1
#else // non-MSVC
#if defined(__SSSE3__)
#define HWY_CHECK_SSSE3 1
#else
#define HWY_CHECK_SSSE3 0
#endif
// SSE4 here means both SSE4.1 and SSE4.2.
#if defined(__SSE4_1__) && defined(__SSE4_2__)
#define HWY_CHECK_SSE4 1
#else
#define HWY_CHECK_SSE4 0
#endif
// If these are disabled, they should not gate the availability of SSE4/AVX2.
// In other words: a HWY_DISABLE_* macro makes the corresponding check pass
// regardless of whether the compiler enables those instructions.
#if defined(HWY_DISABLE_PCLMUL_AES) || (defined(__PCLMUL__) && defined(__AES__))
#define HWY_CHECK_PCLMUL_AES 1
#else
#define HWY_CHECK_PCLMUL_AES 0
#endif
#if defined(HWY_DISABLE_BMI2_FMA) || (defined(__BMI2__) && defined(__FMA__))
#define HWY_CHECK_BMI2_FMA 1
#else
#define HWY_CHECK_BMI2_FMA 0
#endif
#if defined(HWY_DISABLE_F16C) || defined(__F16C__)
#define HWY_CHECK_F16C 1
#else
#define HWY_CHECK_F16C 0
#endif
#endif // non-MSVC
// SSSE3 is in the baseline on x86 if the compiler reports it (HWY_CHECK_SSSE3)
// or the user opted in by defining HWY_WANT_SSSE3.
// The opt-in is tested with defined() because it is documented as a bare
// `#define HWY_WANT_SSSE3`: an empty macro used as a value would expand to
// `( || ...)` and be a preprocessor syntax error.
#if HWY_ARCH_X86 && (defined(HWY_WANT_SSSE3) || HWY_CHECK_SSSE3)
#define HWY_BASELINE_SSSE3 HWY_SSSE3
#else
#define HWY_BASELINE_SSSE3 0
#endif
// SSE4 additionally requires PCLMUL/AES (unless that requirement was disabled),
// or an explicit opt-in via HWY_WANT_SSE4 (tested with defined(), see above).
#if HWY_ARCH_X86 && \
    (defined(HWY_WANT_SSE4) || (HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES))
#define HWY_BASELINE_SSE4 HWY_SSE4
#else
#define HWY_BASELINE_SSE4 0
#endif
// The AVX2 / AVX3 / AVX3_DL baselines form a chain: each one requires the
// previous baseline to be non-zero plus its own compiler feature macros.
// AVX2 additionally requires BMI2/FMA and F16C (or their HWY_DISABLE_* opt-outs).
#if HWY_BASELINE_SSE4 != 0 && HWY_CHECK_BMI2_FMA && HWY_CHECK_F16C && \
defined(__AVX2__)
#define HWY_BASELINE_AVX2 HWY_AVX2
#else
#define HWY_BASELINE_AVX2 0
#endif
// Require everything in AVX2 plus AVX-512 flags (also set by MSVC)
#if HWY_BASELINE_AVX2 != 0 && defined(__AVX512F__) && defined(__AVX512BW__) && \
defined(__AVX512DQ__) && defined(__AVX512VL__)
#define HWY_BASELINE_AVX3 HWY_AVX3
#else
#define HWY_BASELINE_AVX3 0
#endif
// TODO(janwas): not yet known whether these will be set by MSVC
// AVX3_DL requires AVX3 plus all seven extension macros tested below.
#if HWY_BASELINE_AVX3 != 0 && defined(__AVXVNNI__) && defined(__VAES__) && \
defined(__VPCLMULQDQ__) && defined(__AVX512VBMI__) && \
defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
defined(__AVX512BITALG__)
#define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
#else
#define HWY_BASELINE_AVX3_DL 0
#endif
// RISC-V baseline: requires both the architecture and the compiler's
// __riscv_vector macro.
#if HWY_ARCH_RVV && defined(__riscv_vector)
#define HWY_BASELINE_RVV HWY_RVV
#else
#define HWY_BASELINE_RVV 0
#endif
// Allow the user to override this without any guarantee of success.
// By default this is the union of all per-target HWY_BASELINE_* values above;
// each is either 0 or a single target bit. HWY_BASELINE_SCALAR is always
// non-zero, so the union always contains the fallback target.
#ifndef HWY_BASELINE_TARGETS
#define HWY_BASELINE_TARGETS \
(HWY_BASELINE_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | \
HWY_BASELINE_SVE2 | HWY_BASELINE_SVE | HWY_BASELINE_NEON | \
HWY_BASELINE_SSSE3 | HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | \
HWY_BASELINE_AVX3 | HWY_BASELINE_AVX3_DL | HWY_BASELINE_RVV)
#endif // HWY_BASELINE_TARGETS
//------------------------------------------------------------------------------
// Choose target for static dispatch
// Baseline targets that survive the disabled/broken blocklists.
#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
// Can trigger if the user disables (or a blocklist removes) every baseline
// target, including the scalar/EMU128 fallback.
#if HWY_ENABLED_BASELINE == 0
#error "At least one baseline target must be defined and enabled"
#endif
// Best baseline, used for static dispatch. This is the least-significant 1-bit
// within HWY_ENABLED_BASELINE and lower bit values imply "better".
// (x & -x) isolates the lowest set bit of x in two's complement arithmetic.
#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
// Start by assuming static dispatch. If we later use dynamic dispatch, this
// will be defined to other targets during the multiple-inclusion, and finally
// return to the initial value. Defining this outside begin/end_target ensures
// inl headers successfully compile by themselves (required by Bazel).
#define HWY_TARGET HWY_STATIC_TARGET
//------------------------------------------------------------------------------
// Choose targets for dynamic dispatch according to one of four policies
// defined(X) evaluates to 0 or 1, so the sum below counts how many of the
// mutually exclusive HWY_COMPILE_ONLY_* macros were defined.
#if 1 < (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_EMU128) + \
defined(HWY_COMPILE_ONLY_STATIC))
#error "Can only define one of HWY_COMPILE_ONLY_{SCALAR|EMU128|STATIC} - bug?"
#endif
// Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.
// HWY_HAVE_RUNTIME_DISPATCH is 1 if CPU capabilities can be queried at runtime
// on this platform/compiler combination, otherwise 0.
// Clang, GCC and MSVC allow runtime dispatch on x86.
#if HWY_ARCH_X86
#define HWY_HAVE_RUNTIME_DISPATCH 1
// On Arm, currently only GCC does, and we require Linux to detect CPU
// capabilities.
#elif HWY_ARCH_ARM && HWY_COMPILER_GCC_ACTUAL && HWY_OS_LINUX
#define HWY_HAVE_RUNTIME_DISPATCH 1
#else
#define HWY_HAVE_RUNTIME_DISPATCH 0
#endif
// AVX3_DL is not widely available yet. To reduce code size and compile time,
// only include it in the set of attainable targets (for dynamic dispatch) if
// the user opts in, OR it is in the baseline (we check whether enabled below).
// The baseline test must use HWY_BASELINE_TARGETS: an identifier that is not a
// defined macro silently evaluates to 0 inside #if, which would leave AVX3_DL
// out of the attainable set even when it is the static target.
#if defined(HWY_WANT_AVX3_DL) || (HWY_BASELINE_TARGETS & HWY_AVX3_DL)
#define HWY_ATTAINABLE_AVX3_DL HWY_AVX3_DL
#else
#define HWY_ATTAINABLE_AVX3_DL 0
#endif
// SVE targets are attainable only on A64, and only if either runtime dispatch
// is available or an SVE variant is already part of the enabled baseline. Both
// the generic and the fixed-width variant are then offered (if enabled).
#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
(HWY_ENABLED_BASELINE & (HWY_SVE | HWY_SVE_256)))
#define HWY_ATTAINABLE_SVE HWY_ENABLED(HWY_SVE | HWY_SVE_256)
#else
#define HWY_ATTAINABLE_SVE 0
#endif
// Same rule for SVE2 and its 128-bit specialization.
#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
(HWY_ENABLED_BASELINE & (HWY_SVE2 | HWY_SVE2_128)))
#define HWY_ATTAINABLE_SVE2 HWY_ENABLED(HWY_SVE2 | HWY_SVE2_128)
#else
#define HWY_ATTAINABLE_SVE2 0
#endif
// Attainable means enabled and the compiler allows intrinsics (even when not
// allowed to autovectorize). Used in 3 and 4.
// ("3 and 4" are the numbered HWY_TARGETS policies defined further below.)
#if HWY_ARCH_X86
#define HWY_ATTAINABLE_TARGETS \
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | \
HWY_AVX3 | HWY_ATTAINABLE_AVX3_DL)
#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
#define HWY_ATTAINABLE_TARGETS \
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_NEON | HWY_ATTAINABLE_SVE | \
HWY_ATTAINABLE_SVE2)
// Everything else: only the enabled baseline, plus any SVE targets that the
// rules above admitted.
#else
#define HWY_ATTAINABLE_TARGETS \
(HWY_ENABLED_BASELINE | HWY_ATTAINABLE_SVE | HWY_ATTAINABLE_SVE2)
#endif
// 1) For older compilers: avoid SIMD intrinsics, but still support all ops.
// Defines HWY_TARGETS, the bitmask of targets to compile. The first matching
// policy in the chain below wins; policies 1 and 1b also override
// HWY_STATIC_TARGET so that it stays a member of HWY_TARGETS.
#if defined(HWY_COMPILE_ONLY_EMU128) && !HWY_BROKEN_EMU128
#undef HWY_STATIC_TARGET
#define HWY_STATIC_TARGET HWY_EMU128 // override baseline
#define HWY_TARGETS HWY_EMU128
// 1b) HWY_SCALAR is less capable than HWY_EMU128 (which supports all ops), but
// we currently still support it for backwards compatibility.
// Also taken when EMU128 was requested but is known to be broken.
#elif defined(HWY_COMPILE_ONLY_SCALAR) || \
(defined(HWY_COMPILE_ONLY_EMU128) && HWY_BROKEN_EMU128)
#undef HWY_STATIC_TARGET
#define HWY_STATIC_TARGET HWY_SCALAR // override baseline
#define HWY_TARGETS HWY_SCALAR
// 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
#elif defined(HWY_COMPILE_ONLY_STATIC)
#define HWY_TARGETS HWY_STATIC_TARGET
// 3) For tests: include all attainable targets (in particular: scalar)
#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
#define HWY_TARGETS HWY_ATTAINABLE_TARGETS
// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
// excluding superseded targets, in particular scalar. Note: HWY_STATIC_TARGET
// may be 2^62 (HWY_SCALAR), so we must not left-shift/add it. Subtracting one
// sets all lower bits (better targets), then we also include the static target.
#else
#define HWY_TARGETS \
(HWY_ATTAINABLE_TARGETS & ((HWY_STATIC_TARGET - 1LL) | HWY_STATIC_TARGET))
#endif // target policy
// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
// one of the dynamic targets. This also implies HWY_TARGETS != 0 and
// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
// This fires if the static target is missing from HWY_ATTAINABLE_TARGETS, e.g.
// when a baseline target was not added to the attainable set.
#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
#error "Logic error: best baseline should be included in dynamic targets"
#endif
#endif // HIGHWAY_HWY_DETECT_TARGETS_H_
+254
View File
@@ -0,0 +1,254 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <memory>
#include <numeric> // iota
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/aligned_allocator.h"
#include "hwy/highway.h"
#include "hwy/nanobenchmark.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// These templates are not found via ADL.
#if HWY_TARGET != HWY_SCALAR
using hwy::HWY_NAMESPACE::CombineShiftRightLanes;
#endif
// Base class for the benchmarks: owns one aligned allocation holding two float
// arrays of NumItems() elements each. a_ owns the memory and points to the
// first array; b_ points to the second half of the same allocation.
class TwoArray {
 public:
  // Must be a multiple of the vector lane count * 8.
  static size_t NumItems() { return 3456; }

  // Fills both arrays with consecutive values starting at Unpredictable1().
  TwoArray()
      : a_(AllocateAligned<float>(2 * NumItems())),
        b_(a_.get() + NumItems()) {
    // Evaluates to 1, but the compiler cannot prove it, so the contents are not
    // compile-time constants.
    const float first_value = static_cast<float>(Unpredictable1());
    float* const first_array = a_.get();
    const size_t count = NumItems();
    std::iota(first_array, first_array + count, first_value);
    std::iota(b_, b_ + count, first_value);
  }

 protected:
  AlignedFreeUniquePtr<float[]> a_;  // owns both arrays
  float* b_;                         // non-owning, a_.get() + NumItems()
};
// Measures durations, verifies results, prints timings.
template <class Benchmark>
void RunBenchmark(const char* caption) {
printf("%10s: ", caption);
const size_t kNumInputs = 1;
const size_t num_items = Benchmark::NumItems() * size_t(Unpredictable1());
const FuncInput inputs[kNumInputs] = {num_items};
Result results[kNumInputs];
Benchmark benchmark;
Params p;
p.verbose = false;
p.max_evals = 7;
p.target_rel_mad = 0.002;
const size_t num_results = MeasureClosure(
[&benchmark](const FuncInput input) { return benchmark(input); }, inputs,
kNumInputs, results, p);
if (num_results != kNumInputs) {
fprintf(stderr, "MeasureClosure failed.\n");
}
benchmark.Verify(num_items);
for (size_t i = 0; i < num_results; ++i) {
const double cycles_per_item =
results[i].ticks / static_cast<double>(results[i].input);
const double mad = results[i].variability * cycles_per_item;
printf("%6" PRIu64 ": %6.3f (+/- %5.3f)\n",
static_cast<uint64_t>(results[i].input), cycles_per_item, mad);
}
}
// Minimal usage demo: computes F(x) = 2 * x^2 over a 16-element array, one
// vector at a time, and prints the result for in[2].
// NOTE(review): each step loads/stores Lanes(d) floats, so this appears to
// assume Lanes(d) divides 16 - confirm for targets with very wide vectors.
void Intro() {
  const float in[16] = {1, 2, 3, 4, 5, 6};  // remaining elements are zero
  float out[16];
  const ScalableTag<float> d;  // largest possible vector

  size_t pos = 0;
  while (pos < 16) {
    const auto x = LoadU(d, in + pos);  // no alignment requirement
    const auto squared = Mul(x, x);
    StoreU(Add(squared, squared), d, out + pos);
    pos += Lanes(d);
  }

  printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in[2], out[2]);
}
// BEGINNER: dot product
// 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold!
//
// Benchmark functor: operator() computes the dot product of the first
// num_items elements of the two TwoArray buffers and remembers it; Verify
// compares that value against std::inner_product.
class BenchmarkDot : public TwoArray {
 public:
  // -1.0f is a sentinel meaning "operator() has not run yet" (see Verify).
  BenchmarkDot() : dot_{-1.0f} {}

  // Returns the dot product converted to FuncOutput. Each iteration consumes
  // 4 * Lanes(d) items, so num_items is expected to be a multiple of that
  // (TwoArray::NumItems is documented as a multiple of lane count * 8).
  FuncOutput operator()(const size_t num_items) {
    const ScalableTag<float> d;
    const size_t N = Lanes(d);
    using V = decltype(Zero(d));
    // Compiler doesn't make independent sum* accumulators, so unroll manually.
    // We cannot use an array because V might be a sizeless type. For reasonable
    // code, we unroll 4x, but 8x might help (2 FMA ports * 4 cycle latency).
    V sum0 = Zero(d);
    V sum1 = Zero(d);
    V sum2 = Zero(d);
    V sum3 = Zero(d);
    const float* const HWY_RESTRICT pa = &a_[0];
    const float* const HWY_RESTRICT pb = b_;
    for (size_t i = 0; i < num_items; i += 4 * N) {
      const auto a0 = Load(d, pa + i + 0 * N);
      const auto b0 = Load(d, pb + i + 0 * N);
      sum0 = MulAdd(a0, b0, sum0);
      const auto a1 = Load(d, pa + i + 1 * N);
      const auto b1 = Load(d, pb + i + 1 * N);
      sum1 = MulAdd(a1, b1, sum1);
      const auto a2 = Load(d, pa + i + 2 * N);
      const auto b2 = Load(d, pb + i + 2 * N);
      sum2 = MulAdd(a2, b2, sum2);
      const auto a3 = Load(d, pa + i + 3 * N);
      const auto b3 = Load(d, pb + i + 3 * N);
      sum3 = MulAdd(a3, b3, sum3);
    }
    // Reduction tree: sum of all accumulators by pairs into sum0.
    sum0 = Add(sum0, sum1);
    sum2 = Add(sum2, sum3);
    sum0 = Add(sum0, sum2);
    dot_ = GetLane(SumOfLanes(d, sum0));
    return static_cast<FuncOutput>(dot_);
  }

  // Aborts if operator() has not run yet, or if its result differs from the
  // scalar reference by more than a relative error of 1.1E-6.
  void Verify(size_t num_items) {
    if (dot_ == -1.0f) {
      // Terminated with a newline like the other diagnostics in this file, so
      // it does not run into whatever is printed next.
      fprintf(stderr, "Dot: must call Verify after benchmark\n");
      abort();
    }

    const float expected =
        std::inner_product(a_.get(), a_.get() + num_items, b_, 0.0f);
    const float rel_err = std::abs(expected - dot_) / expected;
    if (rel_err > 1.1E-6f) {
      fprintf(stderr, "Dot: expected %e actual %e (%e)\n", expected, dot_,
              rel_err);
      abort();
    }
  }

 private:
  float dot_;  // for Verify
};
// INTERMEDIATE: delta coding
// 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold!
//
// Benchmark functor: writes b_[0] = a_[0] and b_[i] = a_[i] - a_[i - 1] for
// the first num_items elements, using one of three target-specific codepaths.
struct BenchmarkDelta : public TwoArray {
  // Returns the last delta converted to FuncOutput. const is fine here because
  // only the memory b_ points to is modified, not the members themselves.
  FuncOutput operator()(const size_t num_items) const {
#if HWY_TARGET == HWY_SCALAR
    b_[0] = a_[0];
    for (size_t i = 1; i < num_items; ++i) {
      b_[i] = a_[i] - a_[i - 1];
    }
#elif HWY_CAP_GE256
    // Larger vectors are split into 128-bit blocks, easiest to use the
    // unaligned load support to shift between them.
    const ScalableTag<float> df;
    const size_t N = Lanes(df);
    size_t i;
    // The first vector is handled with scalar code so that the vector loop
    // below can always read a_[i - 1].
    b_[0] = a_[0];
    for (i = 1; i < N; ++i) {
      b_[i] = a_[i] - a_[i - 1];
    }
    for (; i < num_items; i += N) {
      const auto a = Load(df, &a_[i]);
      const auto shifted = LoadU(df, &a_[i - 1]);
      // Named op rather than operator-, consistent with the 128-bit path below.
      Store(Sub(a, shifted), df, &b_[i]);
    }
#else  // 128-bit
    // Slightly better than unaligned loads
    const HWY_CAPPED(float, 4) df;
    const size_t N = Lanes(df);
    size_t i;
    b_[0] = a_[0];
    for (i = 1; i < N; ++i) {
      b_[i] = a_[i] - a_[i - 1];
    }
    // `prev` carries the previous vector so that its last lane can be shifted
    // in as the predecessor of the current vector's first lane.
    auto prev = Load(df, &a_[0]);
    for (; i < num_items; i += Lanes(df)) {
      const auto a = Load(df, &a_[i]);
      const auto shifted = CombineShiftRightLanes<3>(df, a, prev);
      prev = a;
      Store(Sub(a, shifted), df, &b_[i]);
    }
#endif
    return static_cast<FuncOutput>(b_[num_items - 1]);
  }

  // Recomputes every delta with scalar code and reports (but does not abort
  // on) any element whose absolute error exceeds 1E-6.
  void Verify(size_t num_items) {
    for (size_t i = 0; i < num_items; ++i) {
      const float expected = (i == 0) ? a_[0] : a_[i] - a_[i - 1];
      const float err = std::abs(expected - b_[i]);
      if (err > 1E-6f) {
        fprintf(stderr, "Delta: expected %e, actual %e\n", expected, b_[i]);
      }
    }
  }
};
void RunBenchmarks() {
Intro();
printf("------------------------ %s\n", TargetName(HWY_TARGET));
RunBenchmark<BenchmarkDot>("dot");
RunBenchmark<BenchmarkDelta>("delta");
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_EXPORT(RunBenchmarks);
// Runs RunBenchmarks once per target returned by
// SupportedAndGeneratedTargets(), restricting the supported-target mask to
// that single target before each dispatch.
void Run() {
  // The list is obtained once, before the mask is modified inside the loop.
  const auto& targets = SupportedAndGeneratedTargets();
  for (const int64_t target : targets) {
    SetSupportedTargetsForTest(target);
    HWY_DYNAMIC_DISPATCH(RunBenchmarks)();
  }
  SetSupportedTargetsForTest(0);  // Reset the mask afterwards.
}
} // namespace hwy
// Program entry point; command-line arguments are ignored.
int main(int, char**) {
  hwy::Run();
  return 0;
}
#endif // HWY_ONCE
@@ -0,0 +1,66 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Demo of functions that might be called from multiple SIMD modules (either
// other -inl.h files, or a .cc file between begin/end_target-inl). This is
// optional - all SIMD code can reside in .cc files. However, this allows
// splitting code into different files while still inlining instead of requiring
// calling through function pointers.
// Per-target include guard. This is only required when using dynamic dispatch,
// i.e. including foreach_target.h. For static dispatch, a normal include
// guard would be fine because the header is only compiled once.
#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#else
#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#endif
// It is fine to #include normal or *-inl headers.
#include <stddef.h>
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace skeleton {
namespace HWY_NAMESPACE {
// Highway ops reside here; ADL does not find templates nor builtins.
namespace hn = hwy::HWY_NAMESPACE;
// Example of a type-agnostic (caller-specified lane type) and width-agnostic
// (uses best available instruction set) function in a header.
//
// Computes x_array[i] = mul_array[i] * x_array[i] + add_array[i] for i < size,
// updating x_array in place.
// NOTE(review): every iteration loads and stores a whole vector via
// Load/Store, so callers presumably must pass a size that is a multiple of
// Lanes(d) and suitably aligned arrays - confirm against the callers.
template <class D, typename T>
HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array,
                                 const T* HWY_RESTRICT add_array,
                                 const size_t size, T* HWY_RESTRICT x_array) {
  size_t offset = 0;
  while (offset < size) {
    const auto mul = hn::Load(d, mul_array + offset);
    const auto add = hn::Load(d, add_array + offset);
    const auto x = hn::Load(d, x_array + offset);
    hn::Store(hn::MulAdd(mul, x, add), d, x_array + offset);
    offset += hn::Lanes(d);
  }
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace skeleton
HWY_AFTER_NAMESPACE();
#endif // include guard
+121
View File
@@ -0,0 +1,121 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/examples/skeleton.h"
#include <stdio.h>
// >>>> for dynamic dispatch only, skip if you want static dispatch
// First undef to prevent error when re-included.
#undef HWY_TARGET_INCLUDE
// For dynamic dispatch, specify the name of the current file (unfortunately
// __FILE__ is not reliable) so that foreach_target.h can re-include it.
#define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc"
// Generates code for each enabled target by re-including this source file.
#include "hwy/foreach_target.h" // IWYU pragma: keep
// <<<< end of dynamic dispatch
// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"
// Optional, can instead add HWY_ATTR to all functions.
HWY_BEFORE_NAMESPACE();
namespace skeleton {
// This namespace name is unique per target, which allows code for multiple
// targets to co-exist in the same translation unit. Required when using dynamic
// dispatch, otherwise optional.
namespace HWY_NAMESPACE {
// Highway ops reside here; ADL does not find templates nor builtins.
namespace hn = hwy::HWY_NAMESPACE;
// Computes log2 of one vector of bytes by converting to float and extracting
// the exponent field. Compiled once per target.
//
// df selects the float vector type; as many bytes as df has lanes are read
// from `values` and the same number of results are written to `log2`.
template <class DF>
HWY_ATTR_NO_MSAN void OneFloorLog2(const DF df,
                                   const uint8_t* HWY_RESTRICT values,
                                   uint8_t* HWY_RESTRICT log2) {
  // Type tags for converting to other element types (Rebind = same count).
  const hn::RebindToSigned<DF> d32;
  const hn::Rebind<uint8_t, DF> d8;

  // u8 -> i32 -> float, then reinterpret the float bits as i32.
  const auto bytes = hn::Load(d8, values);
  const auto widened = hn::PromoteTo(d32, bytes);
  const auto as_float = hn::ConvertTo(df, widened);
  const auto bits = hn::BitCast(d32, as_float);

  // Shifting out the 23 low bits leaves the biased exponent; subtracting 127
  // removes the bias (presumably df is a 32-bit float tag - see the callers).
  const auto biased_exponent = hn::ShiftRight<23>(bits);
  const auto exponent = hn::Sub(biased_exponent, hn::Set(d32, 127));

  hn::Store(hn::DemoteTo(d8, exponent), d8, log2);
}
// Prints the name of the current target together with whether it reports
// 64-bit integer support.
void CodepathDemo() {
  // Highway defaults to portability, but per-target codepaths may be selected
  // via #if HWY_TARGET == HWY_SSE4 or by testing capability macros:
#if HWY_HAVE_INTEGER64
  const char* const int64_support = "Has int64";
#else
  const char* const int64_support = "No int64";
#endif
  const char* const target_name = hwy::TargetName(HWY_TARGET);
  printf("Target %s: %s\n", target_name, int64_support);
}
// Per-target implementation: writes the log2 of each of the `count` bytes in
// `values` to `log2`. Also prints the current codepath via CodepathDemo().
void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
               uint8_t* HWY_RESTRICT log2) {
  CodepathDemo();

  const hn::ScalableTag<float> df;
  const size_t N = hn::Lanes(df);

  // Whole vectors first.
  size_t done = 0;
  while (done + N <= count) {
    OneFloorLog2(df, values + done, log2 + done);
    done += N;
  }

  // Remaining elements one at a time, using a single-lane tag.
  const hn::CappedTag<float, 1> d1;
  while (done < count) {
    OneFloorLog2(d1, values + done, log2 + done);
    ++done;
  }
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace skeleton
HWY_AFTER_NAMESPACE();
// The table of pointers to the various implementations in HWY_NAMESPACE must
// be compiled only once (foreach_target #includes this file multiple times).
// HWY_ONCE is true for only one of these 'compilation passes'.
#if HWY_ONCE
namespace skeleton {
// This macro declares a static array used for dynamic dispatch; it resides in
// the same outer namespace that contains FloorLog2.
HWY_EXPORT(FloorLog2);
// Public entry point declared in the header: forwards to the FloorLog2
// implementation chosen by dynamic dispatch. It is only needed because the
// function is exposed in the header; within this module,
// HWY_DYNAMIC_DISPATCH(FloorLog2) could be used directly instead.
HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
                                 const size_t count,
                                 uint8_t* HWY_RESTRICT out) {
  // This must reside outside of HWY_NAMESPACE because it references (calls the
  // appropriate one from) the per-target implementations there.
  // For static dispatch, use HWY_STATIC_DISPATCH.
  HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
}
// Optional: anything to compile only once, e.g. non-SIMD implementations of
// public functions provided by this module, can go inside #if HWY_ONCE.
} // namespace skeleton
#endif // HWY_ONCE
+36
View File
@@ -0,0 +1,36 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Demo interface to target-specific code in skeleton.cc
// Normal header with include guard and namespace.
#ifndef HIGHWAY_HWY_EXAMPLES_SKELETON_H_
#define HIGHWAY_HWY_EXAMPLES_SKELETON_H_
#include <stddef.h>
// Platform-specific definitions used for declaring an interface, independent of
// the SIMD instruction set.
#include "hwy/base.h" // HWY_RESTRICT
namespace skeleton {
// Computes base-2 logarithm by converting to float. Supports dynamic dispatch.
// `in` holds `count` input bytes; one result byte per input is written to `out`.
// Both pointers are HWY_RESTRICT, so the buffers presumably must not overlap.
HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
const size_t count, uint8_t* HWY_RESTRICT out);
} // namespace skeleton
#endif // HIGHWAY_HWY_EXAMPLES_SKELETON_H_
@@ -0,0 +1,110 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Example of unit test for the "skeleton" library.
#include "hwy/examples/skeleton.h"
#include <stdio.h>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
// Optional: factor out parts of the implementation into *-inl.h
// (must also come after foreach_target.h to avoid redefinition errors)
#include "hwy/examples/skeleton-inl.h"
HWY_BEFORE_NAMESPACE();
namespace skeleton {
namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE;
// Calls function defined in skeleton.cc.
//
// Builds `count` random exponents in [0, 7], feeds the corresponding powers of
// two to CallFloorLog2 and checks that every output byte equals its exponent.
struct TestFloorLog2 {
  template <class T, class DF>
  HWY_NOINLINE void operator()(T /*unused*/, DF df) {
    // Five vectors' worth of lanes for the given tag.
    const size_t count = 5 * hn::Lanes(df);
    auto in = hwy::AllocateAligned<uint8_t>(count);
    auto expected = hwy::AllocateAligned<uint8_t>(count);

    hwy::RandomState rng;
    for (size_t i = 0; i < count; ++i) {
      // `& 7` produces at least an int, so narrow explicitly (as the next
      // line already does) instead of relying on an implicit conversion.
      expected[i] = static_cast<uint8_t>(Random32(&rng) & 7);
      in[i] = static_cast<uint8_t>(1u << expected[i]);
    }

    auto out = hwy::AllocateAligned<uint8_t>(count);
    CallFloorLog2(in.get(), count, out.get());

    int sum = 0;
    for (size_t i = 0; i < count; ++i) {
      HWY_ASSERT_EQ(expected[i], out[i]);
      sum += out[i];
    }
    // Hand the accumulated value to PreventElision so the loop above has an
    // observable use.
    hwy::PreventElision(sum);
  }
};
// Invokes the ForPartialVectors adapter wrapping TestFloorLog2 with a float
// tag value.
HWY_NOINLINE void TestAllFloorLog2() {
  hn::ForPartialVectors<TestFloorLog2> floor_log2_adapter;
  floor_log2_adapter(float());
}
// Calls function defined in skeleton-inl.h.
//
// Fills three arrays with small random integers, accumulates
// mul[i] * x[i] + add[i] in scalar code, runs MulAddLoop on the same arrays
// and compares the scalar sum with a fixed reference value.
struct TestSumMulAdd {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    hwy::RandomState rng;
    const size_t count = 4096;
    // The remainder is a size_t; compare against an unsigned zero so the
    // comparison inside EXPECT_EQ does not mix signed and unsigned operands.
    EXPECT_EQ(size_t{0}, count % hn::Lanes(d));

    auto mul = hwy::AllocateAligned<T>(count);
    auto x = hwy::AllocateAligned<T>(count);
    auto add = hwy::AllocateAligned<T>(count);
    for (size_t i = 0; i < count; ++i) {
      mul[i] = static_cast<T>(Random32(&rng) & 0xF);
      x[i] = static_cast<T>(Random32(&rng) & 0xFF);
      add[i] = static_cast<T>(Random32(&rng) & 0xFF);
    }

    // Scalar reference, computed before MulAddLoop is given x.
    double expected_sum = 0.0;
    for (size_t i = 0; i < count; ++i) {
      expected_sum += mul[i] * x[i] + add[i];
    }

    MulAddLoop(d, mul.get(), add.get(), count, x.get());
    // NOTE(review): only the scalar sum of the inputs is checked; the contents
    // of x after MulAddLoop are never inspected here - confirm whether that is
    // intentional for this example.
    HWY_ASSERT_EQ(4344240.0, expected_sum);
  }
};
// Hands the ForPartialVectors adapter for TestSumMulAdd to ForFloatTypes.
HWY_NOINLINE void TestAllSumMulAdd() {
  auto sum_mul_add_adapter = hn::ForPartialVectors<TestSumMulAdd>();
  hn::ForFloatTypes(sum_mul_add_adapter);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace skeleton
HWY_AFTER_NAMESPACE();
// Test registration. Guarded by HWY_ONCE: foreach_target.h re-includes this
// file for each extra target with HWY_ONCE evaluating to 0 and only sets it to
// 1 once those re-includes are done, so the registrations below are emitted a
// single time per translation unit.
#if HWY_ONCE
namespace skeleton {
HWY_BEFORE_TEST(SkeletonTest);
// One registration per TestAll* function defined in the per-target namespace
// above.
HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2);
HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd);
} // namespace skeleton
#endif
+261
View File
@@ -0,0 +1,261 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_FOREACH_TARGET_H_
#define HIGHWAY_HWY_FOREACH_TARGET_H_
// Re-includes the translation unit zero or more times to compile for any
// targets except HWY_STATIC_TARGET. Defines unique HWY_TARGET each time so that
// highway.h defines the corresponding macro/namespace.
#include "hwy/detect_targets.h"
// *_inl.h may include other headers, which requires include guards to prevent
// repeated inclusion. The guards must be reset after compiling each target, so
// the header is again visible. This is done by flipping HWY_TARGET_TOGGLE,
// defining it if undefined and vice versa. This macro is initially undefined
// so that IDEs don't gray out the contents of each header.
// HWY_TARGET_TOGGLE is owned by this header; reject any definition that exists
// before the first target has been generated.
#ifdef HWY_TARGET_TOGGLE
#error "This macro must not be defined outside foreach_target.h"
#endif
// If highway.h was already included before this header, remember that so the
// block guarded by HWY_ALREADY_INCLUDED at the bottom can undo the extra
// toggle.
#ifdef HWY_HIGHWAY_INCLUDED // highway.h include guard
// Trigger fixup at the bottom of this header.
#define HWY_ALREADY_INCLUDED
// The next highway.h must re-include set_macros-inl.h because the first
// highway.h chose the static target instead of what we will set below.
#undef HWY_SET_MACROS_PER_TARGET
#endif
// Disable HWY_EXPORT in user code until we have generated all targets. Note
// that a subsequent highway.h will not override this definition.
// (highway.h only defines HWY_ONCE under #ifndef.) The `|| HWY_IDE` term keeps
// HWY_ONCE nonzero when HWY_IDE is set.
#undef HWY_ONCE
#define HWY_ONCE (0 || HWY_IDE)
// Avoid warnings on #include HWY_TARGET_INCLUDE by hiding them from the IDE;
// also skip if only 1 target defined (no re-inclusion will be necessary).
//
// Every section below follows the same pattern for one target X:
//   1. Run only if X is enabled in HWY_TARGETS and is not the static target.
//   2. Redefine HWY_TARGET to X and re-include the user's translation unit
//      (HWY_TARGET_INCLUDE).
//   3. Flip HWY_TARGET_TOGGLE (define it if undefined, undefine it otherwise)
//      so that toggle-based include guards let per-target headers be seen
//      again on the next re-include.
#if !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)
#if !defined(HWY_TARGET_INCLUDE)
#error ">1 target enabled => define HWY_TARGET_INCLUDE before foreach_target.h"
#endif
// ---- HWY_EMU128 ----
#if (HWY_TARGETS & HWY_EMU128) && (HWY_STATIC_TARGET != HWY_EMU128)
#undef HWY_TARGET
#define HWY_TARGET HWY_EMU128
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ---- HWY_SCALAR ----
#if (HWY_TARGETS & HWY_SCALAR) && (HWY_STATIC_TARGET != HWY_SCALAR)
#undef HWY_TARGET
#define HWY_TARGET HWY_SCALAR
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ---- HWY_NEON ----
#if (HWY_TARGETS & HWY_NEON) && (HWY_STATIC_TARGET != HWY_NEON)
#undef HWY_TARGET
#define HWY_TARGET HWY_NEON
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ---- HWY_RVV ----
#if (HWY_TARGETS & HWY_RVV) && (HWY_STATIC_TARGET != HWY_RVV)
#undef HWY_TARGET
#define HWY_TARGET HWY_RVV
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ---- HWY_SVE ----
#if (HWY_TARGETS & HWY_SVE) && (HWY_STATIC_TARGET != HWY_SVE)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ---- HWY_SVE2 ----
#if (HWY_TARGETS & HWY_SVE2) && (HWY_STATIC_TARGET != HWY_SVE2)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE2
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ---- HWY_SVE_256 ----
#if (HWY_TARGETS & HWY_SVE_256) && (HWY_STATIC_TARGET != HWY_SVE_256)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE_256
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ---- HWY_SVE2_128 ----
#if (HWY_TARGETS & HWY_SVE2_128) && (HWY_STATIC_TARGET != HWY_SVE2_128)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE2_128
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ---- HWY_SSSE3 ----
#if (HWY_TARGETS & HWY_SSSE3) && (HWY_STATIC_TARGET != HWY_SSSE3)
#undef HWY_TARGET
#define HWY_TARGET HWY_SSSE3
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ---- HWY_SSE4 ----
#if (HWY_TARGETS & HWY_SSE4) && (HWY_STATIC_TARGET != HWY_SSE4)
#undef HWY_TARGET
#define HWY_TARGET HWY_SSE4
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ---- HWY_AVX2 ----
#if (HWY_TARGETS & HWY_AVX2) && (HWY_STATIC_TARGET != HWY_AVX2)
#undef HWY_TARGET
#define HWY_TARGET HWY_AVX2
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ---- HWY_AVX3 ----
#if (HWY_TARGETS & HWY_AVX3) && (HWY_STATIC_TARGET != HWY_AVX3)
#undef HWY_TARGET
#define HWY_TARGET HWY_AVX3
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ---- HWY_AVX3_DL ----
#if (HWY_TARGETS & HWY_AVX3_DL) && (HWY_STATIC_TARGET != HWY_AVX3_DL)
#undef HWY_TARGET
#define HWY_TARGET HWY_AVX3_DL
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ---- HWY_WASM_EMU256 ----
#if (HWY_TARGETS & HWY_WASM_EMU256) && (HWY_STATIC_TARGET != HWY_WASM_EMU256)
#undef HWY_TARGET
#define HWY_TARGET HWY_WASM_EMU256
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ---- HWY_WASM ----
#if (HWY_TARGETS & HWY_WASM) && (HWY_STATIC_TARGET != HWY_WASM)
#undef HWY_TARGET
#define HWY_TARGET HWY_WASM
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ---- HWY_PPC8 ----
#if (HWY_TARGETS & HWY_PPC8) && (HWY_STATIC_TARGET != HWY_PPC8)
#undef HWY_TARGET
#define HWY_TARGET HWY_PPC8
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#endif // !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)
// Now that all but the static target have been generated, re-enable HWY_EXPORT.
#undef HWY_ONCE
#define HWY_ONCE 1
// If we re-include once per enabled target, the translation unit's
// implementation would have to be skipped via #if to avoid redefining symbols.
// We instead skip the re-include for HWY_STATIC_TARGET, and generate its
// implementation when resuming compilation of the translation unit.
#undef HWY_TARGET
#define HWY_TARGET HWY_STATIC_TARGET
// Only taken when highway.h had been included before this header (the preamble
// at the top of this file defines HWY_ALREADY_INCLUDED in that case).
#ifdef HWY_ALREADY_INCLUDED
// Revert the previous toggle to prevent redefinitions for the static target.
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
// Force re-inclusion of set_macros-inl.h now that HWY_TARGET is restored.
#ifdef HWY_SET_MACROS_PER_TARGET
#undef HWY_SET_MACROS_PER_TARGET
#else
#define HWY_SET_MACROS_PER_TARGET
#endif
#endif
#endif // HIGHWAY_HWY_FOREACH_TARGET_H_
+378
View File
@@ -0,0 +1,378 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This include guard is checked by foreach_target, so avoid the usual _H_
// suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included
// after/outside this include guard.
#ifndef HWY_HIGHWAY_INCLUDED
#define HWY_HIGHWAY_INCLUDED
// Main header required before using vector types.
#include "hwy/base.h"
#include "hwy/targets.h"
namespace hwy {
// API version (https://semver.org/); keep in sync with CMakeLists.txt.
#define HWY_MAJOR 1
#define HWY_MINOR 0
#define HWY_PATCH 2
//------------------------------------------------------------------------------
// Shorthand for tags (defined in shared-inl.h) used to select overloads.
// Note that ScalableTag<T> is preferred over HWY_FULL, and CappedTag<T, N> over
// HWY_CAPPED(T, N).
// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
// registers in the group, and is ignored on targets that do not support groups.
//
// HWY_FULL is overloaded on its argument count:
//   HWY_FULL(T)       -> HWY_FULL1(T)
//   HWY_FULL(T, LMUL) -> HWY_FULL2(T, LMUL)
// One-argument form.
#define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
// Two-argument form: the second template argument is CeilLog2 of LMUL, with
// LMUL first clamped to be at least 0 via HWY_MAX.
#define HWY_FULL2(T, LMUL) \
hwy::HWY_NAMESPACE::ScalableTag<T, hwy::CeilLog2(HWY_MAX(0, LMUL))>
// Selects its third argument. HWY_CHOOSE_FULL places the caller's arguments in
// front of the list (HWY_FULL2, HWY_FULL1), so one caller argument shifts
// HWY_FULL1 into third position and two caller arguments shift HWY_FULL2 there.
#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
// Workaround for MSVC grouping __VA_ARGS__ into a single argument
#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
// Trailing comma avoids -pedantic false alarm
#define HWY_CHOOSE_FULL(...) \
HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
// Entry point: picks HWY_FULL1 or HWY_FULL2 by argument count, then applies
// the selected macro to the original arguments.
#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)
// Vector of up to MAX_N lanes. It's better to use full vectors where possible.
#define HWY_CAPPED(T, MAX_N) hwy::HWY_NAMESPACE::CappedTag<T, MAX_N>
//------------------------------------------------------------------------------
// Export user functions for static/dynamic dispatch
// Evaluates to 0 inside a translation unit if it is generating anything but the
// static target (the last one if multiple targets are enabled). Used to prevent
// redefinitions of HWY_EXPORT. Unless foreach_target.h is included, we only
// compile once anyway, so this is 1 unless it is or has been included.
#ifndef HWY_ONCE
#define HWY_ONCE 1
#endif
// HWY_STATIC_DISPATCH(FUNC_NAME) is the namespace-qualified FUNC_NAME for
// HWY_STATIC_TARGET (the only defined namespace unless HWY_TARGET_INCLUDE is
// defined), and can be used to deduce the return type of Choose*.
//
// Each branch maps one value of HWY_STATIC_TARGET to the matching N_<TARGET>
// namespace. NOTE(review): the chain has no #else, so HWY_STATIC_DISPATCH stays
// undefined when HWY_STATIC_TARGET matches none of the targets listed here.
#if HWY_STATIC_TARGET == HWY_SCALAR
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_EMU128
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_EMU128::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_RVV
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM_EMU256
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_NEON
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE_256
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE_256::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE2_128
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2_128::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC8
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSSE3
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSSE3::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSE4
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3_DL
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME
#endif
// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
// nullptr if that target was not compiled.
// The fallback entry prefers EMU128, then SCALAR, then the static target, so
// it never expands to nullptr.
#if HWY_TARGETS & HWY_EMU128
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_EMU128::FUNC_NAME
#elif HWY_TARGETS & HWY_SCALAR
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_SCALAR::FUNC_NAME
#else
// When HWY_SCALAR/HWY_EMU128 are not present and other targets were disabled at
// runtime, fall back to the baseline with HWY_STATIC_DISPATCH().
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
#endif
// WebAssembly targets.
#if HWY_TARGETS & HWY_WASM_EMU256
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME
#else
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_WASM
#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
#else
#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr
#endif
// RVV target.
#if HWY_TARGETS & HWY_RVV
#define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME
#else
#define HWY_CHOOSE_RVV(FUNC_NAME) nullptr
#endif
// NEON and SVE-family targets.
#if HWY_TARGETS & HWY_NEON
#define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME
#else
#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_SVE
#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME
#else
#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_SVE2
#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME
#else
#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_SVE_256
#define HWY_CHOOSE_SVE_256(FUNC_NAME) &N_SVE_256::FUNC_NAME
#else
#define HWY_CHOOSE_SVE_256(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_SVE2_128
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) &N_SVE2_128::FUNC_NAME
#else
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr
#endif
// HWY_CHOOSE_PPC8(FUNC_NAME): pointer to the N_PPC8 implementation when the
// PPC8 target is compiled, otherwise nullptr. Both branches must define the
// same macro name; the enabled branch previously defined HWY_CHOOSE_PCC8,
// which left HWY_CHOOSE_PPC8 undefined exactly when PPC8 was enabled.
#if HWY_TARGETS & HWY_PPC8
#define HWY_CHOOSE_PPC8(FUNC_NAME) &N_PPC8::FUNC_NAME
#else
#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
#endif
// SSSE3 through AVX3_DL targets: each HWY_CHOOSE_<TARGET>(FUNC_NAME) is a
// pointer to N_<TARGET>::FUNC_NAME when that target's bit is set in
// HWY_TARGETS, else nullptr.
#if HWY_TARGETS & HWY_SSSE3
#define HWY_CHOOSE_SSSE3(FUNC_NAME) &N_SSSE3::FUNC_NAME
#else
#define HWY_CHOOSE_SSSE3(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_SSE4
#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
#else
#define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_AVX2
#define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME
#else
#define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_AVX3
#define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_AVX3_DL
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) &N_AVX3_DL::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
#endif
// MSVC 2017 workaround: the non-type template parameter to ChooseAndCall
// apparently cannot be an array. Use a function pointer instead, which has the
// disadvantage that we call the static (not best) target on the first call to
// any HWY_DYNAMIC_DISPATCH.
// HWY_DISPATCH_WORKAROUND is 1 only for MSVC with HWY_COMPILER_MSVC below 1915;
// it selects which ChooseAndCall overload and which HWY_EXPORT form are used
// further down in this header.
#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1915
#define HWY_DISPATCH_WORKAROUND 1
#else
#define HWY_DISPATCH_WORKAROUND 0
#endif
// Provides a static member function which is what is called during the first
// HWY_DYNAMIC_DISPATCH, where GetIndex is still zero, and instantiations of
// this function are the first entry in the tables created by HWY_EXPORT.
template <typename RetType, typename... Args>
struct FunctionCache {
public:
typedef RetType(FunctionType)(Args...);
#if HWY_DISPATCH_WORKAROUND
template <FunctionType* const func>
static RetType ChooseAndCall(Args... args) {
ChosenTarget& chosen_target = GetChosenTarget();
chosen_target.Update(SupportedTargets());
return (*func)(args...);
}
#else
// A template function that when instantiated has the same signature as the
// function being called. This function initializes the bit array of targets
// supported by the current CPU and then calls the appropriate entry within
// the HWY_EXPORT table. Subsequent calls via HWY_DYNAMIC_DISPATCH to any
// exported functions, even those defined by different translation units,
// will dispatch directly to the best available target.
template <FunctionType* const table[]>
static RetType ChooseAndCall(Args... args) {
ChosenTarget& chosen_target = GetChosenTarget();
chosen_target.Update(SupportedTargets());
return (table[chosen_target.GetIndex()])(args...);
}
#endif // HWY_DISPATCH_WORKAROUND
};
// Helper whose only purpose is template argument deduction: given a function
// pointer, its return type names the FunctionCache specialization for that
// signature. The pointer value itself is unused.
template <typename RetType, typename... Args>
FunctionCache<RetType, Args...> DeduceFunctionCache(RetType (*)(Args...)) {
  using Cache = FunctionCache<RetType, Args...>;
  return Cache();
}
#define HWY_DISPATCH_TABLE(FUNC_NAME) \
HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)
// HWY_EXPORT(FUNC_NAME); expands to a static array that is used by
// HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime. This
// static array must be defined at the same namespace level as the function
// it is exporting.
// After being exported, it can be called from other parts of the same source
// file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
// like in the following example:
//
// #include "hwy/highway.h"
// HWY_BEFORE_NAMESPACE();
// namespace skeleton {
// namespace HWY_NAMESPACE {
//
// void MyFunction(int a, char b, const char* c) { ... }
//
// // NOLINTNEXTLINE(google-readability-namespace-comments)
// } // namespace HWY_NAMESPACE
// } // namespace skeleton
// HWY_AFTER_NAMESPACE();
//
// namespace skeleton {
// HWY_EXPORT(MyFunction); // Defines the dispatch table in this scope.
//
// void MyFunction(int a, char b, const char* c) {
// return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c);
// }
// } // namespace skeleton
//
// (HWY_TARGETS & (HWY_TARGETS - 1)) == 0 holds when at most one bit is set in
// HWY_TARGETS, i.e. at most one target is compiled.
#if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
// Simplified version for IDE or the dynamic dispatch case with only one target.
// This case still uses a table, although of a single element, to provide the
// same compile error conditions as with the dynamic dispatch case when multiple
// targets are being compiled.
#define HWY_EXPORT(FUNC_NAME) \
HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}
// With a single target, dynamic dispatch is a direct call to the static target.
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)
#else
// Simplified version for MSVC 2017: function pointer instead of table.
// Table layout in both variants below: ChooseAndCall wrapper first, then the
// HWY_CHOOSE_TARGET_LIST entries, then the fallback entry.
#if HWY_DISPATCH_WORKAROUND
#define HWY_EXPORT(FUNC_NAME) \
static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
/* The first entry in the table initializes the global cache and \
* calls the function from HWY_STATIC_TARGET. */ \
&decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH( \
FUNC_NAME)))::ChooseAndCall<&HWY_STATIC_DISPATCH(FUNC_NAME)>, \
HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
HWY_CHOOSE_FALLBACK(FUNC_NAME), \
}
#else
// Dynamic dispatch case with one entry per dynamic target plus the fallback
// target and the initialization wrapper.
#define HWY_EXPORT(FUNC_NAME) \
static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
/* The first entry in the table initializes the global cache and \
* calls the appropriate function. */ \
&decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH( \
FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>, \
HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
HWY_CHOOSE_FALLBACK(FUNC_NAME), \
}
#endif // HWY_DISPATCH_WORKAROUND
// Indexes the table with the currently chosen target's index and dereferences
// the resulting function pointer so it can be called directly.
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
(*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]))
#endif // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
// DEPRECATED names; please use HWY_HAVE_* instead.
#define HWY_CAP_INTEGER64 HWY_HAVE_INTEGER64
#define HWY_CAP_FLOAT16 HWY_HAVE_FLOAT16
#define HWY_CAP_FLOAT64 HWY_HAVE_FLOAT64
} // namespace hwy
#endif // HWY_HIGHWAY_INCLUDED
//------------------------------------------------------------------------------
// NOTE: the following definitions and ops/*.h depend on HWY_TARGET, so we want
// to include them once per target, which is ensured by the toggle check.
// Because ops/*.h are included under it, they do not need their own guard.
//
// The section is entered only while HWY_HIGHWAY_PER_TARGET and
// HWY_TARGET_TOGGLE are both defined or both undefined. On entry
// HWY_HIGHWAY_PER_TARGET is flipped, so the section stays closed until
// foreach_target.h flips HWY_TARGET_TOGGLE for the next target.
#if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
#ifdef HWY_HIGHWAY_PER_TARGET
#undef HWY_HIGHWAY_PER_TARGET
#else
#define HWY_HIGHWAY_PER_TARGET
#endif
// These define ops inside namespace hwy::HWY_NAMESPACE.
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
#include "hwy/ops/x86_128-inl.h"
#elif HWY_TARGET == HWY_AVX2
#include "hwy/ops/x86_256-inl.h"
#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL
#include "hwy/ops/x86_512-inl.h"
#elif HWY_TARGET == HWY_PPC8
#error "PPC is not yet supported"
#elif HWY_TARGET == HWY_NEON
#include "hwy/ops/arm_neon-inl.h"
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || \
HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
#include "hwy/ops/arm_sve-inl.h"
#elif HWY_TARGET == HWY_WASM_EMU256
#include "hwy/ops/wasm_256-inl.h"
#elif HWY_TARGET == HWY_WASM
#include "hwy/ops/wasm_128-inl.h"
#elif HWY_TARGET == HWY_RVV
#include "hwy/ops/rvv-inl.h"
#elif HWY_TARGET == HWY_EMU128
#include "hwy/ops/emu128-inl.h"
#elif HWY_TARGET == HWY_SCALAR
#include "hwy/ops/scalar-inl.h"
#else
// An unknown target only emits a compiler message here rather than an error.
#pragma message("HWY_TARGET does not match any known target")
#endif // HWY_TARGET
// Included for every target, after the target-specific ops header.
#include "hwy/ops/generic_ops-inl.h"
#endif // HWY_HIGHWAY_PER_TARGET
+74
View File
@@ -0,0 +1,74 @@
// Pseudo-generated file to handle both cmake & bazel build system.
// Initial generation done using cmake code:
// include(GenerateExportHeader)
// generate_export_header(hwy EXPORT_MACRO_NAME HWY_DLLEXPORT EXPORT_FILE_NAME
// hwy/highway_export.h)
// code reformatted using clang-format --style=Google
#ifndef HWY_DLLEXPORT_H
#define HWY_DLLEXPORT_H
// Without HWY_SHARED_DEFINE all three export macros expand to nothing.
#if !defined(HWY_SHARED_DEFINE)
#define HWY_DLLEXPORT
#define HWY_CONTRIB_DLLEXPORT
#define HWY_TEST_DLLEXPORT
#else // !HWY_SHARED_DEFINE
// With HWY_SHARED_DEFINE each macro is defined unless the includer already
// defined it. On _WIN32 it is dllexport while the matching *_EXPORTS macro is
// defined and dllimport otherwise; elsewhere both cases use default symbol
// visibility.
// ---- HWY_DLLEXPORT (keyed on hwy_EXPORTS) ----
#ifndef HWY_DLLEXPORT
#if defined(hwy_EXPORTS)
/* We are building this library */
#ifdef _WIN32
#define HWY_DLLEXPORT __declspec(dllexport)
#else
#define HWY_DLLEXPORT __attribute__((visibility("default")))
#endif
#else // defined(hwy_EXPORTS)
/* We are using this library */
#ifdef _WIN32
#define HWY_DLLEXPORT __declspec(dllimport)
#else
#define HWY_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif // defined(hwy_EXPORTS)
#endif // HWY_DLLEXPORT
// ---- HWY_CONTRIB_DLLEXPORT (keyed on hwy_contrib_EXPORTS) ----
#ifndef HWY_CONTRIB_DLLEXPORT
#if defined(hwy_contrib_EXPORTS)
/* We are building this library */
#ifdef _WIN32
#define HWY_CONTRIB_DLLEXPORT __declspec(dllexport)
#else
#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
#endif
#else // defined(hwy_contrib_EXPORTS)
/* We are using this library */
#ifdef _WIN32
#define HWY_CONTRIB_DLLEXPORT __declspec(dllimport)
#else
#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif // defined(hwy_contrib_EXPORTS)
#endif // HWY_CONTRIB_DLLEXPORT
// ---- HWY_TEST_DLLEXPORT (keyed on hwy_test_EXPORTS) ----
#ifndef HWY_TEST_DLLEXPORT
#if defined(hwy_test_EXPORTS)
/* We are building this library */
#ifdef _WIN32
#define HWY_TEST_DLLEXPORT __declspec(dllexport)
#else
#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
#endif
#else // defined(hwy_test_EXPORTS)
/* We are using this library */
#ifdef _WIN32
#define HWY_TEST_DLLEXPORT __declspec(dllimport)
#else
#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif // defined(hwy_test_EXPORTS)
#endif // HWY_TEST_DLLEXPORT
#endif // !HWY_SHARED_DEFINE
#endif /* HWY_DLLEXPORT_H */
+485
View File
@@ -0,0 +1,485 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
#include <stdint.h>
#include <bitset>
#include "hwy/base.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "highway_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/nanobenchmark.h" // Unpredictable1
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Exercises CappedTag<T, kLimit>: two basic ops must compile and agree, and a
// Store through the tag must leave lanes at index kLimit and above untouched.
template <size_t kLimit, typename T>
HWY_NOINLINE void TestCappedLimit(T /* tag */) {
  CappedTag<T, kLimit> d;
  // Ensure two ops compile
  HWY_ASSERT_VEC_EQ(d, Zero(d), Set(d, T{0}));

  // Ensure we do not write more than kLimit lanes
  const size_t num_lanes = Lanes(d);
  if (!(kLimit < num_lanes)) return;

  auto buffer = AllocateAligned<T>(num_lanes);
  T* const begin = buffer.get();
  std::fill(begin, begin + num_lanes, T{0});
  Store(Set(d, T{1}), d, buffer.get());
  for (size_t idx = kLimit; idx < num_lanes; ++idx) {
    HWY_ASSERT_EQ(buffer[idx], T{0});
  }
}
// Functor passed to ForAllTypes. It builds its own capped tags via
// TestCappedLimit rather than going through ForPartialVectors and friends.
struct TestCapped {
  template <typename T>
  void operator()(T tag) const {
    // Small limits of 1, 3 and 5 lanes, then a very large one.
    constexpr size_t kHugeLimit = 1ull << 15;
    TestCappedLimit<1>(tag);
    TestCappedLimit<3>(tag);
    TestCappedLimit<5>(tag);
    TestCappedLimit<kHugeLimit>(tag);
  }
};
// Passes a TestCapped functor to ForAllTypes.
HWY_NOINLINE void TestAllCapped() {
  TestCapped capped_functor;
  ForAllTypes(capped_functor);
}
// For testing that ForPartialVectors reaches every possible size:
using NumLanesSet = std::bitset<HWY_MAX_BYTES + 1>;
// Function-local static storage (monostate): ForPartialVectors receives its
// test as a template argument rather than a functor by reference, so the test
// cannot carry state of its own. Returns the set that belongs to lane size
// `sizeof_t`, which must not exceed sizeof(uint64_t).
static NumLanesSet* NumLanesForSize(size_t sizeof_t) {
  HWY_ASSERT(sizeof_t <= sizeof(uint64_t));
  static NumLanesSet sets_by_size[sizeof(uint64_t) + 1];
  return &sets_by_size[sizeof_t];
}
// Returns a pointer to the zero-initialized static counter that belongs to
// lane size `sizeof_t`, which must not exceed sizeof(uint64_t).
static size_t* MaxLanesForSize(size_t sizeof_t) {
  HWY_ASSERT(sizeof_t <= sizeof(uint64_t));
  static size_t max_by_size[sizeof(uint64_t) + 1] = {0};
  return &max_by_size[sizeof_t];
}
// Checks Lanes(d) <= MaxLanes(d) <= HWY_MAX_BYTES / sizeof(T), then records the
// observed lane count in the per-size bookkeeping read by TestAllMaxLanes.
struct TestMaxLanes {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t actual_lanes = Lanes(d);
    const size_t upper_bound = MaxLanes(d);  // for RVV, includes LMUL
    HWY_ASSERT(actual_lanes <= upper_bound);
    HWY_ASSERT(upper_bound <= (HWY_MAX_BYTES / sizeof(T)));

    // Mark this lane count as visited for the current lane size.
    NumLanesForSize(sizeof(T))->set(actual_lanes);
    // Keep the running maximum lane count for the current lane size.
    size_t* const max_seen = MaxLanesForSize(sizeof(T));
    *max_seen = HWY_MAX(*max_seen, actual_lanes);
  }
};
// Runs TestMaxLanes for all types, then verifies that for each lane size every
// power-of-two lane count from 1 up to the recorded maximum was visited.
HWY_NOINLINE void TestAllMaxLanes() {
  ForAllTypes(ForPartialVectors<TestMaxLanes>());

  // Ensure ForPartialVectors visited all powers of two [1, N].
  const size_t kLaneSizes[] = {sizeof(uint8_t), sizeof(uint16_t),
                               sizeof(uint32_t), sizeof(uint64_t)};
  for (const size_t lane_size : kLaneSizes) {
    const size_t max_lanes = *MaxLanesForSize(lane_size);
    for (size_t lanes = 1; lanes <= max_lanes; lanes *= 2) {
      if (NumLanesForSize(lane_size)->test(lanes)) continue;

      fprintf(stderr, "T=%d: did not visit for N=%d, max=%d\n",
              static_cast<int>(lane_size), static_cast<int>(lanes),
              static_cast<int>(max_lanes));
      HWY_ASSERT(false);
    }
  }
}
// Checks Zero, Set and Iota against a scalar-filled reference array, and that
// Undefined yields a vector that can be stored.
struct TestSet {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t num_lanes = Lanes(d);
    auto reference = AllocateAligned<T>(num_lanes);

    // Zero
    const auto zero = Zero(d);
    T* const ref_begin = reference.get();
    std::fill(ref_begin, ref_begin + num_lanes, T(0));
    HWY_ASSERT_VEC_EQ(d, reference.get(), zero);

    // Set
    const auto twos = Set(d, T(2));
    for (size_t lane = 0; lane < num_lanes; ++lane) {
      reference[lane] = 2;
    }
    HWY_ASSERT_VEC_EQ(d, reference.get(), twos);

    // Iota
    const auto ramp = Iota(d, T(5));
    for (size_t lane = 0; lane < num_lanes; ++lane) {
      reference[lane] = T(5 + lane);
    }
    HWY_ASSERT_VEC_EQ(d, reference.get(), ramp);

    // Undefined: contents are not checked, the vector is only stored.
    const auto undefined = Undefined(d);
    Store(undefined, d, reference.get());
  }
};
// Passes the ForPartialVectors adapter for TestSet to ForAllTypes.
HWY_NOINLINE void TestAllSet() {
  ForAllTypes(ForPartialVectors<TestSet>());
}
// Verifies wraparound (mod 2^bits): subtracting one from the minimum gives the
// maximum, and adding one to the maximum gives the minimum.
struct TestOverflow {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto one = Set(d, T(1));
    const auto lowest = Set(d, LimitsMin<T>());
    const auto highest = Set(d, LimitsMax<T>());

    // Unsigned underflow / negative -> positive
    HWY_ASSERT_VEC_EQ(d, highest, Sub(lowest, one));
    // Unsigned overflow / positive -> negative
    HWY_ASSERT_VEC_EQ(d, lowest, Add(highest, one));
  }
};
// Passes the ForPartialVectors adapter for TestOverflow to ForIntegerTypes.
HWY_NOINLINE void TestAllOverflow() {
  auto overflow_adapter = ForPartialVectors<TestOverflow>();
  ForIntegerTypes(overflow_adapter);
}
// Checks Clamp with the values 0, 1 and 2: both calls below are expected to
// produce 1.
struct TestClamp {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto zero = Zero(d);
    const auto one = Set(d, 1);
    const auto two = Set(d, 2);

    // 2 clamped with bounds (0, 1).
    HWY_ASSERT_VEC_EQ(d, one, Clamp(two, zero, one));
    // 0 clamped with bounds (1, 2).
    HWY_ASSERT_VEC_EQ(d, one, Clamp(zero, one, two));
  }
};
// Passes the ForPartialVectors adapter for TestClamp to ForAllTypes.
HWY_NOINLINE void TestAllClamp() {
  auto clamp_adapter = ForPartialVectors<TestClamp>();
  ForAllTypes(clamp_adapter);
}
// Integer check of SignBit(d) using only Add/Sub and comparisons.
struct TestSignBitInteger {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto zero = Zero(d);
    const auto sign = SignBit(d);
    const auto all_ones = VecFromMask(d, Eq(zero, zero));
    const auto sign_minus_one = Sub(sign, Set(d, 1));

    // Shifting left by one => overflow, equal zero
    HWY_ASSERT_VEC_EQ(d, zero, Add(sign, sign));
    // Verify the lower bits are zero (only +/- and logical ops are available
    // for all types)
    HWY_ASSERT_VEC_EQ(d, all_ones, Add(sign, sign_minus_one));
  }
};
// Floating-point check of SignBit(d): OR-ing it into +2.25 gives -2.25,
// clearing it from -2.25 gives +2.25, and it compares equal to zero.
struct TestSignBitFloat {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto zero = Zero(d);
    const auto sign = SignBit(d);
    const auto positive = Set(d, 2.25);
    const auto negative = Set(d, -2.25);

    HWY_ASSERT_VEC_EQ(d, Or(positive, sign), negative);
    HWY_ASSERT_VEC_EQ(d, AndNot(sign, negative), positive);
    HWY_ASSERT_VEC_EQ(d, zero, sign);
  }
};
// Runs the integer variant for integer types, then the float variant for
// float types.
HWY_NOINLINE void TestAllSignBit() {
  auto integer_adapter = ForPartialVectors<TestSignBitInteger>();
  ForIntegerTypes(integer_adapter);

  auto float_adapter = ForPartialVectors<TestSignBitFloat>();
  ForFloatTypes(float_adapter);
}
// Aborts with a diagnostic unless every lane of `v` is NaN. `file` and `line`
// identify the call site and are passed on to Abort.
// inline to work around incorrect SVE codegen (only first 128 bits used).
template <class D, class V>
HWY_INLINE void AssertNaN(D d, VecArg<V> v, const char* file, int line) {
  using T = TFromD<D>;
  const size_t num_lanes = Lanes(d);
  if (AllTrue(d, IsNaN(v))) return;

  Print(d, "not all NaN", v, 0, num_lanes);
  Print(d, "mask", VecFromMask(d, IsNaN(v)), 0, num_lanes);

  const std::string type_name = TypeName(T(), num_lanes);
  // RVV lacks PRIu64 and MSYS still has problems with %zu, so print bytes to
  // avoid truncating doubles.
  uint8_t raw[HWY_MAX(sizeof(T), 8)] = {0};
  const T first_lane = GetLane(v);
  CopyBytes<sizeof(T)>(&first_lane, raw);
  Abort(file, line,
        "Expected %s NaN, got %E (bytes %02x %02x %02x %02x %02x %02x %02x "
        "%02x)",
        type_name.c_str(), first_lane, raw[0], raw[1], raw[2], raw[3], raw[4],
        raw[5], raw[6], raw[7]);
}
#define HWY_ASSERT_NAN(d, v) AssertNaN(d, v, __FILE__, __LINE__)
// Verifies how NaN inputs are handled by arithmetic, FMA, sign, rounding,
// logical, comparison, reduction and Min/Max ops on floating-point lanes.
struct TestNaN {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    // Built from Unpredictable1(), presumably so the compiler cannot
    // constant-fold the NaN inputs below.
    const auto vone = Set(d, T(Unpredictable1()));
    const auto vnan = IfThenElse(Eq(vone, Set(d, T(1))), NaN(d), vone);
    HWY_ASSERT_NAN(d, vnan);

    // Arithmetic: NaN in either operand position.
    HWY_ASSERT_NAN(d, Add(vnan, vone));
    HWY_ASSERT_NAN(d, Add(vone, vnan));
    HWY_ASSERT_NAN(d, Sub(vnan, vone));
    HWY_ASSERT_NAN(d, Sub(vone, vnan));
    HWY_ASSERT_NAN(d, Mul(vnan, vone));
    HWY_ASSERT_NAN(d, Mul(vone, vnan));
    HWY_ASSERT_NAN(d, Div(vnan, vone));
    HWY_ASSERT_NAN(d, Div(vone, vnan));

    // Fused multiply-add family: NaN in each of the three positions.
    HWY_ASSERT_NAN(d, MulAdd(vnan, vone, vone));
    HWY_ASSERT_NAN(d, MulAdd(vone, vnan, vone));
    HWY_ASSERT_NAN(d, MulAdd(vone, vone, vnan));
    HWY_ASSERT_NAN(d, MulSub(vnan, vone, vone));
    HWY_ASSERT_NAN(d, MulSub(vone, vnan, vone));
    HWY_ASSERT_NAN(d, MulSub(vone, vone, vnan));
    HWY_ASSERT_NAN(d, NegMulAdd(vnan, vone, vone));
    HWY_ASSERT_NAN(d, NegMulAdd(vone, vnan, vone));
    HWY_ASSERT_NAN(d, NegMulAdd(vone, vone, vnan));
    HWY_ASSERT_NAN(d, NegMulSub(vnan, vone, vone));
    HWY_ASSERT_NAN(d, NegMulSub(vone, vnan, vone));
    HWY_ASSERT_NAN(d, NegMulSub(vone, vone, vnan));

    // Square root
    HWY_ASSERT_NAN(d, Sqrt(vnan));

    // Sign manipulation
    HWY_ASSERT_NAN(d, Abs(vnan));
    HWY_ASSERT_NAN(d, Neg(vnan));
    HWY_ASSERT_NAN(d, CopySign(vnan, vone));
    HWY_ASSERT_NAN(d, CopySignToAbs(vnan, vone));

    // Rounding
    HWY_ASSERT_NAN(d, Ceil(vnan));
    HWY_ASSERT_NAN(d, Floor(vnan));
    HWY_ASSERT_NAN(d, Round(vnan));
    HWY_ASSERT_NAN(d, Trunc(vnan));

    // Logical (And/AndNot/Xor will clear NaN!)
    HWY_ASSERT_NAN(d, Or(vnan, vone));

    // Comparison: every ordered comparison against NaN must be false.
    HWY_ASSERT(AllFalse(d, Eq(vnan, vone)));
    HWY_ASSERT(AllFalse(d, Gt(vnan, vone)));
    HWY_ASSERT(AllFalse(d, Lt(vnan, vone)));
    HWY_ASSERT(AllFalse(d, Ge(vnan, vone)));
    HWY_ASSERT(AllFalse(d, Le(vnan, vone)));

    // Reduction
    HWY_ASSERT_NAN(d, SumOfLanes(d, vnan));
// TODO(janwas): re-enable after QEMU/Spike are fixed
#if HWY_TARGET != HWY_RVV
    HWY_ASSERT_NAN(d, MinOfLanes(d, vnan));
    HWY_ASSERT_NAN(d, MaxOfLanes(d, vnan));
#endif

    // Min/Max with exactly one NaN operand: expectations differ per platform.
#if HWY_ARCH_X86 && (HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_EMU128)
    // x86 SIMD returns the second operand if any input is NaN.
    HWY_ASSERT_VEC_EQ(d, vone, Min(vnan, vone));
    HWY_ASSERT_VEC_EQ(d, vone, Max(vnan, vone));
    HWY_ASSERT_NAN(d, Min(vone, vnan));
    HWY_ASSERT_NAN(d, Max(vone, vnan));
#elif HWY_ARCH_WASM
    // Should return NaN if any input is NaN, but does not for scalar.
    // TODO(janwas): remove once this is fixed.
#elif HWY_TARGET == HWY_NEON && HWY_ARCH_ARM_V7
    // ARMv7 NEON returns NaN if any input is NaN.
    HWY_ASSERT_NAN(d, Min(vone, vnan));
    HWY_ASSERT_NAN(d, Max(vone, vnan));
    HWY_ASSERT_NAN(d, Min(vnan, vone));
    HWY_ASSERT_NAN(d, Max(vnan, vone));
#else
    // IEEE 754-2019 minimumNumber is defined as the other argument if exactly
    // one is NaN, and qNaN if both are.
    HWY_ASSERT_VEC_EQ(d, vone, Min(vnan, vone));
    HWY_ASSERT_VEC_EQ(d, vone, Max(vnan, vone));
    HWY_ASSERT_VEC_EQ(d, vone, Min(vone, vnan));
    HWY_ASSERT_VEC_EQ(d, vone, Max(vone, vnan));
#endif

    // Both operands NaN: the result is NaN on every platform.
    HWY_ASSERT_NAN(d, Min(vnan, vnan));
    HWY_ASSERT_NAN(d, Max(vnan, vnan));
  }
};
// NaN propagation checks for functions only available for float32.
struct TestF32NaN {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto vone = Set(d, T(Unpredictable1()));
    const auto vnan = IfThenElse(Eq(vone, Set(d, T(1))), NaN(d), vone);

    // Approximate reciprocal / reciprocal square root
    HWY_ASSERT_NAN(d, ApproximateReciprocal(vnan));
    HWY_ASSERT_NAN(d, ApproximateReciprocalSqrt(vnan));

    // Absolute difference with NaN in either position
    HWY_ASSERT_NAN(d, AbsDiff(vnan, vone));
    HWY_ASSERT_NAN(d, AbsDiff(vone, vnan));
  }
};
// Runs TestNaN via ForFloatTypes, then TestF32NaN for float only.
HWY_NOINLINE void TestAllNaN() {
  const auto float_types_test = ForPartialVectors<TestNaN>();
  ForFloatTypes(float_types_test);

  const auto f32_only_test = ForPartialVectors<TestF32NaN>();
  f32_only_test(float());
}
// Checks that IsNaN() is true only for (positive or negative) NaN and false
// for infinities, one, zero and the lowest/highest representable values.
struct TestIsNaN {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto vone = Set(d, T(Unpredictable1()));
    const auto is_one = Eq(vone, Set(d, T(1)));
    const auto vinf = IfThenElse(is_one, Inf(d), vone);
    const auto vnan = IfThenElse(Eq(vone, Set(d, T(1))), NaN(d), vone);
    const auto minus_one = Set(d, T{-1});
    const auto none = MaskFalse(d);
    const auto all = MaskTrue(d);

    HWY_ASSERT_NAN(d, vnan);
    // Infinities of either sign are not NaN.
    HWY_ASSERT_MASK_EQ(d, none, IsNaN(vinf));
    HWY_ASSERT_MASK_EQ(d, none, IsNaN(CopySign(vinf, minus_one)));
    // NaN of either sign is NaN.
    HWY_ASSERT_MASK_EQ(d, all, IsNaN(vnan));
    HWY_ASSERT_MASK_EQ(d, all, IsNaN(CopySign(vnan, minus_one)));
    // Ordinary finite values are not NaN.
    HWY_ASSERT_MASK_EQ(d, none, IsNaN(vone));
    HWY_ASSERT_MASK_EQ(d, none, IsNaN(Zero(d)));
    HWY_ASSERT_MASK_EQ(d, none, IsNaN(Set(d, hwy::LowestValue<T>())));
    HWY_ASSERT_MASK_EQ(d, none, IsNaN(Set(d, hwy::HighestValue<T>())));
  }
};
// Dispatches TestIsNaN, wrapped in ForPartialVectors, through ForFloatTypes.
HWY_NOINLINE void TestAllIsNaN() {
  const auto is_nan_test = ForPartialVectors<TestIsNaN>();
  ForFloatTypes(is_nan_test);
}
// Checks that IsInf() is true only for (positive or negative) infinity and
// false for NaN, one, zero and the lowest/highest representable values.
struct TestIsInf {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto vone = Set(d, T(Unpredictable1()));
    const auto is_one = Eq(vone, Set(d, T(1)));
    const auto vinf = IfThenElse(is_one, Inf(d), vone);
    const auto vnan = IfThenElse(Eq(vone, Set(d, T(1))), NaN(d), vone);
    const auto minus_one = Set(d, T{-1});
    const auto none = MaskFalse(d);
    const auto all = MaskTrue(d);

    // Infinities of either sign are detected.
    HWY_ASSERT_MASK_EQ(d, all, IsInf(vinf));
    HWY_ASSERT_MASK_EQ(d, all, IsInf(CopySign(vinf, minus_one)));
    // NaN of either sign is not infinite.
    HWY_ASSERT_MASK_EQ(d, none, IsInf(vnan));
    HWY_ASSERT_MASK_EQ(d, none, IsInf(CopySign(vnan, minus_one)));
    // Ordinary finite values are not infinite.
    HWY_ASSERT_MASK_EQ(d, none, IsInf(vone));
    HWY_ASSERT_MASK_EQ(d, none, IsInf(Zero(d)));
    HWY_ASSERT_MASK_EQ(d, none, IsInf(Set(d, hwy::LowestValue<T>())));
    HWY_ASSERT_MASK_EQ(d, none, IsInf(Set(d, hwy::HighestValue<T>())));
  }
};
// Dispatches TestIsInf, wrapped in ForPartialVectors, through ForFloatTypes.
HWY_NOINLINE void TestAllIsInf() {
  const auto is_inf_test = ForPartialVectors<TestIsInf>();
  ForFloatTypes(is_inf_test);
}
// Checks that IsFinite() is false for infinities and NaN (of either sign) and
// true for one, zero and the lowest/highest representable values.
struct TestIsFinite {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto vone = Set(d, T(Unpredictable1()));
    const auto is_one = Eq(vone, Set(d, T(1)));
    const auto vinf = IfThenElse(is_one, Inf(d), vone);
    const auto vnan = IfThenElse(Eq(vone, Set(d, T(1))), NaN(d), vone);
    const auto minus_one = Set(d, T{-1});
    const auto none = MaskFalse(d);
    const auto all = MaskTrue(d);

    // Neither infinities nor NaN are finite.
    HWY_ASSERT_MASK_EQ(d, none, IsFinite(vinf));
    HWY_ASSERT_MASK_EQ(d, none, IsFinite(CopySign(vinf, minus_one)));
    HWY_ASSERT_MASK_EQ(d, none, IsFinite(vnan));
    HWY_ASSERT_MASK_EQ(d, none, IsFinite(CopySign(vnan, minus_one)));
    // Ordinary values, including the extremes, are finite.
    HWY_ASSERT_MASK_EQ(d, all, IsFinite(vone));
    HWY_ASSERT_MASK_EQ(d, all, IsFinite(Zero(d)));
    HWY_ASSERT_MASK_EQ(d, all, IsFinite(Set(d, hwy::LowestValue<T>())));
    HWY_ASSERT_MASK_EQ(d, all, IsFinite(Set(d, hwy::HighestValue<T>())));
  }
};
// Dispatches TestIsFinite, wrapped in ForPartialVectors, through
// ForFloatTypes.
HWY_NOINLINE void TestAllIsFinite() {
  const auto is_finite_test = ForPartialVectors<TestIsFinite>();
  ForFloatTypes(is_finite_test);
}
// Checks that vectors can be copy-constructed and copy-assigned.
struct TestCopyAndAssign {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto source = Iota(d, 3);

    // Copy construction must preserve all lanes.
    auto copied(source);
    HWY_ASSERT_VEC_EQ(d, source, copied);

    // Assignment over an Undefined() vector must also preserve all lanes.
    auto assigned = Undefined(d);
    assigned = source;
    HWY_ASSERT_VEC_EQ(d, source, assigned);
  }
};
// Dispatches TestCopyAndAssign, wrapped in ForPartialVectors, through
// ForAllTypes.
HWY_NOINLINE void TestAllCopyAndAssign() {
  const auto copy_assign_test = ForPartialVectors<TestCopyAndAssign>();
  ForAllTypes(copy_assign_test);
}
// Checks GetLane() on vectors produced by Zero() and Set(d, 1).
struct TestGetLane {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto lane_of_zero = GetLane(Zero(d));
    HWY_ASSERT_EQ(T(0), lane_of_zero);

    const auto lane_of_one = GetLane(Set(d, 1));
    HWY_ASSERT_EQ(T(1), lane_of_one);
  }
};
// Dispatches TestGetLane, wrapped in ForPartialVectors, through ForAllTypes.
HWY_NOINLINE void TestAllGetLane() {
  const auto get_lane_test = ForPartialVectors<TestGetLane>();
  ForAllTypes(get_lane_test);
}
// Checks that a descriptor recovered via DFromV yields vectors that can be
// combined with vectors created from the original descriptor.
struct TestDFromV {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto zero = Zero(d);
    // Not necessarily the same type as D.
    using DFromZero = DFromV<decltype(zero)>;
    const auto one = Set(DFromZero(), 1);
    // Vectors from both descriptors can interoperate; 0 & 1 == 0.
    const auto masked = And(zero, one);
    HWY_ASSERT_VEC_EQ(d, zero, masked);
  }
};
// Dispatches TestDFromV, wrapped in ForPartialVectors, through ForAllTypes.
HWY_NOINLINE void TestAllDFromV() {
  const auto dfromv_test = ForPartialVectors<TestDFromV>();
  ForAllTypes(dfromv_test);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
// Test registration: compiled only when HWY_ONCE is set. Each TestAll*
// function defined above is listed once for the HighwayTest suite.
// NOTE(review): HWY_BEFORE_TEST / HWY_EXPORT_AND_TEST_P are defined in the
// test utility headers; presumably they declare the suite and export one
// parameterized test per function - confirm there.
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(HighwayTest);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllCapped);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllMaxLanes);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSet);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllOverflow);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllClamp);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSignBit);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllNaN);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllIsNaN);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllIsInf);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllIsFinite);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllCopyAndAssign);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllGetLane);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllDFromV);
} // namespace hwy
#endif
+19
View File
@@ -0,0 +1,19 @@
# Linker version script for version node HWY_0: C++ symbols matching *hwy::*
# are global (exported); C++ symbols matching *std::* are local (hidden).
HWY_0 {
global:
extern "C++" {
*hwy::*;
};
local:
# Hide all the std namespace symbols. std namespace is explicitly marked
# as visibility(default) and header-only functions or methods (such as those
# from templates) should be exposed in shared libraries as weak symbols but
# this is only needed when we expose those types in the shared library API
# in any way. We don't use C++ std types in the API and we also don't
# support exceptions in the library.
# See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36022 for a discussion
# about this.
extern "C++" {
*std::*;
};
};
+762
View File
@@ -0,0 +1,762 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/nanobenchmark.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h> // clock_gettime
#include <algorithm> // sort
#include <array>
#include <atomic>
#include <chrono> //NOLINT
#include <limits>
#include <numeric> // iota
#include <random>
#include <string>
#include <vector>
#if defined(_WIN32) || defined(_WIN64)
#ifndef NOMINMAX
#define NOMINMAX
#endif // NOMINMAX
#include <windows.h>
#endif
#if defined(__APPLE__)
#include <mach/mach.h>
#include <mach/mach_time.h>
#endif
#if defined(__HAIKU__)
#include <OS.h>
#endif
#include "hwy/base.h"
#if HWY_ARCH_PPC && defined(__GLIBC__)
#include <sys/platform/ppc.h> // NOLINT __ppc_get_timebase_freq
#elif HWY_ARCH_X86
#if HWY_COMPILER_MSVC
#include <intrin.h>
#else
#include <cpuid.h> // NOLINT
#endif // HWY_COMPILER_MSVC
#endif // HWY_ARCH_X86
namespace hwy {
namespace {
namespace timer {
// Ticks := platform-specific timer values (CPU cycles on x86). Must be
// unsigned to guarantee wraparound on overflow.
using Ticks = uint64_t;
// Start/Stop return absolute timestamps and must be placed immediately before
// and after the region to measure. We provide separate Start/Stop functions
// because they use different fences.
//
// Background: RDTSC is not 'serializing'; earlier instructions may complete
// after it, and/or later instructions may complete before it. 'Fences' ensure
// regions' elapsed times are independent of such reordering. The only
// documented unprivileged serializing instruction is CPUID, which acts as a
// full fence (no reordering across it in either direction). Unfortunately
// the latency of CPUID varies wildly (perhaps made worse by not initializing
// its EAX input). Because it cannot reliably be deducted from the region's
// elapsed time, it must not be included in the region to measure (i.e.
// between the two RDTSC).
//
// The newer RDTSCP is sometimes described as serializing, but it actually
// only serves as a half-fence with release semantics. Although all
// instructions in the region will complete before the final timestamp is
// captured, subsequent instructions may leak into the region and increase the
// elapsed time. Inserting another fence after the final RDTSCP would prevent
// such reordering without affecting the measured region.
//
// Fortunately, such a fence exists. The LFENCE instruction is only documented
// to delay later loads until earlier loads are visible. However, Intel's
// reference manual says it acts as a full fence (waiting until all earlier
// instructions have completed, and delaying later instructions until it
// completes). AMD assigns the same behavior to MFENCE.
//
// We need a fence before the initial RDTSC to prevent earlier instructions
// from leaking into the region, and arguably another after RDTSC to avoid
// region instructions from completing before the timestamp is recorded.
// When surrounded by fences, the additional RDTSCP half-fence provides no
// benefit, so the initial timestamp can be recorded via RDTSC, which has
// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
//
// Using Start+Start leads to higher variance and overhead than Stop+Stop.
// However, Stop+Stop includes an LFENCE in the region measurements, which
// adds a delay dependent on earlier loads. The combination of Start+Stop
// is faster than Start+Start and more consistent than Stop+Stop because
// the first LFENCE already delayed subsequent loads before the measured
// region. This combination seems not to have been considered in prior work:
// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
//
// Note: performance counters can measure 'exact' instructions-retired or
// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
// requires fences. Unfortunately, it is not accessible on all OSes and we
// prefer to avoid kernel-mode drivers. Performance counters are also affected
// by several under/over-count errata, so we use the TSC instead.
// Returns a 64-bit timestamp in units of 'ticks'; to convert to seconds,
// divide by InvariantTicksPerSecond. Exactly one of the platform-specific
// branches below is selected at compile time. On x86 the counter read is
// bracketed by LFENCE on both sides (see the discussion above).
inline Ticks Start() {
Ticks t;
#if HWY_ARCH_PPC && defined(__GLIBC__)
// Reads special-purpose register 268 (presumably the time base, matching the
// __ppc_get_timebase_freq() used by InvariantTicksPerSecond).
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
// pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
asm volatile("mrs %0, cntvct_el0" : "=r"(t));
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
// LFENCE / RDTSC / LFENCE via intrinsics; each _ReadWriteBarrier() keeps the
// compiler from reordering memory accesses across the adjacent intrinsic.
_ReadWriteBarrier();
_mm_lfence();
_ReadWriteBarrier();
t = __rdtsc();
_ReadWriteBarrier();
_mm_lfence();
_ReadWriteBarrier();
#elif HWY_ARCH_X86_64
// LFENCE / RDTSC / LFENCE. RDTSC returns the low half in rax and the high
// half in rdx; SHL+OR merge them into the 64-bit result in rax (%0).
asm volatile(
"lfence\n\t"
"rdtsc\n\t"
"shl $32, %%rdx\n\t"
"or %%rdx, %0\n\t"
"lfence"
: "=a"(t)
:
// "memory" avoids reordering. rdx = TSC >> 32.
// "cc" = flags modified by SHL.
: "rdx", "memory", "cc");
#elif HWY_ARCH_RVV
asm volatile("rdtime %0" : "=r"(t));
#elif defined(_WIN32) || defined(_WIN64)
// Fallback for other Windows targets: OS performance counter.
LARGE_INTEGER counter;
(void)QueryPerformanceCounter(&counter);
t = counter.QuadPart;
#elif defined(__APPLE__)
t = mach_absolute_time();
#elif defined(__HAIKU__)
t = system_time_nsecs(); // since boot
#else // POSIX
// Monotonic clock converted to nanoseconds.
timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
#endif
return t;
}
// Returns the timestamp that ends a measured region, in the same units as
// Start(). On the x86 branches this uses RDTSCP followed by LFENCE; PPC and
// ARM64 read the same counter as Start(); all other configurations simply
// call Start().
// WARNING: on x86, caller must check HasRDTSCP before using this!
inline Ticks Stop() {
uint64_t t;
#if HWY_ARCH_PPC && defined(__GLIBC__)
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
// pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
asm volatile("mrs %0, cntvct_el0" : "=r"(t));
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
// RDTSCP / LFENCE via intrinsics; `aux` receives TSC_AUX and is ignored.
_ReadWriteBarrier();
unsigned aux;
t = __rdtscp(&aux);
_ReadWriteBarrier();
_mm_lfence();
_ReadWriteBarrier();
#elif HWY_ARCH_X86_64
// Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
asm volatile(
"rdtscp\n\t"
"shl $32, %%rdx\n\t"
"or %%rdx, %0\n\t"
"lfence"
: "=a"(t)
:
// "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
// "cc" = flags modified by SHL.
: "rcx", "rdx", "memory", "cc");
#else
// No dedicated stop sequence for this configuration: reuse Start().
t = Start();
#endif
return t;
}
} // namespace timer
namespace robust_statistics {
// Sorts integral values in ascending order (e.g. for Mode). About 3x faster
// than std::sort for input distributions with very few unique values.
template <class T>
void CountingSort(T* values, size_t num_values) {
// Unique values and their frequency (similar to flat_map).
using Unique = std::pair<T, int>;
std::vector<Unique> unique;
for (size_t i = 0; i < num_values; ++i) {
const T value = values[i];
const auto pos =
std::find_if(unique.begin(), unique.end(),
[value](const Unique u) { return u.first == value; });
if (pos == unique.end()) {
unique.push_back(std::make_pair(value, 1));
} else {
++pos->second;
}
}
// Sort in ascending order of value (pair.first).
std::sort(unique.begin(), unique.end());
// Write that many copies of each unique value to the array.
T* HWY_RESTRICT p = values;
for (const auto& value_count : unique) {
std::fill(p, p + value_count.second, value_count.first);
p += value_count.second;
}
NANOBENCHMARK_CHECK(p == values + num_values);
}
// @return i in [idx_begin, idx_begin + half_count) that minimizes
// sorted[i + half_count] - sorted[i]. The first such i wins on ties; returns
// 0 if no range is below std::numeric_limits<T>::max().
template <typename T>
size_t MinRange(const T* const HWY_RESTRICT sorted, const size_t idx_begin,
                const size_t half_count) {
  size_t best_idx = 0;
  T best_range = std::numeric_limits<T>::max();

  for (size_t lo = idx_begin; lo < idx_begin + half_count; ++lo) {
    const size_t hi = lo + half_count;
    NANOBENCHMARK_CHECK(sorted[lo] <= sorted[hi]);
    const T range = sorted[hi] - sorted[lo];
    // Strict comparison keeps the earliest index among equal ranges.
    if (range < best_range) {
      best_idx = lo;
      best_range = range;
    }
  }
  return best_idx;
}
// Returns an estimate of the mode by calling MinRange on successively
// halved intervals. "sorted" must be in ascending order. This is the
// Half Sample Mode estimator proposed by Bickel in "On a fast, robust
// estimator of the mode", with complexity O(N log N). The mode is less
// affected by outliers in highly-skewed distributions than the median.
// The averaging operation below assumes "T" is an unsigned integer type.
template <typename T>
T ModeOfSorted(const T* const HWY_RESTRICT sorted, const size_t num_values) {
  size_t begin = 0;
  size_t half = num_values / 2;
  // Narrow to the densest half-interval until at most two candidates remain.
  for (; half > 1; half >>= 1) {
    begin = MinRange(sorted, begin, half);
  }

  const T first = sorted[begin];
  if (half != 0) {
    // Two candidates left: return (first + second + 1) / 2.
    NANOBENCHMARK_CHECK(half == 1);
    const T average = (first + sorted[begin + 1] + 1) / 2;
    return average;
  }
  // Fewer than two input values: nothing to average.
  return first;
}
// Returns the mode estimate of `num_values` elements starting at `values`.
// Side effect: sorts "values" (via CountingSort).
template <typename T>
T Mode(T* values, const size_t num_values) {
  CountingSort(values, num_values);
  const T* const sorted = values;
  return ModeOfSorted(sorted, num_values);
}

// Overload for fixed-size arrays; forwards to the pointer/length overload.
template <typename T, size_t N>
T Mode(T (&values)[N]) {
  T* const first = &values[0];
  return Mode(first, N);
}
// Returns the median value. Side effect: sorts "values".
// `values` must point to `num_values` elements and `num_values` must be
// nonzero. For an even count, returns (a + b + 1) / 2 of the two middle
// elements a and b.
template <typename T>
T Median(T* values, const size_t num_values) {
  // `values` is a raw pointer and has no empty() member, so validate the
  // pointer and the element count directly.
  NANOBENCHMARK_CHECK(values != nullptr && num_values != 0);
  std::sort(values, values + num_values);
  const size_t half = num_values / 2;
  // Odd count: return middle
  if (num_values % 2) {
    return values[half];
  }
  // Even count: return average of middle two.
  return (values[half] + values[half - 1] + 1) / 2;
}
// Returns a robust measure of variability: the median of |values[i] - median|
// over all `num_values` inputs. Differences are computed in int64_t.
template <typename T>
T MedianAbsoluteDeviation(const T* values, const size_t num_values,
                          const T median) {
  NANOBENCHMARK_CHECK(num_values != 0);
  const int64_t center = static_cast<int64_t>(median);

  std::vector<T> deviations;
  deviations.reserve(num_values);
  for (const T* it = values; it != values + num_values; ++it) {
    const int64_t distance = std::abs(static_cast<int64_t>(*it) - center);
    deviations.push_back(static_cast<T>(distance));
  }
  // Median() sorts its input, which is fine for this temporary copy.
  return Median(deviations.data(), num_values);
}
} // namespace robust_statistics
} // namespace
namespace platform {
namespace {
// Prevents the compiler from eliding the computations that led to "output".
// Uses an empty inline-asm statement on non-MSVC compilers and a relaxed
// store to a function-local static atomic on MSVC.
template <class T>
inline void PreventElision(T&& output) {
#if HWY_COMPILER_MSVC == 0
// Works by indicating to the compiler that "output" is being read and
// modified. The +r constraint avoids unnecessary writes to memory, but only
// works for built-in types (typically FuncOutput).
asm volatile("" : "+r"(output) : : "memory");
#else
// MSVC does not support inline assembly anymore (and never supported GCC's
// RTL constraints). Self-assignment with #pragma optimize("off") might be
// expected to prevent elision, but it does not with MSVC 2015. Type-punning
// with volatile pointers generates inefficient code on MSVC 2017.
// NOTE(review): T is deduced from a forwarding reference, so this branch
// presumably relies on callers passing rvalues (T non-reference) - confirm.
static std::atomic<T> dummy(T{});
dummy.store(output, std::memory_order_relaxed);
#endif
}
// Measures the actual current frequency of Ticks. We cannot rely on the nominal
// frequency encoded in x86 BrandString because it is misleading on M1 Rosetta,
// and not reported by AMD. CPUID 0x15 is also not yet widely supported. Also
// used on RISC-V and ARM64.
// Returns the largest ticks-per-second estimate from three repetitions, each
// spinning for at least 10 ms of std::chrono::steady_clock time.
HWY_MAYBE_UNUSED double MeasureNominalClockRate() {
double max_ticks_per_sec = 0.0;
// Arbitrary, enough to ignore 2 outliers without excessive init time.
for (int rep = 0; rep < 3; ++rep) {
// Sample the wall clock and the tick counter back to back.
auto time0 = std::chrono::steady_clock::now();
using Time = decltype(time0);
const timer::Ticks ticks0 = timer::Start();
const Time time_min = time0 + std::chrono::milliseconds(10);
Time time1;
timer::Ticks ticks1;
// Busy-wait until at least 10 ms have elapsed, re-sampling both clocks.
for (;;) {
time1 = std::chrono::steady_clock::now();
// Ideally this would be Stop, but that requires RDTSCP on x86. To avoid
// another codepath, just use Start instead. now() presumably has its own
// fence-like behavior.
ticks1 = timer::Start(); // Do not use Stop, see comment above
if (time1 >= time_min) break;
}
const double dticks = static_cast<double>(ticks1 - ticks0);
// Elapsed wall-clock time in seconds.
std::chrono::duration<double, std::ratio<1>> dtime = time1 - time0;
const double ticks_per_sec = dticks / dtime.count();
max_ticks_per_sec = std::max(max_ticks_per_sec, ticks_per_sec);
}
return max_ticks_per_sec;
}
#if HWY_ARCH_X86
// Executes CPUID for the given leaf (`level`) and sub-leaf (`count`) and
// stores the four result registers, in order, into abcd[0..3].
void Cpuid(const uint32_t level, const uint32_t count,
           uint32_t* HWY_RESTRICT abcd) {
#if HWY_COMPILER_MSVC
  // The MSVC intrinsic writes signed ints; convert each register explicitly.
  int regs[4];
  __cpuidex(regs, level, count);
  abcd[0] = static_cast<uint32_t>(regs[0]);
  abcd[1] = static_cast<uint32_t>(regs[1]);
  abcd[2] = static_cast<uint32_t>(regs[2]);
  abcd[3] = static_cast<uint32_t>(regs[3]);
#else
  // __cpuid_count comes from <cpuid.h> and fills its last four arguments.
  uint32_t reg_a;
  uint32_t reg_b;
  uint32_t reg_c;
  uint32_t reg_d;
  __cpuid_count(level, count, reg_a, reg_b, reg_c, reg_d);
  abcd[0] = reg_a;
  abcd[1] = reg_b;
  abcd[2] = reg_c;
  abcd[3] = reg_d;
#endif
}
// Returns whether the CPU reports RDTSCP support: bit 27 of the fourth
// register returned by CPUID leaf 0x80000001 (extended feature flags).
bool HasRDTSCP() {
  constexpr uint32_t kRdtscpBit = 1u << 27;
  uint32_t regs[4];
  Cpuid(0x80000001U, 0, regs);
  const uint32_t extended_flags = regs[3];
  return (extended_flags & kRdtscpBit) != 0;
}
// Returns the CPU brand string assembled from CPUID leaves 0x80000002 to
// 0x80000004 (16 bytes each), or an empty string if those leaves are not
// reported as supported.
std::string BrandString() {
  std::array<uint32_t, 4> regs;

  // Check if brand string is supported (it is on all reasonable Intel/AMD)
  Cpuid(0x80000000U, 0, regs.data());
  const uint32_t max_extended_leaf = regs[0];
  if (max_extended_leaf < 0x80000004U) {
    return std::string();
  }

  char brand[49];
  for (size_t part = 0; part < 3; ++part) {
    const uint32_t leaf = static_cast<uint32_t>(0x80000002U + part);
    Cpuid(leaf, 0, regs.data());
    // Source (16 bytes) and destination (49 bytes) are not the same size.
    CopyBytes<sizeof(regs)>(&regs[0], brand + part * 16);
  }
  brand[48] = 0;
  return brand;
}
#endif // HWY_ARCH_X86
} // namespace
// Returns the number of timer::Start()/Stop() ticks per second. The branch
// order mirrors the counter selection in timer::Start().
HWY_DLLEXPORT double InvariantTicksPerSecond() {
#if HWY_ARCH_PPC && defined(__GLIBC__)
return static_cast<double>(__ppc_get_timebase_freq());
#elif HWY_ARCH_X86 || HWY_ARCH_RVV || (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC)
// We assume the x86 TSC is invariant; it is on all recent Intel/AMD CPUs.
// Measured once (function-local static) and cached for later calls.
static const double freq = MeasureNominalClockRate();
return freq;
#elif defined(_WIN32) || defined(_WIN64)
LARGE_INTEGER freq;
(void)QueryPerformanceFrequency(&freq);
return static_cast<double>(freq.QuadPart);
#elif defined(__APPLE__)
// https://developer.apple.com/library/mac/qa/qa1398/_index.html
// Computes 1E9 * denom / numer from the mach timebase.
mach_timebase_info_data_t timebase;
(void)mach_timebase_info(&timebase);
return static_cast<double>(timebase.denom) / timebase.numer * 1E9;
#else
return 1E9; // Haiku and clock_gettime return nanoseconds.
#endif
}
// Returns the current timer::Start() timestamp converted to seconds. The
// conversion factor is computed once and cached in a function-local static.
HWY_DLLEXPORT double Now() {
  static const double seconds_per_tick = 1.0 / InvariantTicksPerSecond();
  const timer::Ticks ticks = timer::Start();
  return static_cast<double>(ticks) * seconds_per_tick;
}
// Returns an estimate of the timer resolution/overhead in ticks: the mode,
// over Params::kTimerSamples repetitions, of the mode of kTimerSamples
// back-to-back timestamp differences.
HWY_DLLEXPORT uint64_t TimerResolution() {
#if HWY_ARCH_X86
// Stop() requires RDTSCP on x86, so fall back to Start() if it is missing.
bool can_use_stop = platform::HasRDTSCP();
#else
constexpr bool can_use_stop = true;
#endif
// Nested loop avoids exceeding stack/L1 capacity.
timer::Ticks repetitions[Params::kTimerSamples];
for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) {
timer::Ticks samples[Params::kTimerSamples];
if (can_use_stop) {
for (size_t i = 0; i < Params::kTimerSamples; ++i) {
const timer::Ticks t0 = timer::Start();
const timer::Ticks t1 = timer::Stop(); // we checked HasRDTSCP above
samples[i] = t1 - t0;
}
} else {
for (size_t i = 0; i < Params::kTimerSamples; ++i) {
const timer::Ticks t0 = timer::Start();
const timer::Ticks t1 = timer::Start(); // do not use Stop, see above
samples[i] = t1 - t0;
}
}
// Mode() sorts `samples` in place, which is fine because it is refilled on
// the next repetition.
repetitions[rep] = robust_statistics::Mode(samples);
}
return robust_statistics::Mode(repetitions);
}
} // namespace platform
namespace {
// Timer resolution in ticks, obtained from platform::TimerResolution() when
// this file-local variable is initialized. Used by SampleUntilStable and
// NumSkip.
static const timer::Ticks timer_resolution = platform::TimerResolution();
// Estimates the expected value of "lambda" values with a variable number of
// samples until the variability "rel_mad" is less than "max_rel_mad".
// Returns the estimated duration of one lambda() call in ticks and writes the
// final relative median absolute deviation to *rel_mad. Gives up after
// p.max_evals rounds, doubling the number of samples per round, and then
// returns the most recent estimate.
template <class Lambda>
timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
const Params& p, const Lambda& lambda) {
// Choose initial samples_per_eval based on a single estimated duration.
timer::Ticks t0 = timer::Start();
lambda();
timer::Ticks t1 = timer::Stop(); // Caller checks HasRDTSCP
timer::Ticks est = t1 - t0;
static const double ticks_per_second = platform::InvariantTicksPerSecond();
const size_t ticks_per_eval =
static_cast<size_t>(ticks_per_second * p.seconds_per_eval);
// Guard against division by zero when the first measurement was 0 ticks.
size_t samples_per_eval = est == 0
? p.min_samples_per_eval
: static_cast<size_t>(ticks_per_eval / est);
samples_per_eval = HWY_MAX(samples_per_eval, p.min_samples_per_eval);
std::vector<timer::Ticks> samples;
samples.reserve(1 + samples_per_eval);
samples.push_back(est);
// Percentage is too strict for tiny differences, so also allow a small
// absolute "median absolute deviation".
const timer::Ticks max_abs_mad = (timer_resolution + 99) / 100;
*rel_mad = 0.0; // ensure initialized
for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) {
samples.reserve(samples.size() + samples_per_eval);
for (size_t i = 0; i < samples_per_eval; ++i) {
t0 = timer::Start();
lambda();
t1 = timer::Stop(); // Caller checks HasRDTSCP
samples.push_back(t1 - t0);
}
// Both estimators sort `samples` in place; samples accumulate across rounds.
if (samples.size() >= p.min_mode_samples) {
est = robust_statistics::Mode(samples.data(), samples.size());
} else {
// For "few" (depends also on the variance) samples, Median is safer.
est = robust_statistics::Median(samples.data(), samples.size());
}
NANOBENCHMARK_CHECK(est != 0);
// Median absolute deviation (mad) is a robust measure of 'variability'.
const timer::Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation(
samples.data(), samples.size(), est);
*rel_mad = static_cast<double>(abs_mad) / static_cast<double>(est);
// Stable enough if either the relative or the absolute criterion holds.
if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) {
if (p.verbose) {
printf("%6" PRIu64 " samples => %5" PRIu64 " (abs_mad=%4" PRIu64
", rel_mad=%4.2f%%)\n",
static_cast<uint64_t>(samples.size()),
static_cast<uint64_t>(est), static_cast<uint64_t>(abs_mad),
*rel_mad * 100.0);
}
return est;
}
}
// Ran out of evaluation rounds without reaching the target variability.
if (p.verbose) {
printf("WARNING: rel_mad=%4.2f%% still exceeds %4.2f%% after %6" PRIu64
" samples.\n",
*rel_mad * 100.0, max_rel_mad * 100.0,
static_cast<uint64_t>(samples.size()));
}
return est;
}
using InputVec = std::vector<FuncInput>;

// Returns the distinct values among inputs[0, num_inputs), in ascending order.
InputVec UniqueInputs(const FuncInput* inputs, const size_t num_inputs) {
  InputVec sorted(inputs, inputs + num_inputs);
  std::sort(sorted.begin(), sorted.end());
  // std::unique moves duplicates to the tail; erase that tail.
  const auto unique_end = std::unique(sorted.begin(), sorted.end());
  sorted.erase(unique_end, sorted.end());
  return sorted;
}
// Returns how often we need to call func for sufficient precision.
size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique,
const Params& p) {
// Min elapsed ticks for any input.
timer::Ticks min_duration = ~timer::Ticks(0);
for (const FuncInput input : unique) {
double rel_mad;
const timer::Ticks total = SampleUntilStable(
p.target_rel_mad, &rel_mad, p,
[func, arg, input]() { platform::PreventElision(func(arg, input)); });
min_duration = HWY_MIN(min_duration, total - timer_resolution);
}
// Number of repetitions required to reach the target resolution.
const size_t max_skip = p.precision_divisor;
// Number of repetitions given the estimated duration.
const size_t num_skip =
min_duration == 0
? 0
: static_cast<size_t>((max_skip + min_duration - 1) / min_duration);
if (p.verbose) {
printf("res=%" PRIu64 " max_skip=%" PRIu64 " min_dur=%" PRIu64
" num_skip=%" PRIu64 "\n",
static_cast<uint64_t>(timer_resolution),
static_cast<uint64_t>(max_skip), static_cast<uint64_t>(min_duration),
static_cast<uint64_t>(num_skip));
}
return num_skip;
}
// Replicates inputs until we can omit "num_skip" occurrences of an input.
// The result holds p.subset_ratio * num_skip copies of the input list (or of
// the single unique input), shuffled with a default-seeded std::mt19937.
InputVec ReplicateInputs(const FuncInput* inputs, const size_t num_inputs,
                         const size_t num_unique, const size_t num_skip,
                         const Params& p) {
  const auto num_copies = p.subset_ratio * num_skip;
  InputVec replicated;

  // A single unique input needs neither interleaving nor shuffling.
  if (num_unique == 1) {
    replicated.assign(num_copies, inputs[0]);
    return replicated;
  }

  replicated.reserve(num_copies * num_inputs);
  const FuncInput* const inputs_end = inputs + num_inputs;
  for (size_t copy = 0; copy < num_copies; ++copy) {
    replicated.insert(replicated.end(), inputs, inputs_end);
  }

  std::mt19937 rng;
  std::shuffle(replicated.begin(), replicated.end(), rng);
  return replicated;
}
// Copies the "full" to "subset" in the same order, but with "num_skip"
// randomly selected occurrences of "input_to_skip" removed. `subset` must
// already have its final size; elements are written in place.
void FillSubset(const InputVec& full, const FuncInput input_to_skip,
                const size_t num_skip, InputVec* subset) {
  const auto num_matches = std::count(full.begin(), full.end(), input_to_skip);
  const size_t count = static_cast<size_t>(num_matches);

  // Generate num_skip random indices: which occurrence to skip.
  std::vector<uint32_t> omit(count);
  std::iota(omit.begin(), omit.end(), 0);
  // omit[] is the same on every call, but that's OK because they identify the
  // Nth instance of input_to_skip, so the position within full[] differs.
  std::mt19937 rng;
  std::shuffle(omit.begin(), omit.end(), rng);
  omit.resize(num_skip);
  std::sort(omit.begin(), omit.end());

  uint32_t occurrence = ~0u;  // 0 after preincrement
  size_t idx_omit = 0;        // cursor within omit[]
  size_t idx_subset = 0;      // cursor within *subset
  for (const FuncInput candidate : full) {
    if (candidate == input_to_skip) {
      ++occurrence;
      // Drop this occurrence if removals remain and it is the next one chosen.
      if (idx_omit < num_skip && occurrence == omit[idx_omit]) {
        ++idx_omit;
        continue;
      }
    }
    if (idx_subset < subset->size()) {
      (*subset)[idx_subset] = candidate;
      ++idx_subset;
    }
  }

  NANOBENCHMARK_CHECK(idx_subset == subset->size());
  NANOBENCHMARK_CHECK(idx_omit == omit.size());
  NANOBENCHMARK_CHECK(occurrence == count - 1);
}
// Returns total ticks elapsed for all inputs, as estimated by
// SampleUntilStable. Also raises *max_rel_mad to the variability of this
// measurement if that is larger.
timer::Ticks TotalDuration(const Func func, const uint8_t* arg,
                           const InputVec* inputs, const Params& p,
                           double* max_rel_mad) {
  // One sample = calling func once for every input, in order.
  const auto run_all_inputs = [func, arg, inputs]() {
    for (const FuncInput input : *inputs) {
      platform::PreventElision(func(arg, input));
    }
  };

  double rel_mad;
  const timer::Ticks duration =
      SampleUntilStable(p.target_rel_mad, &rel_mad, p, run_all_inputs);
  *max_rel_mad = HWY_MAX(*max_rel_mad, rel_mad);
  return duration;
}
// (Nearly) empty Func for measuring timer overhead/resolution: ignores `arg`
// and returns its input.
HWY_NOINLINE FuncOutput EmptyFunc(const void* /*arg*/, const FuncInput input) {
  const FuncOutput output = input;
  return output;
}
// Returns overhead of accessing inputs[] and calling a function; this will
// be deducted from future TotalDuration return values.
timer::Ticks Overhead(const uint8_t* arg, const InputVec* inputs,
                      const Params& p) {
  // One sample = calling EmptyFunc once for every input, in order.
  const auto run_empty = [arg, inputs]() {
    for (const FuncInput input : *inputs) {
      platform::PreventElision(EmptyFunc(arg, input));
    }
  };

  double rel_mad;
  // Zero tolerance because repeatability is crucial and EmptyFunc is fast.
  return SampleUntilStable(0.0, &rel_mad, p, run_empty);
}
} // namespace
// Returns 1 unless the current timestamp equals ~0ULL. The value depends on a
// runtime timer read rather than a compile-time constant.
HWY_DLLEXPORT int Unpredictable1() {
  const timer::Ticks now = timer::Start();
  return now != ~0ULL;
}
// Measures the duration of func(arg, input) for each unique value in
// inputs[0, num_inputs) and writes one entry per unique input to `results`
// (input, ticks, variability). Returns the number of entries written, or 0 if
// the measurement failed or could not be performed.
// The per-input duration is derived from the difference between timing the
// full replicated input list and timing a subset with num_skip occurrences of
// that input removed, after deducting the corresponding loop overheads.
HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg,
const FuncInput* inputs, const size_t num_inputs,
Result* results, const Params& p) {
NANOBENCHMARK_CHECK(num_inputs != 0);
#if HWY_ARCH_X86
// timer::Stop() (used by SampleUntilStable) requires RDTSCP on x86.
if (!platform::HasRDTSCP()) {
fprintf(stderr, "CPU '%s' does not support RDTSCP, skipping benchmark.\n",
platform::BrandString().c_str());
return 0;
}
#endif
const InputVec& unique = UniqueInputs(inputs, num_inputs);
const size_t num_skip = NumSkip(func, arg, unique, p); // 0 if min duration is 0
if (num_skip == 0) return 0; // cannot measure; NumSkip only prints if p.verbose
// (slightly less work on x86 to cast from signed integer)
const float mul = 1.0f / static_cast<float>(static_cast<int>(num_skip));
const InputVec& full =
ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p);
// Value-initialized here; FillSubset overwrites it for each unique input.
InputVec subset(full.size() - num_skip);
const timer::Ticks overhead = Overhead(arg, &full, p);
const timer::Ticks overhead_skip = Overhead(arg, &subset, p);
// The shorter list must not take longer than the full one.
if (overhead < overhead_skip) {
fprintf(stderr, "Measurement failed: overhead %" PRIu64 " < %" PRIu64 "\n",
static_cast<uint64_t>(overhead),
static_cast<uint64_t>(overhead_skip));
return 0;
}
if (p.verbose) {
printf("#inputs=%5" PRIu64 ",%5" PRIu64 " overhead=%5" PRIu64 ",%5" PRIu64
"\n",
static_cast<uint64_t>(full.size()),
static_cast<uint64_t>(subset.size()),
static_cast<uint64_t>(overhead),
static_cast<uint64_t>(overhead_skip));
}
double max_rel_mad = 0.0;
const timer::Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad);
for (size_t i = 0; i < unique.size(); ++i) {
FillSubset(full, unique[i], num_skip, &subset);
const timer::Ticks total_skip =
TotalDuration(func, arg, &subset, p, &max_rel_mad);
if (total < total_skip) {
fprintf(stderr, "Measurement failed: total %" PRIu64 " < %" PRIu64 "\n",
static_cast<uint64_t>(total), static_cast<uint64_t>(total_skip));
return 0;
}
// Time attributable to the num_skip removed calls, net of loop overhead.
const timer::Ticks duration =
(total - overhead) - (total_skip - overhead_skip);
results[i].input = unique[i];
// Divide by num_skip to get ticks per call.
results[i].ticks = static_cast<float>(duration) * mul;
// max_rel_mad accumulates the maximum over all measurements so far.
results[i].variability = static_cast<float>(max_rel_mad);
}
return unique.size();
}
} // namespace hwy
+194
View File
@@ -0,0 +1,194 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_NANOBENCHMARK_H_
#define HIGHWAY_HWY_NANOBENCHMARK_H_
// Benchmarks functions of a single integer argument with realistic branch
// prediction hit rates. Uses a robust estimator to summarize the measurements.
// The precision is about 0.2%.
//
// Examples: see nanobenchmark_test.cc.
//
// Background: Microbenchmarks such as http://github.com/google/benchmark
// can measure elapsed times on the order of a microsecond. Shorter functions
// are typically measured by repeating them thousands of times and dividing
// the total elapsed time by this count. Unfortunately, repetition (especially
// with the same input parameter!) influences the runtime. In time-critical
// code, it is reasonable to expect warm instruction/data caches and TLBs,
// but a perfect record of which branches will be taken is unrealistic.
// Unless the application also repeatedly invokes the measured function with
// the same parameter, the benchmark is measuring something very different -
// a best-case result, almost as if the parameter were made a compile-time
// constant. This may lead to erroneous conclusions about branch-heavy
// algorithms outperforming branch-free alternatives.
//
// Our approach differs in three ways. Adding fences to the timer functions
// reduces variability due to instruction reordering, improving the timer
// resolution to about 40 CPU cycles. However, shorter functions must still
// be invoked repeatedly. For more realistic branch prediction performance,
// we vary the input parameter according to a user-specified distribution.
// Thus, instead of VaryInputs(Measure(Repeat(func))), we change the
// loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the
// central tendency of the measurement samples with the "half sample mode",
// which is more robust to outliers and skewed data than the mean or median.
#include <stddef.h>
#include <stdint.h>
#include "hwy/highway_export.h"
// Enables sanity checks that verify correct operation at the cost of
// longer benchmark runs.
#ifndef NANOBENCHMARK_ENABLE_CHECKS
#define NANOBENCHMARK_ENABLE_CHECKS 0
#endif
// Prints the failing line number to stderr and aborts if "condition" is false.
// The "while" only runs once because abort() is called inside it.
// NOTE(review): the expansion uses fprintf/stderr/abort, but the includes
// visible in this header are only <stddef.h>, <stdint.h> and
// hwy/highway_export.h - callers presumably need <stdio.h> and <stdlib.h>
// themselves; confirm.
#define NANOBENCHMARK_CHECK_ALWAYS(condition) \
while (!(condition)) { \
fprintf(stderr, "Nanobenchmark check failed at line %d\n", __LINE__); \
abort(); \
}
// NANOBENCHMARK_CHECK is the same check when NANOBENCHMARK_ENABLE_CHECKS is
// nonzero; otherwise it expands to nothing and "condition" is not evaluated.
#if NANOBENCHMARK_ENABLE_CHECKS
#define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition)
#else
#define NANOBENCHMARK_CHECK(condition)
#endif
namespace hwy {
// Exported timer queries. The first returns ticks per second, the second
// seconds, and the third a tick count.
namespace platform {
// Returns tick rate, useful for converting measurements to seconds. Invariant
// means the tick counter frequency is independent of CPU throttling or sleep.
// This call may be expensive, callers should cache the result.
HWY_DLLEXPORT double InvariantTicksPerSecond();
// Returns current timestamp [in seconds] relative to an unspecified origin.
// Features: monotonic (no negative elapsed time), steady (unaffected by system
// time changes), high-resolution (on the order of microseconds).
HWY_DLLEXPORT double Now();
// Returns ticks elapsed in back to back timer calls, i.e. a function of the
// timer resolution (minimum measurable difference) and overhead.
// This call is expensive, callers should cache the result.
HWY_DLLEXPORT uint64_t TimerResolution();
} // namespace platform
// Returns 1, but without the compiler knowing what the value is. This prevents
// optimizing out code.
HWY_DLLEXPORT int Unpredictable1();
// Input influencing the function being measured (e.g. number of bytes to copy).
using FuncInput = size_t;
// "Proof of work" returned by Func to ensure the compiler does not elide it.
using FuncOutput = uint64_t;
// Function to measure: either 1) a captureless lambda or function with two
// arguments or 2) a lambda with capture, in which case the first argument
// is reserved for use by MeasureClosure.
// The first argument receives the "arg" pointer given to Measure; the second
// is one of the values from the "inputs" array.
using Func = FuncOutput (*)(const void*, FuncInput);
// Tuning knobs controlling the precision, resolution and running time of a
// measurement. Default-constructed values are used unless overridden.
struct Params {
  // Number of samples taken when measuring timer overhead/resolution. These
  // are processed in a nested loop (quadratic time), which is acceptable
  // because timer overhead is known to be "low". constexpr so that it can be
  // used as an array bound.
  static constexpr size_t kTimerSamples = 256;

  // Divisor applied to the timer resolution to obtain the best-case
  // precision. Larger => more calls to Func and higher precision.
  size_t precision_divisor{1024};

  // Size ratio between the full and subset input distributions. Must be at
  // least 2; larger values take longer to measure but model the given input
  // distribution more faithfully.
  size_t subset_ratio{2};

  // Combined with the estimated Func duration, decides how many Func calls
  // are made before sample variability is checked. Larger values increase
  // measurement time, memory/cache use and precision.
  double seconds_per_eval{4E-3};

  // Minimum number of samples before the central tendency is estimated.
  size_t min_samples_per_eval{7};

  // The mode estimates the central tendency of skewed/fat-tailed
  // distributions better than the median, but needs enough samples relative
  // to the width of half-ranges.
  size_t min_mode_samples{64};

  // Maximum permissible variability (= median absolute deviation / center).
  double target_rel_mad{0.002};

  // Give up after this many evals without reaching target_rel_mad, which
  // prevents infinite loops.
  size_t max_evals{9};

  // Whether to print additional statistics to stdout.
  bool verbose{true};
};
// Measurement result for each unique input.
struct Result {
// The input value this result describes.
FuncInput input;
// Robust estimate (mode or median) of duration.
// Measure stores the per-call cost here, after subtracting loop/call overhead.
float ticks;
// Measure of variability (median absolute deviation relative to "ticks").
// Measure stores the largest value observed so far during the run, so later
// results are never lower than earlier ones.
float variability;
};
// Precisely measures the number of ticks elapsed when calling "func" with the
// given inputs, shuffled to ensure realistic branch prediction hit rates.
//
// "func" returns a 'proof of work' to ensure its computations are not elided.
// "arg" is passed to Func, or reserved for internal use by MeasureClosure.
// "inputs" is an array of "num_inputs" (not necessarily unique) arguments to
// "func". The values should be chosen to maximize coverage of "func". This
// represents a distribution, so a value's frequency should reflect its
// probability in the real application. Order does not matter; for example, a
// uniform distribution over [0, 4) could be represented as {3,0,2,1}.
// "results" receives one entry per unique input, so it needs room for as many
// entries as there are distinct values in "inputs" (at most "num_inputs").
// "p" holds the tuning parameters; a default-constructed Params is used if
// omitted.
// Returns how many Result were written to "results": one per unique input, or
// zero if the measurement failed (an error message goes to stderr).
HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg,
const FuncInput* inputs, const size_t num_inputs,
Result* results, const Params& p = Params());
// Calls operator() of the given closure (lambda function).
// "f" points to the closure object and "input" is forwarded to it unchanged.
// MeasureClosure passes the address of this function (cast to Func) to
// Measure, with the closure's address as "arg".
template <class Closure>
static FuncOutput CallClosure(const Closure* f, const FuncInput input) {
return (*f)(input);
}
// Same as Measure, except "closure" is typically a lambda function of
// FuncInput -> FuncOutput with a capture list.
template <class Closure>
static inline size_t MeasureClosure(const Closure& closure,
const FuncInput* inputs,
const size_t num_inputs, Result* results,
const Params& p = Params()) {
return Measure(reinterpret_cast<Func>(&CallClosure<Closure>),
reinterpret_cast<const uint8_t*>(&closure), inputs, num_inputs,
results, p);
}
} // namespace hwy
#endif // HIGHWAY_HWY_NANOBENCHMARK_H_
@@ -0,0 +1,94 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/nanobenchmark.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <random>
#include "hwy/tests/test_util-inl.h"
namespace hwy {
namespace {
// Governs duration of test; avoid timeout in debug builds.
// MeasureDiv and MeasureRandom assign this to Params::max_evals, replacing the
// default of 9.
#if HWY_IS_DEBUG_BUILD
constexpr size_t kMaxEvals = 3;
#else
constexpr size_t kMaxEvals = 4;
#endif
// Benchmark target: one integer division of a fixed dividend by "in".
// Invocations are independent of each other, so this measures throughput.
// "in" must be nonzero.
FuncOutput Div(const void*, FuncInput in) {
  // Any dividend will do.
  constexpr FuncOutput kDividend = 0xFFFFF;
  return kDividend / in;
}
template <size_t N>
void MeasureDiv(const FuncInput (&inputs)[N]) {
printf("Measuring integer division (output on final two lines)\n");
Result results[N];
Params params;
params.max_evals = kMaxEvals;
const size_t num_results = Measure(&Div, nullptr, inputs, N, results, params);
for (size_t i = 0; i < num_results; ++i) {
printf("%5" PRIu64 ": %6.2f ticks; MAD=%4.2f%%\n",
static_cast<uint64_t>(results[i].input), results[i].ticks,
results[i].variability * 100.0);
}
}
// Default-seeded generator shared by all calls to Random().
std::mt19937 rng;

// Benchmark target whose running time depends on rng: performs between 0 and
// 15 divisions of "in", each by 2 or 3 chosen at random.
FuncOutput Random(const void* /*arg*/, FuncInput in) {
  FuncOutput quotient = static_cast<FuncOutput>(in);
  // The low four bits of the first draw give the number of divisions.
  for (size_t remaining = rng() & 0xF; remaining != 0; --remaining) {
    quotient /= ((rng() & 1) + 2);
  }
  return quotient;
}
// Ensure the measured variability is high.
template <size_t N>
void MeasureRandom(const FuncInput (&inputs)[N]) {
Result results[N];
Params p;
p.max_evals = kMaxEvals;
p.verbose = false;
const size_t num_results = Measure(&Random, nullptr, inputs, N, results, p);
for (size_t i = 0; i < num_results; ++i) {
NANOBENCHMARK_CHECK(results[i].variability > 1E-3);
}
}
// Runs both benchmarks over the same two inputs (3 and 10). The inputs are
// derived from Unpredictable1() so the compiler cannot treat them as constants.
TEST(NanobenchmarkTest, RunAll) {
const int unpredictable = Unpredictable1(); // == 1, unknown to compiler.
// Both values are nonzero, as Div requires.
static const FuncInput inputs[] = {static_cast<FuncInput>(unpredictable) + 2,
static_cast<FuncInput>(unpredictable + 9)};
MeasureDiv(inputs);
MeasureRandom(inputs);
}
} // namespace
} // namespace hwy
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+444
View File
@@ -0,0 +1,444 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Sets macros based on HWY_TARGET.
// This include guard is toggled by foreach_target, so avoid the usual _H_
// suffix to prevent copybara from renaming it.
// Flips HWY_SET_MACROS_PER_TARGET (defines it if undefined, undefines it if
// defined) whenever its defined-ness matches that of HWY_TARGET_TOGGLE. The
// conditional closes right away, so everything below is processed on every
// inclusion.
#if defined(HWY_SET_MACROS_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
#ifdef HWY_SET_MACROS_PER_TARGET
#undef HWY_SET_MACROS_PER_TARGET
#else
#define HWY_SET_MACROS_PER_TARGET
#endif
#endif // HWY_SET_MACROS_PER_TARGET
#include "hwy/detect_targets.h"
// Clear the macros set by a previous inclusion so that the section matching the
// current HWY_TARGET below can define them afresh.
#undef HWY_NAMESPACE
#undef HWY_ALIGN
#undef HWY_MAX_BYTES
#undef HWY_LANES
#undef HWY_HAVE_SCALABLE
#undef HWY_HAVE_INTEGER64
#undef HWY_HAVE_FLOAT16
#undef HWY_HAVE_FLOAT64
#undef HWY_MEM_OPS_MIGHT_FAULT
#undef HWY_NATIVE_FMA
#undef HWY_CAP_GE256
#undef HWY_CAP_GE512
#undef HWY_TARGET_STR
// Optional x86 feature groups: each becomes an empty string when the
// corresponding HWY_DISABLE_* macro is defined, removing those features from
// the target strings assembled below.
#if defined(HWY_DISABLE_PCLMUL_AES)
#define HWY_TARGET_STR_PCLMUL_AES ""
#else
#define HWY_TARGET_STR_PCLMUL_AES ",pclmul,aes"
#endif
#if defined(HWY_DISABLE_BMI2_FMA)
#define HWY_TARGET_STR_BMI2_FMA ""
#else
#define HWY_TARGET_STR_BMI2_FMA ",bmi,bmi2,fma"
#endif
#if defined(HWY_DISABLE_F16C)
#define HWY_TARGET_STR_F16C ""
#else
#define HWY_TARGET_STR_F16C ",f16c"
#endif
// Comma-separated feature lists built by string-literal concatenation; each
// level starts with the previous level's list.
#define HWY_TARGET_STR_SSSE3 "sse2,ssse3"
#define HWY_TARGET_STR_SSE4 \
HWY_TARGET_STR_SSSE3 ",sse4.1,sse4.2" HWY_TARGET_STR_PCLMUL_AES
// Include previous targets, which are the half-vectors of the next target.
#define HWY_TARGET_STR_AVX2 \
HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C
#define HWY_TARGET_STR_AVX3 \
HWY_TARGET_STR_AVX2 ",avx512f,avx512vl,avx512dq,avx512bw"
// Before include guard so we redefine HWY_TARGET_STR on each include,
// governed by the current HWY_TARGET.
//-----------------------------------------------------------------------------
// SSSE3
// Start of the #if/#elif chain that selects exactly one section according to
// HWY_TARGET. This section: 16-byte vectors, FMA not native.
#if HWY_TARGET == HWY_SSSE3
#define HWY_NAMESPACE N_SSSE3
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_TARGET_STR HWY_TARGET_STR_SSSE3
//-----------------------------------------------------------------------------
// SSE4
// Same settings as SSSE3 apart from the namespace and target string.
#elif HWY_TARGET == HWY_SSE4
#define HWY_NAMESPACE N_SSE4
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_TARGET_STR HWY_TARGET_STR_SSE4
//-----------------------------------------------------------------------------
// AVX2
// 32-byte vectors.
#elif HWY_TARGET == HWY_AVX2
#define HWY_NAMESPACE N_AVX2
#define HWY_ALIGN alignas(32)
#define HWY_MAX_BYTES 32
#define HWY_LANES(T) (32 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
// FMA is only reported as native if it was not removed from the target string
// via HWY_DISABLE_BMI2_FMA.
#ifdef HWY_DISABLE_BMI2_FMA
#define HWY_NATIVE_FMA 0
#else
#define HWY_NATIVE_FMA 1
#endif
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 0
#define HWY_TARGET_STR HWY_TARGET_STR_AVX2
//-----------------------------------------------------------------------------
// AVX3[_DL]
// 64-byte vectors with native FMA. The two variants share every setting except
// the namespace and the target string.
#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL
#define HWY_ALIGN alignas(64)
#define HWY_MAX_BYTES 64
#define HWY_LANES(T) (64 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 1
#if HWY_TARGET == HWY_AVX3
#define HWY_NAMESPACE N_AVX3
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3
#elif HWY_TARGET == HWY_AVX3_DL
#define HWY_NAMESPACE N_AVX3_DL
// The VNNI entry must be "avx512vnni" (EVEX-encoded AVX512-VNNI). "avxvnni"
// names the separate VEX-encoded AVX-VNNI extension, which has its own CPUID
// bit and is rejected by compilers that predate it.
#define HWY_TARGET_STR                                               \
  HWY_TARGET_STR_AVX3                                                \
  ",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avx512vnni,avx512bitalg," \
  "avx512vpopcntdq"
#else
#error "Logic error"
#endif // HWY_TARGET == HWY_AVX3_DL
//-----------------------------------------------------------------------------
// PPC8
// 16-byte vectors, native FMA, no float16 support.
#elif HWY_TARGET == HWY_PPC8
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_PPC8
#define HWY_TARGET_STR "altivec,vsx"
//-----------------------------------------------------------------------------
// NEON
// 16-byte vectors; float64 and FMA availability depend on the ARM variant.
#elif HWY_TARGET == HWY_NEON
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
// float64 lanes only when HWY_ARCH_ARM_A64 is set.
#if HWY_ARCH_ARM_A64
#define HWY_HAVE_FLOAT64 1
#else
#define HWY_HAVE_FLOAT64 0
#endif
#define HWY_MEM_OPS_MIGHT_FAULT 1
// FMA is native on A64, or when the compiler defines __ARM_VFPV4__.
#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
#define HWY_NATIVE_FMA 1
#else
#define HWY_NATIVE_FMA 0
#endif
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_NEON
// Can use pragmas instead of -march compiler flag
#if HWY_HAVE_RUNTIME_DISPATCH
#if HWY_ARCH_ARM_V7
#define HWY_TARGET_STR "+neon-vfpv4"
#else
#define HWY_TARGET_STR "+crypto"
#endif // HWY_ARCH_ARM_V7
#else
// HWY_TARGET_STR remains undefined
#endif
//-----------------------------------------------------------------------------
// SVE[2]
// Scalable vectors; the four SVE variants differ only in namespace, maximum
// vector size and (under runtime dispatch) target string.
#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE || \
HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
// SVE only requires lane alignment, not natural alignment of the entire vector.
#define HWY_ALIGN alignas(8)
// Value ensures MaxLanes() is the tightest possible upper bound to reduce
// overallocation.
// HWY_MAX_BYTES is only defined further down; that is fine because the macro
// body is expanded where HWY_LANES is used, not where it is defined.
#define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T))
#define HWY_HAVE_SCALABLE 1
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
// The final #else handles HWY_SVE, the only remaining value accepted by the
// enclosing #elif.
#if HWY_TARGET == HWY_SVE2
#define HWY_NAMESPACE N_SVE2
#define HWY_MAX_BYTES 256
#elif HWY_TARGET == HWY_SVE_256
#define HWY_NAMESPACE N_SVE_256
#define HWY_MAX_BYTES 32
#elif HWY_TARGET == HWY_SVE2_128
#define HWY_NAMESPACE N_SVE2_128
#define HWY_MAX_BYTES 16
#else
#define HWY_NAMESPACE N_SVE
#define HWY_MAX_BYTES 256
#endif
// Can use pragmas instead of -march compiler flag
#if HWY_HAVE_RUNTIME_DISPATCH
#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
#define HWY_TARGET_STR "+sve2-aes"
#else
#define HWY_TARGET_STR "+sve"
#endif
#else
// HWY_TARGET_STR remains undefined
#endif
//-----------------------------------------------------------------------------
// WASM
// 16-byte vectors; no float64 lanes and FMA not native.
#elif HWY_TARGET == HWY_WASM
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 0
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_WASM
#define HWY_TARGET_STR "simd128"
//-----------------------------------------------------------------------------
// WASM_EMU256
// 32-byte vectors with the same "simd128" target string as WASM. Note that
// HWY_CAP_GE256 is 0 here even though HWY_MAX_BYTES is 32.
#elif HWY_TARGET == HWY_WASM_EMU256
#define HWY_ALIGN alignas(32)
#define HWY_MAX_BYTES 32
#define HWY_LANES(T) (32 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 0
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_WASM_EMU256
#define HWY_TARGET_STR "simd128"
//-----------------------------------------------------------------------------
// RVV
// Scalable vectors with native FMA; HWY_ALIGN is empty.
#elif HWY_TARGET == HWY_RVV
// RVV only requires lane alignment, not natural alignment of the entire vector,
// and the compiler already aligns builtin types, so nothing to do here.
#define HWY_ALIGN
// The spec requires VLEN <= 2^16 bits, so the limit is 2^16 bytes (LMUL=8).
#define HWY_MAX_BYTES 65536
// = HWY_MAX_BYTES divided by max LMUL=8 because MaxLanes includes the actual
// LMUL. This is the tightest possible upper bound.
#define HWY_LANES(T) (8192 / sizeof(T))
#define HWY_HAVE_SCALABLE 1
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
// float16 support follows the compiler-defined __riscv_zvfh macro.
#if defined(__riscv_zvfh)
#define HWY_HAVE_FLOAT16 1
#else
#define HWY_HAVE_FLOAT16 0
#endif
#define HWY_NAMESPACE N_RVV
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
// (rv64gcv is not a valid target)
//-----------------------------------------------------------------------------
// EMU128
// 16-byte vectors; FMA not native and no target string.
#elif HWY_TARGET == HWY_EMU128
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_EMU128
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
//-----------------------------------------------------------------------------
// SCALAR
// One lane per vector; HWY_ALIGN is empty.
#elif HWY_TARGET == HWY_SCALAR
#define HWY_ALIGN
#define HWY_MAX_BYTES 8
#define HWY_LANES(T) 1
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_SCALAR
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
#else
// Unknown target: only a diagnostic message is emitted and none of the
// per-target macros are defined.
#pragma message("HWY_TARGET does not match any known target")
#endif // HWY_TARGET
// Override this to 1 in asan/msan builds, which will still fault.
#if HWY_IS_ASAN || HWY_IS_MSAN
#undef HWY_MEM_OPS_MIGHT_FAULT
#define HWY_MEM_OPS_MIGHT_FAULT 1
#endif
// Clang <9 requires this be invoked at file scope, before any namespace.
// With a target string, this pushes HWY_TARGET_STR via HWY_PUSH_ATTRIBUTES.
// The trailing static_assert(true, ...) exists only so that the invocation
// must be followed by a semicolon.
#undef HWY_BEFORE_NAMESPACE
#if defined(HWY_TARGET_STR)
#define HWY_BEFORE_NAMESPACE() \
HWY_PUSH_ATTRIBUTES(HWY_TARGET_STR) \
static_assert(true, "For requiring trailing semicolon")
#else
// avoids compiler warning if no HWY_TARGET_STR
#define HWY_BEFORE_NAMESPACE() \
static_assert(true, "For requiring trailing semicolon")
#endif
// Clang <9 requires any namespaces be closed before this macro.
// Counterpart of HWY_BEFORE_NAMESPACE: expands to HWY_POP_ATTRIBUTES when a
// target string is defined.
#undef HWY_AFTER_NAMESPACE
#if defined(HWY_TARGET_STR)
#define HWY_AFTER_NAMESPACE() \
HWY_POP_ATTRIBUTES \
static_assert(true, "For requiring trailing semicolon")
#else
// avoids compiler warning if no HWY_TARGET_STR
#define HWY_AFTER_NAMESPACE() \
static_assert(true, "For requiring trailing semicolon")
#endif
// Per-function target attribute. Expands to nothing when no target string is
// defined or when HWY_HAS_ATTRIBUTE(target) is false.
#undef HWY_ATTR
#if defined(HWY_TARGET_STR) && HWY_HAS_ATTRIBUTE(target)
#define HWY_ATTR __attribute__((target(HWY_TARGET_STR)))
#else
#define HWY_ATTR
#endif
+311
View File
@@ -0,0 +1,311 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Per-target definitions shared by ops/*.h and user code.
#include <cmath>
#include "hwy/base.h"
// Separate header because foreach_target.h re-enables its include guard.
#include "hwy/ops/set_macros-inl.h"
// Relies on the external include guard in highway.h.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Highway operations are implemented as overloaded functions selected using an
// internal-only tag type D := Simd<T, N, kPow2>. T is the lane type. kPow2 is a
// shift count applied to scalable vectors. Instead of referring to Simd<>
// directly, users create D via aliases ScalableTag<T[, kPow2]>() (defaults to a
// full vector, or fractions/groups if the argument is negative/positive),
// CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of lanes is
// Lanes(D()), a power of two. For scalable vectors, N is either HWY_LANES or a
// cap. For constexpr-size vectors, N is the actual number of lanes. This
// ensures Half<Full512<T>> is the same type as Full256<T>, as required by x86.
// Empty tag type: carries the lane type, lane count/cap N and shift kPow2 as
// template arguments, and exposes the related tag types (Rebind, Repartition,
// Half, Twice) as members.
template <typename Lane, size_t N, int kPow2>
struct Simd {
constexpr Simd() = default;
using T = Lane;
static_assert((N & (N - 1)) == 0 && N != 0, "N must be a power of two");
// Only for use by MaxLanes, required by MSVC. Cannot be enum because GCC
// warns when using enums and non-enums in the same expression. Cannot be
// static constexpr function (another MSVC limitation).
static constexpr size_t kPrivateN = N;
static constexpr int kPrivatePow2 = kPow2;
// Number of NewT lanes that occupy the same number of bytes as N lanes of T.
template <typename NewT>
static constexpr size_t NewN() {
// Round up to correctly handle scalars with N=1.
return (N * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT);
}
#if HWY_HAVE_SCALABLE
// Adjustment to kPow2 when changing the lane type to NewT: positive if NewT is
// larger than T, otherwise zero or negative, with magnitude
// CeilLog2 of the size ratio.
template <typename NewT>
static constexpr int Pow2Ratio() {
return (sizeof(NewT) > sizeof(T))
? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T)))
: -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT)));
}
#endif
// Widening/narrowing ops change the number of lanes and/or their type.
// To initialize such vectors, we need the corresponding tag types:
// PromoteTo/DemoteTo() with another lane type, but same number of lanes.
// On scalable targets kPow2 is adjusted by Pow2Ratio; otherwise it is kept.
#if HWY_HAVE_SCALABLE
template <typename NewT>
using Rebind = Simd<NewT, N, kPow2 + Pow2Ratio<NewT>()>;
#else
template <typename NewT>
using Rebind = Simd<NewT, N, kPow2>;
#endif
// Change lane type while keeping the same vector size, e.g. for MulEven.
template <typename NewT>
using Repartition = Simd<NewT, NewN<NewT>(), kPow2>;
// Half the lanes while keeping the same lane type, e.g. for LowerHalf.
// Round up to correctly handle scalars with N=1.
#if HWY_HAVE_SCALABLE
// Reducing the cap (N) is required for SVE - if N is the limiter for f32xN,
// then we expect Half<Rebind<u16>> to have N/2 lanes (rounded up).
using Half = Simd<T, (N + 1) / 2, kPow2 - 1>;
#else
using Half = Simd<T, (N + 1) / 2, kPow2>;
#endif
// Twice the lanes while keeping the same lane type, e.g. for Combine.
// On scalable targets kPow2 is also incremented.
#if HWY_HAVE_SCALABLE
using Twice = Simd<T, 2 * N, kPow2 + 1>;
#else
using Twice = Simd<T, 2 * N, kPow2>;
#endif
};
namespace detail {
// Returns whether the tag describes a full vector: N equals HWY_LANES(T) and
// kPow2 is zero. Only the tag's type is used.
template <typename T, size_t N, int kPow2>
constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
return N == HWY_LANES(T) && kPow2 == 0;
}
// Applies the shift "pow2" to the lane count N; the result may be zero.
// - pow2 == 0: N is returned unchanged;
// - pow2 in [1,3]: a group of 2,4,8 [fractional] vectors. Only RVV multiplies
//   N here; all other targets return N unchanged;
// - pow2 in [-3,-1]: a fraction of a vector from 1/8 to 1/2.
constexpr size_t ScaleByPower(size_t N, int pow2) {
  // Written as one conditional expression so the function has a single return.
  return (pow2 < 0) ? (N >> (-pow2))
#if HWY_TARGET == HWY_RVV
                    : (N << pow2);
#else
                    : N;
#endif
}
// Struct wrappers enable validation of arguments via static_assert.
// Maps (T, kPow2) to the Simd tag type used by ScalableTag; kPow2 outside
// [-3, 3] fails to compile.
template <typename T, int kPow2>
struct ScalableTagChecker {
static_assert(-3 <= kPow2 && kPow2 <= 3, "Fraction must be 1/8 to 8");
#if HWY_TARGET == HWY_RVV
// Only RVV supports register groups.
using type = Simd<T, HWY_LANES(T), kPow2>;
#elif HWY_HAVE_SCALABLE
// For SVE[2], only allow full or fractions.
// Positive kPow2 is clamped to 0.
using type = Simd<T, HWY_LANES(T), HWY_MIN(kPow2, 0)>;
#elif HWY_TARGET == HWY_SCALAR
// Always a single lane, whatever kPow2 is.
using type = Simd<T, /*N=*/1, 0>;
#else
// Only allow full or fractions.
// The fraction is folded into the lane count, so the tag's kPow2 is 0.
using type = Simd<T, ScaleByPower(HWY_LANES(T), HWY_MIN(kPow2, 0)), 0>;
#endif
};
// Maps (T, kLimit) to the Simd tag type used by CappedTag: the lane count is
// the smaller of kLimit (rounded down to a power of two) and HWY_LANES(T).
// A kLimit of zero fails to compile.
template <typename T, size_t kLimit>
struct CappedTagChecker {
static_assert(kLimit != 0, "Does not make sense to have zero lanes");
// Safely handle non-power-of-two inputs by rounding down, which is allowed by
// CappedTag. Otherwise, Simd<T, 3, 0> would static_assert.
static constexpr size_t kLimitPow2 = size_t{1} << hwy::FloorLog2(kLimit);
using type = Simd<T, HWY_MIN(kLimitPow2, HWY_LANES(T)), 0>;
};
// Maps (T, kNumLanes) to the Simd tag type used by FixedTag. kNumLanes must be
// nonzero and at most HWY_LANES(T); Simd additionally requires a power of two.
template <typename T, size_t kNumLanes>
struct FixedTagChecker {
static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
static_assert(kNumLanes <= HWY_LANES(T), "Too many lanes");
using type = Simd<T, kNumLanes, 0>;
};
} // namespace detail
// Alias for a tag describing a full vector (kPow2 == 0: the most common usage,
// e.g. 1D loops where the application does not care about the vector size) or a
// fraction/multiple of one. Multiples are the same as full vectors for all
// targets except RVV. Fractions (kPow2 < 0) are useful as the argument/return
// value of type promotion and demotion.
// kPow2 must be in [-3, 3]; other values are rejected at compile time.
template <typename T, int kPow2 = 0>
using ScalableTag = typename detail::ScalableTagChecker<T, kPow2>::type;
// Alias for a tag describing a vector with *up to* kLimit active lanes, even on
// targets with scalable vectors and HWY_SCALAR. The runtime lane count
// `Lanes(tag)` may be less than kLimit, and is 1 on HWY_SCALAR. This alias is
// typically used for 1D loops with a relatively low application-defined upper
// bound, e.g. for 8x8 DCTs. However, it is better if data structures are
// designed to be vector-length-agnostic (e.g. a hybrid SoA where there are
// chunks of `M >= MaxLanes(d)` DC components followed by M AC1, .., and M AC63;
// this would enable vector-length-agnostic loops using ScalableTag).
// A kLimit that is not a power of two is rounded down to one; a kLimit of zero
// is rejected at compile time.
template <typename T, size_t kLimit>
using CappedTag = typename detail::CappedTagChecker<T, kLimit>::type;
// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
// even on targets with scalable vectors. Requires `kNumLanes` to be a power of
// two not exceeding `HWY_LANES(T)`.
// Violations (including kNumLanes == 0) are rejected at compile time.
//
// NOTE: if the application does not need to support HWY_SCALAR (+), use this
// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
// This is useful for data structures that rely on exactly 128-bit SIMD, but
// these are discouraged because they cannot benefit from wider vectors.
// Instead, applications would ideally define a larger problem size and loop
// over it with the (unknown size) vectors from ScalableTag.
//
// + e.g. if the baseline is known to support SIMD, or the application requires
// ops such as TableLookupBytes not supported by HWY_SCALAR.
template <typename T, size_t kNumLanes>
using FixedTag = typename detail::FixedTagChecker<T, kNumLanes>::type;
template <class D>
using TFromD = typename D::T;
// Tag for the same number of lanes as D, but with the LaneType T.
template <class T, class D>
using Rebind = typename D::template Rebind<T>;
template <class D>
using RebindToSigned = Rebind<MakeSigned<TFromD<D>>, D>;
template <class D>
using RebindToUnsigned = Rebind<MakeUnsigned<TFromD<D>>, D>;
template <class D>
using RebindToFloat = Rebind<MakeFloat<TFromD<D>>, D>;
// Tag for the same total size as D, but with the LaneType T.
template <class T, class D>
using Repartition = typename D::template Repartition<T>;
template <class D>
using RepartitionToWide = Repartition<MakeWide<TFromD<D>>, D>;
template <class D>
using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>;
// Tag for the same lane type as D, but half the lanes.
// Forwards to the nested type `D::Half`.
template <class D>
using Half = typename D::Half;
// Tag for the same lane type as D, but twice the lanes.
// Forwards to the nested type `D::Twice`.
template <class D>
using Twice = typename D::Twice;
// Tags with a fixed total size in bytes: the lane count is the byte count
// divided by sizeof(T), and the third Simd argument (kPow2) is 0.
// NOTE(review): the division is integral, so a T larger than the byte count
// (e.g. an 8-byte T with Full32) yields a lane count of 0 - presumably callers
// never instantiate that combination; confirm against the Simd definition.
//
// 4 bytes in total.
template <typename T>
using Full32 = Simd<T, 4 / sizeof(T), 0>;
// 8 bytes in total.
template <typename T>
using Full64 = Simd<T, 8 / sizeof(T), 0>;
// 16 bytes in total.
template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;
// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
// Each one expands to the identically named macro without the _D suffix,
// applied to the lane type TFromD<D>; a `bytes` argument is forwarded
// unchanged.
#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)
#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>)
#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(TFromD<D>)
#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD<D>, bytes)
#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD<D>, bytes)
// MSVC workaround: use PrivateN directly instead of MaxLanes.
//
// Both macros expand to a pointer-typed template parameter defaulted to
// nullptr, whose type is hwy::EnableIf<condition>*. hwy::EnableIf is defined
// elsewhere; presumably it is ill-formed when the condition is false, which
// removes the overload from consideration.
// The condition compares D::kPrivateN * sizeof(TFromD<D>) with 16 bytes; only
// kPrivateN is consulted, not kPrivatePow2.
//
// Condition: fewer than 16 bytes.
#define HWY_IF_LT128_D(D) \
hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) < 16>* = nullptr
// Condition: at least 16 bytes.
#define HWY_IF_GE128_D(D) \
hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) >= 16>* = nullptr
// Same, but with a vector argument. ops/*-inl.h define their own TFromV.
// Each one expands to the identically named macro without the _V suffix,
// applied to TFromV<V>; TFromV is not defined in this file, so these macros
// are only usable once a target-specific header has provided it.
#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
#define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
#define HWY_IF_NOT_LANE_SIZE_V(V, bytes) HWY_IF_NOT_LANE_SIZE(TFromV<V>, bytes)
// Returns the compile-time constant D::kPrivatePow2 of the tag type D.
// The (unnamed) argument is not read; it only serves to deduce D.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D /* d */) {
return D::kPrivatePow2;
}
// Expands to a pointer-typed template parameter defaulted to nullptr, of type
// hwy::EnableIf<condition>*, where the condition is Pow2 of a
// default-constructed D being >= MIN. MIN is parenthesized in the expansion,
// so an expression may be passed.
// MSVC requires the explicit <D>.
#define HWY_IF_POW2_GE(D, MIN) hwy::EnableIf<Pow2<D>(D()) >= (MIN)>* = nullptr
#if HWY_HAVE_SCALABLE
// Upper bound on the number of lanes. Intended for template arguments and
// reducing code size (e.g. for SSE4, we know at compile-time that vectors will
// not exceed 16 bytes). WARNING: this may be a loose bound, use Lanes() as the
// actual size for allocating storage. WARNING: MSVC might not be able to deduce
// arguments if this is used in EnableIf. See HWY_IF_LT128_D above.
//
// This is the variant compiled when HWY_HAVE_SCALABLE is nonzero. It takes the
// smaller of D::kPrivateN and HWY_LANES(TFromD<D>) and passes that, together
// with D::kPrivatePow2, to detail::ScaleByPower (defined elsewhere; presumably
// it scales the count by two to the given power). The tag argument is unnamed
// and only used to deduce D.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
return detail::ScaleByPower(HWY_MIN(D::kPrivateN, HWY_LANES(TFromD<D>)),
D::kPrivatePow2);
}
#else
// Variant compiled when HWY_HAVE_SCALABLE is zero: returns D::kPrivateN
// unchanged; kPrivatePow2 is not consulted. The tag argument is unnamed and
// only used to deduce D.
// Workaround for MSVC 2017: T,N,kPow2 argument deduction fails, so returning N
// is not an option, nor does a member function work.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
return D::kPrivateN;
}
// (Potentially) non-constant actual size of the vector at runtime, subject to
// the limit imposed by the Simd. Useful for advancing loop counters.
// Targets with scalable vectors define this themselves.
//
// This definition is only compiled when HWY_HAVE_SCALABLE is zero, and simply
// returns the template argument N of the Simd tag. Unlike MaxLanes, it is not
// declared constexpr.
template <typename T, size_t N, int kPow2>
HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N, kPow2>) {
return N;
}
#endif // !HWY_HAVE_SCALABLE
// NOTE: GCC generates incorrect code for vector arguments to non-inlined
// functions in two situations:
// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
// - on ARM64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not
// all) tests to fail.
//
// We therefore pass by const& only on GCC and (Windows or ARM64). This alias
// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
// and possibly also other functions that are not inlined.
//
// Usage: declare such a parameter as `VecArg<V> v` instead of `V v`.
#if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64)
// Affected configuration: the parameter type becomes a const reference.
template <class V>
using VecArg = const V&;
#else
// All other configurations: the parameter type is V itself (pass by value).
template <class V>
using VecArg = V;
#endif
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More